silorider
changeset 33:9e4eb3f2754e
Improve handling of character limits in html stripping
The code now keeps much closer track of character counts during html
stripping and should be exact. When the limit is exceeded, the stripping is
restarted without any URLs, to prevent incorrect trimming. Whitespace from
the original post is also better preserved. New tests are added for the
Twitter silo to ensure everything works as expected.
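For context, the shortening step relies on textwrap.shorten, which trims at
word boundaries and appends a placeholder so the result never exceeds its
budget. A minimal, hypothetical sketch of the overall idea (the names below
are illustrative only, not the real silorider API):

    import textwrap

    LIMIT = 280  # assumed character limit for the target silo

    def compose(text, url):
        # Reserve room for a space plus the permalink, then trim the text
        # at a word boundary if it does not fit in the remaining budget.
        budget = LIMIT - (len(url) + 1)
        if len(text) > budget:
            text = textwrap.shorten(text, width=budget, placeholder="...")
        return text + ' ' + url

The actual change goes further: when the limit is exceeded it re-runs the
html stripping with the limit set on the stripping context, because
flattened URLs (such as Twitter's shortened links) do not count as their
literal length.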
author      Ludovic Chabant <ludovic@chabant.com>
date        Wed, 10 May 2023 16:10:12 -0700
parents     2265920c4688
children    8c513e43673d
files       silorider/format.py tests/test_format.py tests/test_silos_twitter.py
diffstat    3 files changed, 154 insertions(+), 26 deletions(-)
--- a/silorider/format.py Wed May 10 16:06:46 2023 -0700
+++ b/silorider/format.py Wed May 10 16:10:12 2023 -0700
@@ -1,4 +1,5 @@
 import re
+import string
 import textwrap
 import bs4
 from .config import has_lxml
@@ -10,6 +11,7 @@
     ctx = HtmlStrippingContext()
     if url_flattener:
         ctx.url_flattener = url_flattener
+    # Don't add the limit yet.
 
     name = get_best_text(entry, ctx)
     if not name:
@@ -20,23 +22,29 @@
     if limit:
         text_length = ctx.text_length
         if do_add_url and url:
+            # We need to add the URL at the end of the post, so account
+            # for it plus a space by making the text length limit smaller.
             limit -= 1 + ctx.url_flattener.measureUrl(url)
 
         shortened = text_length > limit
         if shortened:
-            # If we have to shorten the text, but we haven't taken the
-            # URL into account yet, let's see if we have to include it now!
-            # (this happens when we only want to include it when the text
-            # is shortened)
+            # We need to shorten the text! We can't really reason about it
+            # anymore at this point because we could have URLs inside the
+            # text that don't measure the correct number of characters
+            # (such as with Twitter's URL shortening). Let's just start
+            # again with a limit that's our max limit, minus the room
+            # needed to add the link to the post.
             if not do_add_url and add_url == 'auto' and url:
                 do_add_url = True
                 limit -= 1 + ctx.url_flattener.measureUrl(url)
 
-            if limit <= 0:
-                raise Exception("Can't shorten post name.")
+            ctx = HtmlStrippingContext()
+            ctx.limit = limit
+            if url_flattener:
+                ctx.url_flattener = url_flattener
+            name = get_best_text(entry, ctx)
 
-            name = textwrap.shorten(name, width=limit, placeholder="...")
-
+    # Actually add the url to the original post now.
     if do_add_url and url:
         name += ' ' + url
     return name
@@ -64,12 +72,43 @@
 
 
 class HtmlStrippingContext:
     def __init__(self):
-        self.url_mode = URLMODE_BOTTOM_LIST
+        # Mode for inserting URLs
+        self.url_mode = URLMODE_LAST
+        # List of URLs to insert
         self.urls = []
+        # Indices of URLs that should not get a leading whitespace
         self.nosp_urls = []
+        # Object that can measure and shorten URLs
         self.url_flattener = _NullUrlFlattener()
+        # Limit for how long the text can be
+        self.limit = -1
+
+        # Accumulated text length when accounting for shortened URLs
         self.text_length = 0
+        # Whether limit was reached
+        self.limit_reached = False
+    def processText(self, txt, allow_shorten=True):
+        added_len = len(txt)
+        next_text_length = self.text_length + added_len
+        if self.limit <= 0 or next_text_length <= self.limit:
+            self.text_length = next_text_length
+            return txt
+
+        if allow_shorten:
+            max_allowed = self.limit - self.text_length
+            short_txt = textwrap.shorten(
+                txt,
+                width=max_allowed,
+                expand_tabs=False,
+                replace_whitespace=False,
+                placeholder="...")
+            self.text_length += len(short_txt)
+            self.limit_reached = True
+            return short_txt
+        else:
+            self.limit_reached = True
+            return ''
 
 
 def get_best_text(entry, ctx=None, *, plain=True):
@@ -102,8 +141,10 @@
 
     # If URLs are inline, insert them where we left our marker. If not, replace
     # our markers with an empty string and append the URLs at the end.
+    # If we reached the limit with the text alone, replace URLs with empty
+    # strings and bail out.
     keys = ['url:%d' % i for i in range(len(ctx.urls))]
-    if ctx.url_mode == URLMODE_INLINE:
+    if not ctx.limit_reached and ctx.url_mode == URLMODE_INLINE:
         url_repl = [' ' + u for u in ctx.urls]
         # Some URLs didn't have any text to be placed next to, so for those
         # we don't need any extra space before.
@@ -113,12 +154,26 @@
     else:
         urls = dict(zip(keys, [''] * len(ctx.urls)))
     outtxt = outtxt % urls
+    if ctx.limit_reached:
+        return outtxt
     if ctx.url_mode != URLMODE_INLINE and ctx.urls:
-        outtxt = outtxt.rstrip()
         if ctx.url_mode == URLMODE_LAST:
-            outtxt += ' ' + ' '.join(ctx.urls)
+            # Don't add unnecessary whitespace.
+            # NOTE: our final measure of the text might be one character
+            # too long because of this, but that's desirable.
+            if outtxt[-1] not in string.whitespace:
+                outtxt += ' '
+            outtxt += ' '.join(ctx.urls)
         elif ctx.url_mode == URLMODE_BOTTOM_LIST:
-            outtxt += '\n' + '\n'.join(ctx.urls)
+            # If the last character of the text is a whitespace, replace
+            # it with a newline.
+            # NOTE: our final measure of the text might be one character
+            # too long because of this, but that's desirable.
+            if outtxt[-1] in string.whitespace:
+                outtxt = outtxt[:-1] + '\n'
+            else:
+                outtxt += '\n'
+            outtxt += '\n'.join(ctx.urls)
 
     # Add the length of URLs to the text length.
     for url in ctx.urls:
@@ -128,8 +183,8 @@
         # One space per URL except the explicitly no-space-urls.
         ctx.text_length += len(ctx.urls) - len(ctx.nosp_urls)
     else:
-        # One space or newline per URL, plus the first one.
-        ctx.text_length += len(ctx.urls) + 1
+        # One space or newline per URL.
+        ctx.text_length += len(ctx.urls)
 
     return outtxt
 
@@ -140,9 +195,27 @@
 
 def _do_strip_html(elem, ctx):
     if isinstance(elem, bs4.NavigableString):
-        raw_txt = str(elem)
-        ctx.text_length += len(raw_txt)
-        return _escape_percents(raw_txt)
+        # Don't necessarily include this bit of text...
+        # If it belongs to a paragraph, include it. If not, include it
+        # only if there are not paragraphs in its siblings (because that
+        # means this is the white-space between the paragraph tags)
+        include_this = False
+        for parent in elem.parents:
+            if parent and parent.name == 'p':
+                include_this = True
+                break
+        else:
+            next_sib = next(elem.next_siblings, None)
+            prev_sib = next(elem.previous_siblings, None)
+            if ((prev_sib is None or prev_sib.name != 'p') and
+                    (next_sib is None or next_sib.name != 'p')):
+                include_this = True
+
+        if include_this:
+            raw_txt = str(elem)
+            return _escape_percents(ctx.processText(raw_txt))
+        else:
+            return ''
 
     if elem.name == 'a':
         try:
@@ -160,7 +233,7 @@
            # flattener computed a custom text length. If not, do the
            # standard computation.
            if ctx.text_length == old_text_length:
-                return len(href_flattened)
-                ctx.text_length += len(href_flattened)
+                return ctx.processText(href_flattened, False)
            return href_flattened
 
        # If we have a simple hyperlink where the text is a substring of
@@ -187,8 +260,7 @@
            if c.name == 'li':
                outtxt += ('%s. ' % (i + 1)) + _do_strip_html(c, ctx)
                outtxt += '\n'
-        ctx.text_length += len(outtxt)
-        return outtxt
+        return ctx.processText(outtxt)
 
     if elem.name == 'ul':
         outtxt = ''
@@ -196,8 +268,7 @@
            if c.name == 'li':
                outtxt += '- ' + _do_strip_html(c, ctx)
                outtxt += '\n'
-        ctx.text_length += len(outtxt)
-        return outtxt
+        return ctx.processText(outtxt)
 
     return ''.join([_do_strip_html(c, ctx) for c in elem.children])
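The _do_strip_html change above keeps text nodes that live inside a
paragraph and drops bare text nodes whose previous or next sibling is a <p>
tag, i.e. the whitespace that sits between paragraphs. A small standalone
illustration (not silorider code) of where those stray nodes come from:

    import bs4

    soup = bs4.BeautifulSoup("<div><p>one</p>\n  <p>two</p></div>",
                             "html.parser")
    for node in soup.div.children:
        if isinstance(node, bs4.NavigableString):
            # The whitespace between the two <p> tags is its own text node.
            print(repr(str(node)))  # '\n  '
        else:
            print(node.name)        # 'p' (printed twice)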
--- a/tests/test_format.py Wed May 10 16:06:46 2023 -0700
+++ b/tests/test_format.py Wed May 10 16:10:12 2023 -0700
@@ -45,9 +45,9 @@
 
 
 @pytest.mark.parametrize("text, expected", [
-    ("<p>Something with <a href=\"http://example.org/blah\">a link</a>",
+    ("<p>Something with <a href=\"http://example.org/blah\">a link</a></p>",
      "Something with a link\nhttp://example.org/blah"),
-    ("<p>Something with a link <a href=\"http://example.org/blah\">http://example.org</a>", # NOQA
+    ("<p>Something with a link <a href=\"http://example.org/blah\">http://example.org</a></p>", # NOQA
      "Something with a link\nhttp://example.org/blah"),
     ("<p>Something with <a href=\"http://example.org/first\">one link here</a> and <a href=\"http://example.org/second\">another there</a>...</p>", # NOQA
      "Something with one link here and another there...\nhttp://example.org/first\nhttp://example.org/second") # NOQA
--- a/tests/test_silos_twitter.py Wed May 10 16:06:46 2023 -0700
+++ b/tests/test_silos_twitter.py Wed May 10 16:10:12 2023 -0700
@@ -51,7 +51,7 @@
     ctx, _ = cli.run('process')
     assert ctx.cache.wasPosted('test', '/01234.html')
     toot = ctx.silos[0].client.tweets[0]
-    assert toot == ("Hey @jack you should fix your stuff!", [])
+    assert toot == ("Hey @jack\nyou should fix your stuff!", [])
 
 
 def test_one_micropost_with_one_photo(cli, feedutil, tweetmock, monkeypatch):
@@ -96,6 +96,63 @@
         ['/fullimg1.jpg', '/fullimg2.jpg'])
 
 
+def test_micropost_with_long_text_and_link(cli, feedutil, tweetmock, monkeypatch):
+    feed = cli.createTempFeed(feedutil.makeFeed(
+        """<div class="p-name">
+        <p>This a pretty long text that has a link in it :) We want to make sure it gets to the limit of what Twitter allows, so that we can test there won't be any off-by-one errors in measurements. Here is a <a href="https://docs.python.org/3/library/textwrap.html">link to Python's textwrap module</a>, which is appropriate!!!</p>
+        </div>
+        <a class="u-url" href="/01234.html">permalink</a>"""
+    ))
+
+    cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
+    tweetmock.installTokens(cli, 'test')
+
+    ctx, _ = cli.run('process')
+    assert ctx.cache.wasPosted('test', '/01234.html')
+    toot = ctx.silos[0].client.tweets[0]
+    assert toot == ("This a pretty long text that has a link in it :) We want to make sure it gets to the limit of what Twitter allows, so that we can test there won't be any off-by-one errors in measurements. Here is a link to Python's textwrap module, which is appropriate!!! https://docs.python.org/3/library/textwrap.html",
+                    [])
+
+
+def test_micropost_with_too_long_text_and_link_1(cli, feedutil, tweetmock, monkeypatch):
+    feed = cli.createTempFeed(feedutil.makeFeed(
+        """<div class="p-name">
+        <p>This time we have a text that's slightly too long, with <a href="https://thisdoesntmatter.com">a link here</a>. We'll be one character too long, with a short word at the end to test the shortening algorithm. Otherwise, don't worry about it. Blah blah blah. Trying to get to the limit. Almost here yes</p>
+        </div>
+        <a class="u-url" href="/01234.html">permalink</a>"""
+    ))
+
+    cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
+    tweetmock.installTokens(cli, 'test')
+
+    ctx, _ = cli.run('process')
+    assert ctx.cache.wasPosted('test', '/01234.html')
+    toot = ctx.silos[0].client.tweets[0]
+    assert toot == ("This time we have a text that's slightly too long, with a link here. We'll be one character too long, with a short word at the end to test the shortening algorithm. Otherwise, don't worry about it. Blah blah blah. Trying to get to the limit. Almost here... /01234.html",
+                    [])
+
+
+def test_micropost_with_too_long_text_and_link_2(cli, feedutil, tweetmock, monkeypatch):
+    feed = cli.createTempFeed(feedutil.makeFeed(
+        """<div class="p-name">
+        <p>This time we have a text that's slightly too long, with <a href="https://thisdoesntmatter.com">a link here</a>. We'll be one character too long, with a loooooong word at the end to test the shortening algorithm. Otherwise, don't worry about it. Blah blah blah. Our long word is: califragilisticastuff</p>
+        </div>
+        <a class="u-url" href="/01234.html">permalink</a>"""
+    ))
+
+    cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
+    tweetmock.installTokens(cli, 'test')
+
+    ctx, _ = cli.run('process')
+    assert ctx.cache.wasPosted('test', '/01234.html')
+    toot = ctx.silos[0].client.tweets[0]
+    assert toot == ("This time we have a text that's slightly too long, with a link here. We'll be one character too long, with a loooooong word at the end to test the shortening algorithm. Otherwise, don't worry about it. Blah blah blah. Our long word is:... /01234.html",
+                    [])
+
+
 @pytest.fixture(scope='session')
 def tweetmock():
     from silorider.silos.twitter import TwitterSilo