changeset 33:9e4eb3f2754e

Improve handling of character limits in HTML stripping. The code now more closely keeps track of character counts during HTML stripping, and should be absolutely exact. When the limit is exceeded, it now restarts the stripping without any URLs to prevent incorrect trimming. It also better preserves whitespace in the original post. New tests are added for the Twitter silo to ensure it works as expected.
author Ludovic Chabant <ludovic@chabant.com>
date Wed, 10 May 2023 16:10:12 -0700
parents 2265920c4688
children 8c513e43673d
files silorider/format.py tests/test_format.py tests/test_silos_twitter.py
diffstat 3 files changed, 154 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/silorider/format.py	Wed May 10 16:06:46 2023 -0700
+++ b/silorider/format.py	Wed May 10 16:10:12 2023 -0700
@@ -1,4 +1,5 @@
 import re
+import string
 import textwrap
 import bs4
 from .config import has_lxml
@@ -10,6 +11,7 @@
     ctx = HtmlStrippingContext()
     if url_flattener:
         ctx.url_flattener = url_flattener
+    # Don't add the limit yet.
 
     name = get_best_text(entry, ctx)
     if not name:
@@ -20,23 +22,29 @@
     if limit:
         text_length = ctx.text_length
         if do_add_url and url:
+            # We need to add the URL at the end of the post, so account
+            # for it plus a space by making the text length limit smaller.
             limit -= 1 + ctx.url_flattener.measureUrl(url)
 
         shortened = text_length > limit
         if shortened:
-            # If we have to shorten the text, but we haven't taken the
-            # URL into account yet, let's see if we have to include it now!
-            # (this happens when we only want to include it when the text
-            #  is shortened)
+            # We need to shorten the text! We can't really reason about it
+            # anymore at this point because we could have URLs inside the
+            # text that don't measure the correct number of characters
+            # (such as with Twitter's URL shortening). Let's just start
+            # again with a limit that's our max limit, minus the room
+            # needed to add the link to the post.
             if not do_add_url and add_url == 'auto' and url:
                 do_add_url = True
                 limit -= 1 + ctx.url_flattener.measureUrl(url)
 
-        if limit <= 0:
-            raise Exception("Can't shorten post name.")
+            ctx = HtmlStrippingContext()
+            ctx.limit = limit
+            if url_flattener:
+                ctx.url_flattener = url_flattener
+            name = get_best_text(entry, ctx)
 
-        name = textwrap.shorten(name, width=limit, placeholder="...")
-
+    # Actually add the url to the original post now.
     if do_add_url and url:
         name += ' ' + url
     return name
@@ -64,12 +72,43 @@
 
 class HtmlStrippingContext:
     def __init__(self):
-        self.url_mode = URLMODE_BOTTOM_LIST
+        # Mode for inserting URLs
+        self.url_mode = URLMODE_LAST
+        # List of URLs to insert
         self.urls = []
+        # Indices of URLs that should not get a leading whitespace
         self.nosp_urls = []
+        # Object that can measure and shorten URLs
         self.url_flattener = _NullUrlFlattener()
+        # Limit for how long the text can be
+        self.limit = -1
+
+        # Accumulated text length when accounting for shortened URLs
         self.text_length = 0
+        # Whether limit was reached
+        self.limit_reached = False
 
+    def processText(self, txt, allow_shorten=True):
+        added_len = len(txt)
+        next_text_length = self.text_length + added_len
+        if self.limit <= 0 or next_text_length <= self.limit:
+            self.text_length = next_text_length
+            return txt
+
+        if allow_shorten:
+            max_allowed = self.limit - self.text_length
+            short_txt = textwrap.shorten(
+                txt,
+                width=max_allowed,
+                expand_tabs=False,
+                replace_whitespace=False,
+                placeholder="...")
+            self.text_length += len(short_txt)
+            self.limit_reached = True
+            return short_txt
+        else:
+            self.limit_reached = True
+            return ''
 
 
 def get_best_text(entry, ctx=None, *, plain=True):
@@ -102,8 +141,10 @@
 
     # If URLs are inline, insert them where we left our marker. If not, replace
     # our markers with an empty string and append the URLs at the end.
+    # If we reached the limit with the text alone, replace URLs with empty
+    # strings and bail out.
     keys = ['url:%d' % i for i in range(len(ctx.urls))]
-    if ctx.url_mode == URLMODE_INLINE:
+    if not ctx.limit_reached and ctx.url_mode == URLMODE_INLINE:
         url_repl = [' ' + u for u in ctx.urls]
         # Some URLs didn't have any text to be placed next to, so for those
         # we don't need any extra space before.
@@ -113,12 +154,26 @@
     else:
         urls = dict(zip(keys, [''] * len(ctx.urls)))
     outtxt = outtxt % urls
+    if ctx.limit_reached:
+        return outtxt
     if ctx.url_mode != URLMODE_INLINE and ctx.urls:
-        outtxt = outtxt.rstrip()
         if ctx.url_mode == URLMODE_LAST:
-            outtxt += ' ' + ' '.join(ctx.urls)
+            # Don't add unnecessary whitespace.
+            # NOTE: our final measure of the text might be one character
+            #       too long because of this, but that's desirable.
+            if outtxt[-1] not in string.whitespace:
+                outtxt += ' '
+            outtxt += ' '.join(ctx.urls)
         elif ctx.url_mode == URLMODE_BOTTOM_LIST:
-            outtxt += '\n' + '\n'.join(ctx.urls)
+            # If the last character of the text is a whitespace, replace
+            # it with a newline.
+            # NOTE: our final measure of the text might be one character
+            #       too long because of this, but that's desirable.
+            if outtxt[-1] in string.whitespace:
+                outtxt = outtxt[:-1] + '\n'
+            else:
+                outtxt += '\n'
+            outtxt += '\n'.join(ctx.urls)
 
     # Add the length of URLs to the text length.
     for url in ctx.urls:
@@ -128,8 +183,8 @@
         # One space per URL except the explicitly no-space-urls.
         ctx.text_length += len(ctx.urls) - len(ctx.nosp_urls)
     else:
-        # One space or newline per URL, plus the first one.
-        ctx.text_length += len(ctx.urls) + 1
+        # One space or newline per URL.
+        ctx.text_length += len(ctx.urls)
 
     return outtxt
 
@@ -140,9 +195,27 @@
 
 def _do_strip_html(elem, ctx):
     if isinstance(elem, bs4.NavigableString):
-        raw_txt = str(elem)
-        ctx.text_length += len(raw_txt)
-        return _escape_percents(raw_txt)
+        # Don't necessarily include this bit of text...
+        # If it belongs to a paragraph, include it. If not, include it
+        # only if there are not paragraphs in its siblings (because that
+        # means this is the white-space between the paragraph tags)
+        include_this = False
+        for parent in elem.parents:
+            if parent and parent.name == 'p':
+                include_this = True
+                break
+        else:
+            next_sib = next(elem.next_siblings, None)
+            prev_sib = next(elem.previous_siblings, None)
+            if ((prev_sib is None or prev_sib.name != 'p') and
+                (next_sib is None or next_sib.name != 'p')):
+                include_this = True
+
+        if include_this:
+            raw_txt = str(elem)
+            return _escape_percents(ctx.processText(raw_txt))
+        else:
+            return ''
 
     if elem.name == 'a':
         try:
@@ -160,7 +233,7 @@
                 # flattener computed a custom text length. If not, do the
                 # standard computation.
                 if ctx.text_length == old_text_length:
-                    ctx.text_length += len(href_flattened)
+                    return ctx.processText(href_flattened, False)
                 return href_flattened
 
             # If we have a simple hyperlink where the text is a substring of
@@ -187,8 +260,7 @@
             if c.name == 'li':
                 outtxt += ('%s. ' % (i + 1)) + _do_strip_html(c, ctx)
                 outtxt += '\n'
-        ctx.text_length += len(outtxt)
-        return outtxt
+        return ctx.processText(outtxt)
 
     if elem.name == 'ul':
         outtxt = ''
@@ -196,8 +268,7 @@
             if c.name == 'li':
                 outtxt += '- ' + _do_strip_html(c, ctx)
                 outtxt += '\n'
-        ctx.text_length += len(outtxt)
-        return outtxt
+        return ctx.processText(outtxt)
 
     return ''.join([_do_strip_html(c, ctx) for c in elem.children])
 
--- a/tests/test_format.py	Wed May 10 16:06:46 2023 -0700
+++ b/tests/test_format.py	Wed May 10 16:10:12 2023 -0700
@@ -45,9 +45,9 @@
 
 
 @pytest.mark.parametrize("text, expected", [
-    ("<p>Something with <a href=\"http://example.org/blah\">a link</a>",
+    ("<p>Something with <a href=\"http://example.org/blah\">a link</a></p>",
      "Something with a link\nhttp://example.org/blah"),
-    ("<p>Something with a link <a href=\"http://example.org/blah\">http://example.org</a>",  # NOQA
+    ("<p>Something with a link <a href=\"http://example.org/blah\">http://example.org</a></p>",  # NOQA
      "Something with a link\nhttp://example.org/blah"),
     ("<p>Something with <a href=\"http://example.org/first\">one link here</a> and <a href=\"http://example.org/second\">another there</a>...</p>",  # NOQA
      "Something with one link here and another there...\nhttp://example.org/first\nhttp://example.org/second")  # NOQA
--- a/tests/test_silos_twitter.py	Wed May 10 16:06:46 2023 -0700
+++ b/tests/test_silos_twitter.py	Wed May 10 16:10:12 2023 -0700
@@ -51,7 +51,7 @@
     ctx, _ = cli.run('process')
     assert ctx.cache.wasPosted('test', '/01234.html')
     toot = ctx.silos[0].client.tweets[0]
-    assert toot == ("Hey @jack you should fix your stuff!", [])
+    assert toot == ("Hey @jack\nyou should fix your stuff!", [])
 
 
 def test_one_micropost_with_one_photo(cli, feedutil, tweetmock, monkeypatch):
@@ -96,6 +96,63 @@
                     ['/fullimg1.jpg', '/fullimg2.jpg'])
 
 
+def test_micropost_with_long_text_and_link(cli, feedutil, tweetmock, monkeypatch):
+    feed = cli.createTempFeed(feedutil.makeFeed(
+		"""<div class="p-name">
+	<p>This a pretty long text that has a link in it :) We want to make sure it gets to the limit of what Twitter allows, so that we can test there won't be any off-by-one errors in measurements. Here is a <a href="https://docs.python.org/3/library/textwrap.html">link to Python's textwrap module</a>, which is appropriate!!!</p>
+	</div>
+	<a class="u-url" href="/01234.html">permalink</a>"""
+    ))
+
+    cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
+    tweetmock.installTokens(cli, 'test')
+
+    ctx, _ = cli.run('process')
+    assert ctx.cache.wasPosted('test', '/01234.html')
+    toot = ctx.silos[0].client.tweets[0]
+    assert toot == ("This a pretty long text that has a link in it :) We want to make sure it gets to the limit of what Twitter allows, so that we can test there won't be any off-by-one errors in measurements. Here is a link to Python's textwrap module, which is appropriate!!! https://docs.python.org/3/library/textwrap.html",
+		[])
+
+
+def test_micropost_with_too_long_text_and_link_1(cli, feedutil, tweetmock, monkeypatch):
+    feed = cli.createTempFeed(feedutil.makeFeed(
+		"""<div class="p-name">
+	<p>This time we have a text that's slightly too long, with <a href="https://thisdoesntmatter.com">a link here</a>. We'll be one character too long, with a short word at the end to test the shortening algorithm. Otherwise, don't worry about it. Blah blah blah. Trying to get to the limit. Almost here yes</p>
+	</div>
+	<a class="u-url" href="/01234.html">permalink</a>"""
+    ))
+
+    cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
+    tweetmock.installTokens(cli, 'test')
+
+    ctx, _ = cli.run('process')
+    assert ctx.cache.wasPosted('test', '/01234.html')
+    toot = ctx.silos[0].client.tweets[0]
+    assert toot == ("This time we have a text that's slightly too long, with a link here. We'll be one character too long, with a short word at the end to test the shortening algorithm. Otherwise, don't worry about it. Blah blah blah. Trying to get to the limit. Almost here... /01234.html",
+		[])
+
+
+def test_micropost_with_too_long_text_and_link_2(cli, feedutil, tweetmock, monkeypatch):
+    feed = cli.createTempFeed(feedutil.makeFeed(
+		"""<div class="p-name">
+	<p>This time we have a text that's slightly too long, with <a href="https://thisdoesntmatter.com">a link here</a>. We'll be one character too long, with a loooooong word at the end to test the shortening algorithm. Otherwise, don't worry about it. Blah blah blah. Our long word is: califragilisticastuff</p>
+	</div>
+	<a class="u-url" href="/01234.html">permalink</a>"""
+    ))
+
+    cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
+    tweetmock.installTokens(cli, 'test')
+
+    ctx, _ = cli.run('process')
+    assert ctx.cache.wasPosted('test', '/01234.html')
+    toot = ctx.silos[0].client.tweets[0]
+    assert toot == ("This time we have a text that's slightly too long, with a link here. We'll be one character too long, with a loooooong word at the end to test the shortening algorithm. Otherwise, don't worry about it. Blah blah blah. Our long word is:... /01234.html",
+		[])
+
+
 @pytest.fixture(scope='session')
 def tweetmock():
     from silorider.silos.twitter import TwitterSilo