changeset 33:9e4eb3f2754e

Improve handling of character limits in HTML stripping. The code now more closely keeps track of character counts during HTML stripping, and should be absolutely exact. When the limit is exceeded, it now restarts the stripping without any URLs to prevent incorrect trimming. It also better preserves whitespace in the original post. New tests are added for the Twitter silo to ensure it works as expected.
author Ludovic Chabant <ludovic@chabant.com>
date Wed, 10 May 2023 16:10:12 -0700
parents 2265920c4688
children 8c513e43673d
files silorider/format.py tests/test_format.py tests/test_silos_twitter.py
diffstat 3 files changed, 154 insertions(+), 26 deletions(-) [+]
line wrap: on
line diff
--- a/silorider/format.py	Wed May 10 16:06:46 2023 -0700
+++ b/silorider/format.py	Wed May 10 16:10:12 2023 -0700
@@ -1,4 +1,5 @@
 import re
+import string
 import textwrap
 import bs4
 from .config import has_lxml
@@ -10,6 +11,7 @@
     ctx = HtmlStrippingContext()
     if url_flattener:
         ctx.url_flattener = url_flattener
+    # Don't add the limit yet.
 
     name = get_best_text(entry, ctx)
     if not name:
@@ -20,23 +22,29 @@
     if limit:
         text_length = ctx.text_length
         if do_add_url and url:
+            # We need to add the URL at the end of the post, so account
+            # for it plus a space by making the text length limit smaller.
             limit -= 1 + ctx.url_flattener.measureUrl(url)
 
         shortened = text_length > limit
         if shortened:
-            # If we have to shorten the text, but we haven't taken the
-            # URL into account yet, let's see if we have to include it now!
-            # (this happens when we only want to include it when the text
-            #  is shortened)
+            # We need to shorten the text! We can't really reason about it
+            # anymore at this point because we could have URLs inside the
+            # text that don't measure the correct number of characters
+            # (such as with Twitter's URL shortening). Let's just start
+            # again with a limit that's our max limit, minus the room
+            # needed to add the link to the post.
             if not do_add_url and add_url == 'auto' and url:
                 do_add_url = True
                 limit -= 1 + ctx.url_flattener.measureUrl(url)
 
-        if limit <= 0:
-            raise Exception("Can't shorten post name.")
+            ctx = HtmlStrippingContext()
+            ctx.limit = limit
+            if url_flattener:
+                ctx.url_flattener = url_flattener
+            name = get_best_text(entry, ctx)
 
-        name = textwrap.shorten(name, width=limit, placeholder="...")
-
+    # Actually add the url to the original post now.
     if do_add_url and url:
         name += ' ' + url
     return name
@@ -64,12 +72,43 @@
 
 class HtmlStrippingContext:
     def __init__(self):
-        self.url_mode = URLMODE_BOTTOM_LIST
+        # Mode for inserting URLs
+        self.url_mode = URLMODE_LAST
+        # List of URLs to insert
         self.urls = []
+        # Indices of URLs that should not get a leading whitespace
         self.nosp_urls = []
+        # Object that can measure and shorten URLs
         self.url_flattener = _NullUrlFlattener()
+        # Limit for how long the text can be
+        self.limit = -1
+
+        # Accumulated text length when accounting for shortened URLs
         self.text_length = 0
+        # Whether limit was reached
+        self.limit_reached = False
 
+    def processText(self, txt, allow_shorten=True):
+        added_len = len(txt)
+        next_text_length = self.text_length + added_len
+        if self.limit <= 0 or next_text_length <= self.limit:
+            self.text_length = next_text_length
+            return txt
+
+        if allow_shorten:
+            max_allowed = self.limit - self.text_length
+            short_txt = textwrap.shorten(
+                txt,
+                width=max_allowed,
+                expand_tabs=False,
+                replace_whitespace=False,
+                placeholder="...")
+            self.text_length += len(short_txt)
+            self.limit_reached = True
+            return short_txt
+        else:
+            self.limit_reached = True
+            return ''
 
 
 def get_best_text(entry, ctx=None, *, plain=True):
@@ -102,8 +141,10 @@
 
     # If URLs are inline, insert them where we left our marker. If not, replace
     # our markers with an empty string and append the URLs at the end.
+    # If we reached the limit with the text alone, replace URLs with empty
+    # strings and bail out.
     keys = ['url:%d' % i for i in range(len(ctx.urls))]
-    if ctx.url_mode == URLMODE_INLINE:
+    if not ctx.limit_reached and ctx.url_mode == URLMODE_INLINE:
         url_repl = [' ' + u for u in ctx.urls]
         # Some URLs didn't have any text to be placed next to, so for those
         # we don't need any extra space before.
@@ -113,12 +154,26 @@
     else:
         urls = dict(zip(keys, [''] * len(ctx.urls)))
     outtxt = outtxt % urls
+    if ctx.limit_reached:
+        return outtxt
     if ctx.url_mode != URLMODE_INLINE and ctx.urls:
-        outtxt = outtxt.rstrip()
         if ctx.url_mode == URLMODE_LAST:
-            outtxt += ' ' + ' '.join(ctx.urls)
+            # Don't add unnecessary whitespace.
+            # NOTE: our final measure of the text might be one character
+            #       too long because of this, but that's desirable.
+            if outtxt[-1] not in string.whitespace:
+                outtxt += ' '
+            outtxt += ' '.join(ctx.urls)
         elif ctx.url_mode == URLMODE_BOTTOM_LIST:
-            outtxt += '\n' + '\n'.join(ctx.urls)
+            # If the last character of the text is a whitespace, replace
+            # it with a newline.
+            # NOTE: our final measure of the text might be one character
+            #       too long because of this, but that's desirable.
+            if outtxt[-1] in string.whitespace:
+                outtxt = outtxt[:-1] + '\n'
+            else:
+                outtxt += '\n'
+            outtxt += '\n'.join(ctx.urls)
 
     # Add the length of URLs to the text length.
     for url in ctx.urls:
@@ -128,8 +183,8 @@
         # One space per URL except the explicitly no-space-urls.
         ctx.text_length += len(ctx.urls) - len(ctx.nosp_urls)
     else:
-        # One space or newline per URL, plus the first one.
-        ctx.text_length += len(ctx.urls) + 1
+        # One space or newline per URL.
+        ctx.text_length += len(ctx.urls)
 
     return outtxt
 
@@ -140,9 +195,27 @@
 
 def _do_strip_html(elem, ctx):
     if isinstance(elem, bs4.NavigableString):
-        raw_txt = str(elem)
-        ctx.text_length += len(raw_txt)
-        return _escape_percents(raw_txt)
+        # Don't necessarily include this bit of text...
+        # If it belongs to a paragraph, include it. If not, include it
+        # only if there are not paragraphs in its siblings (because that
+        # means this is the white-space between the paragraph tags)
+        include_this = False
+        for parent in elem.parents:
+            if parent and parent.name == 'p':
+                include_this = True
+                break
+        else:
+            next_sib = next(elem.next_siblings, None)
+            prev_sib = next(elem.previous_siblings, None)
+            if ((prev_sib is None or prev_sib.name != 'p') and
+                (next_sib is None or next_sib.name != 'p')):
+                include_this = True
+
+        if include_this:
+            raw_txt = str(elem)
+            return _escape_percents(ctx.processText(raw_txt))
+        else:
+            return ''
 
     if elem.name == 'a':
         try:
@@ -160,7 +233,7 @@
                 # flattener computed a custom text length. If not, do the
                 # standard computation.
                 if ctx.text_length == old_text_length:
-                    ctx.text_length += len(href_flattened)
+                    return ctx.processText(href_flattened, False)
                 return href_flattened
 
             # If we have a simple hyperlink where the text is a substring of
@@ -187,8 +260,7 @@
             if c.name == 'li':
                 outtxt += ('%s. ' % (i + 1)) + _do_strip_html(c, ctx)
                 outtxt += '\n'
-        ctx.text_length += len(outtxt)
-        return outtxt
+        return ctx.processText(outtxt)
 
     if elem.name == 'ul':
         outtxt = ''
@@ -196,8 +268,7 @@
             if c.name == 'li':
                 outtxt += '- ' + _do_strip_html(c, ctx)
                 outtxt += '\n'
-        ctx.text_length += len(outtxt)
-        return outtxt
+        return ctx.processText(outtxt)
 
     return ''.join([_do_strip_html(c, ctx) for c in elem.children])
 
--- a/tests/test_format.py	Wed May 10 16:06:46 2023 -0700
+++ b/tests/test_format.py	Wed May 10 16:10:12 2023 -0700
@@ -45,9 +45,9 @@
 
 
 @pytest.mark.parametrize("text, expected", [
-    ("<p>Something with <a href=\"http://example.org/blah\">a link</a>",
+    ("<p>Something with <a href=\"http://example.org/blah\">a link</a></p>",
      "Something with a link\nhttp://example.org/blah"),
-    ("<p>Something with a link <a href=\"http://example.org/blah\">http://example.org</a>",  # NOQA
+    ("<p>Something with a link <a href=\"http://example.org/blah\">http://example.org</a></p>",  # NOQA
      "Something with a link\nhttp://example.org/blah"),
     ("<p>Something with <a href=\"http://example.org/first\">one link here</a> and <a href=\"http://example.org/second\">another there</a>...</p>",  # NOQA
      "Something with one link here and another there...\nhttp://example.org/first\nhttp://example.org/second")  # NOQA
--- a/tests/test_silos_twitter.py	Wed May 10 16:06:46 2023 -0700
+++ b/tests/test_silos_twitter.py	Wed May 10 16:10:12 2023 -0700
@@ -51,7 +51,7 @@
     ctx, _ = cli.run('process')
     assert ctx.cache.wasPosted('test', '/01234.html')
     toot = ctx.silos[0].client.tweets[0]
-    assert toot == ("Hey @jack you should fix your stuff!", [])
+    assert toot == ("Hey @jack\nyou should fix your stuff!", [])
 
 
 def test_one_micropost_with_one_photo(cli, feedutil, tweetmock, monkeypatch):
@@ -96,6 +96,63 @@
                     ['/fullimg1.jpg', '/fullimg2.jpg'])
 
 
+def test_micropost_with_long_text_and_link(cli, feedutil, tweetmock, monkeypatch):
+    feed = cli.createTempFeed(feedutil.makeFeed(
+		"""<div class="p-name">
+	<p>This a pretty long text that has a link in it :) We want to make sure it gets to the limit of what Twitter allows, so that we can test there won't be any off-by-one errors in measurements. Here is a <a href="https://docs.python.org/3/library/textwrap.html">link to Python's textwrap module</a>, which is appropriate!!!</p>
+	</div>
+	<a class="u-url" href="/01234.html">permalink</a>"""
+    ))
+
+    cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
+    tweetmock.installTokens(cli, 'test')
+
+    ctx, _ = cli.run('process')
+    assert ctx.cache.wasPosted('test', '/01234.html')
+    toot = ctx.silos[0].client.tweets[0]
+    assert toot == ("This a pretty long text that has a link in it :) We want to make sure it gets to the limit of what Twitter allows, so that we can test there won't be any off-by-one errors in measurements. Here is a link to Python's textwrap module, which is appropriate!!! https://docs.python.org/3/library/textwrap.html",
+		[])
+
+
+def test_micropost_with_too_long_text_and_link_1(cli, feedutil, tweetmock, monkeypatch):
+    feed = cli.createTempFeed(feedutil.makeFeed(
+		"""<div class="p-name">
+	<p>This time we have a text that's slightly too long, with <a href="https://thisdoesntmatter.com">a link here</a>. We'll be one character too long, with a short word at the end to test the shortening algorithm. Otherwise, don't worry about it. Blah blah blah. Trying to get to the limit. Almost here yes</p>
+	</div>
+	<a class="u-url" href="/01234.html">permalink</a>"""
+    ))
+
+    cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
+    tweetmock.installTokens(cli, 'test')
+
+    ctx, _ = cli.run('process')
+    assert ctx.cache.wasPosted('test', '/01234.html')
+    toot = ctx.silos[0].client.tweets[0]
+    assert toot == ("This time we have a text that's slightly too long, with a link here. We'll be one character too long, with a short word at the end to test the shortening algorithm. Otherwise, don't worry about it. Blah blah blah. Trying to get to the limit. Almost here... /01234.html",
+		[])
+
+
+def test_micropost_with_too_long_text_and_link_2(cli, feedutil, tweetmock, monkeypatch):
+    feed = cli.createTempFeed(feedutil.makeFeed(
+		"""<div class="p-name">
+	<p>This time we have a text that's slightly too long, with <a href="https://thisdoesntmatter.com">a link here</a>. We'll be one character too long, with a loooooong word at the end to test the shortening algorithm. Otherwise, don't worry about it. Blah blah blah. Our long word is: califragilisticastuff</p>
+	</div>
+	<a class="u-url" href="/01234.html">permalink</a>"""
+    ))
+
+    cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
+    tweetmock.installTokens(cli, 'test')
+
+    ctx, _ = cli.run('process')
+    assert ctx.cache.wasPosted('test', '/01234.html')
+    toot = ctx.silos[0].client.tweets[0]
+    assert toot == ("This time we have a text that's slightly too long, with a link here. We'll be one character too long, with a loooooong word at the end to test the shortening algorithm. Otherwise, don't worry about it. Blah blah blah. Our long word is:... /01234.html",
+		[])
+
+
 @pytest.fixture(scope='session')
 def tweetmock():
     from silorider.silos.twitter import TwitterSilo