changeset 25:fb93d3fbff4e

Support transforming twitter profile URLs into mentions.
author Ludovic Chabant <ludovic@chabant.com>
date Sun, 08 Sep 2019 16:11:26 -0700
parents 2d064ee0c5e6
children c8e0d4c12f92
files silorider/format.py silorider/silos/twitter.py tests/test_format.py tests/test_silos_twitter.py
diffstat 4 files changed, 76 insertions(+), 13 deletions(-) [+]
line wrap: on
line diff
--- a/silorider/format.py	Sun Sep 08 16:09:40 2019 -0700
+++ b/silorider/format.py	Sun Sep 08 16:11:26 2019 -0700
@@ -5,9 +5,9 @@
 from .config import has_lxml
 
 
-def format_entry(entry, limit=None, add_url='auto'):
+def format_entry(entry, limit=None, add_url='auto', url_flattener=None):
     url = entry.url
-    name = get_best_text(entry)
+    name = get_best_text(entry, url_flattener=url_flattener)
     if not name:
         raise Exception("Can't find best text for entry: %s" % url)
 
@@ -37,7 +37,12 @@
     return name
 
 
-def get_best_text(entry, *, plain=True, inline_urls=True):
+class UrlFlattener:  # No-op URL flattener: never rewrites a hyperlink.
+    def replaceHref(self, text, url):  # text: link text; url: parsed href.
+        return None  # None makes the caller keep its own link handling.
+
+
+def get_best_text(entry, *, plain=True, inline_urls=True, url_flattener=None):
     elem = entry.htmlFind(class_='p-title')
     if not elem:
         elem = entry.htmlFind(class_='p-name')
@@ -48,14 +53,20 @@
         if not plain:
             text = '\n'.join([str(c) for c in elem.contents])
             return str(text)
-        return strip_html(elem, inline_urls=inline_urls)
+        return strip_html(elem, inline_urls=inline_urls,
+                          url_flattener=url_flattener)
 
     return None
 
 
-def strip_html(bs_elem, *, inline_urls=True):
+def strip_html(bs_elem, *, inline_urls=True, url_flattener=None):
+    if isinstance(bs_elem, str):
+        bs_elem = bs4.BeautifulSoup(bs_elem,
+                                    'lxml' if has_lxml else 'html5lib')
+
     outtxt = ''
     ctx = _HtmlStripping()
+    ctx.url_flattener = url_flattener
     for c in bs_elem.children:
         outtxt += _do_strip_html(c, ctx)
 
@@ -73,6 +84,7 @@
 class _HtmlStripping:
     def __init__(self):
         self.urls = []
+        self.url_flattener = None
 
 
 def _escape_percents(txt):
@@ -92,8 +104,20 @@
         if len(cnts) == 1:
             href_txt = cnts[0].string
             if href_txt in href:
+                # If we have a simple hyperlink where the text is a
+                # substring of the target URL, just return the URL.
                 return _escape_percents(href)
 
+            if ctx.url_flattener:
+                # Use a URL flattener if we have one.
+                href_parsed = urllib.parse.urlparse(href)
+                href_flattened = ctx.url_flattener.replaceHref(
+                    href_txt, href_parsed)
+                if href_flattened is not None:
+                    return href_flattened
+
+        # No easy way to simplify this hyperlink... let's put a marker
+        # for the URL to be later replaced in the text.
         a_txt = ''.join([_do_strip_html(c, ctx)
                          for c in cnts])
         a_txt += '%%(url:%d)s' % len(ctx.urls)
--- a/silorider/silos/twitter.py	Sun Sep 08 16:09:40 2019 -0700
+++ b/silorider/silos/twitter.py	Sun Sep 08 16:11:26 2019 -0700
@@ -67,7 +67,8 @@
             access_token_secret=access_secret)
 
     def postEntry(self, entry, ctx):
-        tweettxt = self.formatEntry(entry, limit=280)
+        tweettxt = self.formatEntry(entry, limit=280,
+                                    url_flattener=TwitterUrlFlattener())
         if not tweettxt:
             raise Exception("Can't find any content to use for the tweet!")
 
@@ -79,3 +80,20 @@
         tweettxt = self.formatEntry(entry, limit=280)
         logger.info("Tweet would be:")
         logger.info(tweettxt)
+
+
+TWITTER_NETLOCS = ['twitter.com', 'www.twitter.com']
+
+
+class TwitterUrlFlattener:  # Turns Twitter profile links into @mentions.
+    def replaceHref(self, text, url):  # url: urllib.parse.urlparse() result.
+        # Only links to Twitter itself qualify (hostnames ignore case).
+        if url.netloc.lower() not in TWITTER_NETLOCS:
+            return None
+
+        path = url.path.strip('/')  # Tolerate a trailing slash ("/jack/").
+        # A profile URL has exactly one, non-empty, path segment.
+        if path and '/' not in path:
+            return '@' + path
+
+        return None  # Bare domain or deeper path: leave the link untouched.
--- a/tests/test_format.py	Sun Sep 08 16:09:40 2019 -0700
+++ b/tests/test_format.py	Sun Sep 08 16:11:26 2019 -0700
@@ -5,15 +5,19 @@
 test_url = 'https://example.org/article'
 
 
-class TestEntry:
-    pass
-
+def _make_test_entry(best_name, is_micropost):
+    class TestEntry:
+        def __init__(self):
+            self.is_micropost = is_micropost
+            self.url = test_url
 
-def _make_test_entry(best_name, is_micropost):
+        def get(self, _):
+            return best_name
+
+        def htmlFind(self, *args, **kwargs):
+            return best_name
+
     entry = TestEntry()
-    entry.get = lambda n: best_name
-    entry.is_micropost = is_micropost
-    entry.url = test_url
     return entry
 
 
--- a/tests/test_silos_twitter.py	Sun Sep 08 16:09:40 2019 -0700
+++ b/tests/test_silos_twitter.py	Sun Sep 08 16:11:26 2019 -0700
@@ -37,6 +37,23 @@
     assert toot == ("This is a quick update.", [])
 
 
+def test_one_micropost_with_mention(cli, feedutil, tweetmock):
+    feed = cli.createTempFeed(feedutil.makeFeed(
+        """<p class="p-name">Hey <a href="https://twitter.com/jack">Jacky</a>
+you should fix your stuff!</p>
+<a class="u-url" href="/01234.html">permalink</a>"""
+    ))  # The entry links the text "Jacky" to the twitter.com/jack profile.
+
+    cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
+    tweetmock.installTokens(cli, 'test')
+
+    ctx, _ = cli.run('process')
+    assert ctx.cache.wasPosted('test', '/01234.html')
+    toot = ctx.silos[0].client.tweets[0]  # Presumably the first mock tweet.
+    assert toot == ("Hey @jack you should fix your stuff!", [])  # link -> @jack
+
+
 def test_one_micropost_with_one_photo(cli, feedutil, tweetmock, monkeypatch):
     feed = cli.createTempFeed(feedutil.makeFeed(
         """<p class="p-name">This is a quick photo update.</p>