changeset 18:a921cc2306bc

Do our own HTML parsing/stripping of micropost contents.

- This lets us properly handle various forms of linking.
- Add tests for processing posts with links.
- Fix configuration in tests.
- Basic error handling for processing posts.
author Ludovic Chabant <ludovic@chabant.com>
date Sun, 16 Sep 2018 21:16:20 -0700
parents 678278cb85b1
children d3c4c5082bbc
files silorider/commands/process.py silorider/format.py silorider/parse.py silorider/silos/print.py silorider/silos/twitter.py tests/conftest.py tests/test_commands_populate.py tests/test_format.py tests/test_silos_mastodon.py tests/test_silos_twitter.py
diffstat 10 files changed, 221 insertions(+), 41 deletions(-) [+]
line wrap: on
line diff
--- a/silorider/commands/process.py	Mon Jul 30 18:19:04 2018 -0700
+++ b/silorider/commands/process.py	Sun Sep 16 21:16:20 2018 -0700
@@ -46,29 +46,37 @@
             silo.onPostEnd()
 
     def processEntry(self, entry):
-        if self.isEntryFiltered(entry):
-            logger.debug("Entry is filtered out: %s" % entry.best_name)
-            return
-
         entry_url = entry.get('url')
         if not entry_url:
             logger.warning("Found entry without a URL.")
             return
 
+        if self.isEntryFiltered(entry):
+            logger.debug("Entry is filtered out: %s" % entry_url)
+            return
+
         postctx = SiloPostingContext(self.ctx)
         no_cache = self.ctx.args.no_cache
-        logger.debug("Processing entry: %s" % entry.best_name)
+        logger.debug("Processing entry: %s" % entry_url)
         for silo in self.silos:
             if no_cache or not self.ctx.cache.wasPosted(silo.name, entry_url):
                 if not self.ctx.args.dry_run:
-                    silo.postEntry(entry, postctx)
-                    self.ctx.cache.addPost(silo.name, entry_url)
+                    try:
+                        did_post = silo.postEntry(entry, postctx)
+                    except Exception as ex:
+                        did_post = False
+                        logger.error("Error posting: %s" % entry_url)
+                        logger.error(ex)
+                        if self.ctx.args.verbose:
+                            raise
+                    if did_post is True or did_post is None:
+                        self.ctx.cache.addPost(silo.name, entry_url)
                 else:
                     logger.info("Would post entry on %s: %s" %
-                                (silo.name, entry.best_name))
+                                (silo.name, entry_url))
             else:
                 logger.debug("Skipping already posted entry on %s: %s" %
-                             (silo.name, entry.best_name))
+                             (silo.name, entry_url))
 
     def isEntryFiltered(self, entry):
         if not self.config.has_section('filter'):
--- a/silorider/format.py	Mon Jul 30 18:19:04 2018 -0700
+++ b/silorider/format.py	Sun Sep 16 21:16:20 2018 -0700
@@ -1,10 +1,18 @@
 import re
+import urllib.parse
 import textwrap
+import bs4
+
+try:
+    import lxml  # NOQA
+    _bs_parser = 'lxml'
+except ImportError:
+    _bs_parser = 'html.parser'
 
 
 def format_entry(entry, limit=None, add_url='auto'):
     url = entry.url
-    name = entry.best_name
+    name = get_best_text(entry)
 
     do_add_url = ((add_url is True) or
                   (add_url == 'auto' and not entry.is_micropost))
@@ -32,6 +40,91 @@
     return name
 
 
+def get_best_text(entry, *, plain=True, inline_urls=True):
+    text = entry.get('title')
+    if not text:
+        text = entry.get('name')
+        if not text:
+            text = entry.get('content')
+
+    if text:
+        if not plain:
+            return text
+        return strip_html(text, inline_urls=inline_urls)
+
+    return None
+
+
+def strip_html(txt, *, inline_urls=True):
+    outtxt = ''
+    ctx = _HtmlStripping()
+    soup = bs4.BeautifulSoup(txt, _bs_parser)
+    for c in soup.children:
+        outtxt += _do_strip_html(c, ctx)
+
+    keys = ['url:%d' % i for i in range(len(ctx.urls))]
+    if inline_urls:
+        urls = dict(zip(keys, [' ' + u for u in ctx.urls]))
+    else:
+        urls = dict(zip(keys, [''] * len(ctx.urls)))
+    outtxt = outtxt % urls
+    if not inline_urls and ctx.urls:
+        outtxt += '\n' + '\n'.join(ctx.urls)
+    return outtxt
+
+
+class _HtmlStripping:
+    def __init__(self):
+        self.urls = []
+
+
+def _do_strip_html(elem, ctx):
+    if isinstance(elem, bs4.NavigableString):
+        return str(elem)
+
+    if elem.name == 'a':
+        try:
+            href = elem['href']
+        except KeyError:
+            href = None
+        cnts = list(elem.contents)
+        if len(cnts) == 1:
+            href_txt = cnts[0].string
+            href_parsed = urllib.parse.urlparse(href)
+            # Collapse the link when its text is just a display form of the URL.
+            if href_txt in [
+                    href,
+                    href_parsed.netloc,
+                    '%s://%s' % (href_parsed.scheme, href_parsed.netloc),
+                    '%s://%s%s' % (href_parsed.scheme, href_parsed.netloc,
+                                   href_parsed.path)]:
+                return href
+
+        a_txt = ''.join([_do_strip_html(c, ctx)
+                         for c in cnts])
+        a_txt += '%%(url:%d)s' % len(ctx.urls)
+        ctx.urls.append(href)
+        return a_txt
+
+    if elem.name == 'ol':
+        outtxt = ''
+        for i, c in enumerate(elem.children):
+            if c.name == 'li':
+                outtxt += ('%s. ' % (i + 1)) + _do_strip_html(c, ctx)
+                outtxt += '\n'
+        return outtxt
+
+    if elem.name == 'ul':
+        outtxt = ''
+        for c in elem.children:
+            if c.name == 'li':
+                outtxt += '- ' + _do_strip_html(c, ctx)
+                outtxt += '\n'
+        return outtxt
+
+    return ''.join([_do_strip_html(c, ctx) for c in elem.children])
+
+
 re_sentence_end = re.compile(r'[\w\]\)\"\'\.]\.\s|[\?\!]\s')
 
 
--- a/silorider/parse.py	Mon Jul 30 18:19:04 2018 -0700
+++ b/silorider/parse.py	Sun Sep 16 21:16:20 2018 -0700
@@ -75,16 +75,6 @@
     def html_element(self):
         return self._bs_obj
 
-    @property
-    def best_name(self):
-        self.interpret()
-
-        for pn in ['title', 'name', 'content-plain', 'content']:
-            pv = self._props.get(pn)
-            if pv:
-                return pv
-        return None
-
     def __getattr__(self, name):
         try:
             return self._doGet(name)
--- a/silorider/silos/print.py	Mon Jul 30 18:19:04 2018 -0700
+++ b/silorider/silos/print.py	Sun Sep 16 21:16:20 2018 -0700
@@ -1,6 +1,7 @@
 import datetime
 import textwrap
 from .base import Silo
+from ..format import get_best_text
 
 
 class PrintSilo(Silo):
@@ -15,14 +16,14 @@
         tokens = {}
         shorten = (self.getConfigItem('shorten', '').lower() in
                    ['true', 'yes', 'on', '1'])
-        names = self.getConfigItem('items', 'best_name,category,published')
+        names = self.getConfigItem('items', 'best_text,category,published')
         names = names.split(',')
         for n in names:
             if n == 'type':
                 tokens['type'] = entry.entry_type
 
-            elif n == 'best_name':
-                tokens['best_name'] = entry.best_name
+            elif n == 'best_text':
+                tokens['best_text'] = get_best_text(entry)
 
             else:
                 v = entry.get(n)
--- a/silorider/silos/twitter.py	Mon Jul 30 18:19:04 2018 -0700
+++ b/silorider/silos/twitter.py	Sun Sep 16 21:16:20 2018 -0700
@@ -1,6 +1,6 @@
 import logging
 import twitter
-from .base import Silo, upload_silo_media
+from .base import Silo
 
 
 logger = logging.getLogger(__name__)
--- a/tests/conftest.py	Mon Jul 30 18:19:04 2018 -0700
+++ b/tests/conftest.py	Sun Sep 16 21:16:20 2018 -0700
@@ -47,6 +47,7 @@
 [cache]
 uri=memory://for_test
 """
+        self._feedcfg = []
         self._pre_hooks = []
         self._cleanup = []
 
@@ -70,6 +71,14 @@
         self._cfgtxt += cfgtxt
         return self
 
+    def appendFeedConfig(self, feed_name, feed_url):
+        self._feedcfg.append((feed_name, feed_url))
+        return self
+
+    def setFeedConfig(self, feed_name, feed_url):
+        self._feedcfg = []
+        return self.appendFeedConfig(feed_name, feed_url)
+
     def appendSiloConfig(self, silo_name, silo_type, **options):
         cfgtxt = '[silo:%s]\n' % silo_name
         cfgtxt += 'type=%s\n' % silo_type
@@ -83,11 +92,16 @@
 
     def run(self, *args):
         pre_args = []
-        if self._cfgtxt:
+        if self._cfgtxt or self._feedcfg:
+            cfgtxt = self._cfgtxt
+            cfgtxt += '\n\n[urls]\n'
+            for n, u in self._feedcfg:
+                cfgtxt += '%s=%s\n' % (n, u)
+
             tmpfd, tmpcfg = tempfile.mkstemp()
             print("Creating temporary configuration file: %s" % tmpcfg)
             with os.fdopen(tmpfd, 'w') as tmpfp:
-                tmpfp.write(self._cfgtxt)
+                tmpfp.write(cfgtxt)
             self._cleanup.append(tmpcfg)
             pre_args = ['-c', tmpcfg]
 
@@ -124,7 +138,10 @@
 
             print("Cleaning %d temporary files." % len(self._cleanup))
             for tmpname in self._cleanup:
-                os.remove(tmpname)
+                try:
+                    os.remove(tmpname)
+                except FileNotFoundError:
+                    pass
 
         return main_ctx, main_res
 
--- a/tests/test_commands_populate.py	Mon Jul 30 18:19:04 2018 -0700
+++ b/tests/test_commands_populate.py	Sun Sep 16 21:16:20 2018 -0700
@@ -13,9 +13,10 @@
 
 
 def test_populate(cli):
+    feed = cli.createTempFeed(feed1)
     cli.appendSiloConfig('test', 'print', items='name')
-    feed = cli.createTempFeed(feed1)
-    ctx, _ = cli.run('populate', feed, '-s', 'test')
+    cli.setFeedConfig('feed', feed)
+    ctx, _ = cli.run('populate', '-s', 'test')
     assert ctx.cache.wasPosted('test', 'https://example.org/a-new-article')
 
 
@@ -40,9 +41,10 @@
 
 
 def test_populate_until(cli):
+    feed = cli.createTempFeed(feed2)
     cli.appendSiloConfig('test', 'print', items='name')
-    feed = cli.createTempFeed(feed2)
-    ctx, _ = cli.run('populate', feed, '-s', 'test', '--until', '2018-01-08')
+    cli.setFeedConfig('feed', feed)
+    ctx, _ = cli.run('populate', '-s', 'test', '--until', '2018-01-08')
     assert ctx.cache.wasPosted('test', 'https://example.org/first-article')
     assert ctx.cache.wasPosted('test', 'https://example.org/second-article')
     assert not ctx.cache.wasPosted('test', 'https://example.org/third-article')
--- a/tests/test_format.py	Mon Jul 30 18:19:04 2018 -0700
+++ b/tests/test_format.py	Sun Sep 16 21:16:20 2018 -0700
@@ -1,5 +1,5 @@
 import pytest
-from silorider.format import format_entry
+from silorider.format import format_entry, strip_html
 
 
 test_url = 'https://example.org/article'
@@ -11,12 +11,44 @@
 
 def _make_test_entry(best_name, is_micropost):
     entry = TestEntry()
-    entry.best_name = best_name
+    entry.get = lambda n: best_name
     entry.is_micropost = is_micropost
     entry.url = test_url
     return entry
 
 
+@pytest.mark.parametrize("text, expected", [
+    ("<p>Something</p>",
+     "Something"),
+    ("<p>Something with <em>emphasis</em> in it</p>",
+     "Something with emphasis in it"),
+    ("<p>Something with <a href=\"http://example.org/blah\">a link</a>",
+     "Something with a link http://example.org/blah"),
+    ("<p>Something with a link <a href=\"http://example.org/blah\">http://example.org</a>",  # NOQA
+     "Something with a link http://example.org/blah"),
+    ("<p>Something with <a href=\"http://example.org/first\">one link here</a> and <a href=\"http://example.org/second\">another there</a>...</p>",  # NOQA
+     "Something with one link here http://example.org/first and another there http://example.org/second...")  # NOQA
+])
+def test_strip_html(text, expected):
+    actual = strip_html(text)
+    assert actual == expected
+
+
+@pytest.mark.parametrize("text, expected", [
+    ("<p>Something with <a href=\"http://example.org/blah\">a link</a>",
+     "Something with a link\nhttp://example.org/blah"),
+    ("<p>Something with a link <a href=\"http://example.org/blah\">http://example.org</a>",  # NOQA
+     "Something with a link http://example.org/blah"),
+    ("<p>Something with <a href=\"http://example.org/first\">one link here</a> and <a href=\"http://example.org/second\">another there</a>...</p>",  # NOQA
+     "Something with one link here and another there...\nhttp://example.org/first\nhttp://example.org/second")  # NOQA
+])
+def test_strip_html_with_bottom_urls(text, expected):
+    actual = strip_html(text, inline_urls=False)
+    # With inline_urls=False, the URLs are moved out of the body and
+    # appended after the text, one per line.
+    assert actual == expected
+
+
 @pytest.mark.parametrize("title, limit, add_url, expected", [
     ('A test entry', None, False, 'A test entry'),
     ('A test entry', None, 'auto', 'A test entry ' + test_url),
--- a/tests/test_silos_mastodon.py	Mon Jul 30 18:19:04 2018 -0700
+++ b/tests/test_silos_mastodon.py	Sun Sep 16 21:16:20 2018 -0700
@@ -13,9 +13,10 @@
     ))
 
     cli.appendSiloConfig('test', 'mastodon', url='/blah')
+    cli.setFeedConfig('feed', feed)
     mastmock.installTokens(cli, 'test')
 
-    ctx, _ = cli.run('process', feed)
+    ctx, _ = cli.run('process')
     assert ctx.cache.wasPosted('test', 'https://example.org/a-new-article')
     toot = ctx.silos[0].client.toots[0]
     assert toot == ('A new article https://example.org/a-new-article',
@@ -29,9 +30,10 @@
     ))
 
     cli.appendSiloConfig('test', 'mastodon', url='/blah')
+    cli.setFeedConfig('feed', feed)
     mastmock.installTokens(cli, 'test')
 
-    ctx, _ = cli.run('process', feed)
+    ctx, _ = cli.run('process')
     assert ctx.cache.wasPosted('test', '/01234.html')
     toot = ctx.silos[0].client.toots[0]
     assert toot == ("This is a quick update.", None, 'public')
@@ -47,6 +49,7 @@
     ))
 
     cli.appendSiloConfig('test', 'mastodon', url='/blah')
+    cli.setFeedConfig('feed', feed)
     mastmock.installTokens(cli, 'test')
 
     with monkeypatch.context() as m:
@@ -54,7 +57,7 @@
         mock_urllib(m)
         m.setattr(silorider.silos.mastodon.MastodonSilo, '_media_callback',
                   _patched_media_callback)
-        ctx, _ = cli.run('process', feed)
+        ctx, _ = cli.run('process')
 
     assert ctx.cache.wasPosted('test', '/01234.html')
     media = ctx.silos[0].client.media[0]
@@ -74,6 +77,7 @@
     ))
 
     cli.appendSiloConfig('test', 'mastodon', url='/blah')
+    cli.setFeedConfig('feed', feed)
     mastmock.installTokens(cli, 'test')
 
     with monkeypatch.context() as m:
@@ -81,7 +85,7 @@
         mock_urllib(m)
         m.setattr(silorider.silos.mastodon.MastodonSilo, '_media_callback',
                   _patched_media_callback)
-        ctx, _ = cli.run('process', feed)
+        ctx, _ = cli.run('process')
 
     assert ctx.cache.wasPosted('test', '/01234.html')
     media = ctx.silos[0].client.media[0]
@@ -92,6 +96,35 @@
     assert toot == ("This is a photo update with 2 photos.", [1, 2], 'public')
 
 
+def test_one_micropost_with_links(cli, feedutil, mastmock):
+    cli.appendSiloConfig('test', 'mastodon', url='/blah')
+    mastmock.installTokens(cli, 'test')
+
+    feed = cli.createTempFeed(feedutil.makeFeed(
+        """<p class="p-name">This is a link: http://example.org/blah</p>
+<a class="u-url" href="/01234.html">permalink</a>"""))
+    cli.setFeedConfig('feed', feed)
+    ctx, _ = cli.run('process')
+    toot = ctx.silos[0].client.toots[0]
+    assert toot == ("This is a link: http://example.org/blah", None, 'public')
+
+    feed = cli.createTempFeed(feedutil.makeFeed(
+        """<p class="e-content">This is another link: <a href="http://example.org/blah">http://example.org/blah</a></p>
+<a class="u-uri" href="/01234.html">permalink</a>"""))  # NOQA
+    cli.setFeedConfig('feed', feed)
+    ctx, _ = cli.run('process')
+    toot = ctx.silos[0].client.toots[0]
+    assert toot == ("This is another link: http://example.org/blah", None, 'public')  # NOQA
+
+    feed = cli.createTempFeed(feedutil.makeFeed(
+        """<p class="e-content">This is yet <a href="http://example.org/blah">another link</a></p>
+<a class="u-uri" href="/01234.html">permalink</a>"""))  # NOQA
+    cli.setFeedConfig('feed', feed)
+    ctx, _ = cli.run('process')
+    toot = ctx.silos[0].client.toots[0]
+    assert toot == ("This is yet another link http://example.org/blah", None, 'public')  # NOQA
+
+
 def _patched_media_callback(self, tmpfile, mt):
     return self.client.media_post(tmpfile, mt)
 
--- a/tests/test_silos_twitter.py	Mon Jul 30 18:19:04 2018 -0700
+++ b/tests/test_silos_twitter.py	Sun Sep 16 21:16:20 2018 -0700
@@ -12,9 +12,10 @@
     ))
 
     cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
     tweetmock.installTokens(cli, 'test')
 
-    ctx, _ = cli.run('process', feed)
+    ctx, _ = cli.run('process')
     assert ctx.cache.wasPosted('test', 'https://example.org/a-new-article')
     toot = ctx.silos[0].client.tweets[0]
     assert toot == ('A new article https://example.org/a-new-article', [])
@@ -27,9 +28,10 @@
     ))
 
     cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
     tweetmock.installTokens(cli, 'test')
 
-    ctx, _ = cli.run('process', feed)
+    ctx, _ = cli.run('process')
     assert ctx.cache.wasPosted('test', '/01234.html')
     toot = ctx.silos[0].client.tweets[0]
     assert toot == ("This is a quick update.", [])
@@ -45,9 +47,10 @@
     ))
 
     cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
     tweetmock.installTokens(cli, 'test')
 
-    ctx, _ = cli.run('process', feed)
+    ctx, _ = cli.run('process')
 
     assert ctx.cache.wasPosted('test', '/01234.html')
     toot = ctx.silos[0].client.tweets[0]
@@ -65,9 +68,10 @@
     ))
 
     cli.appendSiloConfig('test', 'twitter', url='/blah')
+    cli.setFeedConfig('feed', feed)
     tweetmock.installTokens(cli, 'test')
 
-    ctx, _ = cli.run('process', feed)
+    ctx, _ = cli.run('process')
 
     assert ctx.cache.wasPosted('test', '/01234.html')
     toot = ctx.silos[0].client.tweets[0]