Mercurial > silorider
changeset 18:a921cc2306bc
Do our own HTML parsing/stripping of micropost contents.
- This lets us properly handle various forms of linking.
- Add tests for processing posts with links.
- Fix configuration in tests.
- Basic error handling for processing posts.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Sun, 16 Sep 2018 21:16:20 -0700 |
parents | 678278cb85b1 |
children | d3c4c5082bbc |
files | silorider/commands/process.py silorider/format.py silorider/parse.py silorider/silos/print.py silorider/silos/twitter.py tests/conftest.py tests/test_commands_populate.py tests/test_format.py tests/test_silos_mastodon.py tests/test_silos_twitter.py |
diffstat | 10 files changed, 221 insertions(+), 41 deletions(-) [+] |
line wrap: on
line diff
--- a/silorider/commands/process.py Mon Jul 30 18:19:04 2018 -0700 +++ b/silorider/commands/process.py Sun Sep 16 21:16:20 2018 -0700 @@ -46,29 +46,37 @@ silo.onPostEnd() def processEntry(self, entry): - if self.isEntryFiltered(entry): - logger.debug("Entry is filtered out: %s" % entry.best_name) - return - entry_url = entry.get('url') if not entry_url: logger.warning("Found entry without a URL.") return + if self.isEntryFiltered(entry): + logger.debug("Entry is filtered out: %s" % entry_url) + return + postctx = SiloPostingContext(self.ctx) no_cache = self.ctx.args.no_cache - logger.debug("Processing entry: %s" % entry.best_name) + logger.debug("Processing entry: %s" % entry_url) for silo in self.silos: if no_cache or not self.ctx.cache.wasPosted(silo.name, entry_url): if not self.ctx.args.dry_run: - silo.postEntry(entry, postctx) - self.ctx.cache.addPost(silo.name, entry_url) + try: + did_post = silo.postEntry(entry, postctx) + except Exception as ex: + did_post = False + logger.error("Error posting: %s" % entry_url) + logger.error(ex) + if self.ctx.args.verbose: + raise + if did_post is True or did_post is None: + self.ctx.cache.addPost(silo.name, entry_url) else: logger.info("Would post entry on %s: %s" % - (silo.name, entry.best_name)) + (silo.name, entry_url)) else: logger.debug("Skipping already posted entry on %s: %s" % - (silo.name, entry.best_name)) + (silo.name, entry_url)) def isEntryFiltered(self, entry): if not self.config.has_section('filter'):
--- a/silorider/format.py Mon Jul 30 18:19:04 2018 -0700 +++ b/silorider/format.py Sun Sep 16 21:16:20 2018 -0700 @@ -1,10 +1,18 @@ import re +import urllib.parse import textwrap +import bs4 + +try: + import lxml # NOQA + _bs_parser = 'xml' +except ImportError: + _bs_parser = 'html.parser' def format_entry(entry, limit=None, add_url='auto'): url = entry.url - name = entry.best_name + name = get_best_text(entry) do_add_url = ((add_url is True) or (add_url == 'auto' and not entry.is_micropost)) @@ -32,6 +40,91 @@ return name +def get_best_text(entry, *, plain=True, inline_urls=True): + text = entry.get('title') + if not text: + text = entry.get('name') + if not text: + text = entry.get('content') + + if text: + if not plain: + return text + return strip_html(text, inline_urls=inline_urls) + + return None + + +def strip_html(txt, *, inline_urls=True): + outtxt = '' + ctx = _HtmlStripping() + soup = bs4.BeautifulSoup(txt, _bs_parser) + for c in soup.children: + outtxt += _do_strip_html(c, ctx) + + keys = ['url:%d' % i for i in range(len(ctx.urls))] + if inline_urls: + urls = dict(zip(keys, [' ' + u for u in ctx.urls])) + else: + urls = dict(zip(keys, [''] * len(ctx.urls))) + outtxt = outtxt % urls + if not inline_urls and ctx.urls: + outtxt += '\n' + '\n'.join(ctx.urls) + return outtxt + + +class _HtmlStripping: + def __init__(self): + self.urls = [] + + +def _do_strip_html(elem, ctx): + if isinstance(elem, bs4.NavigableString): + return str(elem) + + if elem.name == 'a': + try: + href = elem['href'] + except KeyError: + href = None + cnts = list(elem.contents) + if len(cnts) == 1: + href_txt = cnts[0].string + href_parsed = urllib.parse.urlparse(href) + print("Checking:", href_txt, href_parsed.hostname) + if href_txt in [ + href, + href_parsed.netloc, + '%s://%s' % (href_parsed.scheme, href_parsed.netloc), + '%s://%s%s' % (href_parsed.scheme, href_parsed.netloc, + href_parsed.path)]: + return href + + a_txt = ''.join([_do_strip_html(c, ctx) + for c in cnts]) + a_txt += '%%(url:%d)s' % len(ctx.urls) + ctx.urls.append(href) + return a_txt + + if elem.name == 'ol': + outtxt = '' + for i, c in enumerate(elem.children): + if c.name == 'li': + outtxt += ('%s. ' % (i + 1)) + _do_strip_html(c, ctx) + outtxt += '\n' + return outtxt + + if elem.name == 'ul': + outtxt = '' + for c in elem.children: + if c.name == 'li': + outtxt += '- ' + _do_strip_html(c, ctx) + outtxt += '\n' + return outtxt + + return ''.join([_do_strip_html(c, ctx) for c in elem.children]) + + re_sentence_end = re.compile(r'[\w\]\)\"\'\.]\.\s|[\?\!]\s')
--- a/silorider/parse.py Mon Jul 30 18:19:04 2018 -0700 +++ b/silorider/parse.py Sun Sep 16 21:16:20 2018 -0700 @@ -75,16 +75,6 @@ def html_element(self): return self._bs_obj - @property - def best_name(self): - self.interpret() - - for pn in ['title', 'name', 'content-plain', 'content']: - pv = self._props.get(pn) - if pv: - return pv - return None - def __getattr__(self, name): try: return self._doGet(name)
--- a/silorider/silos/print.py Mon Jul 30 18:19:04 2018 -0700 +++ b/silorider/silos/print.py Sun Sep 16 21:16:20 2018 -0700 @@ -1,6 +1,7 @@ import datetime import textwrap from .base import Silo +from ..format import get_best_text class PrintSilo(Silo): @@ -15,14 +16,14 @@ tokens = {} shorten = (self.getConfigItem('shorten', '').lower() in ['true', 'yes', 'on', '1']) - names = self.getConfigItem('items', 'best_name,category,published') + names = self.getConfigItem('items', 'best_text,category,published') names = names.split(',') for n in names: if n == 'type': tokens['type'] = entry.entry_type - elif n == 'best_name': - tokens['best_name'] = entry.best_name + elif n == 'best_text': + tokens['best_text'] = get_best_text(entry) else: v = entry.get(n)
--- a/silorider/silos/twitter.py Mon Jul 30 18:19:04 2018 -0700 +++ b/silorider/silos/twitter.py Sun Sep 16 21:16:20 2018 -0700 @@ -1,6 +1,6 @@ import logging import twitter -from .base import Silo, upload_silo_media +from .base import Silo logger = logging.getLogger(__name__)
--- a/tests/conftest.py Mon Jul 30 18:19:04 2018 -0700 +++ b/tests/conftest.py Sun Sep 16 21:16:20 2018 -0700 @@ -47,6 +47,7 @@ [cache] uri=memory://for_test """ + self._feedcfg = [] self._pre_hooks = [] self._cleanup = [] @@ -70,6 +71,14 @@ self._cfgtxt += cfgtxt return self + def appendFeedConfig(self, feed_name, feed_url): + self._feedcfg.append((feed_name, feed_url)) + return self + + def setFeedConfig(self, feed_name, feed_url): + self._feedcfg = [] + return self.appendFeedConfig(feed_name, feed_url) + def appendSiloConfig(self, silo_name, silo_type, **options): cfgtxt = '[silo:%s]\n' % silo_name cfgtxt += 'type=%s\n' % silo_type @@ -83,11 +92,16 @@ def run(self, *args): pre_args = [] - if self._cfgtxt: + if self._cfgtxt or self._feedcfg: + cfgtxt = self._cfgtxt + cfgtxt += '\n\n[urls]\n' + for n, u in self._feedcfg: + cfgtxt += '%s=%s\n' % (n, u) + tmpfd, tmpcfg = tempfile.mkstemp() print("Creating temporary configuration file: %s" % tmpcfg) with os.fdopen(tmpfd, 'w') as tmpfp: - tmpfp.write(self._cfgtxt) + tmpfp.write(cfgtxt) self._cleanup.append(tmpcfg) pre_args = ['-c', tmpcfg] @@ -124,7 +138,10 @@ print("Cleaning %d temporary files." % len(self._cleanup)) for tmpname in self._cleanup: - os.remove(tmpname) + try: + os.remove(tmpname) + except FileNotFoundError: + pass return main_ctx, main_res
--- a/tests/test_commands_populate.py Mon Jul 30 18:19:04 2018 -0700 +++ b/tests/test_commands_populate.py Sun Sep 16 21:16:20 2018 -0700 @@ -13,9 +13,10 @@ def test_populate(cli): + feed = cli.createTempFeed(feed1) cli.appendSiloConfig('test', 'print', items='name') - feed = cli.createTempFeed(feed1) - ctx, _ = cli.run('populate', feed, '-s', 'test') + cli.setFeedConfig('feed', feed) + ctx, _ = cli.run('populate', '-s', 'test') assert ctx.cache.wasPosted('test', 'https://example.org/a-new-article') @@ -40,9 +41,10 @@ def test_populate_until(cli): + feed = cli.createTempFeed(feed2) cli.appendSiloConfig('test', 'print', items='name') - feed = cli.createTempFeed(feed2) - ctx, _ = cli.run('populate', feed, '-s', 'test', '--until', '2018-01-08') + cli.setFeedConfig('feed', feed) + ctx, _ = cli.run('populate', '-s', 'test', '--until', '2018-01-08') assert ctx.cache.wasPosted('test', 'https://example.org/first-article') assert ctx.cache.wasPosted('test', 'https://example.org/second-article') assert not ctx.cache.wasPosted('test', 'https://example.org/third-article')
--- a/tests/test_format.py Mon Jul 30 18:19:04 2018 -0700 +++ b/tests/test_format.py Sun Sep 16 21:16:20 2018 -0700 @@ -1,5 +1,5 @@ import pytest -from silorider.format import format_entry +from silorider.format import format_entry, strip_html test_url = 'https://example.org/article' @@ -11,12 +11,44 @@ def _make_test_entry(best_name, is_micropost): entry = TestEntry() - entry.best_name = best_name + entry.get = lambda n: best_name entry.is_micropost = is_micropost entry.url = test_url return entry +@pytest.mark.parametrize("text, expected", [ + ("<p>Something</p>", + "Something"), + ("<p>Something with <em>emphasis</em> in it</p>", + "Something with emphasis in it"), + ("<p>Something with <a href=\"http://example.org/blah\">a link</a>", + "Something with a link http://example.org/blah"), + ("<p>Something with a link <a href=\"http://example.org/blah\">http://example.org</a>", # NOQA + "Something with a link http://example.org/blah"), + ("<p>Something with <a href=\"http://example.org/first\">one link here</a> and <a href=\"http://example.org/second\">another there</a>...</p>", # NOQA + "Something with one link here http://example.org/first and another there http://example.org/second...") # NOQA +]) +def test_strip_html(text, expected): + actual = strip_html(text) + assert actual == expected + + +@pytest.mark.parametrize("text, expected", [ + ("<p>Something with <a href=\"http://example.org/blah\">a link</a>", + "Something with a link\nhttp://example.org/blah"), + ("<p>Something with a link <a href=\"http://example.org/blah\">http://example.org</a>", # NOQA + "Something with a link http://example.org/blah"), + ("<p>Something with <a href=\"http://example.org/first\">one link here</a> and <a href=\"http://example.org/second\">another there</a>...</p>", # NOQA + "Something with one link here and another there...\nhttp://example.org/first\nhttp://example.org/second") # NOQA +]) +def test_strip_html_with_bottom_urls(text, expected): + actual = strip_html(text, inline_urls=False) + print(actual) + print(expected) + assert actual == expected + + @pytest.mark.parametrize("title, limit, add_url, expected", [ ('A test entry', None, False, 'A test entry'), ('A test entry', None, 'auto', 'A test entry ' + test_url),
--- a/tests/test_silos_mastodon.py Mon Jul 30 18:19:04 2018 -0700 +++ b/tests/test_silos_mastodon.py Sun Sep 16 21:16:20 2018 -0700 @@ -13,9 +13,10 @@ )) cli.appendSiloConfig('test', 'mastodon', url='/blah') + cli.setFeedConfig('feed', feed) mastmock.installTokens(cli, 'test') - ctx, _ = cli.run('process', feed) + ctx, _ = cli.run('process') assert ctx.cache.wasPosted('test', 'https://example.org/a-new-article') toot = ctx.silos[0].client.toots[0] assert toot == ('A new article https://example.org/a-new-article', @@ -29,9 +30,10 @@ )) cli.appendSiloConfig('test', 'mastodon', url='/blah') + cli.setFeedConfig('feed', feed) mastmock.installTokens(cli, 'test') - ctx, _ = cli.run('process', feed) + ctx, _ = cli.run('process') assert ctx.cache.wasPosted('test', '/01234.html') toot = ctx.silos[0].client.toots[0] assert toot == ("This is a quick update.", None, 'public') @@ -47,6 +49,7 @@ )) cli.appendSiloConfig('test', 'mastodon', url='/blah') + cli.setFeedConfig('feed', feed) mastmock.installTokens(cli, 'test') with monkeypatch.context() as m: @@ -54,7 +57,7 @@ mock_urllib(m) m.setattr(silorider.silos.mastodon.MastodonSilo, '_media_callback', _patched_media_callback) - ctx, _ = cli.run('process', feed) + ctx, _ = cli.run('process') assert ctx.cache.wasPosted('test', '/01234.html') media = ctx.silos[0].client.media[0] @@ -74,6 +77,7 @@ )) cli.appendSiloConfig('test', 'mastodon', url='/blah') + cli.setFeedConfig('feed', feed) mastmock.installTokens(cli, 'test') with monkeypatch.context() as m: @@ -81,7 +85,7 @@ mock_urllib(m) m.setattr(silorider.silos.mastodon.MastodonSilo, '_media_callback', _patched_media_callback) - ctx, _ = cli.run('process', feed) + ctx, _ = cli.run('process') assert ctx.cache.wasPosted('test', '/01234.html') media = ctx.silos[0].client.media[0] @@ -92,6 +96,35 @@ assert toot == ("This is a photo update with 2 photos.", [1, 2], 'public') +def test_one_micropost_with_links(cli, feedutil, mastmock): + cli.appendSiloConfig('test', 'mastodon', url='/blah') + mastmock.installTokens(cli, 'test') + + feed = cli.createTempFeed(feedutil.makeFeed( + """<p class="p-name">This is a link: http://example.org/blah</p> +<a class="u-url" href="/01234.html">permalink</a>""")) + cli.setFeedConfig('feed', feed) + ctx, _ = cli.run('process') + toot = ctx.silos[0].client.toots[0] + assert toot == ("This is a link: http://example.org/blah", None, 'public') + + feed = cli.createTempFeed(feedutil.makeFeed( + """<p class="e-content">This is another link: <a href="http://example.org/blah">http://example.org/blah</a></p> +<a class="u-uri" href="/01234.html">permalink</a>""")) # NOQA + cli.setFeedConfig('feed', feed) + ctx, _ = cli.run('process') + toot = ctx.silos[0].client.toots[0] + assert toot == ("This is another link: http://example.org/blah", None, 'public') # NOQA + + feed = cli.createTempFeed(feedutil.makeFeed( + """<p class="e-content">This is yet <a href="http://example.org/blah">another link</a></p> +<a class="u-uri" href="/01234.html">permalink</a>""")) # NOQA + cli.setFeedConfig('feed', feed) + ctx, _ = cli.run('process') + toot = ctx.silos[0].client.toots[0] + assert toot == ("This is yet another link http://example.org/blah", None, 'public') # NOQA + + def _patched_media_callback(self, tmpfile, mt): return self.client.media_post(tmpfile, mt)
--- a/tests/test_silos_twitter.py Mon Jul 30 18:19:04 2018 -0700 +++ b/tests/test_silos_twitter.py Sun Sep 16 21:16:20 2018 -0700 @@ -12,9 +12,10 @@ )) cli.appendSiloConfig('test', 'twitter', url='/blah') + cli.setFeedConfig('feed', feed) tweetmock.installTokens(cli, 'test') - ctx, _ = cli.run('process', feed) + ctx, _ = cli.run('process') assert ctx.cache.wasPosted('test', 'https://example.org/a-new-article') toot = ctx.silos[0].client.tweets[0] assert toot == ('A new article https://example.org/a-new-article', []) @@ -27,9 +28,10 @@ )) cli.appendSiloConfig('test', 'twitter', url='/blah') + cli.setFeedConfig('feed', feed) tweetmock.installTokens(cli, 'test') - ctx, _ = cli.run('process', feed) + ctx, _ = cli.run('process') assert ctx.cache.wasPosted('test', '/01234.html') toot = ctx.silos[0].client.tweets[0] assert toot == ("This is a quick update.", []) @@ -45,9 +47,10 @@ )) cli.appendSiloConfig('test', 'twitter', url='/blah') + cli.setFeedConfig('feed', feed) tweetmock.installTokens(cli, 'test') - ctx, _ = cli.run('process', feed) + ctx, _ = cli.run('process') assert ctx.cache.wasPosted('test', '/01234.html') toot = ctx.silos[0].client.tweets[0] @@ -65,9 +68,10 @@ )) cli.appendSiloConfig('test', 'twitter', url='/blah') + cli.setFeedConfig('feed', feed) tweetmock.installTokens(cli, 'test') - ctx, _ = cli.run('process', feed) + ctx, _ = cli.run('process') assert ctx.cache.wasPosted('test', '/01234.html') toot = ctx.silos[0].client.tweets[0]