# HG changeset patch
# User Ludovic Chabant
# Date 1681933618 25200
# Node ID c898b4df0f29bf2b1ba0e6456e37b7d03b12020b
# Parent  c8e0d4c12f92db28b96cf38bf89479cee7867d4a
Use context for html stripping, with support for custom URL sizes

For instance, on Twitter, URLs are 23 characters long since Twitter uses
its own URL shortening service. Without taking this into account, post
lengths would not be calculated correctly.

diff -r c8e0d4c12f92 -r c898b4df0f29 silorider/format.py
--- a/silorider/format.py	Sun May 22 21:31:36 2022 -0700
+++ b/silorider/format.py	Wed Apr 19 12:46:58 2023 -0700
@@ -1,5 +1,4 @@
 import re
-import urllib.parse
 import textwrap
 import bs4
 from .config import has_lxml
@@ -7,25 +6,31 @@
 
 def format_entry(entry, limit=None, add_url='auto', url_flattener=None):
     url = entry.url
-    name = get_best_text(entry, url_flattener=url_flattener)
+
+    ctx = HtmlStrippingContext()
+    if url_flattener:
+        ctx.url_flattener = url_flattener
+
+    name = get_best_text(entry, ctx)
     if not name:
         raise Exception("Can't find best text for entry: %s" % url)
 
     do_add_url = ((add_url is True) or
                   (add_url == 'auto' and not entry.is_micropost))
     if limit:
+        text_length = ctx.text_length
         if do_add_url and url:
-            limit -= 1 + len(url)
+            limit -= 1 + ctx.url_flattener.measureUrl(url)
 
-        shortened = len(name) > limit
+        shortened = text_length > limit
         if shortened:
             # If we have to shorten the text, but we haven't taken the
-            # URL into account yet, let's see if we have to include now!
+            # URL into account yet, let's see if we have to include it now!
             # (this happens when we only want to include it when the text
             # is shortened)
             if not do_add_url and add_url == 'auto' and url:
                 do_add_url = True
-                limit -= 1 + len(url)
+                limit -= 1 + ctx.url_flattener.measureUrl(url)
 
             if limit <= 0:
                 raise Exception("Can't shorten post name.")
@@ -38,11 +43,36 @@
 
 
 class UrlFlattener:
-    def replaceHref(self, text, url):
+    def replaceHref(self, text, url, ctx):
+        raise NotImplementedError()
+
+    def measureUrl(self, url):
+        raise NotImplementedError()
+
+
+class _NullUrlFlattener(UrlFlattener):
+    def replaceHref(self, text, url, ctx):
         return None
 
+    def measureUrl(self, url):
+        return len(url)
 
-def get_best_text(entry, *, plain=True, inline_urls=True, url_flattener=None):
+
+URLMODE_INLINE = 0
+URLMODE_LAST = 1
+URLMODE_BOTTOM_LIST = 2
+
+
+class HtmlStrippingContext:
+    def __init__(self):
+        self.url_mode = URLMODE_BOTTOM_LIST
+        self.urls = []
+        self.nosp_urls = []
+        self.url_flattener = _NullUrlFlattener()
+        self.text_length = 0
+
+
+def get_best_text(entry, ctx=None, *, plain=True):
     elem = entry.htmlFind(class_='p-title')
     if not elem:
         elem = entry.htmlFind(class_='p-name')
@@ -53,38 +83,55 @@
         if not plain:
             text = '\n'.join([str(c) for c in elem.contents])
            return str(text)
-        return strip_html(elem, inline_urls=inline_urls,
-                          url_flattener=url_flattener)
+        return strip_html(elem, ctx)
     return None
 
 
-def strip_html(bs_elem, *, inline_urls=True, url_flattener=None):
+def strip_html(bs_elem, ctx=None):
     if isinstance(bs_elem, str):
         bs_elem = bs4.BeautifulSoup(bs_elem,
                                     'lxml' if has_lxml else 'html5lib')
 
+    # Prepare stuff and run stripping on all HTML elements.
     outtxt = ''
-    ctx = _HtmlStripping()
-    ctx.url_flattener = url_flattener
+    if ctx is None:
+        ctx = HtmlStrippingContext()
     for c in bs_elem.children:
         outtxt += _do_strip_html(c, ctx)
 
+    # If URLs are inline, insert them where we left our marker. If not, replace
+    # our markers with an empty string and append the URLs at the end.
     keys = ['url:%d' % i for i in range(len(ctx.urls))]
-    if inline_urls:
-        urls = dict(zip(keys, [' ' + u for u in ctx.urls]))
+    if ctx.url_mode == URLMODE_INLINE:
+        url_repl = [' ' + u for u in ctx.urls]
+        # Some URLs didn't have any text to be placed next to, so for those
+        # we don't need any extra space before.
+        for i in ctx.nosp_urls:
+            url_repl[i] = url_repl[i][1:]
+        urls = dict(zip(keys, url_repl))
     else:
         urls = dict(zip(keys, [''] * len(ctx.urls)))
     outtxt = outtxt % urls
 
-    if not inline_urls and ctx.urls:
-        outtxt += '\n' + '\n'.join(ctx.urls)
-    return outtxt
-
+    if ctx.url_mode != URLMODE_INLINE and ctx.urls:
+        outtxt = outtxt.rstrip()
+        if ctx.url_mode == URLMODE_LAST:
+            outtxt += ' ' + ' '.join(ctx.urls)
+        elif ctx.url_mode == URLMODE_BOTTOM_LIST:
+            outtxt += '\n' + '\n'.join(ctx.urls)
 
-class _HtmlStripping:
-    def __init__(self):
-        self.urls = []
-        self.url_flattener = None
+    # Add the length of URLs to the text length.
+    for url in ctx.urls:
+        ctx.text_length += ctx.url_flattener.measureUrl(url)
+    # Add spaces and other extra characters to the text length.
+    if ctx.url_mode == URLMODE_INLINE:
+        # One space per URL except the explicitly no-space-urls.
+        ctx.text_length += len(ctx.urls) - len(ctx.nosp_urls)
+    else:
+        # One space or newline per URL, plus the first one.
+        ctx.text_length += len(ctx.urls) + 1
+
+    return outtxt
 
 
 def _escape_percents(txt):
@@ -93,7 +140,9 @@
 
 def _do_strip_html(elem, ctx):
     if isinstance(elem, bs4.NavigableString):
-        return _escape_percents(str(elem))
+        raw_txt = str(elem)
+        ctx.text_length += len(raw_txt)
+        return _escape_percents(raw_txt)
 
     if elem.name == 'a':
         try:
@@ -102,22 +151,30 @@
             href = None
 
         cnts = list(elem.contents)
         if len(cnts) == 1:
+            # Use the URL flattener to reformat the hyperlink.
             href_txt = cnts[0].string
-            if href_txt in href:
-                # If we have a simple hyperlink where the text is a
-                # substring of the target URL, just return the URL.
-                return _escape_percents(href)
+            old_text_length = ctx.text_length
+            href_flattened = ctx.url_flattener.replaceHref(href_txt, href, ctx)
+            if href_flattened is not None:
+                # We have a reformatted URL. Use that, but check if the
+                # flattener computed a custom text length. If not, do the
+                # standard computation.
+                if ctx.text_length == old_text_length:
+                    ctx.text_length += len(href_flattened)
+                return href_flattened
 
-            if ctx.url_flattener:
-                # Use an URL flattener if we have one.
-                href_parsed = urllib.parse.urlparse(href)
-                href_flattened = ctx.url_flattener.replaceHref(
-                    href_txt, href_parsed)
-                if href_flattened is not None:
-                    return href_flattened
+            # If we have a simple hyperlink where the text is a substring of
+            # the target URL, just return the URL.
+            if href_txt in href:
+                a_txt = '%%(url:%d)s' % len(ctx.urls)
+                ctx.nosp_urls.append(len(ctx.urls))
+                ctx.urls.append(href)
+                # No text length to add.
+                return a_txt
 
         # No easy way to simplify this hyperlink... let's put a marker
         # for the URL to be later replaced in the text.
+        # Text length is accumulated through recursive calls to _do_strip_html.
         a_txt = ''.join([_do_strip_html(c, ctx) for c in cnts])
         a_txt += '%%(url:%d)s' % len(ctx.urls)
@@ -130,6 +187,7 @@
             if c.name == 'li':
                 outtxt += ('%s. ' % (i + 1)) + _do_strip_html(c, ctx)
                 outtxt += '\n'
+        ctx.text_length += len(outtxt)
         return outtxt
 
     if elem.name == 'ul':
@@ -138,6 +196,7 @@
             if c.name == 'li':
                 outtxt += '- ' + _do_strip_html(c, ctx)
                 outtxt += '\n'
+        ctx.text_length += len(outtxt)
         return outtxt
 
     return ''.join([_do_strip_html(c, ctx) for c in elem.children])
diff -r c8e0d4c12f92 -r c898b4df0f29 silorider/silos/twitter.py
--- a/silorider/silos/twitter.py	Sun May 22 21:31:36 2022 -0700
+++ b/silorider/silos/twitter.py	Wed Apr 19 12:46:58 2023 -0700
@@ -1,6 +1,8 @@
 import logging
 import twitter
+import urllib.parse
 from .base import Silo
+from ..format import UrlFlattener
 
 
 logger = logging.getLogger(__name__)
@@ -77,16 +79,22 @@
         self.client.PostUpdate(tweettxt, media=media_urls)
 
     def dryRunPostEntry(self, entry, ctx):
-        tweettxt = self.formatEntry(entry, limit=280)
+        tweettxt = self.formatEntry(entry, limit=280,
+                                    url_flattener=TwitterUrlFlattener())
         logger.info("Tweet would be:")
         logger.info(tweettxt)
+        media_urls = entry.get('photo', [], force_list=True)
+        if media_urls:
+            logger.info("...with photos: %s" % str(media_urls))
 
 
 TWITTER_NETLOCS = ['twitter.com', 'www.twitter.com']
 
 
-class TwitterUrlFlattener:
-    def replaceHref(self, text, url):
+class TwitterUrlFlattener(UrlFlattener):
+    def replaceHref(self, text, raw_url, ctx):
+        url = urllib.parse.urlparse(raw_url)
+
         # Is it a Twitter URL?
         if url.netloc not in TWITTER_NETLOCS:
             return None
@@ -97,3 +105,6 @@
             return '@' + path
 
         return None
+
+    def measureUrl(self, raw_url):
+        return 23
diff -r c8e0d4c12f92 -r c898b4df0f29 tests/test_format.py
--- a/tests/test_format.py	Sun May 22 21:31:36 2022 -0700
+++ b/tests/test_format.py	Wed Apr 19 12:46:58 2023 -0700
@@ -1,5 +1,7 @@
 import pytest
-from silorider.format import format_entry, strip_html
+from silorider.format import (
+    format_entry, strip_html, HtmlStrippingContext,
+    URLMODE_INLINE, URLMODE_LAST, URLMODE_BOTTOM_LIST)
 
 
 test_url = 'https://example.org/article'
@@ -34,7 +36,11 @@
      "Something with one link here http://example.org/first and another there http://example.org/second...")  # NOQA
 ])
 def test_strip_html(text, expected):
-    actual = strip_html(text)
+    ctx = HtmlStrippingContext()
+    ctx.url_mode = URLMODE_INLINE
+    actual = strip_html(text, ctx)
+    print(actual)
+    print(expected)
     assert actual == expected
 
 
@@ -42,12 +48,14 @@
Something with a link", "Something with a link\nhttp://example.org/blah"), ("

Something with a link http://example.org", # NOQA - "Something with a link http://example.org/blah"), + "Something with a link\nhttp://example.org/blah"), ("

Something with one link here and another there...

", # NOQA "Something with one link here and another there...\nhttp://example.org/first\nhttp://example.org/second") # NOQA ]) def test_strip_html_with_bottom_urls(text, expected): - actual = strip_html(text, inline_urls=False) + ctx = HtmlStrippingContext() + ctx.url_mode = URLMODE_BOTTOM_LIST + actual = strip_html(text, ctx) print(actual) print(expected) assert actual == expected