view silorider/format.py @ 34:8c513e43673d

Add debug information when an entry is found without a URL
author Ludovic Chabant <ludovic@chabant.com>
date Wed, 10 May 2023 16:10:35 -0700
parents 9e4eb3f2754e
children 0f98784bcc40
line wrap: on
line source

import re
import string
import textwrap
import bs4
from .config import has_lxml


def format_entry(entry, limit=None, add_url='auto', url_flattener=None):
    """Build the plain-text version of an entry, optionally capped in length.

    The text comes from ``get_best_text``. ``add_url`` decides whether the
    entry's own URL (when it has one) is appended after a space:

    * ``True``: always append it.
    * ``'auto'``: append it for entries that are not microposts, and also
      for microposts whose text had to be shortened.
    * anything else: never append it.

    When ``limit`` is truthy, the text is first generated without any limit
    so it can be measured. If it is too long, it is generated a second time
    with the limit enforced, minus the room needed for the trailing URL.

    ``url_flattener``, when given, replaces the default flattener of the
    stripping context and is used to measure URLs.

    Raises ``Exception`` if no text can be found for the entry.
    """
    def _make_context(max_length=None):
        # Fresh stripping state; the limit is only set for the second pass.
        context = HtmlStrippingContext()
        if max_length is not None:
            context.limit = max_length
        if url_flattener:
            context.url_flattener = url_flattener
        return context

    entry_url = entry.url

    # First pass: no limit yet, we only want the text and its length.
    ctx = _make_context()
    text = get_best_text(entry, ctx)
    if not text:
        raise Exception("Can't find best text for entry: %s" % entry_url)

    append_url = ((add_url is True) or
                  (add_url == 'auto' and not entry.is_micropost))

    if limit:
        measured_length = ctx.text_length
        if append_url and entry_url:
            # Reserve room for the URL plus the space that precedes it.
            limit -= 1 + ctx.url_flattener.measureUrl(entry_url)

        if measured_length > limit:
            # The text must be shortened. Links inside the text may not
            # measure as their literal length (e.g. with URL shortening),
            # so instead of trimming the string we already have, start over
            # with the limit enforced by the stripping context.
            if not append_url and add_url == 'auto' and entry_url:
                # A shortened micropost gets a link back to the original.
                append_url = True
                limit -= 1 + ctx.url_flattener.measureUrl(entry_url)

            ctx = _make_context(limit)
            text = get_best_text(entry, ctx)

    # Only now is the URL actually appended to the text.
    if append_url and entry_url:
        text += ' ' + entry_url
    return text


class UrlFlattener:
    """Interface for objects that rewrite hyperlinks and measure URLs.

    Both methods raise ``NotImplementedError`` here; implementations are
    expected to override them.
    """

    def replaceHref(self, text, url, ctx):
        """Return replacement text for a hyperlink, or ``None`` for none.

        ``text`` is the link text, ``url`` its target and ``ctx`` the
        stripping context in use.
        """
        raise NotImplementedError

    def measureUrl(self, url):
        """Return the number of characters ``url`` should count for."""
        raise NotImplementedError


class _NullUrlFlattener(UrlFlattener):
    """Flattener that leaves links alone and measures URLs literally."""

    def replaceHref(self, text, url, ctx):
        """Never offer a replacement for the hyperlink."""
        return None

    def measureUrl(self, url):
        """Count a URL as exactly its number of characters."""
        url_length = len(url)
        return url_length


# Where strip_html() places the URLs collected from hyperlinks.
(
    # In the text itself, where the link was found.
    URLMODE_INLINE,
    # After the text, on the same line, separated by spaces.
    URLMODE_LAST,
    # After the text, one URL per line.
    URLMODE_BOTTOM_LIST,
) = range(3)

class HtmlStrippingContext:
    """Mutable state shared by the HTML stripping functions during one pass.

    It holds the options of the pass (URL mode, URL flattener, length limit)
    as well as what gets accumulated along the way (collected URLs, measured
    text length, whether the limit was hit).
    """

    def __init__(self):
        # Mode for inserting URLs
        self.url_mode = URLMODE_LAST
        # List of URLs to insert
        self.urls = []
        # Indices of URLs that should not get a leading whitespace
        self.nosp_urls = []
        # Object that can measure and shorten URLs
        self.url_flattener = _NullUrlFlattener()
        # Limit for how long the text can be (zero or less means no limit)
        self.limit = -1

        # Accumulated text length when accounting for shortened URLs
        self.text_length = 0
        # Whether limit was reached
        self.limit_reached = False

    def processText(self, txt, allow_shorten=True):
        """Account for ``txt`` and return the part of it that may be kept.

        If there is no limit, or ``txt`` still fits, its length is added to
        ``text_length`` and it is returned unchanged. Otherwise
        ``limit_reached`` is set and:

        * with ``allow_shorten``, ``txt`` is shortened (with a ``...``
          placeholder) to the room that is left, or dropped entirely when
          there is not even enough room for the placeholder;
        * without it, ``txt`` is dropped and an empty string is returned.
        """
        projected_length = self.text_length + len(txt)
        if self.limit <= 0 or projected_length <= self.limit:
            self.text_length = projected_length
            return txt

        # From here on we know the text doesn't fit.
        self.limit_reached = True
        if not allow_shorten:
            return ''

        placeholder = "..."
        room_left = self.limit - self.text_length
        if room_left < len(placeholder):
            # textwrap.shorten() raises ValueError when the width is not
            # positive or can't even hold the placeholder, which happens as
            # soon as earlier text has (nearly) filled the limit. There is
            # nothing meaningful to emit in that case.
            return ''

        short_txt = textwrap.shorten(
            txt,
            width=room_left,
            expand_tabs=False,
            replace_whitespace=False,
            placeholder=placeholder)
        self.text_length += len(short_txt)
        return short_txt


def get_best_text(entry, ctx=None, *, plain=True):
    """Return the most suitable text of an entry, or ``None`` if it has none.

    The entry is searched for an element with class ``p-title``, then
    ``p-name``, then ``e-content``; the first one found is used. With
    ``plain`` (the default) the element goes through ``strip_html`` using
    ``ctx``. Otherwise the element's children are returned as markup, one
    per line.
    """
    found = None
    for css_class in ('p-title', 'p-name', 'e-content'):
        found = entry.htmlFind(class_=css_class)
        if found:
            break

    if not found:
        return None

    if plain:
        return strip_html(found, ctx)

    markup = '\n'.join(str(child) for child in found.contents)
    return str(markup)


def strip_html(bs_elem, ctx=None):
    """Convert HTML into plain text, placing hyperlink URLs per the context.

    ``bs_elem`` is either a parsed element (anything exposing ``children``)
    or a string of markup, which gets parsed first. ``ctx`` is the
    ``HtmlStrippingContext`` to use; a default one is created when omitted.

    Returns the text. As a side effect, ``ctx`` accumulates the URLs that
    were found and the measured text length.
    """
    if isinstance(bs_elem, str):
        bs_elem = bs4.BeautifulSoup(bs_elem,
                                    'lxml' if has_lxml else 'html5lib')

    # Prepare stuff and run stripping on all HTML elements.
    if ctx is None:
        ctx = HtmlStrippingContext()
    outtxt = ''.join([_do_strip_html(c, ctx) for c in bs_elem.children])

    # If URLs are inline, insert them where we left our marker. If not, replace
    # our markers with an empty string and append the URLs at the end.
    # If we reached the limit with the text alone, replace URLs with empty
    # strings and bail out.
    keys = ['url:%d' % i for i in range(len(ctx.urls))]
    if not ctx.limit_reached and ctx.url_mode == URLMODE_INLINE:
        url_repl = [' ' + u for u in ctx.urls]
        # Some URLs didn't have any text to be placed next to, so for those
        # we don't need any extra space before.
        for i in ctx.nosp_urls:
            url_repl[i] = url_repl[i][1:]
        urls = dict(zip(keys, url_repl))
    else:
        urls = dict.fromkeys(keys, '')
    outtxt = outtxt % urls
    if ctx.limit_reached:
        return outtxt
    if ctx.url_mode != URLMODE_INLINE and ctx.urls:
        # The text can be empty at this point (e.g. when the content was
        # nothing but a bare link), so only look at its last character, and
        # only add a separator, when there is text to separate from.
        if ctx.url_mode == URLMODE_LAST:
            # Don't add unnecessary whitespace.
            # NOTE: our final measure of the text might be one character
            #       too long because of this, but that's desirable.
            if outtxt and outtxt[-1] not in string.whitespace:
                outtxt += ' '
            outtxt += ' '.join(ctx.urls)
        elif ctx.url_mode == URLMODE_BOTTOM_LIST:
            # If the last character of the text is a whitespace, replace
            # it with a newline.
            # NOTE: our final measure of the text might be one character
            #       too long because of this, but that's desirable.
            if outtxt:
                if outtxt[-1] in string.whitespace:
                    outtxt = outtxt[:-1] + '\n'
                else:
                    outtxt += '\n'
            outtxt += '\n'.join(ctx.urls)

    # Add the length of URLs to the text length.
    for url in ctx.urls:
        ctx.text_length += ctx.url_flattener.measureUrl(url)
    # Add spaces and other extra characters to the text length.
    if ctx.url_mode == URLMODE_INLINE:
        # One space per URL except the explicitly no-space-urls.
        ctx.text_length += len(ctx.urls) - len(ctx.nosp_urls)
    else:
        # One space or newline per URL.
        ctx.text_length += len(ctx.urls)

    return outtxt


def _escape_percents(txt):
    return txt.replace('%', '%%')


def _do_strip_html(elem, ctx):
    if isinstance(elem, bs4.NavigableString):
        # Don't necessarily include this bit of text...
        # If it belongs to a paragraph, include it. If not, include it
        # only if there are not paragraphs in its siblings (because that
        # means this is the white-space between the paragraph tags)
        include_this = False
        for parent in elem.parents:
            if parent and parent.name == 'p':
                include_this = True
                break
        else:
            next_sib = next(elem.next_siblings, None)
            prev_sib = next(elem.previous_siblings, None)
            if ((prev_sib is None or prev_sib.name != 'p') and
                (next_sib is None or next_sib.name != 'p')):
                include_this = True

        if include_this:
            raw_txt = str(elem)
            return _escape_percents(ctx.processText(raw_txt))
        else:
            return ''

    if elem.name == 'a':
        try:
            href = elem['href']
        except KeyError:
            href = None
        cnts = list(elem.contents)
        if len(cnts) == 1:
            # Use the URL flattener to reformat the hyperlink.
            href_txt = cnts[0].string
            old_text_length = ctx.text_length
            href_flattened = ctx.url_flattener.replaceHref(href_txt, href, ctx)
            if href_flattened is not None:
                # We have a reformatted URL. Use that, but check if the
                # flattener computed a custom text length. If not, do the
                # standard computation.
                if ctx.text_length == old_text_length:
                    return ctx.processText(href_flattened, False)
                return href_flattened

            # If we have a simple hyperlink where the text is a substring of
            # the target URL, just return the URL.
            if href_txt in href:
                a_txt = '%%(url:%d)s' % len(ctx.urls)
                ctx.nosp_urls.append(len(ctx.urls))
                ctx.urls.append(href)
                # No text length to add.
                return a_txt

        # No easy way to simplify this hyperlink... let's put a marker
        # for the URL to be later replaced in the text.
        # Text length is accumulated through recursive calls to _do_strip_html.
        a_txt = ''.join([_do_strip_html(c, ctx)
                         for c in cnts])
        a_txt += '%%(url:%d)s' % len(ctx.urls)
        ctx.urls.append(href)
        return a_txt

    if elem.name == 'ol':
        outtxt = ''
        for i, c in enumerate(elem.children):
            if c.name == 'li':
                outtxt += ('%s. ' % (i + 1)) + _do_strip_html(c, ctx)
                outtxt += '\n'
        return ctx.processText(outtxt)

    if elem.name == 'ul':
        outtxt = ''
        for c in elem.children:
            if c.name == 'li':
                outtxt += '- ' + _do_strip_html(c, ctx)
                outtxt += '\n'
        return ctx.processText(outtxt)

    return ''.join([_do_strip_html(c, ctx) for c in elem.children])


# Matches the end of a sentence: a period preceded by a word character,
# closing bracket/parenthesis, quote or another period, or a question or
# exclamation mark -- in both cases followed by one whitespace character.
re_sentence_end = re.compile(r'[\w\]\)\"\'\.]\.\s|[\?\!]\s')


def shorten_text(txt, limit):
    """Shorten ``txt`` so that it is at most ``limit`` characters long.

    Returns a ``(text, shortened)`` tuple, where ``shortened`` tells whether
    the text had to be cut. If the first sentence of the text fits within
    the limit, the text is cut right after it. Otherwise it is cut on a word
    boundary, with ``...`` appended.
    """
    if len(txt) <= limit:
        return (txt, False)

    m = re_sentence_end.search(txt)
    if m:
        # The match ends after the whitespace that follows the sentence,
        # which is not kept, hence the off-by-one allowance.
        sentence_end = m.end()
        if sentence_end <= (limit + 1):
            return (txt[:sentence_end - 1], True)

    shorter = textwrap.shorten(
        txt, width=limit, placeholder="...")
    return (shorter, True)