view silorider/format.py @ 29:20d4cf433704

Improve media attachment MIME detection, and fallback to JPEG on fail
author Ludovic Chabant <ludovic@chabant.com>
date Wed, 19 Apr 2023 12:48:58 -0700
parents c898b4df0f29
children 9e4eb3f2754e
line wrap: on
line source

import re
import textwrap
import bs4
from .config import has_lxml


def format_entry(entry, limit=None, add_url='auto', url_flattener=None):
    """Render a feed entry as plain text, optionally shortened and suffixed
    with its permalink.

    entry: object exposing `url`, `is_micropost`, and `htmlFind` (used by
           `get_best_text`).
    limit: maximum character budget for the result, or None for no limit.
    add_url: True to always append the entry URL, False/None to never do so,
             or 'auto' to append it for full posts and for microposts that
             had to be shortened.
    url_flattener: optional UrlFlattener used while stripping HTML.

    Raises Exception when no usable text is found, or when the URL leaves
    no room for any text under `limit`.
    """
    url = entry.url

    strip_ctx = HtmlStrippingContext()
    if url_flattener:
        strip_ctx.url_flattener = url_flattener

    name = get_best_text(entry, strip_ctx)
    if not name:
        raise Exception("Can't find best text for entry: %s" % url)

    append_url = (add_url is True or
                  (add_url == 'auto' and not entry.is_micropost))
    if limit:
        if append_url and url:
            # Reserve room for a separating space plus the flattened URL.
            limit -= 1 + strip_ctx.url_flattener.measureUrl(url)

        if strip_ctx.text_length > limit:
            # The text must be shortened. If the URL wasn't budgeted for
            # yet, do it now -- in 'auto' mode a shortened micropost gets
            # its URL appended so readers can find the full text.
            if url and not append_url and add_url == 'auto':
                append_url = True
                limit -= 1 + strip_ctx.url_flattener.measureUrl(url)

        if limit <= 0:
            raise Exception("Can't shorten post name.")

        name = textwrap.shorten(name, width=limit, placeholder="...")

    if append_url and url:
        name = name + ' ' + url
    return name


class UrlFlattener:
    """Abstract strategy for rewriting and measuring hyperlinks while
    stripping HTML. Subclasses must implement both methods."""

    def replaceHref(self, text, url, ctx):
        """Return replacement text for a hyperlink, or None to let the
        caller handle it with its default logic."""
        raise NotImplementedError()

    def measureUrl(self, url):
        """Return how many characters the given URL counts for."""
        raise NotImplementedError()


class _NullUrlFlattener(UrlFlattener):
    """Default flattener: never rewrites links, and counts a URL as its
    plain character length."""

    def replaceHref(self, text, url, ctx):
        # No rewriting -- the caller falls back to its own handling.
        return None

    def measureUrl(self, url):
        # A URL costs exactly as many characters as it contains.
        return len(url)


# How URLs collected during HTML stripping get re-inserted into the text
# (see strip_html):
URLMODE_INLINE = 0       # each URL placed where its hyperlink appeared
URLMODE_LAST = 1         # all URLs appended on the same line, space-separated
URLMODE_BOTTOM_LIST = 2  # all URLs appended at the bottom, one per line

class HtmlStrippingContext:
    """Accumulates state across one HTML-to-text stripping pass."""
    def __init__(self):
        # How collected URLs are re-inserted (see URLMODE_* constants).
        self.url_mode = URLMODE_BOTTOM_LIST
        # URLs gathered while stripping, in order of appearance.
        self.urls = []
        # Indices into `urls` that should be inserted without a leading space.
        self.nosp_urls = []
        # Strategy used to rewrite and measure hyperlinks.
        self.url_flattener = _NullUrlFlattener()
        # Running count of visible characters produced so far
        # (URLs measured through the flattener).
        self.text_length = 0



def get_best_text(entry, ctx=None, *, plain=True):
    """Pick the best text element from a microformats2 entry.

    Tries, in order, the element classed `p-title`, `p-name`, then
    `e-content`. Returns stripped plain text when `plain` is true,
    the raw inner HTML otherwise, or None when no element is found.
    """
    elem = None
    for mf2_class in ('p-title', 'p-name', 'e-content'):
        elem = entry.htmlFind(class_=mf2_class)
        if elem:
            break

    if not elem:
        return None
    if plain:
        return strip_html(elem, ctx)
    return str('\n'.join([str(c) for c in elem.contents]))


def strip_html(bs_elem, ctx=None):
    """Convert an HTML fragment (BeautifulSoup element or raw string) into
    plain text, collecting hyperlink URLs into `ctx` and re-inserting them
    according to `ctx.url_mode`. Also accumulates the visible text length
    in `ctx.text_length`. Returns the resulting text.
    """
    if isinstance(bs_elem, str):
        bs_elem = bs4.BeautifulSoup(bs_elem,
                                    'lxml' if has_lxml else 'html5lib')

    # Prepare stuff and run stripping on all HTML elements.
    # _do_strip_html leaves '%(url:N)s' placeholders for hyperlinks and
    # escapes literal '%' so the text is safe for %-formatting below.
    outtxt = ''
    if ctx is None:
        ctx = HtmlStrippingContext()
    for c in bs_elem.children:
        outtxt += _do_strip_html(c, ctx)

    # If URLs are inline, insert them where we left our marker. If not, replace
    # our markers with an empty string and append the URLs at the end.
    keys = ['url:%d' % i for i in range(len(ctx.urls))]
    if ctx.url_mode == URLMODE_INLINE:
        url_repl = [' ' + u for u in ctx.urls]
        # Some URLs didn't have any text to be placed next to, so for those
        # we don't need any extra space before.
        for i in ctx.nosp_urls:
            url_repl[i] = url_repl[i][1:]
        urls = dict(zip(keys, url_repl))
    else:
        urls = dict(zip(keys, [''] * len(ctx.urls)))
    outtxt = outtxt % urls
    if ctx.url_mode != URLMODE_INLINE and ctx.urls:
        outtxt = outtxt.rstrip()
        if ctx.url_mode == URLMODE_LAST:
            outtxt += ' ' + ' '.join(ctx.urls)
        elif ctx.url_mode == URLMODE_BOTTOM_LIST:
            outtxt += '\n' + '\n'.join(ctx.urls)

    # Add the length of URLs to the text length.
    for url in ctx.urls:
        ctx.text_length += ctx.url_flattener.measureUrl(url)
    # Add spaces and other extra characters to the text length.
    if ctx.url_mode == URLMODE_INLINE:
        # One space per URL except the explicitly no-space-urls.
        ctx.text_length += len(ctx.urls) - len(ctx.nosp_urls)
    else:
        # One space or newline per URL, plus the first one.
        # NOTE(review): n URLs only produce n separator characters
        # ('\n' + '\n'.join adds 1 + (n-1)), so the '+ 1' here looks like
        # it overcounts by one -- confirm whether this is intentional slack.
        ctx.text_length += len(ctx.urls) + 1

    return outtxt


def _escape_percents(txt):
    return txt.replace('%', '%%')


def _do_strip_html(elem, ctx):
    """Recursively convert one BeautifulSoup node into plain text.

    Accumulates visible character counts and hyperlink URLs into `ctx`
    (an HtmlStrippingContext). Hyperlinks are not inlined here: a
    '%(url:N)s' placeholder is emitted instead, resolved later by
    strip_html according to ctx.url_mode.
    """
    if isinstance(elem, bs4.NavigableString):
        # Plain text node: count it and escape '%' so the later
        # %-formatting pass in strip_html doesn't misinterpret it.
        raw_txt = str(elem)
        ctx.text_length += len(raw_txt)
        return _escape_percents(raw_txt)

    if elem.name == 'a':
        try:
            href = elem['href']
        except KeyError:
            href = None
        cnts = list(elem.contents)
        if len(cnts) == 1:
            # Use the URL flattener to reformat the hyperlink.
            href_txt = cnts[0].string
            old_text_length = ctx.text_length
            href_flattened = ctx.url_flattener.replaceHref(href_txt, href, ctx)
            if href_flattened is not None:
                # We have a reformatted URL. Use that, but check if the
                # flattener computed a custom text length. If not, do the
                # standard computation.
                if ctx.text_length == old_text_length:
                    ctx.text_length += len(href_flattened)
                return href_flattened

            # If we have a simple hyperlink where the text is a substring of
            # the target URL, just return the URL.
            # Guard against a missing href or an anchor with no string
            # content (e.g. <a><img/></a>), where 'in' would raise TypeError.
            if href is not None and href_txt is not None and href_txt in href:
                a_txt = '%%(url:%d)s' % len(ctx.urls)
                ctx.nosp_urls.append(len(ctx.urls))
                ctx.urls.append(href)
                # No text length to add.
                return a_txt

        # No easy way to simplify this hyperlink... let's put a marker
        # for the URL to be later replaced in the text.
        # Text length is accumulated through recursive calls to _do_strip_html.
        a_txt = ''.join([_do_strip_html(c, ctx)
                         for c in cnts])
        a_txt += '%%(url:%d)s' % len(ctx.urls)
        ctx.urls.append(href)
        return a_txt

    if elem.name == 'ol':
        outtxt = ''
        # Number only actual <li> children: enumerating elem.children
        # also counts whitespace text nodes between items, which produced
        # wrong (skipping) item numbers.
        li_num = 1
        for c in elem.children:
            if c.name == 'li':
                outtxt += ('%s. ' % li_num) + _do_strip_html(c, ctx)
                outtxt += '\n'
                li_num += 1
        # NOTE(review): item text is counted inside the recursive call and
        # again here via len(outtxt) -- looks double-counted; confirm.
        ctx.text_length += len(outtxt)
        return outtxt

    if elem.name == 'ul':
        outtxt = ''
        for c in elem.children:
            if c.name == 'li':
                outtxt += '- ' + _do_strip_html(c, ctx)
                outtxt += '\n'
        # NOTE(review): same possible double-count as the <ol> branch.
        ctx.text_length += len(outtxt)
        return outtxt

    # Any other element: drop the tag and concatenate its children's text.
    return ''.join([_do_strip_html(c, ctx) for c in elem.children])


# Matches the end of a sentence: a word-ish character followed by a period
# and whitespace, or a '?'/'!' followed by whitespace.
re_sentence_end = re.compile(r'[\w\]\)\"\'\.]\.\s|[\?\!]\s')


def shorten_text(txt, limit):
    """Shorten `txt` to at most `limit` characters.

    Prefers cutting at the first sentence boundary when it fits within the
    limit; otherwise falls back to word-boundary truncation with a '...'
    placeholder. Returns a `(text, was_shortened)` tuple.
    """
    if len(txt) <= limit:
        return (txt, False)

    m = re_sentence_end.search(txt)
    # BUG FIX: `m.end` is a method on re.Match -- comparing the bound
    # method itself to an int raised TypeError. Call it to get the index.
    if m and m.end() <= (limit + 1):
        # Cut just before the trailing whitespace matched by the pattern.
        return (txt[:m.end() - 1], True)

    shorter = textwrap.shorten(
        txt, width=limit, placeholder="...")
    return (shorter, True)