view piecrust/admin/textutil.py @ 852:4850f8c21b6e

core: Start of the big refactor for PieCrust 3.0. * Everything is a `ContentSource`, including assets directories. * Most content sources are subclasses of the base file-system source. * A source is processed by a "pipeline", and there are 2 built-in pipelines, one for assets and one for pages. The asset pipeline is vaguely functional, but the page pipeline is completely broken right now. * Rewrite the baking process as just running appropriate pipelines on each content item. This should allow for better parallelization.
author Ludovic Chabant <ludovic@chabant.com>
date Wed, 17 May 2017 00:11:48 -0700
parents 5e91bc0e3b4d
children
line wrap: on
line source

from html.parser import HTMLParser


def text_preview(txt, length=100, *, max_length=None, offset=0):
    """Return a short preview of `txt`, avoiding a cut in the middle of a word.

    The preview starts at `offset` and always contains the next `length`
    characters. If more text follows, the preview is extended one character
    at a time until a whitespace character is found, the end of `txt` is
    reached, or `max_length` characters (counted from `offset`) have been
    taken. An ellipsis (`...`) is appended whenever some text was left out
    after the preview, including when the extension window ran out before
    any whitespace was found.

    Args:
        txt: The text to extract the preview from.
        length: The number of characters that are always kept.
        max_length: How far (counted from `offset`) the preview may be
            extended while looking for whitespace; the ellipsis is not
            counted. Defaults to `length + 50`. A falsy value such as `0`
            also selects the default.
        offset: The index in `txt` at which the preview starts.

    Returns:
        The preview string, with a trailing `...` if `txt` continues past
        the end of the preview.
    """
    max_length = max_length or (length + 50)
    tail_start = offset + length
    extract = txt[offset:tail_start]
    if len(txt) <= tail_start:
        # Everything from `offset` onwards fits: nothing was left out.
        return extract

    whitespace = (' ', '\t', '\r', '\n')
    tail_limit = min(offset + max_length, len(txt))
    tail = []
    for i in range(tail_start, tail_limit):
        c = txt[i]
        if c in whitespace:
            # Reached the end of the current word; the rest is dropped.
            truncated = True
            break
        tail.append(c)
    else:
        # No whitespace in the window (or the window was empty). Text was
        # left out unless the window went all the way to the end of `txt`.
        truncated = tail_limit < len(txt)

    extract += ''.join(tail)
    if truncated:
        extract += '...'
    return extract


class MLStripper(HTMLParser):
    """HTML parser that throws away markup and keeps only the text.

    Text chunks reported by the parser are accumulated in `fed`; call
    `get_data()` to obtain them joined into a single string.
    """

    def __init__(self):
        super().__init__()
        self.reset()
        # Collected text fragments, in the order the parser reported them.
        self.fed = []
        # Have the parser decode character references itself.
        self.convert_charrefs = True
        # NOTE(review): `strict` looks like a leftover from the older
        # HTMLParser API; presumably kept for older Python versions.
        self.strict = False

    def handle_data(self, d):
        """Record a chunk of text found between tags."""
        self.fed.append(d)

    def handle_entityref(self, name):
        """Record a named entity reference in its `&name;` source form.

        NOTE(review): per the `html.parser` documentation this handler is
        not invoked while `convert_charrefs` is true; it presumably only
        matters on Python versions without that option -- confirm.
        """
        entity = '&%s;' % name
        self.fed.append(entity)

    def get_data(self):
        """Return all the text collected so far as one string."""
        collected = self.fed
        return ''.join(collected)


def html_to_text(html):
    """Return the text content of `html` with the markup removed.

    Args:
        html: The HTML string to strip.

    Returns:
        The concatenation of all the text chunks found by `MLStripper`.
    """
    s = MLStripper()
    s.feed(html)
    # `feed()` may keep the end of the input buffered, waiting for more
    # data (for instance a trailing `&` that could be the start of a
    # character reference, as in "AT&T"). Closing the parser forces that
    # remaining text to be processed instead of being silently dropped.
    s.close()
    return s.get_data()