changeset 53:805c7d768caa
Make the feed parser more resilient, add the ability to modify the HTML doc.
- Handle errors found while parsing the HTML feed more gracefully.
- Add CSS-selector-based configuration to modify the HTML feed before parsing it. This makes it possible to use SiloRider with a feed that doesn't have proper microformats markup (see the configuration sketch below).
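For illustration, here is a hypothetical configuration excerpt showing how the new CSS-selector rules are wired up. Only the `[urls]` and `[classes:<name>]` section names and the special handling of `dt-published` come from this changeset; the feed name, URL, selectors, and classes below are made up:

```ini
[urls]
# "myblog" is the feed name; it ties the [classes:myblog] section to this URL.
myblog = https://example.org/blog.html

[classes:myblog]
# Each key is a CSS selector, each value is a microformats class to add to
# the matching elements before the feed is parsed.
article = h-entry
article h1 a = u-url
# "dt-published" is special-cased: the matched element's text is parsed as a
# date and wrapped in a <time class="dt-published" datetime="..."> tag.
article .post-date = dt-published
```
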
| author | Ludovic Chabant <ludovic@chabant.com> |
| --- | --- |
| date | Sun, 08 Oct 2023 13:52:26 -0700 |
| parents | 5d77532cd3ea |
| children | e3d2e13e8853 |
| files | silorider/commands/process.py silorider/commands/utils.py silorider/parse.py |
| diffstat | 3 files changed, 128 insertions(+), 43 deletions(-) |
--- a/silorider/commands/process.py Sun Oct 08 13:49:43 2023 -0700
+++ b/silorider/commands/process.py Sun Oct 08 13:52:26 2023 -0700
@@ -9,15 +9,16 @@


 def process_urls(ctx):
-    for url in get_named_urls(ctx.config, ctx.args.url):
+    for name, url in get_named_urls(ctx.config, ctx.args.url):
         logger.info("Processing %s" % url)
-        p = Processor(ctx, url)
+        p = Processor(ctx, name, url)
         p.process()


 class Processor:
-    def __init__(self, ctx, url):
+    def __init__(self, ctx, name, url):
         self.ctx = ctx
+        self.name = name
         self.url = url
         self._silos = get_named_silos(ctx.silos, ctx.args.silo)

@@ -32,7 +33,7 @@
     def process(self):
         self.preProcess()

-        feed = parse_url(self.url)
+        feed = parse_url(self.url, self.name, self.config)

         for entry in feed.entries:
             self.processEntry(entry)
--- a/silorider/commands/utils.py Sun Oct 08 13:49:43 2023 -0700
+++ b/silorider/commands/utils.py Sun Oct 08 13:52:26 2023 -0700
@@ -1,5 +1,5 @@
 import logging
-import datetime
+import dateutil.parser

 from ..parse import parse_url

@@ -11,9 +11,9 @@
     if config.has_section('urls'):
         named_urls = config.items('urls')

     if not names:
-        return [url for (_, url) in named_urls]
+        return named_urls

-    return [url for (name, url) in named_urls
+    return [(name, url) for (name, url) in named_urls
             if name in names]

@@ -34,9 +34,7 @@


 def populate_cache(ctx):
-    import dateutil.parser
-
-    urls = get_named_urls(ctx.config, ctx.args.url)
+    named_urls = get_named_urls(ctx.config, ctx.args.url)

     until_dt = None
     if ctx.args.until:
@@ -44,15 +42,15 @@
         logger.debug("Populating cache until: %s" % until_dt)
         until_dt = until_dt.timestamp()

-    for url in urls:
+    for name, url in named_urls:
         logger.info("Caching entries from %s" % url)
-        _populate_cache_for_url(url, ctx, until_dt=until_dt)
+        _populate_cache_for_url(name, url, ctx, until_dt=until_dt)


-def _populate_cache_for_url(url, ctx, until_dt=None):
+def _populate_cache_for_url(name, url, ctx, until_dt=None):
     silos = get_named_silos(ctx.silos, ctx.args.silo)
-    feed = parse_url(url)
+    feed = parse_url(url, name, ctx.config)

     for entry in feed.entries:
         entry_url = entry.get('url')
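With the change to `get_named_urls()` above, callers now receive `(name, url)` pairs instead of bare URLs. A minimal sketch of that contract using `configparser` directly, with the same hypothetical config as above (this mirrors the helper's behavior rather than reusing the actual SiloRider code):

```python
import configparser

config = configparser.ConfigParser()
config.read_string("""
[urls]
myblog = https://example.org/blog.html
notes = https://example.org/notes.html
""")

# config.items('urls') yields (name, url) tuples, which is what the patched
# get_named_urls() now returns when no specific names are requested.
for name, url in config.items('urls'):
    # The name travels alongside the URL so that parse_url() can later look
    # up a matching [classes:<name>] section.
    print(name, url)
```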
--- a/silorider/parse.py Sun Oct 08 13:49:43 2023 -0700
+++ b/silorider/parse.py Sun Oct 08 13:52:26 2023 -0700
@@ -1,13 +1,28 @@
 import os.path
 import logging
-import datetime
+import configparser
+import urllib.request
+import bs4
+import mf2py
+import dateutil.parser
+from datetime import datetime, date, timezone, timedelta
+from .config import has_lxml


 logger = logging.getLogger(__name__)

+default_dt = datetime.fromtimestamp(0, tz=timezone(timedelta(0)))

-def parse_url(url_or_path):
-    mf_obj = parse_mf2(url_or_path)
+
+def _get_entry_published_dt(entry):
+    dt = entry.get('published', default_dt)
+    if isinstance(dt, date):
+        dt = datetime.combine(dt, datetime.now().time())
+    return dt
+
+
+def parse_url(url_or_path, name, config):
+    mf_obj = parse_mf2(url_or_path, name, config)
     matcher = EntryMatcher(mf_obj.to_dict(), mf_obj.__doc__)

     feed = Feed(url_or_path, matcher.mf_dict)
@@ -26,26 +41,79 @@

     sorted_entries = sorted(
         entries,
-        key=lambda e: e.get(
-            'published', datetime.datetime.fromtimestamp(
-                0,
-                tz=datetime.timezone(datetime.timedelta(0)))),
+        key=_get_entry_published_dt,
         reverse=False)

     feed.entries = sorted_entries
+    logger.debug("Parsed %d entries for: %s" % (len(sorted_entries), url_or_path))
     return feed


-def parse_mf2(url_or_path):
-    import mf2py
-    logger.debug("Fetching %s..." % url_or_path)
+def parse_mf2(url_or_path, name, config):
+    # Get the URL or file contents.
+    logger.debug("Fetching %s" % url_or_path)
     if os.path.exists(url_or_path):
-        obj = open(url_or_path, 'r', encoding='utf8')
-        params = {'doc': obj}
+        with open(url_or_path, 'r', encoding='utf8') as fp:
+            html_raw = fp.read()
     else:
-        params = {'url': url_or_path}
+        with urllib.request.urlopen(url_or_path) as req:
+            html_raw = req.read()
+
+    # Load this into an HTML document and optionally patch it.
+    html_doc = bs4.BeautifulSoup(
+        html_raw,
+        'lxml' if has_lxml else 'html5lib')
+    _modify_html_doc(html_doc, name, config)
+
+    # Parse the microformats!
     return mf2py.Parser(
-        html_parser='html5lib', img_with_alt=True, **params)
+        doc=html_doc,
+        html_parser='html5lib',
+        img_with_alt=True)
+
+
+def _modify_html_doc(doc, name, config):
+    try:
+        class_mods = config.items('classes:%s' % name)
+    except configparser.NoSectionError:
+        return
+
+    logger.debug("Modifying HTML doc:")
+    for selector, to_add in class_mods:
+        elems = list(doc.select(selector))
+        if not elems:
+            logger.warning("No elements matched by rule: %s" % selector)
+            continue
+        for elem in elems:
+            logger.debug("Adding %s to %s" % (to_add, elem.name))
+            if to_add == 'dt-published':
+                _insert_html_datetime_published(doc, elem)
+            else:
+                if 'class' not in elem.attrs:
+                    elem['class'] = []
+                elem['class'].append(to_add)
+
+
+def _insert_html_datetime_published(doc, elem):
+    dt_str = str(elem.string)
+    try:
+        dt = dateutil.parser.parse(dt_str)
+    except dateutil.parser.ParseError as err:
+        logger.error("Can't parse published date: %s" % err)
+        return
+
+    if dt.hour == 0 and dt.minute == 0 and dt.second == 0:
+        now_time = datetime.now().time()
+        dt = datetime.combine(dt.date(), now_time)
+
+    time_el = doc.new_tag('time')
+    time_el['class'] = ['dt-published']
+    time_el['datetime'] = dt.isoformat(' ', 'seconds')
+    time_el.append(dt_str)
+
+    elem.clear()
+    elem.append(time_el)
+    logger.debug("Adding datetime attribute: %s" % dt)


 class InvalidEntryException(Exception):
@@ -155,24 +223,42 @@
         next_el = {}

         items = mf_dict.get('items', [])
-        if len(items) == 1 and items[0]['type'][0] == 'h-feed':
-            items = items[0].get('children', [])
-
-        for e in items:
-            types = e.get('type')
-            if not types:
+        for item in items:
+            item_types = item.get('type', [])
+            if 'h-feed' not in item_types:
                 continue

-            entry_type = types[0]
-            if entry_type not in els_by_type:
-                ebt = list(bf_doc.find_all(class_=entry_type))
-                els_by_type[entry_type] = ebt
-                next_el[entry_type] = 0
+            children = item.get('children', [])
+            logger.debug("Matching %d feed items" % len(children))
+            for e in children:
+                e_types = e.get('type')
+                if not e_types:
+                    continue
+
+                # We only look at the first type on any element.
+                entry_type = e_types[0]

-            els = els_by_type[entry_type]
-            e_and_el = (e, els[next_el[entry_type]])
-            self.entries.append(e_and_el)
-            next_el[entry_type] += 1
+                # Get the list of all elements of that type from the doc.
+                if entry_type not in els_by_type:
+                    ebt = list(bf_doc.find_all(class_=entry_type))
+                    els_by_type[entry_type] = ebt
+                    next_el[entry_type] = 0
+                    if len(ebt) == 0:
+                        logger.warning("Found no elements of type: %s" % entry_type)
+
+                # We figure that mf2py found elements in the same order as
+                # they are found in the document, so we associate the two
+                # in order.
+                els = els_by_type[entry_type]
+                try:
+                    e_and_el = (e, els[next_el[entry_type]])
+                    self.entries.append(e_and_el)
+                except IndexError:
+                    logger.error(
+                        "Ran out of elements in document! Found %d elements "
+                        "of type '%s' but was trying to get element %d" %
+                        (len(els), str(e_types), next_el[entry_type]))
+                next_el[entry_type] += 1


 def strip_img_alt(photos):
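To make the `_modify_html_doc()` / `_insert_html_datetime_published()` logic above easier to follow, here is a self-contained sketch of the same technique, assuming `beautifulsoup4` (with `html5lib`) and `python-dateutil` are installed: select elements with CSS selectors, append microformats classes, and wrap a bare date in a `<time class="dt-published">` tag. The HTML snippet and the selector/class pairs are invented for the example; SiloRider drives the real thing from the `[classes:<name>]` config section:

```python
import bs4
import dateutil.parser

# Invented HTML fragment that lacks microformats markup.
html = """
<article>
  <h1><a href="/post/1">Hello world</a></h1>
  <span class="date">2023-10-08</span>
</article>
"""

doc = bs4.BeautifulSoup(html, 'html5lib')

# Hypothetical (selector, class-to-add) rules, like a [classes:<name>] section.
rules = [
    ('article', 'h-entry'),
    ('article h1 a', 'u-url'),
    ('article .date', 'dt-published'),
]

for selector, to_add in rules:
    for elem in doc.select(selector):
        if to_add == 'dt-published':
            # Replace the element's text with a <time> tag carrying a
            # machine-readable datetime attribute.
            text = str(elem.string)
            dt = dateutil.parser.parse(text)
            time_el = doc.new_tag('time')
            time_el['class'] = ['dt-published']
            time_el['datetime'] = dt.isoformat(' ', 'seconds')
            time_el.append(text)
            elem.clear()
            elem.append(time_el)
        else:
            # Append the class, keeping any classes already on the element.
            elem['class'] = elem.get('class', []) + [to_add]

print(doc.prettify())
```

Running this prints the patched document, which mf2py can then parse as if the feed had proper `h-entry` markup.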