Mercurial > silorider
view silorider/commands/process.py @ 18:a921cc2306bc
Do our own HTML parsing/stripping of micropost contents.
- This lets us properly handle various forms of linking.
- Add tests for processing posts with links.
- Fix configuration in tests.
- Basic error handling for processing posts.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Sun, 16 Sep 2018 21:16:20 -0700 |
parents | cb1dc5c864d8 |
children | b6a127ca3727 |
line wrap: on
line source
"""Process feed URLs and post their entries to the configured silos."""
import logging
from .utils import get_named_silos, get_named_urls
from ..silos.base import SiloPostingContext
from ..parse import parse_url


logger = logging.getLogger(__name__)


def process_urls(ctx):
    """Run a `Processor` over every URL selected for this invocation.

    The URLs come from `get_named_urls(ctx.config, ctx.args.url)`.
    """
    for url in get_named_urls(ctx.config, ctx.args.url):
        logger.info("Processing %s", url)
        Processor(ctx, url).process()


class Processor:
    """Parse one URL and post each of its entries to the selected silos.

    Reads from `ctx`: `config`, `silos`, `cache`, and the `args` attributes
    `silo`, `no_cache`, `dry_run` and `verbose`.
    """

    def __init__(self, ctx, url):
        self.ctx = ctx
        self.url = url
        self._silos = get_named_silos(ctx.silos, ctx.args.silo)

    @property
    def config(self):
        """The configuration object held by the execution context."""
        return self.ctx.config

    @property
    def silos(self):
        """The silos selected for this run."""
        return self._silos

    def process(self):
        """Parse `self.url` and process every entry of the resulting feed."""
        # NOTE(review): if parsing or posting raises, `postProcess()` (and
        # therefore each silo's `onPostEnd()`) is never reached -- confirm
        # whether silos rely on that hook for cleanup.
        self.preProcess()

        feed = parse_url(self.url)
        for entry in feed.entries:
            self.processEntry(entry)

        self.postProcess()

    def preProcess(self):
        """Notify every silo that a posting run is starting."""
        for silo in self.silos:
            silo.onPostStart()

    def postProcess(self):
        """Notify every silo that the posting run is over."""
        for silo in self.silos:
            silo.onPostEnd()

    def processEntry(self, entry):
        """Post `entry` to each silo that has not already received it.

        Entries without a `url`, or rejected by `isEntryFiltered`, are
        skipped. With `args.dry_run` set nothing is posted or cached; with
        `args.no_cache` set the "already posted" check is bypassed.

        Raises:
            Exception: whatever `silo.postEntry` raised, re-raised after
                being logged, but only when `args.verbose` is set.
        """
        entry_url = entry.get('url')
        if not entry_url:
            logger.warning("Found entry without a URL.")
            return

        if self.isEntryFiltered(entry):
            logger.debug("Entry is filtered out: %s", entry_url)
            return

        postctx = SiloPostingContext(self.ctx)
        args = self.ctx.args
        cache = self.ctx.cache
        no_cache = args.no_cache
        logger.debug("Processing entry: %s", entry_url)
        for silo in self.silos:
            if not no_cache and cache.wasPosted(silo.name, entry_url):
                logger.debug("Skipping already posted entry on %s: %s",
                             silo.name, entry_url)
                continue

            if args.dry_run:
                logger.info("Would post entry on %s: %s",
                            silo.name, entry_url)
                continue

            try:
                did_post = silo.postEntry(entry, postctx)
            except Exception as ex:
                # Best effort: one failing silo must not prevent posting to
                # the remaining ones, unless the user asked for verbosity.
                did_post = False
                logger.error("Error posting: %s", entry_url)
                logger.error(ex)
                if args.verbose:
                    raise

            # A silo returning nothing (None) is treated like an explicit
            # True; any other value leaves the entry out of the cache.
            if did_post is True or did_post is None:
                cache.addPost(silo.name, entry_url)

    def isEntryFiltered(self, entry):
        """Return True when the `filter` config section rejects `entry`.

        Each option named `include_<prop>` / `exclude_<prop>` holds a
        comma-separated list of values that are tested with `in` against
        `entry.get(<prop>)`:

        - `include_<prop>`: the entry is rejected unless at least one value
          is found in the property.
        - `exclude_<prop>`: the entry is rejected as soon as one value is
          found in the property.

        An entry that lacks the property altogether fails every `include_`
        rule and can never trigger an `exclude_` rule.
        """
        if not self.config.has_section('filter'):
            return False

        for name, value in self.config.items('filter'):
            if name.startswith('include_'):
                propvalue = entry.get(name[len('include_'):])
                # A missing property (None) cannot contain anything; testing
                # `in None` would raise TypeError, so reject it explicitly.
                if propvalue is None:
                    return True
                if not any(val in propvalue for val in value.split(',')):
                    return True

            elif name.startswith('exclude_'):
                propvalue = entry.get(name[len('exclude_'):])
                # Nothing to exclude on when the property is absent.
                if propvalue is None:
                    continue
                if any(val in propvalue for val in value.split(',')):
                    return True

        return False