view silorider/commands/process.py @ 66:4caf6720d1dd

Only process silos that did not throw an exception in pre-process.
author Ludovic Chabant <ludovic@chabant.com>
date Tue, 26 Dec 2023 16:27:28 -0800
parents b7da3d97ea99
children c5bf03406a33
line wrap: on
line source

import logging
import dateparser
from .utils import get_named_silos, get_named_urls
from ..silos.base import SiloPostingContext, upload_silo_media
from ..parse import parse_url


logger = logging.getLogger(__name__)


def process_urls(ctx):
    for name, url in get_named_urls(ctx.config, ctx.args.url):
        logger.info("Processing %s" % url)
        p = Processor(ctx, name, url)
        p.process()


class Processor:
    def __init__(self, ctx, name, url):
        self.ctx = ctx
        self.name = name
        self.url = url
        self._silos = get_named_silos(ctx.silos, ctx.args.silo)

    @property
    def config(self):
        return self.ctx.config

    @property
    def silos(self):
        return self._silos

    def process(self):
        ok_silos = self.preProcess()

        # Get all silos to return a profile URL handler.
        profile_url_handlers = {}
        for silo in ok_silos:
            handler = silo.getProfileUrlHandler()
            if handler:
                profile_url_handlers[silo.SILO_TYPE] = handler

        postctx = SiloPostingContext(self.ctx, profile_url_handlers)
        feed = parse_url(self.url, self.name, self.config)
        for entry in feed.entries:
            self.processEntry(ok_silos, postctx, entry)

        self.postProcess(ok_silos)

    def preProcess(self):
        # Pre-parse the "since" and "until" dates/times.
        if self.ctx.args.since:
            self.ctx.args.since = dateparser.parse(self.ctx.args.since)
        if self.ctx.args.until:
            self.ctx.args.until = dateparser.parse(self.ctx.args.until)

        # Go over the silos needed for this command (i.e. potentially
        # filtered by passing `-s`) and call their `onPostStart`.
        ok_silos = []
        for silo in self.silos:
            try:
                silo.onPostStart(self.ctx)
                ok_silos.append(silo)
            except Exception as ex:
                logger.error("Error during pre-process of silo '%s'" % silo.name)
                logger.error(ex)
        return ok_silos

    def postProcess(self, silos):
        for silo in silos:
            silo.onPostEnd(self.ctx)

    def processEntry(self, silos, postctx, entry):
        entry_url = entry.get('url')
        if not entry_url:
            logger.warning("Found entry without a URL: %s" % repr(entry._mf_entry))
            return

        if self.isEntryFiltered(entry):
            logger.debug("Entry is filtered out: %s" % entry_url)
            return

        no_cache = self.ctx.args.no_cache
        only_since = self.ctx.args.since
        only_until = self.ctx.args.until

        logger.debug("Processing entry: %s" % entry_url)
        for silo in silos:
            if only_since or only_until:
                entry_dt = entry.get('published')
                if not entry_dt:
                    logger.warning(
                        "Skipping entry with no published date/time "
                        "for %s: %s" % (silo.name, entry_url))
                    continue

                # Strip entry datetime's time-zone information if we
                # don't have a time-zone info from the command line.
                if ((only_since and not only_since.tzinfo) or
                    (only_until and not only_until.tzinfo)):
                    entry_dt = entry_dt.replace(tzinfo=None)

                if only_since and entry_dt < only_since:
                    logger.info(
                        "Skipping entry older than specified date/time "
                        "for %s: %s" % (silo.name, entry_url))
                    continue
                if only_until and entry_dt > only_until:
                    logger.info(
                        "Skipping entry newer than specified date/time "
                        "for %s: %s" % (silo.name, entry_url))
                    continue

            if not no_cache and self.ctx.cache.wasPosted(silo.name, entry_url):
                logger.debug("Skipping already posted entry on %s: %s" %
                             (silo.name, entry_url))
                continue

            entry_card = silo.getEntryCard(entry, postctx)
            if not entry_card:
                logger.error("Can't find any content to use for entry: %s" % entry_url)
                continue

            media_callback = silo.mediaCallback
            if self.ctx.args.dry_run:
                media_callback = silo.dryRunMediaCallback
            media_ids = upload_silo_media(entry_card, 'photo', media_callback)

            if not self.ctx.args.dry_run:
                logger.debug("Posting to '%s': %s" % (silo.name, entry_url))
                try:
                    did_post = silo.postEntry(entry_card, media_ids, postctx)
                except Exception as ex:
                    did_post = False
                    logger.error("Error posting: %s" % entry_url)
                    logger.error(ex)
                    if self.ctx.args.verbose:
                        raise
                if did_post is True or did_post is None:
                    self.ctx.cache.addPost(silo.name, entry_url)
            else:
                logger.info("Would post to '%s': %s" % (silo.name, entry_url))
                silo.dryRunPostEntry(entry_card, media_ids, postctx)

    def isEntryFiltered(self, entry):
        if not self.config.has_section('filter'):
            return False

        items = self.config.items('filter')

        for name, value in items:
            if name.startswith('include_'):
                propname = name[8:]
                propvalue = entry.get(propname)
                for inc_val in value.split(','):
                    if inc_val in propvalue:
                        break
                else:
                    return True

            elif name.startswith('exclude_'):
                propname = name[8:]
                propvalue = entry.get(propname)
                for excl_val in value.split(','):
                    if excl_val in propvalue:
                        return True

        return False