changeset 1114:8af2ea1f5c34

tasks: Add new `tasks` command and infrastructure, with `mention` task. * The new command lets `chef` run tasks from a queue. * The webmention endpoint now adds a mention task. * Moved mention handling code to a task runner.
author Ludovic Chabant <ludovic@chabant.com>
date Thu, 22 Feb 2018 22:12:45 -0800
parents 29c51b981c17
children 11b9d0c8bd62
files piecrust/admin/siteinfo.py piecrust/admin/views/mentions.py piecrust/commands/builtin/tasks.py piecrust/pathutil.py piecrust/plugins/base.py piecrust/plugins/builtin.py piecrust/tasks/base.py piecrust/tasks/mentions.py setup.py
diffstat 9 files changed, 385 insertions(+), 110 deletions(-) [+]
line wrap: on
line diff
--- a/piecrust/admin/siteinfo.py	Wed Feb 21 21:21:42 2018 -0800
+++ b/piecrust/admin/siteinfo.py	Thu Feb 22 22:12:45 2018 -0800
@@ -128,6 +128,13 @@
         except subprocess.TimeoutExpired:
             flash("Publish process is still running... check the log later.")
 
+    def runTask(self, task_id):
+        """ Runs the queued task `task_id` via `chef tasks run -t`. """
+        chef_args = ['--no-color']
+        chef_args += ['tasks', 'run']
+        chef_args += ['-t', task_id]
+        self._runChef(chef_args)
+
     def _runChef(self, args):
         chef_path = os.path.realpath(os.path.join(
             os.path.dirname(__file__),
--- a/piecrust/admin/views/mentions.py	Wed Feb 21 21:21:42 2018 -0800
+++ b/piecrust/admin/views/mentions.py	Thu Feb 22 22:12:45 2018 -0800
@@ -1,14 +1,7 @@
-import os
-import os.path
-import json
-import time
 import logging
-import requests
-from bs4 import BeautifulSoup
-from flask import current_app, g, request, make_response, abort
+from flask import g, request, make_response, abort
 from ..blueprint import foodtruck_bp
-from piecrust.app import PieCrustFactory
-from piecrust.serving.util import get_requested_page
+from piecrust.tasks.base import TaskManager
 
 
 logger = logging.getLogger(__name__)
@@ -26,105 +19,18 @@
         logger.error("Source and target are the same.")
         abort(400)
 
-    # See if we need to do this synchronously or asynchronously, and other
-    # things we should know up-front.
-    wmcfg = g.site.piecrust_app.config.get('webmention')
-    if wmcfg.get('use_task_queue') is True:
-        tasks_dir = os.path.join(g.site.piecrust_app.root_dir, '_tasks')
-        _ensure_dir(tasks_dir)
-        task_data = {
-            'type': 'webmention',
-            'data': {'source': src_url, 'target': tgt_url}}
-        task_path = os.path.join(tasks_dir, '%s.json' % int(time.time()))
-        with open(task_path, 'w', encoding='utf8') as fp:
-            json.dump(task_data, fp)
-        return make_response("Webmention queued.", 202, [])
-
-    # Find if we have a page at the target URL.
-    # To do that we need to spin up a PieCrust app that knows how the website
-    # works. Because the website might have been baked with custom settings
-    # (usually the site root URL) there's a good chance we need to apply
-    # some variants, which the user can specify in the config.
-    pcappfac = PieCrustFactory(
-        current_app.config['FOODTRUCK_ROOT_DIR'],
-        cache_key='webmention')
-    if wmcfg.get('config_variant'):
-        pcappfac.config_variants = [wmcfg.get('config_variant')]
-    if wmcfg.get('config_variants'):
-        pcappfac.config_variants = list(wmcfg.get('config_variants'))
-    if wmcfg.get('config_values'):
-        pcappfac.config_values = list(wmcfg.get('config_values').items())
-    pcapp = pcappfac.create()
-    try:
-        req_page = get_requested_page(pcapp, tgt_url)
-        if req_page.page is None:
-            abort(404)
-    except Exception as ex:
-        logger.error("Can't check webmention target page: %s" % tgt_url)
-        logger.exception(ex)
-        abort(404)
-
-    # Grab the source URL's contents and see if anything references the
-    # target (ours) URL.
-    src_t = requests.get(src_url)
-    src_html = BeautifulSoup(src_t.text, 'html.parser')
-    for link in src_html.find_all('a'):
-        href = link.get('href')
-        if href == tgt_url:
-            break
-    else:
-        logger.error("Source '%s' doesn't link to target: %s" %
-                     (src_url, tgt_url))
-        abort(400)
+    # Create the task for handling this mention.
+    pcapp = g.site.piecrust_app
+    task_manager = TaskManager(pcapp)
+    task_id = task_manager.createTask('mention', {
+        'source': src_url,
+        'target': tgt_url})
 
-    # Find something to quote for this webmention. We find an `h-entry`
-    # to get a title, excerpt, and/or text.
-    blurb = None
-    hentry = src_html.find(class_='h-entry')
-    if hentry:
-        try:
-            pname = hentry.find(class_='p-name')
-            pauthor = hentry.find(class_='p-author')
-            blurb = {
-                'pname': _bs4_contents_str(pname),
-                'pauthor': _bs4_contents_str(pauthor)}
-        except:  # NOQA
-            logger.error("Couldn't get h-entry info.")
-
-    dirname, _ = os.path.splitext(req_page.page.content_spec)
-    dirname += '-assets'
-    _ensure_dir(dirname)
-    mention_path = os.path.join(dirname, 'mentions.json')
-    try:
-        with open(mention_path, 'r', encoding='utf-8') as fp:
-            mention = json.load(fp)
-    except IOError:
-        mention = {'mentions': []}
+    # Either run the task now in a background process (for cheap and simple
+    # setups), or leave the task there to be picked up later when someone
+    # runs the task queue eventually.
+    wmcfg = pcapp.config.get('webmention')
+    if not wmcfg.get('use_task_queue'):
+        g.site.runTask(task_id)
 
-    for m in mention['mentions']:
-        if m['source'] == src_url:
-            return
-
-    new_mention = {'source': src_url}
-    if blurb:
-        new_mention.update(blurb)
-
-    mention['mentions'].append(new_mention)
-
-    with open(mention_path, 'w', encoding='utf-8') as fp:
-        json.dump(mention, fp)
-    logger.info("Received webmention from: %s" % src_url)
-
-    return make_response(("Webmention received.", 202, []))
-
-
-def _bs4_contents_str(node):
-    return ''.join([str(c).strip() for c in node.contents])
-
-
-def _ensure_dir(path, mode=0o775):
-    try:
-        os.makedirs(path, mode=mode, exist_ok=True)
-    except OSError:
-        pass
-
+    return make_response("Webmention queued.", 202, [])
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/piecrust/commands/builtin/tasks.py	Thu Feb 22 22:12:45 2018 -0800
@@ -0,0 +1,64 @@
+import os.path
+import logging
+from piecrust.commands.base import ChefCommand
+
+
+logger = logging.getLogger(__name__)
+
+
+class TasksCommand(ChefCommand):
+    """ Command for managing and running task queues.
+    """
+    def __init__(self):
+        super().__init__()
+        self.name = 'tasks'
+        self.description = "Manages and runs various tasks."
+
+    def setupParser(self, parser, app):
+        """ Registers the `list` and `run` sub-commands and their options. """
+        subparsers = parser.add_subparsers()
+
+        p = subparsers.add_parser(
+            'list',
+            help="Show the list of tasks current in the queue.")
+        p.set_defaults(sub_func=self._listTasks)
+
+        p = subparsers.add_parser(
+            'run',
+            help="Runs the current task queue.")
+        p.add_argument(
+            '-k', '--keep-queue', action='store_true',
+            help="Don't delete the task queue files.")
+        p.add_argument(
+            '-t', '--task',
+            help="Specify which task to run.")
+        p.set_defaults(sub_func=self._runTasks)
+
+    def run(self, ctx):
+        # `sub_func` is only set when a sub-command was given.
+        if hasattr(ctx.args, 'sub_func'):
+            ctx.args.sub_func(ctx)
+
+    def _listTasks(self, ctx):
+        """ Logs the type and relative path of each task in the queue. """
+        from piecrust.tasks.base import TaskManager
+
+        root_dir = ctx.app.root_dir
+        tasks = list(TaskManager(ctx.app).getTasks())
+        logger.info("Task queue contains %d tasks" % len(tasks))
+        for path, task_type, task_data in tasks:
+            logger.info(" - [%s] %s" %
+                        (task_type, os.path.relpath(path, root_dir)))
+
+    def _runTasks(self, ctx):
+        """ Runs the queue, or only the task named by `-t` (ID or path). """
+        from piecrust.tasks.base import TaskManager
+
+        only_task = ctx.args.task
+        if only_task and os.path.isfile(only_task):
+            only_task, _ = os.path.splitext(os.path.basename(only_task))
+
+        # Honor `--keep-queue`: finished tasks are deleted unless asked not to.
+        TaskManager(ctx.app).runQueue(
+            only_task=only_task, clear_queue=(not ctx.args.keep_queue))
+
--- a/piecrust/pathutil.py	Wed Feb 21 21:21:42 2018 -0800
+++ b/piecrust/pathutil.py	Thu Feb 22 22:12:45 2018 -0800
@@ -56,3 +56,9 @@
             res.append(n)
     return res
 
+
+def ensure_dir(path, mode=0o755):  # Creates `path` and its parents if missing.
+    try:
+        os.makedirs(path, mode=mode, exist_ok=True)
+    except OSError:
+        pass  # Any OS error is silently ignored.
--- a/piecrust/plugins/base.py	Wed Feb 21 21:21:42 2018 -0800
+++ b/piecrust/plugins/base.py	Thu Feb 22 22:12:45 2018 -0800
@@ -44,6 +44,9 @@
     def getPublishers(self):
         return []
 
+    def getTaskRunners(self):
+        return []  # Default: no task runners are provided.
+
     def initialize(self, app):
         pass
 
@@ -105,6 +108,9 @@
     def getPublishers(self):
         return self._getPluginComponents('getPublishers')
 
+    def getTaskRunners(self):  # Same lookup pattern as `getPublishers`.
+        return self._getPluginComponents('getTaskRunners')
+
     def _ensureLoaded(self):
         if self._plugins is not None:
             return
--- a/piecrust/plugins/builtin.py	Wed Feb 21 21:21:42 2018 -0800
+++ b/piecrust/plugins/builtin.py	Thu Feb 22 22:12:45 2018 -0800
@@ -17,6 +17,7 @@
         from piecrust.commands.builtin.publishing import PublishCommand
         from piecrust.commands.builtin.scaffolding import PrepareCommand
         from piecrust.commands.builtin.serving import ServeCommand
+        from piecrust.commands.builtin.tasks import TasksCommand
         from piecrust.commands.builtin.themes import ThemesCommand
         from piecrust.commands.builtin.util import (
             InitCommand, PurgeCommand, ImportCommand)
@@ -39,7 +40,9 @@
             ShowRecordCommand(),
             ServeCommand(),
             AdministrationPanelCommand(),
-            PublishCommand()]
+            PublishCommand(),
+            TasksCommand()
+        ]
 
     def getCommandExtensions(self):
         from piecrust.commands.builtin.scaffolding import (
@@ -165,3 +168,8 @@
             SftpPublisher,
             RsyncPublisher]
 
+    def getTaskRunners(self):
+        """ Returns the built-in task runner classes (not instances). """
+        from piecrust.tasks.mentions import MentionTaskRunner
+
+        return [MentionTaskRunner]
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/piecrust/tasks/base.py	Thu Feb 22 22:12:45 2018 -0800
@@ -0,0 +1,109 @@
+import os
+import os.path
+import json
+import time
+import logging
+from piecrust.chefutil import format_timed
+
+
+TASKS_DIR = '_tasks'
+
+
+logger = logging.getLogger(__name__)
+
+
+class TaskContext:  # Passed to `TaskRunner.runTask`; holds no state yet.
+    def __init__(self):
+        pass
+
+
+class TaskRunner:  # Base class for the runners that execute queued tasks.
+    TASK_TYPE = 'undefined'  # Subclasses set the task type they handle.
+
+    def __init__(self, app):
+        self.app = app
+
+    def runTask(self, task_data, ctx):  # Subclasses must implement this.
+        raise NotImplementedError()
+
+
+class TaskManager:
+    """ Queue of tasks kept as `<id>.json` files in the `_tasks` folder. """
+    def __init__(self, app, *, time_threshold=1):
+        self.app = app
+        self.time_threshold = time_threshold
+        self._runners = None
+
+    @property
+    def tasks_dir(self):
+        return os.path.join(self.app.root_dir, TASKS_DIR)
+
+    def createTask(self, task_type, task_data):
+        """ Writes a new task file and returns its ID (a Unix timestamp). """
+        from piecrust.pathutil import ensure_dir
+
+        tasks_dir = self.tasks_dir
+        ensure_dir(tasks_dir)
+        new_task = {'type': task_type, 'data': task_data}
+        task_id = str(int(time.time()))
+        task_path = os.path.join(tasks_dir, '%s.json' % task_id)
+        with open(task_path, 'w', encoding='utf8') as fp:
+            json.dump(new_task, fp)
+        return task_id
+
+    def getTasks(self, *, only_task=None):
+        """ Yields `(path, type, data)` per queued task (only `only_task` if
+            given), skipping files newer than `time_threshold` seconds. """
+        max_time = time.time() - self.time_threshold
+        tasks_dir = self.tasks_dir
+        try:
+            task_files = os.listdir(tasks_dir)
+        except (IOError, OSError):
+            task_files = []
+
+        for tf in task_files:
+            tfname, _ = os.path.splitext(tf)
+            if only_task and tfname != only_task:
+                continue
+
+            tf_path = os.path.join(tasks_dir, tf)
+            if os.path.getmtime(tf_path) >= max_time:
+                logger.debug("Skipping task '%s' because it's too new." % tf)
+                continue
+
+            with open(tf_path, 'r', encoding='utf8') as fp:
+                task_data = json.load(fp)
+
+            # Must match the `type` key written by `createTask`.
+            yield (tf_path, task_data.get('type'), task_data.get('data'))
+
+    def runQueue(self, *, only_task=None, clear_queue=True):
+        """ Runs queued tasks; removes each file after if `clear_queue`. """
+        start_time = time.perf_counter()
+
+        tasks = list(self.getTasks(only_task=only_task))
+        for path, task_type, task_data in tasks:
+            if not task_type:
+                logger.error("Got task with no type: %s" % path)
+                continue
+
+            runner = self._getRunner(task_type)
+            if runner is None:
+                logger.error("No task runner for type: %s" % task_type)
+                continue
+
+            runner.runTask(task_data, TaskContext())
+            if clear_queue:
+                os.remove(path)
+
+        logger.info(format_timed(
+            start_time, "Ran %d tasks." % len(tasks)))
+
+    def _getRunner(self, task_type):
+        # Runners are created once, on first lookup, keyed by `TASK_TYPE`.
+        if self._runners is None:
+            self._runners = {}
+            for r in self.app.plugin_loader.getTaskRunners():
+                self._runners[r.TASK_TYPE] = r(self.app)
+
+        return self._runners.get(task_type)
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/piecrust/tasks/mentions.py	Thu Feb 22 22:12:45 2018 -0800
@@ -0,0 +1,166 @@
+import os
+import os.path
+import json
+import logging
+from piecrust.tasks.base import TaskRunner
+
+
+logger = logging.getLogger(__name__)
+
+
+class InvalidMentionTargetError(Exception):
+    """ The mention's target URL doesn't map to a page of this website. """
+
+
+class SourceDoesntLinkToTargetError(Exception):
+    """ The mention's source page has no link to the target URL. """
+
+
+class DuplicateMentionError(Exception):
+    """ A mention from the same source is already recorded for the page. """
+
+
+class MentionTaskRunner(TaskRunner):
+    """ Validates a received webmention and records it next to its page. """
+    TASK_TYPE = 'mention'
+
+    def runTask(self, data, ctx):
+        """ Handles one mention; `data` holds the `source` and `target` URLs.
+            Raises one of this module's errors if the mention is rejected.
+        """
+        import requests
+        from bs4 import BeautifulSoup
+        from piecrust.app import PieCrustFactory
+        from piecrust.serving.util import get_requested_page
+
+        src_url = data['source']
+        tgt_url = data['target']
+
+        # Find if we have a page at the target URL.  To do that we need to spin
+        # up a PieCrust app that knows how the website works. Because the
+        # website might have been baked with custom settings (usually the site
+        # root URL) there's a good chance we need to apply some variants, which
+        # the user can specify in the config.
+        pcappfac = PieCrustFactory(self.app.root_dir, cache_key='webmention')
+        wmcfg = self.app.config.get('webmention')
+        if wmcfg.get('config_variant'):
+            pcappfac.config_variants = [wmcfg.get('config_variant')]
+        if wmcfg.get('config_variants'):
+            pcappfac.config_variants = list(wmcfg.get('config_variants'))
+        if wmcfg.get('config_values'):
+            pcappfac.config_values = list(wmcfg.get('config_values').items())
+        pcapp = pcappfac.create()
+        logger.debug("Locating page: %s" % tgt_url)
+        try:
+            req_page = get_requested_page(pcapp, tgt_url)
+            if req_page.page is None:
+                raise InvalidMentionTargetError()
+        except Exception as ex:
+            logger.error("Can't check webmention target page: %s" % tgt_url)
+            logger.exception(ex)
+            raise InvalidMentionTargetError() from ex
+
+        # Grab the source URL's contents (with a timeout, so a stalled host
+        # can't hang the queue) and see if anything links to our target URL.
+        logger.debug("Fetching mention source: %s" % src_url)
+        src_t = requests.get(src_url, timeout=30)
+        src_html = BeautifulSoup(src_t.text, 'html.parser')
+        for link in src_html.find_all('a'):
+            href = link.get('href')
+            if href == tgt_url:
+                break
+        else:
+            logger.error("Source '%s' doesn't link to target: %s" %
+                         (src_url, tgt_url))
+            raise SourceDoesntLinkToTargetError()
+
+        # Load the previous mentions and find any pre-existing mention from the
+        # source URL.
+        mention_path, mention_data = _load_page_mentions(req_page.page)
+        for m in mention_data['mentions']:
+            if m['source'] == src_url:
+                logger.error("Duplicate mention found from: %s" % src_url)
+                raise DuplicateMentionError()
+
+        new_mention = {'source': src_url}
+
+        # Parse the microformats on the page, see if there's anything
+        # interesting we can use.
+        mf2_info = _get_mention_info_from_mf2(src_url, src_html)
+        if mf2_info:
+            new_mention.update(mf2_info)
+
+        mention_data['mentions'].append(new_mention)
+
+        with open(mention_path, 'w', encoding='utf-8') as fp:
+            json.dump(mention_data, fp)
+        logger.info("Received webmention from: %s" % src_url)
+
+
+def _get_mention_info_from_mf2(base_url, bs_html):
+    """ Returns a dict of info from the page's first `h-entry` (name, url,
+        published, content, author_*), or `None` if there is none. """
+    import mf2py
+    from urllib.parse import urljoin
+
+    mf2_items = mf2py.parse(bs_html).get('items')
+    if not mf2_items:
+        return None
+
+    hentry = next(
+        (i for i in mf2_items if 'h-entry' in i['type']), None)
+    if not hentry:
+        return None
+
+    info = {}
+    hentry_props = hentry['properties']
+
+    pnames = hentry_props.get('name')
+    if pnames:
+        info['name'] = pnames[0]
+
+    urls = hentry_props.get('url')
+    if urls:
+        info['url'] = urljoin(base_url, urls[0])
+
+    pubdates = hentry_props.get('published')
+    if pubdates:
+        info['published'] = pubdates[0]
+
+    # `e-content` parses to a dict with `html`, but `p-content` is a string.
+    contents = hentry_props.get('content')
+    if contents and isinstance(contents[0], dict):
+        info['content'] = contents[0]['html']
+
+    # Authors can be plain strings too; only embedded `h-card`s are dicts.
+    authors = hentry_props.get('author') or []
+    hcard = next((a for a in authors
+                  if isinstance(a, dict) and 'h-card' in a['type']), None)
+    if hcard:
+        hcard_props = hcard['properties']
+        hcard_names = hcard_props.get('name')
+        if hcard_names:
+            info['author_name'] = hcard_names[0]
+        hcard_photos = hcard_props.get('photo')
+        if hcard_photos:
+            info['author_photo'] = urljoin(base_url, hcard_photos[0])
+        hcard_urls = hcard_props.get('url')
+        if hcard_urls:
+            info['author_url'] = urljoin(base_url, hcard_urls[0])
+
+    return info
+
+
+def _load_page_mentions(page):
+    """ Loads the page's `mentions.json` asset; returns `(path, data)`. """
+    from piecrust.pathutil import ensure_dir
+
+    logger.debug("Loading page mentions for: %s" % page.content_spec)
+    assets_dir = os.path.splitext(page.content_spec)[0] + '-assets'
+    ensure_dir(assets_dir)
+    mention_path = os.path.join(assets_dir, 'mentions.json')
+    try:
+        with open(mention_path, 'r', encoding='utf-8') as fp:
+            return mention_path, json.load(fp)
+    except IOError:
+        return mention_path, {'mentions': []}
--- a/setup.py	Wed Feb 21 21:21:42 2018 -0800
+++ b/setup.py	Thu Feb 22 22:12:45 2018 -0800
@@ -154,6 +154,7 @@
 install_requires = [
     'colorama>=0.3.3',
     'compressinja>=0.0.2',
+    'beautifulsoup4>=4.6.0',
     'Flask>=0.10.1',
     'Flask-IndieAuth>=0.0.3.2',
     'Flask-Login>=0.3.2',
@@ -161,6 +162,7 @@
     'Jinja2>=2.10',
     'Markdown>=2.6.2',
     'MarkupSafe>=1.0',
+    'mf2py>=1.0.5',
     'paramiko>=2.0.0',
     'Pillow>=4.3.0',
     'Pygments>=2.0.2',
@@ -168,6 +170,7 @@
     'python-dateutil>=2.4.2',
     'PyYAML>=3.11',
     'repoze.lru>=0.6',
+    'requests>=2.18.0',
     'smartypants>=1.8.6',
     'strict-rfc3339>=0.5',
     'textile>=2.2.2',