Mercurial > piecrust2
changeset 1114:8af2ea1f5c34
tasks: Add new `tasks` command and infrastructure, with `mention` task.
* The new command lets `chef` run tasks from a queue.
* The webmention endpoint now adds a mention task.
* Moved mention handling code to a task runner.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Thu, 22 Feb 2018 22:12:45 -0800 |
parents | 29c51b981c17 |
children | 11b9d0c8bd62 |
files | piecrust/admin/siteinfo.py piecrust/admin/views/mentions.py piecrust/commands/builtin/tasks.py piecrust/pathutil.py piecrust/plugins/base.py piecrust/plugins/builtin.py piecrust/tasks/base.py piecrust/tasks/mentions.py setup.py |
diffstat | 9 files changed, 385 insertions(+), 110 deletions(-) [+] |
line wrap: on
line diff
--- a/piecrust/admin/siteinfo.py Wed Feb 21 21:21:42 2018 -0800 +++ b/piecrust/admin/siteinfo.py Thu Feb 22 22:12:45 2018 -0800 @@ -128,6 +128,13 @@ except subprocess.TimeoutExpired: flash("Publish process is still running... check the log later.") + def runTask(self, task_id): + args = [ + '--no-color', + 'tasks', 'run', + '-t', task_id] + self._runChef(args) + def _runChef(self, args): chef_path = os.path.realpath(os.path.join( os.path.dirname(__file__),
--- a/piecrust/admin/views/mentions.py Wed Feb 21 21:21:42 2018 -0800 +++ b/piecrust/admin/views/mentions.py Thu Feb 22 22:12:45 2018 -0800 @@ -1,14 +1,7 @@ -import os -import os.path -import json -import time import logging -import requests -from bs4 import BeautifulSoup -from flask import current_app, g, request, make_response, abort +from flask import g, request, make_response, abort from ..blueprint import foodtruck_bp -from piecrust.app import PieCrustFactory -from piecrust.serving.util import get_requested_page +from piecrust.tasks.base import TaskManager logger = logging.getLogger(__name__) @@ -26,105 +19,18 @@ logger.error("Source and target are the same.") abort(400) - # See if we need to do this synchronously or asynchronously, and other - # things we should know up-front. - wmcfg = g.site.piecrust_app.config.get('webmention') - if wmcfg.get('use_task_queue') is True: - tasks_dir = os.path.join(g.site.piecrust_app.root_dir, '_tasks') - _ensure_dir(tasks_dir) - task_data = { - 'type': 'webmention', - 'data': {'source': src_url, 'target': tgt_url}} - task_path = os.path.join(tasks_dir, '%s.json' % int(time.time())) - with open(task_path, 'w', encoding='utf8') as fp: - json.dump(task_data, fp) - return make_response("Webmention queued.", 202, []) - - # Find if we have a page at the target URL. - # To do that we need to spin up a PieCrust app that knows how the website - # works. Because the website might have been baked with custom settings - # (usually the site root URL) there's a good chance we need to apply - # some variants, which the user can specify in the config. 
- pcappfac = PieCrustFactory( - current_app.config['FOODTRUCK_ROOT_DIR'], - cache_key='webmention') - if wmcfg.get('config_variant'): - pcappfac.config_variants = [wmcfg.get('config_variant')] - if wmcfg.get('config_variants'): - pcappfac.config_variants = list(wmcfg.get('config_variants')) - if wmcfg.get('config_values'): - pcappfac.config_values = list(wmcfg.get('config_values').items()) - pcapp = pcappfac.create() - try: - req_page = get_requested_page(pcapp, tgt_url) - if req_page.page is None: - abort(404) - except Exception as ex: - logger.error("Can't check webmention target page: %s" % tgt_url) - logger.exception(ex) - abort(404) - - # Grab the source URL's contents and see if anything references the - # target (ours) URL. - src_t = requests.get(src_url) - src_html = BeautifulSoup(src_t.text, 'html.parser') - for link in src_html.find_all('a'): - href = link.get('href') - if href == tgt_url: - break - else: - logger.error("Source '%s' doesn't link to target: %s" % - (src_url, tgt_url)) - abort(400) + # Create the task for handling this mention. + pcapp = g.site.piecrust_app + task_manager = TaskManager(pcapp) + task_id = task_manager.createTask('mention', { + 'source': src_url, + 'target': tgt_url}) - # Find something to quote for this webmention. We find an `h-entry` - # to get a title, excerpt, and/or text. 
- blurb = None - hentry = src_html.find(class_='h-entry') - if hentry: - try: - pname = hentry.find(class_='p-name') - pauthor = hentry.find(class_='p-author') - blurb = { - 'pname': _bs4_contents_str(pname), - 'pauthor': _bs4_contents_str(pauthor)} - except: # NOQA - logger.error("Couldn't get h-entry info.") - - dirname, _ = os.path.splitext(req_page.page.content_spec) - dirname += '-assets' - _ensure_dir(dirname) - mention_path = os.path.join(dirname, 'mentions.json') - try: - with open(mention_path, 'r', encoding='utf-8') as fp: - mention = json.load(fp) - except IOError: - mention = {'mentions': []} + # Either run the task now in a background process (for cheap and simple + # setups), or leave the task there to be picked up later when someone + # runs the task queue eventually. + wmcfg = pcapp.config.get('webmention') + if not wmcfg.get('use_task_queue'): + g.site.runTask(task_id) - for m in mention['mentions']: - if m['source'] == src_url: - return - - new_mention = {'source': src_url} - if blurb: - new_mention.update(blurb) - - mention['mentions'].append(new_mention) - - with open(mention_path, 'w', encoding='utf-8') as fp: - json.dump(mention, fp) - logger.info("Received webmention from: %s" % src_url) - - return make_response(("Webmention received.", 202, [])) - - -def _bs4_contents_str(node): - return ''.join([str(c).strip() for c in node.contents]) - - -def _ensure_dir(path, mode=0o775): - try: - os.makedirs(path, mode=mode, exist_ok=True) - except OSError: - pass - + return make_response("Webmention queued.", 202, [])
# piecrust/commands/builtin/tasks.py
import os.path
import logging
from piecrust.commands.base import ChefCommand


logger = logging.getLogger(__name__)


class TasksCommand(ChefCommand):
    """ Command for managing and running task queues.

        It registers two sub-commands: `list`, which logs the tasks found
        in the queue, and `run`, which runs them through the `TaskManager`.
    """
    def __init__(self):
        super().__init__()
        self.name = 'tasks'
        self.description = "Manages and runs various tasks."

    def setupParser(self, parser, app):
        """ Adds the `list` and `run` sub-parsers to `parser`.

            Each sub-parser stores its handler under `sub_func`, which
            `run` looks up on the parsed arguments.
        """
        subparsers = parser.add_subparsers()

        p = subparsers.add_parser(
            'list',
            help="Show the list of tasks current in the queue.")
        p.set_defaults(sub_func=self._listTasks)

        p = subparsers.add_parser(
            'run',
            help="Runs the current task queue.")
        p.add_argument(
            '-k', '--keep-queue',
            action='store_true',
            help="Don't delete the task queue files.")
        p.add_argument(
            '-t', '--task',
            help="Specify which task to run.")
        p.set_defaults(sub_func=self._runTasks)

    def run(self, ctx):
        """ Calls the handler of the chosen sub-command, if any.

            When no sub-command was given there is no `sub_func` on the
            parsed arguments and nothing happens.
        """
        if hasattr(ctx.args, 'sub_func'):
            ctx.args.sub_func(ctx)

    def _listTasks(self, ctx):
        """ Logs the number of queued tasks, then one line per task with
            its type and its path relative to the website root.
        """
        from piecrust.tasks.base import TaskManager

        root_dir = ctx.app.root_dir
        tm = TaskManager(ctx.app)
        tasks = list(tm.getTasks())
        logger.info("Task queue contains %d tasks", len(tasks))
        for path, task_type, _ in tasks:
            logger.info(" - [%s] %s",
                        task_type, os.path.relpath(path, root_dir))

    def _runTasks(self, ctx):
        """ Runs the task queue, or only the task given with `--task`.

            `--task` can be either a task ID or the path to an existing
            task file, in which case the ID is taken from the file name.
            Task files are removed once run, unless `--keep-queue` was
            passed.
        """
        from piecrust.tasks.base import TaskManager

        only_task = ctx.args.task
        if only_task and os.path.isfile(only_task):
            # A path was given: the task ID is the file name without its
            # extension.
            only_task, _ = os.path.splitext(os.path.basename(only_task))

        tm = TaskManager(ctx.app)
        tm.runQueue(
            only_task=only_task,
            clear_queue=(not ctx.args.keep_queue))
--- a/piecrust/pathutil.py Wed Feb 21 21:21:42 2018 -0800 +++ b/piecrust/pathutil.py Thu Feb 22 22:12:45 2018 -0800 @@ -56,3 +56,9 @@ res.append(n) return res + +def ensure_dir(path, mode=0o755): + try: + os.makedirs(path, mode=mode, exist_ok=True) + except OSError: + pass
--- a/piecrust/plugins/base.py Wed Feb 21 21:21:42 2018 -0800 +++ b/piecrust/plugins/base.py Thu Feb 22 22:12:45 2018 -0800 @@ -44,6 +44,9 @@ def getPublishers(self): return [] + def getTaskRunners(self): + return [] + def initialize(self, app): pass @@ -105,6 +108,9 @@ def getPublishers(self): return self._getPluginComponents('getPublishers') + def getTaskRunners(self): + return self._getPluginComponents('getTaskRunners') + def _ensureLoaded(self): if self._plugins is not None: return
--- a/piecrust/plugins/builtin.py Wed Feb 21 21:21:42 2018 -0800 +++ b/piecrust/plugins/builtin.py Thu Feb 22 22:12:45 2018 -0800 @@ -17,6 +17,7 @@ from piecrust.commands.builtin.publishing import PublishCommand from piecrust.commands.builtin.scaffolding import PrepareCommand from piecrust.commands.builtin.serving import ServeCommand + from piecrust.commands.builtin.tasks import TasksCommand from piecrust.commands.builtin.themes import ThemesCommand from piecrust.commands.builtin.util import ( InitCommand, PurgeCommand, ImportCommand) @@ -39,7 +40,9 @@ ShowRecordCommand(), ServeCommand(), AdministrationPanelCommand(), - PublishCommand()] + PublishCommand(), + TasksCommand() + ] def getCommandExtensions(self): from piecrust.commands.builtin.scaffolding import ( @@ -165,3 +168,8 @@ SftpPublisher, RsyncPublisher] + def getTaskRunners(self): + from piecrust.tasks.mentions import MentionTaskRunner + + return [ + MentionTaskRunner]
# piecrust/tasks/base.py
import os
import os.path
import json
import time
import logging


TASKS_DIR = '_tasks'


logger = logging.getLogger(__name__)


class TaskContext:
    """ Context object passed to a task runner along with the task data.
        It currently carries no information.
    """
    def __init__(self):
        pass


class TaskRunner:
    """ Base class for task runners.

        Sub-classes set `TASK_TYPE` to the type of task they handle and
        override `runTask`.
    """
    TASK_TYPE = 'undefined'

    def __init__(self, app):
        self.app = app

    def runTask(self, task_data, ctx):
        """ Runs one task. `task_data` is the task's `data` payload and
            `ctx` is a `TaskContext`.
        """
        raise NotImplementedError()


class TaskManager:
    """ Creates, lists, and runs the tasks stored as JSON files in the
        `_tasks` directory of a website.
    """
    def __init__(self, app, *, time_threshold=1):
        """ `app` must have a `root_dir` and, for running tasks, a
            `plugin_loader`. Task files modified less than
            `time_threshold` seconds ago are skipped by `getTasks`.
        """
        self.app = app
        self.time_threshold = time_threshold
        self._runners = None

    @property
    def tasks_dir(self):
        """ The directory holding the task files. """
        return os.path.join(self.app.root_dir, TASKS_DIR)

    def createTask(self, task_type, task_data):
        """ Writes a new task file and returns the task's ID.

            The ID is the name of the task file without its `.json`
            extension. It is based on the current time in seconds, with a
            numeric suffix added when a task with that ID already exists.
        """
        from piecrust.pathutil import ensure_dir

        tasks_dir = self.tasks_dir
        ensure_dir(tasks_dir)
        new_task = {
            'type': task_type,
            'data': task_data}

        base_id = str(int(time.time()))
        task_id = base_id
        suffix = 0
        while True:
            task_path = os.path.join(tasks_dir, '%s.json' % task_id)
            try:
                # Exclusive creation: never overwrite a task that was
                # queued during the same second.
                fp = open(task_path, 'x', encoding='utf8')
            except FileExistsError:
                suffix += 1
                task_id = '%s-%d' % (base_id, suffix)
                continue
            with fp:
                json.dump(new_task, fp)
            return task_id

    def getTasks(self, *, only_task=None):
        """ Yields a `(path, type, data)` tuple for each queued task.

            If `only_task` is given, only the task file whose name
            (without extension) equals it is considered. Files modified
            less than `time_threshold` seconds ago are skipped. A missing
            or unreadable tasks directory yields nothing.
        """
        # NOTE(review): a task that is run right after being created may
        # still be inside the `time_threshold` window and be skipped here,
        # even when requested with `only_task` -- confirm this is intended.
        max_time = time.time() - self.time_threshold
        tasks_dir = self.tasks_dir
        try:
            task_files = os.listdir(tasks_dir)
        except (IOError, OSError):
            task_files = []

        for tf in task_files:
            tfname, _ = os.path.splitext(tf)
            if only_task and tfname != only_task:
                continue

            tf_path = os.path.join(tasks_dir, tf)
            task_time = os.path.getmtime(tf_path)
            if task_time >= max_time:
                logger.debug("Skipping task '%s' because it's too new.", tf)
                continue

            with open(tf_path, 'r', encoding='utf8') as fp:
                task_data = json.load(fp)

            # Must match the keys written by `createTask`.
            task_type = task_data.get('type')
            task_payload = task_data.get('data')
            yield (tf_path, task_type, task_payload)

    def runQueue(self, *, only_task=None, clear_queue=True):
        """ Runs the queued tasks, or only `only_task` if specified.

            Tasks without a type, or without a matching runner, are logged
            as errors and left in the queue. If `clear_queue` is true, the
            file of each task that ran is deleted. An exception raised by
            a runner is not caught here: it stops the queue and leaves
            that task's file in place.
        """
        from piecrust.chefutil import format_timed

        start_time = time.perf_counter()

        tasks = list(self.getTasks(only_task=only_task))
        for path, task_type, task_data in tasks:
            if not task_type:
                logger.error("Got task with no type: %s", path)
                continue

            runner = self._getRunner(task_type)
            if runner is None:
                logger.error("No task runner for type: %s", task_type)
                continue

            ctx = TaskContext()
            runner.runTask(task_data, ctx)

            if clear_queue:
                os.remove(path)

        logger.info(format_timed(
            start_time, "Ran %d tasks." % len(tasks)))

    def _getRunner(self, task_type):
        """ Returns the runner for `task_type`, or `None`.

            Runners are instantiated once, on first use, from the classes
            returned by the application's plugin loader.
        """
        if self._runners is None:
            self._runners = {}
            for r in self.app.plugin_loader.getTaskRunners():
                self._runners[r.TASK_TYPE] = r(self.app)

        return self._runners.get(task_type)
# piecrust/tasks/mentions.py
import os
import os.path
import json
import logging
from piecrust.tasks.base import TaskRunner


logger = logging.getLogger(__name__)


# Maximum time, in seconds, to wait for the mention's source page. Without
# a timeout, a source that never answers would block the task forever.
_SOURCE_FETCH_TIMEOUT = 30


class InvalidMentionTargetError(Exception):
    """ The mention's target URL doesn't resolve to a page of the site. """
    pass


class SourceDoesntLinkToTargetError(Exception):
    """ The mention's source page has no link to the target URL. """
    pass


class DuplicateMentionError(Exception):
    """ A mention from the same source is already recorded for the page. """
    pass


class MentionTaskRunner(TaskRunner):
    """ Task runner that validates a webmention and records it in the
        `mentions.json` file of the target page's assets directory.
    """
    TASK_TYPE = 'mention'

    def runTask(self, data, ctx):
        """ Handles one mention task.

            `data` must have a `source` and a `target` URL.

            Raises `InvalidMentionTargetError` if the target page can't be
            found, `SourceDoesntLinkToTargetError` if the source doesn't
            link to the target, and `DuplicateMentionError` if the source
            was already recorded. Errors from fetching the source page
            (including timeouts) are not caught here.
        """
        import requests
        from bs4 import BeautifulSoup
        from piecrust.app import PieCrustFactory
        from piecrust.serving.util import get_requested_page

        src_url = data['source']
        tgt_url = data['target']

        # Find if we have a page at the target URL. To do that we need to spin
        # up a PieCrust app that knows how the website works. Because the
        # website might have been baked with custom settings (usually the site
        # root URL) there's a good chance we need to apply some variants, which
        # the user can specify in the config.
        pcappfac = PieCrustFactory(self.app.root_dir,
                                   cache_key='webmention')
        # The `webmention` config section may be missing entirely.
        wmcfg = self.app.config.get('webmention') or {}
        if wmcfg.get('config_variant'):
            pcappfac.config_variants = [wmcfg.get('config_variant')]
        if wmcfg.get('config_variants'):
            pcappfac.config_variants = list(wmcfg.get('config_variants'))
        if wmcfg.get('config_values'):
            pcappfac.config_values = list(wmcfg.get('config_values').items())
        pcapp = pcappfac.create()
        logger.debug("Locating page: %s", tgt_url)
        try:
            req_page = get_requested_page(pcapp, tgt_url)
        except Exception as ex:
            logger.error("Can't check webmention target page: %s", tgt_url)
            logger.exception(ex)
            raise InvalidMentionTargetError() from ex
        if req_page.page is None:
            logger.error("Can't check webmention target page: %s", tgt_url)
            raise InvalidMentionTargetError()

        # Grab the source URL's contents and see if anything references the
        # target (ours) URL.
        logger.debug("Fetching mention source: %s", src_url)
        src_t = requests.get(src_url, timeout=_SOURCE_FETCH_TIMEOUT)
        src_html = BeautifulSoup(src_t.text, 'html.parser')
        for link in src_html.find_all('a'):
            href = link.get('href')
            if href == tgt_url:
                break
        else:
            logger.error("Source '%s' doesn't link to target: %s",
                         src_url, tgt_url)
            raise SourceDoesntLinkToTargetError()

        # Load the previous mentions and find any pre-existing mention from the
        # source URL.
        mention_path, mention_data = _load_page_mentions(req_page.page)
        for m in mention_data['mentions']:
            if m['source'] == src_url:
                logger.error("Duplicate mention found from: %s", src_url)
                raise DuplicateMentionError()

        # Make the new mention.
        new_mention = {'source': src_url}

        # Parse the microformats on the page, see if there's anything
        # interesting we can use.
        mf2_info = _get_mention_info_from_mf2(src_url, src_html)
        if mf2_info:
            new_mention.update(mf2_info)

        # Add the new mention.
        mention_data['mentions'].append(new_mention)

        with open(mention_path, 'w', encoding='utf-8') as fp:
            json.dump(mention_data, fp)
        logger.info("Received webmention from: %s", src_url)


def _get_mention_info_from_mf2(base_url, bs_html):
    """ Extracts information about a mention from the microformats of its
        source page.

        `bs_html` is the parsed source page and `base_url` is used to
        resolve relative URLs. Returns `None` if the page has no `h-entry`
        item. Otherwise returns a dictionary with whichever of `name`,
        `url`, `published`, `content`, `author_name`, `author_photo`, and
        `author_url` could be found.
    """
    import mf2py
    from urllib.parse import urljoin

    mf2 = mf2py.parse(bs_html)
    mf2_items = mf2.get('items')
    if not mf2_items:
        return None

    hentry = next(
        (i for i in mf2_items if 'h-entry' in i['type']),
        None)
    if not hentry:
        return None

    info = {}
    hentry_props = hentry['properties']

    pnames = hentry_props.get('name')
    if pnames:
        info['name'] = pnames[0]

    urls = hentry_props.get('url')
    if urls:
        info['url'] = urljoin(base_url, urls[0])

    pubdates = hentry_props.get('published')
    if pubdates:
        info['published'] = pubdates[0]

    contents = hentry_props.get('content')
    if contents:
        # Only a dictionary value carries an `html` entry; a plain string
        # value has no HTML to store.
        content = contents[0]
        if isinstance(content, dict) and 'html' in content:
            info['content'] = content['html']

    authors = hentry_props.get('author')
    if authors:
        # An author value can be a plain string instead of a nested
        # `h-card` item, so only look at dictionaries.
        hcard = next(
            (a for a in authors
             if isinstance(a, dict) and 'h-card' in a.get('type', ())),
            None)
        if hcard:
            hcard_props = hcard['properties']
            hcard_names = hcard_props.get('name')
            if hcard_names:
                info['author_name'] = hcard_names[0]
            hcard_photos = hcard_props.get('photo')
            if hcard_photos:
                info['author_photo'] = urljoin(base_url, hcard_photos[0])
            hcard_urls = hcard_props.get('url')
            if hcard_urls:
                info['author_url'] = urljoin(base_url, hcard_urls[0])

    return info


def _load_page_mentions(page):
    """ Returns `(path, data)` for the mentions file of the given page.

        The file is `mentions.json` in the page's `-assets` directory,
        which is created if needed. If the file can't be opened, `data`
        is a new dictionary with an empty `mentions` list.
    """
    from piecrust.pathutil import ensure_dir

    logger.debug("Loading page mentions for: %s", page.content_spec)
    dirname, _ = os.path.splitext(page.content_spec)
    dirname += '-assets'
    ensure_dir(dirname)
    mention_path = os.path.join(dirname, 'mentions.json')
    try:
        with open(mention_path, 'r', encoding='utf-8') as fp:
            return mention_path, json.load(fp)
    except IOError:
        return mention_path, {'mentions': []}
--- a/setup.py Wed Feb 21 21:21:42 2018 -0800 +++ b/setup.py Thu Feb 22 22:12:45 2018 -0800 @@ -154,6 +154,7 @@ install_requires = [ 'colorama>=0.3.3', 'compressinja>=0.0.2', + 'beautifulsoup4>=4.6.0', 'Flask>=0.10.1', 'Flask-IndieAuth>=0.0.3.2', 'Flask-Login>=0.3.2', @@ -161,6 +162,7 @@ 'Jinja2>=2.10', 'Markdown>=2.6.2', 'MarkupSafe>=1.0', + 'mf2py>=1.0.5', 'paramiko>=2.0.0', 'Pillow>=4.3.0', 'Pygments>=2.0.2', @@ -168,6 +170,7 @@ 'python-dateutil>=2.4.2', 'PyYAML>=3.11', 'repoze.lru>=0.6', + 'requests>=2.18.0', 'smartypants>=1.8.6', 'strict-rfc3339>=0.5', 'textile>=2.2.2',