Mercurial > piecrust2
comparison piecrust/tasks/mentions.py @ 1114:8af2ea1f5c34
tasks: Add new `tasks` command and infrastructure, with `mention` task.
* The new command lets `chef` run tasks from a queue.
* The webmention endpoint now adds a mention task.
* Moved mention handling code to a task runner.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Thu, 22 Feb 2018 22:12:45 -0800 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
1113:29c51b981c17 | 1114:8af2ea1f5c34 |
---|---|
1 import os | |
2 import os.path | |
3 import json | |
4 import logging | |
5 from piecrust.tasks.base import TaskRunner | |
6 | |
7 | |
8 logger = logging.getLogger(__name__) | |
9 | |
10 | |
class InvalidMentionTargetError(Exception):
    """Raised when a webmention's target URL doesn't resolve to a page
    on this website."""
13 | |
14 | |
class SourceDoesntLinkToTargetError(Exception):
    """Raised when the webmention's source page doesn't actually contain
    a link to the target URL."""
17 | |
18 | |
class DuplicateMentionError(Exception):
    """Raised when a mention from the same source URL has already been
    recorded for the target page."""
21 | |
22 | |
class MentionTaskRunner(TaskRunner):
    """Task runner that processes one incoming webmention.

    Validates that the target URL maps to a page on this website and that
    the source page really links to the target, then appends the new
    mention (augmented with any microformats2 metadata found on the
    source page) to the target page's `mentions.json` asset file.
    """
    TASK_TYPE = 'mention'

    def runTask(self, data, ctx):
        """Process the mention described by `data` ({'source', 'target'}).

        Raises `InvalidMentionTargetError`, `SourceDoesntLinkToTargetError`
        or `DuplicateMentionError` on validation failure.
        """
        import requests
        from bs4 import BeautifulSoup
        from piecrust.app import PieCrustFactory
        from piecrust.serving.util import get_requested_page

        src_url = data['source']
        tgt_url = data['target']

        # Find if we have a page at the target URL. To do that we need to
        # spin up a PieCrust app that knows how the website works. Because
        # the website might have been baked with custom settings (usually
        # the site root URL) there's a good chance we need to apply some
        # variants, which the user can specify in the config.
        pcappfac = PieCrustFactory(self.app.root_dir,
                                   cache_key='webmention')
        # Tolerate a missing `webmention` config section.
        wmcfg = self.app.config.get('webmention') or {}
        if wmcfg.get('config_variant'):
            pcappfac.config_variants = [wmcfg.get('config_variant')]
        if wmcfg.get('config_variants'):
            pcappfac.config_variants = list(wmcfg.get('config_variants'))
        if wmcfg.get('config_values'):
            pcappfac.config_values = list(wmcfg.get('config_values').items())
        pcapp = pcappfac.create()

        logger.debug("Locating page: %s" % tgt_url)
        try:
            req_page = get_requested_page(pcapp, tgt_url)
        except Exception as ex:
            # Routing itself blew up -- log the underlying problem and
            # chain it into the domain error for easier debugging.
            logger.error("Can't check webmention target page: %s" % tgt_url)
            logger.exception(ex)
            raise InvalidMentionTargetError() from ex
        if req_page.page is None:
            # The URL resolved cleanly but no page lives there; don't log
            # a scary stack trace for this expected validation failure.
            logger.error("Webmention target is not a page: %s" % tgt_url)
            raise InvalidMentionTargetError()

        # Grab the source URL's contents and see if anything references the
        # target (ours) URL. The source is untrusted and possibly slow, so
        # never wait on it forever.
        logger.debug("Fetching mention source: %s" % src_url)
        src_t = requests.get(src_url, timeout=30)
        src_html = BeautifulSoup(src_t.text, 'html.parser')
        for link in src_html.find_all('a'):
            href = link.get('href')
            if href == tgt_url:
                break
        else:
            logger.error("Source '%s' doesn't link to target: %s" %
                         (src_url, tgt_url))
            raise SourceDoesntLinkToTargetError()

        # Load the previous mentions and find any pre-existing mention from
        # the source URL.
        mention_path, mention_data = _load_page_mentions(req_page.page)
        for m in mention_data['mentions']:
            if m['source'] == src_url:
                logger.error("Duplicate mention found from: %s" % src_url)
                raise DuplicateMentionError()

        # Make the new mention.
        new_mention = {'source': src_url}

        # Parse the microformats on the page, see if there's anything
        # interesting we can use.
        mf2_info = _get_mention_info_from_mf2(src_url, src_html)
        if mf2_info:
            new_mention.update(mf2_info)

        # Add the new mention and persist it next to the page's assets.
        # `json` comes from the module-level import.
        mention_data['mentions'].append(new_mention)
        with open(mention_path, 'w', encoding='utf-8') as fp:
            json.dump(mention_data, fp)
        logger.info("Received webmention from: %s" % src_url)
98 | |
99 | |
def _get_mention_info_from_mf2(base_url, bs_html):
    """Extract useful microformats2 metadata from a parsed HTML document.

    Looks for the first `h-entry` item and returns a dict with any of
    `name`, `url`, `published`, `content`, `author_name`, `author_photo`
    and `author_url` that could be found (URLs resolved against
    `base_url`), or None when no `h-entry` exists.
    """
    import mf2py
    from urllib.parse import urljoin

    parsed = mf2py.parse(bs_html)
    items = parsed.get('items')
    if not items:
        return None

    # Find the first h-entry item, if any.
    hentry = None
    for item in items:
        if 'h-entry' in item['type']:
            hentry = item
            break
    if hentry is None:
        return None

    props = hentry['properties']
    info = {}

    entry_names = props.get('name')
    if entry_names:
        info['name'] = entry_names[0]

    entry_urls = props.get('url')
    if entry_urls:
        info['url'] = urljoin(base_url, entry_urls[0])

    entry_pubdates = props.get('published')
    if entry_pubdates:
        info['published'] = entry_pubdates[0]

    entry_contents = props.get('content')
    if entry_contents:
        info['content'] = entry_contents[0]['html']

    entry_authors = props.get('author')
    if entry_authors:
        # The author may be represented as an embedded h-card.
        hcard = None
        for candidate in entry_authors:
            if 'h-card' in candidate['type']:
                hcard = candidate
                break
        if hcard is not None:
            card_props = hcard['properties']
            card_names = card_props.get('name')
            if card_names:
                info['author_name'] = card_names[0]
            card_photos = card_props.get('photo')
            if card_photos:
                info['author_photo'] = urljoin(base_url, card_photos[0])
            card_urls = card_props.get('url')
            if card_urls:
                info['author_url'] = urljoin(base_url, card_urls[0])

    return info
152 | |
153 | |
def _load_page_mentions(page):
    """Return `(path, data)` for a page's mentions file.

    The file lives in the page's `-assets` directory as `mentions.json`.
    When it can't be read, an empty `{'mentions': []}` structure is
    returned instead so callers always get a valid mention list.
    """
    from piecrust.pathutil import ensure_dir

    logger.debug("Loading page mentions for: %s" % page.content_spec)
    assets_dir = os.path.splitext(page.content_spec)[0] + '-assets'
    ensure_dir(assets_dir)
    mention_path = os.path.join(assets_dir, 'mentions.json')
    try:
        with open(mention_path, 'r', encoding='utf-8') as fp:
            return mention_path, json.load(fp)
    except IOError:
        # No mentions recorded yet (or the file is unreadable).
        return mention_path, {'mentions': []}