Mercurial > piecrust2
comparison piecrust/sources/taxonomy.py @ 852:4850f8c21b6e
core: Start of the big refactor for PieCrust 3.0.
* Everything is a `ContentSource`, including assets directories.
* Most content sources are subclasses of the base file-system source.
* A source is processed by a "pipeline", and there are 2 built-in pipelines,
one for assets and one for pages. The asset pipeline is vaguely functional,
but the page pipeline is completely broken right now.
* Rewrite the baking process as just running appropriate pipelines on each
content item. This should allow for better parallelization.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Wed, 17 May 2017 00:11:48 -0700 |
parents | |
children | 08e02c2a2a1a |
comparison
equal
deleted
inserted
replaced
851:2c7e57d80bba | 852:4850f8c21b6e |
---|---|
1 import re | |
2 import time | |
3 import logging | |
4 import unidecode | |
5 from piecrust.chefutil import format_timed, format_timed_scope | |
6 from piecrust.configuration import ConfigurationError | |
7 from piecrust.data.filters import ( | |
8 PaginationFilter, SettingFilterClause, | |
9 page_value_accessor) | |
10 from piecrust.routing import RouteParameter | |
11 from piecrust.sources.base import ContentSource, GeneratedContentException | |
12 | |
13 | |
14 logger = logging.getLogger(__name__) | |
15 | |
16 | |
# Bit-flags describing how taxonomy terms are slugified; they are combined
# into a mask by `_parse_slugify_mode` and consumed by `_Slugifier.slugify`.
SLUGIFY_ENCODE = 1           # no textual transform (no branch in `_Slugifier.slugify`)
SLUGIFY_TRANSLITERATE = 2    # ASCII-fold the term via `unidecode`
SLUGIFY_LOWERCASE = 4        # lower-case the term
SLUGIFY_DOT_TO_DASH = 8      # strip leading dots, turn inner dot-runs into a dash
SLUGIFY_SPACE_TO_DASH = 16   # turn whitespace runs into a dash


# Pre-compiled patterns used by `_Slugifier.slugify`.
re_first_dot_to_dash = re.compile(r'^\.+')
re_dot_to_dash = re.compile(r'\.+')
re_space_to_dash = re.compile(r'\s+')
27 | |
28 | |
class Taxonomy(object):
    """ Describes one taxonomy (_e.g._ tags or categories) as declared in
        the site configuration under `site/taxonomies/<name>`.
    """
    def __init__(self, name, config):
        self.name = name
        self.config = config
        # Optional singular form of the taxonomy name, e.g. 'tag' for 'tags'.
        self.term_name = config.get('term', name)
        # Whether a page can carry several terms (like tags) or just one.
        self.is_multiple = bool(config.get('multiple', False))
        # Separator used to join multiple terms in a route value.
        self.separator = config.get('separator', '/')
        # Page reference for the listing page, if any.
        self.page_ref = config.get('page')

    @property
    def setting_name(self):
        """ The page-config setting that holds this taxonomy's terms:
            the plural name for multi-valued taxonomies, the singular
            term name otherwise.
        """
        return self.name if self.is_multiple else self.term_name
43 | |
44 | |
class TaxonomySource(ContentSource):
    """ A content source that handles taxonomies, _i.e._ lists of keywords
        that pages are labelled with, and for which we need to generate
        listing pages.
    """
    SOURCE_NAME = 'taxonomy'

    def __init__(self, app, name, config):
        """ Initializes the source from its config block.

            Requires a `taxonomy` entry naming a taxonomy declared under
            `site/taxonomies`; raises `ConfigurationError` otherwise.
        """
        super().__init__(app, name, config)

        tax_name = config.get('taxonomy')
        if tax_name is None:
            raise ConfigurationError(
                "Generator '%s' requires a taxonomy name." % name)
        tax_config = app.config.get('site/taxonomies/' + tax_name)
        if tax_config is None:
            # BUGFIX: the arguments were previously passed as a second
            # constructor argument instead of being interpolated into the
            # message, so the error text was never formatted.
            raise ConfigurationError(
                "Error initializing generator '%s', no such taxonomy: %s" %
                (name, tax_name))
        self.taxonomy = Taxonomy(tax_name, tax_config)

        # The slugify mode can be overridden per-source, falling back to
        # the site-wide setting (and defaulting to 'encode').
        sm = config.get('slugify_mode')
        if not sm:
            sm = app.config.get('site/slugify_mode', 'encode')
        self.slugify_mode = _parse_slugify_mode(sm)
        self.slugifier = _Slugifier(self.taxonomy, self.slugify_mode)

    def getContents(self, group):
        # Taxonomy listing pages are generated from other sources' terms,
        # not read from disk.
        raise GeneratedContentException()

    def getSupportedRouteParameters(self):
        """ Declares the route parameter for this taxonomy: a variadic
            path component for multi-valued taxonomies (so `/tag1/tag2`
            works), a plain string otherwise.
        """
        name = self.taxonomy.term_name
        param_type = (RouteParameter.TYPE_PATH if self.taxonomy.is_multiple
                      else RouteParameter.TYPE_STRING)
        return [RouteParameter(name, param_type,
                               variadic=self.taxonomy.is_multiple)]

    def slugify(self, term):
        """ Slugifies a single term with this source's slugifier. """
        return self.slugifier.slugify(term)

    def slugifyMultiple(self, terms):
        """ Slugifies an iterable of terms, returning a tuple. """
        return self.slugifier.slugifyMultiple(terms)

    def prepareRenderContext(self, ctx):
        """ Sets up the render context for a taxonomy listing page:
            pagination source, term filter, and custom template data.
        """
        # Set the pagination source as the source we're generating for.
        # NOTE(review): `self.source` is never assigned in this class --
        # presumably provided by the base class or meant to be
        # `self._getSource()`; confirm during the 3.0 refactor.
        ctx.pagination_source = self.source

        # Get the taxonomy terms from the route metadata... this can come from
        # the browser's URL (while serving) or from the baking (see `bake`
        # method below). In both cases, we expect to have the *slugified*
        # version of the term, because we're going to set a filter that also
        # slugifies the terms found on each page.
        #
        # This is because:
        # * while serving, we get everything from the request URL, so we only
        #   have the slugified version.
        # * if 2 slightly different terms "collide" into the same slugified
        #   term, we'll get a merge of the 2 on the listing page, which is
        #   what the user expects.
        #
        tax_terms, is_combination = self._getTaxonomyTerms(
            ctx.page.route_metadata)
        self._setTaxonomyFilter(ctx, tax_terms, is_combination)

        # Add some custom data for rendering.
        ctx.custom_data.update({
            self.taxonomy.term_name: tax_terms,
            'is_multiple_%s' % self.taxonomy.term_name: is_combination})
        # Add some "plural" version of the term... so for instance, if this
        # is the "tags" taxonomy, "tag" will have one term most of the time,
        # except when it's a combination. Here, we add "tags" as something that
        # is always a tuple, even when it's not a combination.
        if (self.taxonomy.is_multiple and
                self.taxonomy.name != self.taxonomy.term_name):
            mult_val = tax_terms
            if not is_combination:
                mult_val = (mult_val,)
            ctx.custom_data[self.taxonomy.name] = mult_val

    def _getSource(self):
        # The source whose pages this taxonomy indexes, from our config.
        return self.app.getSource(self.config['source'])

    def _getTaxonomyTerms(self, route_metadata):
        """ Extracts the (slugified) terms for this page from the route
            metadata. Returns `(terms, is_combination)`, where `terms` is
            a tuple when several terms are combined, a single value
            otherwise.
        """
        # Get the individual slugified terms from the route metadata.
        all_values = route_metadata.get(self.taxonomy.term_name)
        if all_values is None:
            raise Exception("'%s' values couldn't be found in route metadata" %
                            self.taxonomy.term_name)

        # If it's a "multiple" taxonomy, we need to potentially split the
        # route value into the individual terms (_e.g._ when listing all pages
        # that have 2 given tags, we need to get each of those 2 tags).
        if self.taxonomy.is_multiple:
            sep = self.taxonomy.separator
            if sep in all_values:
                return tuple(all_values.split(sep)), True
        # Not a "multiple" taxonomy, so there's only the one value.
        return all_values, False

    def _setTaxonomyFilter(self, ctx, term_value, is_combination):
        # Set up the filter that will check the pages' terms.
        flt = PaginationFilter(value_accessor=page_value_accessor)
        flt.addClause(HasTaxonomyTermsFilterClause(
            self.taxonomy, self.slugify_mode, term_value, is_combination))
        ctx.pagination_filter = flt

    def onRouteFunctionUsed(self, route, route_metadata):
        """ Called when a route function uses this taxonomy; slugifies the
            given term(s), records the use, and rewrites the route metadata
            so the URL is generated from slugified values.
        """
        # Get the values, and slugify them appropriately.
        values = route_metadata[self.taxonomy.term_name]
        if self.taxonomy.is_multiple:
            # TODO: here we assume the route has been properly configured.
            slugified_values = self.slugifyMultiple((str(v) for v in values))
            route_val = self.taxonomy.separator.join(slugified_values)
        else:
            slugified_values = self.slugify(str(values))
            route_val = slugified_values

        # We need to register this use of a taxonomy term.
        eis = self.app.env.exec_info_stack
        cpi = eis.current_page_info.render_ctx.current_pass_info
        if cpi:
            utt = cpi.getCustomInfo('used_taxonomy_terms', [], True)
            utt.append(slugified_values)

        # Put the slugified values in the route metadata so they're used to
        # generate the URL.
        route_metadata[self.taxonomy.term_name] = route_val

    def bake(self, ctx):
        """ Bakes one listing page per 'dirty' taxonomy term. """
        # NOTE(review): `self.page_ref` is never assigned in this class;
        # the taxonomy's page ref string is `self.taxonomy.page_ref` --
        # this looks like mid-refactor breakage, confirm before relying
        # on this method.
        if not self.page_ref.exists:
            logger.debug(
                "No page found at '%s', skipping taxonomy '%s'." %
                (self.page_ref, self.taxonomy.name))
            return

        logger.debug("Baking %s pages...", self.taxonomy.name)
        analyzer = _TaxonomyTermsAnalyzer(self.source_name, self.taxonomy,
                                          self.slugify_mode)
        with format_timed_scope(logger, 'gathered taxonomy terms',
                                level=logging.DEBUG, colored=False):
            analyzer.analyze(ctx)

        start_time = time.perf_counter()
        page_count = self._bakeTaxonomyTerms(ctx, analyzer)
        if page_count > 0:
            logger.info(format_timed(
                start_time,
                "baked %d %s pages for %s." % (
                    page_count, self.taxonomy.term_name, self.source_name)))

    def _bakeTaxonomyTerms(self, ctx, analyzer):
        """ Queues one bake job per dirty slugified term, then collapses
            records for unchanged terms so incremental bakes don't delete
            their outputs. Returns the number of jobs queued.
        """
        # Start baking those terms.
        logger.debug(
            "Baking '%s' for source '%s': %d terms" %
            (self.taxonomy.name, self.source_name,
             len(analyzer.dirty_slugified_terms)))

        route = self.app.getGeneratorRoute(self.name)
        if route is None:
            raise Exception("No routes have been defined for generator: %s" %
                            self.name)

        logger.debug("Using taxonomy page: %s" % self.page_ref)
        fac = self.page_ref.getFactory()

        job_count = 0
        for slugified_term in analyzer.dirty_slugified_terms:
            extra_route_metadata = {
                self.taxonomy.term_name: slugified_term}

            # Use the slugified term as the record's extra key seed.
            logger.debug(
                "Queuing: %s [%s=%s]" %
                (fac.ref_spec, self.taxonomy.name, slugified_term))
            ctx.queueBakeJob(fac, route, extra_route_metadata, slugified_term)
            job_count += 1
        ctx.runJobQueue()

        # Now we create bake entries for all the terms that were *not* dirty.
        # This is because otherwise, on the next incremental bake, we wouldn't
        # find any entry for those things, and figure that we need to delete
        # their outputs.
        for prev_entry, cur_entry in ctx.getAllPageRecords():
            # Only consider taxonomy-related entries that don't have any
            # current version (i.e. they weren't baked just now).
            if prev_entry and not cur_entry:
                try:
                    t = ctx.getSeedFromRecordExtraKey(prev_entry.extra_key)
                # NOTE(review): `InvalidRecordExtraKey` is not imported in
                # this module -- reaching this handler would raise a
                # NameError; confirm the missing import.
                except InvalidRecordExtraKey:
                    continue

                if analyzer.isKnownSlugifiedTerm(t):
                    logger.debug("Creating unbaked entry for %s term: %s" %
                                 (self.name, t))
                    ctx.collapseRecord(prev_entry)
                else:
                    logger.debug("Term %s in %s isn't used anymore." %
                                 (self.name, t))

        return job_count
245 | |
246 | |
class HasTaxonomyTermsFilterClause(SettingFilterClause):
    """ Pagination filter clause that matches pages labelled with the
        given (slugified) taxonomy term or combination of terms.
    """
    def __init__(self, taxonomy, slugify_mode, value, is_combination):
        super().__init__(taxonomy.setting_name, value)
        self._taxonomy = taxonomy
        self._is_combination = is_combination
        self._slugifier = _Slugifier(taxonomy, slugify_mode)

    def pageMatches(self, fil, page):
        page_value = fil.value_accessor(page, self.name)

        if not self._taxonomy.is_multiple:
            # Single-valued taxonomy: slugify the page's value and compare.
            if page_value is None:
                return False
            return self._slugifier.slugify(page_value) == self.value

        # Multi-valued taxonomy (like tags): the page must carry a list.
        if not isinstance(page_value, list):
            return False
        page_terms = {self._slugifier.slugify(v) for v in page_value}
        if self._is_combination:
            # Several terms to match: all of them must be on the page.
            return set(self.value).issubset(page_terms)
        # Just one term to match.
        return self.value in page_terms
278 | |
279 | |
class _TaxonomyTermsAnalyzer(object):
    """ Walks the bake records to gather every taxonomy term in use, and
        to figure out which (slugified) terms are 'dirty' -- _i.e._ whose
        listing pages must be re-baked because a member page changed.
    """
    def __init__(self, source_name, taxonomy, slugify_mode):
        self.source_name = source_name
        self.taxonomy = taxonomy
        self.slugifier = _Slugifier(taxonomy, slugify_mode)
        # Maps each slugified term to the list of original terms that
        # slugify to it (used to detect slug collisions).
        self._all_terms = {}
        # Individual dirty terms, before combinations are added.
        self._single_dirty_slugified_terms = set()
        # Final list of dirty terms (individuals plus combinations);
        # populated by `analyze()`.
        self._all_dirty_slugified_terms = None

    @property
    def dirty_slugified_terms(self):
        """ Returns the slugified terms that have been 'dirtied' during
            this bake.
        """
        return self._all_dirty_slugified_terms

    def isKnownSlugifiedTerm(self, term):
        """ Returns whether the given slugified term has been seen during
            this bake.
        """
        return term in self._all_terms

    def analyze(self, ctx):
        """ Gathers all terms and computes the dirty ones from `ctx`'s
            page records. Must be called before reading
            `dirty_slugified_terms`.
        """
        # Build the list of terms for our taxonomy, and figure out which ones
        # are 'dirty' for the current bake.
        #
        # Remember all terms used.
        for _, cur_entry in ctx.getAllPageRecords():
            if cur_entry and not cur_entry.was_overriden:
                cur_terms = cur_entry.config.get(self.taxonomy.setting_name)
                if cur_terms:
                    if not self.taxonomy.is_multiple:
                        self._addTerm(cur_entry.path, cur_terms)
                    else:
                        self._addTerms(cur_entry.path, cur_terms)

        # Re-bake all taxonomy terms that include new or changed pages, by
        # marking them as 'dirty'.
        for prev_entry, cur_entry in ctx.getBakedPageRecords():
            if cur_entry.source_name != self.source_name:
                continue

            # Both the current and (if any) previous entry's terms are
            # dirty, so that pages removed from a term also refresh it.
            entries = [cur_entry]
            if prev_entry:
                entries.append(prev_entry)

            for e in entries:
                entry_terms = e.config.get(self.taxonomy.setting_name)
                if entry_terms:
                    if not self.taxonomy.is_multiple:
                        self._single_dirty_slugified_terms.add(
                            self.slugifier.slugify(entry_terms))
                    else:
                        self._single_dirty_slugified_terms.update(
                            (self.slugifier.slugify(t)
                             for t in entry_terms))

        self._all_dirty_slugified_terms = list(
            self._single_dirty_slugified_terms)
        logger.debug("Gathered %d dirty taxonomy terms",
                     len(self._all_dirty_slugified_terms))

        # Re-bake the combination pages for terms that are 'dirty'.
        # We make all terms into tuple, even those that are not actual
        # combinations, so that we have less things to test further down the
        # line.
        #
        # Add the combinations to that list. We get those combinations from
        # wherever combinations were used, so they're coming from the
        # `onRouteFunctionUsed` method.
        if self.taxonomy.is_multiple:
            known_combinations = set()
            for _, cur_entry in ctx.getAllPageRecords():
                if cur_entry:
                    used_terms = _get_all_entry_taxonomy_terms(cur_entry)
                    for terms in used_terms:
                        if len(terms) > 1:
                            known_combinations.add(terms)

            # A combination is dirty if any of its member terms is dirty.
            dcc = 0
            for terms in known_combinations:
                if not self._single_dirty_slugified_terms.isdisjoint(
                        set(terms)):
                    self._all_dirty_slugified_terms.append(
                        self.taxonomy.separator.join(terms))
                    dcc += 1
            logger.debug("Gathered %d term combinations, with %d dirty." %
                         (len(known_combinations), dcc))

    def _addTerms(self, entry_path, terms):
        """ Registers several terms found on the page at `entry_path`. """
        for t in terms:
            self._addTerm(entry_path, t)

    def _addTerm(self, entry_path, term):
        """ Registers one term found on the page at `entry_path`, warning
            when two different terms collide on the same slug.
        """
        st = self.slugifier.slugify(term)
        orig_terms = self._all_terms.setdefault(st, [])
        if orig_terms and orig_terms[0] != term:
            logger.warning(
                "Term '%s' in '%s' is slugified to '%s' which conflicts with "
                "previously existing '%s'. The two will be merged." %
                (term, entry_path, st, orig_terms[0]))
        orig_terms.append(term)
382 | |
383 | |
384 def _get_all_entry_taxonomy_terms(entry): | |
385 res = set() | |
386 for o in entry.subs: | |
387 for pinfo in o.render_info: | |
388 if pinfo: | |
389 terms = pinfo.getCustomInfo('used_taxonomy_terms') | |
390 if terms: | |
391 res |= set(terms) | |
392 return res | |
393 | |
394 | |
class _Slugifier(object):
    """ Applies the configured slugification transforms to taxonomy
        terms, according to a bit-mask of `SLUGIFY_*` flags.
    """
    def __init__(self, taxonomy, mode):
        self.taxonomy = taxonomy
        self.mode = mode

    def slugifyMultiple(self, terms):
        """ Slugifies each term in the iterable, returning a tuple. """
        return tuple(self.slugify(t) for t in terms)

    def slugify(self, term):
        """ Slugifies one term according to the mode flags. """
        mode = self.mode
        if mode & SLUGIFY_TRANSLITERATE:
            # Fold non-ASCII characters to their closest ASCII equivalent.
            term = unidecode.unidecode(term)
        if mode & SLUGIFY_LOWERCASE:
            term = term.lower()
        if mode & SLUGIFY_DOT_TO_DASH:
            # Drop leading dots, then collapse inner dot-runs to a dash.
            term = re_dot_to_dash.sub(
                '-', re_first_dot_to_dash.sub('', term))
        if mode & SLUGIFY_SPACE_TO_DASH:
            term = re_space_to_dash.sub('-', term)
        return term
414 | |
415 | |
def _parse_slugify_mode(value):
    """ Parses a comma-separated list of slugify flag names into a
        bit-mask of `SLUGIFY_*` flags.

        Raises an `Exception` for unknown flags, with a dedicated
        migration message for the PieCrust 1 `iconv` mode.
    """
    mapping = {
        'encode': SLUGIFY_ENCODE,
        'transliterate': SLUGIFY_TRANSLITERATE,
        'lowercase': SLUGIFY_LOWERCASE,
        'dot_to_dash': SLUGIFY_DOT_TO_DASH,
        'space_to_dash': SLUGIFY_SPACE_TO_DASH}
    mode = 0
    for v in value.split(','):
        # BUGFIX: strip once and reuse -- previously the 'iconv' check and
        # the error message used the raw token, so ' iconv' missed the
        # migration hint and errors reported flags with stray whitespace.
        v = v.strip()
        f = mapping.get(v)
        if f is None:
            if v == 'iconv':
                raise Exception("'iconv' is not supported as a slugify mode "
                                "in PieCrust2. Use 'transliterate'.")
            raise Exception("Unknown slugify flag: %s" % v)
        mode |= f
    return mode
433 |