comparison piecrust/sources/taxonomy.py @ 852:4850f8c21b6e

core: Start of the big refactor for PieCrust 3.0. * Everything is a `ContentSource`, including assets directories. * Most content sources are subclasses of the base file-system source. * A source is processed by a "pipeline", and there are 2 built-in pipelines, one for assets and one for pages. The asset pipeline is vaguely functional, but the page pipeline is completely broken right now. * Rewrite the baking process as just running appropriate pipelines on each content item. This should allow for better parallelization.
author Ludovic Chabant <ludovic@chabant.com>
date Wed, 17 May 2017 00:11:48 -0700
parents
children 08e02c2a2a1a
comparison
equal deleted inserted replaced
851:2c7e57d80bba 852:4850f8c21b6e
1 import re
2 import time
3 import logging
4 import unidecode
5 from piecrust.chefutil import format_timed, format_timed_scope
6 from piecrust.configuration import ConfigurationError
7 from piecrust.data.filters import (
8 PaginationFilter, SettingFilterClause,
9 page_value_accessor)
10 from piecrust.routing import RouteParameter
11 from piecrust.sources.base import ContentSource, GeneratedContentException
12
13
logger = logging.getLogger(__name__)


# Bit flags combined by `_parse_slugify_mode` into the bitmask that
# `_Slugifier.slugify` interprets when turning taxonomy terms into slugs.
SLUGIFY_ENCODE = 1
SLUGIFY_TRANSLITERATE = 2
SLUGIFY_LOWERCASE = 4
SLUGIFY_DOT_TO_DASH = 8
SLUGIFY_SPACE_TO_DASH = 16


# Pre-compiled patterns used by `_Slugifier.slugify`.
re_first_dot_to_dash = re.compile(r'^\.+')  # leading dots (stripped)
re_dot_to_dash = re.compile(r'\.+')  # runs of dots (become a dash)
re_space_to_dash = re.compile(r'\s+')  # runs of whitespace (become a dash)
27
28
class Taxonomy(object):
    """ Describes one taxonomy (tags, categories, ...) as configured
        under `site/taxonomies` in the site configuration.
    """
    def __init__(self, name, config):
        self.name = name
        self.config = config
        self.page_ref = config.get('page')
        self.separator = config.get('separator', '/')
        self.is_multiple = bool(config.get('multiple', False))
        # The singular term name defaults to the taxonomy name itself.
        self.term_name = config.get('term', name)

    @property
    def setting_name(self):
        """ The page config setting in which this taxonomy's terms are
            found on each page.
        """
        return self.name if self.is_multiple else self.term_name
43
44
class TaxonomySource(ContentSource):
    """ A page generator that handles taxonomies, _i.e._ lists of keywords
    that pages are labelled with, and for which we need to generate
    listing pages.
    """
    SOURCE_NAME = 'taxonomy'

    def __init__(self, app, name, config):
        """ Resolves this source's `taxonomy` setting against the site's
            `site/taxonomies` configuration and sets up slugification.

            Raises `ConfigurationError` if the taxonomy name is missing,
            or doesn't match any configured taxonomy.
        """
        super().__init__(app, name, config)

        tax_name = config.get('taxonomy')
        if tax_name is None:
            raise ConfigurationError(
                "Generator '%s' requires a taxonomy name." % name)
        tax_config = app.config.get('site/taxonomies/' + tax_name)
        if tax_config is None:
            # Bug fix: the format arguments used to be passed as a second
            # constructor argument (logging-style), so the '%s'
            # placeholders were never interpolated into the message.
            raise ConfigurationError(
                "Error initializing generator '%s', no such taxonomy: %s" %
                (name, tax_name))
        self.taxonomy = Taxonomy(tax_name, tax_config)

        # The slugify mode can be set per-source, falling back to the
        # site-wide `site/slugify_mode` setting (default: 'encode').
        sm = config.get('slugify_mode')
        if not sm:
            sm = app.config.get('site/slugify_mode', 'encode')
        self.slugify_mode = _parse_slugify_mode(sm)
        self.slugifier = _Slugifier(self.taxonomy, self.slugify_mode)

    def getContents(self, group):
        """ Taxonomy listing pages are generated, not backed by actual
            content items, so enumerating contents isn't supported.
        """
        raise GeneratedContentException()

    def getSupportedRouteParameters(self):
        """ Returns the single route parameter for this taxonomy's term.

            For multi-valued taxonomies (like tags) it's a variadic path
            parameter, so that term combinations can appear in the URL.
        """
        name = self.taxonomy.term_name
        param_type = (RouteParameter.TYPE_PATH if self.taxonomy.is_multiple
                      else RouteParameter.TYPE_STRING)
        return [RouteParameter(name, param_type,
                               variadic=self.taxonomy.is_multiple)]

    def slugify(self, term):
        """ Slugifies a single term with this source's slugifier. """
        return self.slugifier.slugify(term)

    def slugifyMultiple(self, terms):
        """ Slugifies an iterable of terms, returning a tuple. """
        return self.slugifier.slugifyMultiple(terms)

    def prepareRenderContext(self, ctx):
        """ Sets up the render context for one taxonomy listing page:
            pagination source, term filter, and term-related custom data.
        """
        # Set the pagination source as the source we're generating for.
        # NOTE(review): `self.source` is never assigned in this class;
        # presumably this should be `self._getSource()` — confirm.
        ctx.pagination_source = self.source

        # Get the taxonomy terms from the route metadata... this can come from
        # the browser's URL (while serving) or from the baking (see `bake`
        # method below). In both cases, we expect to have the *slugified*
        # version of the term, because we're going to set a filter that also
        # slugifies the terms found on each page.
        #
        # This is because:
        #  * while serving, we get everything from the request URL, so we only
        #    have the slugified version.
        #  * if 2 slightly different terms "collide" into the same slugified
        #    term, we'll get a merge of the 2 on the listing page, which is
        #    what the user expects.
        #
        tax_terms, is_combination = self._getTaxonomyTerms(
            ctx.page.route_metadata)
        self._setTaxonomyFilter(ctx, tax_terms, is_combination)

        # Add some custom data for rendering.
        ctx.custom_data.update({
            self.taxonomy.term_name: tax_terms,
            'is_multiple_%s' % self.taxonomy.term_name: is_combination})
        # Add some "plural" version of the term... so for instance, if this
        # is the "tags" taxonomy, "tag" will have one term most of the time,
        # except when it's a combination. Here, we add "tags" as something that
        # is always a tuple, even when it's not a combination.
        if (self.taxonomy.is_multiple and
                self.taxonomy.name != self.taxonomy.term_name):
            mult_val = tax_terms
            if not is_combination:
                mult_val = (mult_val,)
            ctx.custom_data[self.taxonomy.name] = mult_val

    def _getSource(self):
        """ Returns the content source whose pages carry the taxonomy
            terms, as named by this source's `source` setting.
        """
        return self.app.getSource(self.config['source'])

    def _getTaxonomyTerms(self, route_metadata):
        """ Extracts the (slugified) terms for this page from the route
            metadata.

            Returns a `(terms, is_combination)` pair: `terms` is a tuple
            when several terms are combined, a single value otherwise.
        """
        # Get the individual slugified terms from the route metadata.
        all_values = route_metadata.get(self.taxonomy.term_name)
        if all_values is None:
            raise Exception("'%s' values couldn't be found in route metadata" %
                            self.taxonomy.term_name)

        # If it's a "multiple" taxonomy, we need to potentially split the
        # route value into the individual terms (_e.g._ when listing all pages
        # that have 2 given tags, we need to get each of those 2 tags).
        if self.taxonomy.is_multiple:
            sep = self.taxonomy.separator
            if sep in all_values:
                return tuple(all_values.split(sep)), True
        # Not a "multiple" taxonomy, so there's only the one value.
        return all_values, False

    def _setTaxonomyFilter(self, ctx, term_value, is_combination):
        """ Installs a pagination filter that keeps only the pages whose
            (slugified) terms match `term_value`.
        """
        # Set up the filter that will check the pages' terms.
        flt = PaginationFilter(value_accessor=page_value_accessor)
        flt.addClause(HasTaxonomyTermsFilterClause(
            self.taxonomy, self.slugify_mode, term_value, is_combination))
        ctx.pagination_filter = flt

    def onRouteFunctionUsed(self, route, route_metadata):
        """ Called when a page's template uses this taxonomy's route
            function: slugifies the given term(s) in-place in the route
            metadata, and records the usage on the current render pass so
            term combinations can be re-baked later (see `analyze`).
        """
        # Get the values, and slugify them appropriately.
        values = route_metadata[self.taxonomy.term_name]
        if self.taxonomy.is_multiple:
            # TODO: here we assume the route has been properly configured.
            slugified_values = self.slugifyMultiple((str(v) for v in values))
            route_val = self.taxonomy.separator.join(slugified_values)
        else:
            slugified_values = self.slugify(str(values))
            route_val = slugified_values

        # We need to register this use of a taxonomy term.
        eis = self.app.env.exec_info_stack
        cpi = eis.current_page_info.render_ctx.current_pass_info
        if cpi:
            utt = cpi.getCustomInfo('used_taxonomy_terms', [], True)
            utt.append(slugified_values)

        # Put the slugified values in the route metadata so they're used to
        # generate the URL.
        route_metadata[self.taxonomy.term_name] = route_val

    def bake(self, ctx):
        """ Bakes all the dirty taxonomy listing pages: gathers the terms
            used by the taxonomy's pages, then queues a bake job for each
            term (or term combination) that changed in this bake.
        """
        # NOTE(review): `self.page_ref` and `self.source_name` are never
        # assigned in this class — presumably leftovers from the page
        # generator this refactor started from; confirm where they should
        # come from (e.g. `self.taxonomy.page_ref`).
        if not self.page_ref.exists:
            logger.debug(
                "No page found at '%s', skipping taxonomy '%s'." %
                (self.page_ref, self.taxonomy.name))
            return

        logger.debug("Baking %s pages...", self.taxonomy.name)
        analyzer = _TaxonomyTermsAnalyzer(self.source_name, self.taxonomy,
                                          self.slugify_mode)
        with format_timed_scope(logger, 'gathered taxonomy terms',
                                level=logging.DEBUG, colored=False):
            analyzer.analyze(ctx)

        start_time = time.perf_counter()
        page_count = self._bakeTaxonomyTerms(ctx, analyzer)
        if page_count > 0:
            logger.info(format_timed(
                start_time,
                "baked %d %s pages for %s." % (
                    page_count, self.taxonomy.term_name, self.source_name)))

    def _bakeTaxonomyTerms(self, ctx, analyzer):
        """ Queues one bake job per dirty slugified term, then reconciles
            the bake records for terms that didn't need re-baking.

            Returns the number of jobs queued.
        """
        # Start baking those terms.
        logger.debug(
            "Baking '%s' for source '%s': %d terms" %
            (self.taxonomy.name, self.source_name,
             len(analyzer.dirty_slugified_terms)))

        route = self.app.getGeneratorRoute(self.name)
        if route is None:
            raise Exception("No routes have been defined for generator: %s" %
                            self.name)

        logger.debug("Using taxonomy page: %s" % self.page_ref)
        fac = self.page_ref.getFactory()

        job_count = 0
        for slugified_term in analyzer.dirty_slugified_terms:
            extra_route_metadata = {
                self.taxonomy.term_name: slugified_term}

            # Use the slugified term as the record's extra key seed.
            logger.debug(
                "Queuing: %s [%s=%s]" %
                (fac.ref_spec, self.taxonomy.name, slugified_term))
            ctx.queueBakeJob(fac, route, extra_route_metadata, slugified_term)
            job_count += 1
        ctx.runJobQueue()

        # Now we create bake entries for all the terms that were *not* dirty.
        # This is because otherwise, on the next incremental bake, we wouldn't
        # find any entry for those things, and figure that we need to delete
        # their outputs.
        for prev_entry, cur_entry in ctx.getAllPageRecords():
            # Only consider taxonomy-related entries that don't have any
            # current version (i.e. they weren't baked just now).
            if prev_entry and not cur_entry:
                try:
                    t = ctx.getSeedFromRecordExtraKey(prev_entry.extra_key)
                # NOTE(review): `InvalidRecordExtraKey` is not imported in
                # this module — this handler would raise `NameError` if it
                # ever triggers; confirm the missing import.
                except InvalidRecordExtraKey:
                    continue

                if analyzer.isKnownSlugifiedTerm(t):
                    logger.debug("Creating unbaked entry for %s term: %s" %
                                 (self.name, t))
                    ctx.collapseRecord(prev_entry)
                else:
                    logger.debug("Term %s in %s isn't used anymore." %
                                 (self.name, t))

        return job_count
245
246
class HasTaxonomyTermsFilterClause(SettingFilterClause):
    """ A pagination filter clause that keeps only the pages whose
        (slugified) taxonomy terms match a given value or combination.
    """
    def __init__(self, taxonomy, slugify_mode, value, is_combination):
        super().__init__(taxonomy.setting_name, value)
        self._taxonomy = taxonomy
        self._is_combination = is_combination
        self._slugifier = _Slugifier(taxonomy, slugify_mode)

    def pageMatches(self, fil, page):
        """ Returns whether the page's terms match this clause's value,
            comparing slugified forms on both sides.
        """
        raw_value = fil.value_accessor(page, self.name)
        if not self._taxonomy.is_multiple:
            # Single taxonomy. Just compare the values.
            if raw_value is None:
                return False
            return self._slugifier.slugify(raw_value) == self.value
        # Multiple taxonomy, i.e. it supports multiple terms, like tags.
        # (a non-list value — including `None` — can't match anything)
        if not isinstance(raw_value, list):
            return False
        page_set = {self._slugifier.slugify(v) for v in raw_value}
        if self._is_combination:
            # Multiple taxonomy, and multiple terms to match. Check that
            # the ones to match are all in the page's terms.
            return set(self.value) <= page_set
        # Multiple taxonomy, one term to match.
        return self.value in page_set
278
279
class _TaxonomyTermsAnalyzer(object):
    """ Walks the bake records to build the list of terms used by a
        taxonomy's pages, and figures out which (slugified) terms need
        their listing pages re-baked.
    """
    def __init__(self, source_name, taxonomy, slugify_mode):
        self.source_name = source_name
        self.taxonomy = taxonomy
        self.slugifier = _Slugifier(taxonomy, slugify_mode)
        # Maps each slugified term to the list of original terms that
        # slugify to it (used to warn about collisions).
        self._all_terms = {}
        # Individual dirty terms, before combinations are added.
        self._single_dirty_slugified_terms = set()
        # Final list (individual terms + dirty combinations); only
        # available after `analyze` has run.
        self._all_dirty_slugified_terms = None

    @property
    def dirty_slugified_terms(self):
        """ Returns the slugified terms that have been 'dirtied' during
            this bake.
        """
        return self._all_dirty_slugified_terms

    def isKnownSlugifiedTerm(self, term):
        """ Returns whether the given slugified term has been seen during
            this bake.
        """
        return term in self._all_terms

    def analyze(self, ctx):
        """ Gathers all terms from the bake records and computes the set
            of dirty slugified terms (including dirty combinations).
        """
        # Build the list of terms for our taxonomy, and figure out which ones
        # are 'dirty' for the current bake.
        #
        # Remember all terms used.
        for _, cur_entry in ctx.getAllPageRecords():
            if cur_entry and not cur_entry.was_overriden:
                cur_terms = cur_entry.config.get(self.taxonomy.setting_name)
                if cur_terms:
                    if not self.taxonomy.is_multiple:
                        self._addTerm(cur_entry.path, cur_terms)
                    else:
                        self._addTerms(cur_entry.path, cur_terms)

        # Re-bake all taxonomy terms that include new or changed pages, by
        # marking them as 'dirty'.
        for prev_entry, cur_entry in ctx.getBakedPageRecords():
            if cur_entry.source_name != self.source_name:
                continue

            # Terms from both the current and (if any) previous version of
            # the entry are dirty: a term removed from a page must also get
            # its listing page re-baked.
            entries = [cur_entry]
            if prev_entry:
                entries.append(prev_entry)

            for e in entries:
                entry_terms = e.config.get(self.taxonomy.setting_name)
                if entry_terms:
                    if not self.taxonomy.is_multiple:
                        self._single_dirty_slugified_terms.add(
                            self.slugifier.slugify(entry_terms))
                    else:
                        self._single_dirty_slugified_terms.update(
                            (self.slugifier.slugify(t)
                             for t in entry_terms))

        self._all_dirty_slugified_terms = list(
            self._single_dirty_slugified_terms)
        logger.debug("Gathered %d dirty taxonomy terms",
                     len(self._all_dirty_slugified_terms))

        # Re-bake the combination pages for terms that are 'dirty'.
        # We make all terms into tuple, even those that are not actual
        # combinations, so that we have less things to test further down the
        # line.
        #
        # Add the combinations to that list. We get those combinations from
        # wherever combinations were used, so they're coming from the
        # `onRouteFunctionUsed` method.
        if self.taxonomy.is_multiple:
            known_combinations = set()
            for _, cur_entry in ctx.getAllPageRecords():
                if cur_entry:
                    used_terms = _get_all_entry_taxonomy_terms(cur_entry)
                    for terms in used_terms:
                        if len(terms) > 1:
                            known_combinations.add(terms)

            # A combination is dirty as soon as any one of its individual
            # terms is dirty.
            dcc = 0
            for terms in known_combinations:
                if not self._single_dirty_slugified_terms.isdisjoint(
                        set(terms)):
                    self._all_dirty_slugified_terms.append(
                        self.taxonomy.separator.join(terms))
                    dcc += 1
            logger.debug("Gathered %d term combinations, with %d dirty." %
                         (len(known_combinations), dcc))

    def _addTerms(self, entry_path, terms):
        # Registers each of a multi-valued entry's terms.
        for t in terms:
            self._addTerm(entry_path, t)

    def _addTerm(self, entry_path, term):
        # Registers one term under its slugified form, warning when two
        # different original terms collide on the same slug (their listing
        # pages will be merged).
        st = self.slugifier.slugify(term)
        orig_terms = self._all_terms.setdefault(st, [])
        if orig_terms and orig_terms[0] != term:
            logger.warning(
                "Term '%s' in '%s' is slugified to '%s' which conflicts with "
                "previously existing '%s'. The two will be merged." %
                (term, entry_path, st, orig_terms[0]))
        orig_terms.append(term)
382
383
384 def _get_all_entry_taxonomy_terms(entry):
385 res = set()
386 for o in entry.subs:
387 for pinfo in o.render_info:
388 if pinfo:
389 terms = pinfo.getCustomInfo('used_taxonomy_terms')
390 if terms:
391 res |= set(terms)
392 return res
393
394
class _Slugifier(object):
    """ Turns taxonomy terms into URL slugs, as described by a bitmask of
        `SLUGIFY_*` flags.
    """
    def __init__(self, taxonomy, mode):
        self.taxonomy = taxonomy
        self.mode = mode

    def slugifyMultiple(self, terms):
        """ Slugifies each term of an iterable, returning a tuple. """
        return tuple(self.slugify(t) for t in terms)

    def slugify(self, term):
        """ Applies each enabled transformation, in order, to one term. """
        mode = self.mode
        if mode & SLUGIFY_TRANSLITERATE:
            # Replace non-ASCII characters with ASCII approximations.
            term = unidecode.unidecode(term)
        if mode & SLUGIFY_LOWERCASE:
            term = term.lower()
        if mode & SLUGIFY_DOT_TO_DASH:
            # Drop leading dots, then collapse runs of dots into dashes.
            term = re_dot_to_dash.sub('-', re_first_dot_to_dash.sub('', term))
        if mode & SLUGIFY_SPACE_TO_DASH:
            term = re_space_to_dash.sub('-', term)
        return term
414
415
def _parse_slugify_mode(value):
    """ Parses a comma-separated list of slugify flag names (e.g.
        ``'transliterate, lowercase'``) into a bitmask of `SLUGIFY_*` flags.

        Raises an exception for unknown flags, with a dedicated message for
        the legacy 'iconv' mode, which isn't supported in PieCrust 2.
    """
    mapping = {
        'encode': SLUGIFY_ENCODE,
        'transliterate': SLUGIFY_TRANSLITERATE,
        'lowercase': SLUGIFY_LOWERCASE,
        'dot_to_dash': SLUGIFY_DOT_TO_DASH,
        'space_to_dash': SLUGIFY_SPACE_TO_DASH}
    mode = 0
    for v in value.split(','):
        # Bug fix: strip whitespace once, so the 'iconv' check and the
        # error message below see the same value the mapping lookup used
        # (previously ' iconv' missed the dedicated error message).
        v = v.strip()
        f = mapping.get(v)
        if f is None:
            if v == 'iconv':
                raise Exception("'iconv' is not supported as a slugify mode "
                                "in PieCrust2. Use 'transliterate'.")
            raise Exception("Unknown slugify flag: %s" % v)
        mode |= f
    return mode
433