comparison piecrust/sources/taxonomy.py @ 855:448710d84121

refactor: Get the taxonomy support back to a functional state. There is now a taxonomy content source that wraps another, regular content source, such as a blog's posts source. It works in tandem with a taxonomy content pipeline that does the heavy lifting of figuring out which terms exist and need to be baked.
author Ludovic Chabant <ludovic@chabant.com>
date Tue, 06 Jun 2017 00:26:21 -0700
parents 08e02c2a2a1a
children 9bb22bbe093c
comparing 854:08e02c2a2a1a with 855:448710d84121
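
The `config.get()` calls in the new `TaxonomySource.__init__` imply that a taxonomy source is declared with roughly the following settings (a sketch; the key names come straight from the code below, the values are hypothetical):

    # Hypothetical configuration for a taxonomy source wrapping a
    # 'posts' source; key names mirror the config.get() calls below.
    config = {
        'taxonomy': 'tags',       # must exist under site/taxonomies
        'source': 'posts',        # name of the inner source being wrapped
        'template': '_tag.html',  # optional; defaults to '_%s.html' % tax_name
        'slugify_mode': 'encode', # optional; falls back to site/slugify_mode
    }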
--- a/piecrust/sources/taxonomy.py
+++ b/piecrust/sources/taxonomy.py
@@ -1,16 +1,23 @@
+import io
 import re
 import time
 import logging
 import unidecode
-from piecrust.chefutil import format_timed, format_timed_scope
+from werkzeug.utils import cached_property
 from piecrust.configuration import ConfigurationError
 from piecrust.data.filters import (
     PaginationFilter, SettingFilterClause)
-from piecrust.pipelines.base import ContentPipeline
+from piecrust.page import Page
+from piecrust.pipelines._pagebaker import PageBaker
+from piecrust.pipelines._pagerecords import PagePipelineRecordEntry
+from piecrust.pipelines.base import (
+    ContentPipeline, get_record_name_for_source)
+from piecrust.pipelines.records import RecordHistory
 from piecrust.routing import RouteParameter
-from piecrust.sources.base import ContentSource, GeneratedContentException
+from piecrust.sources.base import (
+    ContentItem, ContentSource, GeneratedContentException)
 
 
 logger = logging.getLogger(__name__)
 
 
@@ -42,27 +49,52 @@
         if self.is_multiple:
             return self.name
         return self.term_name
 
 
+_taxonomy_index = """---
+layout: %(template)s
+---
+"""
+
+
 class TaxonomySource(ContentSource):
     """ A content source that generates taxonomy listing pages.
     """
     SOURCE_NAME = 'taxonomy'
     DEFAULT_PIPELINE_NAME = 'taxonomy'
 
     def __init__(self, app, name, config):
         super().__init__(app, name, config)
 
+        source_name = config.get('source')
+        if source_name is None:
+            raise ConfigurationError(
+                "Taxonomy source '%s' requires an inner source." % name)
+        self._inner_source_name = source_name
+
         tax_name = config.get('taxonomy')
         if tax_name is None:
             raise ConfigurationError(
                 "Taxonomy source '%s' requires a taxonomy name." % name)
         self.taxonomy = _get_taxonomy(app, tax_name)
 
         sm = config.get('slugify_mode')
         self.slugifier = _get_slugifier(app, self.taxonomy, sm)
+
+        tpl_name = config.get('template', '_%s.html' % tax_name)
+        self._raw_item = _taxonomy_index % {'template': tpl_name}
+
+    @cached_property
+    def inner_source(self):
+        return self.app.getSource(self._inner_source_name)
+
+    def openItem(self, item, mode='r', **kwargs):
+        return io.StringIO(self._raw_item)
+
+    def getItemMtime(self, item):
+        return time.time()
 
     def getContents(self, group):
         # Our content is procedurally generated from other content sources,
         # so we really don't support listing anything here -- it would be
         # quite costly.
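
The `_taxonomy_index` template and the new `openItem()` make term listing pages fully virtual: nothing is read from disk, and every term page is synthesized from the same two-line header. A sketch of what a generated item would contain, assuming a `tags` taxonomy using the default template name:

    # src: a TaxonomySource configured for the 'tags' taxonomy (assumed).
    item = ContentItem('_index[python]', {'term': 'python'})
    print(src.openItem(item).read())
    # ---
    # layout: _tags.html
    # ---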
@@ -76,19 +108,28 @@
         param_type = (RouteParameter.TYPE_PATH if self.taxonomy.is_multiple
                       else RouteParameter.TYPE_STRING)
         return [RouteParameter(name, param_type,
                                variadic=self.taxonomy.is_multiple)]
 
+    def findContent(self, route_params):
+        slugified_term = route_params[self.taxonomy.term_name]
+        spec = '_index[%s]' % slugified_term
+        metadata = {'term': slugified_term,
+                    'route_params': {
+                        self.taxonomy.term_name: slugified_term}
+                    }
+        return ContentItem(spec, metadata)
+
     def slugify(self, term):
         return self.slugifier.slugify(term)
 
     def slugifyMultiple(self, terms):
         return self.slugifier.slugifyMultiple(terms)
 
     def prepareRenderContext(self, ctx):
         # Set the pagination source as the source we're generating for.
-        ctx.pagination_source = self.source
+        ctx.pagination_source = self.inner_source
 
         # Get the taxonomy terms from the route metadata... this can come from
         # the browser's URL (while serving) or from the baking (see `bake`
         # method below). In both cases, we expect to have the *slugified*
         # version of the term, because we're going to set a filter that also
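
`findContent()` is the serving-side counterpart of the bake jobs created by the pipeline further down: the slugified term from the URL becomes both the item spec and the item metadata, so serving and baking resolve the same virtual page. For instance, assuming the taxonomy's term name is `tag`:

    item = src.findContent({'tag': 'python'})
    assert item.spec == '_index[python]'
    assert item.metadata == {'term': 'python',
                             'route_params': {'tag': 'python'}}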
@@ -139,11 +180,11 @@
 
     def _setTaxonomyFilter(self, ctx, term_value, is_combination):
         # Set up the filter that will check the pages' terms.
         flt = PaginationFilter()
         flt.addClause(HasTaxonomyTermsFilterClause(
-            self.taxonomy, self.slugify.mode, term_value, is_combination))
+            self.taxonomy, self.slugifier.mode, term_value, is_combination))
         ctx.pagination_filter = flt
 
     def onRouteFunctionUsed(self, route_params):
         # Get the values, and slugify them appropriately.
         values = route_params[self.taxonomy.term_name]
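
This hunk fixes a latent bug: `self.slugify` is a method, so `self.slugify.mode` would have raised `AttributeError` at runtime; the `mode` attribute lives on the `_Slugifier` instance. The filter being built reads as follows (a sketch with a hypothetical term value):

    flt = PaginationFilter()
    flt.addClause(HasTaxonomyTermsFilterClause(
        self.taxonomy, self.slugifier.mode, 'python', is_combination=False))
    # Pagination will only keep pages whose taxonomy setting contains a
    # value that slugifies to 'python'.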
@@ -171,34 +212,38 @@
     def __init__(self, taxonomy, slugify_mode, value, is_combination):
         super().__init__(taxonomy.setting_name, value)
         self._taxonomy = taxonomy
         self._is_combination = is_combination
         self._slugifier = _Slugifier(taxonomy, slugify_mode)
-
-    def pageMatches(self, fil, page):
-        if self._taxonomy.is_multiple:
-            # Multiple taxonomy, i.e. it supports multiple terms, like tags.
-            page_values = fil.value_accessor(page, self.name)
-            if page_values is None or not isinstance(page_values, list):
-                return False
-
-            page_set = set(map(self._slugifier.slugify, page_values))
-            if self._is_combination:
-                # Multiple taxonomy, and multiple terms to match. Check that
-                # the ones to match are all in the page's terms.
-                value_set = set(self.value)
-                return value_set.issubset(page_set)
-            else:
-                # Multiple taxonomy, one term to match.
-                return self.value in page_set
-        else:
-            # Single taxonomy. Just compare the values.
-            page_value = fil.value_accessor(page, self.name)
-            if page_value is None:
-                return False
-            page_value = self._slugifier.slugify(page_value)
-            return page_value == self.value
+        if taxonomy.is_multiple:
+            self.pageMatches = self._pageMatchesAny
+        else:
+            self.pageMatches = self._pageMatchesSingle
+
+    def _pageMatchesAny(self, fil, page):
+        # Multiple taxonomy, i.e. it supports multiple terms, like tags.
+        page_values = page.config.get(self.name)
+        if page_values is None or not isinstance(page_values, list):
+            return False
+
+        page_set = set(map(self._slugifier.slugify, page_values))
+        if self._is_combination:
+            # Multiple taxonomy, and multiple terms to match. Check that
+            # the ones to match are all in the page's terms.
+            value_set = set(self.value)
+            return value_set.issubset(page_set)
+        else:
+            # Multiple taxonomy, one term to match.
+            return self.value in page_set
+
+    def _pageMatchesSingle(self, fil, page):
+        # Single taxonomy. Just compare the values.
+        page_value = page.config.get(self.name)
+        if page_value is None:
+            return False
+        page_value = self._slugifier.slugify(page_value)
+        return page_value == self.value
 
 
 def _get_taxonomy(app, tax_name):
     tax_config = app.config.get('site/taxonomies/' + tax_name)
     if tax_config is None:
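
The matching logic is unchanged in substance, but the multiple/single decision now happens once, when the clause is built, instead of on every page visit. The combination check itself is plain set algebra; with hypothetical values:

    page_set = {'python', 'webdev', 'piecrust'}     # page's slugified tags
    print({'python', 'webdev'}.issubset(page_set))  # True: page matches
    print({'python', 'rust'}.issubset(page_set))    # False: 'rust' missing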
@@ -211,108 +256,101 @@
     slugify_mode = app.config.get('site/slugify_mode', 'encode')
     sm = _parse_slugify_mode(slugify_mode)
     return _Slugifier(taxonomy, sm)
 
 
+class TaxonomyPipelineRecordEntry(PagePipelineRecordEntry):
+    def __init__(self):
+        super().__init__()
+        self.term = None
+
+
 class TaxonomyPipeline(ContentPipeline):
     PIPELINE_NAME = 'taxonomy'
     PASS_NUM = 1
+    RECORD_ENTRY_CLASS = TaxonomyPipelineRecordEntry
 
     def __init__(self, source, ctx):
         if not isinstance(source, TaxonomySource):
             raise Exception("The taxonomy pipeline only supports taxonomy "
                             "content sources.")
 
         super().__init__(source, ctx)
+        self.inner_source = source.inner_source
         self.taxonomy = source.taxonomy
         self.slugifier = source.slugifier
-
-    def buildJobs(self):
-        logger.debug("Building taxonomy pages for source: %s" %
-                     self.source.name)
-        analyzer = _TaxonomyTermsAnalyzer(self)
-        with format_timed_scope(logger, 'gathered taxonomy terms',
-                                level=logging.DEBUG, colored=False):
-            analyzer.analyze(ctx)
-
-    def bake(self, ctx):
-        if not self.page_ref.exists:
-            logger.debug(
-                "No page found at '%s', skipping taxonomy '%s'." %
-                (self.page_ref, self.taxonomy.name))
-            return
-
-        logger.debug("Baking %s pages...", self.taxonomy.name)
-        analyzer = _TaxonomyTermsAnalyzer(self.source_name, self.taxonomy,
-                                          self.slugify_mode)
-        with format_timed_scope(logger, 'gathered taxonomy terms',
-                                level=logging.DEBUG, colored=False):
-            analyzer.analyze(ctx)
-
-        start_time = time.perf_counter()
-        page_count = self._bakeTaxonomyTerms(ctx, analyzer)
-        if page_count > 0:
-            logger.info(format_timed(
-                start_time,
-                "baked %d %s pages for %s." % (
-                    page_count, self.taxonomy.term_name, self.source_name)))
-
-    def _bakeTaxonomyTerms(self, ctx, analyzer):
-        # Start baking those terms.
-        logger.debug(
-            "Baking '%s' for source '%s': %d terms" %
-            (self.taxonomy.name, self.source_name,
-             len(analyzer.dirty_slugified_terms)))
-
-        route = self.app.getGeneratorRoute(self.name)
-        if route is None:
-            raise Exception("No routes have been defined for generator: %s" %
-                            self.name)
-
-        logger.debug("Using taxonomy page: %s" % self.page_ref)
-        fac = self.page_ref.getFactory()
-
-        job_count = 0
-        for slugified_term in analyzer.dirty_slugified_terms:
-            extra_route_params = {
-                self.taxonomy.term_name: slugified_term}
-
-            # Use the slugified term as the record's extra key seed.
-            logger.debug(
-                "Queuing: %s [%s=%s]" %
-                (fac.ref_spec, self.taxonomy.name, slugified_term))
-            ctx.queueBakeJob(fac, route, extra_route_params, slugified_term)
-            job_count += 1
-        ctx.runJobQueue()
-
-        # Now we create bake entries for all the terms that were *not* dirty.
+        self._tpl_name = source.config['template']
+        self._analyzer = None
+        self._pagebaker = None
+
+    def initialize(self):
+        self._pagebaker = PageBaker(self.app,
+                                    self.ctx.out_dir,
+                                    force=self.ctx.force)
+        self._pagebaker.startWriterQueue()
+
+    def shutdown(self):
+        self._pagebaker.stopWriterQueue()
+
+    def createJobs(self, ctx):
+        logger.debug("Building '%s' taxonomy pages for source: %s" %
+                     (self.taxonomy.name, self.inner_source.name))
+        self._analyzer = _TaxonomyTermsAnalyzer(self, ctx.record_histories)
+        self._analyzer.analyze()
+
+        logger.debug("Queuing %d '%s' jobs." %
+                     (len(self._analyzer.dirty_slugified_terms),
+                      self.taxonomy.name))
+        jobs = []
+        for slugified_term in self._analyzer.dirty_slugified_terms:
+            item = ContentItem(
+                '_index[%s]' % slugified_term,
+                {'term': slugified_term,
+                 'route_params': {
+                     self.taxonomy.term_name: slugified_term}
+                 })
+            jobs.append(self.createJob(item))
+        if len(jobs) > 0:
+            return jobs
+        return None
+
+    def run(self, job, ctx, result):
+        content_item = job.content_item
+        logger.debug("Rendering '%s' page: %s" %
+                     (self.taxonomy.name, content_item.metadata['term']))
+
+        page = Page(self.source, job.content_item)
+        prev_entry = ctx.previous_entry
+        cur_entry = result.record_entry
+        self._pagebaker.bake(page, prev_entry, cur_entry, [])
+
+    def postJobRun(self, ctx):
+        # We create bake entries for all the terms that were *not* dirty.
         # This is because otherwise, on the next incremental bake, we wouldn't
         # find any entry for those things, and figure that we need to delete
         # their outputs.
-        for prev_entry, cur_entry in ctx.getAllPageRecords():
-            # Only consider taxonomy-related entries that don't have any
-            # current version (i.e. they weren't baked just now).
-            if prev_entry and not cur_entry:
-                try:
-                    t = ctx.getSeedFromRecordExtraKey(prev_entry.extra_key)
-                except InvalidRecordExtraKey:
-                    continue
-
+        analyzer = self._analyzer
+        for prev, cur in ctx.record_history.diffs:
+            # Only consider entries that don't have any current version
+            # (i.e. they weren't baked just now).
+            if prev and not cur:
+                t = prev.term
                 if analyzer.isKnownSlugifiedTerm(t):
-                    logger.debug("Creating unbaked entry for %s term: %s" %
-                                 (self.name, t))
-                    ctx.collapseRecord(prev_entry)
+                    logger.debug("Creating unbaked entry for '%s' term: %s" %
+                                 (self.taxonomy.name, t))
+                    cur.term = t
+                    cur.out_paths = list(prev.out_paths)
+                    cur.errors = list(prev.errors)
                 else:
-                    logger.debug("Term %s in %s isn't used anymore." %
-                                 (self.name, t))
-
-        return job_count
+                    logger.debug("Term '%s' in '%s' isn't used anymore." %
+                                 (t, self.taxonomy.name))
 
 
 class _TaxonomyTermsAnalyzer(object):
-    def __init__(self, source):
-        self.source = source
+    def __init__(self, pipeline, record_histories):
+        self.pipeline = pipeline
+        self.record_histories = record_histories
         self._all_terms = {}
         self._single_dirty_slugified_terms = set()
         self._all_dirty_slugified_terms = None
 
     @property
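
The analyzer reads the *current* bake records of the wrapped source, which is presumably why the pipeline declares `PASS_NUM = 1`: it has to run after the inner source's own pipeline has populated those records. A sketch of the record lookup it performs, using only names from this diff (`'tags'` as the setting name is an assumption):

    record_name = get_record_name_for_source(pipeline.inner_source)
    cur_rec = record_histories.current.getRecord(record_name)
    for entry in cur_rec.getEntries():
        print(entry.item_spec, entry.config.get('tags'))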
326 """ Returns whether the given slugified term has been seen during 364 """ Returns whether the given slugified term has been seen during
327 this bake. 365 this bake.
328 """ 366 """
329 return term in self._all_terms 367 return term in self._all_terms
330 368
331 def analyze(self, ctx): 369 def analyze(self):
332 # Build the list of terms for our taxonomy, and figure out which ones 370 # Build the list of terms for our taxonomy, and figure out which ones
333 # are 'dirty' for the current bake. 371 # are 'dirty' for the current bake.
334 # 372 #
335 # Remember all terms used. 373 # Remember all terms used.
336 for _, cur_entry in ctx.getAllPageRecords(): 374 source = self.pipeline.inner_source
337 if cur_entry and not cur_entry.was_overriden: 375 taxonomy = self.pipeline.taxonomy
338 cur_terms = cur_entry.config.get(self.taxonomy.setting_name) 376 slugifier = self.pipeline.slugifier
377
378 record_name = get_record_name_for_source(source)
379 current_records = self.record_histories.current
380 cur_rec = current_records.getRecord(record_name)
381 for cur_entry in cur_rec.getEntries():
382 if not cur_entry.was_overriden:
383 cur_terms = cur_entry.config.get(taxonomy.setting_name)
339 if cur_terms: 384 if cur_terms:
340 if not self.taxonomy.is_multiple: 385 if not taxonomy.is_multiple:
341 self._addTerm(cur_entry.path, cur_terms) 386 self._addTerm(
387 slugifier, cur_entry.item_spec, cur_terms)
342 else: 388 else:
343 self._addTerms(cur_entry.path, cur_terms) 389 self._addTerms(
390 slugifier, cur_entry.item_spec, cur_terms)
344 391
345 # Re-bake all taxonomy terms that include new or changed pages, by 392 # Re-bake all taxonomy terms that include new or changed pages, by
346 # marking them as 'dirty'. 393 # marking them as 'dirty'.
347 for prev_entry, cur_entry in ctx.getBakedPageRecords(): 394 previous_records = self.record_histories.previous
348 if cur_entry.source_name != self.source_name: 395 prev_rec = previous_records.getRecord(record_name)
349 continue 396 history = RecordHistory(prev_rec, cur_rec)
350 397 history.build()
398 for prev_entry, cur_entry in history.diffs:
351 entries = [cur_entry] 399 entries = [cur_entry]
352 if prev_entry: 400 if prev_entry:
353 entries.append(prev_entry) 401 entries.append(prev_entry)
354 402
355 for e in entries: 403 for e in entries:
356 entry_terms = e.config.get(self.taxonomy.setting_name) 404 entry_terms = e.config.get(taxonomy.setting_name)
357 if entry_terms: 405 if entry_terms:
358 if not self.taxonomy.is_multiple: 406 if not taxonomy.is_multiple:
359 self._single_dirty_slugified_terms.add( 407 self._single_dirty_slugified_terms.add(
360 self.slugifier.slugify(entry_terms)) 408 slugifier.slugify(entry_terms))
361 else: 409 else:
362 self._single_dirty_slugified_terms.update( 410 self._single_dirty_slugified_terms.update(
363 (self.slugifier.slugify(t) 411 (slugifier.slugify(t)
364 for t in entry_terms)) 412 for t in entry_terms))
365 413
366 self._all_dirty_slugified_terms = list( 414 self._all_dirty_slugified_terms = list(
367 self._single_dirty_slugified_terms) 415 self._single_dirty_slugified_terms)
368 logger.debug("Gathered %d dirty taxonomy terms", 416 logger.debug("Gathered %d dirty taxonomy terms",
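
To make the dirty-term diffing concrete: if a page's tags went from `['python']` to `['python', 'webdev']` between bakes, both the previous and the current entry contribute their terms, so both slugified terms become dirty and both listing pages get re-baked (hypothetical values):

    prev_terms = ['python']            # from the previous bake's entry
    cur_terms = ['python', 'webdev']   # from the current bake's entry
    dirty = set()
    for terms in (cur_terms, prev_terms):
        dirty.update(slugifier.slugify(t) for t in terms)
    # dirty == {'python', 'webdev'}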
@@ -374,41 +422,40 @@
         # line.
         #
         # Add the combinations to that list. We get those combinations from
         # wherever combinations were used, so they're coming from the
         # `onRouteFunctionUsed` method.
-        if self.taxonomy.is_multiple:
+        if taxonomy.is_multiple:
             known_combinations = set()
-            for _, cur_entry in ctx.getAllPageRecords():
-                if cur_entry:
-                    used_terms = _get_all_entry_taxonomy_terms(cur_entry)
-                    for terms in used_terms:
-                        if len(terms) > 1:
-                            known_combinations.add(terms)
+            for cur_entry in cur_rec.getEntries():
+                used_terms = _get_all_entry_taxonomy_terms(cur_entry)
+                for terms in used_terms:
+                    if len(terms) > 1:
+                        known_combinations.add(terms)
 
             dcc = 0
             for terms in known_combinations:
                 if not self._single_dirty_slugified_terms.isdisjoint(
                         set(terms)):
                     self._all_dirty_slugified_terms.append(
-                        self.taxonomy.separator.join(terms))
+                        taxonomy.separator.join(terms))
                     dcc += 1
             logger.debug("Gathered %d term combinations, with %d dirty." %
                          (len(known_combinations), dcc))
 
-    def _addTerms(self, entry_path, terms):
+    def _addTerms(self, slugifier, item_spec, terms):
         for t in terms:
-            self._addTerm(entry_path, t)
+            self._addTerm(slugifier, item_spec, t)
 
-    def _addTerm(self, entry_path, term):
-        st = self.slugifier.slugify(term)
+    def _addTerm(self, slugifier, item_spec, term):
+        st = slugifier.slugify(term)
         orig_terms = self._all_terms.setdefault(st, [])
         if orig_terms and orig_terms[0] != term:
             logger.warning(
                 "Term '%s' in '%s' is slugified to '%s' which conflicts with "
                 "previously existing '%s'. The two will be merged." %
-                (term, entry_path, st, orig_terms[0]))
+                (term, item_spec, st, orig_terms[0]))
         orig_terms.append(term)
 
 
 def _get_all_entry_taxonomy_terms(entry):
     res = set()
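
A concrete reading of `_addTerm`'s merge behavior: two spellings that slugify to the same string share one bucket keyed by the slug, with the first-seen spelling kept at the front (hypothetical terms, assuming a lowercasing slugify mode):

    analyzer._addTerm(slugifier, 'pages/one.md', 'Web Dev')
    analyzer._addTerm(slugifier, 'pages/two.md', 'web-dev')
    # If both slugify to 'web-dev', the second call logs the merge warning
    # and _all_terms['web-dev'] ends up as ['Web Dev', 'web-dev'].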