comparison piecrust/sources/taxonomy.py @ 1136:5f97b5b59dfe

bake: Optimize cache handling for the baking process.

- Get rid of the 2-level pipeline runs... handle a single set of passes.
- Go back to load/render segments/layout passes for pages.
- Add descriptions of what each job batch does.
- Improve the taxonomy pipeline so it doesn't re-bake terms that don't need to be re-baked.
- Simplify some of the code.
author Ludovic Chabant <ludovic@chabant.com>
date Mon, 23 Apr 2018 21:47:49 -0700
parents 1857dbd4580f
children 9f3e702a8a69
comparing 1135:6350ee084273 with 1136:5f97b5b59dfe
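The taxonomy change is easiest to follow in _TaxonomyTermsAnalyzer.analyze() below: instead of diffing the previous and current record histories, the analyzer now makes a single pass over the current bake records and only marks as dirty the terms of pages whose segments were actually re-rendered. A rough sketch of that idea, with invented names and a simplified record-entry shape (this is not PieCrust's actual API):

# Sketch only: entries are plain dicts here; the real code walks
# PagePipelineRecordEntry objects and checks FLAG_OVERRIDEN /
# FLAG_SEGMENTS_RENDERED flags instead of dictionary keys.
def find_dirty_terms(entries, setting_name, is_multiple, slugify):
    all_terms = set()
    dirty_terms = set()
    for entry in entries:
        if entry.get('overriden'):
            # Page overridden by another source; its terms don't count.
            continue
        terms = entry['config'].get(setting_name)
        if not terms:
            continue
        if not is_multiple:
            terms = [terms]
        slugified = {slugify(t) for t in terms}
        all_terms |= slugified
        if entry.get('segments_rendered'):
            # Only pages that were actually re-rendered dirty their terms.
            dirty_terms |= slugified
    return all_terms, dirty_terms

# Tiny usage example with made-up data:
entries = [
    {'config': {'tags': ['python', 'web']}, 'segments_rendered': True},
    {'config': {'tags': ['recipes']}, 'segments_rendered': False},
]
print(find_dirty_terms(entries, 'tags', True, str.lower))
# e.g. ({'python', 'recipes', 'web'}, {'python', 'web'}) -- set order may vary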
--- a/piecrust/sources/taxonomy.py
+++ b/piecrust/sources/taxonomy.py
@@ -5,17 +5,13 @@
 from piecrust.configuration import ConfigurationError
 from piecrust.data.filters import (
     PaginationFilter, SettingFilterClause)
 from piecrust.page import Page
 from piecrust.pipelines._pagebaker import PageBaker
-from piecrust.pipelines._pagerecords import (
-    PagePipelineRecordEntry,
-    add_page_job_result, merge_job_result_into_record_entry)
-from piecrust.pipelines.base import (
-    ContentPipeline, get_record_name_for_source,
-    create_job, content_item_from_job)
-from piecrust.pipelines.records import RecordHistory
+from piecrust.pipelines._pagerecords import PagePipelineRecordEntry
+from piecrust.pipelines.base import (
+    ContentPipeline, get_record_name_for_source, create_job)
 from piecrust.routing import RouteParameter
 from piecrust.sources.base import ContentItem
 from piecrust.sources.generator import GeneratorSourceBase
 
 
@@ -305,12 +301,12 @@
 
         entry = rec_fac(record_entry_spec)
         current_record.addEntry(entry)
 
         if len(jobs) > 0:
-            return jobs
-        return None
+            return jobs, "taxonomize"
+        return None, None
 
     def run(self, job, ctx, result):
         term = job['term']
         content_item = ContentItem('_index',
                                    {'term': term,
@@ -322,17 +318,16 @@
         logger.debug("Rendering '%s' page: %s" %
                      (self.taxonomy.name, page.source_metadata['term']))
         prev_entry = ctx.previous_entry
         rdr_subs = self._pagebaker.bake(page, prev_entry)
 
-        add_page_job_result(result)
         result['subs'] = rdr_subs
         result['term'] = page.source_metadata['term']
 
     def handleJobResult(self, result, ctx):
         existing = ctx.record_entry
-        merge_job_result_into_record_entry(existing, result)
+        existing.subs = result['subs']
         existing.term = result['term']
 
     def postJobRun(self, ctx):
         # We create bake entries for all the terms that were *not* dirty.
         # This is because otherwise, on the next incremental bake, we wouldn't
@@ -360,11 +355,10 @@
 class _TaxonomyTermsAnalyzer(object):
     def __init__(self, pipeline, record_histories):
         self.pipeline = pipeline
         self.record_histories = record_histories
         self._all_terms = {}
-        self._single_dirty_slugified_terms = set()
         self._all_dirty_slugified_terms = None
 
     @property
     def dirty_slugified_terms(self):
         """ Returns the slugified terms that have been 'dirtied' during
@@ -379,53 +373,52 @@
         return term in self._all_terms
 
     def analyze(self):
         # Build the list of terms for our taxonomy, and figure out which ones
         # are 'dirty' for the current bake.
-        #
-        # Remember all terms used.
         source = self.pipeline.inner_source
         taxonomy = self.pipeline.taxonomy
         slugifier = self.pipeline.slugifier
 
+        tax_is_mult = taxonomy.is_multiple
+        tax_setting_name = taxonomy.setting_name
+
+        # First, go over all of our source's pages seen during this bake.
+        # Gather all the taxonomy terms they have, and also keep track of
+        # the ones used by the pages that were actually rendered (instead of
+        # those that were up-to-date and skipped).
+        single_dirty_slugified_terms = set()
+        current_records = self.record_histories.current
         record_name = get_record_name_for_source(source)
-        current_records = self.record_histories.current
         cur_rec = current_records.getRecord(record_name)
         for cur_entry in cur_rec.getEntries():
-            if not cur_entry.was_overriden:
-                cur_terms = cur_entry.config.get(taxonomy.setting_name)
-                if cur_terms:
-                    if not taxonomy.is_multiple:
-                        self._addTerm(
-                            slugifier, cur_entry.item_spec, cur_terms)
-                    else:
-                        self._addTerms(
-                            slugifier, cur_entry.item_spec, cur_terms)
-
-        # Re-bake all taxonomy terms that include new or changed pages, by
-        # marking them as 'dirty'.
-        history = self.record_histories.getHistory(record_name).copy()
-        history.build()
-        for prev_entry, cur_entry in history.diffs:
-            entries = [cur_entry]
-            if prev_entry:
-                entries.append(prev_entry)
-
-            for e in entries:
-                if e and e.was_any_sub_baked:
-                    entry_terms = e.config.get(taxonomy.setting_name)
-                    if entry_terms:
-                        if not taxonomy.is_multiple:
-                            self._single_dirty_slugified_terms.add(
-                                slugifier.slugify(entry_terms))
-                        else:
-                            self._single_dirty_slugified_terms.update(
-                                (slugifier.slugify(t)
-                                 for t in entry_terms))
+            if cur_entry.hasFlag(PagePipelineRecordEntry.FLAG_OVERRIDEN):
+                continue
+
+            cur_terms = cur_entry.config.get(tax_setting_name)
+            if not cur_terms:
+                continue
+
+            if not tax_is_mult:
+                self._addTerm(
+                    slugifier, cur_entry.item_spec, cur_terms)
+            else:
+                self._addTerms(
+                    slugifier, cur_entry.item_spec, cur_terms)
+
+            if cur_entry.hasFlag(
+                    PagePipelineRecordEntry.FLAG_SEGMENTS_RENDERED):
+                if not tax_is_mult:
+                    single_dirty_slugified_terms.add(
+                        slugifier.slugify(cur_terms))
+                else:
+                    single_dirty_slugified_terms.update(
+                        (slugifier.slugify(t)
+                         for t in cur_terms))
 
         self._all_dirty_slugified_terms = list(
-            self._single_dirty_slugified_terms)
+            single_dirty_slugified_terms)
         logger.debug("Gathered %d dirty taxonomy terms",
                      len(self._all_dirty_slugified_terms))
 
         # Re-bake the combination pages for terms that are 'dirty'.
         # We make all terms into tuple, even those that are not actual
@@ -436,11 +429,11 @@
         # wherever combinations were used, so they're coming from the
         # `onRouteFunctionUsed` method. And because combinations can be used
         # by any page in the website (anywhere someone can ask for an URL
         # to the combination page), it means we check all the records, not
         # just the record for our source.
-        if taxonomy.is_multiple:
+        if tax_is_mult:
             known_combinations = set()
             for rec in current_records.records:
                 # Cheap way to test if a record contains entries that
                 # are sub-types of a page entry: test the first one.
                 first_entry = next(iter(rec.getEntries()), None)
@@ -454,11 +447,11 @@
                     if len(terms) > 1:
                         known_combinations.add(terms)
 
             dcc = 0
             for terms in known_combinations:
-                if not self._single_dirty_slugified_terms.isdisjoint(
+                if not single_dirty_slugified_terms.isdisjoint(
                         set(terms)):
                     self._all_dirty_slugified_terms.append(
                         taxonomy.separator.join(terms))
                     dcc += 1
             logger.debug("Gathered %d term combinations, with %d dirty." %