diff piecrust/sources/taxonomy.py @ 1136:5f97b5b59dfe

bake: Optimize cache handling for the baking process.

- Get rid of the 2-level pipeline runs... handle a single set of passes.
- Go back to load/render segments/layout passes for pages.
- Add descriptions of what each job batch does.
- Improve the taxonomy pipeline so it doesn't re-bake terms that don't need to be re-baked.
- Simplify some of the code.
author Ludovic Chabant <ludovic@chabant.com>
date Mon, 23 Apr 2018 21:47:49 -0700
parents 1857dbd4580f
children 9f3e702a8a69
--- a/piecrust/sources/taxonomy.py	Mon Apr 23 21:37:43 2018 -0700
+++ b/piecrust/sources/taxonomy.py	Mon Apr 23 21:47:49 2018 -0700
@@ -7,13 +7,9 @@
     PaginationFilter, SettingFilterClause)
 from piecrust.page import Page
 from piecrust.pipelines._pagebaker import PageBaker
-from piecrust.pipelines._pagerecords import (
-    PagePipelineRecordEntry,
-    add_page_job_result, merge_job_result_into_record_entry)
+from piecrust.pipelines._pagerecords import PagePipelineRecordEntry
 from piecrust.pipelines.base import (
-    ContentPipeline, get_record_name_for_source,
-    create_job, content_item_from_job)
-from piecrust.pipelines.records import RecordHistory
+    ContentPipeline, get_record_name_for_source, create_job)
 from piecrust.routing import RouteParameter
 from piecrust.sources.base import ContentItem
 from piecrust.sources.generator import GeneratorSourceBase
@@ -307,8 +303,8 @@
             current_record.addEntry(entry)
 
         if len(jobs) > 0:
-            return jobs
-        return None
+            return jobs, "taxonomize"
+        return None, None
 
     def run(self, job, ctx, result):
         term = job['term']
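The hunk above changes the job-queuing method to return a (jobs, description) pair instead of a bare job list, which matches the "add descriptions of what each job batch does" bullet in the commit message. A minimal sketch of the new contract, with queue_jobs as a hypothetical stand-in for the pipeline method rather than PieCrust's actual API:

def queue_jobs(dirty_terms):
    """Hypothetical stand-in for the pipeline's job-queuing step."""
    jobs = [{'term': t} for t in dirty_terms]
    if len(jobs) > 0:
        return jobs, "taxonomize"   # the batch now carries a description
    return None, None

jobs, label = queue_jobs(['python', 'baking'])
print(label, jobs)  # taxonomize [{'term': 'python'}, {'term': 'baking'}]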
@@ -324,13 +320,12 @@
         prev_entry = ctx.previous_entry
         rdr_subs = self._pagebaker.bake(page, prev_entry)
 
-        add_page_job_result(result)
         result['subs'] = rdr_subs
         result['term'] = page.source_metadata['term']
 
     def handleJobResult(self, result, ctx):
         existing = ctx.record_entry
-        merge_job_result_into_record_entry(existing, result)
+        existing.subs = result['subs']
         existing.term = result['term']
 
     def postJobRun(self, ctx):
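With add_page_job_result and merge_job_result_into_record_entry gone, run() writes its outputs straight into the result dict and handleJobResult() copies them onto the record entry. A self-contained sketch of that round trip, using a stand-in RecordEntry rather than PieCrust's actual record class:

class RecordEntry:
    """Hypothetical stand-in for PieCrust's pipeline record entry."""
    subs = None
    term = None

def run(job, result):
    # The real method bakes the page; here we fake the rendered subs.
    result['subs'] = ['<rendered sub-page>']
    result['term'] = job['term']

def handle_job_result(result, entry):
    entry.subs = result['subs']
    entry.term = result['term']

entry, result = RecordEntry(), {}
run({'term': 'python'}, result)
handle_job_result(result, entry)
print(entry.term, entry.subs)  # python ['<rendered sub-page>']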
@@ -362,7 +357,6 @@
         self.pipeline = pipeline
         self.record_histories = record_histories
         self._all_terms = {}
-        self._single_dirty_slugified_terms = set()
         self._all_dirty_slugified_terms = None
 
     @property
@@ -381,49 +375,48 @@
     def analyze(self):
         # Build the list of terms for our taxonomy, and figure out which ones
         # are 'dirty' for the current bake.
-        #
-        # Remember all terms used.
         source = self.pipeline.inner_source
         taxonomy = self.pipeline.taxonomy
         slugifier = self.pipeline.slugifier
 
+        tax_is_mult = taxonomy.is_multiple
+        tax_setting_name = taxonomy.setting_name
+
+        # First, go over all of our source's pages seen during this bake.
+        # Gather all the taxonomy terms they have, and also keep track of
+        # the ones used by the pages that were actually rendered (instead of
+        # those that were up-to-date and skipped).
+        single_dirty_slugified_terms = set()
+        current_records = self.record_histories.current
         record_name = get_record_name_for_source(source)
-        current_records = self.record_histories.current
         cur_rec = current_records.getRecord(record_name)
         for cur_entry in cur_rec.getEntries():
-            if not cur_entry.was_overriden:
-                cur_terms = cur_entry.config.get(taxonomy.setting_name)
-                if cur_terms:
-                    if not taxonomy.is_multiple:
-                        self._addTerm(
-                            slugifier, cur_entry.item_spec, cur_terms)
-                    else:
-                        self._addTerms(
-                            slugifier, cur_entry.item_spec, cur_terms)
+            if cur_entry.hasFlag(PagePipelineRecordEntry.FLAG_OVERRIDEN):
+                continue
+
+            cur_terms = cur_entry.config.get(tax_setting_name)
+            if not cur_terms:
+                continue
 
-        # Re-bake all taxonomy terms that include new or changed pages, by
-        # marking them as 'dirty'.
-        history = self.record_histories.getHistory(record_name).copy()
-        history.build()
-        for prev_entry, cur_entry in history.diffs:
-            entries = [cur_entry]
-            if prev_entry:
-                entries.append(prev_entry)
+            if not tax_is_mult:
+                self._addTerm(
+                    slugifier, cur_entry.item_spec, cur_terms)
+            else:
+                self._addTerms(
+                    slugifier, cur_entry.item_spec, cur_terms)
 
-            for e in entries:
-                if e and e.was_any_sub_baked:
-                    entry_terms = e.config.get(taxonomy.setting_name)
-                    if entry_terms:
-                        if not taxonomy.is_multiple:
-                            self._single_dirty_slugified_terms.add(
-                                slugifier.slugify(entry_terms))
-                        else:
-                            self._single_dirty_slugified_terms.update(
-                                (slugifier.slugify(t)
-                                 for t in entry_terms))
+            if cur_entry.hasFlag(
+                    PagePipelineRecordEntry.FLAG_SEGMENTS_RENDERED):
+                if not tax_is_mult:
+                    single_dirty_slugified_terms.add(
+                        slugifier.slugify(cur_terms))
+                else:
+                    single_dirty_slugified_terms.update(
+                        (slugifier.slugify(t)
+                         for t in cur_terms))
 
         self._all_dirty_slugified_terms = list(
-            self._single_dirty_slugified_terms)
+            single_dirty_slugified_terms)
         logger.debug("Gathered %d dirty taxonomy terms",
                      len(self._all_dirty_slugified_terms))
 
@@ -438,7 +431,7 @@
         # by any page in the website (anywhere someone can ask for an URL
         # to the combination page), it means we check all the records, not
         # just the record for our source.
-        if taxonomy.is_multiple:
+        if tax_is_mult:
             known_combinations = set()
             for rec in current_records.records:
                 # Cheap way to test if a record contains entries that
@@ -456,7 +449,7 @@
 
             dcc = 0
             for terms in known_combinations:
-                if not self._single_dirty_slugified_terms.isdisjoint(
+                if not single_dirty_slugified_terms.isdisjoint(
                         set(terms)):
                     self._all_dirty_slugified_terms.append(
                         taxonomy.separator.join(terms))
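The final hunk keeps the combination check but now tests against the local single_dirty_slugified_terms set: a known multi-term combination page needs re-baking if it shares at least one term with the dirty set, i.e. the two sets are not disjoint. A small illustrative sketch (the names and data are made up for the example):

separator = '/'                        # illustrative separator
single_dirty = {'python'}              # dirty single terms from the pass above
known_combinations = {('python', 'baking'),
                      ('recipes', 'bread')}

dirty_combinations = [
    separator.join(terms)
    for terms in known_combinations
    if not single_dirty.isdisjoint(set(terms))
]
print(dirty_combinations)  # ['python/baking']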