changeset 789:b8e760b3413e

bake: Fix how slugified taxonomy terms are handled. This fixes a problem where multiple terms all slugifying to the same thing would lead to a fatal bake error.
author Ludovic Chabant <ludovic@chabant.com>
date Mon, 05 Sep 2016 21:03:00 -0700
parents 276030ea7972
children 4cbe057a8b6a
files piecrust/generation/taxonomy.py tests/bakes/test_simple_tags.yaml
diffstat 2 files changed, 205 insertions(+), 107 deletions(-) [+]
line wrap: on
line diff
--- a/piecrust/generation/taxonomy.py	Mon Sep 05 21:02:03 2016 -0700
+++ b/piecrust/generation/taxonomy.py	Mon Sep 05 21:03:00 2016 -0700
@@ -33,7 +33,6 @@
         self.is_multiple = bool(config.get('multiple', False))
         self.separator = config.get('separator', '/')
         self.page_ref = config.get('page')
-        self._source_page_refs = {}
 
     @property
     def setting_name(self):
@@ -43,6 +42,10 @@
 
 
 class TaxonomyPageGenerator(PageGenerator):
+    """ A page generator that handles taxonomies, _i.e._ lists of keywords
+        that pages are labelled with, and for which we need to generate
+        listing pages.
+    """
     GENERATOR_NAME = 'taxonomy'
 
     def __init__(self, app, name, config):
@@ -63,68 +66,95 @@
         if not sm:
             sm = app.config.get('site/slugify_mode', 'encode')
         self.slugify_mode = _parse_slugify_mode(sm)
+        self.slugifier = _Slugifier(self.taxonomy, self.slugify_mode)
+
+    def slugify(self, term):
+        return self.slugifier.slugify(term)
+
+    def slugifyMultiple(self, terms):
+        return self.slugifier.slugifyMultiple(terms)
 
     def prepareRenderContext(self, ctx):
-        self._setPaginationSource(ctx)
+        # Set the pagination source as the source we're generating for.
+        ctx.pagination_source = self.source
 
+        # Get the taxonomy terms from the route metadata... this can come from
+        # the browser's URL (while serving) or from the baking (see `bake`
+        # method below). In both cases, we expect to have the *slugified*
+        # version of the term, because we're going to set a filter that also
+        # slugifies the terms found on each page.
+        #
+        # This is because:
+        #  * while serving, we get everything from the request URL, so we only
+        #    have the slugified version.
+        #  * if 2 slightly different terms "collide" into the same slugified
+        #    term, we'll get a merge of the 2 on the listing page, which is
+        #    what the user expects.
+        #
         tax_terms, is_combination = self._getTaxonomyTerms(
                 ctx.page.route_metadata)
         self._setTaxonomyFilter(ctx, tax_terms, is_combination)
 
+        # Add some custom data for rendering.
         ctx.custom_data.update({
                 self.taxonomy.term_name: tax_terms,
                 'is_multiple_%s' % self.taxonomy.term_name: is_combination})
+        # Add some "plural" version of the term... so for instance, if this
+        # is the "tags" taxonomy, "tag" will have one term most of the time,
+        # except when it's a combination. Here, we add "tags" as something that
+        # is always a tuple, even when it's not a combination.
         if (self.taxonomy.is_multiple and
                 self.taxonomy.name != self.taxonomy.term_name):
             mult_val = tax_terms
             if not is_combination:
                 mult_val = (mult_val,)
             ctx.custom_data[self.taxonomy.name] = mult_val
-        logger.debug("Prepared render context with: %s" % ctx.custom_data)
 
     def _getTaxonomyTerms(self, route_metadata):
+        # Get the individual slugified terms from the route metadata.
         all_values = route_metadata.get(self.taxonomy.term_name)
         if all_values is None:
             raise Exception("'%s' values couldn't be found in route metadata" %
                             self.taxonomy.term_name)
 
+        # If it's a "multiple" taxonomy, we need to potentially split the
+        # route value into the individual terms (_e.g._ when listing all pages
+        # that have 2 given tags, we need to get each of those 2 tags).
         if self.taxonomy.is_multiple:
             sep = self.taxonomy.separator
             if sep in all_values:
                 return tuple(all_values.split(sep)), True
+        # Not a "multiple" taxonomy, so there's only the one value.
         return all_values, False
 
     def _setTaxonomyFilter(self, ctx, term_value, is_combination):
+        # Set up the filter that will check the pages' terms.
         flt = PaginationFilter(value_accessor=page_value_accessor)
         flt.addClause(HasTaxonomyTermsFilterClause(
                 self.taxonomy, self.slugify_mode, term_value, is_combination))
         ctx.pagination_filter = flt
 
-    def _setPaginationSource(self, ctx):
-        ctx.pagination_source = self.source
-
     def onRouteFunctionUsed(self, route, route_metadata):
-        # Get the values.
+        # Get the values, and slugify them appropriately.
         values = route_metadata[self.taxonomy.term_name]
         if self.taxonomy.is_multiple:
-            #TODO: here we assume the route has been properly configured.
-            values = tuple([str(v) for v in values])
+            # TODO: here we assume the route has been properly configured.
+            slugified_values = self.slugifyMultiple((str(v) for v in values))
+            route_val = self.taxonomy.separator.join(slugified_values)
         else:
-            values = (str(values),)
+            slugified_values = self.slugify(str(values))
+            route_val = slugified_values
 
         # We need to register this use of a taxonomy term.
         eis = self.app.env.exec_info_stack
         cpi = eis.current_page_info.render_ctx.current_pass_info
         if cpi:
             utt = cpi.getCustomInfo('used_taxonomy_terms', [], True)
-            utt.append(values)
+            utt.append(slugified_values)
 
-        # We need to slugify the terms before they get transformed
-        # into URL-bits.
-        s = _Slugifier(self.taxonomy, self.slugify_mode)
-        str_values = s.slugify(values)
-        route_metadata[self.taxonomy.term_name] = str_values
-        logger.debug("Changed route metadata to: %s" % route_metadata)
+        # Put the slugified values in the route metadata so they're used to
+        # generate the URL.
+        route_metadata[self.taxonomy.term_name] = route_val
 
     def bake(self, ctx):
         if not self.page_ref.exists:
@@ -134,81 +164,26 @@
             return
 
         logger.debug("Baking %s pages...", self.taxonomy.name)
+        analyzer = _TaxonomyTermsAnalyzer(self.source_name, self.taxonomy,
+                                          self.slugify_mode)
         with format_timed_scope(logger, 'gathered taxonomy terms',
                                 level=logging.DEBUG, colored=False):
-            all_terms, dirty_terms = self._buildDirtyTaxonomyTerms(ctx)
+            analyzer.analyze(ctx)
 
         start_time = time.perf_counter()
-        page_count = self._bakeTaxonomyTerms(ctx, all_terms, dirty_terms)
+        page_count = self._bakeTaxonomyTerms(ctx, analyzer)
         if page_count > 0:
             logger.info(format_timed(
                 start_time,
                 "baked %d %s pages for %s." % (
                     page_count, self.taxonomy.term_name, self.source_name)))
 
-    def _buildDirtyTaxonomyTerms(self, ctx):
-        # Build the list of terms for our taxonomy, and figure out which ones
-        # are 'dirty' for the current bake.
-        logger.debug("Gathering dirty taxonomy terms")
-        all_terms = set()
-        single_dirty_terms = set()
-
-        # Re-bake all taxonomy terms that include new or changed pages.
-        for prev_entry, cur_entry in ctx.getBakedPageRecords():
-            if cur_entry.source_name != self.source_name:
-                continue
-
-            entries = [cur_entry]
-            if prev_entry:
-                entries.append(prev_entry)
-
-            terms = []
-            for e in entries:
-                entry_terms = e.config.get(self.taxonomy.setting_name)
-                if entry_terms:
-                    if not self.taxonomy.is_multiple:
-                        terms.append(entry_terms)
-                    else:
-                        terms += entry_terms
-            single_dirty_terms.update(terms)
-
-        # Remember all terms used.
-        for _, cur_entry in ctx.getAllPageRecords():
-            if cur_entry and not cur_entry.was_overriden:
-                cur_terms = cur_entry.config.get(self.taxonomy.setting_name)
-                if cur_terms:
-                    if not self.taxonomy.is_multiple:
-                        all_terms.add(cur_terms)
-                    else:
-                        all_terms |= set(cur_terms)
-
-        # Re-bake the combination pages for terms that are 'dirty'.
-        # We make all terms into tuple, even those that are not actual
-        # combinations, so that we have less things to test further down the
-        # line.
-        dirty_terms = [(t,) for t in single_dirty_terms]
-        # Add the combinations to that list.
-        if self.taxonomy.is_multiple:
-            known_combinations = set()
-            logger.debug("Gathering dirty term combinations")
-            for _, cur_entry in ctx.getAllPageRecords():
-                if cur_entry:
-                    used_terms = _get_all_entry_taxonomy_terms(cur_entry)
-                    for terms in used_terms:
-                        if len(terms) > 1:
-                            known_combinations.add(terms)
-
-            for terms in known_combinations:
-                if not single_dirty_terms.isdisjoint(set(terms)):
-                    dirty_terms.append(terms)
-
-        return all_terms, dirty_terms
-
-    def _bakeTaxonomyTerms(self, ctx, all_terms, dirty_terms):
+    def _bakeTaxonomyTerms(self, ctx, analyzer):
         # Start baking those terms.
         logger.debug(
-                "Baking '%s' for source '%s': %s" %
-                (self.taxonomy.name, self.source_name, dirty_terms))
+                "Baking '%s' for source '%s': %d terms" %
+                (self.taxonomy.name, self.source_name,
+                 len(analyzer.dirty_slugified_terms)))
 
         route = self.app.getGeneratorRoute(self.name)
         if route is None:
@@ -219,17 +194,14 @@
         fac = self.page_ref.getFactory()
 
         job_count = 0
-        s = _Slugifier(self.taxonomy, self.slugify_mode)
-        for term in dirty_terms:
-            if not self.taxonomy.is_multiple:
-                term = term[0]
-            slugified_term = s.slugify(term)
-            extra_route_metadata = {self.taxonomy.term_name: slugified_term}
+        for slugified_term in analyzer.dirty_slugified_terms:
+            extra_route_metadata = {
+                self.taxonomy.term_name: slugified_term}
 
-            # Use the slugified term as the record extra key.
+            # Use the slugified term as the record's extra key seed.
             logger.debug(
-                    "Queuing: %s [%s=%s]" %
-                    (fac.ref_spec, self.taxonomy.name, slugified_term))
+                "Queuing: %s [%s=%s]" %
+                (fac.ref_spec, self.taxonomy.name, slugified_term))
             ctx.queueBakeJob(fac, route, extra_route_metadata, slugified_term)
             job_count += 1
         ctx.runJobQueue()
@@ -247,7 +219,7 @@
                 except InvalidRecordExtraKey:
                     continue
 
-                if t in all_terms:
+                if analyzer.isKnownSlugifiedTerm(t):
                     logger.debug("Creating unbaked entry for %s term: %s" %
                                  (self.name, t))
                     ctx.collapseRecord(prev_entry)
@@ -258,17 +230,6 @@
         return job_count
 
 
-def _get_all_entry_taxonomy_terms(entry):
-    res = set()
-    for o in entry.subs:
-        for pinfo in o.render_info:
-            if pinfo:
-                terms = pinfo.getCustomInfo('used_taxonomy_terms')
-                if terms:
-                    res |= set(terms)
-    return res
-
-
 class HasTaxonomyTermsFilterClause(SettingFilterClause):
     def __init__(self, taxonomy, slugify_mode, value, is_combination):
         super(HasTaxonomyTermsFilterClause, self).__init__(
@@ -302,18 +263,130 @@
             return page_value == self.value
 
 
+class _TaxonomyTermsAnalyzer(object):
+    def __init__(self, source_name, taxonomy, slugify_mode):
+        self.source_name = source_name
+        self.taxonomy = taxonomy
+        self.slugifier = _Slugifier(taxonomy, slugify_mode)
+        self._all_terms = {}
+        self._single_dirty_slugified_terms = set()
+        self._all_dirty_slugified_terms = None
+
+    @property
+    def dirty_slugified_terms(self):
+        """ Returns the slugified terms that have been 'dirtied' during
+            this bake.
+        """
+        return self._all_dirty_slugified_terms
+
+    def isKnownSlugifiedTerm(self, term):
+        """ Returns whether the given slugified term has been seen during
+            this bake.
+        """
+        return term in self._all_terms
+
+    def analyze(self, ctx):
+        # Build the list of terms for our taxonomy, and figure out which ones
+        # are 'dirty' for the current bake.
+        #
+        # Remember all terms used.
+        for _, cur_entry in ctx.getAllPageRecords():
+            if cur_entry and not cur_entry.was_overriden:
+                cur_terms = cur_entry.config.get(self.taxonomy.setting_name)
+                if cur_terms:
+                    if not self.taxonomy.is_multiple:
+                        self._addTerm(cur_entry.path, cur_terms)
+                    else:
+                        self._addTerms(cur_entry.path, cur_terms)
+
+        # Re-bake all taxonomy terms that include new or changed pages, by
+        # marking them as 'dirty'.
+        for prev_entry, cur_entry in ctx.getBakedPageRecords():
+            if cur_entry.source_name != self.source_name:
+                continue
+
+            entries = [cur_entry]
+            if prev_entry:
+                entries.append(prev_entry)
+
+            for e in entries:
+                entry_terms = e.config.get(self.taxonomy.setting_name)
+                if entry_terms:
+                    if not self.taxonomy.is_multiple:
+                        self._single_dirty_slugified_terms.add(
+                            self.slugifier.slugify(entry_terms))
+                    else:
+                        self._single_dirty_slugified_terms.update(
+                            (self.slugifier.slugify(t)
+                             for t in entry_terms))
+
+        self._all_dirty_slugified_terms = list(
+            self._single_dirty_slugified_terms)
+        logger.debug("Gathered %d dirty taxonomy terms",
+                     len(self._all_dirty_slugified_terms))
+
+        # Re-bake the combination pages for terms that are 'dirty'.
+        # We make all terms into tuple, even those that are not actual
+        # combinations, so that we have less things to test further down the
+        # line.
+        #
+        # Add the combinations to that list. We get those combinations from
+        # wherever combinations were used, so they're coming from the
+        # `onRouteFunctionUsed` method.
+        if self.taxonomy.is_multiple:
+            known_combinations = set()
+            for _, cur_entry in ctx.getAllPageRecords():
+                if cur_entry:
+                    used_terms = _get_all_entry_taxonomy_terms(cur_entry)
+                    for terms in used_terms:
+                        if len(terms) > 1:
+                            known_combinations.add(terms)
+
+            dcc = 0
+            for terms in known_combinations:
+                if not self._single_dirty_slugified_terms.isdisjoint(
+                        set(terms)):
+                    self._all_dirty_slugified_terms.append(
+                        self.taxonomy.separator.join(terms))
+                    dcc += 1
+            logger.debug("Gathered %d term combinations, with %d dirty." %
+                         (len(known_combinations), dcc))
+
+    def _addTerms(self, entry_path, terms):
+        for t in terms:
+            self._addTerm(entry_path, t)
+
+    def _addTerm(self, entry_path, term):
+        st = self.slugifier.slugify(term)
+        orig_terms = self._all_terms.setdefault(st, [])
+        if orig_terms and orig_terms[0] != term:
+            logger.warning(
+                "Term '%s' in '%s' is slugified to '%s' which conflicts with "
+                "previously existing '%s'. The two will be merged." %
+                (term, entry_path, st, orig_terms[0]))
+        orig_terms.append(term)
+
+
+def _get_all_entry_taxonomy_terms(entry):
+    res = set()
+    for o in entry.subs:
+        for pinfo in o.render_info:
+            if pinfo:
+                terms = pinfo.getCustomInfo('used_taxonomy_terms')
+                if terms:
+                    res |= set(terms)
+    return res
+
+
 class _Slugifier(object):
     def __init__(self, taxonomy, mode):
         self.taxonomy = taxonomy
         self.mode = mode
 
+    def slugifyMultiple(self, terms):
+        return tuple(map(self.slugify, terms))
+
     def slugify(self, term):
-        if isinstance(term, tuple):
-            return self.taxonomy.separator.join(
-                    map(self._slugifyOne, term))
-        return self._slugifyOne(term)
-
-    def _slugifyOne(self, term):
         if self.mode & SLUGIFY_TRANSLITERATE:
             term = unidecode.unidecode(term)
         if self.mode & SLUGIFY_LOWERCASE:
--- a/tests/bakes/test_simple_tags.yaml	Mon Sep 05 21:02:03 2016 -0700
+++ b/tests/bakes/test_simple_tags.yaml	Mon Sep 05 21:03:00 2016 -0700
@@ -89,4 +89,29 @@
         Pages in foo, bar
         Post 02
         Post 01
+---
+config:
+    site:
+        slugify_mode: space_to_dash
+in:
+    posts/2016-09-01_post01.md: |
+        ---
+        title: Post 01
+        tags: [foo bar]
+        ---
+    posts/2016-09-02_post2.md: |
+        ---
+        title: Post 02
+        tags: ['foo-bar']
+        ---
+    pages/_tag.md: |
+        Pages in {{pctagurl(tag)}}
+        {% for p in pagination.posts -%}
+        {{p.title}}
+        {% endfor %}
+outfiles:
+    tag/foo-bar.html: |
+        Pages in /tag/foo-bar.html
+        Post 02
+        Post 01