piecrust/generation/taxonomy.py @ 789:b8e760b3413e

bake: Fix how slugified taxonomy terms are handled. This fixes a problem where multiple terms all slugifying to the same thing would lead to a fatal bake error.
author Ludovic Chabant <ludovic@chabant.com>
date Mon, 05 Sep 2016 21:03:00 -0700
parents 661f7ba15762
children 58ebf50235a5
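
For context, the collision described in the commit message comes from slugification mapping several distinct raw terms to the same slug. The snippet below is an illustrative sketch only (it does not use PieCrust's `_Slugifier` API, and `toy_slugify` is a hypothetical helper); it shows two tags that differ in the source pages but collapse to one slug once transliteration and lowercasing are applied.

# Illustrative sketch only -- not PieCrust code. It mimics the effect of
# slugifying with transliteration and lowercasing (cf. SLUGIFY_TRANSLITERATE
# and SLUGIFY_LOWERCASE in the diff below) to show how terms can collide.
import unidecode


def toy_slugify(term):
    # Hypothetical helper: transliterate accents, then lowercase.
    return unidecode.unidecode(term).lower()


tags = ['Électronique', 'electronique']
slugs = {toy_slugify(t) for t in tags}
assert slugs == {'electronique'}  # two raw terms, one slugified term

In the new code, `_TaxonomyTermsAnalyzer._addTerm` detects this case and merges the colliding terms into one listing, logging a warning instead of letting the bake fail.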
--- piecrust/generation/taxonomy.py (788:276030ea7972)
+++ piecrust/generation/taxonomy.py (789:b8e760b3413e)
@@ -31,20 +31,23 @@
         self.config = config
         self.term_name = config.get('term', name)
         self.is_multiple = bool(config.get('multiple', False))
         self.separator = config.get('separator', '/')
         self.page_ref = config.get('page')
-        self._source_page_refs = {}
 
     @property
     def setting_name(self):
         if self.is_multiple:
             return self.name
         return self.term_name
 
 
 class TaxonomyPageGenerator(PageGenerator):
+    """ A page generator that handles taxonomies, _i.e._ lists of keywords
+    that pages are labelled with, and for which we need to generate
+    listing pages.
+    """
     GENERATOR_NAME = 'taxonomy'
 
     def __init__(self, app, name, config):
         super(TaxonomyPageGenerator, self).__init__(app, name, config)
 
@@ -61,177 +64,146 @@
 
         sm = config.get('slugify_mode')
         if not sm:
             sm = app.config.get('site/slugify_mode', 'encode')
         self.slugify_mode = _parse_slugify_mode(sm)
+        self.slugifier = _Slugifier(self.taxonomy, self.slugify_mode)
+
+    def slugify(self, term):
+        return self.slugifier.slugify(term)
+
+    def slugifyMultiple(self, terms):
+        return self.slugifier.slugifyMultiple(terms)
 
     def prepareRenderContext(self, ctx):
-        self._setPaginationSource(ctx)
-
+        # Set the pagination source as the source we're generating for.
+        ctx.pagination_source = self.source
+
+        # Get the taxonomy terms from the route metadata... this can come from
+        # the browser's URL (while serving) or from the baking (see `bake`
+        # method below). In both cases, we expect to have the *slugified*
+        # version of the term, because we're going to set a filter that also
+        # slugifies the terms found on each page.
+        #
+        # This is because:
+        # * while serving, we get everything from the request URL, so we only
+        #   have the slugified version.
+        # * if 2 slightly different terms "collide" into the same slugified
+        #   term, we'll get a merge of the 2 on the listing page, which is
+        #   what the user expects.
+        #
         tax_terms, is_combination = self._getTaxonomyTerms(
             ctx.page.route_metadata)
         self._setTaxonomyFilter(ctx, tax_terms, is_combination)
 
+        # Add some custom data for rendering.
         ctx.custom_data.update({
             self.taxonomy.term_name: tax_terms,
             'is_multiple_%s' % self.taxonomy.term_name: is_combination})
+        # Add some "plural" version of the term... so for instance, if this
+        # is the "tags" taxonomy, "tag" will have one term most of the time,
+        # except when it's a combination. Here, we add "tags" as something that
+        # is always a tuple, even when it's not a combination.
         if (self.taxonomy.is_multiple and
                 self.taxonomy.name != self.taxonomy.term_name):
             mult_val = tax_terms
             if not is_combination:
                 mult_val = (mult_val,)
             ctx.custom_data[self.taxonomy.name] = mult_val
-        logger.debug("Prepared render context with: %s" % ctx.custom_data)
 
     def _getTaxonomyTerms(self, route_metadata):
+        # Get the individual slugified terms from the route metadata.
         all_values = route_metadata.get(self.taxonomy.term_name)
         if all_values is None:
            raise Exception("'%s' values couldn't be found in route metadata" %
                            self.taxonomy.term_name)
 
+        # If it's a "multiple" taxonomy, we need to potentially split the
+        # route value into the individual terms (_e.g._ when listing all pages
+        # that have 2 given tags, we need to get each of those 2 tags).
        if self.taxonomy.is_multiple:
             sep = self.taxonomy.separator
             if sep in all_values:
                 return tuple(all_values.split(sep)), True
+        # Not a "multiple" taxonomy, so there's only the one value.
         return all_values, False
 
     def _setTaxonomyFilter(self, ctx, term_value, is_combination):
+        # Set up the filter that will check the pages' terms.
         flt = PaginationFilter(value_accessor=page_value_accessor)
         flt.addClause(HasTaxonomyTermsFilterClause(
             self.taxonomy, self.slugify_mode, term_value, is_combination))
         ctx.pagination_filter = flt
 
-    def _setPaginationSource(self, ctx):
-        ctx.pagination_source = self.source
-
     def onRouteFunctionUsed(self, route, route_metadata):
-        # Get the values.
+        # Get the values, and slugify them appropriately.
         values = route_metadata[self.taxonomy.term_name]
         if self.taxonomy.is_multiple:
-            #TODO: here we assume the route has been properly configured.
-            values = tuple([str(v) for v in values])
+            # TODO: here we assume the route has been properly configured.
+            slugified_values = self.slugifyMultiple((str(v) for v in values))
+            route_val = self.taxonomy.separator.join(slugified_values)
         else:
-            values = (str(values),)
+            slugified_values = self.slugify(str(values))
+            route_val = slugified_values
 
         # We need to register this use of a taxonomy term.
         eis = self.app.env.exec_info_stack
         cpi = eis.current_page_info.render_ctx.current_pass_info
         if cpi:
             utt = cpi.getCustomInfo('used_taxonomy_terms', [], True)
-            utt.append(values)
+            utt.append(slugified_values)
 
-        # We need to slugify the terms before they get transformed
-        # into URL-bits.
-        s = _Slugifier(self.taxonomy, self.slugify_mode)
-        str_values = s.slugify(values)
-        route_metadata[self.taxonomy.term_name] = str_values
-        logger.debug("Changed route metadata to: %s" % route_metadata)
+        # Put the slugified values in the route metadata so they're used to
+        # generate the URL.
+        route_metadata[self.taxonomy.term_name] = route_val
 
     def bake(self, ctx):
         if not self.page_ref.exists:
             logger.debug(
                 "No page found at '%s', skipping taxonomy '%s'." %
                 (self.page_ref, self.taxonomy.name))
             return
 
         logger.debug("Baking %s pages...", self.taxonomy.name)
+        analyzer = _TaxonomyTermsAnalyzer(self.source_name, self.taxonomy,
+                                          self.slugify_mode)
         with format_timed_scope(logger, 'gathered taxonomy terms',
                                 level=logging.DEBUG, colored=False):
-            all_terms, dirty_terms = self._buildDirtyTaxonomyTerms(ctx)
+            analyzer.analyze(ctx)
 
         start_time = time.perf_counter()
-        page_count = self._bakeTaxonomyTerms(ctx, all_terms, dirty_terms)
+        page_count = self._bakeTaxonomyTerms(ctx, analyzer)
         if page_count > 0:
             logger.info(format_timed(
                 start_time,
                 "baked %d %s pages for %s." % (
                     page_count, self.taxonomy.term_name, self.source_name)))
 
-    def _buildDirtyTaxonomyTerms(self, ctx):
-        # Build the list of terms for our taxonomy, and figure out which ones
-        # are 'dirty' for the current bake.
-        logger.debug("Gathering dirty taxonomy terms")
-        all_terms = set()
-        single_dirty_terms = set()
-
-        # Re-bake all taxonomy terms that include new or changed pages.
-        for prev_entry, cur_entry in ctx.getBakedPageRecords():
-            if cur_entry.source_name != self.source_name:
-                continue
-
-            entries = [cur_entry]
-            if prev_entry:
-                entries.append(prev_entry)
-
-            terms = []
-            for e in entries:
-                entry_terms = e.config.get(self.taxonomy.setting_name)
-                if entry_terms:
-                    if not self.taxonomy.is_multiple:
-                        terms.append(entry_terms)
-                    else:
-                        terms += entry_terms
-            single_dirty_terms.update(terms)
-
-        # Remember all terms used.
-        for _, cur_entry in ctx.getAllPageRecords():
-            if cur_entry and not cur_entry.was_overriden:
-                cur_terms = cur_entry.config.get(self.taxonomy.setting_name)
-                if cur_terms:
-                    if not self.taxonomy.is_multiple:
-                        all_terms.add(cur_terms)
-                    else:
-                        all_terms |= set(cur_terms)
-
-        # Re-bake the combination pages for terms that are 'dirty'.
-        # We make all terms into tuple, even those that are not actual
-        # combinations, so that we have less things to test further down the
-        # line.
-        dirty_terms = [(t,) for t in single_dirty_terms]
-        # Add the combinations to that list.
-        if self.taxonomy.is_multiple:
-            known_combinations = set()
-            logger.debug("Gathering dirty term combinations")
-            for _, cur_entry in ctx.getAllPageRecords():
-                if cur_entry:
-                    used_terms = _get_all_entry_taxonomy_terms(cur_entry)
-                    for terms in used_terms:
-                        if len(terms) > 1:
-                            known_combinations.add(terms)
-
-            for terms in known_combinations:
-                if not single_dirty_terms.isdisjoint(set(terms)):
-                    dirty_terms.append(terms)
-
-        return all_terms, dirty_terms
-
-    def _bakeTaxonomyTerms(self, ctx, all_terms, dirty_terms):
+    def _bakeTaxonomyTerms(self, ctx, analyzer):
         # Start baking those terms.
         logger.debug(
-            "Baking '%s' for source '%s': %s" %
-            (self.taxonomy.name, self.source_name, dirty_terms))
+            "Baking '%s' for source '%s': %d terms" %
+            (self.taxonomy.name, self.source_name,
+             len(analyzer.dirty_slugified_terms)))
 
         route = self.app.getGeneratorRoute(self.name)
         if route is None:
            raise Exception("No routes have been defined for generator: %s" %
                            self.name)
 
         logger.debug("Using taxonomy page: %s" % self.page_ref)
         fac = self.page_ref.getFactory()
 
         job_count = 0
-        s = _Slugifier(self.taxonomy, self.slugify_mode)
-        for term in dirty_terms:
-            if not self.taxonomy.is_multiple:
-                term = term[0]
-            slugified_term = s.slugify(term)
-            extra_route_metadata = {self.taxonomy.term_name: slugified_term}
-
-            # Use the slugified term as the record extra key.
+        for slugified_term in analyzer.dirty_slugified_terms:
+            extra_route_metadata = {
+                self.taxonomy.term_name: slugified_term}
+
+            # Use the slugified term as the record's extra key seed.
             logger.debug(
                 "Queuing: %s [%s=%s]" %
                 (fac.ref_spec, self.taxonomy.name, slugified_term))
             ctx.queueBakeJob(fac, route, extra_route_metadata, slugified_term)
             job_count += 1
         ctx.runJobQueue()
 
        # Now we create bake entries for all the terms that were *not* dirty.
@@ -245,30 +217,19 @@
             try:
                 t = ctx.getSeedFromRecordExtraKey(prev_entry.extra_key)
             except InvalidRecordExtraKey:
                 continue
 
-            if t in all_terms:
+            if analyzer.isKnownSlugifiedTerm(t):
                 logger.debug("Creating unbaked entry for %s term: %s" %
                              (self.name, t))
                 ctx.collapseRecord(prev_entry)
             else:
                 logger.debug("Term %s in %s isn't used anymore." %
                              (self.name, t))
 
         return job_count
-
-
-def _get_all_entry_taxonomy_terms(entry):
-    res = set()
-    for o in entry.subs:
-        for pinfo in o.render_info:
-            if pinfo:
-                terms = pinfo.getCustomInfo('used_taxonomy_terms')
-                if terms:
-                    res |= set(terms)
-    return res
 
 
 class HasTaxonomyTermsFilterClause(SettingFilterClause):
     def __init__(self, taxonomy, slugify_mode, value, is_combination):
         super(HasTaxonomyTermsFilterClause, self).__init__(
@@ -300 +261 @@
             return False
         page_value = self._slugifier.slugify(page_value)
         return page_value == self.value
 
 
+class _TaxonomyTermsAnalyzer(object):
+    def __init__(self, source_name, taxonomy, slugify_mode):
+        self.source_name = source_name
+        self.taxonomy = taxonomy
+        self.slugifier = _Slugifier(taxonomy, slugify_mode)
+        self._all_terms = {}
+        self._single_dirty_slugified_terms = set()
+        self._all_dirty_slugified_terms = None
+
+    @property
+    def dirty_slugified_terms(self):
+        """ Returns the slugified terms that have been 'dirtied' during
+        this bake.
+        """
+        return self._all_dirty_slugified_terms
+
+    def isKnownSlugifiedTerm(self, term):
+        """ Returns whether the given slugified term has been seen during
+        this bake.
+        """
+        return term in self._all_terms
+
+    def analyze(self, ctx):
+        # Build the list of terms for our taxonomy, and figure out which ones
+        # are 'dirty' for the current bake.
+        #
+        # Remember all terms used.
+        for _, cur_entry in ctx.getAllPageRecords():
+            if cur_entry and not cur_entry.was_overriden:
+                cur_terms = cur_entry.config.get(self.taxonomy.setting_name)
+                if cur_terms:
+                    if not self.taxonomy.is_multiple:
+                        self._addTerm(cur_entry.path, cur_terms)
+                    else:
+                        self._addTerms(cur_entry.path, cur_terms)
+
+        # Re-bake all taxonomy terms that include new or changed pages, by
+        # marking them as 'dirty'.
+        for prev_entry, cur_entry in ctx.getBakedPageRecords():
+            if cur_entry.source_name != self.source_name:
+                continue
+
+            entries = [cur_entry]
+            if prev_entry:
+                entries.append(prev_entry)
+
+            for e in entries:
+                entry_terms = e.config.get(self.taxonomy.setting_name)
+                if entry_terms:
+                    if not self.taxonomy.is_multiple:
+                        self._single_dirty_slugified_terms.add(
+                            self.slugifier.slugify(entry_terms))
+                    else:
+                        self._single_dirty_slugified_terms.update(
+                            (self.slugifier.slugify(t)
+                             for t in entry_terms))
+
+        self._all_dirty_slugified_terms = list(
+            self._single_dirty_slugified_terms)
+        logger.debug("Gathered %d dirty taxonomy terms",
+                     len(self._all_dirty_slugified_terms))
+
+        # Re-bake the combination pages for terms that are 'dirty'.
+        # We make all terms into tuple, even those that are not actual
+        # combinations, so that we have less things to test further down the
+        # line.
+        #
+        # Add the combinations to that list. We get those combinations from
+        # wherever combinations were used, so they're coming from the
+        # `onRouteFunctionUsed` method.
+        if self.taxonomy.is_multiple:
+            known_combinations = set()
+            for _, cur_entry in ctx.getAllPageRecords():
+                if cur_entry:
+                    used_terms = _get_all_entry_taxonomy_terms(cur_entry)
+                    for terms in used_terms:
+                        if len(terms) > 1:
+                            known_combinations.add(terms)
+
+            dcc = 0
+            for terms in known_combinations:
+                if not self._single_dirty_slugified_terms.isdisjoint(
+                        set(terms)):
+                    self._all_dirty_slugified_terms.append(
+                        self.taxonomy.separator.join(terms))
+                    dcc += 1
+            logger.debug("Gathered %d term combinations, with %d dirty." %
+                         (len(known_combinations), dcc))
+
+    def _addTerms(self, entry_path, terms):
+        for t in terms:
+            self._addTerm(entry_path, t)
+
+    def _addTerm(self, entry_path, term):
+        st = self.slugifier.slugify(term)
+        orig_terms = self._all_terms.setdefault(st, [])
+        if orig_terms and orig_terms[0] != term:
+            logger.warning(
+                "Term '%s' in '%s' is slugified to '%s' which conflicts with "
+                "previously existing '%s'. The two will be merged." %
+                (term, entry_path, st, orig_terms[0]))
+        orig_terms.append(term)
+
+
+def _get_all_entry_taxonomy_terms(entry):
+    res = set()
+    for o in entry.subs:
+        for pinfo in o.render_info:
+            if pinfo:
+                terms = pinfo.getCustomInfo('used_taxonomy_terms')
+                if terms:
+                    res |= set(terms)
+    return res
+
+
 class _Slugifier(object):
     def __init__(self, taxonomy, mode):
         self.taxonomy = taxonomy
         self.mode = mode
 
+    def slugifyMultiple(self, terms):
+        return tuple(map(self.slugify, terms))
+
     def slugify(self, term):
-        if isinstance(term, tuple):
-            return self.taxonomy.separator.join(
-                map(self._slugifyOne, term))
-        return self._slugifyOne(term)
-
-    def _slugifyOne(self, term):
         if self.mode & SLUGIFY_TRANSLITERATE:
             term = unidecode.unidecode(term)
         if self.mode & SLUGIFY_LOWERCASE:
             term = term.lower()
         if self.mode & SLUGIFY_DOT_TO_DASH: