comparison piecrust/sources/taxonomy.py @ 854:08e02c2a2a1a

core: Keep refactoring, this time to prepare for generator sources. - Make a few APIs simpler. - Content pipelines create their own jobs, so that generator sources can keep aborting in `getContents`, but rely on their pipeline to generate pages for baking.
author Ludovic Chabant <ludovic@chabant.com>
date Sun, 04 Jun 2017 23:34:28 -0700
parents 4850f8c21b6e
children 448710d84121
comparison
equal deleted inserted replaced
853:f070a4fc033c 854:08e02c2a2a1a
3 import logging 3 import logging
4 import unidecode 4 import unidecode
5 from piecrust.chefutil import format_timed, format_timed_scope 5 from piecrust.chefutil import format_timed, format_timed_scope
6 from piecrust.configuration import ConfigurationError 6 from piecrust.configuration import ConfigurationError
7 from piecrust.data.filters import ( 7 from piecrust.data.filters import (
8 PaginationFilter, SettingFilterClause, 8 PaginationFilter, SettingFilterClause)
9 page_value_accessor) 9 from piecrust.pipelines.base import ContentPipeline
10 from piecrust.routing import RouteParameter 10 from piecrust.routing import RouteParameter
11 from piecrust.sources.base import ContentSource, GeneratedContentException 11 from piecrust.sources.base import ContentSource, GeneratedContentException
12 12
13 13
14 logger = logging.getLogger(__name__) 14 logger = logging.getLogger(__name__)
25 re_dot_to_dash = re.compile(r'\.+') 25 re_dot_to_dash = re.compile(r'\.+')
26 re_space_to_dash = re.compile(r'\s+') 26 re_space_to_dash = re.compile(r'\s+')
27 27
28 28
29 class Taxonomy(object): 29 class Taxonomy(object):
30 """ Describes a taxonomy.
31 """
30 def __init__(self, name, config): 32 def __init__(self, name, config):
31 self.name = name 33 self.name = name
32 self.config = config 34 self.config = config
33 self.term_name = config.get('term', name) 35 self.term_name = config.get('term', name)
34 self.is_multiple = bool(config.get('multiple', False)) 36 self.is_multiple = bool(config.get('multiple', False))
41 return self.name 43 return self.name
42 return self.term_name 44 return self.term_name
43 45
44 46
45 class TaxonomySource(ContentSource): 47 class TaxonomySource(ContentSource):
46 """ A page generator that handles taxonomies, _i.e._ lists of keywords 48 """ A content source that generates taxonomy listing pages.
47 that pages are labelled with, and for which we need to generate
48 listing pages.
49 """ 49 """
50 SOURCE_NAME = 'taxonomy' 50 SOURCE_NAME = 'taxonomy'
51 DEFAULT_PIPELINE_NAME = 'taxonomy'
51 52
52 def __init__(self, app, name, config): 53 def __init__(self, app, name, config):
53 super().__init__(app, name, config) 54 super().__init__(app, name, config)
54 55
55 tax_name = config.get('taxonomy') 56 tax_name = config.get('taxonomy')
56 if tax_name is None: 57 if tax_name is None:
57 raise ConfigurationError( 58 raise ConfigurationError(
58 "Generator '%s' requires a taxonomy name." % name) 59 "Taxonomy source '%s' requires a taxonomy name." % name)
59 tax_config = app.config.get('site/taxonomies/' + tax_name) 60 self.taxonomy = _get_taxonomy(app, tax_name)
60 if tax_config is None:
61 raise ConfigurationError(
62 "Error initializing generator '%s', no such taxonomy: %s",
63 (name, tax_name))
64 self.taxonomy = Taxonomy(tax_name, tax_config)
65 61
66 sm = config.get('slugify_mode') 62 sm = config.get('slugify_mode')
67 if not sm: 63 self.slugifier = _get_slugifier(app, self.taxonomy, sm)
68 sm = app.config.get('site/slugify_mode', 'encode')
69 self.slugify_mode = _parse_slugify_mode(sm)
70 self.slugifier = _Slugifier(self.taxonomy, self.slugify_mode)
71 64
72 def getContents(self, group): 65 def getContents(self, group):
66 # Our content is procedurally generated from other content sources,
67 # so we really don't support listing anything here -- it would be
68 # quite costly.
69 #
70 # Instead, our pipeline (the `TaxonomyPipeline`) will generate
71 # content items for us when it is asked to produce bake jobs.
73 raise GeneratedContentException() 72 raise GeneratedContentException()
74 73
75 def getSupportedRouteParameters(self): 74 def getSupportedRouteParameters(self):
76 name = self.taxonomy.term_name 75 name = self.taxonomy.term_name
77 param_type = (RouteParameter.TYPE_PATH if self.taxonomy.is_multiple 76 param_type = (RouteParameter.TYPE_PATH if self.taxonomy.is_multiple
100 # have the slugified version. 99 # have the slugified version.
101 # * if 2 slightly different terms "collide" into the same slugified 100 # * if 2 slightly different terms "collide" into the same slugified
102 # term, we'll get a merge of the 2 on the listing page, which is 101 # term, we'll get a merge of the 2 on the listing page, which is
103 # what the user expects. 102 # what the user expects.
104 # 103 #
105 tax_terms, is_combination = self._getTaxonomyTerms( 104 route_params = ctx.page.source_metadata['route_params']
106 ctx.page.route_metadata) 105 tax_terms, is_combination = self._getTaxonomyTerms(route_params)
107 self._setTaxonomyFilter(ctx, tax_terms, is_combination) 106 self._setTaxonomyFilter(ctx, tax_terms, is_combination)
108 107
109 # Add some custom data for rendering. 108 # Add some custom data for rendering.
110 ctx.custom_data.update({ 109 ctx.custom_data.update({
111 self.taxonomy.term_name: tax_terms, 110 self.taxonomy.term_name: tax_terms,
112 'is_multiple_%s' % self.taxonomy.term_name: is_combination}) 111 'is_multiple_%s' % self.taxonomy.term_name: is_combination})
113 # Add some "plural" version of the term... so for instance, if this 112 # Add some "plural" version of the term... so for instance, if this
114 # is the "tags" taxonomy, "tag" will have one term most of the time, 113 # is the "tags" taxonomy, "tag" will have one term most of the time,
115 # except when it's a combination. Here, we add "tags" as something that 114 # except when it's a combination. Here, we add "tags" as something that
116 # is always a tuple, even when it's not a combination. 115 # is always a tuple, even when it's not a combination.
117 if (self.taxonomy.is_multiple and 116 if (self.taxonomy.is_multiple and
119 mult_val = tax_terms 118 mult_val = tax_terms
120 if not is_combination: 119 if not is_combination:
121 mult_val = (mult_val,) 120 mult_val = (mult_val,)
122 ctx.custom_data[self.taxonomy.name] = mult_val 121 ctx.custom_data[self.taxonomy.name] = mult_val
123 122
124 def _getSource(self): 123 def _getTaxonomyTerms(self, route_params):
125 return self.app.getSource(self.config['source'])
126
127 def _getTaxonomyTerms(self, route_metadata):
128 # Get the individual slugified terms from the route metadata. 124 # Get the individual slugified terms from the route metadata.
129 all_values = route_metadata.get(self.taxonomy.term_name) 125 all_values = route_params.get(self.taxonomy.term_name)
130 if all_values is None: 126 if all_values is None:
131 raise Exception("'%s' values couldn't be found in route metadata" % 127 raise Exception("'%s' values couldn't be found in route metadata" %
132 self.taxonomy.term_name) 128 self.taxonomy.term_name)
133 129
134 # If it's a "multiple" taxonomy, we need to potentially split the 130 # If it's a "multiple" taxonomy, we need to potentially split the
141 # Not a "multiple" taxonomy, so there's only the one value. 137 # Not a "multiple" taxonomy, so there's only the one value.
142 return all_values, False 138 return all_values, False
143 139
144 def _setTaxonomyFilter(self, ctx, term_value, is_combination): 140 def _setTaxonomyFilter(self, ctx, term_value, is_combination):
145 # Set up the filter that will check the pages' terms. 141 # Set up the filter that will check the pages' terms.
146 flt = PaginationFilter(value_accessor=page_value_accessor) 142 flt = PaginationFilter()
147 flt.addClause(HasTaxonomyTermsFilterClause( 143 flt.addClause(HasTaxonomyTermsFilterClause(
148 self.taxonomy, self.slugify_mode, term_value, is_combination)) 144 self.taxonomy, self.slugify.mode, term_value, is_combination))
149 ctx.pagination_filter = flt 145 ctx.pagination_filter = flt
150 146
151 def onRouteFunctionUsed(self, route, route_metadata): 147 def onRouteFunctionUsed(self, route_params):
152 # Get the values, and slugify them appropriately. 148 # Get the values, and slugify them appropriately.
153 values = route_metadata[self.taxonomy.term_name] 149 values = route_params[self.taxonomy.term_name]
154 if self.taxonomy.is_multiple: 150 if self.taxonomy.is_multiple:
155 # TODO: here we assume the route has been properly configured. 151 # TODO: here we assume the route has been properly configured.
156 slugified_values = self.slugifyMultiple((str(v) for v in values)) 152 slugified_values = self.slugifyMultiple((str(v) for v in values))
157 route_val = self.taxonomy.separator.join(slugified_values) 153 route_val = self.taxonomy.separator.join(slugified_values)
158 else: 154 else:
159 slugified_values = self.slugify(str(values)) 155 slugified_values = self.slugify(str(values))
160 route_val = slugified_values 156 route_val = slugified_values
161 157
162 # We need to register this use of a taxonomy term. 158 # We need to register this use of a taxonomy term.
163 eis = self.app.env.exec_info_stack 159 rcs = self.app.env.render_ctx_stack
164 cpi = eis.current_page_info.render_ctx.current_pass_info 160 cpi = rcs.current_ctx.current_pass_info
165 if cpi: 161 if cpi:
166 utt = cpi.getCustomInfo('used_taxonomy_terms', [], True) 162 utt = cpi.getCustomInfo('used_taxonomy_terms', [], True)
167 utt.append(slugified_values) 163 utt.append(slugified_values)
168 164
169 # Put the slugified values in the route metadata so they're used to 165 # Put the slugified values in the route metadata so they're used to
170 # generate the URL. 166 # generate the URL.
171 route_metadata[self.taxonomy.term_name] = route_val 167 route_params[self.taxonomy.term_name] = route_val
172
173 def bake(self, ctx):
174 if not self.page_ref.exists:
175 logger.debug(
176 "No page found at '%s', skipping taxonomy '%s'." %
177 (self.page_ref, self.taxonomy.name))
178 return
179
180 logger.debug("Baking %s pages...", self.taxonomy.name)
181 analyzer = _TaxonomyTermsAnalyzer(self.source_name, self.taxonomy,
182 self.slugify_mode)
183 with format_timed_scope(logger, 'gathered taxonomy terms',
184 level=logging.DEBUG, colored=False):
185 analyzer.analyze(ctx)
186
187 start_time = time.perf_counter()
188 page_count = self._bakeTaxonomyTerms(ctx, analyzer)
189 if page_count > 0:
190 logger.info(format_timed(
191 start_time,
192 "baked %d %s pages for %s." % (
193 page_count, self.taxonomy.term_name, self.source_name)))
194
195 def _bakeTaxonomyTerms(self, ctx, analyzer):
196 # Start baking those terms.
197 logger.debug(
198 "Baking '%s' for source '%s': %d terms" %
199 (self.taxonomy.name, self.source_name,
200 len(analyzer.dirty_slugified_terms)))
201
202 route = self.app.getGeneratorRoute(self.name)
203 if route is None:
204 raise Exception("No routes have been defined for generator: %s" %
205 self.name)
206
207 logger.debug("Using taxonomy page: %s" % self.page_ref)
208 fac = self.page_ref.getFactory()
209
210 job_count = 0
211 for slugified_term in analyzer.dirty_slugified_terms:
212 extra_route_metadata = {
213 self.taxonomy.term_name: slugified_term}
214
215 # Use the slugified term as the record's extra key seed.
216 logger.debug(
217 "Queuing: %s [%s=%s]" %
218 (fac.ref_spec, self.taxonomy.name, slugified_term))
219 ctx.queueBakeJob(fac, route, extra_route_metadata, slugified_term)
220 job_count += 1
221 ctx.runJobQueue()
222
223 # Now we create bake entries for all the terms that were *not* dirty.
224 # This is because otherwise, on the next incremental bake, we wouldn't
225 # find any entry for those things, and figure that we need to delete
226 # their outputs.
227 for prev_entry, cur_entry in ctx.getAllPageRecords():
228 # Only consider taxonomy-related entries that don't have any
229 # current version (i.e. they weren't baked just now).
230 if prev_entry and not cur_entry:
231 try:
232 t = ctx.getSeedFromRecordExtraKey(prev_entry.extra_key)
233 except InvalidRecordExtraKey:
234 continue
235
236 if analyzer.isKnownSlugifiedTerm(t):
237 logger.debug("Creating unbaked entry for %s term: %s" %
238 (self.name, t))
239 ctx.collapseRecord(prev_entry)
240 else:
241 logger.debug("Term %s in %s isn't used anymore." %
242 (self.name, t))
243
244 return job_count
245 168
246 169
247 class HasTaxonomyTermsFilterClause(SettingFilterClause): 170 class HasTaxonomyTermsFilterClause(SettingFilterClause):
248 def __init__(self, taxonomy, slugify_mode, value, is_combination): 171 def __init__(self, taxonomy, slugify_mode, value, is_combination):
249 super(HasTaxonomyTermsFilterClause, self).__init__( 172 super().__init__(taxonomy.setting_name, value)
250 taxonomy.setting_name, value)
251 self._taxonomy = taxonomy 173 self._taxonomy = taxonomy
252 self._is_combination = is_combination 174 self._is_combination = is_combination
253 self._slugifier = _Slugifier(taxonomy, slugify_mode) 175 self._slugifier = _Slugifier(taxonomy, slugify_mode)
254 176
255 def pageMatches(self, fil, page): 177 def pageMatches(self, fil, page):
275 return False 197 return False
276 page_value = self._slugifier.slugify(page_value) 198 page_value = self._slugifier.slugify(page_value)
277 return page_value == self.value 199 return page_value == self.value
278 200
279 201
202 def _get_taxonomy(app, tax_name):
203 tax_config = app.config.get('site/taxonomies/' + tax_name)
204 if tax_config is None:
205 raise ConfigurationError("No such taxonomy: %s" % tax_name)
206 return Taxonomy(tax_name, tax_config)
207
208
209 def _get_slugifier(app, taxonomy, slugify_mode=None):
210 if slugify_mode is None:
211 slugify_mode = app.config.get('site/slugify_mode', 'encode')
212 sm = _parse_slugify_mode(slugify_mode)
213 return _Slugifier(taxonomy, sm)
214
215
216 class TaxonomyPipeline(ContentPipeline):
217 PIPELINE_NAME = 'taxonomy'
218 PASS_NUM = 1
219
220 def __init__(self, source, ctx):
221 if not isinstance(source, TaxonomySource):
222 raise Exception("The taxonomy pipeline only supports taxonomy "
223 "content sources.")
224
225 super().__init__(source, ctx)
226 self.taxonomy = source.taxonomy
227 self.slugifier = source.slugifier
228
229 def buildJobs(self):
230 logger.debug("Building taxonomy pages for source: %s" %
231 self.source.name)
232 analyzer = _TaxonomyTermsAnalyzer(self)
233 with format_timed_scope(logger, 'gathered taxonomy terms',
234 level=logging.DEBUG, colored=False):
235 analyzer.analyze(ctx)
236
237 def bake(self, ctx):
238 if not self.page_ref.exists:
239 logger.debug(
240 "No page found at '%s', skipping taxonomy '%s'." %
241 (self.page_ref, self.taxonomy.name))
242 return
243
244 logger.debug("Baking %s pages...", self.taxonomy.name)
245 analyzer = _TaxonomyTermsAnalyzer(self.source_name, self.taxonomy,
246 self.slugify_mode)
247 with format_timed_scope(logger, 'gathered taxonomy terms',
248 level=logging.DEBUG, colored=False):
249 analyzer.analyze(ctx)
250
251 start_time = time.perf_counter()
252 page_count = self._bakeTaxonomyTerms(ctx, analyzer)
253 if page_count > 0:
254 logger.info(format_timed(
255 start_time,
256 "baked %d %s pages for %s." % (
257 page_count, self.taxonomy.term_name, self.source_name)))
258
259 def _bakeTaxonomyTerms(self, ctx, analyzer):
260 # Start baking those terms.
261 logger.debug(
262 "Baking '%s' for source '%s': %d terms" %
263 (self.taxonomy.name, self.source_name,
264 len(analyzer.dirty_slugified_terms)))
265
266 route = self.app.getGeneratorRoute(self.name)
267 if route is None:
268 raise Exception("No routes have been defined for generator: %s" %
269 self.name)
270
271 logger.debug("Using taxonomy page: %s" % self.page_ref)
272 fac = self.page_ref.getFactory()
273
274 job_count = 0
275 for slugified_term in analyzer.dirty_slugified_terms:
276 extra_route_params = {
277 self.taxonomy.term_name: slugified_term}
278
279 # Use the slugified term as the record's extra key seed.
280 logger.debug(
281 "Queuing: %s [%s=%s]" %
282 (fac.ref_spec, self.taxonomy.name, slugified_term))
283 ctx.queueBakeJob(fac, route, extra_route_params, slugified_term)
284 job_count += 1
285 ctx.runJobQueue()
286
287 # Now we create bake entries for all the terms that were *not* dirty.
288 # This is because otherwise, on the next incremental bake, we wouldn't
289 # find any entry for those things, and figure that we need to delete
290 # their outputs.
291 for prev_entry, cur_entry in ctx.getAllPageRecords():
292 # Only consider taxonomy-related entries that don't have any
293 # current version (i.e. they weren't baked just now).
294 if prev_entry and not cur_entry:
295 try:
296 t = ctx.getSeedFromRecordExtraKey(prev_entry.extra_key)
297 except InvalidRecordExtraKey:
298 continue
299
300 if analyzer.isKnownSlugifiedTerm(t):
301 logger.debug("Creating unbaked entry for %s term: %s" %
302 (self.name, t))
303 ctx.collapseRecord(prev_entry)
304 else:
305 logger.debug("Term %s in %s isn't used anymore." %
306 (self.name, t))
307
308 return job_count
309
310
280 class _TaxonomyTermsAnalyzer(object): 311 class _TaxonomyTermsAnalyzer(object):
281 def __init__(self, source_name, taxonomy, slugify_mode): 312 def __init__(self, source):
282 self.source_name = source_name 313 self.source = source
283 self.taxonomy = taxonomy
284 self.slugifier = _Slugifier(taxonomy, slugify_mode)
285 self._all_terms = {} 314 self._all_terms = {}
286 self._single_dirty_slugified_terms = set() 315 self._single_dirty_slugified_terms = set()
287 self._all_dirty_slugified_terms = None 316 self._all_dirty_slugified_terms = None
288 317
289 @property 318 @property
413 return term 442 return term
414 443
415 444
416 def _parse_slugify_mode(value): 445 def _parse_slugify_mode(value):
417 mapping = { 446 mapping = {
418 'encode': SLUGIFY_ENCODE, 447 'encode': SLUGIFY_ENCODE,
419 'transliterate': SLUGIFY_TRANSLITERATE, 448 'transliterate': SLUGIFY_TRANSLITERATE,
420 'lowercase': SLUGIFY_LOWERCASE, 449 'lowercase': SLUGIFY_LOWERCASE,
421 'dot_to_dash': SLUGIFY_DOT_TO_DASH, 450 'dot_to_dash': SLUGIFY_DOT_TO_DASH,
422 'space_to_dash': SLUGIFY_SPACE_TO_DASH} 451 'space_to_dash': SLUGIFY_SPACE_TO_DASH}
423 mode = 0 452 mode = 0
424 for v in value.split(','): 453 for v in value.split(','):
425 f = mapping.get(v.strip()) 454 f = mapping.get(v.strip())
426 if f is None: 455 if f is None:
427 if v == 'iconv': 456 if v == 'iconv':