Mercurial > piecrust2
view piecrust/sources/taxonomy.py @ 1136:5f97b5b59dfe
bake: Optimize cache handling for the baking process.
- Get rid of the 2-level pipeline runs... handle a single set of passes.
- Go back to load/render segments/layout passes for pages.
- Add descriptions of what each job batch does.
- Improve the taxonomy pipeline so it doesn't re-bake terms that don't need
to be re-baked.
- Simplify some of the code.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Mon, 23 Apr 2018 21:47:49 -0700 |
parents | 1857dbd4580f |
children | 9f3e702a8a69 |
line wrap: on
line source
import re
import copy
import logging
import unidecode
from piecrust.configuration import ConfigurationError
from piecrust.data.filters import (
    PaginationFilter, SettingFilterClause)
from piecrust.page import Page
from piecrust.pipelines._pagebaker import PageBaker
from piecrust.pipelines._pagerecords import PagePipelineRecordEntry
from piecrust.pipelines.base import (
    ContentPipeline, get_record_name_for_source, create_job)
from piecrust.routing import RouteParameter
from piecrust.sources.base import ContentItem
from piecrust.sources.generator import GeneratorSourceBase


logger = logging.getLogger(__name__)


# Bit flags for the slugify modes; combined into a bitmask by
# `_parse_slugify_mode` and tested with `&` in `_Slugifier.slugify`.
SLUGIFY_ENCODE = 1
SLUGIFY_TRANSLITERATE = 2
SLUGIFY_LOWERCASE = 4
SLUGIFY_DOT_TO_DASH = 8
SLUGIFY_SPACE_TO_DASH = 16


re_first_dot_to_dash = re.compile(r'^\.+')
re_dot_to_dash = re.compile(r'\.+')
re_space_to_dash = re.compile(r'\s+')


class Taxonomy(object):
    """ Describes a taxonomy.
    """
    def __init__(self, name, config):
        self.name = name
        self.config = config
        self.term_name = config.get('term', name)
        self.is_multiple = bool(config.get('multiple', False))
        self.separator = config.get('separator', '/')
        self.page_ref = config.get('page')

    @property
    def setting_name(self):
        # The page config setting holding the terms: the plural taxonomy
        # name for "multiple" taxonomies (e.g. `tags`), the singular term
        # name otherwise.
        if self.is_multiple:
            return self.name
        return self.term_name


# Raw source text for the generated index page; `%(template)s` is filled
# in with the taxonomy's listing template name.
_taxonomy_index = """---
layout: %(template)s
---
"""


class TaxonomySource(GeneratorSourceBase):
    """ A content source that generates taxonomy listing pages.
    """
    SOURCE_NAME = 'taxonomy'
    DEFAULT_PIPELINE_NAME = 'taxonomy'

    def __init__(self, app, name, config):
        super().__init__(app, name, config)

        tax_name = config.get('taxonomy')
        if tax_name is None:
            raise ConfigurationError(
                "Taxonomy source '%s' requires a taxonomy name." % name)
        self.taxonomy = _get_taxonomy(app, tax_name)

        sm = config.get('slugify_mode')
        self.slugifier = _get_slugifier(app, self.taxonomy, sm)

        tpl_name = config.get('template', '_%s.html' % tax_name)
        self._raw_item = _taxonomy_index % {'template': tpl_name}

    def getSupportedRouteParameters(self):
        """ Returns the route parameter for the taxonomy term; it's a
            (variadic) path parameter for "multiple" taxonomies so that
            term combinations can appear in the URL.
        """
        name = self.taxonomy.term_name
        param_type = (RouteParameter.TYPE_PATH if self.taxonomy.is_multiple
                      else RouteParameter.TYPE_STRING)
        return [RouteParameter(name, param_type,
                               variadic=self.taxonomy.is_multiple)]

    def findContentFromRoute(self, route_params):
        """ Builds the virtual `_index` content item for the term found
            in the route parameters.
        """
        slugified_term = route_params[self.taxonomy.term_name]
        spec = '_index'
        metadata = {'term': slugified_term,
                    'route_params': {
                        self.taxonomy.term_name: slugified_term}
                    }
        return ContentItem(spec, metadata)

    def slugify(self, term):
        return self.slugifier.slugify(term)

    def slugifyMultiple(self, terms):
        return self.slugifier.slugifyMultiple(terms)

    def prepareRenderContext(self, ctx):
        # Set the pagination source as the source we're generating for.
        ctx.pagination_source = self.inner_source

        # Get the taxonomy terms from the route metadata... this can come
        # from the browser's URL (while serving) or from the baking (see
        # `bake` method below). In both cases, we expect to have the
        # *slugified* version of the term, because we're going to set a
        # filter that also slugifies the terms found on each page.
        #
        # This is because:
        # * while serving, we get everything from the request URL, so we
        #   only have the slugified version.
        # * if 2 slightly different terms "collide" into the same slugified
        #   term, we'll get a merge of the 2 on the listing page, which is
        #   what the user expects.
        #
        route_params = ctx.page.source_metadata['route_params']
        tax_terms, is_combination = self._getTaxonomyTerms(route_params)
        self._setTaxonomyFilter(ctx, tax_terms, is_combination)

        # Add some custom data for rendering.
        ctx.custom_data.update({
            self.taxonomy.term_name: tax_terms,
            'is_multiple_%s' % self.taxonomy.term_name: is_combination})
        # Add some "plural" version of the term... so for instance, if this
        # is the "tags" taxonomy, "tag" will have one term most of the time,
        # except when it's a combination. Here, we add "tags" as something
        # that is always a tuple, even when it's not a combination.
        if (self.taxonomy.is_multiple and
                self.taxonomy.name != self.taxonomy.term_name):
            mult_val = tax_terms
            if not is_combination:
                mult_val = (mult_val,)
            ctx.custom_data[self.taxonomy.name] = mult_val

    def _getTaxonomyTerms(self, route_params):
        """ Returns `(terms, is_combination)` extracted from the route
            metadata; `terms` is a tuple when the route value holds a
            separator-joined combination of terms.
        """
        # Get the individual slugified terms from the route metadata.
        all_values = route_params.get(self.taxonomy.term_name)
        if all_values is None:
            raise Exception("'%s' values couldn't be found in route metadata"
                            % self.taxonomy.term_name)

        # If it's a "multiple" taxonomy, we need to potentially split the
        # route value into the individual terms (_e.g._ when listing all
        # pages that have 2 given tags, we need to get each of those 2 tags).
        if self.taxonomy.is_multiple:
            sep = self.taxonomy.separator
            if sep in all_values:
                return tuple(all_values.split(sep)), True
        # Not a "multiple" taxonomy, so there's only the one value.
        return all_values, False

    def _setTaxonomyFilter(self, ctx, term_value, is_combination):
        # Set up the filter that will check the pages' terms.
        flt = PaginationFilter()
        flt.addClause(HasTaxonomyTermsFilterClause(
            self.taxonomy, self.slugifier.mode, term_value, is_combination))
        ctx.pagination_filter = flt

    def onRouteFunctionUsed(self, route_params):
        # Get the values, and slugify them appropriately.
        # If this is a "multiple" taxonomy, `values` will be a tuple of
        # terms. If not, `values` will just be a term.
        values = route_params[self.taxonomy.term_name]
        tax_is_multiple = self.taxonomy.is_multiple
        if tax_is_multiple:
            slugified_values = self.slugifyMultiple((str(v) for v in values))
            route_val = self.taxonomy.separator.join(slugified_values)
        else:
            slugified_values = self.slugify(str(values))
            route_val = slugified_values

        # We need to register this use of a taxonomy term.
        # Because the render info gets serialized across bake worker
        # processes, we can only use basic JSON-able structures, which
        # excludes `set`... hence the awkward use of `list`.
        # Also, note that the tuples we're putting in there will be
        # transformed into lists so we'll have to convert back.
        rcs = self.app.env.render_ctx_stack
        ri = rcs.current_ctx.render_info
        utt = ri.get('used_taxonomy_terms')
        if utt is None:
            ri['used_taxonomy_terms'] = [slugified_values]
        else:
            if slugified_values not in utt:
                utt.append(slugified_values)

        # Put the slugified values in the route metadata so they're used to
        # generate the URL.
        route_params[self.taxonomy.term_name] = route_val


class HasTaxonomyTermsFilterClause(SettingFilterClause):
    """ Pagination filter clause that matches pages carrying the given
        (slugified) taxonomy term(s).
    """
    def __init__(self, taxonomy, slugify_mode, value, is_combination):
        super().__init__(taxonomy.setting_name, value)
        self._taxonomy = taxonomy
        self._is_combination = is_combination
        self._slugifier = _Slugifier(taxonomy, slugify_mode)
        # Bind the matching strategy once, based on the taxonomy kind.
        if taxonomy.is_multiple:
            self.pageMatches = self._pageMatchesAny
        else:
            self.pageMatches = self._pageMatchesSingle

    def _pageMatchesAny(self, fil, page):
        # Multiple taxonomy, i.e. it supports multiple terms, like tags.
        page_values = page.config.get(self.name)
        if page_values is None or not isinstance(page_values, list):
            return False

        page_set = set(map(self._slugifier.slugify, page_values))
        if self._is_combination:
            # Multiple taxonomy, and multiple terms to match. Check that
            # the ones to match are all in the page's terms.
            value_set = set(self.value)
            return value_set.issubset(page_set)
        else:
            # Multiple taxonomy, one term to match.
            return self.value in page_set

    def _pageMatchesSingle(self, fil, page):
        # Single taxonomy. Just compare the values.
        page_value = page.config.get(self.name)
        if page_value is None:
            return False
        page_value = self._slugifier.slugify(page_value)
        return page_value == self.value


def _get_taxonomy(app, tax_name):
    """ Looks up a taxonomy by name in the site configuration. """
    tax_config = app.config.get('site/taxonomies/' + tax_name)
    if tax_config is None:
        raise ConfigurationError("No such taxonomy: %s" % tax_name)
    return Taxonomy(tax_name, tax_config)


def _get_slugifier(app, taxonomy, slugify_mode=None):
    """ Builds a `_Slugifier`, defaulting to the site-wide slugify mode. """
    if slugify_mode is None:
        slugify_mode = app.config.get('site/slugify_mode', 'encode')
    sm = _parse_slugify_mode(slugify_mode)
    return _Slugifier(taxonomy, sm)


class TaxonomyPipelineRecordEntry(PagePipelineRecordEntry):
    """ Record entry for taxonomy pages; also remembers which term the
        entry was baked for.
    """
    def __init__(self):
        super().__init__()
        self.term = None


class TaxonomyPipeline(ContentPipeline):
    """ Baking pipeline that renders listing pages for the 'dirty' terms
        of a taxonomy source.
    """
    PIPELINE_NAME = 'taxonomy'
    PASS_NUM = 10
    RECORD_ENTRY_CLASS = TaxonomyPipelineRecordEntry

    def __init__(self, source, ctx):
        if not isinstance(source, TaxonomySource):
            raise Exception("The taxonomy pipeline only supports taxonomy "
                            "content sources.")

        super().__init__(source, ctx)
        self.inner_source = source.inner_source
        self.taxonomy = source.taxonomy
        self.slugifier = source.slugifier
        self._tpl_name = source.config['template']
        self._analyzer = None
        self._pagebaker = None

    def initialize(self):
        self._pagebaker = PageBaker(self.app,
                                    self.ctx.out_dir,
                                    force=self.ctx.force)
        self._pagebaker.startWriterQueue()

    def shutdown(self):
        self._pagebaker.stopWriterQueue()

    def createJobs(self, ctx):
        """ Analyzes the taxonomy terms seen during this bake and creates
            one job per dirty slugified term.
        """
        logger.debug("Caching template page for taxonomy '%s'." %
                     self.taxonomy.name)
        page = self.app.getPage(self.source, ContentItem('_index', {}))
        page._load()

        logger.debug("Building '%s' taxonomy pages for source: %s" %
                     (self.taxonomy.name, self.inner_source.name))
        self._analyzer = _TaxonomyTermsAnalyzer(self, ctx.record_histories)
        self._analyzer.analyze()

        logger.debug("Queuing %d '%s' jobs." %
                     (len(self._analyzer.dirty_slugified_terms),
                      self.taxonomy.name))
        jobs = []
        rec_fac = self.createRecordEntry
        current_record = ctx.current_record

        for slugified_term in self._analyzer.dirty_slugified_terms:
            item_spec = '_index'
            record_entry_spec = '_index[%s]' % slugified_term

            jobs.append(create_job(self, item_spec,
                                   term=slugified_term,
                                   record_entry_spec=record_entry_spec))

            entry = rec_fac(record_entry_spec)
            current_record.addEntry(entry)

        if len(jobs) > 0:
            return jobs, "taxonomize"
        return None, None

    def run(self, job, ctx, result):
        """ Bakes the listing page for the job's term and stores the
            render subs and term in the result.
        """
        term = job['term']
        content_item = ContentItem('_index',
                                   {'term': term,
                                    'route_params': {
                                        self.taxonomy.term_name: term}
                                    })
        page = Page(self.source, content_item)

        logger.debug("Rendering '%s' page: %s" %
                     (self.taxonomy.name, page.source_metadata['term']))
        prev_entry = ctx.previous_entry
        rdr_subs = self._pagebaker.bake(page, prev_entry)

        result['subs'] = rdr_subs
        result['term'] = page.source_metadata['term']

    def handleJobResult(self, result, ctx):
        existing = ctx.record_entry
        existing.subs = result['subs']
        existing.term = result['term']

    def postJobRun(self, ctx):
        # We create bake entries for all the terms that were *not* dirty.
        # This is because otherwise, on the next incremental bake, we
        # wouldn't find any entry for those things, and figure that we need
        # to delete their outputs.
        analyzer = self._analyzer
        record = ctx.record_history.current
        for prev, cur in ctx.record_history.diffs:
            # Only consider entries that don't have any current version
            # (i.e. they weren't baked just now).
            if prev and not cur:
                t = prev.term
                if analyzer.isKnownSlugifiedTerm(t):
                    logger.debug("Creating unbaked entry for '%s' term: %s" %
                                 (self.taxonomy.name, t))
                    cur = copy.deepcopy(prev)
                    cur.flags = \
                        PagePipelineRecordEntry.FLAG_COLLAPSED_FROM_LAST_RUN
                    record.addEntry(cur)
                else:
                    logger.debug("Term '%s' in '%s' isn't used anymore." %
                                 (t, self.taxonomy.name))


class _TaxonomyTermsAnalyzer(object):
    """ Gathers all taxonomy terms used by the inner source's pages and
        computes which slugified terms need to be re-baked.
    """
    def __init__(self, pipeline, record_histories):
        self.pipeline = pipeline
        self.record_histories = record_histories
        # Maps each slugified term to the list of original terms that
        # slugified to it (used to detect slugification collisions).
        self._all_terms = {}
        self._all_dirty_slugified_terms = None

    @property
    def dirty_slugified_terms(self):
        """ Returns the slugified terms that have been 'dirtied' during
            this bake.
        """
        return self._all_dirty_slugified_terms

    def isKnownSlugifiedTerm(self, term):
        """ Returns whether the given slugified term has been seen during
            this bake.
        """
        return term in self._all_terms

    def analyze(self):
        # Build the list of terms for our taxonomy, and figure out which
        # ones are 'dirty' for the current bake.
        source = self.pipeline.inner_source
        taxonomy = self.pipeline.taxonomy
        slugifier = self.pipeline.slugifier

        tax_is_mult = taxonomy.is_multiple
        tax_setting_name = taxonomy.setting_name

        # First, go over all of our source's pages seen during this bake.
        # Gather all the taxonomy terms they have, and also keep track of
        # the ones used by the pages that were actually rendered (instead
        # of those that were up-to-date and skipped).
        single_dirty_slugified_terms = set()
        current_records = self.record_histories.current
        record_name = get_record_name_for_source(source)
        cur_rec = current_records.getRecord(record_name)
        for cur_entry in cur_rec.getEntries():
            if cur_entry.hasFlag(PagePipelineRecordEntry.FLAG_OVERRIDEN):
                continue

            cur_terms = cur_entry.config.get(tax_setting_name)
            if not cur_terms:
                continue

            if not tax_is_mult:
                self._addTerm(
                    slugifier, cur_entry.item_spec, cur_terms)
            else:
                self._addTerms(
                    slugifier, cur_entry.item_spec, cur_terms)

            if cur_entry.hasFlag(
                    PagePipelineRecordEntry.FLAG_SEGMENTS_RENDERED):
                if not tax_is_mult:
                    single_dirty_slugified_terms.add(
                        slugifier.slugify(cur_terms))
                else:
                    single_dirty_slugified_terms.update(
                        (slugifier.slugify(t) for t in cur_terms))

        self._all_dirty_slugified_terms = list(
            single_dirty_slugified_terms)
        logger.debug("Gathered %d dirty taxonomy terms",
                     len(self._all_dirty_slugified_terms))

        # Re-bake the combination pages for terms that are 'dirty'.
        # We make all terms into tuple, even those that are not actual
        # combinations, so that we have less things to test further down
        # the line.
        #
        # Add the combinations to that list. We get those combinations from
        # wherever combinations were used, so they're coming from the
        # `onRouteFunctionUsed` method. And because combinations can be
        # used by any page in the website (anywhere someone can ask for an
        # URL to the combination page), it means we check all the records,
        # not just the record for our source.
        if tax_is_mult:
            known_combinations = set()
            for rec in current_records.records:
                # Cheap way to test if a record contains entries that
                # are sub-types of a page entry: test the first one.
                first_entry = next(iter(rec.getEntries()), None)
                if (first_entry is None or
                        not isinstance(first_entry,
                                       PagePipelineRecordEntry)):
                    continue

                for cur_entry in rec.getEntries():
                    used_terms = _get_all_entry_taxonomy_terms(cur_entry)
                    for terms in used_terms:
                        if len(terms) > 1:
                            known_combinations.add(terms)

            dcc = 0
            for terms in known_combinations:
                # A combination is dirty if any of its member terms is.
                if not single_dirty_slugified_terms.isdisjoint(
                        set(terms)):
                    self._all_dirty_slugified_terms.append(
                        taxonomy.separator.join(terms))
                    dcc += 1
            logger.debug("Gathered %d term combinations, with %d dirty." %
                         (len(known_combinations), dcc))

    def _addTerms(self, slugifier, item_spec, terms):
        for t in terms:
            self._addTerm(slugifier, item_spec, t)

    def _addTerm(self, slugifier, item_spec, term):
        st = slugifier.slugify(term)
        orig_terms = self._all_terms.setdefault(st, [])
        # Warn when two different original terms collapse into the same
        # slug -- their listing pages will be merged.
        if orig_terms and orig_terms[0] != term:
            logger.warning(
                "Term '%s' in '%s' is slugified to '%s' which conflicts "
                "with previously existing '%s'. The two will be merged." %
                (term, item_spec, st, orig_terms[0]))
        orig_terms.append(term)


def _get_all_entry_taxonomy_terms(entry):
    """ Collects all taxonomy term tuples recorded in an entry's render
        info (terms were serialized as lists, so convert back to tuples).
    """
    res = set()
    for o in entry.subs:
        pinfo = o['render_info']
        terms = pinfo.get('used_taxonomy_terms')
        if terms:
            res |= set([tuple(t) for t in terms])
    return res


class _Slugifier(object):
    """ Transforms taxonomy terms into URL-friendly slugs according to a
        bitmask of `SLUGIFY_*` mode flags.
    """
    def __init__(self, taxonomy, mode):
        self.taxonomy = taxonomy
        self.mode = mode

    def slugifyMultiple(self, terms):
        return tuple(map(self.slugify, terms))

    def slugify(self, term):
        if self.mode & SLUGIFY_TRANSLITERATE:
            term = unidecode.unidecode(term)
        if self.mode & SLUGIFY_LOWERCASE:
            term = term.lower()
        if self.mode & SLUGIFY_DOT_TO_DASH:
            # Strip leading dots entirely so slugs can't become hidden
            # file names; collapse inner dot runs to a single dash.
            term = re_first_dot_to_dash.sub('', term)
            term = re_dot_to_dash.sub('-', term)
        if self.mode & SLUGIFY_SPACE_TO_DASH:
            term = re_space_to_dash.sub('-', term)
        return term


def _parse_slugify_mode(value):
    """ Parses a comma-separated list of slugify flag names into a
        `SLUGIFY_*` bitmask. Raises on unknown flags, with a dedicated
        message for the legacy 'iconv' mode.
    """
    mapping = {
        'encode': SLUGIFY_ENCODE,
        'transliterate': SLUGIFY_TRANSLITERATE,
        'lowercase': SLUGIFY_LOWERCASE,
        'dot_to_dash': SLUGIFY_DOT_TO_DASH,
        'space_to_dash': SLUGIFY_SPACE_TO_DASH}
    mode = 0
    for v in value.split(','):
        f = mapping.get(v.strip())
        if f is None:
            if v == 'iconv':
                raise Exception("'iconv' is not supported as a slugify mode "
                                "in PieCrust2. Use 'transliterate'.")
            raise Exception("Unknown slugify flag: %s" % v)
        mode |= f
    return mode