view piecrust/sources/blogarchives.py @ 1136:5f97b5b59dfe
bake: Optimize cache handling for the baking process.
- Get rid of the 2-level pipeline runs... handle a single set of passes.
- Go back to load/render segments/layout passes for pages.
- Add descriptions of what each job batch does.
- Improve the taxonomy pipeline so it doesn't re-bake terms that don't need
to be re-baked (the blog archives pipeline in this file uses the same
dirty-tracking scheme; see the sketch after this list).
- Simplify some of the code.
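The dirty-tracking idea referenced above is what `_buildDirtyYears` implements in the source below: collect every year seen in the bake records, and mark as "dirty" only the years containing at least one post whose segments were re-rendered. Here is a minimal, self-contained sketch of that idea; the `entries` tuples and the flag value are simplified stand-ins for illustration, not real PieCrust record objects:

```python
import datetime

FLAG_SEGMENTS_RENDERED = 1  # assumed flag value, for illustration only


def build_dirty_years(entries):
    """Collect every year seen in the bake records, plus the subset of
    years that contain at least one re-rendered post."""
    all_years = set()
    dirty_years = set()
    for timestamp, flags in entries:
        year = datetime.datetime.fromtimestamp(timestamp).year
        all_years.add(year)
        if flags & FLAG_SEGMENTS_RENDERED:
            dirty_years.add(year)
    return all_years, dirty_years


# Two posts from 2017, one of them re-rendered; one untouched post from 2016.
entries = [
    (datetime.datetime(2017, 3, 1).timestamp(), FLAG_SEGMENTS_RENDERED),
    (datetime.datetime(2017, 8, 1).timestamp(), 0),
    (datetime.datetime(2016, 5, 1).timestamp(), 0),
]
# All years are {2016, 2017}, but only 2017 is dirty and gets re-baked.
print(build_dirty_years(entries))
```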
| author | Ludovic Chabant <ludovic@chabant.com> |
| --- | --- |
| date | Mon, 23 Apr 2018 21:47:49 -0700 |
| parents | ba809c221a27 |
| children | 9f3e702a8a69 |
line source
```python
import time
import logging
import datetime
import collections
from piecrust.data.filters import PaginationFilter, IFilterClause
from piecrust.dataproviders.pageiterator import (
    PageIterator, HardCodedFilterIterator, DateSortIterator)
from piecrust.page import Page
from piecrust.pipelines._pagebaker import PageBaker
from piecrust.pipelines._pagerecords import PagePipelineRecordEntry
from piecrust.pipelines.base import (
    ContentPipeline, create_job, get_record_name_for_source)
from piecrust.routing import RouteParameter
from piecrust.sources.base import ContentItem
from piecrust.sources.generator import GeneratorSourceBase
from piecrust.sources.list import ListSource


logger = logging.getLogger(__name__)


_year_index = """---
layout: %(template)s
---
"""


class BlogArchivesSource(GeneratorSourceBase):
    SOURCE_NAME = 'blog_archives'
    DEFAULT_PIPELINE_NAME = 'blog_archives'

    def __init__(self, app, name, config):
        super().__init__(app, name, config)

        tpl_name = config.get('template', '_year.html')
        self._raw_item = _year_index % {'template': tpl_name}

    def getSupportedRouteParameters(self):
        return [RouteParameter('year', RouteParameter.TYPE_INT4)]

    def findContentFromRoute(self, route_params):
        year = route_params['year']
        return ContentItem(
            '_index',
            {'route_params': {'year': year}})

    def prepareRenderContext(self, ctx):
        ctx.pagination_source = self.inner_source

        route_params = ctx.page.source_metadata['route_params']
        year = route_params.get('year')
        if year is None:
            raise Exception(
                "Can't find the archive year in the route metadata")
        if type(year) is not int:
            raise Exception(
                "The route for generator '%s' should specify an integer "
                "parameter for 'year'." % self.name)

        flt = PaginationFilter()
        flt.addClause(IsFromYearFilterClause(year))
        ctx.pagination_filter = flt

        ctx.custom_data['year'] = year

        flt2 = PaginationFilter()
        flt2.addClause(IsFromYearFilterClause(year))
        it = PageIterator(self.inner_source)
        it._simpleNonSortedWrap(HardCodedFilterIterator, flt2)
        it._wrapAsSort(DateSortIterator, reverse=False)
        ctx.custom_data['archives'] = it

        ctx.custom_data['monthly_archives'] = _MonthlyArchiveData(
            self.inner_source, year)


class IsFromYearFilterClause(IFilterClause):
    def __init__(self, year):
        self.year = year

    def pageMatches(self, fil, page):
        return (page.datetime.year == self.year)


class _MonthlyArchiveData(collections.abc.Mapping):
    def __init__(self, inner_source, year):
        self._inner_source = inner_source
        self._year = year
        self._months = None

    def __iter__(self):
        self._load()
        return iter(self._months)

    def __len__(self):
        self._load()
        return len(self._months)

    def __getitem__(self, i):
        self._load()
        return self._months[i]

    def _load(self):
        if self._months is not None:
            return

        month_index = {}
        for page in self._inner_source.getAllPages():
            if page.datetime.year != self._year:
                continue

            month = page.datetime.month

            posts_this_month = month_index.get(month)
            if posts_this_month is None:
                posts_this_month = []
                month_index[month] = posts_this_month
            posts_this_month.append(page.content_item)

        self._months = []
        for m in sorted(month_index.keys()):
            timestamp = time.mktime((self._year, m, 1, 0, 0, 0, 0, 0, -1))

            ptm = month_index[m]
            it = PageIterator(ListSource(self._inner_source, ptm))
            it._wrapAsSort(DateSortIterator, reverse=False)

            self._months.append({
                'timestamp': timestamp,
                'posts': it
            })


class BlogArchivesPipelineRecordEntry(PagePipelineRecordEntry):
    def __init__(self):
        super().__init__()
        self.year = None


class BlogArchivesPipeline(ContentPipeline):
    PIPELINE_NAME = 'blog_archives'
    PASS_NUM = 10
    RECORD_ENTRY_CLASS = BlogArchivesPipelineRecordEntry

    def __init__(self, source, ctx):
        if not isinstance(source, BlogArchivesSource):
            raise Exception("The blog archives pipeline only supports blog "
                            "archives content sources.")

        super().__init__(source, ctx)
        self.inner_source = source.inner_source
        self._tpl_name = source.config['template']
        self._all_years = None
        self._dirty_years = None
        self._pagebaker = None

    def initialize(self):
        self._pagebaker = PageBaker(self.app,
                                    self.ctx.out_dir,
                                    force=self.ctx.force)
        self._pagebaker.startWriterQueue()

    def shutdown(self):
        self._pagebaker.stopWriterQueue()

    def createJobs(self, ctx):
        logger.debug("Caching template page for blog archives '%s'." %
                     self.inner_source.name)
        page = self.app.getPage(self.source, ContentItem('_index', {}))
        page._load()

        logger.debug("Building blog archives for: %s" %
                     self.inner_source.name)
        self._buildDirtyYears(ctx)
        logger.debug("Got %d dirty years out of %d." %
                     (len(self._dirty_years), len(self._all_years)))

        jobs = []
        rec_fac = self.createRecordEntry
        current_record = ctx.current_record

        for y in self._dirty_years:
            record_entry_spec = '_index[%04d]' % y

            jobs.append(create_job(self, '_index',
                                   year=y,
                                   record_entry_spec=record_entry_spec))

            entry = rec_fac(record_entry_spec)
            current_record.addEntry(entry)

        if len(jobs) > 0:
            return jobs, "archive"
        return None, None

    def run(self, job, ctx, result):
        year = job['year']
        content_item = ContentItem('_index',
                                   {'year': year,
                                    'route_params': {'year': year}})
        page = Page(self.source, content_item)

        prev_entry = ctx.previous_entry
        rdr_subs = self._pagebaker.bake(page, prev_entry)

        result['subs'] = rdr_subs
        result['year'] = page.source_metadata['year']

    def handleJobResult(self, result, ctx):
        existing = ctx.record_entry
        existing.subs = result['subs']
        existing.year = result['year']

    def postJobRun(self, ctx):
        # Create bake entries for the years that were *not* dirty.
        # Otherwise, when checking for deleted pages, we would not find any
        # outputs and would delete those files.
        all_str_years = [str(y) for y in self._all_years]
        for prev, cur in ctx.record_history.diffs:
            if prev and not cur:
                y = prev.year
                if y in all_str_years:
                    logger.debug(
                        "Creating unbaked entry for year %s archive." % y)
                    cur.year = y
                    cur.out_paths = list(prev.out_paths)
                    cur.errors = list(prev.errors)
                else:
                    logger.debug(
                        "No page references year %s anymore." % y)

    def _buildDirtyYears(self, ctx):
        all_years = set()
        dirty_years = set()

        record_name = get_record_name_for_source(self.inner_source)
        current_records = ctx.record_histories.current
        cur_rec = current_records.getRecord(record_name)
        for cur_entry in cur_rec.getEntries():
            dt = datetime.datetime.fromtimestamp(cur_entry.timestamp)
            all_years.add(dt.year)
            if cur_entry.hasFlag(
                    PagePipelineRecordEntry.FLAG_SEGMENTS_RENDERED):
                dirty_years.add(dt.year)

        self._all_years = all_years
        self._dirty_years = dirty_years
```
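As a reading aid, `_MonthlyArchiveData._load` above boils down to grouping one year's posts by month and stamping each group with the timestamp of the month's first day. Below is a self-contained sketch of that grouping, using plain `datetime` values as hypothetical stand-ins for PieCrust pages (which expose a `datetime` attribute), with no `PageIterator` or `ListSource`:

```python
import time
import datetime
from collections import defaultdict

# Hypothetical post dates standing in for PieCrust pages.
post_dates = [
    datetime.datetime(2018, 4, 23),
    datetime.datetime(2018, 4, 2),
    datetime.datetime(2018, 1, 15),
    datetime.datetime(2017, 12, 31),  # filtered out: wrong year
]


def monthly_archives(post_dates, year):
    # Group the given year's posts by month, as _MonthlyArchiveData._load does.
    month_index = defaultdict(list)
    for dt in post_dates:
        if dt.year == year:
            month_index[dt.month].append(dt)

    months = []
    for m in sorted(month_index):
        # Timestamp of the first day of the month, as in the code above;
        # the trailing -1 lets mktime() determine daylight saving time.
        ts = time.mktime((year, m, 1, 0, 0, 0, 0, 0, -1))
        # Ascending date order, mirroring DateSortIterator(reverse=False).
        months.append({'timestamp': ts, 'posts': sorted(month_index[m])})
    return months


for month in monthly_archives(post_dates, 2018):
    print(month['timestamp'], [d.isoformat() for d in month['posts']])
```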