view piecrust/sources/blogarchives.py @ 1188:a7c43131d871

bake: Fix file write flushing problem with Python 3.8+

Writing the cache files fails in Python 3.8 because the flushing
behaviour appears to have changed, so we need to flush explicitly. Even
then, on very rare occasions it can still run into race conditions, so
we do a very hacky and ugly "retry" loop when fetching cached data :(

author  Ludovic Chabant <ludovic@chabant.com>
date    Tue, 15 Jun 2021 22:36:23 -0700
parents 9f3e702a8a69
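
The flush-and-retry fix described above lives in PieCrust's cache code
rather than in this file; the pattern is roughly the following minimal
sketch (hypothetical helper names, not the actual functions):

    import os
    import time

    def _write_flushed(path, text):
        # Explicitly flush and fsync so the bytes are on disk before any
        # other process tries to read the cache file back.
        with open(path, 'w', encoding='utf8') as fp:
            fp.write(text)
            fp.flush()
            os.fsync(fp.fileno())

    def _read_with_retries(path, tries=3, delay=0.1):
        # The hacky "retry" loop: if a read races a write, back off
        # briefly and try again instead of failing the bake outright.
        for attempt in range(tries):
            try:
                with open(path, 'r', encoding='utf8') as fp:
                    return fp.read()
            except OSError:
                if attempt == tries - 1:
                    raise
                time.sleep(delay)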

import time
import logging
import datetime
import collections
from piecrust.data.filters import PaginationFilter, IFilterClause
from piecrust.dataproviders.pageiterator import (
    PageIterator, HardCodedFilterIterator, DateSortIterator)
from piecrust.page import Page
from piecrust.pipelines._pagebaker import PageBaker
from piecrust.pipelines._pagerecords import PagePipelineRecordEntry
from piecrust.pipelines.base import (
    ContentPipeline,
    create_job, get_record_name_for_source)
from piecrust.routing import RouteParameter
from piecrust.sources.base import ContentItem
from piecrust.sources.generator import GeneratorSourceBase
from piecrust.sources.list import ListSource


logger = logging.getLogger(__name__)


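# Synthetic source text for each generated year page: an empty body whose
# front matter only points 'layout' at the configured archive template.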
_year_index = """---
layout: %(template)s
---
"""


class BlogArchivesSource(GeneratorSourceBase):
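    """Generator source that produces one archive page per year of posts
    found in the inner (blog posts) source."""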
    SOURCE_NAME = 'blog_archives'
    DEFAULT_PIPELINE_NAME = 'blog_archives'

    def __init__(self, app, name, config):
        super().__init__(app, name, config)

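        # Sites can override the layout used for year pages via the
        # source's 'template' setting; '_year.html' is the default.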
        tpl_name = config.get('template', '_year.html')
        self._raw_item = _year_index % {'template': tpl_name}

    def getSupportedRouteParameters(self):
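        # Routes to this source take a single, 4-digit integer 'year'
        # parameter (e.g. /archives/2021).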
        return [RouteParameter('year', RouteParameter.TYPE_INT4)]

    def findContentFromRoute(self, route_params):
        year = route_params['year']
        return ContentItem(
            '_index[%s]' % year,
            {'route_params': {'year': year}})

    def prepareRenderContext(self, ctx):
        ctx.pagination_source = self.inner_source

        route_params = ctx.page.source_metadata['route_params']
        year = route_params.get('year')
        if year is None:
            raise Exception(
                "Can't find the archive year in the route metadata")
        if type(year) is not int:
            raise Exception(
                "The route for generator '%s' should specify an integer "
                "parameter for 'year'." % self.name)

        flt = PaginationFilter()
        flt.addClause(IsFromYearFilterClause(year))
        ctx.pagination_filter = flt

        ctx.custom_data['year'] = year

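        # Expose the year's posts, sorted oldest first, as 'archives' in
        # the template data.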
        flt2 = PaginationFilter()
        flt2.addClause(IsFromYearFilterClause(year))
        it = PageIterator(self.inner_source)
        it._simpleNonSortedWrap(HardCodedFilterIterator, flt2)
        it._wrapAsSort(DateSortIterator, reverse=False)
        ctx.custom_data['archives'] = it

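        # 'monthly_archives' lazily groups the same posts by month.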
        ctx.custom_data['monthly_archives'] = _MonthlyArchiveData(
            self.inner_source, year)

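# A year template might consume this data roughly as follows (a sketch;
# the exact syntax and filters depend on the site's template engine):
#
#   <h1>Posts from {{ year }}</h1>
#   {% for month in monthly_archives %}
#     <h2>{{ month.timestamp|date("%B") }}</h2>
#     <ul>
#     {% for post in month.posts %}
#       <li><a href="{{ post.url }}">{{ post.title }}</a></li>
#     {% endfor %}
#     </ul>
#   {% endfor %}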

class IsFromYearFilterClause(IFilterClause):
    def __init__(self, year):
        self.year = year

    def pageMatches(self, fil, page):
        return (page.datetime.year == self.year)


class _MonthlyArchiveData(collections.abc.Mapping):
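    """Lazy, read-only view of one year's posts, grouped by month.

    Iterating yields one dict per month, in calendar order; each dict
    carries a 'timestamp' for the first of the month and a date-sorted
    'posts' iterator.
    """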
    def __init__(self, inner_source, year):
        self._inner_source = inner_source
        self._year = year
        self._months = None

    def __iter__(self):
        self._load()
        return iter(self._months)

    def __len__(self):
        self._load()
        return len(self._months)

    def __getitem__(self, i):
        self._load()
        return self._months[i]

    def _load(self):
        if self._months is not None:
            return

        month_index = {}
        for page in self._inner_source.getAllPages():
            if page.datetime.year != self._year:
                continue

            month = page.datetime.month

            month_index.setdefault(month, []).append(page.content_item)

        self._months = []
        for m in sorted(month_index.keys()):
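            # Midnight on the first day of the month; the trailing -1
            # lets mktime figure out DST on its own.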
            timestamp = time.mktime((self._year, m, 1, 0, 0, 0, 0, 0, -1))

            ptm = month_index[m]
            it = PageIterator(ListSource(self._inner_source, ptm))
            it._wrapAsSort(DateSortIterator, reverse=False)

            self._months.append({
                'timestamp': timestamp,
                'posts': it
            })


class BlogArchivesPipelineRecordEntry(PagePipelineRecordEntry):
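    """Bake record entry that also remembers which year it covers."""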
    def __init__(self):
        super().__init__()
        self.year = None


class BlogArchivesPipeline(ContentPipeline):
    PIPELINE_NAME = 'blog_archives'
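    # Run in a late bake pass so the inner source's records already exist
    # when _buildDirtyYears() inspects them.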
    PASS_NUM = 10
    RECORD_ENTRY_CLASS = BlogArchivesPipelineRecordEntry

    def __init__(self, source, ctx):
        if not isinstance(source, BlogArchivesSource):
            raise Exception("The blog archives pipeline only supports blog "
                            "archives content sources.")

        super().__init__(source, ctx)
        self.inner_source = source.inner_source
        self._tpl_name = source.config['template']
        self._all_years = None
        self._dirty_years = None
        self._pagebaker = None

    def initialize(self):
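        # Rendered pages are written through a background writer queue;
        # start it here and stop it in shutdown().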
        self._pagebaker = PageBaker(self.app,
                                    self.ctx.out_dir,
                                    force=self.ctx.force)
        self._pagebaker.startWriterQueue()

    def shutdown(self):
        self._pagebaker.stopWriterQueue()

    def createJobs(self, ctx):
        logger.debug("Building blog archives for: %s" %
                     self.inner_source.name)
        self._buildDirtyYears(ctx)
        logger.debug("Got %d dirty years out of %d." %
                     (len(self._dirty_years), len(self._all_years)))

        jobs = []
        rec_fac = self.createRecordEntry
        current_record = ctx.current_record

        for y in self._dirty_years:
            item_spec = '_index[%04d]' % y

            jobs.append(create_job(self, item_spec,
                                   year=y))

            entry = rec_fac(item_spec)
            current_record.addEntry(entry)

        if jobs:
            return jobs, "archive"
        return None, None

    def run(self, job, ctx, result):
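        # Runs in a bake worker: rebuild the synthetic year page and bake
        # it, passing the previous record entry for incremental baking.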
        year = job['year']
        content_item = ContentItem('_index[%04d]' % year,
                                   {'year': year,
                                    'route_params': {'year': year}})
        page = Page(self.source, content_item)

        prev_entry = ctx.previous_entry
        rdr_subs = self._pagebaker.bake(page, prev_entry)

        result['subs'] = rdr_subs
        result['year'] = page.source_metadata['year']

    def handleJobResult(self, result, ctx):
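        # Back in the main process: copy the worker's results onto the
        # record entry for this job.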
        existing = ctx.record_entry
        existing.subs = result['subs']
        existing.year = result['year']

    def postJobRun(self, ctx):
        # Create bake entries for the years that were *not* dirty.
        # Otherwise, when checking for deleted pages, we would not find any
        # outputs and would delete those files.
        all_str_years = [str(y) for y in self._all_years]
        for prev, cur in ctx.record_history.diffs:
            if prev and not cur:
                y = prev.year
                if y in all_str_years:
                    logger.debug(
                        "Creating unbaked entry for year %s archive." % y)
                    cur.year = y
                    cur.out_paths = list(prev.out_paths)
                    cur.errors = list(prev.errors)
                else:
                    logger.debug(
                        "No page references year %s anymore." % y)

    def _buildDirtyYears(self, ctx):
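        # A year is "dirty" (needs rebaking) when at least one post from
        # that year had its content segments re-rendered in this bake.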
        all_years = set()
        dirty_years = set()

        record_name = get_record_name_for_source(self.inner_source)
        current_records = ctx.record_histories.current
        cur_rec = current_records.getRecord(record_name)
        for cur_entry in cur_rec.getEntries():
            dt = datetime.datetime.fromtimestamp(cur_entry.timestamp)
            all_years.add(dt.year)
            if cur_entry.hasFlag(
                    PagePipelineRecordEntry.FLAG_SEGMENTS_RENDERED):
                dirty_years.add(dt.year)

        self._all_years = all_years
        self._dirty_years = dirty_years