view piecrust/baking/single.py @ 415:0e9a94b7fdfa

bake: Improve bake record information. * Store things in the bake record that require less interaction between the master process and the workers. For instance, don't store the paginator object in the render pass info -- instead, just store whether pagination was used, and whether it had more items. * Simplify information passing between workers and bake passes by saving the rendering info to the JSON cache. This means the "render first sub" job doesn't have to return anything except errors now. * Add more performance counter info.
author Ludovic Chabant <ludovic@chabant.com>
date Sat, 20 Jun 2015 19:23:16 -0700
parents e7b865f8f335
children 21e26ed867b6
line wrap: on
line source

import os.path
import shutil
import codecs
import logging
import urllib.parse
from piecrust import ASSET_DIR_SUFFIX
from piecrust.baking.records import SubPageBakeInfo
from piecrust.rendering import (
        QualifiedPage, PageRenderingContext, render_page,
        PASS_FORMATTING)
from piecrust.uriutil import split_uri


logger = logging.getLogger(__name__)


def copy_public_page_config(config):
    """Return a deep copy of `config` without its private entries.

    The copy is obtained from ``config.getDeepcopy()``; every top-level
    key starting with a double underscore is then removed from that copy.
    Nested keys are left untouched.
    """
    public = config.getDeepcopy()
    # Collect the keys first so the copy isn't mutated while iterating it.
    private_keys = [key for key in public.keys() if key.startswith('__')]
    for key in private_keys:
        del public[key]
    return public


class BakingError(Exception):
    """Error raised when baking a page, or one of its sub-pages, fails."""


class PageBaker(object):
    """Bakes a page, and all of its sub-pages, to files in an output
    directory.

    `site_root` and `pretty_urls` are read once from the application's
    `site/root` and `site/pretty_urls` configuration values.
    """
    def __init__(self, app, out_dir, force=False, copy_assets=True):
        self.app = app
        self.out_dir = out_dir
        self.force = force
        self.copy_assets = copy_assets
        self.site_root = app.config.get('site/root')
        self.pretty_urls = app.config.get('site/pretty_urls')

    def getOutputPath(self, uri):
        """Return the normalized output file path for the given URI.

        The path part of the URI is URL-unquoted and appended to the
        output directory. With pretty URLs, the file is an `index.html`
        inside a directory named after the URI; otherwise the URI itself
        is the file name, except for the empty URI which maps to
        `index.html`.
        """
        uri_root, uri_path = split_uri(self.app, uri)

        bake_path = [self.out_dir]
        decoded_uri = urllib.parse.unquote(uri_path)
        if self.pretty_urls:
            bake_path.append(decoded_uri)
            bake_path.append('index.html')
        elif decoded_uri == '':
            bake_path.append('index.html')
        else:
            bake_path.append(decoded_uri)

        return os.path.normpath(os.path.join(*bake_path))

    def bake(self, factory, route, route_metadata, prev_entry,
             dirty_source_names, tax_info=None):
        """Bake the page built by `factory`, one sub-page at a time.

        A sub-page is skipped when nothing forces it to be baked and its
        output file is at least as recent as the page's source file. In
        that case the render info is copied over from the matching
        sub-entry of `prev_entry`.

        Returns the list of `SubPageBakeInfo` entries created, in
        sub-page order.

        Raises `BakingError` (chained to the original exception) if
        anything goes wrong while invalidating caches or rendering a
        sub-page.
        """
        # Get the page.
        page = factory.buildPage()

        # Start baking the sub-pages.
        cur_sub = 1
        has_more_subs = True
        sub_entries = []

        while has_more_subs:
            # Get the URL and path for this sub-page.
            sub_uri = route.getUri(route_metadata, sub_num=cur_sub,
                                   provider=page)
            logger.debug("Baking '%s' [%d]..." % (sub_uri, cur_sub))
            out_path = self.getOutputPath(sub_uri)

            # Create the sub-entry for the bake record.
            sub_entry = SubPageBakeInfo(sub_uri, out_path)
            sub_entries.append(sub_entry)

            # Find a corresponding sub-entry in the previous bake record.
            prev_sub_entry = None
            if prev_entry:
                try:
                    prev_sub_entry = prev_entry.getSub(cur_sub)
                except IndexError:
                    pass

            # Figure out if we need to invalidate or force anything.
            force_this_sub, invalidate_formatting = _compute_force_flags(
                    prev_sub_entry, sub_entry, dirty_source_names)
            force_this_sub = force_this_sub or self.force

            # Check for up-to-date outputs.
            do_bake = True
            if not force_this_sub:
                try:
                    in_path_time = page.path_mtime
                    out_path_time = os.path.getmtime(out_path)
                    if out_path_time >= in_path_time:
                        do_bake = False
                except OSError:
                    # File doesn't exist, we'll need to bake.
                    pass

            # If this page didn't bake because it's already up-to-date.
            # Keep trying for as many subs as we know this page has.
            # (we only get here with a previous sub-entry, since a missing
            # one always forces a bake).
            if not do_bake:
                sub_entry.render_info = prev_sub_entry.copyRenderInfo()
                sub_entry.flags = SubPageBakeInfo.FLAG_NONE

                if prev_entry.num_subs >= cur_sub + 1:
                    cur_sub += 1
                    logger.debug("  %s is up to date, skipping to next "
                                 "sub-page." % out_path)
                    continue

                logger.debug("  %s is up to date, skipping bake." % out_path)
                break

            # All good, proceed.
            try:
                if invalidate_formatting:
                    cache_key = sub_uri
                    self.app.env.rendered_segments_repository.invalidate(
                            cache_key)
                    sub_entry.flags |= \
                        SubPageBakeInfo.FLAG_FORMATTING_INVALIDATED

                logger.debug("  p%d -> %s" % (cur_sub, out_path))
                qp = QualifiedPage(page, route, route_metadata)
                rp = self._bakeSingle(qp, cur_sub, out_path, tax_info)
            except Exception as ex:
                page_rel_path = os.path.relpath(page.path, self.app.root_dir)
                raise BakingError("%s: error baking '%s'." %
                                  (page_rel_path, sub_uri)) from ex

            # Record what we did.
            sub_entry.flags |= SubPageBakeInfo.FLAG_BAKED
            sub_entry.render_info = rp.copyRenderInfo()

            # Copy page assets (only once, along with the first sub-page).
            if (cur_sub == 1 and self.copy_assets and
                    sub_entry.anyPass(lambda p: p.used_assets)):
                self._copyPageAssets(page, sub_uri, out_path)

            # Figure out if we have more work.
            has_more_subs = False
            if sub_entry.anyPass(lambda p: p.pagination_has_more):
                cur_sub += 1
                has_more_subs = True

        return sub_entries

    def _getOutputAssetsDir(self, sub_uri, out_path):
        """Return the directory in which to copy the assets of the page
        baked at `out_path`.

        With pretty URLs, and for the site's root page, this is the
        directory containing the output file. Otherwise it's a directory
        next to the output file, named like it but without the extension.
        """
        if self.pretty_urls:
            return os.path.dirname(out_path)

        out_dir, out_name = os.path.split(out_path)
        if sub_uri == self.site_root:
            return out_dir

        out_name_noext, _ = os.path.splitext(out_name)
        # `os.path.split` strips the trailing separator from the directory
        # part, so this must be a proper join, not a string concatenation.
        return os.path.join(out_dir, out_name_noext)

    def _copyPageAssets(self, page, sub_uri, out_path):
        """Copy the files of the page's assets directory to the output.

        The assets directory is the page's path without its extension,
        plus `ASSET_DIR_SUFFIX`. Only regular files directly inside it are
        copied; sub-directories are ignored.
        """
        out_assets_dir = self._getOutputAssetsDir(sub_uri, out_path)
        logger.debug("Copying page assets to: %s" % out_assets_dir)
        _ensure_dir_exists(out_assets_dir)

        page_pathname, _ = os.path.splitext(page.path)
        in_assets_dir = page_pathname + ASSET_DIR_SUFFIX
        for fn in os.listdir(in_assets_dir):
            # `os.listdir` only returns names, relative to the listed
            # directory, so they need to be joined with that directory.
            full_fn = os.path.join(in_assets_dir, fn)
            if os.path.isfile(full_fn):
                dest_ap = os.path.join(out_assets_dir, fn)
                logger.debug("  %s -> %s" % (full_fn, dest_ap))
                shutil.copy(full_fn, dest_ap)

    def _bakeSingle(self, qualified_page, num, out_path, tax_info=None):
        """Render sub-page `num` of the given page and write the result,
        encoded as UTF-8, to `out_path`.

        If `tax_info` is given, a taxonomy filter for its taxonomy and
        term is set on the rendering context first.

        Returns the object returned by `render_page`.
        """
        ctx = PageRenderingContext(qualified_page, page_num=num)
        if tax_info:
            tax = self.app.getTaxonomy(tax_info.taxonomy_name)
            ctx.setTaxonomyFilter(tax, tax_info.term)

        rp = render_page(ctx)

        out_dir = os.path.dirname(out_path)
        _ensure_dir_exists(out_dir)

        with codecs.open(out_path, 'w', 'utf8') as fp:
            fp.write(rp.content)

        return rp


def _compute_force_flags(prev_sub_entry, sub_entry, dirty_source_names):
    """Figure out whether a sub-page must be baked regardless of how
    recent its output file is.

    The decision is based on the sub-entry from the previous bake record,
    if any. When a bake is forced, the reason is also recorded in the
    flags of `sub_entry`.

    Returns a `(force_this_sub, invalidate_formatting)` tuple of booleans.
    """
    uri = sub_entry.out_uri

    if not prev_sub_entry:
        # No previous record. We'll have to bake it.
        logger.debug("No previous record entry found for '%s'. Will "
                     "force bake it." % uri)
        sub_entry.flags |= SubPageBakeInfo.FLAG_FORCED_BY_NO_PREVIOUS
        return True, False

    if prev_sub_entry.was_baked_successfully or prev_sub_entry.was_clean:
        # If the current page is known to use pages from other sources,
        # see if any of those got baked, or are going to be baked for
        # some reason. If so, we need to bake this one too.
        # (this happens for instance with the main page of a blog).
        dirty_names, dirty_passes = _get_dirty_source_names_and_render_passes(
                prev_sub_entry, dirty_source_names)
        if not dirty_passes:
            return False, False

        logger.debug(
                "'%s' is known to use sources %s, which have "
                "items that got (re)baked. Will force bake this "
                "page. " % (uri, dirty_names))
        sub_entry.flags |= SubPageBakeInfo.FLAG_FORCED_BY_SOURCE

        formatting_is_stale = PASS_FORMATTING in dirty_passes
        if formatting_is_stale:
            logger.debug(
                    "Will invalidate cached formatting for '%s' "
                    "since sources were using during that pass."
                    % uri)
        return True, formatting_is_stale

    if prev_sub_entry.errors:
        # Previous bake failed. We'll have to bake it again.
        logger.debug(
                "Previous record entry indicates baking failed for "
                "'%s'. Will bake it again." % uri)
        sub_entry.flags |= SubPageBakeInfo.FLAG_FORCED_BY_PREVIOUS_ERRORS
        return True, False

    return False, False


def _get_dirty_source_names_and_render_passes(sub_entry, dirty_source_names):
    """Find which render passes of `sub_entry` used a dirty source.

    For each render pass in `sub_entry.render_info`, only the first used
    source found in `dirty_source_names` is reported, since one is enough
    to invalidate the pass.

    Returns a `(dirty source names, invalidated render passes)` tuple of
    sets.
    """
    assert sub_entry.render_info is not None

    not_found = object()
    dirty_names = set()
    invalidated_passes = set()
    for pass_id, pass_info in sub_entry.render_info.items():
        first_dirty = next(
                (name for name in pass_info.used_source_names
                 if name in dirty_source_names),
                not_found)
        if first_dirty is not_found:
            continue
        invalidated_passes.add(pass_id)
        dirty_names.add(first_dirty)
    return dirty_names, invalidated_passes


def _ensure_dir_exists(path):
    """Create the directory `path`, and any missing parents, if needed.

    Errors are deliberately ignored: in a multiprocess environment,
    several processes may very occasionally try to create the same
    directory at the same time. If something is really wrong (like file
    access permissions or whatever), it will more legitimately fail right
    after this, when we try to write files.
    """
    try:
        os.makedirs(path, 0o755, True)
    except OSError:
        return