changeset 871:504ddb370df8

refactor: Fix several issues with baking page assets.
author Ludovic Chabant <ludovic@chabant.com>
date Tue, 13 Jun 2017 22:30:27 -0700
parents 48d25fd68b8d
children 3004ab31bb46
files piecrust/appconfigdefaults.py piecrust/baking/baker.py piecrust/baking/worker.py piecrust/data/assetor.py piecrust/pipelines/_pagebaker.py piecrust/pipelines/_pagerecords.py piecrust/pipelines/page.py piecrust/sources/base.py piecrust/sources/blogarchives.py piecrust/sources/taxonomy.py
diffstat 10 files changed, 151 insertions(+), 142 deletions(-)
--- a/piecrust/appconfigdefaults.py	Mon Jun 12 22:30:06 2017 -0700
+++ b/piecrust/appconfigdefaults.py	Tue Jun 13 22:30:27 2017 -0700
@@ -24,7 +24,6 @@
         'default_auto_format': 'md',
         'default_pagination_source': None,
         'pagination_suffix': '/%num%',
-        'asset_url_format': '%uri%',
         'slugify_mode': 'encode',
         'themes_sources': [DEFAULT_THEME_SOURCE],
         'use_default_content': True,
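
The default 'asset_url_format' entry is removed from the config defaults above; during a bake, the baker and its workers now set 'site/asset_url_format' to '%page_uri%/%filename%' explicitly (see the baker.py and worker.py hunks below). A minimal sketch of how such a format expands into an asset URL via multi_replace from piecrust.uriutil, the same helper the assetor uses; the token values here are made up:

from piecrust.uriutil import multi_replace

# Illustrative tokens for a page at '/blog/my-post' with a 'photo.jpg' asset.
uri_build_tokens = {
    '%page_uri%': '/blog/my-post',
    '%filename%': 'photo.jpg',
}
asset_url_format = '%page_uri%/%filename%'
print(multi_replace(asset_url_format, uri_build_tokens))
# -> /blog/my-post/photo.jpg
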
--- a/piecrust/baking/baker.py	Mon Jun 12 22:30:06 2017 -0700
+++ b/piecrust/baking/baker.py	Tue Jun 13 22:30:27 2017 -0700
@@ -11,7 +11,7 @@
 from piecrust.pipelines.records import (
     MultiRecordHistory, MultiRecord, RecordEntry,
     load_records)
-from piecrust.sources.base import REALM_USER, REALM_THEME
+from piecrust.sources.base import REALM_USER, REALM_THEME, REALM_NAMES
 
 
 logger = logging.getLogger(__name__)
@@ -42,7 +42,7 @@
 
         # Get into bake mode.
         self.app.config.set('baker/is_baking', True)
-        self.app.config.set('site/base_asset_url_format', '%uri')
+        self.app.config.set('site/asset_url_format', '%page_uri%/%filename%')
 
         # Make sure the output directory exists.
         if not os.path.isdir(self.out_dir):
@@ -119,13 +119,14 @@
                 ppinfo.pipeline.source.config['realm'], [])
             pplist.append(ppinfo)
 
-        for pp_pass in sorted(pp_by_pass_and_realm.keys()):
-            logger.debug("Pipelines pass %d" % pp_pass)
-            pp_by_realm = pp_by_pass_and_realm[pp_pass]
+        for pp_pass_num in sorted(pp_by_pass_and_realm.keys()):
+            logger.debug("Pipelines pass %d" % pp_pass_num)
+            pp_by_realm = pp_by_pass_and_realm[pp_pass_num]
             for realm in realm_list:
                 pplist = pp_by_realm.get(realm)
                 if pplist is not None:
-                    self._bakeRealm(pool, pp_pass, record_histories, pplist)
+                    self._bakeRealm(
+                        pool, record_histories, pp_pass_num, realm, pplist)
 
         # Handle deletions, collapse records, etc.
         ppmngr.postJobRun()
@@ -213,59 +214,75 @@
                 start_time, "cache is assumed valid", colored=False))
             return True
 
-    def _bakeRealm(self, pool, pppass, record_histories, pplist):
+    def _bakeRealm(self, pool, record_histories, pp_pass_num, realm, pplist):
         # Start with the first pass, where we iterate on the content sources'
         # items and run jobs on those.
         pool.userdata.cur_pass = 0
         next_pass_jobs = {}
         pool.userdata.next_pass_jobs = next_pass_jobs
-        queued_any_job = False
+
+        start_time = time.perf_counter()
+        job_count = 0
+        realm_name = REALM_NAMES[realm].lower()
+
         for ppinfo in pplist:
             src = ppinfo.source
             pp = ppinfo.pipeline
 
             logger.debug(
-                "Queuing jobs for source '%s' using pipeline '%s' (pass 0)." %
-                (src.name, pp.PIPELINE_NAME))
+                "Queuing jobs for source '%s' using pipeline '%s' "
+                "(%s, pass 0)." %
+                (src.name, pp.PIPELINE_NAME, realm_name))
 
             next_pass_jobs[src.name] = []
-            jcctx = PipelineJobCreateContext(pppass, record_histories)
+            jcctx = PipelineJobCreateContext(pp_pass_num, record_histories)
             jobs = pp.createJobs(jcctx)
             if jobs is not None:
+                job_count += len(jobs)
                 pool.queueJobs(jobs)
-                queued_any_job = True
 
-        if not queued_any_job:
+        if job_count == 0:
             logger.debug("No jobs queued! Bailing out of this bake pass.")
             return
 
         pool.wait()
 
+        logger.info(format_timed(
+            start_time, "%d pipeline jobs completed (%s, pass 0)." %
+            (job_count, realm_name)))
+
         # Now let's see if any job created a follow-up job. Let's keep
         # processing those jobs as long as they create new ones.
         pool.userdata.cur_pass = 1
         while True:
-            had_any_job = False
-
             # Make a copy of our next pass jobs and reset the list, so
             # the first jobs to be processed don't mess it up as we're
             # still iterating on it.
             next_pass_jobs = pool.userdata.next_pass_jobs
             pool.userdata.next_pass_jobs = {}
 
+            start_time = time.perf_counter()
+            job_count = 0
+
             for sn, jobs in next_pass_jobs.items():
                 if jobs:
                     logger.debug(
-                        "Queuing jobs for source '%s' (pass %d)." %
-                        (sn, pool.userdata.cur_pass))
+                        "Queuing jobs for source '%s' (%s, pass %d)." %
+                        (sn, realm_name, pool.userdata.cur_pass))
+                    job_count += len(jobs)
                     pool.userdata.next_pass_jobs[sn] = []
                     pool.queueJobs(jobs)
-                    had_any_job = True
 
-            if not had_any_job:
+            if job_count == 0:
                 break
 
             pool.wait()
+
+            logger.info(format_timed(
+                start_time,
+                "%d pipeline jobs completed (%s, pass %d)." %
+                (job_count, realm_name, pool.userdata.cur_pass)))
+
             pool.userdata.cur_pass += 1
 
     def _logErrors(self, item_spec, errors):
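
_bakeRealm now counts the jobs it queues and logs a timed, per-realm, per-pass summary instead of only tracking whether any job was queued. A minimal sketch of that timing pattern; format_timed below is a stand-in for PieCrust's own helper (the real one takes more options, e.g. the colored=False seen above):

import time

def format_timed(start_time, message):
    # Stand-in for PieCrust's format_timed: append the elapsed time to a message.
    elapsed = time.perf_counter() - start_time
    return '%s [%.2fs]' % (message, elapsed)

start_time = time.perf_counter()
job_count = 0
for jobs in ([1, 2, 3], [4, 5]):   # illustrative job batches
    job_count += len(jobs)
print(format_timed(
    start_time, "%d pipeline jobs completed (%s, pass 0)." %
    (job_count, 'user')))
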
--- a/piecrust/baking/worker.py	Mon Jun 12 22:30:06 2017 -0700
+++ b/piecrust/baking/worker.py	Tue Jun 13 22:30:27 2017 -0700
@@ -37,7 +37,7 @@
         app = self.ctx.appfactory.create()
         app.config.set('baker/is_baking', True)
         app.config.set('baker/worker_id', self.wid)
-        app.config.set('site/base_asset_url_format', '%uri')
+        app.config.set('site/asset_url_format', '%page_uri%/%filename%')
 
         app.env.fs_cache_only_for_main_page = True
 
--- a/piecrust/data/assetor.py	Mon Jun 12 22:30:06 2017 -0700
+++ b/piecrust/data/assetor.py	Tue Jun 13 22:30:27 2017 -0700
@@ -1,9 +1,7 @@
 import os
 import os.path
-import shutil
 import logging
 import collections.abc
-from piecrust import ASSET_DIR_SUFFIX
 from piecrust.sources.base import REL_ASSETS
 from piecrust.uriutil import multi_replace
 
@@ -47,6 +45,10 @@
         self._cacheAssets()
         return len(self._cache_list)
 
+    def _getAssetItems(self):
+        self._cacheAssets()
+        return map(lambda i: i.content_item, self._cache_map.values())
+
     def _debugRenderAssetNames(self):
         self._cacheAssets()
         return list(self._cache_map.keys())
@@ -89,9 +91,9 @@
                     (name, content_item.spec))
 
             # TODO: this assumes a file-system source!
-            uri_build_tokens['%path%'] = (
-                os.path.relpath(a.spec, root_dir).replace('\\', '/'))
-            uri_build_tokens['%filename%'] = a.metadata['filename'],
+            uri_build_tokens['%path%'] = \
+                os.path.relpath(a.spec, root_dir).replace('\\', '/')
+            uri_build_tokens['%filename%'] = a.metadata['filename']
             uri = multi_replace(asset_url_format, uri_build_tokens)
 
             self._cache_map[name] = _AssetInfo(a, uri)
@@ -102,13 +104,3 @@
         if cur_ctx is not None:
             cur_ctx.current_pass_info.used_assets = True
 
-    def copyAssets(self, dest_dir):
-        page_pathname, _ = os.path.splitext(self._page.path)
-        in_assets_dir = page_pathname + ASSET_DIR_SUFFIX
-        for fn in os.listdir(in_assets_dir):
-            full_fn = os.path.join(in_assets_dir, fn)
-            if os.path.isfile(full_fn):
-                dest_ap = os.path.join(dest_dir, fn)
-                logger.debug("  %s -> %s" % (full_fn, dest_ap))
-                shutil.copy(full_fn, dest_ap)
-
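
Assetor.copyAssets() is removed; asset copying now lives in PageBaker, which pulls the underlying content items through the new _getAssetItems() accessor (see the _pagebaker.py hunk below). A small usage sketch, assuming a file-system source whose items expose the source file path as 'spec':

import os.path
import shutil

def copy_page_assets(assetor, out_assets_dir):
    # Mirrors the new copy loop in PageBaker.bake(); paths are illustrative.
    for item in assetor._getAssetItems():
        fn = os.path.basename(item.spec)
        shutil.copy(item.spec, os.path.join(out_assets_dir, fn))
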
--- a/piecrust/pipelines/_pagebaker.py	Mon Jun 12 22:30:06 2017 -0700
+++ b/piecrust/pipelines/_pagebaker.py	Tue Jun 13 22:30:27 2017 -0700
@@ -1,10 +1,11 @@
 import os.path
 import queue
+import shutil
 import logging
 import threading
 import urllib.parse
 from piecrust.pipelines._pagerecords import SubPagePipelineRecordEntry
-from piecrust.rendering import RenderingContext, render_page, PASS_FORMATTING
+from piecrust.rendering import RenderingContext, render_page
 from piecrust.uriutil import split_uri
 
 
@@ -16,16 +17,17 @@
 
 
 class PageBaker(object):
-    def __init__(self, app, out_dir, force=False, copy_assets=True):
+    def __init__(self, app, out_dir, force=False):
         self.app = app
         self.out_dir = out_dir
         self.force = force
-        self.copy_assets = copy_assets
         self.site_root = app.config.get('site/root')
         self.pretty_urls = app.config.get('site/pretty_urls')
+        self._do_write = self._writeDirect
         self._writer_queue = None
         self._writer = None
         self._stats = app.env.stats
+        self._rsr = app.env.rendered_segments_repository
 
     def startWriterQueue(self):
         self._writer_queue = queue.Queue()
@@ -34,11 +36,19 @@
             target=_text_writer,
             args=(self._writer_queue,))
         self._writer.start()
+        self._do_write = self._sendToWriterQueue
 
     def stopWriterQueue(self):
         self._writer_queue.put_nowait(None)
         self._writer.join()
 
+    def _sendToWriterQueue(self, out_path, content):
+        self._writer_queue.put_nowait((out_path, content))
+
+    def _writeDirect(self, out_path, content):
+        with open(out_path, 'w', encoding='utf8') as fp:
+            fp.write(content)
+
     def getOutputPath(self, uri, pretty_urls):
         uri_root, uri_path = split_uri(self.app, uri)
 
@@ -54,12 +64,12 @@
 
         return os.path.normpath(os.path.join(*bake_path))
 
-    def bake(self, page, prev_entry, cur_entry, dirty_source_names):
-        # Start baking the sub-pages.
+    def bake(self, page, prev_entry, cur_entry):
         cur_sub = 1
         has_more_subs = True
         pretty_urls = page.config.get('pretty_urls', self.pretty_urls)
 
+        # Start baking the sub-pages.
         while has_more_subs:
             sub_uri = page.getUri(sub_num=cur_sub)
             logger.debug("Baking '%s' [%d]..." % (sub_uri, cur_sub))
@@ -67,8 +77,8 @@
             out_path = self.getOutputPath(sub_uri, pretty_urls)
 
             # Create the sub-entry for the bake record.
-            sub_entry = SubPagePipelineRecordEntry(sub_uri, out_path)
-            cur_entry.subs.append(sub_entry)
+            cur_sub_entry = SubPagePipelineRecordEntry(sub_uri, out_path)
+            cur_entry.subs.append(cur_sub_entry)
 
             # Find a corresponding sub-entry in the previous bake record.
             prev_sub_entry = None
@@ -78,28 +88,15 @@
                 except IndexError:
                     pass
 
-            # Figure out if we need to invalidate or force anything.
-            force_this_sub, invalidate_formatting = _compute_force_flags(
-                prev_sub_entry, sub_entry, dirty_source_names)
-            force_this_sub = force_this_sub or self.force
-
-            # Check for up-to-date outputs.
-            do_bake = True
-            if not force_this_sub:
-                try:
-                    in_path_time = page.content_mtime
-                    out_path_time = os.path.getmtime(out_path)
-                    if out_path_time >= in_path_time:
-                        do_bake = False
-                except OSError:
-                    # File doesn't exist, we'll need to bake.
-                    pass
+            # Figure out if we need to bake this page.
+            bake_status = _get_bake_status(page, out_path, self.force,
+                                           prev_sub_entry, cur_sub_entry)
 
             # If this page didn't bake because it's already up-to-date,
             # keep trying for as many subs as we know this page has.
-            if not do_bake:
-                sub_entry.render_info = prev_sub_entry.copyRenderInfo()
-                sub_entry.flags = SubPagePipelineRecordEntry.FLAG_NONE
+            if bake_status == STATUS_CLEAN:
+                cur_sub_entry.render_info = prev_sub_entry.copyRenderInfo()
+                cur_sub_entry.flags = SubPagePipelineRecordEntry.FLAG_NONE
 
                 if prev_entry.num_subs >= cur_sub + 1:
                     cur_sub += 1
@@ -113,11 +110,10 @@
 
             # All good, proceed.
             try:
-                if invalidate_formatting:
+                if bake_status == STATUS_INVALIDATE_AND_BAKE:
                     cache_key = sub_uri
-                    self.app.env.rendered_segments_repository.invalidate(
-                        cache_key)
-                    sub_entry.flags |= \
+                    self._rsr.invalidate(cache_key)
+                    cur_sub_entry.flags |= \
                         SubPagePipelineRecordEntry.FLAG_FORMATTING_INVALIDATED
 
                 logger.debug("  p%d -> %s" % (cur_sub, out_path))
@@ -128,12 +124,12 @@
                                   (page.content_spec, sub_uri)) from ex
 
             # Record what we did.
-            sub_entry.flags |= SubPagePipelineRecordEntry.FLAG_BAKED
-            sub_entry.render_info = rp.copyRenderInfo()
+            cur_sub_entry.flags |= SubPagePipelineRecordEntry.FLAG_BAKED
+            cur_sub_entry.render_info = rp.copyRenderInfo()
 
             # Copy page assets.
-            if (cur_sub == 1 and self.copy_assets and
-                    sub_entry.anyPass(lambda p: p.used_assets)):
+            if (cur_sub == 1 and
+                    cur_sub_entry.anyPass(lambda p: p.used_assets)):
                 if pretty_urls:
                     out_assets_dir = os.path.dirname(out_path)
                 else:
@@ -145,11 +141,17 @@
 
                 logger.debug("Copying page assets to: %s" % out_assets_dir)
                 _ensure_dir_exists(out_assets_dir)
-                # TODO: copy assets to out dir
+                assetor = rp.data.get('assets')
+                if assetor is not None:
+                    for i in assetor._getAssetItems():
+                        fn = os.path.basename(i.spec)
+                        out_asset_path = os.path.join(out_assets_dir, fn)
+                        logger.debug("  %s -> %s" % (i.spec, out_asset_path))
+                        shutil.copy(i.spec, out_asset_path)
 
             # Figure out if we have more work.
             has_more_subs = False
-            if sub_entry.anyPass(lambda p: p.pagination_has_more):
+            if cur_sub_entry.anyPass(lambda p: p.pagination_has_more):
                 cur_sub += 1
                 has_more_subs = True
 
@@ -161,11 +163,7 @@
             rp = render_page(ctx)
 
         with self._stats.timerScope("PageSerialize"):
-            if self._writer_queue is not None:
-                self._writer_queue.put_nowait((out_path, rp.content))
-            else:
-                with open(out_path, 'w', encoding='utf8') as fp:
-                    fp.write(rp.content)
+            self._do_write(out_path, rp.content)
 
         return rp
 
@@ -188,68 +186,49 @@
             break
 
 
-def _compute_force_flags(prev_sub_entry, sub_entry, dirty_source_names):
-    # Figure out what to do with this page.
-    force_this_sub = False
-    invalidate_formatting = False
-    sub_uri = sub_entry.out_uri
-    if (prev_sub_entry and
-            (prev_sub_entry.was_baked_successfully or
-                prev_sub_entry.was_clean)):
-        # If the current page is known to use pages from other sources,
-        # see if any of those got baked, or are going to be baked for
-        # some reason. If so, we need to bake this one too.
-        # (this happens for instance with the main page of a blog).
-        dirty_for_this, invalidated_render_passes = (
-            _get_dirty_source_names_and_render_passes(
-                prev_sub_entry, dirty_source_names))
-        if len(invalidated_render_passes) > 0:
-            logger.debug(
-                "'%s' is known to use sources %s, which have "
-                "items that got (re)baked. Will force bake this "
-                "page. " % (sub_uri, dirty_for_this))
-            sub_entry.flags |= \
-                SubPagePipelineRecordEntry.FLAG_FORCED_BY_SOURCE
-            force_this_sub = True
-
-            if PASS_FORMATTING in invalidated_render_passes:
-                logger.debug(
-                    "Will invalidate cached formatting for '%s' "
-                    "since sources were using during that pass."
-                    % sub_uri)
-                invalidate_formatting = True
-    elif (prev_sub_entry and
-            prev_sub_entry.errors):
-        # Previous bake failed. We'll have to bake it again.
-        logger.debug(
-            "Previous record entry indicates baking failed for "
-            "'%s'. Will bake it again." % sub_uri)
-        sub_entry.flags |= \
-            SubPagePipelineRecordEntry.FLAG_FORCED_BY_PREVIOUS_ERRORS
-        force_this_sub = True
-    elif not prev_sub_entry:
-        # No previous record. We'll have to bake it.
-        logger.debug("No previous record entry found for '%s'. Will "
-                     "force bake it." % sub_uri)
-        sub_entry.flags |= \
-            SubPagePipelineRecordEntry.FLAG_FORCED_BY_NO_PREVIOUS
-        force_this_sub = True
-
-    return force_this_sub, invalidate_formatting
+STATUS_CLEAN = 0
+STATUS_BAKE = 1
+STATUS_INVALIDATE_AND_BAKE = 2
 
 
-def _get_dirty_source_names_and_render_passes(sub_entry, dirty_source_names):
-    dirty_for_this = set()
-    invalidated_render_passes = set()
-    for p, pinfo in enumerate(sub_entry.render_info):
-        if pinfo:
-            for src_name in pinfo.used_source_names:
-                is_dirty = (src_name in dirty_source_names)
-                if is_dirty:
-                    invalidated_render_passes.add(p)
-                    dirty_for_this.add(src_name)
-                    break
-    return dirty_for_this, invalidated_render_passes
+def _get_bake_status(page, out_path, force, prev_sub_entry, cur_sub_entry):
+    # Figure out if we need to invalidate or force anything.
+    status = _compute_force_flags(prev_sub_entry, cur_sub_entry)
+    if status != STATUS_CLEAN:
+        return status
+
+    # Easy test.
+    if force:
+        return STATUS_BAKE
+
+    # Check for up-to-date outputs.
+    in_path_time = page.content_mtime
+    try:
+        out_path_time = os.path.getmtime(out_path)
+    except OSError:
+        # File doesn't exist, we'll need to bake.
+        return STATUS_BAKE
+
+    if out_path_time <= in_path_time:
+        return STATUS_BAKE
+
+    # Nope, all good.
+    return STATUS_CLEAN
+
+
+def _compute_force_flags(prev_sub_entry, cur_sub_entry):
+    if prev_sub_entry and prev_sub_entry.errors:
+        # Previous bake failed. We'll have to bake it again.
+        cur_sub_entry.flags |= \
+            SubPagePipelineRecordEntry.FLAG_FORCED_BY_PREVIOUS_ERRORS
+        return STATUS_BAKE
+
+    if not prev_sub_entry:
+        cur_sub_entry.flags |= \
+            SubPagePipelineRecordEntry.FLAG_FORCED_BY_NO_PREVIOUS
+        return STATUS_BAKE
+
+    return STATUS_CLEAN
 
 
 def _ensure_dir_exists(path):
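
PageBaker now chooses its write strategy once: _do_write starts as the direct file write and is swapped for the queue-based write when startWriterQueue() is called, so the per-page serialization path no longer branches on whether a writer queue exists. A self-contained sketch of that dispatch pattern; the method names mirror PageBaker's, but the surrounding class is illustrative:

import queue
import threading

class ContentWriter:
    # Illustrative stand-in for PageBaker's write path.
    def __init__(self):
        self._do_write = self._writeDirect
        self._queue = None
        self._thread = None

    def startWriterQueue(self):
        self._queue = queue.Queue()
        self._thread = threading.Thread(target=self._drainQueue)
        self._thread.start()
        self._do_write = self._sendToWriterQueue

    def stopWriterQueue(self):
        self._queue.put_nowait(None)
        self._thread.join()

    def write(self, out_path, content):
        self._do_write(out_path, content)

    def _sendToWriterQueue(self, out_path, content):
        self._queue.put_nowait((out_path, content))

    def _writeDirect(self, out_path, content):
        with open(out_path, 'w', encoding='utf8') as fp:
            fp.write(content)

    def _drainQueue(self):
        # Background thread: write queued pages until the None sentinel arrives.
        while True:
            item = self._queue.get()
            if item is None:
                break
            self._writeDirect(*item)
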
--- a/piecrust/pipelines/_pagerecords.py	Mon Jun 12 22:30:06 2017 -0700
+++ b/piecrust/pipelines/_pagerecords.py	Tue Jun 13 22:30:27 2017 -0700
@@ -50,6 +50,7 @@
         super().__init__()
         self.flags = self.FLAG_NONE
         self.config = None
+        self.timestamp = None
         self.subs = []
 
     @property
@@ -108,7 +109,11 @@
                 'URI': sub.out_uri,
                 'Path': sub.out_path,
                 'Flags': get_flag_descriptions(
-                    sub.flags, sub_flag_descriptions)
+                    sub.flags, sub_flag_descriptions),
+                'RenderInfo': [
+                    _describe_render_info(sub.render_info[0]),
+                    _describe_render_info(sub.render_info[1])
+                ]
             }
         return d
 
@@ -129,3 +134,12 @@
     SubPagePipelineRecordEntry.FLAG_FORMATTING_INVALIDATED:
     'formatting invalidated'
 }
+
+
+def _describe_render_info(ri):
+    return {
+        'UsedPagination': ri.used_pagination,
+        'PaginationHasMore': ri.pagination_has_more,
+        'UsedAssets': ri.used_assets,
+        'UsedSourceNames': ri.used_source_names
+    }
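
With _describe_render_info in place, a sub-page's entry in the record's describe output also reports what each rendering pass used. A hypothetical example of the resulting dict for a paginated sub-page; every value here is made up, including the 'Flags' string:

{
    'URI': 'blog/2',
    'Path': 'blog/2/index.html',
    'Flags': 'baked',
    'RenderInfo': [
        {'UsedPagination': True, 'PaginationHasMore': True,
         'UsedAssets': False, 'UsedSourceNames': {'posts'}},
        {'UsedPagination': False, 'PaginationHasMore': False,
         'UsedAssets': False, 'UsedSourceNames': set()},
    ],
}
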
--- a/piecrust/pipelines/page.py	Mon Jun 12 22:30:06 2017 -0700
+++ b/piecrust/pipelines/page.py	Tue Jun 13 22:30:27 2017 -0700
@@ -70,6 +70,7 @@
 
         page = self.app.getPage(self.source, content_item)
         record_entry.config = page.config.getAll()
+        record_entry.timestamp = page.datetime.timestamp()
 
         rdrctx = RenderingContext(page)
         self.app.env.abort_source_use = True
@@ -93,4 +94,4 @@
         page = self.app.getPage(self.source, content_item)
         prev_entry = ctx.previous_entry
         cur_entry = result.record_entry
-        self._pagebaker.bake(page, prev_entry, cur_entry, [])
+        self._pagebaker.bake(page, prev_entry, cur_entry)
--- a/piecrust/sources/base.py	Mon Jun 12 22:30:06 2017 -0700
+++ b/piecrust/sources/base.py	Tue Jun 13 22:30:27 2017 -0700
@@ -83,6 +83,7 @@
         self.app = app
         self.name = name
         self.config = config or {}
+        self._cache = None
 
     @property
     def is_theme_source(self):
@@ -99,6 +100,10 @@
         raise NotImplementedError()
 
     def getAllContents(self):
+        if self._cache is not None:
+            return self._cache
+
+        cache = []
         stack = collections.deque()
         stack.append(None)
         while len(stack) > 0:
@@ -112,7 +117,9 @@
                     if c.is_group:
                         stack.append(c)
                     else:
-                        yield c
+                        cache.append(c)
+        self._cache = cache
+        return cache
 
     def getContents(self, group):
         raise NotImplementedError("'%s' doesn't implement 'getContents'." %
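
getAllContents() was a generator and is now backed by a cached flat list, so repeated calls return the same items without re-walking the source, and callers are no longer bitten by generators being consumable only once. A minimal, PieCrust-independent illustration of that difference:

def walk():
    yield from ('a', 'b', 'c')

items = walk()
assert list(items) == ['a', 'b', 'c']
assert list(items) == []                # a generator is exhausted after one pass

cached = list(walk())
assert list(cached) == ['a', 'b', 'c']
assert list(cached) == ['a', 'b', 'c']  # a cached list can be iterated again
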
--- a/piecrust/sources/blogarchives.py	Mon Jun 12 22:30:06 2017 -0700
+++ b/piecrust/sources/blogarchives.py	Tue Jun 13 22:30:27 2017 -0700
@@ -188,7 +188,7 @@
         prev_entry = ctx.previous_entry
         cur_entry = result.record_entry
         cur_entry.year = job.content_item.metadata['route_params']['year']
-        self._pagebaker.bake(page, prev_entry, cur_entry, [])
+        self._pagebaker.bake(page, prev_entry, cur_entry)
 
     def postJobRun(self, ctx):
         # Create bake entries for the years that were *not* dirty.
--- a/piecrust/sources/taxonomy.py	Mon Jun 12 22:30:06 2017 -0700
+++ b/piecrust/sources/taxonomy.py	Tue Jun 13 22:30:27 2017 -0700
@@ -294,7 +294,7 @@
         prev_entry = ctx.previous_entry
         cur_entry = result.record_entry
         cur_entry.term = content_item.metadata['term']
-        self._pagebaker.bake(page, prev_entry, cur_entry, [])
+        self._pagebaker.bake(page, prev_entry, cur_entry)
 
     def postJobRun(self, ctx):
         # We create bake entries for all the terms that were *not* dirty.