changeset 91:e88e330eb8dc

Improvements to incremental baking and cache invalidating.
author Ludovic Chabant <ludovic@chabant.com>
date Fri, 05 Sep 2014 00:42:13 -0700
parents e293f08d954e
children 0dd43c5f5484
files piecrust/baking/baker.py piecrust/baking/records.py piecrust/records.py
diffstat 3 files changed, 124 insertions(+), 76 deletions(-) [+]
line wrap: on
line diff
--- a/piecrust/baking/baker.py	Fri Sep 05 00:40:45 2014 -0700
+++ b/piecrust/baking/baker.py	Fri Sep 05 00:42:13 2014 -0700
@@ -6,7 +6,9 @@
 import logging
 import threading
 import urllib.request, urllib.error, urllib.parse
-from piecrust.baking.records import TransitionalBakeRecord, BakeRecordPageEntry
+from piecrust.baking.records import (TransitionalBakeRecord,
+        BakeRecordPageEntry,
+        FLAG_OVERRIDEN, FLAG_SOURCE_MODIFIED)
 from piecrust.chefutil import format_timed, log_friendly_exception
 from piecrust.data.filters import (PaginationFilter, HasFilterClause,
         IsFilterClause, AndBooleanClause)
@@ -76,7 +78,8 @@
 
         return os.path.normpath(os.path.join(*bake_path))
 
-    def bake(self, factory, route, taxonomy_name=None, taxonomy_term=None):
+    def bake(self, factory, route, record_entry,
+            taxonomy_name=None, taxonomy_term=None):
         pagination_filter = None
         custom_data = None
         if taxonomy_name and taxonomy_term:
@@ -117,39 +120,52 @@
             logger.debug("'%s' [%s] is overriden by '%s:%s'. Skipping" %
                     (factory.ref_spec, uri, override.source_name,
                         override.rel_path))
-            entry = BakeRecordPageEntry()
-            entry.path = factory.path
-            entry.rel_path = factory.rel_path
-            entry.source_name = factory.source.name
-            entry.was_overriden = True
-
-            if self.record:
-                self.record.addEntry(entry)
-
-            return entry
+            record_entry.flags |= FLAG_OVERRIDEN
+            return
 
         cur_sub = 1
         has_more_subs = True
+        force_this = self.force
         page = factory.buildPage()
-        cur_record_entry = BakeRecordPageEntry(page)
-        cur_record_entry.taxonomy_name = taxonomy_name
-        cur_record_entry.taxonomy_term = taxonomy_term
+        record_entry.config = page.config.get().copy()
         prev_record_entry = self.record.getPreviousEntry(
                 factory.source.name, factory.rel_path,
                 taxonomy_name, taxonomy_term)
 
         logger.debug("Baking '%s'..." % uri)
+
+        # If the current page is known to use pages from other sources,
+        # see if any of those got baked, or are going to be baked for some
+        # reason. If so, we need to bake this one too.
+        # (this happens for instance with the main page of a blog).
+        if prev_record_entry:
+            any_used_src_baked = False
+            used_src_names = list(prev_record_entry.used_source_names)
+            for src_name in used_src_names:
+                entries = self.record.getCurrentEntries(src_name)
+                for e in entries:
+                    if e.was_baked or e.flags & FLAG_SOURCE_MODIFIED:
+                        any_used_src_baked = True
+                        break
+                if any_used_src_baked:
+                    break
+            if any_used_src_baked:
+                logger.debug("'%s' is known to use sources %s, at least one "
+                             "of which got baked. Will force bake this page. "
+                             % (uri, used_src_names))
+                force_this = True
+
         while has_more_subs:
             sub_uri = self.getOutputUri(uri, cur_sub)
             out_path = self.getOutputPath(sub_uri)
 
             # Check for up-to-date outputs.
             do_bake = True
-            if not self.force and prev_record_entry:
+            if not force_this and prev_record_entry:
                 try:
-                    in_path_time = page.path_mtime
+                    in_path_time = record_entry.path_mtime
                     out_path_time = os.path.getmtime(out_path)
-                    if out_path_time > in_path_time:
+                    if out_path_time > in_path_time and not any_used_src_baked:
                         do_bake = False
                 except OSError:
                     # File doesn't exist, we'll need to bake.
@@ -202,23 +218,18 @@
                     shutil.copy(ap, dest_ap)
 
             # Record what we did and figure out if we have more work.
-            cur_record_entry.out_uris.append(sub_uri)
-            cur_record_entry.out_paths.append(out_path)
-            cur_record_entry.used_source_names |= ctx.used_source_names
-            cur_record_entry.used_taxonomy_terms |= ctx.used_taxonomy_terms
+            record_entry.out_uris.append(sub_uri)
+            record_entry.out_paths.append(out_path)
+            record_entry.used_source_names |= ctx.used_source_names
+            record_entry.used_taxonomy_terms |= ctx.used_taxonomy_terms
 
             has_more_subs = False
             if ctx.used_pagination is not None:
-                cur_record_entry.addUsedSource(ctx.used_pagination._source)
+                record_entry.addUsedSource(ctx.used_pagination._source)
                 if ctx.used_pagination.has_more:
                     cur_sub += 1
                     has_more_subs = True
 
-        if self.record:
-            self.record.addEntry(cur_record_entry)
-
-        return cur_record_entry
-
     def _bakeSingle(self, page, sub_uri, num, out_path,
             pagination_filter=None, custom_data=None):
         ctx = PageRenderingContext(page, sub_uri)
@@ -314,6 +325,7 @@
         # Save the bake record.
         t = time.clock()
         record.current.bake_time = time.time()
+        record.current.out_dir = self.out_dir
         record.collapseRecords()
         record.saveCurrent(record_cache.getCachePath(record_name))
         logger.debug(format_timed(t, 'saved bake record', colored=False))
@@ -333,7 +345,8 @@
             # The configuration file was changed, or we're running a new
             # version of the app.
             reason = "not valid anymore"
-        elif not record.previous.bake_time:
+        elif (not record.previous.bake_time or
+                not record.previous.hasLatestVersion()):
             # We have no valid previous bake record.
             reason = "need bake record regeneration"
         else:
@@ -356,9 +369,11 @@
                 logger.debug("Cleaning baker cache: %s" % cache_dir)
                 shutil.rmtree(cache_dir)
             self.force = True
+            record.incremental_count = 0
             logger.info(format_timed(start_time,
                 "cleaned cache (reason: %s)" % reason))
         else:
+            record.incremental_count += 1
             logger.debug(format_timed(start_time, "cache is assumed valid",
                 colored=False))
 
@@ -381,8 +396,9 @@
                     logger.error("Can't get route for page: %s" % fac.ref_spec)
                     continue
 
-                logger.debug("Queuing: %s" % fac.ref_spec)
-                queue.addJob(BakeWorkerJob(fac, route))
+                entry = BakeRecordPageEntry(fac)
+                record.addEntry(entry)
+                queue.addJob(BakeWorkerJob(fac, route, entry))
 
         self._waitOnWorkerPool(pool, abort)
 
@@ -469,8 +485,10 @@
                             {tax.term_name: term})
                     logger.debug("Queuing: %s [%s, %s]" %
                             (fac.ref_spec, tax_name, term))
+                    entry = BakeRecordPageEntry(fac, tax_name, term)
+                    record.addEntry(entry)
                     queue.addJob(
-                            BakeWorkerJob(fac, route, tax_name, term))
+                            BakeWorkerJob(fac, route, entry, tax_name, term))
 
         self._waitOnWorkerPool(pool, abort)
 
@@ -527,32 +545,34 @@
         self._done_event = threading.Event()
 
     def addJob(self, job):
-        logger.debug("Adding job '%s:%s' to scheduler." % (
+        logger.debug("Queuing job '%s:%s'." % (
             job.factory.source.name, job.factory.rel_path))
         with self._lock:
             self.jobs.append(job)
         self._added_event.set()
 
     def onJobFinished(self, job):
-        logger.debug("Removing job '%s:%s' from scheduler." % (
+        logger.debug("Removing job '%s:%s'." % (
             job.factory.source.name, job.factory.rel_path))
         with self._lock:
             self._active_jobs.remove(job)
         self._done_event.set()
 
-    def getNextJob(self, timeout=None):
+    def getNextJob(self, wait_timeout=None, empty_timeout=None):
         self._added_event.clear()
         self._done_event.clear()
         job = self._doGetNextJob()
         while job in (self._EMPTY, self._WAIT):
-            if timeout is None:
-                return None
             if job == self._EMPTY:
+                if empty_timeout is None:
+                    return None
                 logger.debug("Waiting for a new job to be added...")
-                res = self._added_event.wait(timeout)
+                res = self._added_event.wait(empty_timeout)
             elif job == self._WAIT:
+                if wait_timeout is None:
+                    return None
                 logger.debug("Waiting for a job to be finished...")
-                res = self._done_event.wait(timeout)
+                res = self._done_event.wait(wait_timeout)
             if not res:
                 logger.debug("Timed-out. No job found.")
                 return None
@@ -573,6 +593,7 @@
                 job = self.jobs.pop(0)
                 if job == first_job:
                     # None of the jobs are ready... we need to wait.
+                    self.jobs.append(job)
                     return self._WAIT
 
             logger.debug("Job '%s:%s' is ready to go, moving to active "
@@ -586,6 +607,8 @@
         if not e:
             return True
         for sn in e.used_source_names:
+            if sn == job.factory.source.name:
+                continue
             if any(filter(lambda j: j.factory.source.name == sn, self.jobs)):
                 return False
             if any(filter(lambda j: j.factory.source.name == sn,
@@ -606,9 +629,11 @@
 
 
 class BakeWorkerJob(object):
-    def __init__(self, factory, route, taxonomy_name=None, taxonomy_term=None):
+    def __init__(self, factory, route, record_entry,
+            taxonomy_name=None, taxonomy_term=None):
         self.factory = factory
         self.route = route
+        self.record_entry = record_entry
         self.taxonomy_name = taxonomy_name
         self.taxonomy_term = taxonomy_term
 
@@ -628,7 +653,7 @@
 
     def run(self):
         while(not self.ctx.abort_event.is_set()):
-            job = self.ctx.work_queue.getNextJob()
+            job = self.ctx.work_queue.getNextJob(wait_timeout=1)
             if job is None:
                 logger.debug("[%d] No more work... shutting down." % self.wid)
                 break
@@ -648,16 +673,17 @@
     def _unsafeRun(self, job):
         start_time = time.clock()
 
-        bake_res = self._page_baker.bake(job.factory, job.route,
+        entry = job.record_entry
+        self._page_baker.bake(job.factory, job.route, entry,
                 taxonomy_name=job.taxonomy_name,
                 taxonomy_term=job.taxonomy_term)
 
-        if bake_res.was_baked:
-            uri = bake_res.out_uris[0]
+        if entry.was_baked:
+            uri = entry.out_uris[0]
             friendly_uri = uri if uri != '' else '[main page]'
             friendly_count = ''
-            if bake_res.num_subs > 1:
-                friendly_count = ' (%d pages)' % bake_res.num_subs
+            if entry.num_subs > 1:
+                friendly_count = ' (%d pages)' % entry.num_subs
             logger.info(format_timed(start_time, '[%d] %s%s' %
                     (self.wid, friendly_uri, friendly_count)))
 
--- a/piecrust/baking/records.py	Fri Sep 05 00:40:45 2014 -0700
+++ b/piecrust/baking/records.py	Fri Sep 05 00:42:13 2014 -0700
@@ -1,4 +1,6 @@
+import os.path
 import logging
+from piecrust import APP_VERSION
 from piecrust.sources.base import PageSource
 from piecrust.records import Record
 
@@ -6,6 +8,9 @@
 logger = logging.getLogger(__name__)
 
 
+RECORD_VERSION = 4
+
+
 def _get_transition_key(source_name, rel_path, taxonomy_name=None,
         taxonomy_term=None):
     key = '%s:%s' % (source_name, rel_path)
@@ -19,34 +24,44 @@
 
 
 class BakeRecord(Record):
-    VERSION = 1
-
     def __init__(self):
         super(BakeRecord, self).__init__()
         self.out_dir = None
         self.bake_time = None
+        self.app_version = APP_VERSION
+        self.record_version = RECORD_VERSION
+
+    def hasLatestVersion(self):
+        return (self.app_version == APP_VERSION and
+                self.record_version == RECORD_VERSION)
+
+    def __setstate__(self, state):
+        state.setdefault('app_version', -1)
+        state.setdefault('record_version', -1)
+        super(BakeRecord, self).__setstate__(state)
+
+
+FLAG_NONE = 0
+FLAG_SOURCE_MODIFIED = 2**0
+FLAG_OVERRIDEN = 2**1
 
 
 class BakeRecordPageEntry(object):
-    def __init__(self, page=None):
-        self.path = None
-        self.rel_path = None
-        self.source_name = None
+    def __init__(self, factory, taxonomy_name=None, taxonomy_term=None):
+        self.path = factory.path
+        self.rel_path = factory.rel_path
+        self.source_name = factory.source.name
+        self.taxonomy_name = taxonomy_name
+        self.taxonomy_term = taxonomy_term
+        self.path_mtime = os.path.getmtime(factory.path)
+
+        self.flags = FLAG_NONE
         self.config = None
-        self.taxonomy_name = None
-        self.taxonomy_term = None
-        self.was_overriden = False
         self.out_uris = []
         self.out_paths = []
         self.used_source_names = set()
         self.used_taxonomy_terms = set()
 
-        if page:
-            self.path = page.path
-            self.rel_path = page.rel_path
-            self.source_name = page.source.name
-            self.config = page.config.get()
-
     @property
     def was_baked(self):
         return len(self.out_paths) > 0
@@ -64,6 +79,11 @@
         if isinstance(source, PageSource):
             self.used_source_names.add(source.name)
 
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        del state['path_mtime']
+        return state
+
 
 class TransitionalBakeRecord(object):
     DELETION_MISSING = 1
@@ -73,6 +93,7 @@
         self.previous = BakeRecord()
         self.current = BakeRecord()
         self.transitions = {}
+        self.incremental_count = 0
         if previous_path:
             self.loadPrevious(previous_path)
         self.current.entry_added += self._onCurrentEntryAdded
@@ -93,6 +114,9 @@
         self.current.save(current_path)
 
     def addEntry(self, entry):
+        if (self.previous.bake_time and
+                entry.path_mtime >= self.previous.bake_time):
+            entry.flags |= FLAG_SOURCE_MODIFIED
         self.current.addEntry(entry)
 
     def getOverrideEntry(self, factory, uri):
@@ -120,6 +144,10 @@
             return pair[0]
         return None
 
+    def getCurrentEntries(self, source_name):
+        return [e for e in self.current.entries
+                if e.source_name == source_name]
+
     def collapseRecords(self):
         for pair in self.transitions.values():
             prev = pair[0]
@@ -129,7 +157,9 @@
                 # This page wasn't baked, so the information from last
                 # time is still valid (we didn't get any information
                 # since we didn't bake).
-                cur.was_overriden = prev.was_overriden
+                cur.flags = prev.flags
+                if prev.config:
+                    cur.config = prev.config.copy()
                 cur.out_uris = list(prev.out_uris)
                 cur.out_paths = list(prev.out_paths)
                 cur.used_source_names = set(prev.used_source_names)
--- a/piecrust/records.py	Fri Sep 05 00:40:45 2014 -0700
+++ b/piecrust/records.py	Fri Sep 05 00:42:13 2014 -0700
@@ -1,31 +1,18 @@
 import os
 import os.path
+import pickle
 import logging
-from piecrust import APP_VERSION
 from piecrust.events import Event
 
-try:
-    import pickle as pickle
-except ImportError:
-    import pickle
-
 
 logger = logging.getLogger(__name__)
 
 
 class Record(object):
-    VERSION = 1
-
     def __init__(self):
-        self.app_version = None
-        self.record_version = None
         self.entries = []
         self.entry_added = Event()
 
-    def isVersionMatch(self):
-        return (self.app_version == APP_VERSION and
-                self.record_version == self.VERSION)
-
     def addEntry(self, entry):
         self.entries.append(entry)
         self.entry_added.fire(entry)
@@ -43,6 +30,11 @@
         del odict['entry_added']
         return odict
 
+    def __setstate__(self, state):
+        for k, v in state.items():
+            setattr(self, k, v)
+        self.entry_added = Event()
+
     @staticmethod
     def load(path):
         logger.debug("Loading bake record from: %s" % path)