changeset 498:e7d7ebcd0d56

resolve: New options for resolving pages.
author Ludovic Chabant <ludovic@chabant.com>
date Sat, 06 Jun 2020 23:59:12 -0700
parents 36c3e9b1d1e3
children e75b39a762fd
files wikked/commands/manage.py wikked/resolver.py wikked/scheduler.py wikked/wiki.py
diffstat 4 files changed, 54 insertions(+), 10 deletions(-)
--- a/wikked/commands/manage.py	Sat Jun 06 22:24:38 2020 -0700
+++ b/wikked/commands/manage.py	Sat Jun 06 23:59:12 2020 -0700
@@ -142,6 +142,10 @@
 
     def setupParser(self, parser):
         parser.add_argument(
+                'urls',
+                action='append',
+                help="Which pages to resolve, otherwise all pages")
+        parser.add_argument(
                 '-f', '--force',
                 help="Force resolve all pages",
                 action='store_true')
@@ -149,11 +153,16 @@
                 '--parallel',
                 help="Run the operation with multiple workers in parallel",
                 action='store_true')
+        parser.add_argument(
+                '-i', '--intermediate-dir',
+                help="Write intermediate (unformatted) files to the given directory")
 
     def run(self, ctx):
         ctx.wiki.resolve(
+            only_urls=ctx.args.urls,
             force=ctx.args.force,
-            parallel=ctx.args.parallel)
+            parallel=ctx.args.parallel,
+            intermediate_dir=ctx.args.intermediate_dir)
 
 
 @register_command
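
The new arguments let the resolve management command dump each page's intermediate (unformatted) text while it resolves. A hypothetical invocation, assuming the wk console entry point and made-up page URL and output directory:

    wk resolve /main-page --force --intermediate-dir /tmp/wikked-intermediate
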
--- a/wikked/resolver.py	Sat Jun 06 22:24:38 2020 -0700
+++ b/wikked/resolver.py	Sat Jun 06 23:59:12 2020 -0700
@@ -150,6 +150,9 @@
                 'include': self._runInclude
                 }
 
+        self._intermediate_dir = None
+        self._dump = None
+
     @property
     def wiki(self):
         return self.page.wiki
@@ -158,7 +161,11 @@
     def is_root(self):
         return self.page == self.ctx.root_page
 
-    def run(self):
+    def run(self, intermediate_dir=None):
+        if intermediate_dir:
+            self._intermediate_dir = intermediate_dir
+            self._dump = self._make_dump_func(intermediate_dir)
+            os.makedirs(intermediate_dir, exist_ok=True)
         try:
             return self._unsafeRun()
         except Exception as e:
@@ -167,6 +174,20 @@
             self.output = ResolveOutput(self.page)
             self.output.text = '<div class="error">%s</div>' % e
             return self.output
+        finally:
+            self._intermediate_dir = None
+            self._dump = None
+
+    def _make_dump_func(self, intermediate_dir):
+        filename = self.page.filename
+        dump_prefix = os.path.join(intermediate_dir, filename)
+
+        def _result(content, suffix):
+            dump_path = "%s.%s.txt" % (dump_prefix, suffix)
+            with open(dump_path, 'w', encoding='utf8') as fp:
+                fp.write(content)
+
+        return _result
 
     def _getPage(self, url):
         fields = ['url', 'title', 'path', 'formatted_text', 'local_meta',
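
The new _make_dump_func helper builds one dump file per resolution stage from the page's filename. A minimal sketch of the naming scheme, using made-up values for the directory and page filename:

    # Sketch only: mirrors the naming scheme above with hypothetical values.
    import os

    intermediate_dir = '/tmp/wikked-intermediate'   # hypothetical
    page_filename = 'main-page.md'                  # hypothetical

    dump_prefix = os.path.join(intermediate_dir, page_filename)
    dump_path = "%s.%s.txt" % (dump_prefix, 'formatted')
    # dump_path == '/tmp/wikked-intermediate/main-page.md.formatted.txt'
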
@@ -199,6 +220,8 @@
 
         # Start with the page's text.
         final_text = self.page.getFormattedText()
+        if self._dump:
+            self._dump(final_text, 'formatted')
 
         # Resolve queries, includes, etc.
         def repl2(m):
@@ -217,6 +240,8 @@
             return ''
 
         final_text = re_wiki_tag.sub(repl2, final_text)
+        if self._dump:
+            self._dump(final_text, 'resolved.1')
 
         # If this is the root page, with all the includes resolved and
         # collapsed into one text, we need to run the final steps.
@@ -272,6 +297,8 @@
                         endpoint_markup)
 
             final_text = re_wiki_link.sub(repl1, final_text)
+            if self._dump:
+                self._dump(final_text, 'resolved.2')
 
             # Format the text.
             formatter = self._getFormatter(self.page.extension)
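
Taken together, these dump calls snapshot the page text at three points in the pipeline, so a fully resolved root page can leave up to three files in the intermediate directory (names follow the hypothetical page above):

    main-page.md.formatted.txt     formatted source text, before wiki tags are resolved
    main-page.md.resolved.1.txt    after queries and includes (re_wiki_tag) are substituted
    main-page.md.resolved.2.txt    root page only, after wiki links (re_wiki_link) are replaced
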
@@ -335,7 +362,7 @@
         self.ctx.url_trail.append(page.url)
         child = PageResolver(page, self.ctx, parameters, self.page_getter,
                              self.pages_meta_getter)
-        child_output = child.run()
+        child_output = child.run(intermediate_dir=self._intermediate_dir)
         self.output.add(child_output)
         self.ctx.url_trail = current_url_trail
 
--- a/wikked/scheduler.py	Sat Jun 06 22:24:38 2020 -0700
+++ b/wikked/scheduler.py	Sat Jun 06 23:59:12 2020 -0700
@@ -17,9 +17,10 @@
     """
     PAGE_REGISTRY_SIZE = 256
 
-    def __init__(self, wiki, page_urls, registry_size=None):
+    def __init__(self, wiki, page_urls, registry_size=None, intermediate_dir=None):
         self.wiki = wiki
         self.page_urls = page_urls
+        self.intermediate_dir = intermediate_dir
 
         self._cache = LRUCache(registry_size or self.PAGE_REGISTRY_SIZE)
         self._pages_meta = None
@@ -101,7 +102,8 @@
                         page_getter=self.getPage,
                         pages_meta_getter=self.getPagesMeta)
                 runner = PageResolverRunner(page, r)
-                runner.run(raise_on_failure=True)
+                runner.run(raise_on_failure=True,
+                           intermediate_dir=self.intermediate_dir)
                 self.wiki.db.cachePage(page)
 
 
@@ -114,10 +116,10 @@
         self.page = page
         self.resolver = resolver
 
-    def run(self, raise_on_failure=False):
+    def run(self, raise_on_failure=False, intermediate_dir=None):
         try:
             logger.debug("Resolving page: %s" % self.page.url)
-            result = self.resolver.run()
+            result = self.resolver.run(intermediate_dir=intermediate_dir)
         except CircularIncludeError as cie:
             if raise_on_failure:
                 raise
@@ -150,6 +152,10 @@
         self.scheduler = scheduler
         self.abort_on_failure = True
 
+    @property
+    def intermediate_dir(self):
+        return self.scheduler.intermediate_dir
+
     def isDone(self):
         return self.scheduler._done
 
@@ -194,7 +200,8 @@
                         page_getter=self.ctx.scheduler.getPage,
                         pages_meta_getter=self.ctx.scheduler.getPagesMeta)
                 runner = PageResolverRunner(page, r)
-                runner.run(raise_on_failure=self.ctx.abort_on_failure)
+                runner.run(raise_on_failure=self.ctx.abort_on_failure,
+                           intermediate_dir=self.ctx.intermediate_dir)
                 self.ctx.sendResult(job.url, page, None)
             except Exception as ex:
                 logger.exception(ex)
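
Both the single-worker path and the parallel JobWorker path now forward the scheduler's intermediate_dir down to PageResolverRunner.run(). A minimal sketch of driving the scheduler directly, assuming an already-initialized Wiki instance and a made-up page URL:

    # Sketch only: wiki is assumed to be an initialized wikked.wiki.Wiki.
    from wikked.scheduler import ResolveScheduler

    s = ResolveScheduler(wiki, ['/main-page'],
                         intermediate_dir='/tmp/wikked-intermediate')
    s.run(1)  # one worker; pass a higher count for parallel resolution
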
--- a/wikked/wiki.py	Sat Jun 06 22:24:38 2020 -0700
+++ b/wikked/wiki.py	Sat Jun 06 23:59:12 2020 -0700
@@ -324,7 +324,8 @@
         self.resolve(force=True, parallel=parallel)
         self.index.reset(self.getPages())
 
-    def resolve(self, only_urls=None, force=False, parallel=False):
+    def resolve(self, only_urls=None, force=False, parallel=False,
+                intermediate_dir=None):
         """ Compute the final info (text, meta, links) of all or a subset of
             the pages, and caches it in the DB.
         """
@@ -335,7 +336,7 @@
             page_urls = self.db.getPageUrls(uncached_only=(not force))
 
         num_workers = multiprocessing.cpu_count() if parallel else 1
-        s = ResolveScheduler(self, page_urls)
+        s = ResolveScheduler(self, page_urls, intermediate_dir=intermediate_dir)
         s.run(num_workers)
 
     def updatePage(self, url=None, path=None):
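
At the Wiki level, the new parameter is simply forwarded to the ResolveScheduler. A hedged usage sketch, assuming an already-initialized Wiki instance and hypothetical values:

    # Sketch only: wiki is assumed to be an initialized wikked.wiki.Wiki.
    wiki.resolve(only_urls=['/main-page'],
                 force=True,
                 intermediate_dir='/tmp/wikked-intermediate')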