Mercurial > wikked
changeset 498:e7d7ebcd0d56
resolve: New options for resolving pages.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Sat, 06 Jun 2020 23:59:12 -0700 |
parents | 36c3e9b1d1e3 |
children | e75b39a762fd |
files | wikked/commands/manage.py wikked/resolver.py wikked/scheduler.py wikked/wiki.py |
diffstat | 4 files changed, 54 insertions(+), 10 deletions(-) [+] |
line wrap: on
line diff
```diff
--- a/wikked/commands/manage.py Sat Jun 06 22:24:38 2020 -0700
+++ b/wikked/commands/manage.py Sat Jun 06 23:59:12 2020 -0700
@@ -142,6 +142,10 @@
 
     def setupParser(self, parser):
         parser.add_argument(
+            'urls',
+            action='append',
+            help="Which pages to resolve, otherwise all pages")
+        parser.add_argument(
             '-f', '--force',
             help="Force resolve all pages",
             action='store_true')
@@ -149,11 +153,16 @@
             '--parallel',
             help="Run the operation with multiple workers in parallel",
             action='store_true')
+        parser.add_argument(
+            '-i', '--intermediate-dir',
+            help="Write intermediate (unformatted) files to the given directory")
 
     def run(self, ctx):
         ctx.wiki.resolve(
+            only_urls=ctx.args.urls,
             force=ctx.args.force,
-            parallel=ctx.args.parallel)
+            parallel=ctx.args.parallel,
+            intermediate_dir=ctx.args.intermediate_dir)
 
 
 @register_command
```
```diff
--- a/wikked/resolver.py Sat Jun 06 22:24:38 2020 -0700
+++ b/wikked/resolver.py Sat Jun 06 23:59:12 2020 -0700
@@ -150,6 +150,9 @@
             'include': self._runInclude
         }
 
+        self._intermediate_dir = None
+        self._dump = None
+
     @property
     def wiki(self):
         return self.page.wiki
@@ -158,7 +161,11 @@
     def is_root(self):
         return self.page == self.ctx.root_page
 
-    def run(self):
+    def run(self, intermediate_dir=None):
+        if intermediate_dir:
+            self._intermediate_dir = intermediate_dir
+            self._dump = self._make_dump_func(intermediate_dir)
+            os.makedirs(intermediate_dir, exist_ok=True)
         try:
             return self._unsafeRun()
         except Exception as e:
@@ -167,6 +174,20 @@
             self.output = ResolveOutput(self.page)
             self.output.text = '<div class="error">%s</div>' % e
             return self.output
+        finally:
+            self._intermediate_dir = None
+            self._dump = None
+
+    def _make_dump_func(self, intermediate_dir):
+        filename = self.page.filename
+        dump_prefix = os.path.join(intermediate_dir, filename)
+
+        def _result(content, suffix):
+            dump_path = "%s.%s.txt" % (dump_prefix, suffix)
+            with open(dump_path, 'w', encoding='utf8') as fp:
+                fp.write(content)
+
+        return _result
 
     def _getPage(self, url):
         fields = ['url', 'title', 'path', 'formatted_text', 'local_meta',
@@ -199,6 +220,8 @@
 
         # Start with the page's text.
         final_text = self.page.getFormattedText()
+        if self._dump:
+            self._dump(final_text, 'formatted')
 
         # Resolve queries, includes, etc.
         def repl2(m):
@@ -217,6 +240,8 @@
             return ''
 
         final_text = re_wiki_tag.sub(repl2, final_text)
+        if self._dump:
+            self._dump(final_text, 'resolved.1')
 
         # If this is the root page, with all the includes resolved and
         # collapsed into one text, we need to run the final steps.
@@ -272,6 +297,8 @@
                         endpoint_markup)
 
             final_text = re_wiki_link.sub(repl1, final_text)
+            if self._dump:
+                self._dump(final_text, 'resolved.2')
 
             # Format the text.
             formatter = self._getFormatter(self.page.extension)
@@ -335,7 +362,7 @@
         self.ctx.url_trail.append(page.url)
         child = PageResolver(page, self.ctx, parameters, self.page_getter,
                              self.pages_meta_getter)
-        child_output = child.run()
+        child_output = child.run(intermediate_dir=self._intermediate_dir)
         self.output.add(child_output)
 
         self.ctx.url_trail = current_url_trail
```
```diff
--- a/wikked/scheduler.py Sat Jun 06 22:24:38 2020 -0700
+++ b/wikked/scheduler.py Sat Jun 06 23:59:12 2020 -0700
@@ -17,9 +17,10 @@
     """
     PAGE_REGISTRY_SIZE = 256
 
-    def __init__(self, wiki, page_urls, registry_size=None):
+    def __init__(self, wiki, page_urls, registry_size=None, intermediate_dir=None):
         self.wiki = wiki
         self.page_urls = page_urls
+        self.intermediate_dir = intermediate_dir
 
         self._cache = LRUCache(registry_size or self.PAGE_REGISTRY_SIZE)
         self._pages_meta = None
@@ -101,7 +102,8 @@
                     page_getter=self.getPage,
                     pages_meta_getter=self.getPagesMeta)
                 runner = PageResolverRunner(page, r)
-                runner.run(raise_on_failure=True)
+                runner.run(raise_on_failure=True,
+                           intermediate_dir=self.intermediate_dir)
                 self.wiki.db.cachePage(page)
 
 
@@ -114,10 +116,10 @@
         self.page = page
         self.resolver = resolver
 
-    def run(self, raise_on_failure=False):
+    def run(self, raise_on_failure=False, intermediate_dir=None):
         try:
             logger.debug("Resolving page: %s" % self.page.url)
-            result = self.resolver.run()
+            result = self.resolver.run(intermediate_dir=intermediate_dir)
         except CircularIncludeError as cie:
             if raise_on_failure:
                 raise
@@ -150,6 +152,10 @@
         self.scheduler = scheduler
         self.abort_on_failure = True
 
+    @property
+    def intermediate_dir(self):
+        return self.scheduler.intermediate_dir
+
     def isDone(self):
         return self.scheduler._done
 
@@ -194,7 +200,8 @@
                     page_getter=self.ctx.scheduler.getPage,
                     pages_meta_getter=self.ctx.scheduler.getPagesMeta)
                 runner = PageResolverRunner(page, r)
-                runner.run(raise_on_failure=self.ctx.abort_on_failure)
+                runner.run(raise_on_failure=self.ctx.abort_on_failure,
+                           intermediate_dir=self.ctx.intermediate_dir)
                 self.ctx.sendResult(job.url, page, None)
             except Exception as ex:
                 logger.exception(ex)
```
```diff
--- a/wikked/wiki.py Sat Jun 06 22:24:38 2020 -0700
+++ b/wikked/wiki.py Sat Jun 06 23:59:12 2020 -0700
@@ -324,7 +324,8 @@
         self.resolve(force=True, parallel=parallel)
         self.index.reset(self.getPages())
 
-    def resolve(self, only_urls=None, force=False, parallel=False):
+    def resolve(self, only_urls=None, force=False, parallel=False,
+                intermediate_dir=None):
         """ Compute the final info (text, meta, links) of all or a subset
             of the pages, and caches it in the DB.
         """
@@ -335,7 +336,7 @@
             page_urls = self.db.getPageUrls(uncached_only=(not force))
 
         num_workers = multiprocessing.cpu_count() if parallel else 1
-        s = ResolveScheduler(self, page_urls)
+        s = ResolveScheduler(self, page_urls, intermediate_dir=intermediate_dir)
         s.run(num_workers)
 
     def updatePage(self, url=None, path=None):
```