view piecrust/processing/sitemap.py @ 1188:a7c43131d871

bake: Fix file write flushing problem with Python 3.8+ Writing the cache files fails in Python 3.8 because it looks like flushing behaviour has changed. We need to explicitly flush. And even then, in very rare occurrences, it looks like it can still run into racing conditions, so we do a very hacky and ugly "retry" loop when fetching cached data :(
author Ludovic Chabant <ludovic@chabant.com>
date Tue, 15 Jun 2021 22:36:23 -0700
parents 727110ea112a
children
line wrap: on
line source

import os
import os.path
import time
import logging
import yaml
from piecrust.dataproviders.pageiterator import PageIterator
from piecrust.processing.base import SimpleFileProcessor


logger = logging.getLogger(__name__)


SITEMAP_HEADER = \
"""<?xml version="1.0" encoding="utf-8"?>
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
"""
SITEMAP_FOOTER = "</urlset>\n"

SITEURL_HEADER =     "  <url>\n"  # NOQA: E222
SITEURL_LOC =        "    <loc>%s</loc>\n"  # NOQA: E222
SITEURL_LASTMOD =    "    <lastmod>%s</lastmod>\n"  # NOQA: E222
SITEURL_CHANGEFREQ = "    <changefreq>%s</changefreq>\n"  # NOQA: E222
SITEURL_PRIORITY =   "    <priority>%0.1f</priority>\n"  # NOQA: E222
SITEURL_FOOTER =     "  </url>\n"  # NOQA: E222


class SitemapProcessor(SimpleFileProcessor):
    PROCESSOR_NAME = 'sitemap'

    def __init__(self):
        super(SitemapProcessor, self).__init__({'sitemap': 'xml'})
        self._start_time = None

    def onPipelineStart(self, ctx):
        self._start_time = time.time()

    def _doProcess(self, in_path, out_path):
        with open(in_path, 'r') as fp:
            sitemap = yaml.safe_load(fp)

        try:
            with open(out_path, 'w') as fp:
                fp.write(SITEMAP_HEADER)
                self._writeManualLocs(sitemap, fp)
                self._writeAutoLocs(sitemap, fp)
                fp.write(SITEMAP_FOOTER)
        except:
            # If an exception occurs, delete the output file otherwise
            # the pipeline will think the output was correctly produced.
            if os.path.isfile(out_path):
                logger.debug("Error occured, removing output sitemap.")
                os.unlink(out_path)
            raise

        return True

    def _writeManualLocs(self, sitemap, fp):
        locs = sitemap.setdefault('locations', None)
        if not locs:
            return

        logger.debug("Generating manual sitemap entries.")
        for loc in locs:
            self._writeEntry(loc, fp)

    def _writeAutoLocs(self, sitemap, fp):
        source_names = sitemap.setdefault('autogen', None)
        if not source_names:
            return

        cur_time = strftime_iso8601(time.time())
        for name in source_names:
            logger.debug("Generating automatic sitemap entries for '%s'." %
                         name)
            source = self.app.getSource(name)
            if source is None:
                raise Exception("No such source: %s" % name)

            it = PageIterator(source)
            for page in it:
                uri = page['url']
                sm_cfg = page.get('sitemap')

                args = {'url': uri, 'lastmod': cur_time}
                if sm_cfg:
                    args.update(sm_cfg)

                self._writeEntry(args, fp)

    def _writeEntry(self, args, fp):
        fp.write(SITEURL_HEADER)
        fp.write(SITEURL_LOC % args['url'])
        if 'lastmod' in args:
            fp.write(SITEURL_LASTMOD % args['lastmod'])
        if 'changefreq' in args:
            fp.write(SITEURL_CHANGEFREQ % args['changefreq'])
        if 'priority' in args:
            fp.write(SITEURL_PRIORITY % args['priority'])
        fp.write(SITEURL_FOOTER)


def strftime_iso8601(t):
    return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(t))