annotate piecrust/processing/sitemap.py @ 1145:e94737572542

serve: Fix an issue where false positive matches were rendered as the requested page. Now we try to render the page, but also try to detect for the most common "empty" pages.
author Ludovic Chabant <ludovic@chabant.com>
date Tue, 05 Jun 2018 22:08:51 -0700
parents 45ad976712ec
children 727110ea112a
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
853
f070a4fc033c core: Continue PieCrust3 refactor, simplify pages.
Ludovic Chabant <ludovic@chabant.com>
parents: 852
diff changeset
1 import os
f070a4fc033c core: Continue PieCrust3 refactor, simplify pages.
Ludovic Chabant <ludovic@chabant.com>
parents: 852
diff changeset
2 import os.path
34
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
3 import time
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
4 import logging
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
5 import yaml
854
08e02c2a2a1a core: Keep refactoring, this time to prepare for generator sources.
Ludovic Chabant <ludovic@chabant.com>
parents: 853
diff changeset
6 from piecrust.dataproviders.pageiterator import PageIterator
34
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
7 from piecrust.processing.base import SimpleFileProcessor
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
8
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
9
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
logger = logging.getLogger(__name__)


# Fragments used to assemble the sitemap XML document. The `%s` / `%0.1f`
# placeholders are filled in with the `%` operator when an entry is written.
# NOTE(review): whitespace inside these literals is reproduced exactly as it
# appears in this view of the file -- confirm against the repository copy.

# Document envelope.
SITEMAP_HEADER = (
    '<?xml version="1.0" encoding="utf-8"?>\n'
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
)
SITEMAP_FOOTER = "</urlset>\n"

# One `<url>` element and its children.
SITEURL_HEADER = " <url>\n"
SITEURL_LOC = " <loc>%s</loc>\n"
SITEURL_LASTMOD = " <lastmod>%s</lastmod>\n"
SITEURL_CHANGEFREQ = " <changefreq>%s</changefreq>\n"
SITEURL_PRIORITY = " <priority>%0.1f</priority>\n"
SITEURL_FOOTER = " </url>\n"
34
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
25
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
26
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
class SitemapProcessor(SimpleFileProcessor):
    """Processor that turns a YAML sitemap description into sitemap XML.

    The YAML input may contain two optional top-level keys:

    * ``locations``: a list of hand-written entries. Each entry is a mapping
      with a required ``url`` and optional ``lastmod``, ``changefreq`` and
      ``priority`` values.
    * ``autogen``: a list of source names. Every page found by iterating a
      named source produces one entry; a page's own ``sitemap`` mapping,
      when present, overrides the generated values.
    """

    PROCESSOR_NAME = 'sitemap'

    def __init__(self):
        # The mapping is handed to `SimpleFileProcessor`; presumably it maps
        # the input extension to the output extension -- TODO confirm.
        super(SitemapProcessor, self).__init__({'sitemap': 'xml'})
        # Set in `onPipelineStart`; not read anywhere else in this class.
        self._start_time = None

    def onPipelineStart(self, ctx):
        """Record the wall-clock time at which the pipeline started."""
        self._start_time = time.time()

    def _doProcess(self, in_path, out_path):
        """Read the YAML sitemap at `in_path` and write XML to `out_path`.

        Returns:
            True once the output has been written completely.

        Raises:
            Whatever the YAML parser, the file system, or the entry writers
            raise. If the failure happens while the output is being
            written, the partial output file is removed before re-raising.
        """
        # NOTE(review): both files are opened with the platform's default
        # encoding although the XML header declares utf-8 -- confirm whether
        # an explicit encoding should be passed here.
        with open(in_path, 'r') as fp:
            # `safe_load` only builds plain Python data, so a sitemap file
            # cannot instantiate arbitrary objects. A bare `yaml.load(fp)`
            # without a Loader is also rejected by recent PyYAML releases.
            # An empty file parses to None, hence the fallback mapping.
            sitemap = yaml.safe_load(fp) or {}

        try:
            with open(out_path, 'w') as fp:
                fp.write(SITEMAP_HEADER)
                self._writeManualLocs(sitemap, fp)
                self._writeAutoLocs(sitemap, fp)
                fp.write(SITEMAP_FOOTER)
        except BaseException:
            # If an exception occurs, delete the output file otherwise
            # the pipeline will think the output was correctly produced.
            # Catching BaseException is deliberate: an interrupted run must
            # not leave a truncated sitemap behind either. Always re-raised.
            if os.path.isfile(out_path):
                logger.debug("Error occured, removing output sitemap.")
                os.unlink(out_path)
            raise

        return True

    def _writeManualLocs(self, sitemap, fp):
        """Write one entry per item of the ``locations`` list, if any."""
        # `.get` rather than `.setdefault`: the parsed config is only read.
        locs = sitemap.get('locations')
        if not locs:
            return

        logger.debug("Generating manual sitemap entries.")
        for loc in locs:
            self._writeEntry(loc, fp)

    def _writeAutoLocs(self, sitemap, fp):
        """Write one entry per page of every source listed in ``autogen``.

        Raises:
            Exception: if a listed source name is unknown to the app.
        """
        source_names = sitemap.get('autogen')
        if not source_names:
            return

        # Every generated entry shares one timestamp, taken once up front.
        cur_time = strftime_iso8601(time.time())
        for name in source_names:
            logger.debug(
                "Generating automatic sitemap entries for '%s'.", name)
            source = self.app.getSource(name)
            if source is None:
                raise Exception("No such source: %s" % name)

            for page in PageIterator(source):
                args = {'url': page['url'], 'lastmod': cur_time}
                # Per-page settings win over the generated defaults.
                sm_cfg = page.get('sitemap')
                if sm_cfg:
                    args.update(sm_cfg)

                self._writeEntry(args, fp)

    def _writeEntry(self, args, fp):
        """Write a single ``<url>`` element described by the mapping `args`.

        ``url`` is required (a missing key raises KeyError); ``lastmod``,
        ``changefreq`` and ``priority`` are written only when present.
        """
        # NOTE(review): values are inserted verbatim, without XML escaping;
        # a URL containing `&` or `<` is assumed to be escaped already by
        # whoever supplied it -- verify against callers.
        fp.write(SITEURL_HEADER)
        fp.write(SITEURL_LOC % args['url'])
        optional_fields = (
            ('lastmod', SITEURL_LASTMOD),
            ('changefreq', SITEURL_CHANGEFREQ),
            ('priority', SITEURL_PRIORITY),
        )
        for key, template in optional_fields:
            if key in args:
                fp.write(template % args[key])
        fp.write(SITEURL_FOOTER)
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
100
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
101
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
def strftime_iso8601(t):
    """Format the epoch timestamp `t` (seconds) as a UTC ISO 8601 string.

    The result has the shape ``YYYY-MM-DDTHH:MM:SSZ``.
    """
    utc_fields = time.gmtime(t)
    return time.strftime('%Y-%m-%dT%H:%M:%SZ', utc_fields)
bdb103c57168 Add `sitemap` processor.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
104