comparison piecrust/sources/posts.py @ 852:4850f8c21b6e

core: Start of the big refactor for PieCrust 3.0.

* Everything is a `ContentSource`, including assets directories.
* Most content sources are subclasses of the base file-system source.
* A source is processed by a "pipeline", and there are 2 built-in pipelines, one for assets and one for pages. The asset pipeline is vaguely functional, but the page pipeline is completely broken right now.
* Rewrite the baking process as just running appropriate pipelines on each content item. This should allow for better parallelization.
author Ludovic Chabant <ludovic@chabant.com>
date Wed, 17 May 2017 00:11:48 -0700
parents f0930178fd01
children f070a4fc033c
comparison
equal deleted inserted replaced
851:2c7e57d80bba 852:4850f8c21b6e
3 import re 3 import re
4 import logging 4 import logging
5 import datetime 5 import datetime
6 from piecrust import osutil 6 from piecrust import osutil
7 from piecrust.routing import RouteParameter 7 from piecrust.routing import RouteParameter
8 from piecrust.sources.base import ( 8 from piecrust.sources.base import REL_ASSETS, ContentItem
9 PageSource, InvalidFileSystemEndpointError, PageFactory, 9 from piecrust.sources.fs import (
10 MODE_CREATING, MODE_PARSING) 10 FSContentSource, InvalidFileSystemEndpointError)
11 from piecrust.sources.interfaces import ( 11 from piecrust.sources.interfaces import (
12 IPreparingSource, IInteractiveSource, InteractiveField) 12 IPreparingSource, IInteractiveSource, InteractiveField)
13 from piecrust.sources.mixins import SimplePaginationSourceMixin 13 from piecrust.sources.mixins import (
14 from piecrust.uriutil import multi_replace 14 SimplePaginationSourceMixin, SimpleAssetsSubDirMixin)
15 from piecrust.uriutil import uri_to_title
15 16
16 17
17 logger = logging.getLogger(__name__) 18 logger = logging.getLogger(__name__)
18 19
19 20
20 class PostsSource(PageSource, IPreparingSource, IInteractiveSource, 21 class PostsSource(FSContentSource,
21 SimplePaginationSourceMixin): 22 SimpleAssetsSubDirMixin,
23 IPreparingSource, IInteractiveSource):
22 PATH_FORMAT = None 24 PATH_FORMAT = None
23 25
24 def __init__(self, app, name, config): 26 def __init__(self, app, name, config):
25 PageSource.__init__(self, app, name, config) 27 FSContentSource.__init__(self, app, name, config)
26 self.fs_endpoint = config.get('fs_endpoint', name) 28 self.auto_formats = app.config.get('site/auto_formats')
27 self.fs_endpoint_path = os.path.join(self.root_dir, self.fs_endpoint)
28 self.supported_extensions = list(app.config.get('site/auto_formats').keys())
29 self.default_auto_format = app.config.get('site/default_auto_format') 29 self.default_auto_format = app.config.get('site/default_auto_format')
30 self._source_it_cache = None 30 self.supported_extensions = list(self.auto_formats)
31 31
32 @property 32 @property
33 def path_format(self): 33 def path_format(self):
34 return self.__class__.PATH_FORMAT 34 return self.__class__.PATH_FORMAT
35 35
36 def resolveRef(self, ref_path): 36 def _finalizeContent(self, parent_group, items, groups):
37 path = os.path.normpath(os.path.join(self.fs_endpoint_path, ref_path)) 37 SimpleAssetsSubDirMixin._onFinalizeContent(
38 metadata = self._parseMetadataFromPath(ref_path) 38 parent_group, items, groups)
39 return path, metadata 39
40 40 def getRelatedContents(self, item, relationship):
41 def getSupportedRouteParameters(self): 41 if relationship == REL_ASSETS:
42 return [ 42 SimpleAssetsSubDirMixin._getRelatedAssetsContents(item)
43 RouteParameter('slug', RouteParameter.TYPE_STRING), 43 raise NotImplementedError()
44 RouteParameter('day', RouteParameter.TYPE_INT2), 44
45 RouteParameter('month', RouteParameter.TYPE_INT2), 45 def findContent(self, route_params):
46 RouteParameter('year', RouteParameter.TYPE_INT4)] 46 year = route_params.get('year')
47 47 month = route_params.get('month')
48 def buildPageFactory(self, path): 48 day = route_params.get('day')
49 if not path.startswith(self.fs_endpoint_path): 49 slug = route_params.get('slug')
50 raise Exception("Page path '%s' isn't inside '%s'." % (
51 path, self.fs_endpoint_path))
52 rel_path = path[len(self.fs_endpoint_path):].lstrip('\\/')
53 pat = self.PATH_FORMAT % {
54 'year': 'YEAR',
55 'month': 'MONTH',
56 'day': 'DAY',
57 'slug': 'SLUG',
58 'ext': 'EXT'}
59 pat = re.escape(pat)
60 pat = multi_replace(pat, {
61 'YEAR': '(\d{4})',
62 'MONTH': '(\d{2})',
63 'DAY': '(\d{2})',
64 'SLUG': '(.*)',
65 'EXT': '(.*)'})
66 m = re.match(pat, rel_path)
67 if m is None:
68 raise Exception("'%s' isn't a proper %s page path." % (
69 rel_path, self.SOURCE_NAME))
70 return self._makeFactory(
71 rel_path,
72 m.group(4),
73 int(m.group(1)),
74 int(m.group(2)),
75 int(m.group(3)))
76
77 def findPageFactory(self, metadata, mode):
78 year = metadata.get('year')
79 month = metadata.get('month')
80 day = metadata.get('day')
81 slug = metadata.get('slug')
82 50
83 try: 51 try:
84 if year is not None: 52 if year is not None:
85 year = int(year) 53 year = int(year)
86 if month is not None: 54 if month is not None:
88 if day is not None: 56 if day is not None:
89 day = int(day) 57 day = int(day)
90 except ValueError: 58 except ValueError:
91 return None 59 return None
92 60
93 ext = metadata.get('ext') 61 ext = route_params.get('ext')
94 if ext is None: 62 if ext is None:
95 if len(self.supported_extensions) == 1: 63 if len(self.supported_extensions) == 1:
96 ext = self.supported_extensions[0] 64 ext = self.supported_extensions[0]
97 elif mode == MODE_CREATING and self.default_auto_format:
98 ext = self.default_auto_format
99 65
100 replacements = { 66 replacements = {
101 'year': '%04d' % year if year is not None else None, 67 'year': '%04d' % year if year is not None else None,
102 'month': '%02d' % month if month is not None else None, 68 'month': '%02d' % month if month is not None else None,
103 'day': '%02d' % day if day is not None else None, 69 'day': '%02d' % day if day is not None else None,
104 'slug': slug, 70 'slug': slug,
105 'ext': ext 71 'ext': ext
106 } 72 }
107 needs_recapture = False 73 needs_recapture = False
108 if year is None: 74 if year is None:
109 needs_recapture = True 75 needs_recapture = True
110 replacements['year'] = '????' 76 replacements['year'] = '????'
111 if month is None: 77 if month is None:
119 replacements['slug'] = '*' 85 replacements['slug'] = '*'
120 if ext is None: 86 if ext is None:
121 needs_recapture = True 87 needs_recapture = True
122 replacements['ext'] = '*' 88 replacements['ext'] = '*'
123 path = os.path.normpath(os.path.join( 89 path = os.path.normpath(os.path.join(
124 self.fs_endpoint_path, self.path_format % replacements)) 90 self.fs_endpoint_path, self.path_format % replacements))
125 91
126 if needs_recapture: 92 if needs_recapture:
127 if mode == MODE_CREATING:
128 raise ValueError("Not enough information to find a post path.")
129 possible_paths = osutil.glob(path) 93 possible_paths = osutil.glob(path)
130 if len(possible_paths) != 1: 94 if len(possible_paths) != 1:
131 return None 95 return None
132 path = possible_paths[0] 96 path = possible_paths[0]
133 elif mode == MODE_PARSING and not os.path.isfile(path): 97 elif not os.path.isfile(path):
134 return None 98 return None
135 99
136 rel_path = os.path.relpath(path, self.fs_endpoint_path) 100 metadata = self._parseMetadataFromPath(path)
137 rel_path = rel_path.replace('\\', '/') 101 return ContentItem(path, metadata)
138 fac_metadata = self._parseMetadataFromPath(rel_path) 102
139 return PageFactory(self, rel_path, fac_metadata) 103 def _parseMetadataFromPath(self, path):
104 regex_repl = {
105 'year': '(?P<year>\d{4})',
106 'month': '(?P<month>\d{2})',
107 'day': '(?P<day>\d{2})',
108 'slug': '(?P<slug>.*)',
109 'ext': '(?P<ext>.*)'
110 }
111 path_format_re = re.sub(r'([\-\.])', r'\\\1', self.path_format)
112 pattern = path_format_re % regex_repl + '$'
113 m = re.search(pattern, path.replace('\\', '/'))
114 if not m:
115 raise Exception("Expected to be able to match path with path "
116 "format: %s" % path)
117
118 year = int(m.group('year'))
119 month = int(m.group('month'))
120 day = int(m.group('day'))
121 timestamp = datetime.date(year, month, day)
122 metadata = {
123 'year': year,
124 'month': month,
125 'day': day,
126 'slug': m.group('slug'),
127 'date': timestamp
128 }
129 return metadata
130
131 def getSupportedRouteParameters(self):
132 return [
133 RouteParameter('slug', RouteParameter.TYPE_STRING),
134 RouteParameter('day', RouteParameter.TYPE_INT2),
135 RouteParameter('month', RouteParameter.TYPE_INT2),
136 RouteParameter('year', RouteParameter.TYPE_INT4)]
140 137
141 def getSourceIterator(self): 138 def getSourceIterator(self):
142 if self._source_it_cache is None: 139 if self._source_it_cache is None:
143 it = SimplePaginationSourceMixin.getSourceIterator(self) 140 it = SimplePaginationSourceMixin.getSourceIterator(self)
144 self._source_it_cache = list(it) 141 self._source_it_cache = list(it)
145 return self._source_it_cache 142 return self._source_it_cache
146 143
147 def setupPrepareParser(self, parser, app): 144 def setupPrepareParser(self, parser, app):
148 parser.add_argument( 145 parser.add_argument(
149 '-d', '--date', help="The date of the post, " 146 '-d', '--date', help="The date of the post, "
150 "in `year/month/day` format (defaults to today).") 147 "in `year/month/day` format (defaults to today).")
151 parser.add_argument('slug', help="The URL slug for the new post.") 148 parser.add_argument('slug', help="The URL slug for the new post.")
152 149
153 def buildMetadata(self, args): 150 def createContent(self, args):
154 dt = datetime.date.today() 151 dt = datetime.date.today()
155 if args.date: 152 if args.date:
156 if args.date == 'today': 153 if args.date == 'today':
157 pass # Keep the default we had. 154 pass # Keep the default we had.
158 elif args.date == 'tomorrow': 155 elif args.date == 'tomorrow':
168 except ValueError: 165 except ValueError:
169 raise Exception("Dates must be of the form: " 166 raise Exception("Dates must be of the form: "
170 "YEAR/MONTH/DAY.") 167 "YEAR/MONTH/DAY.")
171 dt = datetime.date(year, month, day) 168 dt = datetime.date(year, month, day)
172 169
170 slug, ext = os.path.splitext(args.slug)
171 if not ext:
172 ext = self.default_auto_format
173 year, month, day = dt.year, dt.month, dt.day 173 year, month, day = dt.year, dt.month, dt.day
174 return {'year': year, 'month': month, 'day': day, 'slug': args.slug} 174 tokens = {
175 'slug': args.slug,
176 'ext': ext,
177 'year': '%04d' % year,
178 'month': '%02d' % month,
179 'day': '%02d' % day
180 }
181 rel_path = self.path_format % tokens
182 path = os.path.join(self.fs_endpoint_path, rel_path)
183 metadata = {
184 'config': {'title': uri_to_title(slug)}
185 }
186 return ContentItem(path, metadata)
175 187
176 def getInteractiveFields(self): 188 def getInteractiveFields(self):
177 dt = datetime.date.today() 189 dt = datetime.date.today()
178 return [ 190 return [
179 InteractiveField('year', InteractiveField.TYPE_INT, dt.year), 191 InteractiveField('year', InteractiveField.TYPE_INT, dt.year),
183 195
184 def _checkFsEndpointPath(self): 196 def _checkFsEndpointPath(self):
185 if not os.path.isdir(self.fs_endpoint_path): 197 if not os.path.isdir(self.fs_endpoint_path):
186 if self.ignore_missing_dir: 198 if self.ignore_missing_dir:
187 return False 199 return False
188 raise InvalidFileSystemEndpointError(self.name, self.fs_endpoint_path) 200 raise InvalidFileSystemEndpointError(self.name,
201 self.fs_endpoint_path)
189 return True 202 return True
190 203
191 def _parseMetadataFromPath(self, path): 204 def _makeContentItem(self, path, slug, year, month, day):
192 regex_repl = {
193 'year': '(?P<year>\d{4})',
194 'month': '(?P<month>\d{2})',
195 'day': '(?P<day>\d{2})',
196 'slug': '(?P<slug>.*)',
197 'ext': '(?P<ext>.*)'
198 }
199 path_format_re = re.sub(r'([\-\.])', r'\\\1', self.path_format)
200 pattern = path_format_re % regex_repl + '$'
201 m = re.search(pattern, path.replace('\\', '/'))
202 if not m:
203 raise Exception("Expected to be able to match path with path "
204 "format: %s" % path)
205
206 year = int(m.group('year'))
207 month = int(m.group('month'))
208 day = int(m.group('day'))
209 timestamp = datetime.date(year, month, day)
210 metadata = {
211 'year': year,
212 'month': month,
213 'day': day,
214 'slug': m.group('slug'),
215 'date': timestamp
216 }
217 return metadata
218
219 def _makeFactory(self, path, slug, year, month, day):
220 path = path.replace('\\', '/') 205 path = path.replace('\\', '/')
221 timestamp = datetime.date(year, month, day) 206 timestamp = datetime.date(year, month, day)
222 metadata = { 207 metadata = {
223 'slug': slug, 208 'slug': slug,
224 'year': year, 209 'year': year,
225 'month': month, 210 'month': month,
226 'day': day, 211 'day': day,
227 'date': timestamp} 212 'date': timestamp}
228 return PageFactory(self, path, metadata) 213 return ContentItem(path, metadata)
229 214
230 215
231 class FlatPostsSource(PostsSource): 216 class FlatPostsSource(PostsSource):
232 SOURCE_NAME = 'posts/flat' 217 SOURCE_NAME = 'posts/flat'
233 PATH_FORMAT = '%(year)s-%(month)s-%(day)s_%(slug)s.%(ext)s' 218 PATH_FORMAT = '%(year)s-%(month)s-%(day)s_%(slug)s.%(ext)s'
219 PATTERN = re.compile(r'(\d{4})-(\d{2})-(\d{2})_(.*)\.(\w+)$')
234 220
235 def __init__(self, app, name, config): 221 def __init__(self, app, name, config):
236 super(FlatPostsSource, self).__init__(app, name, config) 222 super().__init__(app, name, config)
237 223
238 def buildPageFactories(self): 224 def getContents(self, group):
239 if not self._checkFsEndpointPath(): 225 if not self._checkFSEndpoint():
240 return 226 return None
241 logger.debug("Scanning for posts (flat) in: %s" % self.fs_endpoint_path) 227
242 pattern = re.compile(r'(\d{4})-(\d{2})-(\d{2})_(.*)\.(\w+)$') 228 logger.debug("Scanning for posts (flat) in: %s" %
229 self.fs_endpoint_path)
230 pattern = FlatPostsSource.PATTERN
243 _, __, filenames = next(osutil.walk(self.fs_endpoint_path)) 231 _, __, filenames = next(osutil.walk(self.fs_endpoint_path))
244 for f in filenames: 232 for f in filenames:
245 match = pattern.match(f) 233 match = pattern.match(f)
246 if match is None: 234 if match is None:
247 name, ext = os.path.splitext(f) 235 name, ext = os.path.splitext(f)
248 logger.warning("'%s' is not formatted as 'YYYY-MM-DD_slug-title.%s' " 236 logger.warning(
249 "and will be ignored. Is that a typo?" % (f, ext)) 237 "'%s' is not formatted as 'YYYY-MM-DD_slug-title.%s' "
238 "and will be ignored. Is that a typo?" % (f, ext))
250 continue 239 continue
251 yield self._makeFactory( 240 yield self._makeContentItem(
252 f, 241 f,
253 match.group(4), 242 match.group(4),
254 int(match.group(1)), 243 int(match.group(1)),
255 int(match.group(2)), 244 int(match.group(2)),
256 int(match.group(3))) 245 int(match.group(3)))
257 246
258 247
259 class ShallowPostsSource(PostsSource): 248 class ShallowPostsSource(PostsSource):
260 SOURCE_NAME = 'posts/shallow' 249 SOURCE_NAME = 'posts/shallow'
261 PATH_FORMAT = '%(year)s/%(month)s-%(day)s_%(slug)s.%(ext)s' 250 PATH_FORMAT = '%(year)s/%(month)s-%(day)s_%(slug)s.%(ext)s'
251 YEAR_PATTERN = re.compile(r'(\d{4})$')
252 FILE_PATTERN = re.compile(r'(\d{2})-(\d{2})_(.*)\.(\w+)$')
262 253
263 def __init__(self, app, name, config): 254 def __init__(self, app, name, config):
264 super(ShallowPostsSource, self).__init__(app, name, config) 255 super(ShallowPostsSource, self).__init__(app, name, config)
265 256
266 def buildPageFactories(self): 257 def getContents(self, group):
267 if not self._checkFsEndpointPath(): 258 if not self._checkFsEndpointPath():
268 return 259 return
269 logger.debug("Scanning for posts (shallow) in: %s" % self.fs_endpoint_path) 260
270 year_pattern = re.compile(r'(\d{4})$') 261 logger.debug("Scanning for posts (shallow) in: %s" %
271 file_pattern = re.compile(r'(\d{2})-(\d{2})_(.*)\.(\w+)$') 262 self.fs_endpoint_path)
263 year_pattern = ShallowPostsSource.YEAR_PATTERN
264 file_pattern = ShallowPostsSource.FILE_PATTERN
272 _, year_dirs, __ = next(osutil.walk(self.fs_endpoint_path)) 265 _, year_dirs, __ = next(osutil.walk(self.fs_endpoint_path))
273 year_dirs = [d for d in year_dirs if year_pattern.match(d)] 266 year_dirs = [d for d in year_dirs if year_pattern.match(d)]
274 for yd in year_dirs: 267 for yd in year_dirs:
275 if year_pattern.match(yd) is None: 268 if year_pattern.match(yd) is None:
276 logger.warning("'%s' is not formatted as 'YYYY' and will be ignored. " 269 logger.warning(
277 "Is that a typo?") 270 "'%s' is not formatted as 'YYYY' and will be ignored. "
271 "Is that a typo?")
278 continue 272 continue
279 year = int(yd) 273 year = int(yd)
280 year_dir = os.path.join(self.fs_endpoint_path, yd) 274 year_dir = os.path.join(self.fs_endpoint_path, yd)
281 275
282 _, __, filenames = next(osutil.walk(year_dir)) 276 _, __, filenames = next(osutil.walk(year_dir))
283 for f in filenames: 277 for f in filenames:
284 match = file_pattern.match(f) 278 match = file_pattern.match(f)
285 if match is None: 279 if match is None:
286 name, ext = os.path.splitext(f) 280 name, ext = os.path.splitext(f)
287 logger.warning("'%s' is not formatted as 'MM-DD_slug-title.%s' " 281 logger.warning(
288 "and will be ignored. Is that a typo?" % (f, ext)) 282 "'%s' is not formatted as 'MM-DD_slug-title.%s' "
283 "and will be ignored. Is that a typo?" % (f, ext))
289 continue 284 continue
290 yield self._makeFactory( 285 yield self._makeContentItem(
291 os.path.join(yd, f), 286 os.path.join(yd, f),
292 match.group(3), 287 match.group(3),
293 year, 288 year,
294 int(match.group(1)), 289 int(match.group(1)),
295 int(match.group(2))) 290 int(match.group(2)))
296 291
297 292
298 class HierarchyPostsSource(PostsSource): 293 class HierarchyPostsSource(PostsSource):
299 SOURCE_NAME = 'posts/hierarchy' 294 SOURCE_NAME = 'posts/hierarchy'
300 PATH_FORMAT = '%(year)s/%(month)s/%(day)s_%(slug)s.%(ext)s' 295 PATH_FORMAT = '%(year)s/%(month)s/%(day)s_%(slug)s.%(ext)s'
296 YEAR_PATTERN = re.compile(r'(\d{4})$')
297 MONTH_PATTERN = re.compile(r'(\d{2})$')
298 FILE_PATTERN = re.compile(r'(\d{2})_(.*)\.(\w+)$')
301 299
302 def __init__(self, app, name, config): 300 def __init__(self, app, name, config):
303 super(HierarchyPostsSource, self).__init__(app, name, config) 301 super(HierarchyPostsSource, self).__init__(app, name, config)
304 302
305 def buildPageFactories(self): 303 def getContents(self, group):
306 if not self._checkFsEndpointPath(): 304 if not self._checkFsEndpointPath():
307 return 305 return
308 logger.debug("Scanning for posts (hierarchy) in: %s" % self.fs_endpoint_path) 306
309 year_pattern = re.compile(r'(\d{4})$') 307 logger.debug("Scanning for posts (hierarchy) in: %s" %
310 month_pattern = re.compile(r'(\d{2})$') 308 self.fs_endpoint_path)
311 file_pattern = re.compile(r'(\d{2})_(.*)\.(\w+)$') 309 year_pattern = HierarchyPostsSource.YEAR_PATTERN
310 month_pattern = HierarchyPostsSource.MONTH_PATTERN
311 file_pattern = HierarchyPostsSource.FILE_PATTERN
312 _, year_dirs, __ = next(osutil.walk(self.fs_endpoint_path)) 312 _, year_dirs, __ = next(osutil.walk(self.fs_endpoint_path))
313 year_dirs = [d for d in year_dirs if year_pattern.match(d)] 313 year_dirs = [d for d in year_dirs if year_pattern.match(d)]
314 for yd in year_dirs: 314 for yd in year_dirs:
315 year = int(yd) 315 year = int(yd)
316 year_dir = os.path.join(self.fs_endpoint_path, yd) 316 year_dir = os.path.join(self.fs_endpoint_path, yd)
324 _, __, filenames = next(osutil.walk(month_dir)) 324 _, __, filenames = next(osutil.walk(month_dir))
325 for f in filenames: 325 for f in filenames:
326 match = file_pattern.match(f) 326 match = file_pattern.match(f)
327 if match is None: 327 if match is None:
328 name, ext = os.path.splitext(f) 328 name, ext = os.path.splitext(f)
329 logger.warning("'%s' is not formatted as 'DD_slug-title.%s' " 329 logger.warning(
330 "and will be ignored. Is that a typo?" % (f, ext)) 330 "'%s' is not formatted as 'DD_slug-title.%s' "
331 "and will be ignored. Is that a typo?" % (f, ext))
331 continue 332 continue
332 rel_name = os.path.join(yd, md, f) 333 rel_name = os.path.join(yd, md, f)
333 yield self._makeFactory( 334 yield self._makeContentItem(
334 rel_name, 335 rel_name,
335 match.group(2), 336 match.group(2),
336 year, 337 year,
337 month, 338 month,
338 int(match.group(1))) 339 int(match.group(1)))
339 340