Mercurial > piecrust2
comparison piecrust/importing/wordpress.py @ 302:103abb08755e
import: Make the Wordpress importer extensible, rename it to `wordpressxml`.
This is because the SQL importer will be installable as a plugin, so as not to
impose SQL dependencies on the basic PieCrust package.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Thu, 19 Mar 2015 18:30:09 -0700 |
parents | 2daa05a21026 |
children | 1ddd18ad5e76 |
comparison
equal
deleted
inserted
replaced
301:45aba3cb7228 | 302:103abb08755e |
---|---|
1 import os.path | 1 import os.path |
2 import logging | 2 import logging |
3 import datetime | 3 import datetime |
4 import yaml | 4 import yaml |
5 from urllib.parse import urlparse | 5 from collections import OrderedDict |
6 from piecrust import CONFIG_PATH | 6 from piecrust import CONFIG_PATH |
7 from piecrust.configuration import ( | |
8 ConfigurationLoader, ConfigurationDumper, merge_dicts) | |
7 from piecrust.importing.base import Importer, create_page, download_asset | 9 from piecrust.importing.base import Importer, create_page, download_asset |
8 from piecrust.sources.base import MODE_CREATING | 10 from piecrust.sources.base import MODE_CREATING |
9 | 11 |
10 | 12 |
11 logger = logging.getLogger(__name__) | 13 logger = logging.getLogger(__name__) |
12 | 14 |
13 | 15 |
14 class WordpressImporter(Importer): | 16 class WordpressImporterBase(Importer): |
15 name = 'wordpress' | |
16 description = "Imports a Wordpress blog." | |
17 | |
18 def setupParser(self, parser, app): | 17 def setupParser(self, parser, app): |
19 parser.add_argument( | 18 parser.add_argument( |
20 '--posts_fs', | 19 '--pages-source', |
21 default="hierarchy", | 20 default="pages", |
22 choices=['flat', 'shallow', 'hierarchy'], | 21 help="The source to store pages in.") |
23 help="The blog file-system type to use.") | 22 parser.add_argument( |
24 parser.add_argument( | 23 '--posts-source', |
25 '--prefix', | 24 default="posts", |
26 default="wp_", | 25 help="The source to store posts in.") |
27 help="The SQL table prefix. Defaults to `wp_`.") | |
28 parser.add_argument( | 26 parser.add_argument( |
29 '--default-post-layout', | 27 '--default-post-layout', |
30 help="The default layout to use for posts.") | 28 help="The default layout to use for posts.") |
31 parser.add_argument( | 29 parser.add_argument( |
32 '--default-post-category', | 30 '--default-post-category', |
35 '--default-page-layout', | 33 '--default-page-layout', |
36 help="The default layout to use for pages.") | 34 help="The default layout to use for pages.") |
37 parser.add_argument( | 35 parser.add_argument( |
38 '--default-page-category', | 36 '--default-page-category', |
39 help="The default category to use for pages.") | 37 help="The default category to use for pages.") |
40 parser.add_argument( | |
41 'xml_or_db_url', | |
42 help=("The exported XML archive of the Wordpress site, or " | |
43 "the URL of the SQL database.\n" | |
44 "\n" | |
45 "If an SQL database URL, it should be of the " | |
46 "form: type://user:password@server/database\n" | |
47 "\n" | |
48 "For example:\n" | |
49 "mysql://user:password@example.org/my_database")) | |
50 | 38 |
51 def importWebsite(self, app, args): | 39 def importWebsite(self, app, args): |
52 parsed_url = urlparse(args.xml_or_db_url) | 40 impl = self._getImplementation(app, args) |
53 if not parsed_url.scheme: | 41 return impl.importWebsite() |
54 impl = _XmlImporter(app, args) | 42 |
43 def _getImplementation(self, app, args): | |
44 raise NotImplementedError() | |
45 | |
46 | |
47 class _ImporterBase(object): | |
48 def __init__(self, app, args): | |
49 self.app = app | |
50 self._cat_map = {} | |
51 self._author_map = {} | |
52 self._pages_source = app.getSource(args.pages_source) | |
53 self._posts_source = app.getSource(args.posts_source) | |
54 | |
55 def importWebsite(self): | |
56 ctx = self._open() | |
57 | |
58 # Site configuration. | |
59 logger.info("Generating site configuration...") | |
60 site_config = self._getSiteConfig(ctx) | |
61 site_config.setdefault('site', {}) | |
62 site_config['site'].update({ | |
63 'post_url': '%year%/%month%/%slug%', | |
64 'category_url': 'category/%category%'}) | |
65 | |
66 site_config_path = os.path.join(self.app.root_dir, CONFIG_PATH) | |
67 with open(site_config_path, 'r') as fp: | |
68 cfg_data = yaml.load(fp, Loader=ConfigurationLoader) | |
69 | |
70 cfg_data = cfg_data or {} | |
71 merge_dicts(cfg_data, site_config) | |
72 | |
73 with open(site_config_path, 'w') as fp: | |
74 yaml.dump(cfg_data, fp, default_flow_style=False, | |
75 allow_unicode=True, | |
76 Dumper=ConfigurationDumper) | |
77 | |
78 # Content | |
79 for p in self._getPosts(ctx): | |
80 if p['type'] == 'attachment': | |
81 self._createAsset(p) | |
82 else: | |
83 self._createPost(p) | |
84 | |
85 self._close(ctx) | |
86 | |
87 def _open(self): | |
88 raise NotImplementedError() | |
89 | |
90 def _close(self, ctx): | |
91 pass | |
92 | |
93 def _getSiteConfig(self, ctx): | |
94 raise NotImplementedError() | |
95 | |
96 def _getPosts(self, ctx): | |
97 raise NotImplementedError() | |
98 | |
99 def _createAsset(self, asset_info): | |
100 download_asset(self.app, asset_info['url']) | |
101 | |
102 def _createPost(self, post_info): | |
103 post_dt = post_info['datetime'] | |
104 finder = { | |
105 'year': post_dt.year, | |
106 'month': post_dt.month, | |
107 'day': post_dt.day, | |
108 'slug': post_info['slug']} | |
109 if post_info['type'] == 'post': | |
110 source = self._posts_source | |
111 elif post_info['type'] == 'page': | |
112 source = self._pages_source | |
55 else: | 113 else: |
56 impl = _SqlImporter(app, args) | 114 raise Exception("Unknown post type: %s" % post_info['type']) |
57 return impl.importWebsite() | 115 rel_path, fac_metadata = source.findPagePath(finder, MODE_CREATING) |
58 | 116 |
59 | 117 metadata = post_info['metadata'].copy() |
60 class _XmlImporter(object): | 118 for name in ['title', 'author', 'status', 'post_id', 'post_guid', |
119 'description', 'categories']: | |
120 val = post_info.get(name) | |
121 if val is not None: | |
122 metadata[name] = val | |
123 | |
124 content = post_info['content'] | |
125 excerpt = post_info['excerpt'] | |
126 text = content | |
127 if excerpt is not None and excerpt.strip() != '': | |
128 text = "%s\n\n---excerpt---\n\n%s" % (content, excerpt) | |
129 | |
130 path = source.resolveRef(rel_path) | |
131 create_page(self.app, path, metadata, text) | |
132 | |
133 | |
134 class _XmlImporter(_ImporterBase): | |
61 ns_wp = {'wp': 'http://wordpress.org/export/1.2/'} | 135 ns_wp = {'wp': 'http://wordpress.org/export/1.2/'} |
62 ns_dc = {'dc': "http://purl.org/dc/elements/1.1/"} | 136 ns_dc = {'dc': "http://purl.org/dc/elements/1.1/"} |
63 ns_excerpt = {'excerpt': "http://wordpress.org/export/1.2/excerpt/"} | 137 ns_excerpt = {'excerpt': "http://wordpress.org/export/1.2/excerpt/"} |
64 ns_content = {'content': "http://purl.org/rss/1.0/modules/content/"} | 138 ns_content = {'content': "http://purl.org/rss/1.0/modules/content/"} |
65 | 139 |
66 def __init__(self, app, args): | 140 def __init__(self, app, args): |
67 self.app = app | 141 super(_XmlImporter, self).__init__(app, args) |
68 self.path = args.xml_or_db_url | 142 self.path = args.xml_path |
69 self.posts_fs = args.posts_fs | 143 |
70 self._cat_map = {} | 144 def _open(self): |
71 self._author_map = {} | |
72 | |
73 for cls in self.app.plugin_loader.getSources(): | |
74 if cls.SOURCE_NAME == ('posts/%s' % self.posts_fs): | |
75 src_config = { | |
76 'type': 'posts/%s' % self.posts_fs, | |
77 'fs_endpoint': 'posts', | |
78 'data_type': 'blog'} | |
79 self.posts_source = cls(app, 'posts', src_config) | |
80 break | |
81 else: | |
82 raise Exception("No such posts file-system: " % self.posts_fs) | |
83 | |
84 def importWebsite(self): | |
85 if not os.path.exists(self.path): | 145 if not os.path.exists(self.path): |
86 raise Exception("No such file: %s" % self.path) | 146 raise Exception("No such file: %s" % self.path) |
87 | 147 |
88 try: | 148 try: |
89 import xml.etree.ElementTree as ET | 149 import xml.etree.ElementTree as ET |
97 xml = xml.replace(chr(0x1e), '') | 157 xml = xml.replace(chr(0x1e), '') |
98 xml = xml.replace(chr(0x10), '') | 158 xml = xml.replace(chr(0x10), '') |
99 tree = ET.fromstring(xml) | 159 tree = ET.fromstring(xml) |
100 channel = tree.find('channel') | 160 channel = tree.find('channel') |
101 | 161 |
162 return channel | |
163 | |
164 def _getSiteConfig(self, channel): | |
102 # Get basic site information | 165 # Get basic site information |
103 title = find_text(channel, 'title') | 166 title = find_text(channel, 'title') |
104 description = find_text(channel, 'description') | 167 description = find_text(channel, 'description') |
105 site_config = { | 168 site_config = OrderedDict({ |
106 'site': { | 169 'site': { |
107 'title': title, | 170 'title': title, |
108 'description': description, | 171 'description': description} |
109 'posts_fs': self.posts_fs} | 172 }) |
110 } | |
111 logger.info("Importing '%s'" % title) | |
112 | 173 |
113 # Get authors' names. | 174 # Get authors' names. |
114 authors = {} | 175 authors = {} |
115 for a in channel.findall('wp:author', self.ns_wp): | 176 for a in channel.findall('wp:author', self.ns_wp): |
116 login = find_text(a, 'wp:author_login', self.ns_wp) | 177 login = find_text(a, 'wp:author_login', self.ns_wp) |
124 self.ns_wp), | 185 self.ns_wp), |
125 'author_id': find_text(a, 'wp:author_id', | 186 'author_id': find_text(a, 'wp:author_id', |
126 self.ns_wp)} | 187 self.ns_wp)} |
127 site_config['site']['authors'] = authors | 188 site_config['site']['authors'] = authors |
128 | 189 |
129 # Other stuff. | 190 return site_config |
130 site_config['site'].update({ | 191 |
131 'post_url': '%year%/%month%/%slug%', | 192 def _getPosts(self, channel): |
132 'category_url': 'category/%category%'}) | |
133 | |
134 logger.info("Generating site configuration...") | |
135 site_config_path = os.path.join(self.app.root_dir, CONFIG_PATH) | |
136 with open(site_config_path, 'w') as fp: | |
137 yaml.safe_dump(site_config, fp, default_flow_style=False, | |
138 allow_unicode=True) | |
139 | |
140 # Content. | |
141 for i in channel.findall('item'): | 193 for i in channel.findall('item'): |
142 post_type = find_text(i, 'wp:post_type', self.ns_wp) | 194 post_type = find_text(i, 'wp:post_type', self.ns_wp) |
143 if post_type == 'attachment': | 195 if post_type == 'attachment': |
144 self._createAsset(i) | 196 yield self._getAssetInfo(i) |
145 elif post_type == 'post': | 197 else: |
146 self._createPost(i) | 198 yield self._getPostInfo(i) |
147 | 199 |
148 self._cat_map = None | 200 def _getAssetInfo(self, node): |
149 self._author_map = None | |
150 | |
151 def _createAsset(self, node): | |
152 url = find_text(node, 'wp:attachment_url', self.ns_wp) | 201 url = find_text(node, 'wp:attachment_url', self.ns_wp) |
153 download_asset(self.app, url) | 202 return {'type': 'attachment', 'url': url} |
154 | 203 |
155 def _getPageMetadata(self, node): | 204 def _getPostInfo(self, node): |
205 post_date_str = find_text(node, 'wp:post_date', self.ns_wp) | |
206 post_date = datetime.datetime.strptime(post_date_str, | |
207 '%Y-%m-%d %H:%M:%S') | |
208 post_name = find_text(node, 'wp:post_name', self.ns_wp) | |
209 post_type = find_text(node, 'wp:post_type', self.ns_wp) | |
210 post_info = { | |
211 'type': post_type, | |
212 'slug': post_name, | |
213 'datetime': post_date} | |
214 | |
156 title = find_text(node, 'title') | 215 title = find_text(node, 'title') |
157 creator = find_text(node, 'dc:creator', self.ns_dc) | 216 creator = find_text(node, 'dc:creator', self.ns_dc) |
158 status = find_text(node, 'wp:status', self.ns_wp) | 217 status = find_text(node, 'wp:status', self.ns_wp) |
159 post_id = find_text(node, 'wp:post_id', self.ns_wp) | 218 post_id = find_text(node, 'wp:post_id', self.ns_wp) |
160 guid = find_text(node, 'guid') | 219 guid = find_text(node, 'guid') |
161 description = find_text(node, 'description') | 220 description = find_text(node, 'description') |
162 # TODO: menu order, parent, password, sticky | 221 # TODO: menu order, parent, password, sticky |
163 | 222 post_info.update({ |
164 categories = [] | |
165 for c in node.findall('category'): | |
166 nicename = str(c.attrib.get('nicename')) | |
167 categories.append(nicename) | |
168 | |
169 metadata = { | |
170 'title': title, | 223 'title': title, |
171 'author': creator, | 224 'author': creator, |
172 'status': status, | 225 'status': status, |
173 'post_id': post_id, | 226 'post_id': post_id, |
174 'post_guid': guid, | 227 'post_guid': guid, |
175 'description': description, | 228 'description': description}) |
176 'categories': categories} | 229 |
177 | 230 categories = [] |
231 for c in node.findall('category'): | |
232 nicename = str(c.attrib.get('nicename')) | |
233 categories.append(nicename) | |
234 post_info['categories'] = categories | |
235 | |
236 metadata = {} | |
178 for m in node.findall('wp:postmeta', self.ns_wp): | 237 for m in node.findall('wp:postmeta', self.ns_wp): |
179 key = find_text(m, 'wp:meta_key', self.ns_wp) | 238 key = find_text(m, 'wp:meta_key', self.ns_wp) |
180 metadata[key] = find_text(m, 'wp:meta_value', self.ns_wp) | 239 metadata[key] = find_text(m, 'wp:meta_value', self.ns_wp) |
181 | 240 post_info['metadata'] = metadata |
182 return metadata | 241 |
183 | |
184 def _getPageContents(self, node): | |
185 content = find_text(node, 'content:encoded', self.ns_content) | 242 content = find_text(node, 'content:encoded', self.ns_content) |
186 excerpt = find_text(node, 'excerpt:encoded', self.ns_excerpt) | 243 excerpt = find_text(node, 'excerpt:encoded', self.ns_excerpt) |
187 if not excerpt.strip(): | 244 post_info.update({ |
188 return content | 245 'content': content, |
189 return "%s\n\n---excerpt---\n\n%s" % (content, excerpt) | 246 'excerpt': excerpt}) |
190 | 247 |
191 def _getPageInfo(self, node): | 248 return post_info |
192 url = find_text(node, 'link') | 249 |
193 post_date_str = find_text(node, 'wp:post_date', self.ns_wp) | 250 |
194 post_date = datetime.datetime.strptime(post_date_str, | 251 class WordpressXmlImporter(WordpressImporterBase): |
195 '%Y-%m-%d %H:%M:%S') | 252 name = 'wordpress-xml' |
196 post_name = find_text(node, 'wp:post_name', self.ns_wp) | 253 description = "Imports a Wordpress blog from an exported XML archive." |
197 return { | 254 |
198 'url': url, | 255 def setupParser(self, parser, app): |
199 'slug': post_name, | 256 super(WordpressXmlImporter, self).setupParser(parser, app) |
200 'datetime': post_date} | 257 parser.add_argument( |
201 | 258 'xml_path', |
202 def _createPage(self, node): | 259 help="The path to the exported XML archive file.") |
203 info = self._getPageInfo(node) | 260 |
204 rel_path = os.path.join('pages', info['slug']) | 261 def _getImplementation(self, app, args): |
205 metadata = self._getPageMetadata(node) | 262 return _XmlImporter(app, args) |
206 contents = self._getPageContents(node) | |
207 create_page(self.app, rel_path, metadata, contents) | |
208 | |
209 def _createPost(self, node): | |
210 info = self._getPageInfo(node) | |
211 post_dt = info['datetime'] | |
212 finder = { | |
213 'year': post_dt.year, | |
214 'month': post_dt.month, | |
215 'day': post_dt.day, | |
216 'slug': info['slug']} | |
217 rel_path, fac_metadata = self.posts_source.findPagePath( | |
218 finder, MODE_CREATING) | |
219 rel_path = os.path.join('posts', rel_path) | |
220 metadata = self._getPageMetadata(node) | |
221 contents = self._getPageContents(node) | |
222 create_page(self.app, rel_path, metadata, contents) | |
223 | |
224 | |
225 class _SqlImporter(object): | |
226 def __init__(self, app, args): | |
227 self.app = app | |
228 self.db_url = args.xml_or_db_url | |
229 self.prefix = args.prefix | |
230 | |
231 def importWebsite(self): | |
232 raise NotImplementedError() | |
233 | 263 |
234 | 264 |
235 def find_text(parent, child_name, namespaces=None): | 265 def find_text(parent, child_name, namespaces=None): |
236 return str(parent.find(child_name, namespaces).text) | 266 return str(parent.find(child_name, namespaces).text) |
237 | 267 |