comparison piecrust/importing/wordpress.py @ 302:103abb08755e

import: Make the Wordpress importer extensible and rename it to `wordpressxml`. The SQL importer will be installable as a plugin, so that SQL dependencies are not imposed on the basic PieCrust package.
author Ludovic Chabant <ludovic@chabant.com>
date Thu, 19 Mar 2015 18:30:09 -0700
parents 2daa05a21026
children 1ddd18ad5e76
comparison
equal deleted inserted replaced
301:45aba3cb7228 302:103abb08755e
1 import os.path 1 import os.path
2 import logging 2 import logging
3 import datetime 3 import datetime
4 import yaml 4 import yaml
5 from urllib.parse import urlparse 5 from collections import OrderedDict
6 from piecrust import CONFIG_PATH 6 from piecrust import CONFIG_PATH
7 from piecrust.configuration import (
8 ConfigurationLoader, ConfigurationDumper, merge_dicts)
7 from piecrust.importing.base import Importer, create_page, download_asset 9 from piecrust.importing.base import Importer, create_page, download_asset
8 from piecrust.sources.base import MODE_CREATING 10 from piecrust.sources.base import MODE_CREATING
9 11
10 12
11 logger = logging.getLogger(__name__) 13 logger = logging.getLogger(__name__)
12 14
13 15
14 class WordpressImporter(Importer): 16 class WordpressImporterBase(Importer):
15 name = 'wordpress'
16 description = "Imports a Wordpress blog."
17
18 def setupParser(self, parser, app): 17 def setupParser(self, parser, app):
19 parser.add_argument( 18 parser.add_argument(
20 '--posts_fs', 19 '--pages-source',
21 default="hierarchy", 20 default="pages",
22 choices=['flat', 'shallow', 'hierarchy'], 21 help="The source to store pages in.")
23 help="The blog file-system type to use.") 22 parser.add_argument(
24 parser.add_argument( 23 '--posts-source',
25 '--prefix', 24 default="posts",
26 default="wp_", 25 help="The source to store posts in.")
27 help="The SQL table prefix. Defaults to `wp_`.")
28 parser.add_argument( 26 parser.add_argument(
29 '--default-post-layout', 27 '--default-post-layout',
30 help="The default layout to use for posts.") 28 help="The default layout to use for posts.")
31 parser.add_argument( 29 parser.add_argument(
32 '--default-post-category', 30 '--default-post-category',
35 '--default-page-layout', 33 '--default-page-layout',
36 help="The default layout to use for pages.") 34 help="The default layout to use for pages.")
37 parser.add_argument( 35 parser.add_argument(
38 '--default-page-category', 36 '--default-page-category',
39 help="The default category to use for pages.") 37 help="The default category to use for pages.")
40 parser.add_argument(
41 'xml_or_db_url',
42 help=("The exported XML archive of the Wordpress site, or "
43 "the URL of the SQL database.\n"
44 "\n"
45 "If an SQL database URL, it should be of the "
46 "form: type://user:password@server/database\n"
47 "\n"
48 "For example:\n"
49 "mysql://user:password@example.org/my_database"))
50 38
51 def importWebsite(self, app, args): 39 def importWebsite(self, app, args):
52 parsed_url = urlparse(args.xml_or_db_url) 40 impl = self._getImplementation(app, args)
53 if not parsed_url.scheme: 41 return impl.importWebsite()
54 impl = _XmlImporter(app, args) 42
43 def _getImplementation(self, app, args):
44 raise NotImplementedError()
45
46
47 class _ImporterBase(object):
48 def __init__(self, app, args):
49 self.app = app
50 self._cat_map = {}
51 self._author_map = {}
52 self._pages_source = app.getSource(args.pages_source)
53 self._posts_source = app.getSource(args.posts_source)
54
55 def importWebsite(self):
56 ctx = self._open()
57
58 # Site configuration.
59 logger.info("Generating site configuration...")
60 site_config = self._getSiteConfig(ctx)
61 site_config.setdefault('site', {})
62 site_config['site'].update({
63 'post_url': '%year%/%month%/%slug%',
64 'category_url': 'category/%category%'})
65
66 site_config_path = os.path.join(self.app.root_dir, CONFIG_PATH)
67 with open(site_config_path, 'r') as fp:
68 cfg_data = yaml.load(fp, Loader=ConfigurationLoader)
69
70 cfg_data = cfg_data or {}
71 merge_dicts(cfg_data, site_config)
72
73 with open(site_config_path, 'w') as fp:
74 yaml.dump(cfg_data, fp, default_flow_style=False,
75 allow_unicode=True,
76 Dumper=ConfigurationDumper)
77
78 # Content
79 for p in self._getPosts(ctx):
80 if p['type'] == 'attachment':
81 self._createAsset(p)
82 else:
83 self._createPost(p)
84
85 self._close(ctx)
86
87 def _open(self):
88 raise NotImplementedError()
89
90 def _close(self, ctx):
91 pass
92
93 def _getSiteConfig(self, ctx):
94 raise NotImplementedError()
95
96 def _getPosts(self, ctx):
97 raise NotImplementedError()
98
99 def _createAsset(self, asset_info):
100 download_asset(self.app, asset_info['url'])
101
102 def _createPost(self, post_info):
103 post_dt = post_info['datetime']
104 finder = {
105 'year': post_dt.year,
106 'month': post_dt.month,
107 'day': post_dt.day,
108 'slug': post_info['slug']}
109 if post_info['type'] == 'post':
110 source = self._posts_source
111 elif post_info['type'] == 'page':
112 source = self._pages_source
55 else: 113 else:
56 impl = _SqlImporter(app, args) 114 raise Exception("Unknown post type: %s" % post_info['type'])
57 return impl.importWebsite() 115 rel_path, fac_metadata = source.findPagePath(finder, MODE_CREATING)
58 116
59 117 metadata = post_info['metadata'].copy()
60 class _XmlImporter(object): 118 for name in ['title', 'author', 'status', 'post_id', 'post_guid',
119 'description', 'categories']:
120 val = post_info.get(name)
121 if val is not None:
122 metadata[name] = val
123
124 content = post_info['content']
125 excerpt = post_info['excerpt']
126 text = content
127 if excerpt is not None and excerpt.strip() != '':
128 text = "%s\n\n---excerpt---\n\n%s" % (content, excerpt)
129
130 path = source.resolveRef(rel_path)
131 create_page(self.app, path, metadata, text)
132
133
134 class _XmlImporter(_ImporterBase):
61 ns_wp = {'wp': 'http://wordpress.org/export/1.2/'} 135 ns_wp = {'wp': 'http://wordpress.org/export/1.2/'}
62 ns_dc = {'dc': "http://purl.org/dc/elements/1.1/"} 136 ns_dc = {'dc': "http://purl.org/dc/elements/1.1/"}
63 ns_excerpt = {'excerpt': "http://wordpress.org/export/1.2/excerpt/"} 137 ns_excerpt = {'excerpt': "http://wordpress.org/export/1.2/excerpt/"}
64 ns_content = {'content': "http://purl.org/rss/1.0/modules/content/"} 138 ns_content = {'content': "http://purl.org/rss/1.0/modules/content/"}
65 139
66 def __init__(self, app, args): 140 def __init__(self, app, args):
67 self.app = app 141 super(_XmlImporter, self).__init__(app, args)
68 self.path = args.xml_or_db_url 142 self.path = args.xml_path
69 self.posts_fs = args.posts_fs 143
70 self._cat_map = {} 144 def _open(self):
71 self._author_map = {}
72
73 for cls in self.app.plugin_loader.getSources():
74 if cls.SOURCE_NAME == ('posts/%s' % self.posts_fs):
75 src_config = {
76 'type': 'posts/%s' % self.posts_fs,
77 'fs_endpoint': 'posts',
78 'data_type': 'blog'}
79 self.posts_source = cls(app, 'posts', src_config)
80 break
81 else:
82 raise Exception("No such posts file-system: " % self.posts_fs)
83
84 def importWebsite(self):
85 if not os.path.exists(self.path): 145 if not os.path.exists(self.path):
86 raise Exception("No such file: %s" % self.path) 146 raise Exception("No such file: %s" % self.path)
87 147
88 try: 148 try:
89 import xml.etree.ElementTree as ET 149 import xml.etree.ElementTree as ET
97 xml = xml.replace(chr(0x1e), '') 157 xml = xml.replace(chr(0x1e), '')
98 xml = xml.replace(chr(0x10), '') 158 xml = xml.replace(chr(0x10), '')
99 tree = ET.fromstring(xml) 159 tree = ET.fromstring(xml)
100 channel = tree.find('channel') 160 channel = tree.find('channel')
101 161
162 return channel
163
164 def _getSiteConfig(self, channel):
102 # Get basic site information 165 # Get basic site information
103 title = find_text(channel, 'title') 166 title = find_text(channel, 'title')
104 description = find_text(channel, 'description') 167 description = find_text(channel, 'description')
105 site_config = { 168 site_config = OrderedDict({
106 'site': { 169 'site': {
107 'title': title, 170 'title': title,
108 'description': description, 171 'description': description}
109 'posts_fs': self.posts_fs} 172 })
110 }
111 logger.info("Importing '%s'" % title)
112 173
113 # Get authors' names. 174 # Get authors' names.
114 authors = {} 175 authors = {}
115 for a in channel.findall('wp:author', self.ns_wp): 176 for a in channel.findall('wp:author', self.ns_wp):
116 login = find_text(a, 'wp:author_login', self.ns_wp) 177 login = find_text(a, 'wp:author_login', self.ns_wp)
124 self.ns_wp), 185 self.ns_wp),
125 'author_id': find_text(a, 'wp:author_id', 186 'author_id': find_text(a, 'wp:author_id',
126 self.ns_wp)} 187 self.ns_wp)}
127 site_config['site']['authors'] = authors 188 site_config['site']['authors'] = authors
128 189
129 # Other stuff. 190 return site_config
130 site_config['site'].update({ 191
131 'post_url': '%year%/%month%/%slug%', 192 def _getPosts(self, channel):
132 'category_url': 'category/%category%'})
133
134 logger.info("Generating site configuration...")
135 site_config_path = os.path.join(self.app.root_dir, CONFIG_PATH)
136 with open(site_config_path, 'w') as fp:
137 yaml.safe_dump(site_config, fp, default_flow_style=False,
138 allow_unicode=True)
139
140 # Content.
141 for i in channel.findall('item'): 193 for i in channel.findall('item'):
142 post_type = find_text(i, 'wp:post_type', self.ns_wp) 194 post_type = find_text(i, 'wp:post_type', self.ns_wp)
143 if post_type == 'attachment': 195 if post_type == 'attachment':
144 self._createAsset(i) 196 yield self._getAssetInfo(i)
145 elif post_type == 'post': 197 else:
146 self._createPost(i) 198 yield self._getPostInfo(i)
147 199
148 self._cat_map = None 200 def _getAssetInfo(self, node):
149 self._author_map = None
150
151 def _createAsset(self, node):
152 url = find_text(node, 'wp:attachment_url', self.ns_wp) 201 url = find_text(node, 'wp:attachment_url', self.ns_wp)
153 download_asset(self.app, url) 202 return {'type': 'attachment', 'url': url}
154 203
155 def _getPageMetadata(self, node): 204 def _getPostInfo(self, node):
205 post_date_str = find_text(node, 'wp:post_date', self.ns_wp)
206 post_date = datetime.datetime.strptime(post_date_str,
207 '%Y-%m-%d %H:%M:%S')
208 post_name = find_text(node, 'wp:post_name', self.ns_wp)
209 post_type = find_text(node, 'wp:post_type', self.ns_wp)
210 post_info = {
211 'type': post_type,
212 'slug': post_name,
213 'datetime': post_date}
214
156 title = find_text(node, 'title') 215 title = find_text(node, 'title')
157 creator = find_text(node, 'dc:creator', self.ns_dc) 216 creator = find_text(node, 'dc:creator', self.ns_dc)
158 status = find_text(node, 'wp:status', self.ns_wp) 217 status = find_text(node, 'wp:status', self.ns_wp)
159 post_id = find_text(node, 'wp:post_id', self.ns_wp) 218 post_id = find_text(node, 'wp:post_id', self.ns_wp)
160 guid = find_text(node, 'guid') 219 guid = find_text(node, 'guid')
161 description = find_text(node, 'description') 220 description = find_text(node, 'description')
162 # TODO: menu order, parent, password, sticky 221 # TODO: menu order, parent, password, sticky
163 222 post_info.update({
164 categories = []
165 for c in node.findall('category'):
166 nicename = str(c.attrib.get('nicename'))
167 categories.append(nicename)
168
169 metadata = {
170 'title': title, 223 'title': title,
171 'author': creator, 224 'author': creator,
172 'status': status, 225 'status': status,
173 'post_id': post_id, 226 'post_id': post_id,
174 'post_guid': guid, 227 'post_guid': guid,
175 'description': description, 228 'description': description})
176 'categories': categories} 229
177 230 categories = []
231 for c in node.findall('category'):
232 nicename = str(c.attrib.get('nicename'))
233 categories.append(nicename)
234 post_info['categories'] = categories
235
236 metadata = {}
178 for m in node.findall('wp:postmeta', self.ns_wp): 237 for m in node.findall('wp:postmeta', self.ns_wp):
179 key = find_text(m, 'wp:meta_key', self.ns_wp) 238 key = find_text(m, 'wp:meta_key', self.ns_wp)
180 metadata[key] = find_text(m, 'wp:meta_value', self.ns_wp) 239 metadata[key] = find_text(m, 'wp:meta_value', self.ns_wp)
181 240 post_info['metadata'] = metadata
182 return metadata 241
183
184 def _getPageContents(self, node):
185 content = find_text(node, 'content:encoded', self.ns_content) 242 content = find_text(node, 'content:encoded', self.ns_content)
186 excerpt = find_text(node, 'excerpt:encoded', self.ns_excerpt) 243 excerpt = find_text(node, 'excerpt:encoded', self.ns_excerpt)
187 if not excerpt.strip(): 244 post_info.update({
188 return content 245 'content': content,
189 return "%s\n\n---excerpt---\n\n%s" % (content, excerpt) 246 'excerpt': excerpt})
190 247
191 def _getPageInfo(self, node): 248 return post_info
192 url = find_text(node, 'link') 249
193 post_date_str = find_text(node, 'wp:post_date', self.ns_wp) 250
194 post_date = datetime.datetime.strptime(post_date_str, 251 class WordpressXmlImporter(WordpressImporterBase):
195 '%Y-%m-%d %H:%M:%S') 252 name = 'wordpress-xml'
196 post_name = find_text(node, 'wp:post_name', self.ns_wp) 253 description = "Imports a Wordpress blog from an exported XML archive."
197 return { 254
198 'url': url, 255 def setupParser(self, parser, app):
199 'slug': post_name, 256 super(WordpressXmlImporter, self).setupParser(parser, app)
200 'datetime': post_date} 257 parser.add_argument(
201 258 'xml_path',
202 def _createPage(self, node): 259 help="The path to the exported XML archive file.")
203 info = self._getPageInfo(node) 260
204 rel_path = os.path.join('pages', info['slug']) 261 def _getImplementation(self, app, args):
205 metadata = self._getPageMetadata(node) 262 return _XmlImporter(app, args)
206 contents = self._getPageContents(node)
207 create_page(self.app, rel_path, metadata, contents)
208
209 def _createPost(self, node):
210 info = self._getPageInfo(node)
211 post_dt = info['datetime']
212 finder = {
213 'year': post_dt.year,
214 'month': post_dt.month,
215 'day': post_dt.day,
216 'slug': info['slug']}
217 rel_path, fac_metadata = self.posts_source.findPagePath(
218 finder, MODE_CREATING)
219 rel_path = os.path.join('posts', rel_path)
220 metadata = self._getPageMetadata(node)
221 contents = self._getPageContents(node)
222 create_page(self.app, rel_path, metadata, contents)
223
224
225 class _SqlImporter(object):
226 def __init__(self, app, args):
227 self.app = app
228 self.db_url = args.xml_or_db_url
229 self.prefix = args.prefix
230
231 def importWebsite(self):
232 raise NotImplementedError()
233 263
234 264
235 def find_text(parent, child_name, namespaces=None): 265 def find_text(parent, child_name, namespaces=None):
236 return str(parent.find(child_name, namespaces).text) 266 return str(parent.find(child_name, namespaces).text)
237 267