comparison piecrust/importing/wordpress.py @ 300:2daa05a21026

import: Add an XML-based Wordpress importer.
author Ludovic Chabant <ludovic@chabant.com>
date Wed, 11 Mar 2015 23:48:35 -0700
parents
children 103abb08755e
comparison
equal deleted inserted replaced
299:88bffd469b04 300:2daa05a21026
1 import os.path
2 import logging
3 import datetime
4 import yaml
5 from urllib.parse import urlparse
6 from piecrust import CONFIG_PATH
7 from piecrust.importing.base import Importer, create_page, download_asset
8 from piecrust.sources.base import MODE_CREATING
9
10
11 logger = logging.getLogger(__name__)
12
13
class WordpressImporter(Importer):
    """Importer for Wordpress blogs.

    The single positional argument is either the path to an exported XML
    archive or the URL of an SQL database; `importWebsite` picks the
    matching implementation class and delegates to it.
    """
    name = 'wordpress'
    description = "Imports a Wordpress blog."

    def setupParser(self, parser, app):
        """Register this importer's command-line arguments on `parser`.

        `app` is accepted for interface compatibility and is not used here.
        """
        parser.add_argument(
            '--posts_fs',
            default="hierarchy",
            choices=['flat', 'shallow', 'hierarchy'],
            help="The blog file-system type to use.")
        parser.add_argument(
            '--prefix',
            default="wp_",
            help="The SQL table prefix. Defaults to `wp_`.")
        parser.add_argument(
            '--default-post-layout',
            help="The default layout to use for posts.")
        parser.add_argument(
            '--default-post-category',
            help="The default category to use for posts.")
        parser.add_argument(
            '--default-page-layout',
            help="The default layout to use for pages.")
        parser.add_argument(
            '--default-page-category',
            help="The default category to use for pages.")
        parser.add_argument(
            'xml_or_db_url',
            help=("The exported XML archive of the Wordpress site, or "
                  "the URL of the SQL database.\n"
                  "\n"
                  "If an SQL database URL, it should be of the "
                  "form: type://user:password@server/database\n"
                  "\n"
                  "For example:\n"
                  "mysql://user:password@example.org/my_database"))

    def importWebsite(self, app, args):
        """Run the import and return whatever the implementation returns.

        The XML importer is used when `args.xml_or_db_url` is an existing
        path, or when it has no URL scheme (or only a one-letter one);
        anything else is treated as a database URL and handed to the SQL
        importer.
        """
        source = args.xml_or_db_url
        scheme = urlparse(source).scheme
        # `urlparse` reports a Windows drive letter as a one-letter scheme,
        # so a path like "C:/export.xml" must not be mistaken for a
        # database URL.
        if os.path.exists(source) or len(scheme) <= 1:
            impl = _XmlImporter(app, args)
        else:
            impl = _SqlImporter(app, args)
        return impl.importWebsite()
58
59
60 class _XmlImporter(object):
61 ns_wp = {'wp': 'http://wordpress.org/export/1.2/'}
62 ns_dc = {'dc': "http://purl.org/dc/elements/1.1/"}
63 ns_excerpt = {'excerpt': "http://wordpress.org/export/1.2/excerpt/"}
64 ns_content = {'content': "http://purl.org/rss/1.0/modules/content/"}
65
66 def __init__(self, app, args):
67 self.app = app
68 self.path = args.xml_or_db_url
69 self.posts_fs = args.posts_fs
70 self._cat_map = {}
71 self._author_map = {}
72
73 for cls in self.app.plugin_loader.getSources():
74 if cls.SOURCE_NAME == ('posts/%s' % self.posts_fs):
75 src_config = {
76 'type': 'posts/%s' % self.posts_fs,
77 'fs_endpoint': 'posts',
78 'data_type': 'blog'}
79 self.posts_source = cls(app, 'posts', src_config)
80 break
81 else:
82 raise Exception("No such posts file-system: " % self.posts_fs)
83
84 def importWebsite(self):
85 if not os.path.exists(self.path):
86 raise Exception("No such file: %s" % self.path)
87
88 try:
89 import xml.etree.ElementTree as ET
90 except ImportError:
91 logger.error("You don't seem to have any support for ElementTree "
92 "XML parsing.")
93 return 1
94
95 with open(self.path, 'r', encoding='utf8') as fp:
96 xml = fp.read()
97 xml = xml.replace(chr(0x1e), '')
98 xml = xml.replace(chr(0x10), '')
99 tree = ET.fromstring(xml)
100 channel = tree.find('channel')
101
102 # Get basic site information
103 title = find_text(channel, 'title')
104 description = find_text(channel, 'description')
105 site_config = {
106 'site': {
107 'title': title,
108 'description': description,
109 'posts_fs': self.posts_fs}
110 }
111 logger.info("Importing '%s'" % title)
112
113 # Get authors' names.
114 authors = {}
115 for a in channel.findall('wp:author', self.ns_wp):
116 login = find_text(a, 'wp:author_login', self.ns_wp)
117 authors[login] = {
118 'email': find_text(a, 'wp:author_email', self.ns_wp),
119 'display_name': find_text(a, 'wp:author_display_name',
120 self.ns_wp),
121 'first_name': find_text(a, 'wp:author_first_name',
122 self.ns_wp),
123 'last_name': find_text(a, 'wp:author_last_name',
124 self.ns_wp),
125 'author_id': find_text(a, 'wp:author_id',
126 self.ns_wp)}
127 site_config['site']['authors'] = authors
128
129 # Other stuff.
130 site_config['site'].update({
131 'post_url': '%year%/%month%/%slug%',
132 'category_url': 'category/%category%'})
133
134 logger.info("Generating site configuration...")
135 site_config_path = os.path.join(self.app.root_dir, CONFIG_PATH)
136 with open(site_config_path, 'w') as fp:
137 yaml.safe_dump(site_config, fp, default_flow_style=False,
138 allow_unicode=True)
139
140 # Content.
141 for i in channel.findall('item'):
142 post_type = find_text(i, 'wp:post_type', self.ns_wp)
143 if post_type == 'attachment':
144 self._createAsset(i)
145 elif post_type == 'post':
146 self._createPost(i)
147
148 self._cat_map = None
149 self._author_map = None
150
151 def _createAsset(self, node):
152 url = find_text(node, 'wp:attachment_url', self.ns_wp)
153 download_asset(self.app, url)
154
155 def _getPageMetadata(self, node):
156 title = find_text(node, 'title')
157 creator = find_text(node, 'dc:creator', self.ns_dc)
158 status = find_text(node, 'wp:status', self.ns_wp)
159 post_id = find_text(node, 'wp:post_id', self.ns_wp)
160 guid = find_text(node, 'guid')
161 description = find_text(node, 'description')
162 # TODO: menu order, parent, password, sticky
163
164 categories = []
165 for c in node.findall('category'):
166 nicename = str(c.attrib.get('nicename'))
167 categories.append(nicename)
168
169 metadata = {
170 'title': title,
171 'author': creator,
172 'status': status,
173 'post_id': post_id,
174 'post_guid': guid,
175 'description': description,
176 'categories': categories}
177
178 for m in node.findall('wp:postmeta', self.ns_wp):
179 key = find_text(m, 'wp:meta_key', self.ns_wp)
180 metadata[key] = find_text(m, 'wp:meta_value', self.ns_wp)
181
182 return metadata
183
184 def _getPageContents(self, node):
185 content = find_text(node, 'content:encoded', self.ns_content)
186 excerpt = find_text(node, 'excerpt:encoded', self.ns_excerpt)
187 if not excerpt.strip():
188 return content
189 return "%s\n\n---excerpt---\n\n%s" % (content, excerpt)
190
191 def _getPageInfo(self, node):
192 url = find_text(node, 'link')
193 post_date_str = find_text(node, 'wp:post_date', self.ns_wp)
194 post_date = datetime.datetime.strptime(post_date_str,
195 '%Y-%m-%d %H:%M:%S')
196 post_name = find_text(node, 'wp:post_name', self.ns_wp)
197 return {
198 'url': url,
199 'slug': post_name,
200 'datetime': post_date}
201
202 def _createPage(self, node):
203 info = self._getPageInfo(node)
204 rel_path = os.path.join('pages', info['slug'])
205 metadata = self._getPageMetadata(node)
206 contents = self._getPageContents(node)
207 create_page(self.app, rel_path, metadata, contents)
208
209 def _createPost(self, node):
210 info = self._getPageInfo(node)
211 post_dt = info['datetime']
212 finder = {
213 'year': post_dt.year,
214 'month': post_dt.month,
215 'day': post_dt.day,
216 'slug': info['slug']}
217 rel_path, fac_metadata = self.posts_source.findPagePath(
218 finder, MODE_CREATING)
219 rel_path = os.path.join('posts', rel_path)
220 metadata = self._getPageMetadata(node)
221 contents = self._getPageContents(node)
222 create_page(self.app, rel_path, metadata, contents)
223
224
225 class _SqlImporter(object):
226 def __init__(self, app, args):
227 self.app = app
228 self.db_url = args.xml_or_db_url
229 self.prefix = args.prefix
230
231 def importWebsite(self):
232 raise NotImplementedError()
233
234
def find_text(parent, child_name, namespaces=None):
    """Return the text of the first `child_name` child of `parent`.

    `namespaces` is an optional prefix-to-URI mapping, passed through to
    `parent.find`. An empty string is returned when there is no such child
    or when the child has no text, so callers never receive the literal
    string 'None' and never hit an AttributeError on a missing element.
    """
    node = parent.find(child_name, namespaces)
    if node is None or node.text is None:
        return ''
    return str(node.text)
237