Mercurial > piecrust2
comparison piecrust/importing/wordpress.py @ 300:2daa05a21026
import: Add an XML-based Wordpress importer.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Wed, 11 Mar 2015 23:48:35 -0700 |
parents | |
children | 103abb08755e |
comparison
equal
deleted
inserted
replaced
299:88bffd469b04 | 300:2daa05a21026 |
---|---|
1 import os.path | |
2 import logging | |
3 import datetime | |
4 import yaml | |
5 from urllib.parse import urlparse | |
6 from piecrust import CONFIG_PATH | |
7 from piecrust.importing.base import Importer, create_page, download_asset | |
8 from piecrust.sources.base import MODE_CREATING | |
9 | |
10 | |
11 logger = logging.getLogger(__name__) | |
12 | |
13 | |
class WordpressImporter(Importer):
    """Importer front-end for Wordpress sites.

    Registers the command-line options and dispatches to either the XML
    importer or the SQL importer depending on what the positional
    `xml_or_db_url` argument looks like.
    """
    name = 'wordpress'
    description = "Imports a Wordpress blog."

    def setupParser(self, parser, app):
        """Declare the command-line arguments understood by this importer.

        `parser` is expected to expose an argparse-style `add_argument`.
        """
        parser.add_argument(
            '--posts_fs',
            default="hierarchy",
            choices=['flat', 'shallow', 'hierarchy'],
            help="The blog file-system type to use.")
        parser.add_argument(
            '--prefix',
            default="wp_",
            help="The SQL table prefix. Defaults to `wp_`.")
        parser.add_argument(
            '--default-post-layout',
            help="The default layout to use for posts.")
        parser.add_argument(
            '--default-post-category',
            help="The default category to use for posts.")
        parser.add_argument(
            '--default-page-layout',
            help="The default layout to use for pages.")
        parser.add_argument(
            '--default-page-category',
            help="The default category to use for pages.")
        parser.add_argument(
            'xml_or_db_url',
            help=("The exported XML archive of the Wordpress site, or "
                  "the URL of the SQL database.\n"
                  "\n"
                  "If an SQL database URL, it should be of the "
                  "form: type://user:password@server/database\n"
                  "\n"
                  "For example:\n"
                  "mysql://user:password@example.org/my_database"))

    def importWebsite(self, app, args):
        """Run the import and return whatever the chosen importer returns.

        The argument is treated as a path to an XML export unless it parses
        as a URL with a real scheme, in which case the SQL importer is used.
        """
        scheme = urlparse(args.xml_or_db_url).scheme
        # `urlparse` reports the drive letter of a Windows path such as
        # `C:\export.xml` as a one-character scheme. No database URL type
        # is a single letter, so treat those as file paths too.
        if len(scheme) <= 1:
            impl = _XmlImporter(app, args)
        else:
            impl = _SqlImporter(app, args)
        return impl.importWebsite()
58 | |
59 | |
class _XmlImporter(object):
    """Imports a Wordpress site from an exported XML archive.

    The site configuration is written to the app's config file, attachments
    are handed to `download_asset`, and posts are written with `create_page`
    at the location chosen by the selected posts source.
    """
    # Namespace prefix maps passed to ElementTree's `find`/`findall`.
    ns_wp = {'wp': 'http://wordpress.org/export/1.2/'}
    ns_dc = {'dc': "http://purl.org/dc/elements/1.1/"}
    ns_excerpt = {'excerpt': "http://wordpress.org/export/1.2/excerpt/"}
    ns_content = {'content': "http://purl.org/rss/1.0/modules/content/"}

    def __init__(self, app, args):
        """Remember the import settings and instantiate the posts source.

        Raises an `Exception` if no registered source matches the
        requested `posts_fs` file-system type.
        """
        self.app = app
        self.path = args.xml_or_db_url
        self.posts_fs = args.posts_fs
        self._cat_map = {}
        self._author_map = {}

        source_name = 'posts/%s' % self.posts_fs
        for cls in self.app.plugin_loader.getSources():
            if cls.SOURCE_NAME == source_name:
                src_config = {
                    'type': source_name,
                    'fs_endpoint': 'posts',
                    'data_type': 'blog'}
                self.posts_source = cls(app, 'posts', src_config)
                break
        else:
            # The message needs its own `%s` placeholder: without one the
            # `%` operator itself fails with a TypeError.
            raise Exception(
                "No such posts file-system: %s" % self.posts_fs)

    def importWebsite(self):
        """Import the whole site from the XML file at `self.path`.

        Returns 1 if ElementTree is unavailable, and None otherwise.
        Raises an `Exception` if the file does not exist or has no
        `channel` element.
        """
        if not os.path.exists(self.path):
            raise Exception("No such file: %s" % self.path)

        try:
            import xml.etree.ElementTree as ET
        except ImportError:
            logger.error("You don't seem to have any support for ElementTree "
                         "XML parsing.")
            return 1

        with open(self.path, 'r', encoding='utf8') as fp:
            xml = fp.read()
        # Strip two control characters before parsing.
        # NOTE(review): presumably these appear in real exports and make
        # the XML parser fail -- confirm.
        xml = xml.replace(chr(0x1e), '')
        xml = xml.replace(chr(0x10), '')
        tree = ET.fromstring(xml)
        channel = tree.find('channel')
        if channel is None:
            raise Exception(
                "No 'channel' element found in: %s" % self.path)

        # Get basic site information
        title = find_text(channel, 'title')
        description = find_text(channel, 'description')
        site_config = {
            'site': {
                'title': title,
                'description': description,
                'posts_fs': self.posts_fs}
        }
        logger.info("Importing '%s'", title)

        # Get authors' names.
        authors = {}
        for a in channel.findall('wp:author', self.ns_wp):
            login = find_text(a, 'wp:author_login', self.ns_wp)
            authors[login] = {
                'email': find_text(a, 'wp:author_email', self.ns_wp),
                'display_name': find_text(a, 'wp:author_display_name',
                                          self.ns_wp),
                'first_name': find_text(a, 'wp:author_first_name',
                                        self.ns_wp),
                'last_name': find_text(a, 'wp:author_last_name',
                                       self.ns_wp),
                'author_id': find_text(a, 'wp:author_id',
                                       self.ns_wp)}
        site_config['site']['authors'] = authors

        # Other stuff.
        site_config['site'].update({
            'post_url': '%year%/%month%/%slug%',
            'category_url': 'category/%category%'})

        logger.info("Generating site configuration...")
        site_config_path = os.path.join(self.app.root_dir, CONFIG_PATH)
        # `allow_unicode=True` emits non-ASCII characters verbatim, so the
        # file is written as UTF-8 (the same encoding the export is read
        # with) instead of whatever the locale default happens to be.
        with open(site_config_path, 'w', encoding='utf8') as fp:
            yaml.safe_dump(site_config, fp, default_flow_style=False,
                           allow_unicode=True)

        # Content.
        # NOTE(review): only 'attachment' and 'post' items are handled;
        # `_createPage` exists but nothing calls it -- confirm whether
        # 'page' items are meant to be imported here.
        for i in channel.findall('item'):
            post_type = find_text(i, 'wp:post_type', self.ns_wp)
            if post_type == 'attachment':
                self._createAsset(i)
            elif post_type == 'post':
                self._createPost(i)

        self._cat_map = None
        self._author_map = None

    def _createAsset(self, node):
        """Hand the attachment URL of `node` to `download_asset`."""
        url = find_text(node, 'wp:attachment_url', self.ns_wp)
        download_asset(self.app, url)

    def _getPageMetadata(self, node):
        """Build the metadata dictionary for the item `node`.

        Entries from `wp:postmeta` children are added last, keyed by their
        `wp:meta_key`, so they replace any earlier entry with the same key.
        """
        title = find_text(node, 'title')
        creator = find_text(node, 'dc:creator', self.ns_dc)
        status = find_text(node, 'wp:status', self.ns_wp)
        post_id = find_text(node, 'wp:post_id', self.ns_wp)
        guid = find_text(node, 'guid')
        description = find_text(node, 'description')
        # TODO: menu order, parent, password, sticky

        categories = []
        for c in node.findall('category'):
            nicename = c.attrib.get('nicename')
            # Skip elements without a `nicename` attribute rather than
            # recording the literal string 'None' as a category.
            if nicename is not None:
                categories.append(str(nicename))

        metadata = {
            'title': title,
            'author': creator,
            'status': status,
            'post_id': post_id,
            'post_guid': guid,
            'description': description,
            'categories': categories}

        for m in node.findall('wp:postmeta', self.ns_wp):
            key = find_text(m, 'wp:meta_key', self.ns_wp)
            metadata[key] = find_text(m, 'wp:meta_value', self.ns_wp)

        return metadata

    def _getPageContents(self, node):
        """Return the body text of `node`.

        When the excerpt is not blank it is appended after an
        `---excerpt---` separator line.
        """
        content = find_text(node, 'content:encoded', self.ns_content)
        excerpt = find_text(node, 'excerpt:encoded', self.ns_excerpt)
        if not excerpt.strip():
            return content
        return "%s\n\n---excerpt---\n\n%s" % (content, excerpt)

    def _getPageInfo(self, node):
        """Return the URL, slug and publication datetime of `node`.

        Raises `ValueError` if `wp:post_date` is not formatted as
        `YYYY-MM-DD HH:MM:SS`.
        """
        url = find_text(node, 'link')
        post_date_str = find_text(node, 'wp:post_date', self.ns_wp)
        post_date = datetime.datetime.strptime(post_date_str,
                                               '%Y-%m-%d %H:%M:%S')
        post_name = find_text(node, 'wp:post_name', self.ns_wp)
        return {
            'url': url,
            'slug': post_name,
            'datetime': post_date}

    def _createPage(self, node):
        """Create a page under `pages/` named after the slug of `node`."""
        info = self._getPageInfo(node)
        rel_path = os.path.join('pages', info['slug'])
        metadata = self._getPageMetadata(node)
        contents = self._getPageContents(node)
        create_page(self.app, rel_path, metadata, contents)

    def _createPost(self, node):
        """Create a post under `posts/` at the path the posts source picks."""
        info = self._getPageInfo(node)
        post_dt = info['datetime']
        finder = {
            'year': post_dt.year,
            'month': post_dt.month,
            'day': post_dt.day,
            'slug': info['slug']}
        # The second value returned by the source is not needed here.
        rel_path, _ = self.posts_source.findPagePath(
            finder, MODE_CREATING)
        rel_path = os.path.join('posts', rel_path)
        metadata = self._getPageMetadata(node)
        contents = self._getPageContents(node)
        create_page(self.app, rel_path, metadata, contents)
224 | |
class _SqlImporter(object):
    """Placeholder for importing a Wordpress site from its SQL database.

    Only stores its settings; `importWebsite` is not implemented.
    """

    def __init__(self, app, args):
        """Remember the application, database URL and SQL table prefix."""
        self.app = app
        self.db_url = args.xml_or_db_url
        self.prefix = args.prefix

    def importWebsite(self):
        """Always raise `NotImplementedError`."""
        # The database URL is deliberately left out of the message because
        # it can contain a password.
        raise NotImplementedError(
            "Importing from an SQL database is not supported yet.")
233 | |
234 | |
def find_text(parent, child_name, namespaces=None):
    """Return the text of the first `child_name` element under `parent`.

    `namespaces` is an optional prefix-to-URI mapping, passed straight to
    `parent.find`.

    Returns an empty string when no such element exists or when the
    element has no text, so callers always get a real string back and can
    test it for emptiness.
    """
    node = parent.find(child_name, namespaces)
    if node is None or node.text is None:
        return ''
    return str(node.text)
237 |