Mercurial > piecrust2
comparison piecrust/importing/wordpress.py @ 300:2daa05a21026
import: Add an XML-based Wordpress importer.
| author | Ludovic Chabant <ludovic@chabant.com> |
|---|---|
| date | Wed, 11 Mar 2015 23:48:35 -0700 |
| parents | |
| children | 103abb08755e |
comparison
equal
deleted
inserted
replaced
| 299:88bffd469b04 | 300:2daa05a21026 |
|---|---|
| 1 import os.path | |
| 2 import logging | |
| 3 import datetime | |
| 4 import yaml | |
| 5 from urllib.parse import urlparse | |
| 6 from piecrust import CONFIG_PATH | |
| 7 from piecrust.importing.base import Importer, create_page, download_asset | |
| 8 from piecrust.sources.base import MODE_CREATING | |
| 9 | |
| 10 | |
| 11 logger = logging.getLogger(__name__) | |
| 12 | |
| 13 | |
class WordpressImporter(Importer):
    """Importer entry point for Wordpress sites.

    The single positional argument is either the path to an exported XML
    archive or the URL of an SQL database. `importWebsite` inspects it and
    delegates to `_XmlImporter` or `_SqlImporter` accordingly.
    """
    name = 'wordpress'
    description = "Imports a Wordpress blog."

    def setupParser(self, parser, app):
        """Register this importer's command-line arguments on `parser`.

        `app` is accepted as part of the method signature but is not used
        here.
        """
        parser.add_argument(
            '--posts_fs',
            default="hierarchy",
            choices=['flat', 'shallow', 'hierarchy'],
            help="The blog file-system type to use.")
        parser.add_argument(
            '--prefix',
            default="wp_",
            help="The SQL table prefix. Defaults to `wp_`.")
        parser.add_argument(
            '--default-post-layout',
            help="The default layout to use for posts.")
        parser.add_argument(
            '--default-post-category',
            help="The default category to use for posts.")
        parser.add_argument(
            '--default-page-layout',
            help="The default layout to use for pages.")
        parser.add_argument(
            '--default-page-category',
            help="The default category to use for pages.")
        parser.add_argument(
            'xml_or_db_url',
            help=("The exported XML archive of the Wordpress site, or "
                  "the URL of the SQL database.\n"
                  "\n"
                  "If an SQL database URL, it should be of the "
                  "form: type://user:password@server/database\n"
                  "\n"
                  "For example:\n"
                  "mysql://user:password@example.org/my_database"))

    def importWebsite(self, app, args):
        """Run the import with the implementation matching the source.

        A value without a URL scheme is treated as the path of an XML
        export. A one-letter scheme is treated as a path too: `urlparse`
        reports the drive letter of a Windows path such as
        ``C:/export.xml`` as its scheme, and that must not be mistaken for
        a database URL. Any longer scheme selects the SQL importer.

        Returns whatever the chosen implementation's `importWebsite`
        returns.
        """
        parsed_url = urlparse(args.xml_or_db_url)
        # len() <= 1 covers both "no scheme" and "Windows drive letter".
        if len(parsed_url.scheme) <= 1:
            impl = _XmlImporter(app, args)
        else:
            impl = _SqlImporter(app, args)
        return impl.importWebsite()
| 58 | |
| 59 | |
class _XmlImporter(object):
    """Imports a Wordpress site from an exported XML archive.

    The site configuration is written to the application's config file,
    attachments are passed to `download_asset`, and posts are written
    with `create_page`.
    """
    # Namespace prefix maps handed to ElementTree's `find`/`findall`.
    ns_wp = {'wp': 'http://wordpress.org/export/1.2/'}
    ns_dc = {'dc': "http://purl.org/dc/elements/1.1/"}
    ns_excerpt = {'excerpt': "http://wordpress.org/export/1.2/excerpt/"}
    ns_content = {'content': "http://purl.org/rss/1.0/modules/content/"}

    def __init__(self, app, args):
        """Store the import settings and instantiate the posts source.

        Reads `xml_or_db_url` and `posts_fs` from `args`, then looks
        through `app.plugin_loader.getSources()` for the source class
        whose `SOURCE_NAME` is ``posts/<posts_fs>``.

        Raises:
            Exception: if no source class matches the requested posts
                file-system.
        """
        self.app = app
        self.path = args.xml_or_db_url
        self.posts_fs = args.posts_fs
        self._cat_map = {}
        self._author_map = {}

        source_name = 'posts/%s' % self.posts_fs
        for cls in self.app.plugin_loader.getSources():
            if cls.SOURCE_NAME == source_name:
                src_config = {
                    'type': source_name,
                    'fs_endpoint': 'posts',
                    'data_type': 'blog'}
                self.posts_source = cls(app, 'posts', src_config)
                break
        else:
            # Only reached when the loop finished without a `break`,
            # i.e. no source matched.
            raise Exception(
                "No such posts file-system: %s" % self.posts_fs)

    def importWebsite(self):
        """Parse the XML export and generate the site config and content.

        Returns:
            1 if ElementTree cannot be imported, None otherwise.

        Raises:
            Exception: if the XML file does not exist or has no
                ``channel`` element.
        """
        if not os.path.exists(self.path):
            raise Exception("No such file: %s" % self.path)

        try:
            import xml.etree.ElementTree as ET
        except ImportError:
            logger.error("You don't seem to have any support for ElementTree "
                         "XML parsing.")
            return 1

        with open(self.path, 'r', encoding='utf8') as fp:
            xml = fp.read()
        # Strip two control characters before parsing -- presumably they
        # show up in some exports and make the XML parser fail.
        xml = xml.replace(chr(0x1e), '')
        xml = xml.replace(chr(0x10), '')
        tree = ET.fromstring(xml)
        channel = tree.find('channel')
        if channel is None:
            raise Exception(
                "No 'channel' element found in: %s" % self.path)

        # Get basic site information
        title = find_text(channel, 'title')
        description = find_text(channel, 'description')
        site_config = {
            'site': {
                'title': title,
                'description': description,
                'posts_fs': self.posts_fs}
        }
        logger.info("Importing '%s'", title)

        # Get authors' names.
        authors = {}
        for a in channel.findall('wp:author', self.ns_wp):
            login = find_text(a, 'wp:author_login', self.ns_wp)
            authors[login] = {
                'email': find_text(a, 'wp:author_email', self.ns_wp),
                'display_name': find_text(a, 'wp:author_display_name',
                                          self.ns_wp),
                'first_name': find_text(a, 'wp:author_first_name',
                                        self.ns_wp),
                'last_name': find_text(a, 'wp:author_last_name',
                                       self.ns_wp),
                'author_id': find_text(a, 'wp:author_id',
                                       self.ns_wp)}
        site_config['site']['authors'] = authors

        # Other stuff.
        site_config['site'].update({
            'post_url': '%year%/%month%/%slug%',
            'category_url': 'category/%category%'})

        logger.info("Generating site configuration...")
        site_config_path = os.path.join(self.app.root_dir, CONFIG_PATH)
        # Explicit encoding: `allow_unicode=True` emits non-ASCII text
        # verbatim, which the platform default encoding may not support.
        with open(site_config_path, 'w', encoding='utf8') as fp:
            yaml.safe_dump(site_config, fp, default_flow_style=False,
                           allow_unicode=True)

        # Content.
        # NOTE(review): items of type 'page' are not dispatched to
        # `_createPage` here -- confirm whether that is intentional.
        for i in channel.findall('item'):
            post_type = find_text(i, 'wp:post_type', self.ns_wp)
            if post_type == 'attachment':
                self._createAsset(i)
            elif post_type == 'post':
                self._createPost(i)

        self._cat_map = None
        self._author_map = None

    def _createAsset(self, node):
        """Pass the attachment URL of `node` to `download_asset`."""
        url = find_text(node, 'wp:attachment_url', self.ns_wp)
        download_asset(self.app, url)

    def _getPageMetadata(self, node):
        """Build the metadata dictionary for an item node.

        Collects the title, author, status, ids, description and category
        nicenames, then adds one entry per ``wp:postmeta`` child (keyed by
        its ``wp:meta_key``), which may override the entries set before.
        """
        title = find_text(node, 'title')
        creator = find_text(node, 'dc:creator', self.ns_dc)
        status = find_text(node, 'wp:status', self.ns_wp)
        post_id = find_text(node, 'wp:post_id', self.ns_wp)
        guid = find_text(node, 'guid')
        description = find_text(node, 'description')
        # TODO: menu order, parent, password, sticky

        categories = [str(c.attrib.get('nicename'))
                      for c in node.findall('category')]

        metadata = {
            'title': title,
            'author': creator,
            'status': status,
            'post_id': post_id,
            'post_guid': guid,
            'description': description,
            'categories': categories}

        for m in node.findall('wp:postmeta', self.ns_wp):
            key = find_text(m, 'wp:meta_key', self.ns_wp)
            metadata[key] = find_text(m, 'wp:meta_value', self.ns_wp)

        return metadata

    def _getPageContents(self, node):
        """Return the item's content, with its excerpt appended if any.

        When the excerpt is not blank, it follows the content after an
        ``---excerpt---`` separator line.
        """
        content = find_text(node, 'content:encoded', self.ns_content)
        excerpt = find_text(node, 'excerpt:encoded', self.ns_excerpt)
        if not excerpt.strip():
            return content
        return "%s\n\n---excerpt---\n\n%s" % (content, excerpt)

    def _getPageInfo(self, node):
        """Return the item's link, slug and parsed post date as a dict.

        Raises:
            ValueError: if ``wp:post_date`` is not formatted as
                ``YYYY-MM-DD HH:MM:SS``.
        """
        url = find_text(node, 'link')
        post_date_str = find_text(node, 'wp:post_date', self.ns_wp)
        post_date = datetime.datetime.strptime(post_date_str,
                                               '%Y-%m-%d %H:%M:%S')
        post_name = find_text(node, 'wp:post_name', self.ns_wp)
        return {
            'url': url,
            'slug': post_name,
            'datetime': post_date}

    def _createPage(self, node):
        """Create a page under ``pages/<slug>`` from an item node."""
        info = self._getPageInfo(node)
        rel_path = os.path.join('pages', info['slug'])
        metadata = self._getPageMetadata(node)
        contents = self._getPageContents(node)
        create_page(self.app, rel_path, metadata, contents)

    def _createPost(self, node):
        """Create a post from an item node.

        The path is obtained from the posts source's `findPagePath`, given
        the post's date and slug, and is placed under ``posts``.
        """
        info = self._getPageInfo(node)
        post_dt = info['datetime']
        finder = {
            'year': post_dt.year,
            'month': post_dt.month,
            'day': post_dt.day,
            'slug': info['slug']}
        # The second returned value is not needed here.
        rel_path, _ = self.posts_source.findPagePath(
            finder, MODE_CREATING)
        rel_path = os.path.join('posts', rel_path)
        metadata = self._getPageMetadata(node)
        contents = self._getPageContents(node)
        create_page(self.app, rel_path, metadata, contents)
| 223 | |
| 224 | |
class _SqlImporter(object):
    """Placeholder for importing a Wordpress site from an SQL database.

    Only stores its settings for now; the import itself is not
    implemented.
    """

    def __init__(self, app, args):
        """Keep the application, the database URL and the table prefix."""
        self.prefix = args.prefix
        self.db_url = args.xml_or_db_url
        self.app = app

    def importWebsite(self):
        """Always raise `NotImplementedError`."""
        raise NotImplementedError()
| 233 | |
| 234 | |
def find_text(parent, child_name, namespaces=None):
    """Return the text of the first `child_name` element under `parent`.

    Args:
        parent: the element to search in (anything with an
            ElementTree-style `find` method).
        child_name: tag name or path of the child, optionally using a
            prefix declared in `namespaces`.
        namespaces: optional mapping of prefix to namespace URI.

    Returns:
        The child's text as a string, or an empty string when the child
        is missing or has no text. Returning '' rather than the string
        'None' lets callers test for blank values.
    """
    node = parent.find(child_name, namespaces)
    if node is None or node.text is None:
        return ''
    return str(node.text)
| 237 |
