Mercurial > piecrust2
changeset 300:2daa05a21026
import: Add an XML-based Wordpress importer.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Wed, 11 Mar 2015 23:48:35 -0700 |
parents | 88bffd469b04 |
children | 45aba3cb7228 |
files | piecrust/importing/base.py piecrust/importing/wordpress.py piecrust/plugins/builtin.py |
diffstat | 3 files changed, 258 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/piecrust/importing/base.py Wed Mar 11 23:47:14 2015 -0700 +++ b/piecrust/importing/base.py Wed Mar 11 23:48:35 2015 -0700 @@ -1,7 +1,10 @@ import os.path +import shutil import codecs import logging import yaml +from urllib.parse import urlparse +from urllib.request import urlopen from piecrust.pathutil import SiteNotFoundError, multi_fnmatch_filter @@ -57,9 +60,9 @@ self._importFile(full_fn, rel_fn, *args, **kwargs) -def create_page(app, endpoint_dir, slug, metadata, content): - path = os.path.join(app.root_dir, endpoint_dir, slug) - logging.debug("Creating page: %s" % os.path.relpath(path, app.root_dir)) +def create_page(app, rel_path, metadata, content): + path = os.path.join(app.root_dir, rel_path) + logging.info("Creating page: %s" % rel_path) header = yaml.dump(metadata) os.makedirs(os.path.dirname(path), 0o755, True) with codecs.open(path, 'w', encoding='utf8') as fp: @@ -68,3 +71,16 @@ fp.write("---\n") fp.write(content) + +def download_asset(app, url, rel_path=None, skip_if_exists=True): + if rel_path is None: + parsed_url = urlparse(url) + rel_path = 'assets/' + parsed_url.path.lstrip('/') + path = os.path.join(app.root_dir, rel_path) + if skip_if_exists and os.path.exists(path): + return + logger.info("Downloading %s" % rel_path) + os.makedirs(os.path.dirname(path), 0o755, True) + with urlopen(url) as resp, open(path, 'wb') as fp: + shutil.copyfileobj(resp, fp) +
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/piecrust/importing/wordpress.py Wed Mar 11 23:48:35 2015 -0700 @@ -0,0 +1,237 @@ +import os.path +import logging +import datetime +import yaml +from urllib.parse import urlparse +from piecrust import CONFIG_PATH +from piecrust.importing.base import Importer, create_page, download_asset +from piecrust.sources.base import MODE_CREATING + + +logger = logging.getLogger(__name__) + + +class WordpressImporter(Importer): + name = 'wordpress' + description = "Imports a Wordpress blog." + + def setupParser(self, parser, app): + parser.add_argument( + '--posts_fs', + default="hierarchy", + choices=['flat', 'shallow', 'hierarchy'], + help="The blog file-system type to use.") + parser.add_argument( + '--prefix', + default="wp_", + help="The SQL table prefix. Defaults to `wp_`.") + parser.add_argument( + '--default-post-layout', + help="The default layout to use for posts.") + parser.add_argument( + '--default-post-category', + help="The default category to use for posts.") + parser.add_argument( + '--default-page-layout', + help="The default layout to use for pages.") + parser.add_argument( + '--default-page-category', + help="The default category to use for pages.") + parser.add_argument( + 'xml_or_db_url', + help=("The exported XML archive of the Wordpress site, or " + "the URL of the SQL database.\n" + "\n" + "If an SQL database URL, it should be of the " + "form: type://user:password@server/database\n" + "\n" + "For example:\n" + "mysql://user:password@example.org/my_database")) + + def importWebsite(self, app, args): + parsed_url = urlparse(args.xml_or_db_url) + if not parsed_url.scheme: + impl = _XmlImporter(app, args) + else: + impl = _SqlImporter(app, args) + return impl.importWebsite() + + +class _XmlImporter(object): + ns_wp = {'wp': 'http://wordpress.org/export/1.2/'} + ns_dc = {'dc': "http://purl.org/dc/elements/1.1/"} + ns_excerpt = {'excerpt': "http://wordpress.org/export/1.2/excerpt/"} + ns_content = {'content': "http://purl.org/rss/1.0/modules/content/"} + + def __init__(self, app, args): + self.app = app + self.path = args.xml_or_db_url + self.posts_fs = args.posts_fs + self._cat_map = {} + self._author_map = {} + + for cls in self.app.plugin_loader.getSources(): + if cls.SOURCE_NAME == ('posts/%s' % self.posts_fs): + src_config = { + 'type': 'posts/%s' % self.posts_fs, + 'fs_endpoint': 'posts', + 'data_type': 'blog'} + self.posts_source = cls(app, 'posts', src_config) + break + else: + raise Exception("No such posts file-system: " % self.posts_fs) + + def importWebsite(self): + if not os.path.exists(self.path): + raise Exception("No such file: %s" % self.path) + + try: + import xml.etree.ElementTree as ET + except ImportError: + logger.error("You don't seem to have any support for ElementTree " + "XML parsing.") + return 1 + + with open(self.path, 'r', encoding='utf8') as fp: + xml = fp.read() + xml = xml.replace(chr(0x1e), '') + xml = xml.replace(chr(0x10), '') + tree = ET.fromstring(xml) + channel = tree.find('channel') + + # Get basic site information + title = find_text(channel, 'title') + description = find_text(channel, 'description') + site_config = { + 'site': { + 'title': title, + 'description': description, + 'posts_fs': self.posts_fs} + } + logger.info("Importing '%s'" % title) + + # Get authors' names. + authors = {} + for a in channel.findall('wp:author', self.ns_wp): + login = find_text(a, 'wp:author_login', self.ns_wp) + authors[login] = { + 'email': find_text(a, 'wp:author_email', self.ns_wp), + 'display_name': find_text(a, 'wp:author_display_name', + self.ns_wp), + 'first_name': find_text(a, 'wp:author_first_name', + self.ns_wp), + 'last_name': find_text(a, 'wp:author_last_name', + self.ns_wp), + 'author_id': find_text(a, 'wp:author_id', + self.ns_wp)} + site_config['site']['authors'] = authors + + # Other stuff. + site_config['site'].update({ + 'post_url': '%year%/%month%/%slug%', + 'category_url': 'category/%category%'}) + + logger.info("Generating site configuration...") + site_config_path = os.path.join(self.app.root_dir, CONFIG_PATH) + with open(site_config_path, 'w') as fp: + yaml.safe_dump(site_config, fp, default_flow_style=False, + allow_unicode=True) + + # Content. + for i in channel.findall('item'): + post_type = find_text(i, 'wp:post_type', self.ns_wp) + if post_type == 'attachment': + self._createAsset(i) + elif post_type == 'post': + self._createPost(i) + + self._cat_map = None + self._author_map = None + + def _createAsset(self, node): + url = find_text(node, 'wp:attachment_url', self.ns_wp) + download_asset(self.app, url) + + def _getPageMetadata(self, node): + title = find_text(node, 'title') + creator = find_text(node, 'dc:creator', self.ns_dc) + status = find_text(node, 'wp:status', self.ns_wp) + post_id = find_text(node, 'wp:post_id', self.ns_wp) + guid = find_text(node, 'guid') + description = find_text(node, 'description') + # TODO: menu order, parent, password, sticky + + categories = [] + for c in node.findall('category'): + nicename = str(c.attrib.get('nicename')) + categories.append(nicename) + + metadata = { + 'title': title, + 'author': creator, + 'status': status, + 'post_id': post_id, + 'post_guid': guid, + 'description': description, + 'categories': categories} + + for m in node.findall('wp:postmeta', self.ns_wp): + key = find_text(m, 'wp:meta_key', self.ns_wp) + metadata[key] = find_text(m, 'wp:meta_value', self.ns_wp) + + return metadata + + def _getPageContents(self, node): + content = find_text(node, 'content:encoded', self.ns_content) + excerpt = find_text(node, 'excerpt:encoded', self.ns_excerpt) + if not excerpt.strip(): + return content + return "%s\n\n---excerpt---\n\n%s" % (content, excerpt) + + def _getPageInfo(self, node): + url = find_text(node, 'link') + post_date_str = find_text(node, 'wp:post_date', self.ns_wp) + post_date = datetime.datetime.strptime(post_date_str, + '%Y-%m-%d %H:%M:%S') + post_name = find_text(node, 'wp:post_name', self.ns_wp) + return { + 'url': url, + 'slug': post_name, + 'datetime': post_date} + + def _createPage(self, node): + info = self._getPageInfo(node) + rel_path = os.path.join('pages', info['slug']) + metadata = self._getPageMetadata(node) + contents = self._getPageContents(node) + create_page(self.app, rel_path, metadata, contents) + + def _createPost(self, node): + info = self._getPageInfo(node) + post_dt = info['datetime'] + finder = { + 'year': post_dt.year, + 'month': post_dt.month, + 'day': post_dt.day, + 'slug': info['slug']} + rel_path, fac_metadata = self.posts_source.findPagePath( + finder, MODE_CREATING) + rel_path = os.path.join('posts', rel_path) + metadata = self._getPageMetadata(node) + contents = self._getPageContents(node) + create_page(self.app, rel_path, metadata, contents) + + +class _SqlImporter(object): + def __init__(self, app, args): + self.app = app + self.db_url = args.xml_or_db_url + self.prefix = args.prefix + + def importWebsite(self): + raise NotImplementedError() + + +def find_text(parent, child_name, namespaces=None): + return str(parent.find(child_name, namespaces).text) +
--- a/piecrust/plugins/builtin.py Wed Mar 11 23:47:14 2015 -0700 +++ b/piecrust/plugins/builtin.py Wed Mar 11 23:48:35 2015 -0700 @@ -19,6 +19,7 @@ from piecrust.formatting.smartypantsformatter import SmartyPantsFormatter from piecrust.importing.jekyll import JekyllImporter from piecrust.importing.piecrust import PieCrust1Importer +from piecrust.importing.wordpress import WordpressImporter from piecrust.plugins.base import PieCrustPlugin from piecrust.processing.base import CopyFileProcessor from piecrust.processing.compass import CompassProcessor @@ -108,6 +109,7 @@ def getImporters(self): return [ + WordpressImporter(), JekyllImporter(), PieCrust1Importer()]