piecrust2: piecrust/importing/wordpress.py comparison

comparison piecrust/importing/wordpress.py @ 302:103abb08755e

import: Make the Wordpress importer extendable, and rename it to `wordpress-xml`. This is because the SQL importer will be installable as a plugin, so as not to impose SQL dependencies on the basic PieCrust package.

author	Ludovic Chabant <ludovic@chabant.com>
date	Thu, 19 Mar 2015 18:30:09 -0700
parents	2daa05a21026
children	1ddd18ad5e76

comparison

equal deleted inserted replaced

-:45aba3cb7228
+:103abb08755e
 import os.path
 import logging
 import datetime
 import yaml
-from urllib.parse import urlparse
+from collections import OrderedDict
 from piecrust import CONFIG_PATH
+from piecrust.configuration import (
+ConfigurationLoader, ConfigurationDumper, merge_dicts)
 from piecrust.importing.base import Importer, create_page, download_asset
 from piecrust.sources.base import MODE_CREATING
 logger = logging.getLogger(__name__)
-class WordpressImporter(Importer):
+class WordpressImporterBase(Importer):
-name = 'wordpress'
-description = "Imports a Wordpress blog."
 def setupParser(self, parser, app):
 parser.add_argument(
-'--posts_fs',
+'--pages-source',
-default="hierarchy",
+default="pages",
-choices=['flat', 'shallow', 'hierarchy'],
+help="The source to store pages in.")
-help="The blog file-system type to use.")
+parser.add_argument(
-parser.add_argument(
+'--posts-source',
-'--prefix',
+default="posts",
-default="wp_",
+help="The source to store posts in.")
-help="The SQL table prefix. Defaults to `wp_`.")
 parser.add_argument(
 '--default-post-layout',
 help="The default layout to use for posts.")
 parser.add_argument(
 '--default-post-category',
 '--default-page-layout',
 help="The default layout to use for pages.")
 parser.add_argument(
 '--default-page-category',
 help="The default category to use for pages.")
-parser.add_argument(
-'xml_or_db_url',
-help=("The exported XML archive of the Wordpress site, or "
-"the URL of the SQL database.\n"
-"\n"
-"If an SQL database URL, it should be of the "
-"form:  type://user:password@server/database\n"
-"\n"
-"For example:\n"
-"mysql://user:password@example.org/my_database"))
 def importWebsite(self, app, args):
-parsed_url = urlparse(args.xml_or_db_url)
+impl = self._getImplementation(app, args)
-if not parsed_url.scheme:
+return impl.importWebsite()
-impl = _XmlImporter(app, args)
+def _getImplementation(self, app, args):
+raise NotImplementedError()
+class _ImporterBase(object):
+def __init__(self, app, args):
+self.app = app
+self._cat_map = {}
+self._author_map = {}
+self._pages_source = app.getSource(args.pages_source)
+self._posts_source = app.getSource(args.posts_source)
+def importWebsite(self):
+ctx = self._open()
+# Site configuration.
+logger.info("Generating site configuration...")
+site_config = self._getSiteConfig(ctx)
+site_config.setdefault('site', {})
+site_config['site'].update({
+'post_url': '%year%/%month%/%slug%',
+'category_url': 'category/%category%'})
+site_config_path = os.path.join(self.app.root_dir, CONFIG_PATH)
+with open(site_config_path, 'r') as fp:
+cfg_data = yaml.load(fp, Loader=ConfigurationLoader)
+cfg_data = cfg_data or {}
+merge_dicts(cfg_data, site_config)
+with open(site_config_path, 'w') as fp:
+yaml.dump(cfg_data, fp, default_flow_style=False,
+allow_unicode=True,
+Dumper=ConfigurationDumper)
+# Content
+for p in self._getPosts(ctx):
+if p['type'] == 'attachment':
+self._createAsset(p)
+else:
+self._createPost(p)
+self._close(ctx)
+def _open(self):
+raise NotImplementedError()
+def _close(self, ctx):
+pass
+def _getSiteConfig(self, ctx):
+raise NotImplementedError()
+def _getPosts(self, ctx):
+raise NotImplementedError()
+def _createAsset(self, asset_info):
+download_asset(self.app, asset_info['url'])
+def _createPost(self, post_info):
+post_dt = post_info['datetime']
+finder = {
+'year': post_dt.year,
+'month': post_dt.month,
+'day': post_dt.day,
+'slug': post_info['slug']}
+if post_info['type'] == 'post':
+source = self._posts_source
+elif post_info['type'] == 'page':
+source = self._pages_source
 else:
-impl = _SqlImporter(app, args)
+raise Exception("Unknown post type: %s" % post_info['type'])
-return impl.importWebsite()
+rel_path, fac_metadata = source.findPagePath(finder, MODE_CREATING)
+metadata = post_info['metadata'].copy()
-class _XmlImporter(object):
+for name in ['title', 'author', 'status', 'post_id', 'post_guid',
+'description', 'categories']:
+val = post_info.get(name)
+if val is not None:
+metadata[name] = val
+content = post_info['content']
+excerpt = post_info['excerpt']
+text = content
+if excerpt is not None and excerpt.strip() != '':
+text = "%s\n\n---excerpt---\n\n%s" % (content, excerpt)
+path = source.resolveRef(rel_path)
+create_page(self.app, path, metadata, text)
+class _XmlImporter(_ImporterBase):
 ns_wp = {'wp': 'http://wordpress.org/export/1.2/'}
 ns_dc = {'dc': "http://purl.org/dc/elements/1.1/"}
 ns_excerpt = {'excerpt': "http://wordpress.org/export/1.2/excerpt/"}
 ns_content = {'content': "http://purl.org/rss/1.0/modules/content/"}
 def __init__(self, app, args):
-self.app = app
+super(_XmlImporter, self).__init__(app, args)
-self.path = args.xml_or_db_url
+self.path = args.xml_path
-self.posts_fs = args.posts_fs
-self._cat_map = {}
+def _open(self):
-self._author_map = {}
-for cls in self.app.plugin_loader.getSources():
-if cls.SOURCE_NAME == ('posts/%s' % self.posts_fs):
-src_config = {
-'type': 'posts/%s' % self.posts_fs,
-'fs_endpoint': 'posts',
-'data_type': 'blog'}
-self.posts_source = cls(app, 'posts', src_config)
-break
-else:
-raise Exception("No such posts file-system: " % self.posts_fs)
-def importWebsite(self):
 if not os.path.exists(self.path):
 raise Exception("No such file: %s" % self.path)
 try:
 import xml.etree.ElementTree as ET
 xml = xml.replace(chr(0x1e), '')
 xml = xml.replace(chr(0x10), '')
 tree = ET.fromstring(xml)
 channel = tree.find('channel')
+return channel
+def _getSiteConfig(self, channel):
 # Get basic site information
 title = find_text(channel, 'title')
 description = find_text(channel, 'description')
-site_config = {
+site_config = OrderedDict({
 'site': {
 'title': title,
-'description': description,
+'description': description}
-'posts_fs': self.posts_fs}
+})
-}
-logger.info("Importing '%s'" % title)
 # Get authors' names.
 authors = {}
 for a in channel.findall('wp:author', self.ns_wp):
 login = find_text(a, 'wp:author_login', self.ns_wp)
 self.ns_wp),
 'author_id': find_text(a, 'wp:author_id',
 self.ns_wp)}
 site_config['site']['authors'] = authors
-# Other stuff.
+return site_config
-site_config['site'].update({
-'post_url': '%year%/%month%/%slug%',
+def _getPosts(self, channel):
-'category_url': 'category/%category%'})
-logger.info("Generating site configuration...")
-site_config_path = os.path.join(self.app.root_dir, CONFIG_PATH)
-with open(site_config_path, 'w') as fp:
-yaml.safe_dump(site_config, fp, default_flow_style=False,
-allow_unicode=True)
-# Content.
 for i in channel.findall('item'):
 post_type = find_text(i, 'wp:post_type', self.ns_wp)
 if post_type == 'attachment':
-self._createAsset(i)
+yield self._getAssetInfo(i)
-elif post_type == 'post':
+else:
-self._createPost(i)
+yield self._getPostInfo(i)
-self._cat_map = None
+def _getAssetInfo(self, node):
-self._author_map = None
-def _createAsset(self, node):
 url = find_text(node, 'wp:attachment_url', self.ns_wp)
-download_asset(self.app, url)
+return {'type': 'attachment', 'url': url}
-def _getPageMetadata(self, node):
+def _getPostInfo(self, node):
+post_date_str = find_text(node, 'wp:post_date', self.ns_wp)
+post_date = datetime.datetime.strptime(post_date_str,
+'%Y-%m-%d %H:%M:%S')
+post_name = find_text(node, 'wp:post_name', self.ns_wp)
+post_type = find_text(node, 'wp:post_type', self.ns_wp)
+post_info = {
+'type': post_type,
+'slug': post_name,
+'datetime': post_date}
 title = find_text(node, 'title')
 creator = find_text(node, 'dc:creator', self.ns_dc)
 status = find_text(node, 'wp:status', self.ns_wp)
 post_id = find_text(node, 'wp:post_id', self.ns_wp)
 guid = find_text(node, 'guid')
 description = find_text(node, 'description')
 # TODO: menu order, parent, password, sticky
+post_info.update({
-categories = []
-for c in node.findall('category'):
-nicename = str(c.attrib.get('nicename'))
-categories.append(nicename)
-metadata = {
 'title': title,
 'author': creator,
 'status': status,
 'post_id': post_id,
 'post_guid': guid,
-'description': description,
+'description': description})
-'categories': categories}
+categories = []
+for c in node.findall('category'):
+nicename = str(c.attrib.get('nicename'))
+categories.append(nicename)
+post_info['categories'] = categories
+metadata = {}
 for m in node.findall('wp:postmeta', self.ns_wp):
 key = find_text(m, 'wp:meta_key', self.ns_wp)
 metadata[key] = find_text(m, 'wp:meta_value', self.ns_wp)
+post_info['metadata'] = metadata
-return metadata
-def _getPageContents(self, node):
 content = find_text(node, 'content:encoded', self.ns_content)
 excerpt = find_text(node, 'excerpt:encoded', self.ns_excerpt)
-if not excerpt.strip():
+post_info.update({
-return content
+'content': content,
-return "%s\n\n---excerpt---\n\n%s" % (content, excerpt)
+'excerpt': excerpt})
-def _getPageInfo(self, node):
+return post_info
-url = find_text(node, 'link')
-post_date_str = find_text(node, 'wp:post_date', self.ns_wp)
-post_date = datetime.datetime.strptime(post_date_str,
+class WordpressXmlImporter(WordpressImporterBase):
-'%Y-%m-%d %H:%M:%S')
+name = 'wordpress-xml'
-post_name = find_text(node, 'wp:post_name', self.ns_wp)
+description = "Imports a Wordpress blog from an exported XML archive."
-return {
-'url': url,
+def setupParser(self, parser, app):
-'slug': post_name,
+super(WordpressXmlImporter, self).setupParser(parser, app)
-'datetime': post_date}
+parser.add_argument(
+'xml_path',
-def _createPage(self, node):
+help="The path to the exported XML archive file.")
-info = self._getPageInfo(node)
-rel_path = os.path.join('pages', info['slug'])
+def _getImplementation(self, app, args):
-metadata = self._getPageMetadata(node)
+return _XmlImporter(app, args)
-contents = self._getPageContents(node)
-create_page(self.app, rel_path, metadata, contents)
-def _createPost(self, node):
-info = self._getPageInfo(node)
-post_dt = info['datetime']
-finder = {
-'year': post_dt.year,
-'month': post_dt.month,
-'day': post_dt.day,
-'slug': info['slug']}
-rel_path, fac_metadata = self.posts_source.findPagePath(
-finder, MODE_CREATING)
-rel_path = os.path.join('posts', rel_path)
-metadata = self._getPageMetadata(node)
-contents = self._getPageContents(node)
-create_page(self.app, rel_path, metadata, contents)
-class _SqlImporter(object):
-def __init__(self, app, args):
-self.app = app
-self.db_url = args.xml_or_db_url
-self.prefix = args.prefix
-def importWebsite(self):
-raise NotImplementedError()
 def find_text(parent, child_name, namespaces=None, default=''):
     """Return the text of the first child of `parent` matching `child_name`.

     `child_name` and `namespaces` are passed straight to `parent.find()`,
     so prefixed names such as `wp:post_name` work when the matching
     prefix map is supplied.

     `default` is returned when no matching child exists or when the
     matching child has no text (e.g. `<description/>`). Without this
     guard, an empty element would come back as the literal string
     'None', and a missing element would raise AttributeError.
     """
     node = parent.find(child_name, namespaces)
     if node is None or node.text is None:
         return default
     return str(node.text)

Mercurial > piecrust2

comparison piecrust/importing/wordpress.py @ 302:103abb08755e