changeset 302:103abb08755e

import: Make the Wordpress importer extendable, rename it to `wordpressxml`. This is because the SQL importer will be installable as a plugin, so as to not impose SQL dependencies on the basic PieCrust package.
author Ludovic Chabant <ludovic@chabant.com>
date Thu, 19 Mar 2015 18:30:09 -0700
parents 45aba3cb7228
children 5dbab01daaba
files piecrust/importing/wordpress.py piecrust/plugins/builtin.py
diffstat 2 files changed, 158 insertions(+), 128 deletions(-) [+]
line wrap: on
line diff
--- a/piecrust/importing/wordpress.py	Thu Mar 19 18:28:42 2015 -0700
+++ b/piecrust/importing/wordpress.py	Thu Mar 19 18:30:09 2015 -0700
@@ -2,8 +2,10 @@
 import logging
 import datetime
 import yaml
-from urllib.parse import urlparse
+from collections import OrderedDict
 from piecrust import CONFIG_PATH
+from piecrust.configuration import (
+        ConfigurationLoader, ConfigurationDumper, merge_dicts)
 from piecrust.importing.base import Importer, create_page, download_asset
 from piecrust.sources.base import MODE_CREATING
 
@@ -11,20 +13,16 @@
 logger = logging.getLogger(__name__)
 
 
-class WordpressImporter(Importer):
-    name = 'wordpress'
-    description = "Imports a Wordpress blog."
-
+class WordpressImporterBase(Importer):
     def setupParser(self, parser, app):
         parser.add_argument(
-                '--posts_fs',
-                default="hierarchy",
-                choices=['flat', 'shallow', 'hierarchy'],
-                help="The blog file-system type to use.")
+                '--pages-source',
+                default="pages",
+                help="The source to store pages in.")
         parser.add_argument(
-                '--prefix',
-                default="wp_",
-                help="The SQL table prefix. Defaults to `wp_`.")
+                '--posts-source',
+                default="posts",
+                help="The source to store posts in.")
         parser.add_argument(
                 '--default-post-layout',
                 help="The default layout to use for posts.")
@@ -37,51 +35,113 @@
         parser.add_argument(
                 '--default-page-category',
                 help="The default category to use for pages.")
-        parser.add_argument(
-                'xml_or_db_url',
-                help=("The exported XML archive of the Wordpress site, or "
-                      "the URL of the SQL database.\n"
-                      "\n"
-                      "If an SQL database URL, it should be of the "
-                      "form:  type://user:password@server/database\n"
-                      "\n"
-                      "For example:\n"
-                      "mysql://user:password@example.org/my_database"))
 
     def importWebsite(self, app, args):
-        parsed_url = urlparse(args.xml_or_db_url)
-        if not parsed_url.scheme:
-            impl = _XmlImporter(app, args)
-        else:
-            impl = _SqlImporter(app, args)
+        impl = self._getImplementation(app, args)
         return impl.importWebsite()
 
+    def _getImplementation(self, app, args):
+        raise NotImplementedError()
 
-class _XmlImporter(object):
+
+class _ImporterBase(object):
+    def __init__(self, app, args):
+        self.app = app
+        self._cat_map = {}
+        self._author_map = {}
+        self._pages_source = app.getSource(args.pages_source)
+        self._posts_source = app.getSource(args.posts_source)
+
+    def importWebsite(self):
+        ctx = self._open()
+
+        # Site configuration.
+        logger.info("Generating site configuration...")
+        site_config = self._getSiteConfig(ctx)
+        site_config.setdefault('site', {})
+        site_config['site'].update({
+                'post_url': '%year%/%month%/%slug%',
+                'category_url': 'category/%category%'})
+
+        site_config_path = os.path.join(self.app.root_dir, CONFIG_PATH)
+        with open(site_config_path, 'r') as fp:
+            cfg_data = yaml.load(fp, Loader=ConfigurationLoader)
+
+        cfg_data = cfg_data or {}
+        merge_dicts(cfg_data, site_config)
+
+        with open(site_config_path, 'w') as fp:
+            yaml.dump(cfg_data, fp, default_flow_style=False,
+                      allow_unicode=True,
+                      Dumper=ConfigurationDumper)
+
+        # Content
+        for p in self._getPosts(ctx):
+            if p['type'] == 'attachment':
+                self._createAsset(p)
+            else:
+                self._createPost(p)
+
+        self._close(ctx)
+
+    def _open(self):
+        raise NotImplementedError()
+
+    def _close(self, ctx):
+        pass
+
+    def _getSiteConfig(self, ctx):
+        raise NotImplementedError()
+
+    def _getPosts(self, ctx):
+        raise NotImplementedError()
+
+    def _createAsset(self, asset_info):
+        download_asset(self.app, asset_info['url'])
+
+    def _createPost(self, post_info):
+        post_dt = post_info['datetime']
+        finder = {
+                'year': post_dt.year,
+                'month': post_dt.month,
+                'day': post_dt.day,
+                'slug': post_info['slug']}
+        if post_info['type'] == 'post':
+            source = self._posts_source
+        elif post_info['type'] == 'page':
+            source = self._pages_source
+        else:
+            raise Exception("Unknown post type: %s" % post_info['type'])
+        rel_path, fac_metadata = source.findPagePath(finder, MODE_CREATING)
+
+        metadata = post_info['metadata'].copy()
+        for name in ['title', 'author', 'status', 'post_id', 'post_guid',
+                     'description', 'categories']:
+            val = post_info.get(name)
+            if val is not None:
+                metadata[name] = val
+
+        content = post_info['content']
+        excerpt = post_info['excerpt']
+        text = content
+        if excerpt is not None and excerpt.strip() != '':
+            text = "%s\n\n---excerpt---\n\n%s" % (content, excerpt)
+
+        path = source.resolveRef(rel_path)
+        create_page(self.app, path, metadata, text)
+
+
+class _XmlImporter(_ImporterBase):
     ns_wp = {'wp': 'http://wordpress.org/export/1.2/'}
     ns_dc = {'dc': "http://purl.org/dc/elements/1.1/"}
     ns_excerpt = {'excerpt': "http://wordpress.org/export/1.2/excerpt/"}
     ns_content = {'content': "http://purl.org/rss/1.0/modules/content/"}
 
     def __init__(self, app, args):
-        self.app = app
-        self.path = args.xml_or_db_url
-        self.posts_fs = args.posts_fs
-        self._cat_map = {}
-        self._author_map = {}
+        super(_XmlImporter, self).__init__(app, args)
+        self.path = args.xml_path
 
-        for cls in self.app.plugin_loader.getSources():
-            if cls.SOURCE_NAME == ('posts/%s' % self.posts_fs):
-                src_config = {
-                        'type': 'posts/%s' % self.posts_fs,
-                        'fs_endpoint': 'posts',
-                        'data_type': 'blog'}
-                self.posts_source = cls(app, 'posts', src_config)
-                break
-        else:
-            raise Exception("No such posts file-system: " % self.posts_fs)
-
-    def importWebsite(self):
+    def _open(self):
         if not os.path.exists(self.path):
             raise Exception("No such file: %s" % self.path)
 
@@ -99,16 +159,17 @@
         tree = ET.fromstring(xml)
         channel = tree.find('channel')
 
+        return channel
+
+    def _getSiteConfig(self, channel):
         # Get basic site information
         title = find_text(channel, 'title')
         description = find_text(channel, 'description')
-        site_config = {
+        site_config = OrderedDict({
                 'site': {
                     'title': title,
-                    'description': description,
-                    'posts_fs': self.posts_fs}
-                }
-        logger.info("Importing '%s'" % title)
+                    'description': description}
+                })
 
         # Get authors' names.
         authors = {}
@@ -126,33 +187,31 @@
                                            self.ns_wp)}
         site_config['site']['authors'] = authors
 
-        # Other stuff.
-        site_config['site'].update({
-                'post_url': '%year%/%month%/%slug%',
-                'category_url': 'category/%category%'})
+        return site_config
 
-        logger.info("Generating site configuration...")
-        site_config_path = os.path.join(self.app.root_dir, CONFIG_PATH)
-        with open(site_config_path, 'w') as fp:
-            yaml.safe_dump(site_config, fp, default_flow_style=False,
-                           allow_unicode=True)
-
-        # Content.
+    def _getPosts(self, channel):
         for i in channel.findall('item'):
             post_type = find_text(i, 'wp:post_type', self.ns_wp)
             if post_type == 'attachment':
-                self._createAsset(i)
-            elif post_type == 'post':
-                self._createPost(i)
+                yield self._getAssetInfo(i)
+            else:
+                yield self._getPostInfo(i)
+
+    def _getAssetInfo(self, node):
+        url = find_text(node, 'wp:attachment_url', self.ns_wp)
+        return {'type': 'attachment', 'url': url}
 
-        self._cat_map = None
-        self._author_map = None
+    def _getPostInfo(self, node):
+        post_date_str = find_text(node, 'wp:post_date', self.ns_wp)
+        post_date = datetime.datetime.strptime(post_date_str,
+                                               '%Y-%m-%d %H:%M:%S')
+        post_name = find_text(node, 'wp:post_name', self.ns_wp)
+        post_type = find_text(node, 'wp:post_type', self.ns_wp)
+        post_info = {
+                'type': post_type,
+                'slug': post_name,
+                'datetime': post_date}
 
-    def _createAsset(self, node):
-        url = find_text(node, 'wp:attachment_url', self.ns_wp)
-        download_asset(self.app, url)
-
-    def _getPageMetadata(self, node):
         title = find_text(node, 'title')
         creator = find_text(node, 'dc:creator', self.ns_dc)
         status = find_text(node, 'wp:status', self.ns_wp)
@@ -160,76 +219,47 @@
         guid = find_text(node, 'guid')
         description = find_text(node, 'description')
         # TODO: menu order, parent, password, sticky
-
-        categories = []
-        for c in node.findall('category'):
-            nicename = str(c.attrib.get('nicename'))
-            categories.append(nicename)
-
-        metadata = {
+        post_info.update({
                 'title': title,
                 'author': creator,
                 'status': status,
                 'post_id': post_id,
                 'post_guid': guid,
-                'description': description,
-                'categories': categories}
+                'description': description})
 
+        categories = []
+        for c in node.findall('category'):
+            nicename = str(c.attrib.get('nicename'))
+            categories.append(nicename)
+        post_info['categories'] = categories
+
+        metadata = {}
         for m in node.findall('wp:postmeta', self.ns_wp):
             key = find_text(m, 'wp:meta_key', self.ns_wp)
             metadata[key] = find_text(m, 'wp:meta_value', self.ns_wp)
+        post_info['metadata'] = metadata
 
-        return metadata
-
-    def _getPageContents(self, node):
         content = find_text(node, 'content:encoded', self.ns_content)
         excerpt = find_text(node, 'excerpt:encoded', self.ns_excerpt)
-        if not excerpt.strip():
-            return content
-        return "%s\n\n---excerpt---\n\n%s" % (content, excerpt)
-
-    def _getPageInfo(self, node):
-        url = find_text(node, 'link')
-        post_date_str = find_text(node, 'wp:post_date', self.ns_wp)
-        post_date = datetime.datetime.strptime(post_date_str,
-                                               '%Y-%m-%d %H:%M:%S')
-        post_name = find_text(node, 'wp:post_name', self.ns_wp)
-        return {
-                'url': url,
-                'slug': post_name,
-                'datetime': post_date}
+        post_info.update({
+                'content': content,
+                'excerpt': excerpt})
 
-    def _createPage(self, node):
-        info = self._getPageInfo(node)
-        rel_path = os.path.join('pages', info['slug'])
-        metadata = self._getPageMetadata(node)
-        contents = self._getPageContents(node)
-        create_page(self.app, rel_path, metadata, contents)
-
-    def _createPost(self, node):
-        info = self._getPageInfo(node)
-        post_dt = info['datetime']
-        finder = {
-                'year': post_dt.year,
-                'month': post_dt.month,
-                'day': post_dt.day,
-                'slug': info['slug']}
-        rel_path, fac_metadata = self.posts_source.findPagePath(
-                finder, MODE_CREATING)
-        rel_path = os.path.join('posts', rel_path)
-        metadata = self._getPageMetadata(node)
-        contents = self._getPageContents(node)
-        create_page(self.app, rel_path, metadata, contents)
+        return post_info
 
 
-class _SqlImporter(object):
-    def __init__(self, app, args):
-        self.app = app
-        self.db_url = args.xml_or_db_url
-        self.prefix = args.prefix
+class WordpressXmlImporter(WordpressImporterBase):
+    name = 'wordpress-xml'
+    description = "Imports a Wordpress blog from an exported XML archive."
 
-    def importWebsite(self):
-        raise NotImplementedError()
+    def setupParser(self, parser, app):
+        super(WordpressXmlImporter, self).setupParser(parser, app)
+        parser.add_argument(
+                'xml_path',
+                help="The path to the exported XML archive file.")
+
+    def _getImplementation(self, app, args):
+        return _XmlImporter(app, args)
 
 
 def find_text(parent, child_name, namespaces=None):
--- a/piecrust/plugins/builtin.py	Thu Mar 19 18:28:42 2015 -0700
+++ b/piecrust/plugins/builtin.py	Thu Mar 19 18:30:09 2015 -0700
@@ -19,7 +19,7 @@
 from piecrust.formatting.smartypantsformatter import SmartyPantsFormatter
 from piecrust.importing.jekyll import JekyllImporter
 from piecrust.importing.piecrust import PieCrust1Importer
-from piecrust.importing.wordpress import WordpressImporter
+from piecrust.importing.wordpress import WordpressXmlImporter
 from piecrust.plugins.base import PieCrustPlugin
 from piecrust.processing.base import CopyFileProcessor
 from piecrust.processing.compass import CompassProcessor
@@ -109,7 +109,7 @@
 
     def getImporters(self):
         return [
-                WordpressImporter(),
+                PieCrust1Importer(),
                 JekyllImporter(),
-                PieCrust1Importer()]
+                WordpressXmlImporter()]