changeset 151:f32af0888382

Added support for ElasticSearch indexing:
- More configurable setup for wiki providers (SCM, indexing, etc.).
- Lazy importing of provider-specific packages.
- Nicer search results.
author Ludovic Chabant <ludovic@chabant.com>
date Mon, 16 Dec 2013 20:59:42 -0800
parents 800a1f681557
children 8e75c12b1cc9
files manage.py static/css/wikked/main.less static/js/wikked/app.js static/js/wikked/models.js static/tpl/search-results.html wikked/indexer/base.py wikked/indexer/elastic.py wikked/indexer/native.py wikked/indexer/whoosh.py wikked/resources/defaults.cfg wikked/views/special.py wikked/wiki.py
diffstat 12 files changed, 309 insertions(+), 136 deletions(-)
--- a/manage.py	Sat Dec 14 21:29:22 2013 -0800
+++ b/manage.py	Mon Dec 16 20:59:42 2013 -0800
@@ -38,10 +38,13 @@
 
 
 @manager.command
-def reset(cache=False):
+def reset(cache=False, index_only=False):
     """ Re-generates the database and the full-text-search index.
     """
-    wiki.reset(cache_ext_data=cache)
+    if index_only:
+        wiki.index.reset(wiki.getPages())
+    else:
+        wiki.reset(cache_ext_data=cache)
 
 
 @manager.command
@@ -85,6 +88,14 @@
 
 
 @manager.command
+def search(query):
+    """ Searches the wiki.
+    """
+    hits = wiki.index.search(query)
+    print hits
+
+
+@manager.command
 def linksfrom(url):
     page = wiki.getPage(url)
     for l in page.links:
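
Usage note: both additions are ordinary Flask-Script commands, so they should be callable along these lines (the exact flag spelling for `index_only` depends on how Flask-Script maps keyword arguments):

    $ python manage.py search "some terms"
    $ python manage.py reset --index_only
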
--- a/static/css/wikked/main.less	Sat Dec 14 21:29:22 2013 -0800
+++ b/static/css/wikked/main.less	Mon Dec 16 20:59:42 2013 -0800
@@ -20,3 +20,6 @@
     &:hover { color: @colorOrange; text-decoration: underline; }
 }
 
+em.hlt1 , em.hlt2, em.hlt3, em.hlt4, em.hlt5, em.hlt6 {
+    background: @colorGreen;
+}
--- a/static/js/wikked/app.js	Sat Dec 14 21:29:22 2013 -0800
+++ b/static/js/wikked/app.js	Mon Dec 16 20:59:42 2013 -0800
@@ -143,7 +143,7 @@
                 query = this.getQueryVariable('q');
             }
             var view = new Views.WikiSearchView({
-                model: new Models.WikiSearchModel()
+                model: new Models.WikiSearchModel({ query: query })
             });
             this.viewManager.switchView(view);
             this.navigate('/search/' + query);
--- a/static/js/wikked/models.js	Sat Dec 14 21:29:22 2013 -0800
+++ b/static/js/wikked/models.js	Mon Dec 16 20:59:42 2013 -0800
@@ -398,20 +398,13 @@
     });
 
     var WikiSearchModel = exports.WikiSearchModel = MasterPageModel.extend({
-        urlRoot: '/api/search/',
+        urlRoot: '/api/search',
         action: 'search',
         title: function() {
             return 'Search';
         },
-        execute: function(query) {
-            var $model = this;
-            $.getJSON('/api/search', { q: query })
-                .success(function (data) {
-                    $model.set('hits', data.hits);
-                })
-                .error(function() {
-                    alert("Error searching...");
-                });
+        url: function() {
+            return this.urlRoot + '?q=' + this.get('query');
         }
     });
 
--- a/static/tpl/search-results.html	Sat Dec 14 21:29:22 2013 -0800
+++ b/static/tpl/search-results.html	Mon Dec 16 20:59:42 2013 -0800
@@ -11,7 +11,7 @@
             {{#each hits}}
             <li>
             <h3><a href="{{get_read_url url}}">{{title}}</a></h3>
-            <div class="highlighted"><pre><code>{{{content_highlights}}}</code></pre></div>
+            <div class="highlighted">{{{text}}}</div>
             </li>
             {{/each}}
         </ul>
--- a/wikked/indexer/base.py	Sat Dec 14 21:29:22 2013 -0800
+++ b/wikked/indexer/base.py	Mon Dec 16 20:59:42 2013 -0800
@@ -1,3 +1,10 @@
+
+
+class HitResult(object):
+    def __init__(self, url, title, hl_text=None):
+        self.url = url
+        self.title = title
+        self.hl_text = hl_text
 
 
 class WikiIndex(object):
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/wikked/indexer/elastic.py	Mon Dec 16 20:59:42 2013 -0800
@@ -0,0 +1,147 @@
+import os.path
+import logging
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import bulk_index
+from wikked.indexer.base import HitResult, WikiIndex
+
+
+INDEX_VERSION = 1
+
+
+logger = logging.getLogger(__name__)
+
+
+class ElasticWikiIndex(WikiIndex):
+    def __init__(self):
+        pass
+
+    def initIndex(self, wiki):
+        self.es = Elasticsearch()
+        if not self.es.indices.exists('pages'):
+            logger.debug("Creating the `pages` index.")
+            self.es.indices.create('pages')
+
+    def reset(self, pages):
+        logger.debug("Reseting the ElasticSearch index.")
+        self.es.indices.delete('pages')
+        self.es.indices.create(
+                'pages',
+                body={
+                    'mappings': {
+                        'page': {
+                            'properties': {
+                                'url': {'type': 'string', 'index': 'not_analyzed'},
+                                'path': {'type': 'string', 'index': 'not_analyzed'},
+                                'time': {'type': 'float', 'index': 'not_analyzed'},
+                                'title': {'type': 'string', 'boost': 2.0},
+                                'text': {'type': 'string', 'index': 'analyzed', 'store': 'yes', 'analyzer': 'pageTextAnalyzer'}
+                                },
+                            '_meta': {
+                                'version': INDEX_VERSION
+                                }
+                            }
+                        }
+                    })
+
+        def action_maker():
+            for p in pages:
+                logger.debug("Indexing '%s'..." % p.url)
+                a = {
+                        '_index': 'pages',
+                        '_type': 'page',
+                        '_source': self._get_body(p)
+                        }
+                yield a
+
+        actions = action_maker()
+        bulk_index(self.es, actions)
+
+    def update(self, pages):
+        to_reindex = set()
+        already_indexed = set()
+
+        offset = 0
+        bucket_size = 100
+        while True:
+            logger.debug("Grabbing documents %d to %d..." % (offset, offset + bucket_size))
+            body = {
+                    'fields': ['url', 'path', 'time'],
+                    'from': offset,
+                    'size': bucket_size,
+                    'query': {
+                        'match_all': {}
+                        }
+                    }
+            docs = self.es.search(
+                    index='pages',
+                    doc_type='page',
+                    body=body)
+            total = docs['hits']['total']
+
+            for d in docs['hits']['hits']:
+                indexed_path = d['fields']['path']
+                indexed_time = d['fields']['time']
+
+                if not os.path.isfile(indexed_path):
+                    # File was deleted.
+                    self.es.delete(
+                            index='pages',
+                            doc_type='page',
+                            id=d['_id'])
+                else:
+                    already_indexed.add(indexed_path)
+                    if os.path.getmtime(indexed_path) > indexed_time:
+                        # File has changed since last index.
+                        to_reindex.add(indexed_path)
+
+            if offset + bucket_size < total:
+                offset += bucket_size
+            else:
+                break
+
+        def action_maker():
+            for p in pages:
+                if p.path in to_reindex or p.path not in already_indexed:
+                    logger.debug("Reindexing '%s'..." % p.url)
+                    a = {
+                            '_index': 'pages',
+                            '_type': 'page',
+                            '_source': self._get_body(p)
+                            }
+                    yield a
+
+        logger.debug("Indexing out-of-date pages...")
+        actions = action_maker()
+        bulk_index(self.es, actions)
+
+    def search(self, query):
+        body = {
+                'query': {
+                    'match': {'text': query}
+                    },
+                'highlight': {
+                    'tags_schema': 'styled',
+                    'fragment_size': 150,
+                    'fields': {
+                        'title': {'number_of_fragments': 0},
+                        'text': {'number_of_fragments': 5, 'order': 'score'}
+                        }
+                    }
+                }
+        res = self.es.search(
+                index='pages',
+                doc_type='page',
+                body=body)
+        logger.debug("Got %d hits." % res['hits']['total'])
+        for h in res['hits']['hits']:
+            yield HitResult(h['_source']['url'], h['_source']['title'], h['highlight']['text'])
+
+    def _get_body(self, page):
+        return {
+                'url': page.url,
+                'path': page.path,
+                'time': os.path.getmtime(page.path),
+                'title': page.title,
+                'text': page.text
+                }
+
--- a/wikked/indexer/native.py	Sat Dec 14 21:29:22 2013 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,108 +0,0 @@
-import os
-import os.path
-import codecs
-import logging
-from base import WikiIndex
-from whoosh.index import create_in, open_dir
-from whoosh.fields import Schema, ID, TEXT, STORED
-from whoosh.qparser import QueryParser
-
-
-logger = logging.getLogger(__name__)
-
-
-class WhooshWikiIndex(WikiIndex):
-    def __init__(self):
-        WikiIndex.__init__(self)
-
-    def initIndex(self, wiki):
-        self.store_dir = os.path.join(wiki.root, '.wiki', 'index')
-        if not os.path.isdir(self.store_dir):
-            logger.debug("Creating new index in: " + self.store_dir)
-            os.makedirs(self.store_dir)
-            self.ix = create_in(self.store_dir, self._getSchema())
-        else:
-            self.ix = open_dir(self.store_dir)
-
-    def reset(self, pages):
-        logger.debug("Re-creating new index in: " + self.store_dir)
-        self.ix = create_in(self.store_dir, schema=self._getSchema())
-        writer = self.ix.writer()
-        for page in pages:
-            self._indexPage(writer, page)
-        writer.commit()
-
-    def update(self, pages):
-        logger.debug("Updating index...")
-        to_reindex = set()
-        already_indexed = set()
-
-        with self.ix.searcher() as searcher:
-            writer = self.ix.writer()
-
-            for fields in searcher.all_stored_fields():
-                indexed_url = fields['url']
-                indexed_path = fields['path']
-                indexed_time = fields['time']
-
-                if not os.path.isfile(indexed_path):
-                    # File was deleted.
-                    self._unindexPage(writer, indexed_url)
-                else:
-                    already_indexed.add(indexed_path)
-                    if os.path.getmtime(indexed_path) > indexed_time:
-                        # File has changed since last index.
-                        self._unindexPage(writer, indexed_url)
-                        to_reindex.add(indexed_path)
-
-            for page in pages:
-                if page.path in to_reindex or page.path not in already_indexed:
-                    self._indexPage(writer, page)
-
-            writer.commit()
-        logger.debug("...done updating index.")
-
-    def search(self, query):
-        with self.ix.searcher() as searcher:
-            title_qp = QueryParser("title", self.ix.schema).parse(query)
-            content_qp = QueryParser("content", self.ix.schema).parse(query)
-            comp_query = title_qp | content_qp
-            results = searcher.search(comp_query)
-
-            page_infos = []
-            for hit in results:
-                page_info = {
-                        'title': hit['title'],
-                        'url': hit['url']
-                        }
-                page_info['title_highlights'] = hit.highlights('title')
-                with codecs.open(hit['path'], 'r', encoding='utf-8') as f:
-                    content = f.read()
-                page_info['content_highlights'] = hit.highlights('content', text=content)
-                page_infos.append(page_info)
-            return page_infos
-
-    def _getSchema(self):
-        schema = Schema(
-                url=ID(stored=True),
-                title=TEXT(stored=True),
-                content=TEXT,
-                path=STORED,
-                time=STORED
-                )
-        return schema
-
-    def _indexPage(self, writer, page):
-        logger.debug("Indexing '%s'." % page.url)
-        writer.add_document(
-            url=unicode(page.url),
-            title=unicode(page.title),
-            content=unicode(page.raw_text),
-            path=page.path,
-            time=os.path.getmtime(page.path)
-            )
-
-    def _unindexPage(self, writer, url):
-        logger.debug("Removing '%s' from index." % url)
-        writer.delete_by_term('url', url)
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/wikked/indexer/whoosh.py	Mon Dec 16 20:59:42 2013 -0800
@@ -0,0 +1,108 @@
+import os
+import os.path
+import codecs
+import logging
+from base import WikiIndex
+from whoosh.index import create_in, open_dir
+from whoosh.fields import Schema, ID, TEXT, STORED
+from whoosh.qparser import QueryParser
+
+
+logger = logging.getLogger(__name__)
+
+
+class WhooshWikiIndex(WikiIndex):
+    def __init__(self):
+        WikiIndex.__init__(self)
+
+    def initIndex(self, wiki):
+        self.store_dir = os.path.join(wiki.root, '.wiki', 'index')
+        if not os.path.isdir(self.store_dir):
+            logger.debug("Creating new index in: " + self.store_dir)
+            os.makedirs(self.store_dir)
+            self.ix = create_in(self.store_dir, self._getSchema())
+        else:
+            self.ix = open_dir(self.store_dir)
+
+    def reset(self, pages):
+        logger.debug("Re-creating new index in: " + self.store_dir)
+        self.ix = create_in(self.store_dir, schema=self._getSchema())
+        writer = self.ix.writer()
+        for page in pages:
+            self._indexPage(writer, page)
+        writer.commit()
+
+    def update(self, pages):
+        logger.debug("Updating index...")
+        to_reindex = set()
+        already_indexed = set()
+
+        with self.ix.searcher() as searcher:
+            writer = self.ix.writer()
+
+            for fields in searcher.all_stored_fields():
+                indexed_url = fields['url']
+                indexed_path = fields['path']
+                indexed_time = fields['time']
+
+                if not os.path.isfile(indexed_path):
+                    # File was deleted.
+                    self._unindexPage(writer, indexed_url)
+                else:
+                    already_indexed.add(indexed_path)
+                    if os.path.getmtime(indexed_path) > indexed_time:
+                        # File has changed since last index.
+                        self._unindexPage(writer, indexed_url)
+                        to_reindex.add(indexed_path)
+
+            for page in pages:
+                if page.path in to_reindex or page.path not in already_indexed:
+                    self._indexPage(writer, page)
+
+            writer.commit()
+        logger.debug("...done updating index.")
+
+    def search(self, query):
+        with self.ix.searcher() as searcher:
+            title_qp = QueryParser("title", self.ix.schema).parse(query)
+            content_qp = QueryParser("content", self.ix.schema).parse(query)
+            comp_query = title_qp | content_qp
+            results = searcher.search(comp_query)
+
+            page_infos = []
+            for hit in results:
+                page_info = {
+                        'title': hit['title'],
+                        'url': hit['url']
+                        }
+                page_info['title_highlights'] = hit.highlights('title')
+                with codecs.open(hit['path'], 'r', encoding='utf-8') as f:
+                    content = f.read()
+                page_info['content_highlights'] = hit.highlights('content', text=content)
+                page_infos.append(page_info)
+            return page_infos
+
+    def _getSchema(self):
+        schema = Schema(
+                url=ID(stored=True),
+                title=TEXT(stored=True),
+                content=TEXT,
+                path=STORED,
+                time=STORED
+                )
+        return schema
+
+    def _indexPage(self, writer, page):
+        logger.debug("Indexing '%s'." % page.url)
+        writer.add_document(
+            url=unicode(page.url),
+            title=unicode(page.title),
+            content=unicode(page.raw_text),
+            path=page.path,
+            time=os.path.getmtime(page.path)
+            )
+
+    def _unindexPage(self, writer, url):
+        logger.debug("Removing '%s' from index." % url)
+        writer.delete_by_term('url', url)
+
--- a/wikked/resources/defaults.cfg	Sat Dec 14 21:29:22 2013 -0800
+++ b/wikked/resources/defaults.cfg	Mon Dec 16 20:59:42 2013 -0800
@@ -4,5 +4,7 @@
 naming_policy=capitalize
 main_page=Main Page
 templates_dir=Templates
+indexer=whoosh
+database=sql
 database_url=sqlite:///%(root)s/.wiki/wiki.db
 
--- a/wikked/views/special.py	Sat Dec 14 21:29:22 2013 -0800
+++ b/wikked/views/special.py	Mon Dec 16 20:59:42 2013 -0800
@@ -30,10 +30,13 @@
 def api_search():
     query = request.args.get('q')
 
-    def is_hit_readable(hit):
-        page = get_page_or_none(hit['url'])
-        return page is None or is_page_readable(page)
-    hits = filter(is_hit_readable, g.wiki.index.search(query))
-    result = {'query': query, 'hits': hits}
+    readable_hits = []
+    hits = list(g.wiki.index.search(query))
+    for h in hits:
+        page = get_page_or_none(h.url, convert_url=False)
+        if page is not None and is_page_readable(page):
+            readable_hits.append({'url': h.url, 'title': h.title, 'text': h.hl_text})
+
+    result = {'query': query, 'hit_count': len(readable_hits), 'hits': readable_hits}
     return make_auth_response(result)
 
--- a/wikked/wiki.py	Sat Dec 14 21:29:22 2013 -0800
+++ b/wikked/wiki.py	Mon Dec 16 20:59:42 2013 -0800
@@ -4,13 +4,9 @@
 import logging
 import importlib
 from ConfigParser import SafeConfigParser, NoOptionError
-from page import FileSystemPage
-from fs import FileSystem
-from db.sql import SQLDatabase
-from scm.mercurial import MercurialCommandServerSourceControl
-from scm.git import GitLibSourceControl
-from indexer.native import WhooshWikiIndex
-from auth import UserManager
+from wikked.page import FileSystemPage
+from wikked.fs import FileSystem
+from wikked.auth import UserManager
 
 
 logger = logging.getLogger(__name__)
@@ -42,14 +38,23 @@
         return FileSystem(self.root)
 
     def index_factory(self, config):
-        return WhooshWikiIndex()
+        index_type = config.get('wiki', 'indexer')
+        if index_type == 'whoosh':
+            from wikked.indexer.whoosh import WhooshWikiIndex
+            return WhooshWikiIndex()
+        elif index_type == 'elastic':
+            from wikked.indexer.elastic import ElasticWikiIndex
+            return ElasticWikiIndex()
+        else:
+            raise InitializationError("No such indexer: " + index_type)
 
     def db_factory(self, config):
+        from wikked.db.sql import SQLDatabase
         return SQLDatabase()
 
     def scm_factory(self, config):
         try:
-            scm_type = config.get('wiki', 'scm')
+            scm_type = config.get('wiki', 'sourcecontrol')
         except NoOptionError:
             # Auto-detect
             if os.path.isdir(os.path.join(self.root, '.hg')):
@@ -61,8 +66,10 @@
                 scm_type = 'hg'
 
         if scm_type == 'hg':
+            from wikked.scm.mercurial import MercurialCommandServerSourceControl
             return MercurialCommandServerSourceControl(self.root)
         elif scm_type == 'git':
+            from wikked.scm.git import GitLibSourceControl
             return GitLibSourceControl(self.root)
         else:
             raise InitializationError("No such source control: " + scm_type)