wikked: changeset 151:f32af0888382
Added support for ElasticSearch indexing:
- More configurable setup for wiki providers (SCM, indexing, etc.).
- Lazy importing of provider-specific packages.
- Nicer search results.
author     Ludovic Chabant <ludovic@chabant.com>
date       Mon, 16 Dec 2013 20:59:42 -0800
parents    800a1f681557
children   8e75c12b1cc9
files      manage.py
           static/css/wikked/main.less
           static/js/wikked/app.js
           static/js/wikked/models.js
           static/tpl/search-results.html
           wikked/indexer/base.py
           wikked/indexer/elastic.py
           wikked/indexer/native.py
           wikked/indexer/whoosh.py
           wikked/resources/defaults.cfg
           wikked/views/special.py
           wikked/wiki.py
diffstat   12 files changed, 309 insertions(+), 136 deletions(-)
--- a/manage.py	Sat Dec 14 21:29:22 2013 -0800
+++ b/manage.py	Mon Dec 16 20:59:42 2013 -0800
@@ -38,10 +38,13 @@
 
 
 @manager.command
-def reset(cache=False):
+def reset(cache=False, index_only=False):
     """ Re-generates the database and the full-text-search index.
     """
-    wiki.reset(cache_ext_data=cache)
+    if index_only:
+        wiki.index.reset(wiki.getPages())
+    else:
+        wiki.reset(cache_ext_data=cache)
 
 
 @manager.command
@@ -85,6 +88,14 @@
 
 
 @manager.command
+def search(query):
+    """ Searches the wiki.
+    """
+    hits = wiki.index.search(query)
+    print hits
+
+
+@manager.command
 def linksfrom(url):
     page = wiki.getPage(url)
     for l in page.links:
--- a/static/css/wikked/main.less	Sat Dec 14 21:29:22 2013 -0800
+++ b/static/css/wikked/main.less	Mon Dec 16 20:59:42 2013 -0800
@@ -20,3 +20,6 @@
     &:hover { color: @colorOrange; text-decoration: underline; }
 }
 
+em.hlt1, em.hlt2, em.hlt3, em.hlt4, em.hlt5, em.hlt6 {
+    background: @colorGreen;
+}
--- a/static/js/wikked/app.js	Sat Dec 14 21:29:22 2013 -0800
+++ b/static/js/wikked/app.js	Mon Dec 16 20:59:42 2013 -0800
@@ -143,7 +143,7 @@
             query = this.getQueryVariable('q');
         }
         var view = new Views.WikiSearchView({
-            model: new Models.WikiSearchModel()
+            model: new Models.WikiSearchModel({ query: query })
         });
         this.viewManager.switchView(view);
         this.navigate('/search/' + query);
--- a/static/js/wikked/models.js	Sat Dec 14 21:29:22 2013 -0800
+++ b/static/js/wikked/models.js	Mon Dec 16 20:59:42 2013 -0800
@@ -398,20 +398,13 @@
     });
 
     var WikiSearchModel = exports.WikiSearchModel = MasterPageModel.extend({
-        urlRoot: '/api/search/',
+        urlRoot: '/api/search',
         action: 'search',
         title: function() {
            return 'Search';
        },
-        execute: function(query) {
-            var $model = this;
-            $.getJSON('/api/search', { q: query })
-                .success(function (data) {
-                    $model.set('hits', data.hits);
-                })
-                .error(function() {
-                    alert("Error searching...");
-                });
+        url: function() {
+            return this.urlRoot + '?q=' + this.get('query');
        }
    });
 
--- a/static/tpl/search-results.html	Sat Dec 14 21:29:22 2013 -0800
+++ b/static/tpl/search-results.html	Mon Dec 16 20:59:42 2013 -0800
@@ -11,7 +11,7 @@
     {{#each hits}}
     <li>
         <h3><a href="{{get_read_url url}}">{{title}}</a></h3>
-        <div class="highlighted"><pre><code>{{{content_highlights}}}</code></pre></div>
+        <div class="highlighted">{{{text}}}</div>
     </li>
     {{/each}}
     </ul>
--- a/wikked/indexer/base.py	Sat Dec 14 21:29:22 2013 -0800
+++ b/wikked/indexer/base.py	Mon Dec 16 20:59:42 2013 -0800
@@ -1,3 +1,10 @@
+
+
+class HitResult(object):
+    def __init__(self, url, title, hl_text=None):
+        self.url = url
+        self.title = title
+        self.hl_text = hl_text
 
 
 class WikiIndex(object):
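The new HitResult class defines the contract between indexer backends and the rest of the wiki: search() yields HitResult objects, and the API layer only reads url, title and hl_text off them (see wikked/views/special.py below). As a minimal sketch, a conforming in-memory backend could look like this; the DummyWikiIndex name and its page attributes are illustrative only and not part of the changeset.

# Illustrative only: a toy in-memory backend following the same contract
# as the Whoosh and Elastic indexers in this changeset.
from wikked.indexer.base import HitResult, WikiIndex


class DummyWikiIndex(WikiIndex):
    def initIndex(self, wiki):
        self.pages = []

    def reset(self, pages):
        # Keep everything in memory instead of building a real index.
        self.pages = list(pages)

    def search(self, query):
        for p in self.pages:
            if query.lower() in p.text.lower():
                # hl_text carries the highlighted fragments shown in the UI.
                yield HitResult(p.url, p.title, hl_text=p.text[:150])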
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/wikked/indexer/elastic.py	Mon Dec 16 20:59:42 2013 -0800
@@ -0,0 +1,147 @@
+import os.path
+import logging
+from elasticsearch import Elasticsearch
+from elasticsearch.helpers import bulk_index
+from wikked.indexer.base import HitResult, WikiIndex
+
+
+INDEX_VERSION = 1
+
+
+logger = logging.getLogger(__name__)
+
+
+class ElasticWikiIndex(WikiIndex):
+    def __init__(self):
+        pass
+
+    def initIndex(self, wiki):
+        self.es = Elasticsearch()
+        if not self.es.indices.exists('pages'):
+            logger.debug("Creating the `pages` index.")
+            self.es.indices.create('pages')
+
+    def reset(self, pages):
+        logger.debug("Reseting the ElasticSearch index.")
+        self.es.indices.delete('pages')
+        self.es.indices.create(
+            'pages',
+            body={
+                'mappings': {
+                    'page': {
+                        'properties': {
+                            'url': {'type': 'string', 'index': 'not_analyzed'},
+                            'path': {'type': 'string', 'index': 'not_analyzed'},
+                            'time': {'type': 'float', 'index': 'not_analyzed'},
+                            'title': {'type': 'string', 'boost': 2.0},
+                            'text': {'type': 'string', 'index': 'analyzed', 'store': 'yes', 'analyzer': 'pageTextAnalyzer'}
+                        },
+                        '_meta': {
+                            'version': INDEX_VERSION
+                        }
+                    }
+                }
+            })
+
+        def action_maker():
+            for p in pages:
+                logger.debug("Indexing '%s'..." % p.url)
+                a = {
+                    '_index': 'pages',
+                    '_type': 'page',
+                    '_source': self._get_body(p)
+                }
+                yield a
+
+        actions = action_maker()
+        bulk_index(self.es, actions)
+
+    def update(self, pages):
+        to_reindex = set()
+        already_indexed = set()
+
+        offset = 0
+        bucket_size = 100
+        while True:
+            logger.debug("Grabbing documents %d to %d..." % (offset, offset + bucket_size))
+            body = {
+                'fields': ['url', 'path', 'time'],
+                'from': offset,
+                'size': bucket_size,
+                'query': {
+                    'match_all': {}
+                }
+            }
+            docs = self.es.search(
+                index='pages',
+                doc_type='page',
+                body=body)
+            total = docs['hits']['total']
+
+            for d in docs['hits']['hits']:
+                indexed_path = d['fields']['path']
+                indexed_time = d['fields']['time']
+
+                if not os.path.isfile(indexed_path):
+                    # File was deleted.
+                    self.es.delete(
+                        index='pages',
+                        doc_type='page',
+                        id=d['_id'])
+                else:
+                    already_indexed.add(indexed_path)
+                    if os.path.getmtime(indexed_path) > indexed_time:
+                        # File has changed since last index.
+                        to_reindex.add(indexed_path)
+
+            if offset + bucket_size < total:
+                offset += bucket_size
+            else:
+                break
+
+        def action_maker():
+            for p in pages:
+                if p.path in to_reindex or p.path not in already_indexed:
+                    logger.debug("Reindexing '%s'..." % p.url)
+                    a = {
+                        '_index': 'pages',
+                        '_type': 'page',
+                        '_source': self._get_body(p)
+                    }
+                    yield a
+
+        logger.debug("Indexing out-of-date pages...")
+        actions = action_maker()
+        bulk_index(self.es, actions)
+
+    def search(self, query):
+        body = {
+            'query': {
+                'match': {'text': query}
+            },
+            'highlight': {
+                'tags_schema': 'styled',
+                'fragment_size': 150,
+                'fields': {
+                    'title': {'number_of_fragments': 0},
+                    'text': {'number_of_fragments': 5, 'order': 'score'}
+                }
+            }
+        }
+        res = self.es.search(
+            index='pages',
+            doc_type='page',
+            body=body)
+        logger.debug("Got %d hits." % res['hits']['total'])
+        for h in res['hits']['hits']:
+            yield HitResult(h['_source']['url'], h['_source']['title'], h['highlight']['text'])
+
+    def _get_body(self, page):
+        return {
+            'url': page.url,
+            'path': page.path,
+            'time': os.path.getmtime(page.path),
+            'title': page.title,
+            'text': page.text
+        }
+
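A short usage sketch of the new backend, assuming a local Elasticsearch node on the default port and a `wiki` object as built by wikked/wiki.py. The highlighted fragments come back wrapped in Elasticsearch's "styled" highlight tags (<em class="hlt1"> and so on), which is what the new em.hlt* rules in main.less above style.

# Sketch only, assuming a local Elasticsearch node and a `wiki` object
# constructed by wikked/wiki.py.
from wikked.indexer.elastic import ElasticWikiIndex

index = ElasticWikiIndex()
index.initIndex(wiki)           # connects to localhost:9200, creates `pages` if needed
index.reset(wiki.getPages())    # bulk-indexes every page through bulk_index()
for hit in index.search('elastic'):
    # hit.hl_text is a list of highlighted fragments, e.g.
    # '... the <em class="hlt1">elastic</em> backend ...'
    print hit.url, hit.title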
--- a/wikked/indexer/native.py	Sat Dec 14 21:29:22 2013 -0800
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,108 +0,0 @@
-import os
-import os.path
-import codecs
-import logging
-from base import WikiIndex
-from whoosh.index import create_in, open_dir
-from whoosh.fields import Schema, ID, TEXT, STORED
-from whoosh.qparser import QueryParser
-
-
-logger = logging.getLogger(__name__)
-
-
-class WhooshWikiIndex(WikiIndex):
-    def __init__(self):
-        WikiIndex.__init__(self)
-
-    def initIndex(self, wiki):
-        self.store_dir = os.path.join(wiki.root, '.wiki', 'index')
-        if not os.path.isdir(self.store_dir):
-            logger.debug("Creating new index in: " + self.store_dir)
-            os.makedirs(self.store_dir)
-            self.ix = create_in(self.store_dir, self._getSchema())
-        else:
-            self.ix = open_dir(self.store_dir)
-
-    def reset(self, pages):
-        logger.debug("Re-creating new index in: " + self.store_dir)
-        self.ix = create_in(self.store_dir, schema=self._getSchema())
-        writer = self.ix.writer()
-        for page in pages:
-            self._indexPage(writer, page)
-        writer.commit()
-
-    def update(self, pages):
-        logger.debug("Updating index...")
-        to_reindex = set()
-        already_indexed = set()
-
-        with self.ix.searcher() as searcher:
-            writer = self.ix.writer()
-
-            for fields in searcher.all_stored_fields():
-                indexed_url = fields['url']
-                indexed_path = fields['path']
-                indexed_time = fields['time']
-
-                if not os.path.isfile(indexed_path):
-                    # File was deleted.
-                    self._unindexPage(writer, indexed_url)
-                else:
-                    already_indexed.add(indexed_path)
-                    if os.path.getmtime(indexed_path) > indexed_time:
-                        # File has changed since last index.
-                        self._unindexPage(writer, indexed_url)
-                        to_reindex.add(indexed_path)
-
-            for page in pages:
-                if page.path in to_reindex or page.path not in already_indexed:
-                    self._indexPage(writer, page)
-
-            writer.commit()
-        logger.debug("...done updating index.")
-
-    def search(self, query):
-        with self.ix.searcher() as searcher:
-            title_qp = QueryParser("title", self.ix.schema).parse(query)
-            content_qp = QueryParser("content", self.ix.schema).parse(query)
-            comp_query = title_qp | content_qp
-            results = searcher.search(comp_query)
-
-            page_infos = []
-            for hit in results:
-                page_info = {
-                    'title': hit['title'],
-                    'url': hit['url']
-                }
-                page_info['title_highlights'] = hit.highlights('title')
-                with codecs.open(hit['path'], 'r', encoding='utf-8') as f:
-                    content = f.read()
-                page_info['content_highlights'] = hit.highlights('content', text=content)
-                page_infos.append(page_info)
-            return page_infos
-
-    def _getSchema(self):
-        schema = Schema(
-            url=ID(stored=True),
-            title=TEXT(stored=True),
-            content=TEXT,
-            path=STORED,
-            time=STORED
-        )
-        return schema
-
-    def _indexPage(self, writer, page):
-        logger.debug("Indexing '%s'." % page.url)
-        writer.add_document(
-            url=unicode(page.url),
-            title=unicode(page.title),
-            content=unicode(page.raw_text),
-            path=page.path,
-            time=os.path.getmtime(page.path)
-        )
-
-    def _unindexPage(self, writer, url):
-        logger.debug("Removing '%s' from index." % url)
-        writer.delete_by_term('url', url)
-
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/wikked/indexer/whoosh.py	Mon Dec 16 20:59:42 2013 -0800
@@ -0,0 +1,108 @@
+import os
+import os.path
+import codecs
+import logging
+from base import WikiIndex
+from whoosh.index import create_in, open_dir
+from whoosh.fields import Schema, ID, TEXT, STORED
+from whoosh.qparser import QueryParser
+
+
+logger = logging.getLogger(__name__)
+
+
+class WhooshWikiIndex(WikiIndex):
+    def __init__(self):
+        WikiIndex.__init__(self)
+
+    def initIndex(self, wiki):
+        self.store_dir = os.path.join(wiki.root, '.wiki', 'index')
+        if not os.path.isdir(self.store_dir):
+            logger.debug("Creating new index in: " + self.store_dir)
+            os.makedirs(self.store_dir)
+            self.ix = create_in(self.store_dir, self._getSchema())
+        else:
+            self.ix = open_dir(self.store_dir)
+
+    def reset(self, pages):
+        logger.debug("Re-creating new index in: " + self.store_dir)
+        self.ix = create_in(self.store_dir, schema=self._getSchema())
+        writer = self.ix.writer()
+        for page in pages:
+            self._indexPage(writer, page)
+        writer.commit()
+
+    def update(self, pages):
+        logger.debug("Updating index...")
+        to_reindex = set()
+        already_indexed = set()
+
+        with self.ix.searcher() as searcher:
+            writer = self.ix.writer()
+
+            for fields in searcher.all_stored_fields():
+                indexed_url = fields['url']
+                indexed_path = fields['path']
+                indexed_time = fields['time']
+
+                if not os.path.isfile(indexed_path):
+                    # File was deleted.
+                    self._unindexPage(writer, indexed_url)
+                else:
+                    already_indexed.add(indexed_path)
+                    if os.path.getmtime(indexed_path) > indexed_time:
+                        # File has changed since last index.
+                        self._unindexPage(writer, indexed_url)
+                        to_reindex.add(indexed_path)
+
+            for page in pages:
+                if page.path in to_reindex or page.path not in already_indexed:
+                    self._indexPage(writer, page)
+
+            writer.commit()
+        logger.debug("...done updating index.")
+
+    def search(self, query):
+        with self.ix.searcher() as searcher:
+            title_qp = QueryParser("title", self.ix.schema).parse(query)
+            content_qp = QueryParser("content", self.ix.schema).parse(query)
+            comp_query = title_qp | content_qp
+            results = searcher.search(comp_query)
+
+            page_infos = []
+            for hit in results:
+                page_info = {
+                    'title': hit['title'],
+                    'url': hit['url']
+                }
+                page_info['title_highlights'] = hit.highlights('title')
+                with codecs.open(hit['path'], 'r', encoding='utf-8') as f:
+                    content = f.read()
+                page_info['content_highlights'] = hit.highlights('content', text=content)
+                page_infos.append(page_info)
+            return page_infos
+
+    def _getSchema(self):
+        schema = Schema(
+            url=ID(stored=True),
+            title=TEXT(stored=True),
+            content=TEXT,
+            path=STORED,
+            time=STORED
+        )
+        return schema
+
+    def _indexPage(self, writer, page):
+        logger.debug("Indexing '%s'." % page.url)
+        writer.add_document(
+            url=unicode(page.url),
+            title=unicode(page.title),
+            content=unicode(page.raw_text),
+            path=page.path,
+            time=os.path.getmtime(page.path)
+        )
+
+    def _unindexPage(self, writer, url):
+        logger.debug("Removing '%s' from index." % url)
+        writer.delete_by_term('url', url)
+
--- a/wikked/resources/defaults.cfg	Sat Dec 14 21:29:22 2013 -0800
+++ b/wikked/resources/defaults.cfg	Mon Dec 16 20:59:42 2013 -0800
@@ -4,5 +4,7 @@
 naming_policy=capitalize
 main_page=Main Page
 templates_dir=Templates
+indexer=whoosh
+database=sql
 database_url=sqlite:///%(root)s/.wiki/wiki.db
 
--- a/wikked/views/special.py	Sat Dec 14 21:29:22 2013 -0800
+++ b/wikked/views/special.py	Mon Dec 16 20:59:42 2013 -0800
@@ -30,10 +30,13 @@
 def api_search():
     query = request.args.get('q')
 
-    def is_hit_readable(hit):
-        page = get_page_or_none(hit['url'])
-        return page is None or is_page_readable(page)
-    hits = filter(is_hit_readable, g.wiki.index.search(query))
-    result = {'query': query, 'hits': hits}
+    readable_hits = []
+    hits = list(g.wiki.index.search(query))
+    for h in hits:
+        page = get_page_or_none(h.url, convert_url=False)
+        if page is not None and is_page_readable(page):
+            readable_hits.append({'url': h.url, 'title': h.title, 'text': h.hl_text})
+
+    result = {'query': query, 'hit_count': len(readable_hits), 'hits': readable_hits}
     return make_auth_response(result)
 
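With this change the /api/search payload looks roughly like the following (field names come straight from the code above; the values are made up). This is what WikiSearchModel now fetches via its url() method and what search-results.html renders through {{{text}}}.

# Hypothetical /api/search response after this change (values made up).
example_response = {
    'query': 'elastic',
    'hit_count': 1,
    'hits': [
        {
            'url': '/main-page',
            'title': 'Main Page',
            # highlighted fragments produced by the indexer backend
            'text': ['... the <em class="hlt1">elastic</em> backend ...'],
        },
    ],
}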
--- a/wikked/wiki.py	Sat Dec 14 21:29:22 2013 -0800
+++ b/wikked/wiki.py	Mon Dec 16 20:59:42 2013 -0800
@@ -4,13 +4,9 @@
 import logging
 import importlib
 from ConfigParser import SafeConfigParser, NoOptionError
-from page import FileSystemPage
-from fs import FileSystem
-from db.sql import SQLDatabase
-from scm.mercurial import MercurialCommandServerSourceControl
-from scm.git import GitLibSourceControl
-from indexer.native import WhooshWikiIndex
-from auth import UserManager
+from wikked.page import FileSystemPage
+from wikked.fs import FileSystem
+from wikked.auth import UserManager
 
 
 logger = logging.getLogger(__name__)
@@ -42,14 +38,23 @@
         return FileSystem(self.root)
 
     def index_factory(self, config):
-        return WhooshWikiIndex()
+        index_type = config.get('wiki', 'indexer')
+        if index_type == 'whoosh':
+            from wikked.indexer.whoosh import WhooshWikiIndex
+            return WhooshWikiIndex()
+        elif index_type == 'elastic':
+            from wikked.indexer.elastic import ElasticWikiIndex
+            return ElasticWikiIndex()
+        else:
+            raise InitializationError("No such indexer: " + index_type)
 
     def db_factory(self, config):
+        from wikked.db.sql import SQLDatabase
         return SQLDatabase()
 
     def scm_factory(self, config):
         try:
-            scm_type = config.get('wiki', 'scm')
+            scm_type = config.get('wiki', 'sourcecontrol')
         except NoOptionError:
             # Auto-detect
             if os.path.isdir(os.path.join(self.root, '.hg')):
@@ -61,8 +66,10 @@
                 scm_type = 'hg'
 
         if scm_type == 'hg':
+            from wikked.scm.mercurial import MercurialCommandServerSourceControl
             return MercurialCommandServerSourceControl(self.root)
         elif scm_type == 'git':
+            from wikked.scm.git import GitLibSourceControl
            return GitLibSourceControl(self.root)
         else:
             raise InitializationError("No such source control: " + scm_type)
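Finally, a sketch of how the new provider options are consumed: the option names (indexer, sourcecontrol) and the [wiki] section come from defaults.cfg and the factories above, while the override values and the inline config string are only illustrative.

# Sketch only: overriding the new provider options in a wiki config,
# read the same way wiki.py reads defaults.cfg (SafeConfigParser).
from StringIO import StringIO
from ConfigParser import SafeConfigParser

config = SafeConfigParser()
config.readfp(StringIO(
    "[wiki]\n"
    "indexer=elastic\n"         # pick the new ElasticSearch backend
    "sourcecontrol=hg\n"        # renamed from the old `scm` option
))

print config.get('wiki', 'indexer')         # -> elastic
print config.get('wiki', 'sourcecontrol')   # -> hg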