wikked/indexer.py @ 68:4cb946982fca
Added POST endpoint to revert a page.
Fixed a stupid dictionary bug.
author | Ludovic Chabant <ludovic@chabant.com>
---|---
date | Tue, 12 Feb 2013 20:52:58 -0800
parents | fb6ae96756c1
children | 9afe4a1dbd1e
line source
import os
import os.path
import codecs
import logging
from whoosh.index import create_in, open_dir
from whoosh.fields import Schema, ID, TEXT, STORED
from whoosh.qparser import QueryParser


class WikiIndex(object):
    """ Base class for a full-text index of a wiki's pages. """
    def __init__(self, logger=None):
        self.logger = logger
        if logger is None:
            self.logger = logging.getLogger('wikked.index')

    def initIndex(self):
        raise NotImplementedError()

    def reset(self, pages):
        raise NotImplementedError()

    def update(self, pages):
        raise NotImplementedError()

    def search(self, query):
        raise NotImplementedError()


class WhooshWikiIndex(WikiIndex):
    """ A wiki index built on top of Whoosh. """
    def __init__(self, store_dir, logger=None):
        WikiIndex.__init__(self, logger)
        self.store_dir = store_dir

    def initIndex(self):
        # Open the index, creating it first if it doesn't exist yet.
        if not os.path.isdir(self.store_dir):
            self.logger.debug("Creating new index in: " + self.store_dir)
            os.makedirs(self.store_dir)
            self.ix = create_in(self.store_dir, self._getSchema())
        else:
            self.ix = open_dir(self.store_dir)

    def reset(self, pages):
        # Throw away any existing index and re-index all given pages.
        self.logger.debug("Re-creating new index in: " + self.store_dir)
        self.ix = create_in(self.store_dir, schema=self._getSchema())
        writer = self.ix.writer()
        for page in pages:
            page._ensureMeta()
            self._indexPage(writer, page)
        writer.commit()

    def update(self, pages):
        self.logger.debug("Updating index...")
        to_reindex = set()
        already_indexed = set()
        with self.ix.searcher() as searcher:
            writer = self.ix.writer()

            # First pass: check all indexed documents against the file
            # system, un-indexing those that were deleted or modified.
            for fields in searcher.all_stored_fields():
                indexed_url = fields['url']
                indexed_path = fields['path']
                indexed_time = fields['time']
                if not os.path.isfile(indexed_path):
                    # File was deleted.
                    self._unindexPage(writer, indexed_url)
                else:
                    already_indexed.add(indexed_path)
                    if os.path.getmtime(indexed_path) > indexed_time:
                        # File has changed since last index.
                        self._unindexPage(writer, indexed_url)
                        to_reindex.add(indexed_path)

            # Second pass: index pages that are new or were just un-indexed.
            for page in pages:
                page._ensureMeta()
                page_path = page._meta['path']
                if page_path in to_reindex or page_path not in already_indexed:
                    self._indexPage(writer, page)

            writer.commit()
        self.logger.debug("...done updating index.")

    def search(self, query):
        # Match the query against both page titles and page contents.
        with self.ix.searcher() as searcher:
            title_qp = QueryParser("title", self.ix.schema).parse(query)
            content_qp = QueryParser("content", self.ix.schema).parse(query)
            comp_query = title_qp | content_qp
            results = searcher.search(comp_query)

            page_infos = []
            for hit in results:
                page_info = {
                        'title': hit['title'],
                        'url': hit['url']
                        }
                page_info['title_highlights'] = hit.highlights('title')
                # The content field is not stored, so re-read the page's
                # text from disk to compute the highlights.
                with codecs.open(hit['path'], 'r', encoding='utf-8') as f:
                    content = f.read()
                page_info['content_highlights'] = hit.highlights(
                        'content',
                        text=content)
                page_infos.append(page_info)
            return page_infos

    def _getSchema(self):
        schema = Schema(
                url=ID(stored=True),
                title=TEXT(stored=True),
                content=TEXT,
                path=STORED,
                time=STORED)
        return schema

    def _indexPage(self, writer, page):
        self.logger.debug("Indexing '%s'." % page.url)
        writer.add_document(
                url=unicode(page.url),
                title=unicode(page.title),
                content=unicode(page.raw_text),
                path=page._meta['path'],
                time=os.path.getmtime(page._meta['path']))

    def _unindexPage(self, writer, url):
        self.logger.debug("Removing '%s' from index." % url)
        writer.delete_by_term('url', url)
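
For context, a minimal usage sketch of the class above (not part of this file, and written for Python 2 to match the unicode() calls in the module): it creates an index, indexes one page, and runs a search. The DummyPage class is invented here for illustration; it only mimics the small surface the indexer relies on (url, title, raw_text, _meta['path'], _ensureMeta()), whereas real page objects come from wikked's wiki layer. The sketch also assumes a file named main-page.txt exists on disk, since both indexing and highlighting read the page text from its path.

    import codecs
    from wikked.indexer import WhooshWikiIndex


    class DummyPage(object):
        # Hypothetical stand-in for a wikked page object.
        def __init__(self, url, title, path):
            self.url = url
            self.title = title
            self._meta = {'path': path}

        @property
        def raw_text(self):
            # Read the page text from disk, as the indexer expects.
            with codecs.open(self._meta['path'], 'r', encoding='utf-8') as f:
                return f.read()

        def _ensureMeta(self):
            pass  # Metadata is already populated in this sketch.


    index = WhooshWikiIndex('.index')
    index.initIndex()
    index.reset([DummyPage(u'main-page', u'Main Page', u'main-page.txt')])
    for info in index.search(u'main'):
        print info['url'], info['title']

Note that search() combines the title and content sub-queries with OR, so a hit on either field is enough to return the page.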