# HG changeset patch # User Ludovic Chabant # Date 1490230301 25200 # Node ID e0127ea76516bfdeb765f2853eee15dffcc75622 # Parent a11a7afb53a71f9d1dd2b76836551f40602709e1 index: Better handling of search previews in whoosh. diff -r a11a7afb53a7 -r e0127ea76516 wikked/indexer/whooshidx.py --- a/wikked/indexer/whooshidx.py Mon Mar 20 23:22:00 2017 -0700 +++ b/wikked/indexer/whooshidx.py Wed Mar 22 17:51:41 2017 -0700 @@ -2,8 +2,8 @@ import os.path import logging from .base import WikiIndex, HitResult -from whoosh.analysis import (StandardAnalyzer, StemmingAnalyzer, - CharsetFilter, NgramFilter) +from whoosh.analysis import ( + StemmingAnalyzer, CharsetFilter, NgramWordAnalyzer) from whoosh.fields import Schema, ID, TEXT, STORED from whoosh.highlight import WholeFragmenter, UppercaseFormatter from whoosh.index import create_in, open_dir @@ -74,15 +74,16 @@ def previewSearch(self, query): with self.ix.searcher() as searcher: - title_qp = QueryParser("title_preview", self.ix.schema).parse(query) + title_qp = QueryParser( + "title_preview", self.ix.schema).parse(query) results = searcher.search(title_qp) results.fragmenter = WholeFragmenter() hits = [] for result in results: hit = HitResult( - result['url'], - result.highlights('title_preview', text=result['title'])) + result['url'], + result.highlights('title_preview', text=result['title'])) hits.append(hit) return hits @@ -105,12 +106,12 @@ return hits def _getSchema(self): - preview_analyzer = (StandardAnalyzer() | CharsetFilter(accent_map) | - NgramFilter(minsize=1)) + preview_analyzer = NgramWordAnalyzer(minsize=2) text_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map) schema = Schema( url=ID(stored=True), - title_preview=TEXT(analyzer=preview_analyzer, stored=False), + title_preview=TEXT(analyzer=preview_analyzer, + stored=False, phrase=False), title=TEXT(analyzer=text_analyzer, stored=True), text=TEXT(analyzer=text_analyzer, stored=True), path=STORED, @@ -124,7 +125,7 @@ url=page.url, title_preview=page.title, title=page.title, - text=page.text, + text=page.raw_text, path=page.path, time=os.path.getmtime(page.path) )