changeset 417:e0127ea76516

index: Better handling of search previews in whoosh.
author Ludovic Chabant <ludovic@chabant.com>
date Wed, 22 Mar 2017 17:51:41 -0700
parents a11a7afb53a7
children f8410cd802ad
files wikked/indexer/whooshidx.py
diffstat 1 files changed, 10 insertions(+), 9 deletions(-) [+]
line wrap: on
line diff
--- a/wikked/indexer/whooshidx.py	Mon Mar 20 23:22:00 2017 -0700
+++ b/wikked/indexer/whooshidx.py	Wed Mar 22 17:51:41 2017 -0700
@@ -2,8 +2,8 @@
 import os.path
 import logging
 from .base import WikiIndex, HitResult
-from whoosh.analysis import (StandardAnalyzer, StemmingAnalyzer,
-        CharsetFilter, NgramFilter)
+from whoosh.analysis import (
+    StemmingAnalyzer, CharsetFilter, NgramWordAnalyzer)
 from whoosh.fields import Schema, ID, TEXT, STORED
 from whoosh.highlight import WholeFragmenter, UppercaseFormatter
 from whoosh.index import create_in, open_dir
@@ -74,15 +74,16 @@
 
     def previewSearch(self, query):
         with self.ix.searcher() as searcher:
-            title_qp = QueryParser("title_preview", self.ix.schema).parse(query)
+            title_qp = QueryParser(
+                "title_preview", self.ix.schema).parse(query)
             results = searcher.search(title_qp)
             results.fragmenter = WholeFragmenter()
 
             hits = []
             for result in results:
                 hit = HitResult(
-                        result['url'],
-                        result.highlights('title_preview', text=result['title']))
+                    result['url'],
+                    result.highlights('title_preview', text=result['title']))
                 hits.append(hit)
             return hits
 
@@ -105,12 +106,12 @@
             return hits
 
     def _getSchema(self):
-        preview_analyzer = (StandardAnalyzer() | CharsetFilter(accent_map) |
-                NgramFilter(minsize=1))
+        preview_analyzer = NgramWordAnalyzer(minsize=2)
         text_analyzer = StemmingAnalyzer() | CharsetFilter(accent_map)
         schema = Schema(
                 url=ID(stored=True),
-                title_preview=TEXT(analyzer=preview_analyzer, stored=False),
+                title_preview=TEXT(analyzer=preview_analyzer,
+                                   stored=False, phrase=False),
                 title=TEXT(analyzer=text_analyzer, stored=True),
                 text=TEXT(analyzer=text_analyzer, stored=True),
                 path=STORED,
@@ -124,7 +125,7 @@
             url=page.url,
             title_preview=page.title,
             title=page.title,
-            text=page.text,
+            text=page.raw_text,
             path=page.path,
             time=os.path.getmtime(page.path)
             )