Mercurial > wikked
changeset 153:b4a69ee1a608
Support for live n-gram-based search with ElasticSearch indexer.
Better ElasticSearch initialization.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Mon, 23 Dec 2013 23:16:35 -0800 |
parents | 8e75c12b1cc9 |
children | 78b8febd362f |
files | wikked/indexer/elastic.py |
diffstat | 1 files changed, 79 insertions(+), 9 deletions(-) [+] |
line wrap: on
line diff
--- a/wikked/indexer/elastic.py Mon Dec 23 13:29:00 2013 -0800 +++ b/wikked/indexer/elastic.py Mon Dec 23 23:16:35 2013 -0800 @@ -27,14 +27,53 @@ self.es.indices.create( 'pages', body={ + 'settings': { + 'analysis': { + 'analyzer': { + 'pageTitlePreviewAnalyzer': { + 'type': 'custom', + 'tokenizer': 'standard', + 'filter': ['pageTitlePreviewFilter', 'lowercase'] + }, + 'pageTextAnalyzer': { + 'type': 'custom', + 'tokenizer': 'standard', + 'filter': ['standard', 'lowercase', 'stop'], + 'char_filter': 'html_strip' + } + }, + 'filter': { + 'pageTitlePreviewFilter': { + 'type': 'edgeNGram', + 'min_gram': 2, + 'max_gram': 10, + 'token_chars': ['letter', 'digit'] + } + } + } + }, 'mappings': { 'page': { 'properties': { 'url': {'type': 'string', 'index': 'not_analyzed'}, 'path': {'type': 'string', 'index': 'not_analyzed'}, 'time': {'type': 'float', 'index': 'not_analyzed'}, - 'title': {'type': 'string', 'boost': 2.0}, - 'text': {'type': 'string', 'index': 'analyzed', 'store': 'yes', 'analyzer': 'pageTextAnalyzer'} + 'title_preview': { + 'type': 'string', + 'index': 'analyzed', + 'analyzer': 'pageTitlePreviewAnalyzer' + }, + 'title': { + 'type': 'string', + 'boost': 4.0, + 'store': 'yes' + }, + 'text': { + 'type': 'string', + 'index': 'analyzed', + 'store': 'yes', + 'analyzer': 'pageTextAnalyzer' + } }, '_meta': { 'version': INDEX_VERSION @@ -114,17 +153,21 @@ actions = action_maker() bulk_index(self.es, actions) - def search(self, query): + def previewSearch(self, query): body = { + 'explain': True, + 'fields': ['title_preview', 'url'], 'query': { - 'match': {'text': query} + 'query_string': { + 'fields': ['title_preview'], + 'query': query + } }, 'highlight': { 'tags_schema': 'styled', - 'fragment_size': 150, + 'order': 'score', 'fields': { - 'title': {'number_of_fragments': 0}, - 'text': {'number_of_fragments': 5, 'order': 'score'} + 'title_preview': {'number_of_fragments': 2} } } } @@ -132,15 +175,42 @@ index='pages', doc_type='page', body=body) - logger.debug("Got %d 
hits." % res['hits']['total']) for h in res['hits']['hits']: - yield HitResult(h['_source']['url'], h['_source']['title'], h['highlight']['text']) + yield HitResult(h['fields']['url'], h['highlight']['title_preview']) + + def search(self, query): + body = { + 'fields': ['url', 'title', 'text'], + 'query': { + 'query_string': { + 'fields': ['title', 'text'], + 'query': query + } + }, + 'highlight': { + 'tags_schema': 'styled', + 'order': 'score', + 'fragment_size': 150, + 'fields': { + 'title': {'number_of_fragments': 2}, + 'text': {'number_of_fragments': 5} + } + } + } + res = self.es.search( + index='pages', + doc_type='page', + body=body) + for h in res['hits']['hits']: + yield HitResult(h['fields']['url'], h['fields']['title'], + h['highlight']['text']) def _get_body(self, page): return { 'url': page.url, 'path': page.path, 'time': os.path.getmtime(page.path), + 'title_preview': page.title, 'title': page.title, 'text': page.text }