changeset 31:e3c05dccc6dd

The indexer now opens files as UTF-8.
author Ludovic Chabant <ludovic@chabant.com>
date Sun, 06 Jan 2013 20:17:08 -0800
parents 420ff74c2e28
children 7bb67ac28c3c
files wikked/indexer.py
diffstat 1 file changed, 3 insertions(+), 2 deletions(-) [+]
line wrap: on
line diff
--- a/wikked/indexer.py	Sun Jan 06 20:16:19 2013 -0800
+++ b/wikked/indexer.py	Sun Jan 06 20:17:08 2013 -0800
@@ -1,5 +1,6 @@
 import os
 import os.path
+import codecs
 import logging
 from whoosh.index import create_in, open_dir
 from whoosh.fields import Schema, ID, KEYWORD, TEXT, STORED
@@ -91,8 +92,8 @@
                         'url': hit['url']
                         }
                 page_info['title_highlights'] = hit.highlights('title')
-                with open(hit['path']) as f:
-                    content = unicode(f.read())
+                with codecs.open(hit['path'], 'r', encoding='utf-8') as f:
+                    content = f.read()
                 page_info['content_highlights'] = hit.highlights('content', text=content)
                 page_infos.append(page_info)
             return page_infos