Mercurial > wikked
view wikked/db.py @ 64:0b4f4c23770a
Removed specific "include"-related properties from the code and DB.
(they can already be accessed via the meta)
Fixed a bug with handling includes in queries.
Better code in some places in the formatter.
author | Ludovic Chabant <ludovic@chabant.com> |
---|---|
date | Thu, 07 Feb 2013 22:34:13 -0800 |
parents | fb6ae96756c1 |
children | 9afe4a1dbd1e |
line wrap: on
line source
import os import os.path import types import string import logging import datetime import sqlite3 class conn_scope(object): """ Helper class, disguised as a function, to ensure the database has been opened before doing something. If the database wasn't open, it will be closed after the operation. """ def __init__(self, db): self.db = db self.do_close = False def __enter__(self): self.do_close = (self.db.conn is None) self.db.open() def __exit__(self, type, value, traceback): if self.do_close: self.db.close() class Database(object): """ The base class for a database cache. """ def __init__(self, logger=None): if logger is None: logger = logging.getLogger('wikked.db') self.logger = logger def initDb(self): raise NotImplementedError() def open(self): raise NotImplementedError() def close(self): raise NotImplementedError() def reset(self, pages): raise NotImplementedError() def update(self, pages): raise NotImplementedError() def getPageUrls(self, subdir=None): raise NotImplementedError() def getPages(self, subdir=None): raise NotImplementedError() def getPage(self, url): raise NotImplementedError() def pageExists(self, url): raise NotImplementedError() def getLinksTo(self, url): raise NotImplementedError() class SQLiteDatabase(Database): """ A database cache based on SQLite. """ schema_version = 1 def __init__(self, db_path, logger=None): Database.__init__(self, logger) self.db_path = db_path self.conn = None def initDb(self): create_schema = False if self.db_path != ':memory:': if not os.path.isdir(os.path.dirname(self.db_path)): # No database on disk... create one. self.logger.debug("Creating SQL database.") os.makedirs(os.path.dirname(self.db_path)) create_schema = True else: # The existing schema is outdated, re-create it. schema_version = self._getSchemaVersion() if schema_version < self.schema_version: create_schema = True else: create_schema = True if create_schema: with conn_scope(self): self._createSchema() def open(self): if self.conn is None: self.logger.debug("Opening connection") self.conn = sqlite3.connect(self.db_path, detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES) self.conn.row_factory = sqlite3.Row def close(self): if self.conn is not None: self.logger.debug("Closing connection") self.conn.close() self.conn = None def reset(self, pages): self.logger.debug("Re-creating SQL database.") with conn_scope(self): self._createSchema() c = self.conn.cursor() for page in pages: self._addPage(page, c) self.conn.commit() def update(self, pages): self.logger.debug("Updating SQL database...") to_update = set() already_added = set() with conn_scope(self): c = self.conn.cursor() c.execute('''SELECT id, time, path FROM pages''') for r in c.fetchall(): if not os.path.isfile(r['path']): # File was deleted. self._removePage(r['id'], c) else: already_added.add(r['path']) path_time = datetime.datetime.fromtimestamp( os.path.getmtime(r['path'])) if path_time > r['time']: # File has changed since last index. self._removePage(r['id'], c) to_update.add(r['path']) self.conn.commit() for page in pages: if (page.path in to_update or page.path not in already_added): self._addPage(page, c) self.conn.commit() self.logger.debug("...done updating SQL database.") def getPageUrls(self, subdir=None): with conn_scope(self): c = self.conn.cursor() if subdir: subdir = string.rstrip(subdir, '/') + '/%' c.execute('''SELECT url FROM pages WHERE url LIKE ?''', (subdir,)) else: c.execute('''SELECT url FROM pages''') urls = [] for row in c.fetchall(): urls.append(row['url']) return urls def getPages(self, subdir=None): with conn_scope(self): c = self.conn.cursor() if subdir: subdir = string.rstrip(subdir, '/') + '/%' c.execute('''SELECT id, url, path, time, title, raw_text, formatted_text FROM pages WHERE url LIKE ?''', (subdir,)) else: c.execute('''SELECT id, url, path, time, title, raw_text, formatted_text FROM pages''') pages = [] for row in c.fetchall(): pages.append(self._getPage(row, c)) return pages def getPage(self, url): with conn_scope(self): c = self.conn.cursor() c.execute('''SELECT id, url, path, time, title, raw_text, formatted_text FROM pages WHERE url=?''', (url,)) row = c.fetchone() if row is None: return None return self._getPage(row, c) def pageExists(self, url): with conn_scope(self): c = self.conn.cursor() c.execute('''SELECT id FROM pages WHERE url=?''', (url,)) return c.fetchone() is not None def getLinksTo(self, url): with conn_scope(self): c = self.conn.cursor() c.execute('''SELECT source FROM links WHERE target=?''', (url,)) sources = [] for r in c.fetchall(): sources.append(r['source']) return sources def _createSchema(self): self.logger.debug("Creating SQL schema...") c = self.conn.cursor() c.execute('''DROP TABLE IF EXISTS pages''') c.execute('''CREATE TABLE pages (id INTEGER PRIMARY KEY AUTOINCREMENT, time TIMESTAMP, url TEXT, path TEXT, title TEXT, raw_text TEXT, formatted_text TEXT)''') c.execute('''DROP TABLE IF EXISTS links''') c.execute('''CREATE TABLE links (id INTEGER PRIMARY KEY AUTOINCREMENT, source TEXT, target TEXT)''') c.execute('''DROP TABLE IF EXISTS meta''') c.execute('''CREATE TABLE meta (id INTEGER PRIMARY KEY AUTOINCREMENT, page_id INTEGER, name TEXT, value TEXT)''') c.execute('''DROP TABLE IF EXISTS info''') c.execute('''CREATE TABLE info (name TEXT UNIQUE NOT NULL, str_value TEXT, int_value INTEGER, time_value TIMESTAMP)''') c.execute('''INSERT INTO info (name, int_value) VALUES (?, ?)''', ('schema_version', self.schema_version)) self.conn.commit() def _getInfo(self, name, default=None): with conn_scope(self): c = self.conn.cursor() c.execute('''SELECT name, str_value FROM info WHERE name=?''', (name,)) row = c.fetchone() if row is None: return default return row['str_value'] def _getInfoInt(self, name, default=None): with conn_scope(self): c = self.conn.cursor() c.execute('''SELECT name, int_value FROM info WHERE name=?''', (name,)) row = c.fetchone() if row is None: return default return row['int_value'] def _getInfoTime(self, name, default=None): with conn_scope(self): c = self.conn.cursor() c.execute('''SELECT name, time_value FROM info WHERE name=?''', (name,)) row = c.fetchone() if row is None: return default return row['time_value'] def _getSchemaVersion(self): with conn_scope(self): c = self.conn.cursor() c.execute('''SELECT name FROM sqlite_master WHERE type="table" AND name="info"''') if c.fetchone() is None: return 0 c.execute('''SELECT int_value FROM info WHERE name="schema_version"''') row = c.fetchone() if row is None: return 0 return row[0] def _addPage(self, page, c): self.logger.debug("Adding page '%s' to SQL database." % page.url) now = datetime.datetime.now() c.execute('''INSERT INTO pages (time, url, path, title, raw_text, formatted_text) VALUES (?, ?, ?, ?, ?, ?)''', (now, page.url, page.path, page.title, page.raw_text, page.formatted_text)) page_id = c.lastrowid for name, value in page.local_meta.iteritems(): if isinstance(value, bool): value = "" if isinstance(value, types.StringTypes): c.execute('''INSERT INTO meta (page_id, name, value) VALUES (?, ?, ?)''', (page_id, name, value)) else: for v in value: c.execute('''INSERT INTO meta (page_id, name, value) VALUES (?, ?, ?)''', (page_id, name, v)) for link_url in page.local_links: c.execute('''INSERT INTO links (source, target) VALUES (?, ?)''', (page.url, link_url)) def _removePage(self, page_id, c): c.execute('''SELECT url FROM pages WHERE id=?''', (page_id,)) row = c.fetchone() self.logger.debug("Removing page '%s' [%d] from SQL database." % (row['url'], page_id)) c.execute('''DELETE FROM pages WHERE id=?''', (page_id,)) c.execute('''DELETE FROM meta WHERE page_id=?''', (page_id,)) c.execute('''DELETE FROM links WHERE source=?''', (row['url'],)) def _getPage(self, row, c): db_page = { 'url': row['url'], 'path': row['path'], 'time': row['time'], 'title': row['title'], 'content': row['raw_text'], 'formatted': row['formatted_text'], 'links': [], 'meta': {} } c.execute('''SELECT target FROM links WHERE source=?''', (row['url'],)) for r in c.fetchall(): db_page['links'].append(r['target']) c.execute('''SELECT page_id, name, value FROM meta WHERE page_id=?''', (row['id'],)) for r in c.fetchall(): value = r['value'] if value == '': value = True name = r['name'] if name not in db_page['meta']: db_page['meta'][name] = value elif db_page['meta'][name] is list: db_page['meta'][name].append(value) else: db_page['meta'][name] = [db_page['meta'][name], value] return db_page