comparison tools/import_mediawiki.py @ 426:bf65fba2854c

tools: Add some simple MediaWiki importer.
author Ludovic Chabant <ludovic@chabant.com>
date Tue, 28 Mar 2017 21:25:00 -0700
parents
children
comparison
equal deleted inserted replaced
425:e28f2c76691c 426:bf65fba2854c
1 import os
2 import os.path
3 import argparse
4 from sqlalchemy import create_engine
5
6
def _as_text(value):
    """Return *value* as text, decoding it as UTF-8 if it is bytes.

    Values that are already ``str`` are returned unchanged, so the importer
    does not crash when a row field is not a bytes object.
    """
    if isinstance(value, (bytes, bytearray)):
        return value.decode('utf8')
    return value


def _find_available_path(path_noext, ext, max_suffix=100):
    """Return the first ``<path_noext>_<n><ext>`` path that does not exist.

    Suffix numbers 2 through *max_suffix* (inclusive) are tried in order.

    Raises:
        Exception: if every candidate path already exists.
    """
    for suffnum in range(2, max_suffix + 1):
        candidate = '%s_%d%s' % (path_noext, suffnum, ext)
        if not os.path.exists(candidate):
            return candidate
    raise Exception(
        "Can't find available path for: %s" % (path_noext + ext))


def main():
    """Export the latest text of every MediaWiki page to one file per page.

    Command-line arguments:
        url: database URL handed to ``create_engine``.
        -o / --out: output directory (default ``wikked_import``); it must
            not exist yet.
        --prefix: table-name prefix (default ``wiki``), giving the tables
            ``<prefix>_page``, ``<prefix>_revision`` and ``<prefix>_text``.
        -v / --verbose: passed to ``create_engine`` as ``echo``.
        --ext: extension of the written files (default ``.md``); a leading
            dot is optional.

    Returns:
        1 if the output directory argument is empty or the directory already
        exists; None after a completed export.

    Raises:
        Exception: if a page's file name is taken and no free ``_<n>``
            variant (n = 2..100) can be found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('url')
    parser.add_argument('-o', '--out', default='wikked_import')
    parser.add_argument('--prefix', default='wiki')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('--ext', default='.md')
    args = parser.parse_args()

    prefix = args.prefix
    out_dir = args.out
    # Normalize so that both "md" and ".md" yield ".md".
    ext = '.' + args.ext.lstrip('.')

    if not out_dir:
        parser.print_help()
        return 1

    if os.path.isdir(out_dir):
        print("The output directory already exists!")
        return 1

    # The prefix is interpolated into the statement because it forms part of
    # the table names, which cannot be passed as bound parameters.
    query = (
        'SELECT '
        'p.page_id,p.page_title,p.page_latest,'
        'r.rev_id,r.rev_text_id,t.old_id,t.old_text '
        'from %(prefix)s_page p '
        'INNER JOIN %(prefix)s_revision r ON p.page_latest = r.rev_id '
        'INNER JOIN %(prefix)s_text t ON r.rev_text_id = t.old_id;' %
        {'prefix': prefix})

    engine = create_engine(args.url, echo=args.verbose)
    conn = engine.connect()
    try:
        for p in conn.execute(query):
            title = _as_text(p['page_title'])
            text = _as_text(p['old_text'])

            path_noext = os.path.join(out_dir, title)
            path = path_noext + ext
            # A title may contain path separators, so the parent directory
            # is derived from the full path rather than assumed to be
            # out_dir.
            os.makedirs(os.path.dirname(path), exist_ok=True)

            if os.path.exists(path):
                new_path = _find_available_path(path_noext, ext)
                print("WARNING: %s exists" % path)
                print("WARNING: creating %s instead" % new_path)
                path = new_path

            print(p['page_id'], title)
            with open(path, 'w', encoding='utf8') as fp:
                fp.write(text)
    finally:
        # Release the connection even when the export fails part-way.
        conn.close()
70
71
if __name__ == '__main__':
    # Use main()'s return value as the process exit status: 1 for the
    # early-exit error paths, None (exit status 0) after a normal run.
    raise SystemExit(main())