annotate tools/import_mediawiki.py @ 464:1dc6a0a74da3

wiki: Improve consistency of absolute/relative links. - Make links from endpoint pages go to the same endpoint by default. - Add support for `:` (empty) endpoint to link outside of endpoints. - Add unit tests.
author Ludovic Chabant <ludovic@chabant.com>
date Sat, 06 Oct 2018 19:40:52 -0700
parents bf65fba2854c
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
426
bf65fba2854c tools: Add some simple MediaWiki importer.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
1 import os
bf65fba2854c tools: Add some simple MediaWiki importer.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
2 import os.path
bf65fba2854c tools: Add some simple MediaWiki importer.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
3 import argparse
bf65fba2854c tools: Add some simple MediaWiki importer.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
4 from sqlalchemy import create_engine
bf65fba2854c tools: Add some simple MediaWiki importer.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
5
bf65fba2854c tools: Add some simple MediaWiki importer.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
6
bf65fba2854c tools: Add some simple MediaWiki importer.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
def _find_available_path(path_noext, ext, max_suffix=100):
    """Return the first unused ``<path_noext>_<n><ext>`` path.

    Suffix numbers are tried in order starting at 2, up to and including
    ``max_suffix``.

    Raises:
        Exception: if every candidate path already exists.
    """
    for suffnum in range(2, max_suffix + 1):
        candidate = '%s_%d%s' % (path_noext, suffnum, ext)
        if not os.path.exists(candidate):
            return candidate
    raise Exception("Can't find available path for: %s" %
                    (path_noext + ext))


def main():
    """Export the latest revision of every MediaWiki page to a text file.

    Command-line arguments:
        url: database URL handed to SQLAlchemy's ``create_engine``.
        -o/--out: output directory (default ``wikked_import``); it must not
            already exist.
        --prefix: table name prefix (default ``wiki``), used to build the
            ``<prefix>_page``, ``<prefix>_revision`` and ``<prefix>_text``
            table names.
        -v/--verbose: passed to ``create_engine`` as ``echo``.
        --ext: extension of the written files (default ``.md``); a leading
            dot is optional.

    Returns:
        1 when the output directory argument is empty or the directory
        already exists; None after a completed import.

    Raises:
        Exception: when no free file name can be found for a page whose
            target path is already taken.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('url')
    parser.add_argument('-o', '--out', default='wikked_import')
    parser.add_argument('--prefix', default='wiki')
    parser.add_argument('-v', '--verbose', action='store_true')
    parser.add_argument('--ext', default='.md')
    args = parser.parse_args()

    prefix = args.prefix
    out_dir = args.out
    # Normalize so that both "md" and ".md" yield ".md".
    ext = '.' + args.ext.lstrip('.')

    if not out_dir:
        parser.print_help()
        return 1

    if os.path.isdir(out_dir):
        print("The output directory already exists!")
        return 1

    # Table names cannot be bound as query parameters, so the prefix is
    # interpolated directly; it comes from the operator's command line.
    query = (
        'SELECT '
        'p.page_id,p.page_title,p.page_latest,'
        'r.rev_id,r.rev_text_id,t.old_id,t.old_text '
        'from %(prefix)s_page p '
        'INNER JOIN %(prefix)s_revision r ON p.page_latest = r.rev_id '
        'INNER JOIN %(prefix)s_text t ON r.rev_text_id = t.old_id;' %
        {'prefix': prefix})

    engine = create_engine(args.url, echo=args.verbose)
    conn = engine.connect()
    try:
        # NOTE(review): passing a plain string to execute() and indexing rows
        # by column name presumably targets the SQLAlchemy 1.x API -- confirm
        # against the installed version.
        q = conn.execute(query)
        for p in q:
            # Both columns are decoded as UTF-8, i.e. the driver is expected
            # to hand them back as bytes.
            title = p['page_title'].decode('utf8')
            text = p['old_text'].decode('utf8')

            path_noext = os.path.join(out_dir, title)
            path = path_noext + ext
            # A title containing a path separator lands in a sub-directory,
            # so create the parent directories on demand.
            dirname = os.path.dirname(path)
            if not os.path.isdir(dirname):
                os.makedirs(dirname)

            if os.path.exists(path):
                # Another page already produced this file name: fall back to
                # a numbered variant rather than overwrite it.
                new_path = _find_available_path(path_noext, ext)
                print("WARNING: %s exists" % path)
                print("WARNING: creating %s instead" % new_path)
                path = new_path

            print(p['page_id'], title)
            with open(path, 'w', encoding='utf8') as fp:
                fp.write(text)
    finally:
        # Release the connection even when a page fails to export.
        conn.close()
bf65fba2854c tools: Add some simple MediaWiki importer.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
70
bf65fba2854c tools: Add some simple MediaWiki importer.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
71
bf65fba2854c tools: Add some simple MediaWiki importer.
Ludovic Chabant <ludovic@chabant.com>
parents:
diff changeset
if __name__ == '__main__':
    import sys

    # main() returns 1 on its error paths; hand that to the shell as the
    # exit status instead of discarding it (None maps to exit status 0).
    sys.exit(main())