-
Notifications
You must be signed in to change notification settings - Fork 17
/
mediawiki_to_md.py
executable file
·486 lines (417 loc) · 15.9 KB
/
mediawiki_to_md.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
#!/usr/bin/env python3
import argparse
import glob
import os
import re
import sys
import subprocess
import tempfile
# Script-wide settings (candidates for becoming command line options).
debug = False
__version__ = "2.0.2"

# The version flags and the bare invocation are handled by hand here, ahead
# of the argparse set-up further down the script.
if any(flag in sys.argv for flag in ("-v", "--version")):
    print("This is mediawiki_to_git_md script mediawiki_to_md version " + __version__)
    sys.exit(0)

if len(sys.argv) == 1:
    # No arguments at all: version banner, blank line, usage hint, blank line.
    print(
        "This is mediawiki_to_git_md script mediawiki_to_md version "
        + __version__
        + "\n\nBasic Usage: ./mediawiki_to_md .\n"
    )
    sys.exit()
# Long-form help text, shown verbatim after the option list (see the
# RawDescriptionHelpFormatter below).
usage = """\
Run this script in a git repository where it will make commits to the
current branch based on having already parsed a MediaWiki XML dump. e.g.
$ git tag start
$ git checkout -b import_branch
$ python xml_to_git.py -i ../dump.xml
Then:
$ python mediawiki_to_md.py -i .
Tagging the repository before starting and/or making a branch makes it
easy to revert. As of v2, this records the revisions in the original
MediaWiki markup, with this script handling final commits converting the
final version into Markdown using Pandoc.
"""

parser = argparse.ArgumentParser(
    prog="mediawiki_to_md.py",
    description="Turn set of MediaWiki files into Markdown for GitHub Pages",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=usage,
)
# One or more files/folders to convert; this is the only required option.
parser.add_argument(
    "-i",
    "--input",
    nargs="+",
    required=True,
    metavar="NAMES",
    help="MediaWiki filenames and/or foldernames within the current git repository.",
)
# The prefix doubles as the output subfolder and the start of each permalink.
parser.add_argument(
    "-p",
    "--prefix",
    default="wiki/",
    metavar="PREFIX",
    help="URL prefix and subfolder, default 'wiki/'.",
)
# File extensions, given without the leading dot.
parser.add_argument(
    "--mediawiki-ext",
    default="mediawiki",
    metavar="EXT",
    help="File extension for MediaWiki files, default 'mediawiki'.",
)
parser.add_argument(
    "--markdown-ext",
    default="md",
    metavar="EXT",
    help="File extension for MarkDown files, default 'md'.",
)

args = parser.parse_args()
# Module-level shortcuts used by the helper functions and main loops below.
prefix, mediawiki_ext, markdown_ext = (
    args.prefix,
    args.mediawiki_ext,
    args.markdown_ext,
)
# Hard-coded settings; should any of these be configurable?

# Title prefixes to ignore. Written with spaces, so beware spaces versus
# underscores when comparing against titles.
page_prefixes_to_ignore = ["Help:", "MediaWiki:", "Talk:", "User:", "User talk:"]

# Layout named in the YAML header of ordinary pages; may also be None.
# Category pages get the "tagpage" layout instead.
default_layout = "wiki"

# External tools, assumed to be on the path.
git = "git"
pandoc = "pandoc"
def check_pandoc():
    """Confirm pandoc can be run, and report which version was found.

    Runs ``pandoc --version`` and prints every output line which starts
    with "pandoc " and contains a dot. Exits the script if pandoc cannot
    be launched, returns a non-zero status, or prints nothing.
    """
    try:
        result = subprocess.run(
            [pandoc, "--version"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
        )
    except OSError:
        sys.exit("Could not find pandoc on $PATH")
    if result.returncode != 0:
        sys.exit("Error %i from pandoc version check\n" % result.returncode)
    if not result.stdout:
        sys.exit("No output from pandoc version check\n")
    for banner in result.stdout.split("\n"):
        if "." in banner and banner.startswith("pandoc "):
            print("Will be using " + banner)
check_pandoc()

# NOTE(review): neither of these two names is read anywhere else in this
# script - possibly leftovers; confirm before removing them.
missing_users = dict()
unwanted_commits = 0

# Validate the environment with explicit checks rather than assert, so the
# checks still run when Python is started with -O.
if not os.path.isdir(".git"):
    sys.exit("Expected to be in a Git repository!")
if prefix:
    if not prefix.endswith("/"):
        sys.exit("ERROR: Prefix %r must end with a slash" % prefix)
    if not os.path.isdir(prefix):
        # makedirs so that a nested prefix such as "docs/wiki/" also works.
        os.makedirs(prefix)
def un_div(text):
    """Remove wrapping <div...>text</div> leaving just text.

    If the text (ignoring surrounding white space) opens with a ``<div>``
    or ``<div ...>`` tag and finishes with ``</div>``, the stripped inner
    content is returned. Only the outermost wrapper is removed. Any other
    input is returned unchanged, including its original white space.
    """
    stripped = text.strip()
    if not (stripped.startswith(("<div ", "<div>")) and stripped.endswith("</div>")):
        return text
    without_close = stripped[: -len("</div>")]
    end_of_open_tag = without_close.find(">")
    if end_of_open_tag == -1:
        # Malformed, e.g. "<div </div>": the opening tag is never closed,
        # so leave the text alone rather than raising an exception.
        return text
    return without_close[end_of_open_tag + 1 :].strip()
# Import-time sanity check: a styled div around an image link is unwrapped.
tmp = (
    '<div style="float:left; maxwidth: 180px; margin-left:25px; '
    'margin-right:15px; background-color: #FFFFFF">'
    "[[Image:Pear.png|left|The Bosc Pear]]</div>"
)
assert un_div(tmp) == "[[Image:Pear.png|left|The Bosc Pear]]", un_div(tmp)
del tmp
def cleanup_mediawiki(text):
    """Modify mediawiki markup to make it pandoc ready.

    Long term this needs to be highly configurable on a site-by-site
    basis, but for now local hacks live here.

    The text must start with our header ("---", a "title: ..." line, then
    "---"), otherwise the script exits. The markup after the header is
    processed line by line:

    - Language tags like <python>...</python> are rewritten as
      <source lang=python>...</source> (see the list of languages below).
    - Lines which are just __TOC__, __FORCETOC__ or __NOTOC__ (optionally
      wrapped in a div) are dropped.
    - A div wrapped around a single [[Image:...]] link is removed.
    - [[Category:...]] tags are removed and their names collected.
    - The colon in [[:Category:...]] and [[User:...]] links is replaced
      with %3A.

    Returns tuple: cleaned up text, list of any categories, title
    """
    # Language tags were probably set up via SyntaxHighlight GeSHi for
    # biopython.org's wiki, e.g.
    #
    # <python>
    # import antigravity
    # </python>
    #
    # Replacing it with the following makes pandoc happy,
    #
    # <source lang=python>
    # import antigravity
    # </source>
    #
    # Conversion by pandoc to GitHub Flavour Markdown gives:
    #
    # ``` python
    # import antigravity
    # ```
    #
    # Which is much nicer.
    #
    # =================================================
    #
    # There does not seem to be an easy way to get a table-of-contents
    # with (GitHub Flavoured) Markdown which works on GitHub pages.
    # Meanwhile the MediaWiki __TOC__ etc get left in the .md file, so
    # they are simply removed here.
    new = []
    categories = []
    languages = ["python", "perl", "sql", "bash", "ruby", "java", "xml"]
    # This is fragile, but good enough
    if not text.startswith("---\ntitle: "):
        sys.exit("ERROR: Missing our title header")
    # Slicing at 10 leaves the space which follows "title:", and the strip
    # then removes it (along with any white space at the very end).
    text = text[10:].strip()
    title, text = text.split("\n", 1)
    assert text.startswith("---\n")
    text = text[4:]
    for line in text.split("\n"):
        # TODO - line = line.replace("\xe2\x80\x8e".decode("utf-8"), "")  # LEFT-TO-RIGHT
        # TODO - Would benefit from state tracking (for tag mismatches)
        for lang in languages:
            open_tag = "<%s>" % lang
            close_tag = "</%s>" % lang
            # Easy case <python> etc
            if line.lower().startswith(open_tag):
                # The replacement tag must be closed with ">" before the
                # line break, otherwise pandoc is not given a complete
                # <source lang=...> tag.
                line = (("<source lang=%s>\n" % lang) + line[len(open_tag) :]).strip()
            # Also cope with <python id=example> etc:
            elif line.startswith("<%s " % lang) and ">" in line:
                line = (("<source lang=%s " % lang) + line[len(lang) + 2 :]).strip()
            # Want to support <python>print("Hello world")</python>
            # where open and closing tags are on the same line, so the
            # closing tag is checked regardless of the branches above:
            if line.rstrip() == close_tag:
                line = "</source>"
            elif line.rstrip().endswith(close_tag):
                line = line.replace(close_tag, "\n</source>")
        undiv = un_div(line)
        if undiv in ["__TOC__", "__FORCETOC__", "__NOTOC__"]:
            continue
        elif undiv.startswith("[[Image:") and undiv.endswith("]]"):
            # Markdown image wrapped in a div does not render on Github Pages,
            # remove the div and any attempt at styling it (e.g. alignment)
            line = undiv
        # Look for any category tag, usually done as a single line.
        # A line left empty by this is still kept, as a blank line.
        while "[[Category:" in line:
            tag = line[line.index("[[Category:") + 11 :]
            tag = tag[: tag.index("]]")]
            assert ("[[Category:%s]]" % tag) in line, "Infered %r from %s" % (tag, line)
            categories.append(tag)
            line = line.replace("[[Category:%s]]" % tag, "").strip()
        # Special case fix for any category links,
        # See https://github.com/jgm/pandoc/issues/2849
        if "[[:Category:" in line:
            line = line.replace("[[:Category:", "[[Category%3A")
        if "[[User:" in line:
            line = line.replace("[[User:", "[[User%3A")
        new.append(line)
    return "\n".join(new), categories, title
# Import-time sanity check: header parsed, div around the image removed,
# no categories found.
tmp = (
    "---\n"
    "title: Test\n"
    "---\n"
    '<div style="float:left; maxwidth: 180px; margin-left:25px; '
    'margin-right:15px; background-color: #FFFFFF">'
    "[[Image:Pear.png|left|The Bosc Pear]]</div>"
)
assert cleanup_mediawiki(tmp) == (
    "[[Image:Pear.png|left|The Bosc Pear]]",
    [],
    "Test",
), cleanup_mediawiki(tmp)
del tmp
def cleanup_markdown(text, source_url):
    """Post-process markdown from pandoc before saving it.

    Currently this only tweaks internal wikilinks on pages which use child
    namespaces with slashes in them. Problem is MediaWiki treats the links
    as absolute (from base path), while Jekyll will treat them as relative
    (to the current path).

    text is the markdown from pandoc, and source_url is the URL of the
    page it belongs to (which must start with the prefix, if one is set).
    Pages whose URL has no slash after the prefix are returned unchanged.
    Otherwise every ...](URL "wikilink") whose URL starts with a capital
    letter has the URL made relative to the folder holding the page.
    """
    if prefix:
        assert prefix.endswith("/") and source_url.startswith(prefix)
        source = source_url[len(prefix) :]
        assert not prefix.startswith("/")
    else:
        source = source_url
    if "/" not in source:
        return text
    base = source.rsplit("/", 1)[0]

    # Looking for ...](URL "wikilink")... where the URL should look like a
    # relative link. Requiring a leading capital letter already rules out
    # http:, https:, ftp:, mailto: and /absolute targets. The match is
    # non-greedy so that several links on one line are handled one by one.
    wikilink = re.compile(r'\]\(([A-Z].*?) "wikilink"\)')

    def relative_link(match):
        return '](%s "wikilink")' % os.path.relpath(match.group(1), base)

    # One pass with sub() means a link which has already been rewritten can
    # never be picked up and rewritten a second time.
    return wikilink.sub(relative_link, text)
def make_cannonical(title):
    """Spaces to underscore; first letter upper case only.

    Everything after the first character is lower cased. An empty title
    gives an empty string.
    """
    # Cannot use .title(), e.g. 'Biopython small.jpg' --> 'Biopython Small.Jpg'
    title = title.replace(" ", "_")
    # Slicing (rather than indexing) copes with an empty title.
    return title[:1].upper() + title[1:].lower()
def make_url(title):
    """Return the URL for a page title, under the configured prefix.

    Spaces become underscores and colons become %3A. No trailing slash
    is added.
    """
    escaped = title.replace(" ", "_")
    escaped = escaped.replace(":", "%3A")
    return os.path.join(prefix, escaped)
def make_filename(title, ext):
    """Return the filename for a page title, under the configured prefix.

    Spaces, colons and slashes all become underscores, and the given
    extension is added.

    Colons are avoided in filenames for the sake of Windows, and slashes
    because they cause problems with automatic links when there are
    child-folders. In both cases the desired URL is obtained via the
    permalink entry in the YAML header instead.
    """
    flattened = title.translate(str.maketrans(" :/", "___"))
    return os.path.join(prefix, flattened + os.path.extsep + ext)
def ignore_by_prefix(title):
    """Return True if the title starts with one of page_prefixes_to_ignore."""
    return any(title.startswith(unwanted) for unwanted in page_prefixes_to_ignore)
def run(cmd_string):
    """Run a command line via the shell, exiting the script if it fails.

    Returns None on success. On failure the return code and the command
    are written to stderr and the script exits with that return code.

    The command is interpreted by the shell, so it must never be built
    from untrusted text - use runsafe() with a list of arguments instead.
    """
    # subprocess.call gives the real exit code. os.system returns a wait
    # status on Unix (e.g. 256 for exit code 1), which sys.exit would then
    # truncate to a misleading exit status of zero.
    return_code = subprocess.call(cmd_string, shell=True)
    if return_code:
        sys.stderr.write("Error %i from: %s\n" % (return_code, cmd_string))
        sys.exit(return_code)
def runsafe(cmd_array):
    """Run a command given as a list of strings, without using the shell.

    Each argument is encoded as UTF-8 before being handed to subprocess.
    Returns None on success. On failure the return code and the command
    are written to stderr and the script exits with that return code.
    """
    encoded = [piece.encode("utf-8") for piece in cmd_array]
    status = subprocess.call(encoded)
    if not status:
        return
    sys.stderr.write("Error %i from: %s\n" % (status, " ".join(cmd_array)))
    sys.exit(status)
def commit_file(title, filename, date, username, contents, comment):
    """Commit an image or other file from its base64 encoded representation.

    The title must start with "File:". If no filename is given, one is
    made from the rest of the title (which should already have an
    extension) under the configured prefix. The decoded contents are
    written to that file, which is then passed on to be committed.
    """
    # base64 is not among the imports at the top of this script, so it is
    # imported here to avoid a NameError.
    import base64

    # NOTE(review): blocklist and commit_files are not defined anywhere in
    # this script, and nothing here calls commit_file - this looks like a
    # leftover which cannot run as-is; confirm whether it should be removed.
    assert username not in blocklist
    assert title.startswith("File:")
    if not filename:
        # Drop the "File:" part of the title
        filename = os.path.join(prefix, make_cannonical(title[5:]))
    print("Commit %s %s by %s : %s" % (date, filename, username, comment[:40]))
    with open(filename, "wb") as handle:
        handle.write(base64.b64decode(contents))
    commit_files([filename], username, date, comment)
# Expand the --input arguments into a flat list of MediaWiki filenames.
names = []
for name in args.input:
    if name.startswith("../"):
        sys.exit("ERROR: Input files must be within the current directory and git repo")
    if os.path.isdir(name):
        # A folder contributes the matching files directly inside it.
        names += glob.glob(name + "/*." + mediawiki_ext)
        continue
    if os.path.isfile(name) and name.endswith("." + mediawiki_ext):
        names.append(name)
        continue
    # Neither a folder nor a file with the expected extension.
    sys.exit(f"ERROR: Unexpected input {name}")
print(f"Have {len(names)} input MediaWiki files")
# First pass over the input: find the redirect pages.
#
# redirects maps the filename of each redirect page to its target, and those
# files are skipped by the conversion loop. redirects_from maps the target
# of each internal redirect to the list of page titles pointing at it.
print("Checking for redirects...")
redirects = {}
redirects_from = {}
for mw_filename in names:
    with open(mw_filename) as handle:
        original = handle.read()
    assert original.startswith("---\ntitle: "), mw_filename
    text, categories, title = cleanup_mediawiki(original)
    stripped = text.strip()
    if stripped.startswith("#REDIRECT [[") and stripped.endswith("]]"):
        # Internal redirect, will become a redirect_from entry in target page
        redirect = stripped[len("#REDIRECT [[") : -2]
        if "\n" not in redirect and "]" not in redirect:
            # Maybe this should have been a regular expression?
            # We will do these AFTER converting the target using redirect_from
            print(f" * redirection {mw_filename} --> {redirect}")
            redirects[mw_filename] = redirect
            redirects_from.setdefault(redirect, []).append(title)
    elif stripped.startswith("{{#externalredirect:") and stripped.endswith("}}"):
        # External redirect. Slice using the length of the marker itself, so
        # that the first character of the target is not lost when there is
        # no space after the colon; strip() removes the space if present.
        redirect = stripped[len("{{#externalredirect:") : -2].strip()
        redirects[mw_filename] = redirect
        print(f" * redirection {mw_filename} --> {redirect}")
        # Write a stub page now, with a redirect_to entry in its header.
        md_filename = mw_filename[: -len(mediawiki_ext)] + markdown_ext
        if os.path.isfile(md_filename):
            sys.stderr.write(f"WARNING - will overwrite {md_filename}\n")
        with open(md_filename, "w") as handle:
            handle.write("---\n")
            handle.write("title: %s\n" % title)
            handle.write("permalink: %s\n" % make_url(title))
            handle.write(f"redirect_to: {redirect}\n")
            handle.write("---\n")
            handle.write("\n")
            handle.write(f"You should be redirected to <{redirect}>\n")
# Second pass over the input: convert every page which is not a redirect
# into Markdown using pandoc, adding a YAML header for Jekyll.
print("Converting pages...")
for mw_filename in names:
    if mw_filename in redirects:
        continue
    md_filename = mw_filename[: -len(mediawiki_ext)] + markdown_ext
    if os.path.isfile(md_filename):
        sys.stderr.write(f"WARNING - will overwrite {md_filename}\n")
    print(f" * {mw_filename} --> {md_filename}")
    # Yes, sadly we've opened most files twice :(
    with open(mw_filename) as handle:
        original = handle.read()
    assert original.startswith("---\ntitle: "), mw_filename
    text, categories, title = cleanup_mediawiki(original)
    # pandoc is given the cleaned up markup via a temporary file.
    # TODO - Try piping text via stdin
    with tempfile.NamedTemporaryFile("w", delete=False) as handle:
        handle.write(text)
        tmp_mediawiki = handle.name
    try:
        child = subprocess.Popen(
            [
                pandoc,
                "-f",
                "mediawiki",
                "-t",
                # "markdown_github-hard_line_breaks",
                "gfm-hard_line_breaks",
                tmp_mediawiki,
            ],
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        stdout, stderr = child.communicate()
    finally:
        # Remove the temporary file as soon as pandoc has finished with it,
        # so it is not left behind when the script exits on an error below.
        os.remove(tmp_mediawiki)
    # What did pandoc think?
    if stderr or child.returncode:
        print(stdout)
    if stderr:
        sys.stderr.write(stderr)
    if child.returncode:
        sys.stderr.write("Error %i from pandoc\n" % child.returncode)
    if not stdout:
        sys.stderr.write("No output from pandoc for %r\n" % mw_filename)
    if child.returncode or not stdout:
        sys.exit("ERROR - Calling pandoc failed")
    with open(md_filename, "w") as handle:
        handle.write("---\n")
        handle.write("title: %s\n" % title)
        handle.write("permalink: %s\n" % make_url(title))
        if title.startswith("Category:"):
            # This assumes have layout template called tagpage
            # which will insert the tag listing automatically
            # i.e. Behaves like MediaWiki for Category:XXX
            # where we mapped XXX as a tag in Jekyll
            handle.write("layout: tagpage\n")
            handle.write("tag: %s\n" % title[9:])
        else:
            # Not a category page,
            # NOTE(review): tags are written for non-category pages only;
            # confirm this nesting is what was intended.
            if default_layout:
                handle.write("layout: %s\n" % default_layout)
            if categories:
                # Map them to Jekyll tags as can have more than one per page:
                handle.write("tags:\n")
                for category in categories:
                    handle.write(" - %s\n" % category)
        if title in redirects_from:
            # Pages found earlier which redirect to this one.
            handle.write("redirect_from:\n")
            for redirect in sorted(redirects_from[title]):
                handle.write(" - %s\n" % make_url(redirect))
        handle.write("---\n\n")
        handle.write(cleanup_markdown(stdout, make_url(title)))
print("Done")