forked from pfeyz/wiktionary-ipa
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse-wiktionary.py
121 lines (104 loc) · 4.2 KB
/
parse-wiktionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#!/usr/bin/env python3
"""
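Print IPA transcriptions found in the Pronunciation sections of a Wiktionary XML dump.

Known limitation: fails on the following region formats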
* {{a|[[w:Canadian English|CA]]; US, in accents with the [[cot-caught
merger]]}} {{IPA|/ˈdɪfθɑŋ/|/ˈdɪpθɑŋ/}}
* {{a|RP|[[antepenultimate]] [[stress]]}}
{{IPA|/trɑːnsˈleɪtɹɪsiːz/|/trænsˈleɪtɹɪsiːz/|/trɑːnzˈleɪtɹɪsiːz/|/trænzˈleɪtɹɪsiːz/}}
* {{a|RP|[[penultimate]] stress}}
{{IPA|/ˌtrɑːnsleɪˈtɹaɪsiːz/|/ˌtrænsleɪˈtɹaɪsiːz/|/ˌtrɑːnzleɪˈtɹaɪsiːz/|/ˌtrænzleɪˈtɹaɪsiːz/}}
"""
from xml import sax
import re
class IpaParser(sax.handler.ContentHandler):
    """SAX content handler that prints IPA transcriptions from wiki pages.

    The handler watches ``<page>`` elements, keeps only those whose ``<ns>``
    is ``"0"``, and scans the ``<text>`` of each one for the
    ``===Pronunciation===`` section below the ``==<language>==`` header.
    Every bullet line (``* ...``) in that section is treated as an entry; its
    ``{{IPA|...}}`` transcriptions are printed as ``<title> <transcription>``.

    NOTE(review): the section detection compares whole ``characters()`` chunks
    against header strings, so it assumes the parser delivers the text one
    line at a time -- confirm for the parser in use.
    """

    # Chunks that start with, or directly follow, one of these characters are
    # glued onto the current entry. Presumably these are the XML-escaped
    # characters, which the parser hands over as separate chunks -- confirm.
    special_chars = '"&<>'
    # Region label template, e.g. {{a|US|Canada}}.
    region_regex = r"{{a\|(.*?)}}"
    # IPA template, e.g. {{IPA|/foo/|/bar/}}.
    ipa_regex = r"{{IPA\|(.*?)}}"

    def __init__(self, language, regions=None, *args, **kwargs):
        """Create a handler for one language.

        Args:
            language: Header text of the language section to read, e.g.
                ``"English"`` for ``==English==``.
            regions: Region labels to accept. Entries carrying a region
                template are printed only if one of their labels is listed
                here; entries without a region template are always printed.
            *args, **kwargs: Forwarded to ``ContentHandler.__init__``.
        """
        # Zero-argument super() so ContentHandler.__init__ itself runs; naming
        # ContentHandler as the first argument would skip it.
        super().__init__(*args, **kwargs)
        self.regions = regions if regions else []
        self.language = language
        self.depth = 0
        self.reading_page = False
        self.reading_ns = False
        self.reading_title = False
        self.reading_text = False
        self.reading_english = False
        self.reading_phonetics = False
        self.reading_entry = False
        self.title = None
        self.entry = None
        self.lines = 0

    def emit_entry(self):
        """Print every transcription of the current entry that passes the region filter.

        Does nothing when there is no entry, when the entry has no IPA
        template, or when it has a region template none of whose labels is in
        ``self.regions``.
        """
        if not self.entry:
            return
        ipa = re.search(self.ipa_regex, self.entry)
        if not ipa:
            return
        region = re.search(self.region_regex, self.entry)
        if region:
            labels = set(region.group(1).split("|"))
            if not labels & set(self.regions):
                return
        # Transcriptions are the slash-delimited pieces of the template body.
        for tran in re.findall(r"/(.*?)/", ipa.group(1)):
            print(self.title, tran)

    def _flush_entry(self):
        """Emit the pending entry, if any, and clear it."""
        if self.entry:
            self.emit_entry()
        self.entry = None

    def startElement(self, name, attrs):
        """Track which of page/title/ns/text is being entered."""
        if name == "page":
            self.reading_page = True
        if self.reading_page and name == "title":
            self.reading_title = True
            # Start from an empty title so chunks can be accumulated.
            self.title = ""
        if self.reading_page and name == "ns":
            self.reading_ns = True
        if self.reading_page and name == "text":
            self.reading_text = True
        self.depth += 1

    def characters(self, content):
        """Consume one chunk of character data.

        Updates the page filter, title and section flags, and collects or
        emits pronunciation entries while inside the Pronunciation section.
        """
        if not content:
            # Nothing to inspect; also avoids indexing into an empty chunk.
            return
        if self.reading_ns and content != "0":
            # Any namespace other than "0" disables the rest of this page.
            self.reading_page = False
        if self.reading_title:
            # A title may arrive in several chunks; keep all of them.
            self.title = (self.title or "") + content
        if not self.reading_text:
            return

        if content == "=={0}==".format(self.language):
            self.reading_english = True
        elif re.match(r"==[^=]", content):
            # Another level-2 header: the wanted language section is over,
            # and so is any pronunciation section inside it.
            self._flush_entry()
            self.reading_english = False
            self.reading_phonetics = False

        if self.reading_english and content == "===Pronunciation===":
            self.reading_phonetics = True
        elif content.startswith("==="):
            # Leaving the section: emit what is pending so it is not lost or
            # carried over to a later section.
            self._flush_entry()
            self.reading_phonetics = False

        self.lines += 1

        if not self.reading_phonetics:
            return
        if content.startswith("*"):
            # A new bullet closes the previous entry and opens the next one.
            self._flush_entry()
            self.entry = content
        elif self.entry and (
            content[0] in self.special_chars
            or self.entry[-1] in self.special_chars
        ):
            # Continuation of a line that was split around a special char.
            self.entry += content
        elif not content.strip():
            # Whitespace-only chunk terminates the entry.
            self._flush_entry()
            self.reading_entry = False

    def endElement(self, name):
        """Reset the state belonging to the element being closed."""
        self.depth -= 1
        if name == "page":
            self.reading_page = False
            self.title = None
        if name == "ns":
            self.reading_ns = False
        if name == "title":
            self.reading_title = False
        if name == "text":
            # Emit while the page title is still set, so the entry cannot
            # be attributed to a later page.
            self._flush_entry()
            self.reading_text = False
            self.reading_phonetics = False
            self.reading_english = False
            self.lines = 0
if __name__ == "__main__":
    # Parse the dump in the working directory and print US-region
    # transcriptions of English entries.
    parser = sax.make_parser()
    parser.setContentHandler(
        IpaParser("English", ["US", "GenAm", "GenAM", "North America"])
    )
    # Binary mode lets the XML parser decode the bytes itself instead of
    # relying on the locale's default text encoding; the context manager
    # closes the file even if parsing fails.
    with open("enwik.xml", "rb") as dump:
        parser.parse(dump)