-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathlexibank_abvd.py
60 lines (49 loc) · 1.87 KB
/
lexibank_abvd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import re
from pathlib import Path
from nameparser import HumanName
from clldutils.misc import slug
from pylexibank.providers import abvd
from pylexibank.util import progressbar
from pylexibank import FormSpec
def normalize_contributors(l):
    """Normalize the contributor name fields of a language row in place.

    Both the 'checkedby' and 'typedby' values are run through
    :func:`normalize_names` and the (mutated) row is returned.
    """
    for field in ('checkedby', 'typedby'):
        l[field] = normalize_names(l[field])
    return l
# Canonical full names for known abbreviations/variant spellings of
# contributors.  Module-level so the mapping is built only once.
CANONICAL_NAMES = {
    'Simon': 'Simon Greenhill',
    'D. Mead': 'David Mead',
    'Alex François': 'Alexandre François',
    'Dr Alex François': 'Alexandre François',
    'R. Blust': 'Robert Blust',
}


def normalize_names(names):
    """Split a free-text contributor string and normalize each name.

    :param names: A string possibly listing several names, separated by
        "and", "&", "," or "+"; may be empty or ``None``.
    :return: The normalized names, each reduced to "First Last" via
        ``HumanName``, joined with " and ".  Falsy input yields ``''``.
    """
    res = []
    if names:
        # Raw string for the pattern: '\s' in a plain string is an invalid
        # escape sequence (SyntaxWarning on Python 3.12+, error later).
        for name in re.split(r'\s+and\s+|\s*&\s*|,\s+|\s*\+\s*', names):
            name = CANONICAL_NAMES.get(name, name)
            parsed = HumanName(name.title())
            # Fall back to the honorific/title part (e.g. "Dr") when the
            # parser finds no first name.
            res.append(
                '{0} {1}'.format(parsed.first or parsed.title, parsed.last).strip())
    return ' and '.join(res)
class Dataset(abvd.BVD):
    """Lexibank dataset for the Austronesian Basic Vocabulary Database (ABVD)."""

    dir = Path(__file__).parent
    id = 'abvd'
    SECTION = 'austronesian'
    # Word-list IDs that must be skipped.
    invalid_ids = [
        261,  # Duplicate West Futuna list
    ]
    # Candidate word-list IDs to attempt to retrieve.
    language_ids = list(range(1, 2000))
    form_spec = FormSpec(
        brackets={"[": "]", "{": "}", "(": ")"},
        separators=";/,~",
        missing_data=('-', ),
        strip_inside_brackets=True,
    )

    def cmd_makecldf(self, args):
        """Convert the raw ABVD word lists to a CLDF dataset."""
        args.writer.add_sources(*self.etc_dir.read_bib())
        concepts = args.writer.add_concepts(
            id_factory=lambda c: c.id.split('-')[-1] + '_' + slug(c.english),
            lookup_factory=lambda c: c['ID'].split('_')[0],
        )
        for wl in progressbar(self.iter_wordlists(args.log), desc="cldfify"):
            wl.to_cldf(args.writer, concepts)
            # Now normalize the typedby and checkedby values of the language
            # row just appended for this word list.  NOTE(review): this must
            # run inside the loop — applied only after the loop, `[-1]` would
            # normalize just the final language.
            args.writer.objects['LanguageTable'][-1] = normalize_contributors(
                args.writer.objects['LanguageTable'][-1])