-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathextractor.py
84 lines (65 loc) · 2.51 KB
/
extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import json
from typing import Dict, List
from collections import defaultdict
from pathlib import Path
from tqdm import tqdm
import executor
import stanza
import jmespath
import choppa
from choppa.srx_parser import SrxDocument
from choppa.iterators import SrxTextIterator
import re
# Combining diacritics used as Ukrainian stress marks.
ACUTE = chr(0x301)  # combining acute accent
GRAVE = chr(0x300)  # combining grave accent

# SRX sentence-segmentation rules shipped with choppa (LanguageTool rule set).
ruleset = Path(choppa.__file__).parent / "data/srx/languagetool_segment.srx"
SRX_2_XSD = Path(choppa.__file__).parent / "data/xsd/srx20.xsd"
document = SrxDocument(ruleset=ruleset, validate_ruleset=SRX_2_XSD)

# Matches ") —" (closing parenthesis, whitespace, em dash) — the end of a
# "(qualifier) — definition" prefix in a gloss.  Raw string: "\)" is an
# invalid escape in a plain string literal and warns on modern CPython.
PATTERN = re.compile(r"\)\s—")
PREFIX = "omw-en31"  # NOTE(review): unused in this file — confirm external use
# Ukrainian Stanza pipeline used by lemmatize().
nlp = stanza.Pipeline(lang="uk", processors="tokenize,mwt,pos,lemma", verbose=False)
COUNTER = 0  # NOTE(review): unused in this file — confirm external use
def remove_accents(s: str) -> str:
    """Delete the combining acute and grave stress marks from *s*."""
    for mark in (ACUTE, GRAVE):
        s = s.replace(mark, "")
    return s
def lemmatize(s: str) -> List[str]:
    """Run the Stanza pipeline on *s* and return the lemma of every word."""
    lemmas: List[str] = []
    for sentence in nlp(s).sentences:
        lemmas.extend(word.lemma for word in sentence.words)
    return lemmas
def extract_from_page(word: str) -> Dict:
    """Fetch the Wiktionary page for *word* via the ``wtf`` Node helper and enrich it.

    Adds three keys to the parsed page dict before returning it:
      * ``infoboxes`` — infobox fields merged across all sections (field name -> list of texts)
      * ``examples``  — sentences whose lemmas contain the page title
      * ``gloss``     — first sentence of the first section, with any
                        "headword (qualifier) — " prefix stripped

    Never raises on a failed fetch/parse: an empty page-shaped dict is used instead.
    """
    try:
        raw = executor.execute("node", "./wtf", word, capture=True)
        page = json.loads(raw)
    except ValueError:
        # Fetch or JSON parse failed.  Fall back to a dict shaped like a real
        # page so the unconditional "plaintext"/"title" reads below do not
        # KeyError (the original bare {"json": {}} fallback crashed here).
        page = {"json": {"title": ""}, "plaintext": ""}

    # Merge infobox fields of the same name across all sections.
    merged_sections: Dict[str, List[str]] = defaultdict(list)
    # "or []" (not "or {}"): the projection yields a list of infobox dicts.
    for infobox in jmespath.search("json.sections[].infoboxes[]", page) or []:
        for field, value in infobox.items():
            if "text" in value:
                merged_sections[field].append(value["text"])

    # Segment the plaintext into sentences with the SRX rules.
    sentences: List[str] = []
    for line in page["plaintext"].splitlines():
        line = line.strip()
        if not line:
            continue
        for text in SrxTextIterator(document, "uk_one", line, max_lookbehind_construct_length=1024 * 10):
            if text:
                sentences.append(text)

    # Keep only sentences whose lemmas contain the (accent-stripped) title.
    title = remove_accents(page["json"]["title"]).strip().lower()
    filtered_sentences = [sent for sent in tqdm(sentences) if title in lemmatize(sent)]

    page["infoboxes"] = merged_sections
    page["examples"] = filtered_sentences

    # The first sentence of the first section serves as the gloss
    # (query evaluated once; the original ran it twice).
    first_sentences = jmespath.search("json.sections[0].paragraphs[].sentences[0].text", page)
    gloss = first_sentences[0] if first_sentences else ""
    if "—" in gloss:
        # Prefer splitting on ") —" so em dashes inside a parenthesised
        # qualifier are skipped; otherwise split on the first bare em dash.
        if PATTERN.search(gloss):
            _, gloss = PATTERN.split(gloss, maxsplit=1)
        else:
            _, gloss = gloss.split("—", 1)
    page["gloss"] = gloss.strip()
    return page
if __name__ == "__main__":
    # Smoke test: extract the page for "Буряк" (beet) and pretty-print it.
    result = extract_from_page("Буряк")
    print(json.dumps(result, indent=4, ensure_ascii=False))