piextract.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import os
from glob import glob
from typing import Dict, List, Tuple

import datasets

def read_conll_file(file_path: str) -> Dict[str, List[List[str]]]:
    # read all lines in the CONLL file and strip trailing newlines
    with open(file_path, "r") as input_file_stream:
        conll_lines = [line.rstrip() for line in input_file_stream]
    # create the output dictionary for storing tokens and NER tags
    data: Dict[str, List[List[str]]] = {"tokens": [], "ner_tags": []}
    # loop through the lines in the CONLL file
    for line in conll_lines:
        if line == "-DOCSTART- -X- O O":
            # skip the DOCSTART header line
            continue
        elif line == "":
            # an empty line marks the end of one annotation (and the start
            # of the next), so append a new token list and a new tag list
            data["tokens"].append([])
            data["ner_tags"].append([])
        else:
            # in all other cases, split the line and append one token
            # and one NER tag to the current annotation
            token, ner_tag = line.split(" _ _ ")
            data["tokens"][-1].append(token)
            data["ner_tags"][-1].append(ner_tag)
    return data
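
# Illustrative input format, inferred from the parsing logic above: each
# annotation line is "<token> _ _ <ner_tag>", blank lines separate
# annotations, and a file may open with a DOCSTART header. The tokens and
# tags below are hypothetical, not taken from the dataset:
#
#   -DOCSTART- -X- O O
#
#   We _ _ O
#   collect _ _ COLLECT
#   emails _ _ COLLECT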


def merge_ner_tags(ner_tags: List[List[List[str]]]) -> List[List[Tuple[str, ...]]]:
    # perform a nested zip operation to combine the token-level NER tags
    # from all tasks
    return [list(zip(*ner_tag)) for ner_tag in zip(*ner_tags)]
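
# A tiny, hypothetical worked example of the merge for two tasks over a
# single two-token sentence (the tag names are made up):
#   merge_ner_tags([[["COLLECT", "O"]], [["O", "SHARE"]]])
#   -> [[("COLLECT", "O"), ("O", "SHARE")]]
# the outer zip pairs up sentences across tasks; the inner zip then pairs up
# the per-task tags of each token.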


def load_piextract(directory: str) -> datasets.DatasetDict:
    # define the task loading order (necessary for the multi-label task)
    task_order = ["CollectUse_true", "CollectUse_false", "Share_true", "Share_false"]
    # define a data dictionary collecting the parsed CONLL files per split
    data: Dict[str, List[Dict[str, List[List[str]]]]] = {"train": [], "test": []}
    # define an empty DatasetDict
    combined = datasets.DatasetDict()
    # loop over the tasks and the CONLL files associated with each task
    for task in task_order:
        for conll_file in glob(os.path.join(directory, task, "*.conll03")):
            if os.path.basename(conll_file).startswith("train"):
                split = "train"
            else:
                split = "test"
            # append the parsed CONLL file to the dictionary by split
            data[split].append(read_conll_file(conll_file))
    # loop over each data split
    for split, data_split in data.items():
        # collect the token lists from all four tasks in this split
        all_tokens = [data_split_subset["tokens"] for data_split_subset in data_split]
        # collect the NER tags from all four tasks in this split
        all_ner_tags = [
            data_split_subset["ner_tags"] for data_split_subset in data_split
        ]
        # ensure that all tasks share exactly the same tokens
        # (the assumption behind merging their tags)
        assert all(tokens == all_tokens[0] for tokens in all_tokens)
        # merge the NER tags from all four tasks
        merged_ner_tags = merge_ner_tags(all_ner_tags)
        # convert the dictionary into a HF dataset and insert it into the DatasetDict
        combined[split] = datasets.Dataset.from_dict(
            {"tokens": all_tokens[0], "ner_tags": merged_ner_tags}
        )
    # carve a validation set out of the training data using HF datasets'
    # built-in train_test_split
    train_valid_dataset_dict = combined["train"].train_test_split(
        test_size=0.15, seed=42
    )
    # reassign the resulting splits to combined
    combined["train"] = train_valid_dataset_dict["train"]
    combined["validation"] = train_valid_dataset_dict["test"]
    return combined
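

# Hypothetical usage sketch (not part of the original script): it assumes the
# PI-Extract CONLL files are unpacked under "./data/piextract", with one
# sub-directory per task; both the path and the printed fields are
# illustrative only.
if __name__ == "__main__":
    dataset_dict = load_piextract("./data/piextract")
    # expect "train", "validation" and "test" splits after the re-split above
    print(dataset_dict)
    # each example pairs parallel "tokens" with per-task "ner_tags" tuples
    print(dataset_dict["train"][0]["tokens"])
    print(dataset_dict["train"][0]["ner_tags"])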