preprocessing.py
import math
import random

import numpy as np
import torch
from tqdm import tqdm

DEFAULT_UNK_INDEX = 1

def set_random_seed(seed):
    """Seed Python, NumPy, and PyTorch RNGs for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

def build_entity_vocab(entity_labels, eid2idx=None):
    """Map each entity type string to a contiguous integer index.

    `entity_labels` holds one list of entity tuples per sentence; the
    last element of each tuple is the entity type. `eid2idx` is
    currently unused and kept for interface compatibility.
    """
    def iter_etypes():
        for lst in entity_labels:
            for el in lst:
                yield el[-1]
    etypes = sorted(set(iter_etypes()))
    return {etype: i for i, etype in enumerate(etypes)}
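
# A minimal usage sketch, assuming entity tuples of the form
# (start, end, etype), where only the final element is consumed here:
#
#   labels = [[(0, 2, 'PER'), (4, 5, 'LOC')], [(1, 3, 'ORG')]]
#   build_entity_vocab(labels)
#   # -> {'LOC': 0, 'ORG': 1, 'PER': 2}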

def indexify(sentences, word2idx, unk_index=None):
    """Convert token sequences to index sequences.

    Unknown tokens map to `unk_index`; if `unk_index` is None, an
    out-of-vocabulary token raises a ValueError instead.
    """
    def to_indices(sentence):
        for word in sentence:
            if word not in word2idx and unk_index is None:
                raise ValueError(f'out-of-vocabulary word: {word!r}')
            yield word2idx.get(word, unk_index)
    return [list(to_indices(s)) for s in tqdm(sentences, desc='indexify')]
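
# For example, assuming the 0 = pad, 1 = unk layout suggested by
# DEFAULT_UNK_INDEX and pad's default padding_token:
#
#   word2idx = {'<pad>': 0, '<unk>': 1, 'cat': 2}
#   indexify([['cat', 'dog']], word2idx, unk_index=DEFAULT_UNK_INDEX)
#   # -> [[2, 1]]  ('dog' falls back to the unk index)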

def indexify_etype(entity_labels, etype2idx):
    """Replace each entity's type string with its index, dropping
    entities whose type is missing from `etype2idx`."""
    return [
        [(*el[:-1], etype2idx[el[-1]]) for el in lst if el[-1] in etype2idx]
        for lst in entity_labels
    ]

def batchify(examples, batch_size):
    """Sort examples by length, then cut into padded batches.

    Length-sorting groups similarly sized examples so each batch
    needs as little padding as possible.
    """
    sorted_examples = sorted(examples, key=len)
    num_batches = math.ceil(len(examples) / batch_size)
    batches = []
    for i in range(num_batches):
        start = i * batch_size
        batches.append(pad(sorted_examples[start:start + batch_size]))
    return batches

def pad(examples, padding_token=0):
    """Left-pad every example to the batch maximum with `padding_token`
    and stack the result into a single int32 array."""
    def to_numpy(batch):
        # Transposed to dimensions (sentence_length, batch_size).
        return np.array(batch, dtype=np.int32).T
    max_length = max(len(x) for x in examples)
    batch = []
    for x in examples:
        diff = max_length - len(x)
        batch.append([padding_token] * diff + x)
    return to_numpy(batch)
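
# A small worked example of the padding and final shape:
#
#   pad([[1, 2, 3], [4]])
#   # array([[1, 0],
#   #        [2, 0],
#   #        [3, 4]], dtype=int32)   shape (sentence_length, batch_size)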

def batch_iterator(dataset, batch_size, seed=None, drop_last=False):
    """Yield batches of examples in a random order.

    If `drop_last` is True, a trailing batch smaller than `batch_size`
    is skipped.
    """
    if seed is not None:
        set_random_seed(seed)
    nexamples = len(dataset)
    nbatches = math.ceil(nexamples / batch_size)
    # A random permutation of the example indices.
    index = random.sample(range(nexamples), nexamples)
    for i in range(nbatches):
        start = i * batch_size
        end = start + batch_size
        if end > nexamples and drop_last:
            break
        yield [dataset[j] for j in index[start:end]]

def prepare_batch(batch):
    # Placeholder hook: batches currently pass through unchanged.
    return batch

def synthesize_training_data(nexamples, vocab_size, min_length=10, max_length=30, seed=None):
    """Generate random token-index sequences for smoke-testing the pipeline.

    Lengths are drawn from [min_length, max_length); note that
    np.random.randint excludes the upper bound.
    """
    if seed is not None:
        set_random_seed(seed)
    dataset = []
    for _ in range(nexamples):
        length = np.random.randint(min_length, max_length)
        dataset.append(np.random.randint(0, vocab_size, size=length).tolist())
    return dataset
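
# A minimal end-to-end sketch tying the helpers together; the numbers
# are arbitrary and only meant to exercise the pipeline.
if __name__ == '__main__':
    data = synthesize_training_data(nexamples=100, vocab_size=50, seed=13)
    for batch in batch_iterator(data, batch_size=16, seed=13):
        arr = prepare_batch(pad(batch))
        print(arr.shape)  # (max_length_in_batch, <=16)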