model.py
from utils import macro_F1
import numpy as np


class NB:
    def __init__(self, alpha=1):
        # Hyperparameter: additive (Laplace) smoothing strength.
        self.alpha = alpha

    def get_word_feature(self, texts):
        # Build the vocabulary (word -> column index) from the training set.
        print('building vocabulary based on training set...')
        self.word2id = {}
        self.dim = 0
        for text in texts:
            for word in text.split():
                if word not in self.word2id:
                    self.word2id[word] = self.dim
                    self.dim += 1
        print('vocabulary size:', self.dim)
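
    # Example (hypothetical toy corpus): after
    #     nb = NB()
    #     nb.get_word_feature(['good movie', 'bad movie'])
    # the mapping is word2id == {'good': 0, 'movie': 1, 'bad': 2} and dim == 3,
    # i.e. every distinct whitespace-separated token gets one column.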

    def get_feature_matrix(self, texts):
        # Turn a batch of texts into a dense bag-of-words count matrix.
        # Out-of-vocabulary words are only counted per text, so they can be
        # scored later with the smoothed "unseen" probability.
        n = len(texts)
        dim = len(self.word2id)
        feature_matrix = np.zeros((n, dim))
        unseen_word_num = np.zeros((n,))
        for index, text in enumerate(texts):
            for word in text.split():
                if word in self.word2id:
                    feature_matrix[index][self.word2id[word]] += 1
                else:
                    unseen_word_num[index] += 1
        return feature_matrix, unseen_word_num
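
    # Example (continuing the hypothetical vocabulary above): for the batch
    #     ['good good movie', 'great movie']
    # get_feature_matrix returns
    #     feature_matrix == [[2., 1., 0.],    # 'good' x2, 'movie' x1
    #                        [0., 1., 0.]]    # 'movie' x1
    #     unseen_word_num == [0., 1.]         # 'great' is out of vocabulary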

    def fit(self, train_texts, train_labels, label_set):
        print('start training...')
        self.get_word_feature(train_texts)
        self.label_set = label_set
        self.max_label = len(self.label_set)
        # Per-class word counts: prob_matrix[c][w] = count of word w in class c.
        self.prob_matrix = np.zeros((self.max_label, self.dim))
        for text, label in zip(train_texts, train_labels):
            for word in text.split():
                self.prob_matrix[label][self.word2id[word]] += 1
        # Class prior, estimated from token counts rather than document counts;
        # a choice that is worth discussing.
        self.label_portion = self.prob_matrix.sum(axis=1) / self.prob_matrix.sum()
        # Conditional probabilities with add-alpha (Laplace) smoothing:
        # P(w | c) = (count(w, c) + alpha) / (count(c) + alpha * |V|).
        up = self.prob_matrix + self.alpha
        down = self.prob_matrix.sum(axis=1) + self.alpha * self.dim
        down = np.expand_dims(down, axis=1)
        # Smoothed probability assigned to any token outside the vocabulary.
        self.unseen = np.ones((self.max_label,)) * self.alpha
        self.unseen = self.unseen / (self.prob_matrix.sum(axis=1) + self.alpha * self.dim)
        self.prob_matrix = up / down
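
    # Worked example of the smoothing (hypothetical numbers): with alpha = 1,
    # |V| = 3, and a class c whose training texts contain 4 tokens in total,
    # of which 'movie' occurs twice,
    #     P('movie' | c) = (2 + 1) / (4 + 1 * 3) = 3 / 7
    #     P(unseen  | c) = (0 + 1) / (4 + 1 * 3) = 1 / 7
    # so every conditional probability stays strictly positive.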

    def predict(self, texts):
        print('start predicting...')
        total_num = len(texts)
        res = np.zeros(total_num, dtype=int)
        print('label_portion:', self.label_portion)
        # Process in batches; the dense (labels x batch x vocab) array would not
        # fit in memory if the whole test set were scored at once.
        batch_size = 5
        for i in range((total_num + batch_size - 1) // batch_size):
            batch_texts = texts[batch_size * i: batch_size * (i + 1)]
            feature_matrix, unseen_word_num = self.get_feature_matrix(batch_texts)
            feature_matrix = np.expand_dims(feature_matrix, axis=0)
            prob_matrix = np.expand_dims(self.prob_matrix, axis=1)
            # Work in log space so the product of word probabilities becomes a sum.
            midres = np.log(prob_matrix) * feature_matrix
            unseen = np.expand_dims(self.unseen, axis=1)
            unseen_word_num = np.expand_dims(unseen_word_num, axis=0)
            midres = np.sum(midres, axis=2) + np.log(unseen) * unseen_word_num
            # Adding the log prior strongly worsens performance on SST-5.
            label_portion = np.expand_dims(self.label_portion, axis=1)
            midres += np.log(label_portion)
            res[batch_size * i: batch_size * (i + 1)] = np.argmax(midres, axis=0)
        return res
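
    # The score computed above is the usual multinomial Naive Bayes decision rule
    # (constant terms that do not depend on the class are dropped):
    #     score(c, x) = log P(c) + sum_w count(w, x) * log P(w | c)
    #                   + n_unseen(x) * log P(unseen | c)
    # and predict() returns argmax over c of score(c, x) for every text x.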

    def eval(self, results, labels):
        print('start evaluating...')
        # Build the per-class confusion counts.
        TP = np.zeros((self.max_label,))
        TN = np.zeros((self.max_label,))
        FP = np.zeros((self.max_label,))
        FN = np.zeros((self.max_label,))
        for res, gold_label in zip(results, labels):
            if res == gold_label:
                TP[res] += 1
                # Every other class counts this example as a true negative.
                for label in self.label_set:
                    TN[label] += 1
                TN[res] -= 1
            else:
                FP[res] += 1
                FN[gold_label] += 1
                # Every class except the predicted and the gold one gets a true negative.
                for label in self.label_set:
                    TN[label] += 1
                TN[res] -= 1
                TN[gold_label] -= 1
        print("TP: ", TP)
        print("TN: ", TN)
        print("FP: ", FP)
        print("FN: ", FN)
        # Macro-F1: average the per-class scores from utils.macro_F1; accuracy from TP.
        Macro_f1 = 0
        for label in self.label_set:
            Macro_f1 += macro_F1(TP[label], FP[label], FN[label])
        Macro_f1 /= self.max_label
        print("Macro-F1 score: ", Macro_f1)
        acc = TP.sum() / len(labels)
        print("accuracy score: ", acc)
        return Macro_f1, acc
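

# A minimal usage sketch (hypothetical data). It assumes that utils.macro_F1 is
# importable as in the rest of this file and that labels are the integer class
# indices 0..K-1 used throughout the class.
if __name__ == '__main__':
    train_texts = ['good movie', 'great great movie', 'bad movie', 'awful plot']
    train_labels = [1, 1, 0, 0]        # hypothetical labels: 0 = negative, 1 = positive
    test_texts = ['great movie', 'bad plot']
    test_labels = [1, 0]

    nb = NB(alpha=1)
    nb.fit(train_texts, train_labels, label_set=[0, 1])
    predictions = nb.predict(test_texts)
    macro_f1, accuracy = nb.eval(predictions, test_labels)
    print('predictions:', predictions)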