code_naive_Bayes_cleaned.py
import csv
from utils import load_data, unique, calculate_avg_length, split_words_by_label, \
    get_vocab_size, prob_Laplace_smoothing, accuracy, macro_F1
from cleandata import total_cleaning

def evaluate(word_to_prob, label_set, valid_texts, valid_labels, vocab_size, label_portion, unseen):
    """Run the Naive Bayes classifier over (valid_texts, valid_labels) and report per-label counts and Macro-F1."""
    TP = {label: 0 for label in label_set}
    TN = {label: 0 for label in label_set}
    FP = {label: 0 for label in label_set}
    FN = {label: 0 for label in label_set}
    for text, gold_label in zip(valid_texts, valid_labels):
        max_likelihood = 0
        # Caveat: if every label's product underflows to 0 (too small to represent as a float),
        # the prediction silently falls back to label 1.
        max_label = 1
        for label in label_set:
            # Scale the prior by a large constant to postpone floating-point underflow.
            prob = label_portion[label] * 1000000000
            for word in set(text.split()):
                if (word, label) in word_to_prob:
                    prob *= word_to_prob[(word, label)]
                else:
                    prob *= unseen[label]
            if prob > max_likelihood:
                max_label = label
                max_likelihood = prob
        # Update the per-label confusion counts.
        if max_label == gold_label:
            TP[gold_label] += 1
            for label in label_set:
                TN[label] += 1
            TN[gold_label] -= 1
        else:
            for label in label_set:
                TN[label] += 1
            TN[gold_label] -= 1
            TN[max_label] -= 1
            FN[gold_label] += 1
            FP[max_label] += 1
    print("TP: ", TP)
    print("TN: ", TN)
    print("FP: ", FP)
    print("FN: ", FN)
    Macro_f1 = 0
    for label in label_set:
        Macro_f1 += macro_F1(TP[label], FP[label], FN[label])
    Macro_f1 /= len(label_set)
    print("Macro-F1 score: ", Macro_f1)
    return Macro_f1
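

# The constant-scaling trick in evaluate() only postpones underflow on long texts.
# A more robust variant (a minimal sketch, not wired into __main__ below) is to
# compare summed log-probabilities: the argmax is unchanged, but the scores stay
# in a safe floating-point range. It assumes the same word_to_prob, label_portion,
# and unseen dictionaries that __main__ builds.
def predict_log(text, label_set, word_to_prob, label_portion, unseen):
    import math  # local import so the sketch stays self-contained
    best_label, best_score = None, float("-inf")
    for label in label_set:
        # log(prior) + sum of log-likelihoods; unseen words fall back to the smoothed floor.
        score = math.log(label_portion[label])
        for word in set(text.split()):
            score += math.log(word_to_prob.get((word, label), unseen[label]))
        if score > best_score:
            best_label, best_score = label, score
    return best_label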


if __name__ == '__main__':
    # Load data
    train_texts, train_labels = load_data('data/sst_train.csv')
    # Hold out the last fifth of the training data as a validation split (train : validation = 4 : 1).
    n_valid = int(len(train_texts) / 5)
    valid_texts, valid_labels = train_texts[-n_valid:], train_labels[-n_valid:]
    train_texts, train_labels = train_texts[:-n_valid], train_labels[:-n_valid]
    test_texts, test_labels = load_data('data/sst_test.csv')
    # Print basic statistics
    print("Training set size:", len(train_texts))
    print("Validation set size:", len(valid_texts))
    print("Test set size:", len(test_texts))
    label_set = list(unique(train_labels))
    label_set.sort()
    print("Unique labels:", label_set)
    print("Avg. length:", calculate_avg_length(train_texts + valid_texts + test_texts))
    # Data cleaning ------ the Naive Bayes model is sensitive to noisy tokens, so clean all splits first.
    print("Executing data cleaning!")
    test_texts = total_cleaning(test_texts)
    train_texts = total_cleaning(train_texts)
    valid_texts = total_cleaning(valid_texts)
    # Average length after cleaning
    print("Avg. length:", calculate_avg_length(train_texts + valid_texts + test_texts))
    # Extract features from the texts with a hand-crafted Naive Bayes model:
    # word_to_prob maps (word, label) to the smoothed conditional probability P(word | label).
    word_to_prob = {}
    total_labeled_words, label_portion = split_words_by_label(train_texts, train_labels, label_set)
    # Open question: how large should the vocabulary be? If it is built from the training
    # set alone, the out-of-vocabulary rate on the other splits remains quite high.
    vocab_size, _ = get_vocab_size(train_texts)
    # Sanity check on the vocabulary size (the value looks doubtful).
    print(vocab_size)
    # The label distribution is quite unbalanced.
    print(label_portion)
    # Train the model and evaluate it on the validation set.
    alpha = 0.2
    for label in total_labeled_words:
        for word in total_labeled_words[label]:
            if (word, label) not in word_to_prob:
                word_to_prob[(word, label)] = prob_Laplace_smoothing(word, total_labeled_words, label, vocab_size, label_portion, alpha)
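    # For reference, the add-alpha (Laplace) smoothing this relies on is presumably of the form
    #     P(word | label) = (count(word, label) + alpha) / (N_label + alpha * vocab_size)
    # where N_label is the per-label count that prob_Laplace_smoothing derives from its arguments;
    # an unseen word has count 0, which matches the `unseen` floor computed just below.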
    # Probability floor for words never seen with a given label.
    unseen = {label: alpha / (label_portion[label] + alpha * vocab_size) for label in label_set}
    print(unseen)
    # Override the prior: instead of normalising label_portion by len(train_texts),
    # use a uniform prior of 0.2 per label.
    for label in label_portion:
        label_portion[label] = 0.2
    # evaluate(word_to_prob, label_set, valid_texts, valid_labels, vocab_size, label_portion, unseen)
    # Test the best performing model on the test set.
    evaluate(word_to_prob, label_set, test_texts, test_labels, vocab_size, label_portion, unseen)
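
# Usage (assuming data/sst_train.csv and data/sst_test.csv are present, as loaded above):
#     python code_naive_Bayes_cleaned.py
# The script prints dataset statistics, per-label confusion counts, and the Macro-F1 score on the test set.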