-
Notifications
You must be signed in to change notification settings - Fork 677
/
Copy pathNLP + Naive Bayes Classifier
199 lines (158 loc) · 5.51 KB
/
NLP + Naive Bayes Classifier
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
import numpy
import numpy as np
from random import shuffle
from sklearn.linear_model import LogisticRegression
class LabeledLineSentence(object):
def __init__(self, sources):
self.sources = sources
flipped = {}
for key, value in sources.items():
if value not in flipped:
flipped[value] = [key]
else:
raise Exception('Non-unique prefix encountered')
def __iter__(self):
for source, prefix in self.sources.items():
with utils.smart_open(source) as fin:
for item_no, line in enumerate(fin):
yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])
def to_array(self):
self.sentences = []
for source, prefix in self.sources.items():
with utils.smart_open(source) as fin:
for item_no, line in enumerate(fin):
self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
return self.sentences
def sentences_perm(self):
shuffle(self.sentences)
return self.sentences
sources = {'train-neg.txt':'TRAIN_NEG', 'train-pos.txt':'TRAIN_POS', 'train-unsup.txt':'TRAIN_UNS', 'test-pos.txt':'TEST_POS','test-neg2.txt':'TEST_NEG'}
sentences = LabeledLineSentence(sources)
sentences
model = Doc2Vec(min_count=1, window=5, size=10, sample=1e-4, negative=5, workers=8)
model.build_vocab(sentences.to_array())
for epoch in range(10):
model.train(sentences.sentences_perm())
model.save('./imdb.d2v')
model = Doc2Vec.load('./imdb.d2v')
model
model.most_similar('good')
model.syn0
sentences.to_array()
model.docvecs['TRAIN_POS_0']
train_arrays = numpy.zeros((14, 10))
train_labels = numpy.zeros(14)
for i in range(7):
prefix_train_pos = 'TRAIN_POS_' + str(i)
prefix_train_neg = 'TRAIN_NEG_' + str(i)
train_arrays[i] = model.docvecs[prefix_train_pos]
train_arrays[1 + i] = model.docvecs[prefix_train_neg]
train_labels[i] = 1
train_labels[1 + i] = 0
test_arrays = numpy.zeros((14, 10))
test_labels = numpy.zeros(14)
for i in range(7):
prefix_test_pos = 'TEST_POS_' + str(i)
test_arrays[i] = model.docvecs['TEST_POS_0']
test_arrays[1 + i] = model.docvecs['TEST_NEG_0']
test_labels[i] = 1
test_labels[1 + i] = 0
classifier = LogisticRegression()
classifier.fit(train_arrays, train_labels)
classifier.predict(train_arrays)
classifier.predict(test_arrays)
classifier.score(test_arrays, test_labels)
### DECISION TREES
from sklearn import tree
model4=tree.DecisionTreeClassifier(criterion='entropy',max_depth=20,max_features=4,max_leaf_nodes=3,min_samples_leaf=1,min_samples_split=1,splitter='best')
a=model4.fit(train_arrays, train_labels)
model4.score(train_arrays, train_labels)
pred=model4.predict(test_arrays)
sum(x==0 for x in pred-test_labels)/len(pred)
### NAIVE BAYES
from sklearn.naive_bayes import GaussianNB
model4=GaussianNB()
a=model4.fit(train_arrays, train_labels)
model4.score(train_arrays, train_labels)
pred=model4.predict(test_arrays)
sum(x==0 for x in pred-test_labels)/len(pred)
res = pred
for i in range(0,len(res)):
if res[i]>.5:
res[i]=1
else:
res[i]=0
p=[]
for i in range(0,len(res)):
if res[i]==0:
p.append('Negative')
else:
p.append('Positive')
p2=[]
for i in range(0,len(res)):
if test_labels[i]==0:
p2.append('Negative')
else:
p2.append('Positive')
from bs4 import BeautifulSoup
import nltk
import numpy as np
from nltk import sent_tokenize, word_tokenize, pos_tag
filename = "test-pos.txt"
raw_text2 = open(filename).read()
html2=raw_text2
soup = BeautifulSoup(html2,"lxml")
###### SEMANTIC
texto=[]
for string in soup.stripped_strings:
texto.append(repr(string))
texto
for script in soup(["script", "style"]):
script.extract() # rip it out
text2 = soup.get_text()
text2
sentences_pos = sent_tokenize(text2)
sentences_pos
filename = "test-neg2.txt"
raw_text2 = open(filename).read()
html=raw_text2
soup2 = BeautifulSoup(html,"lxml")
html
text2 = soup2.get_text()
text2
sentences_neg = sent_tokenize(text2)
sentences_neg
sentences=sentences_neg
acc=1-len(np.where([int(i) for i in res]-test_labels==0))/len(test_labels)
sentences_pos
sentences_neg
print('\n','NAIVE BAYES CLASSIFIER','\n')
print('Accuracy=',acc,'\n')
print('ACTUAL SENTIMENT:','\n',p2,'\n')
print('PREDICTED SENTIMENT:','\n',p,'\n')
#### OUTPUT
Out[39]:
['Next was a very nice and interesting movie.',
'Matrix was awesome.',
'Paycheck was excellent.',
'Ronin was great.',
'Pulp Fiction was very good.',
'Lion King was cute.',
'Madagascar was extremely interesting.\n']
Out[40]:
["I wish I could say that I loved it but I'd be lying.",
'There were feeble attempts at humor and historical accuracy, but alas both failed to rise to the occasion.',
'Swordfish was definitely not good.',
'Rambo was an awful movie.',
'Gamer was a bad movie.',
'I did not like Jumper.',
'Lake House was a bad one.\n']
NAIVE BAYES CLASSIFIER
Accuracy= 0.9285714285714286
ACTUAL SENTIMENT:
['Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Negative', 'Negative', 'Negative', 'Negative', 'Negative', 'Negative']
PREDICTED SENTIMENT:
['Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Positive', 'Negative', 'Negative', 'Negative', 'Negative', 'Negative', 'Negative']