# IMDb.py
# Sentiment classification of IMDb reviews using TF-IDF features and logistic regression.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
# Label values attached to the reviews of each polarity.
POSITIVE = 1
NEGATIVE = 0
# Fixed seed so the shuffled row order is reproducible between runs.
SHUFFLE_SEED = 0


def read_reviews(path, label):
    """Load one review per line from *path* and tag every row with *label*.

    The file is split on line boundaries directly instead of going through
    ``pd.read_csv`` with a newline delimiter: current pandas versions reject
    a newline as the delimiter, and the CSV parser would also interpret
    quote characters that appear inside a review.  Blank lines are skipped.

    Args:
        path: Text file containing one review per line (read as UTF-8).
        label: Value stored in the ``label`` column for every review.

    Returns:
        A DataFrame with the columns ``text`` and ``label``.
    """
    with open(path, encoding='utf-8') as handle:
        lines = [line.rstrip('\r\n') for line in handle]
    frame = pd.DataFrame({'text': [line for line in lines if line.strip()]})
    frame['label'] = label
    return frame


def load_split(pos_path, neg_path, seed=SHUFFLE_SEED):
    """Return the positive and negative reviews of one split, shuffled.

    Args:
        pos_path: File with the positive reviews (labelled ``POSITIVE``).
        neg_path: File with the negative reviews (labelled ``NEGATIVE``).
        seed: ``random_state`` used for the shuffle.

    Returns:
        A DataFrame with the columns ``text`` and ``label`` and a fresh
        0..n-1 index.
    """
    combined = pd.concat(
        [read_reviews(pos_path, POSITIVE), read_reviews(neg_path, NEGATIVE)],
        ignore_index=True,
    )
    return combined.sample(frac=1, random_state=seed).reset_index(drop=True)


def main():
    """Train the sentiment classifier, report its scores and classify one review.

    Reads the train/development/test review files from the current working
    directory, prints a classification report for the development and test
    splits, then prompts for a review on stdin and prints its predicted label.
    """
    train = load_split('imdb_train_pos.txt', 'imdb_train_neg.txt')
    dev = load_split('imdb_dev_pos.txt', 'imdb_dev_neg.txt')
    test = load_split('imdb_test_pos.txt', 'imdb_test_neg.txt')

    # Text vectorization.  The vocabulary and IDF weights are learned from the
    # training text only; refitting on development or test text would replace
    # them and leak evaluation data into the features.
    tfidf = TfidfVectorizer(min_df=2, max_df=0.5, ngram_range=(1, 2), stop_words='english')
    train_x_tfidf = tfidf.fit_transform(train['text'])
    dev_x_tfidf = tfidf.transform(dev['text'])
    test_x_tfidf = tfidf.transform(test['text'])

    # Logistic regression model
    model = LogisticRegression()
    model.fit(train_x_tfidf, train['label'])

    # Development and test predictions and model scores.  The test report is
    # computed on the test split itself, not on the development split.
    predictions_dev = model.predict(dev_x_tfidf)
    print('Scores for development dataset:\n', classification_report(dev['label'], predictions_dev))
    predictions_test = model.predict(test_x_tfidf)
    print('Scores for test dataset:\n', classification_report(test['label'], predictions_test))

    new_review = input('Give me a new review:')
    print(model.predict(tfidf.transform([new_review])))


if __name__ == '__main__':
    main()