-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathncc_console.py
53 lines (40 loc) · 1.56 KB
/
ncc_console.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
import time
# Train one binary LinearSVC per toxicity label on TF-IDF features, then
# print, for each label, the prediction for a sample comment and the
# accuracy on a held-out 20% test split. Finally print the elapsed time.
start = time.time()

# Forward slash keeps the path portable. The previous literal used "\c",
# which is an invalid escape sequence (a warning on modern Python) and only
# resolved to a real path on Windows.
df = pd.read_csv("data/comment_data_RAW.csv")

# Single source of truth for the label columns.
categories = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

df_data = df[["comment_text"] + categories]
df_x = df_data["comment_text"]
df_y = df_data[categories]

# Split the raw text BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training portion only; fitting on the full
# corpus would leak test-set statistics into the reported accuracy.
x_train_text, x_test_text, y_train, y_test = train_test_split(
    df_x, df_y, test_size=0.2, random_state=42, shuffle=True
)

vect = TfidfVectorizer(stop_words="english", sublinear_tf=True)
x_train = vect.fit_transform(x_train_text)
x_test = vect.transform(x_test_text)

clf = LinearSVC()

# Input text to classify.
comment = "fuck you"
data = [comment]
# Keep the query sparse: LinearSVC accepts sparse matrices, so densifying
# a vocabulary-wide row is unnecessary.
vector = vect.transform(data)
print("predict text:", data)

# Collects the input text followed by one prediction per category.
new_data = [comment]

for category in categories:
    print('=> {}'.format(category))
    # The same estimator object is refitted for every label.
    clf.fit(x_train, y_train[category])

    prediction = clf.predict(vector)
    new_data.append(prediction)
    print('- Prediction:', prediction)

    test_predictions = clf.predict(x_test)
    print('- Accuracy: {}'.format(accuracy_score(y_test[category], test_predictions)))

# Input text plus the per-category predictions.
print(new_data)

end = time.time()
print("time needed:", end - start)