-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsvm.py
164 lines (119 loc) · 4.77 KB
/
svm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
from time import time
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.utils.extmath import density
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import pandas as pd
# Seed NumPy's global random number generator.
np.random.seed(42)

# Load the train and test tables from CSV files in the working directory.
data_train = pd.read_csv('data_train.csv')
data_test = pd.read_csv('data_test.csv')

# Configure the TF-IDF feature extractor: sublinear term frequency, unigrams
# only, English stop words removed, L1-normalised rows, unsmoothed idf, and
# document-frequency cut-offs of max_df=0.15 / min_df=3.
vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    max_df=0.15,
    min_df=3,
    stop_words="english",
    ngram_range=(1, 1),
    norm="l1",
    smooth_idf=False,
)

# Coerce every entry of the "truncated" column to a string before
# vectorising. The vocabulary is learned on the training texts only and then
# reused unchanged for the test texts.
train_texts = data_train["truncated"].apply(lambda text: np.str_(text))
test_texts = data_test['truncated'].apply(lambda text: np.str_(text))
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

# Class labels come from the "target" column of each table.
y_train = data_train["target"]
y_test = data_test['target']

# Vocabulary terms, positioned to match the columns of X_train / X_test.
feature_names = vectorizer.get_feature_names_out()

# Class names are taken from the 20 newsgroups training subset; the dataset
# is loaded here only for its target_names attribute.
newsgroups_train = fetch_20newsgroups(subset="train", shuffle=True, random_state=42)
target_names = newsgroups_train.target_names
def benchmark(clf):
    """Fit ``clf``, evaluate it on the test split and print a short report.

    The function reads the module-level ``X_train``, ``y_train``, ``X_test``
    and ``y_test``. It prints the estimator, the wall-clock fit and predict
    times, the test accuracy, the number of columns of ``clf.coef_`` and the
    density of ``clf.coef_``.

    Args:
        clf: An unfitted estimator exposing ``fit``, ``predict`` and, after
            fitting, a two-dimensional ``coef_`` attribute.

    Returns:
        The same estimator object, now fitted.
    """
    # Fit the model and time the fit.
    print("Training: ")
    print(clf)
    fit_started = time()
    clf.fit(X_train, y_train)
    fit_seconds = time() - fit_started
    print(f"train time: {fit_seconds:.3}s")

    # Predict the test split and time the prediction.
    predict_started = time()
    predictions = clf.predict(X_test)
    predict_seconds = time() - predict_started
    print(f"test time: {predict_seconds:.3}s")

    accuracy = metrics.accuracy_score(y_test, predictions)
    print(f"accuracy: {accuracy:.7}")

    # Report the size and sparsity of the learned weight matrix.
    weights = clf.coef_
    print(f"dimensionality: {weights.shape[1]}")
    print(f"density: {density(weights)}")
    print()
    return clf
# Train and evaluate the linear SVM; benchmark() returns the fitted estimator.
print("Linear SVC")
clf = benchmark(LinearSVC(C=30, dual=True, max_iter=3000))
print(clf)

# Learned weights: rows are classes, columns are vocabulary terms.
coef = clf.coef_

# Min-max scale the whole matrix (one global min/max, not per class) to the
# range [-1, 1].
min_val = coef.min()
max_val = coef.max()
coef_span = max_val - min_val
if coef_span == 0:
    # All coefficients are identical; dividing by the zero span would fill
    # the table with NaN, so map everything to 0 instead.
    normalized_coef = np.zeros_like(coef, dtype=float)
else:
    normalized_coef = 2 * (coef - min_val) / coef_span - 1

# Each coefficient row is paired with the label the estimator actually
# learned (clf.classes_) instead of assuming exactly 20 classes in positional
# order. A mismatch between the two means the rows cannot be attributed to
# classes, so fail loudly rather than mislabel them.
if normalized_coef.shape[0] != len(clf.classes_):
    raise ValueError(
        f"coef_ has {normalized_coef.shape[0]} row(s) but the classifier "
        f"learned {len(clf.classes_)} classes; cannot label coefficient rows"
    )

class_dict = {"feature": feature_names}
target_names.sort()
for class_label, class_coef in zip(clf.classes_, normalized_coef):
    # Labels are integer positions into target_names (the same lookup is used
    # for the test predictions further down the script).
    class_dict[target_names[class_label]] = class_coef

# Store the normalised coefficients for the whole vocabulary: one row per
# term, one column per class name.
all_coef = pd.DataFrame(class_dict)
all_coef.to_csv("vocab_coef_svm.csv")
# Predict the test split and translate every numeric label (true and
# predicted) into its class name via target_names.
pred_test = clf.predict(X_test)
pred_test_names = [target_names[label] for label in pred_test]
y_test_names = [target_names[label] for label in y_test]

# One row per test document: true and predicted class, each as number and name.
new_dict = {
    "true class no": y_test,
    "true class name": y_test_names,
    "pred class no": pred_test,
    "pred class name": pred_test_names,
}
coefs_test = pd.DataFrame(new_dict)
# Convert the test TF-IDF matrix to COO format, whose parallel row / col /
# data arrays list every stored entry explicitly.
X_test = X_test.tocoo()

# Number of test documents (rows of the matrix).
num_docs = X_test.shape[0]

# Start with one empty bucket per document for indices and for weights.
feature_indices_per_doc = []
tfidf_per_doc = []
for _ in range(num_docs):
    feature_indices_per_doc.append([])
    tfidf_per_doc.append([])

# Walk the stored entries once and drop each (vocabulary index, weight) pair
# into the bucket of the document it belongs to.
for row_pos, vocab_pos, weight in zip(X_test.row, X_test.col, X_test.data):
    feature_indices_per_doc[row_pos].append(vocab_pos)
    tfidf_per_doc[row_pos].append(weight)

coefs_test["feature ind"] = feature_indices_per_doc
coefs_test["tfidf"] = tfidf_per_doc
# For every test document collect, in the order of its feature indices:
#   - the vocabulary term of each feature,
#   - that term's coefficient for the document's true class,
#   - that term's coefficient for the document's predicted class.
# all_coef is looked up by (vocabulary index, class name).
feature_names_per_doc = []
coef_true_per_doc = []
coef_pred_per_doc = []
for true_name, pred_name, vocab_ids in zip(
    y_test_names, pred_test_names, feature_indices_per_doc
):
    feature_names_per_doc.append([feature_names[vocab_id] for vocab_id in vocab_ids])
    coef_true_per_doc.append([all_coef.at[vocab_id, true_name] for vocab_id in vocab_ids])
    coef_pred_per_doc.append([all_coef.at[vocab_id, pred_name] for vocab_id in vocab_ids])

coefs_test["feature names"] = feature_names_per_doc
coefs_test["coef true"] = coef_true_per_doc
coefs_test["coef pred"] = coef_pred_per_doc
def multiply_lists(list1, list2):
    """Return the element-wise products of two iterables as a list.

    Elements are paired positionally; pairing stops at the end of the shorter
    input, so surplus elements of the longer one are ignored.
    """
    products = []
    for left, right in zip(list1, list2):
        products.append(left * right)
    return products
# Weight each document's coefficients by its TF-IDF scores, element by
# element: first for the true class, then for the predicted class.
for coef_col, product_col in (
    ("coef true", "coef true*tfidf"),
    ("coef pred", "coef pred*tfidf"),
):
    coefs_test[product_col] = coefs_test.apply(
        lambda row, coef_col=coef_col: multiply_lists(row[coef_col], row["tfidf"]),
        axis=1,
    )

print(coefs_test)

# Write the per-document table for the test data to disk.
coefs_test.to_csv("coefs_test.csv")