# train_classifier_on_dnn_feats.py
import sys, re
import numpy as np
import theano
from sklearn.externals import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.grid_search import GridSearchCV
from pylearn2.utils import serial
from pylearn2.space import CompositeSpace, VectorSpace
import pylearn2.config.yaml_parse as yaml_parse

def aggregate_features(model, dataset, which_layers=[2], win_size=200, step=100):
    """Compute DNN layer activations for each song and aggregate them over
    sliding windows of win_size frames (hop of step frames), summarizing each
    window by its per-dimension mean and standard deviation."""
    assert np.max(which_layers) < len(model.layers)
    # compile a theano function that returns the activations of all layers
    X = model.get_input_space().make_theano_batch()
    Y = model.fprop(X, return_all=True)
    fprop = theano.function([X], Y)
    n_classes = dataset.y.shape[1]
    n_examples = len(dataset.file_list)
    feat_space = model.get_input_space()
    target_space = VectorSpace(dim=n_classes)
    data_specs = (CompositeSpace((feat_space, target_space)), ("songlevel-features", "targets"))
    iterator = dataset.iterator(mode='sequential', data_specs=data_specs)
    # compute feature representation, aggregate frames
    X = []; y = []; Z = []; file_list = []
    for n, el in enumerate(iterator):
        # display progress indicator
        sys.stdout.write('Aggregation progress: %2.0f%%\r' % (100*n/float(n_examples)))
        sys.stdout.flush()
        input_data = np.array(el[0], dtype=np.float32)
        output_data = fprop(input_data)
        # concatenate the activations of the requested layers, frame by frame
        feats = np.hstack([output_data[i] for i in which_layers])
        true_label = el[1]
        # aggregate features over sliding windows (per-window mean and std)
        agg_feat = []
        for i in xrange(0, feats.shape[0]-win_size, step):
            chunk = feats[i:i+win_size, :]
            agg_feat.append(np.hstack((np.mean(chunk, axis=0), np.std(chunk, axis=0))))
        X.append(np.vstack(agg_feat))
        y.append(np.hstack([true_label] * len(agg_feat)))
        # song-level class evidence: sum the output layer over all frames
        Z.append(np.sum(output_data[-1], axis=0))
        file_list.append(el[2])
    print ''  # newline after the progress indicator
    return X, y, Z, file_list
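
# A minimal sketch of the windowed aggregation above, on a toy (hypothetical)
# feature matrix: each window of win_size frames collapses to the
# concatenation of its per-dimension mean and std, doubling the
# dimensionality.
#
#   feats = np.arange(12, dtype=np.float32).reshape(6, 2)   # 6 frames, 2 dims
#   win_size, step = 4, 2
#   agg = [np.hstack((feats[i:i+win_size].mean(axis=0),
#                     feats[i:i+win_size].std(axis=0)))
#          for i in xrange(0, feats.shape[0]-win_size, step)]
#   # -> one 4-dim vector per window; note xrange's exclusive endpoint means
#   #    a song with exactly win_size frames produces no windows at all.
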
def get_features(model, dataset, which_layers=[2], n_features=100):
    """Compute DNN layer activations for each song and randomly subsample
    n_features frames per song (all frames are kept if n_features is falsy)."""
    assert np.max(which_layers) < len(model.layers)
    rng = np.random.RandomState(111)
    # compile a theano function that returns the activations of all layers
    X = model.get_input_space().make_theano_batch()
    Y = model.fprop(X, return_all=True)
    fprop = theano.function([X], Y)
    n_classes = dataset.y.shape[1]
    n_examples = len(dataset.file_list)
    feat_space = model.get_input_space()
    target_space = VectorSpace(dim=n_classes)
    data_specs = (CompositeSpace((feat_space, target_space)), ("songlevel-features", "targets"))
    iterator = dataset.iterator(mode='sequential', data_specs=data_specs)
    X = []; y = []; Z = []; file_list = []
    for n, el in enumerate(iterator):
        # display progress indicator
        sys.stdout.write('Getting features: %2.0f%%\r' % (100*n/float(n_examples)))
        sys.stdout.flush()
        input_data = np.array(el[0], dtype=np.float32)
        output_data = fprop(input_data)
        feats = np.hstack([output_data[i] for i in which_layers])
        true_label = el[1]
        if n_features:
            # keep a random subset of n_features frames per song
            ind = rng.permutation(feats.shape[0])
            feats = feats[ind[:n_features], :]
        X.append(feats)
        # one label per kept frame
        y.append([true_label] * feats.shape[0])
        # song-level class evidence: sum the output layer over all frames
        Z.append(np.sum(output_data[-1], axis=0))
        file_list.append(el[2])
    print ''  # newline after the progress indicator
    return X, y, Z, file_list
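
# The random subsampling above, in isolation (hypothetical sizes): keep
# n_features rows of feats, chosen without replacement:
#
#   rng = np.random.RandomState(111)
#   feats = np.zeros((1300, 50))            # 1300 frames, 50-dim features
#   ind = rng.permutation(feats.shape[0])
#   feats = feats[ind[:100], :]             # -> shape (100, 50)
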
def train_classifier(X_train, y_train, method='random_forest', verbose=2):
    """Train a frame-level classifier: a random forest, or a linear SVM whose
    regularization parameter C is chosen by grid search."""
    assert method in ['random_forest', 'linear_svm']
    if method == 'random_forest':
        classifier = RandomForestClassifier(n_estimators=500, random_state=1234, verbose=verbose, n_jobs=2)
    else:
        # grid-search C over 10^-2 .. 10^3 for the linear SVM
        parameters = {'C': 10**np.arange(-2, 4.)}
        grid = GridSearchCV(SVC(kernel='linear'), parameters, verbose=3)
        grid.fit(X_train, y_train)
        classifier = grid.best_estimator_
    return classifier.fit(X_train, y_train)
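
# A minimal usage sketch (hypothetical shapes, assuming integer frame labels):
# stack the per-frame features of all songs into one matrix, with one label
# per row, then fit:
#
#   X_all = np.vstack(X)                # X, y as returned by get_features()
#   y_all = np.hstack(y)
#   clf = train_classifier(X_all, y_all, method='random_forest')
#   clf.predict(X_all[:5, :])           # -> 5 frame-level label predictions
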
def test_classifier(X_test, y_test, classifier, n_labels=10):
    """Classify every frame of every test song, take a majority vote over
    frames to obtain a song-level prediction, and return the confusion
    matrix."""
    n_examples = len(y_test)
    confusion = np.zeros((n_labels, n_labels))
    for n, (X, true_label) in enumerate(zip(X_test, y_test)):
        sys.stdout.write('Classify progress: %2.0f%%\r' % (100*n/float(n_examples)))
        sys.stdout.flush()
        y_pred = np.array(classifier.predict(X), dtype='int')
        # majority vote over the frame-level predictions
        pred_label = np.argmax(np.bincount(y_pred, minlength=n_labels))
        confusion[pred_label, true_label[0]] += 1
    print ''
    ave_acc = 100 * (np.sum(np.diag(confusion)) / np.sum(confusion))
    print 'classification accuracy:', ave_acc
    return confusion
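
# The majority vote above in isolation, on hypothetical frame predictions:
#
#   y_pred = np.array([3, 3, 7, 3, 1])
#   np.bincount(y_pred, minlength=10)   # -> [0, 1, 0, 3, 0, 0, 0, 1, 0, 0]
#   np.argmax(...)                      # -> 3, the most frequent frame label
#
# minlength=n_labels pads the histogram so argmax is well defined even when
# the largest label never occurs among the predictions.
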
def test_classifier_printf(X_test, y_test, Z_test, file_list, classifier, save_file, n_labels=10):
    """Like test_classifier, but write one tab-separated line per song to
    save_file: filename, true label, predicted label, then the summed
    output-layer activation for each of the n_labels classes."""
    n_examples = len(file_list)
    with open(save_file, 'w') as f:
        for n, (X, true_label, Z, fname) in enumerate(zip(X_test, y_test, Z_test, file_list)):
            sys.stdout.write('Classify progress: %2.0f%%\r' % (100*n/float(n_examples)))
            sys.stdout.flush()
            y_pred = np.array(classifier.predict(X), dtype='int')
            # majority vote over the frame-level predictions
            pred_label = np.argmax(np.bincount(y_pred, minlength=n_labels))
            s = '\t'.join('%2.2f' % i for i in Z)
            f.write('{0}\t{1}\t{2}\t{3}\n'.format(fname, true_label[0], pred_label, s))
    print ''
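
# A (hypothetical) line of the resulting text file, tab-separated:
#
#   blues/blues.00042.au    0    0    152.31    3.10    ...    0.07
#
# i.e. filename, true label, predicted (majority-vote) label, and one summed
# output-layer activation per class.
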
if __name__ == "__main__":
    # example: python train_classifier_on_dnn_feats.py ./saved/S_500_RS.cpu.pkl /Users/cmke/Datasets/tzanetakis_genre --which_layers 0
    import argparse
    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter,
        description='''Script to train/test a classifier (random forest or linear SVM) on DNN features.
        ''')
    parser.add_argument('model_file', help='Path to trained DNN model file')
    parser.add_argument('--which_layers', nargs='*', type=int, help='List of DNN layers to use as features')
    parser.add_argument('--aggregate_features', action='store_true', help='Aggregate frames (mean/std over sliding windows) before training the classifier')
    parser.add_argument('--classifier', help="Either 'random_forest' or 'linear_svm' (default: random_forest)")
    parser.add_argument('--save_file', help='Output classification results to a text file and save the trained classifier alongside it')
    args = parser.parse_args()
    if not args.which_layers:
        parser.error('Please specify --which_layers x, with x either 1, 2, 3 or 1 2 3 (layer 0 is a pre-processing layer)')
    if args.aggregate_features:
        print 'Using aggregate features'
    else:
        print 'Not using aggregate features'
    if args.classifier is None:
        print 'No classifier selected, using random forest'
        args.classifier = 'random_forest'
    # load the trained DNN model
    model = serial.load(args.model_file)
    # rebuild the train/valid/test datasets from the YAML source stored in the
    # model, rewriting which_set to obtain the validation and test splits
    p = re.compile(r"which_set.*'(train)'")
    trainset_yaml = model.dataset_yaml_src
    validset_yaml = p.sub("which_set: 'valid'", model.dataset_yaml_src)
    testset_yaml = p.sub("which_set: 'test'", model.dataset_yaml_src)
    trainset = yaml_parse.load(trainset_yaml)
    validset = yaml_parse.load(validset_yaml)
    testset = yaml_parse.load(testset_yaml)
    if args.aggregate_features:
        X_train, y_train, Z_train, train_files = aggregate_features(model, trainset, which_layers=args.which_layers)
        X_valid, y_valid, Z_valid, valid_files = aggregate_features(model, validset, which_layers=args.which_layers)
        X_test, y_test, Z_test, test_files = aggregate_features(model, testset, which_layers=args.which_layers)
    else:
        X_train, y_train, Z_train, train_files = get_features(model, trainset, which_layers=args.which_layers)
        X_valid, y_valid, Z_valid, valid_files = get_features(model, validset, which_layers=args.which_layers)
        X_test, y_test, Z_test, test_files = get_features(model, testset, which_layers=args.which_layers)
    print 'Training classifier'
    # pool train and validation features into one design matrix / label vector
    X_all = np.vstack((np.vstack(X_train), np.vstack(X_valid)))
    y_all = np.hstack((np.hstack(y_train), np.hstack(y_valid)))
    classifier = train_classifier(X_all, y_all, method=args.classifier)
    print 'Testing classifier'
    if args.save_file:
        # write per-song results to <save_file>.txt
        test_classifier_printf(
            X_test=X_test,
            y_test=y_test,
            Z_test=Z_test,
            file_list=test_files,
            classifier=classifier,
            save_file=args.save_file+'.txt')
        print 'Saving trained classifier'
        joblib.dump(classifier, args.save_file+'.pkl', compress=9)
    else:
        confusion = test_classifier(X_test, y_test, classifier)
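
# To reuse a saved classifier later (a sketch; the .pkl path follows the
# --save_file naming above, and 'results' is a hypothetical --save_file value):
#
#   from sklearn.externals import joblib
#   classifier = joblib.load('results.pkl')
#   predictions = classifier.predict(X_new_frames)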