-
Notifications
You must be signed in to change notification settings - Fork 23
/
vocabulary.py
46 lines (38 loc) · 1.5 KB
/
vocabulary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import numpy
import scipy.cluster.vq as vq
import sift
class Vocabulary(object):
    """Visual-word vocabulary built by k-means clustering of feature descriptors.

    Instance fields:
      name         -- label passed to the constructor.
      voc          -- codebook returned by k-means, one row per visual word
                      (an empty list until train() has run).
      idf          -- per-word weights log(N / (n_w + 1)), where N is the
                      number of training images and n_w the number of training
                      images containing word w (an empty list until train()).
      trainingdata -- the feature-file list that was passed to train().
      word_count   -- number of rows in voc (0 until train()).
    """

    def __init__(self, name):
        """Create an untrained vocabulary labelled |name|."""
        self.name = name
        self.voc = []
        self.idf = []
        self.trainingdata = []
        self.word_count = 0

    def train(self, featurefiles, k=100, subsampling=10):
        """Train the vocabulary from the feature files in |featurefiles|.

        Descriptors are read from every file with
        sift.read_features_from_file (the second element of its return value
        is used as the descriptor array), stacked into one matrix, and
        clustered with k-means. Each training image is then projected onto
        the resulting codebook to compute the idf weights.

        Args:
            featurefiles: sequence of feature-file names, one per image.
            k: requested number of visual words.
            subsampling: only every |subsampling|-th stacked descriptor row is
                passed to k-means, to speed up clustering.

        Raises:
            ValueError: if |featurefiles| is empty.
        """
        image_count = len(featurefiles)
        if image_count == 0:
            raise ValueError("featurefiles must contain at least one file")

        # Read each file once and keep the per-image descriptors, since they
        # are needed again below for the projection step.
        descr = [sift.read_features_from_file(path)[1] for path in featurefiles]

        # Stack all descriptors with a single vstack call; stacking inside the
        # read loop would recopy the growing matrix on every iteration.
        descriptors = numpy.vstack(descr)

        # Run k-means once (third argument) on the subsampled rows.
        self.voc, _ = vq.kmeans(descriptors[::subsampling, :], k, 1)
        # Take the word count from the codebook itself rather than from k,
        # because k-means may return fewer than k centroids.
        self.word_count = self.voc.shape[0]

        # Project training data on the vocabulary: one histogram row per image.
        imwords = numpy.zeros((image_count, self.word_count))
        for row, image_descriptors in enumerate(descr):
            imwords[row] = self.project(image_descriptors)

        # Number of training images in which each word occurs at least once.
        occurrence_count = numpy.sum(imwords > 0, axis=0)
        # The +1.0 keeps the division finite for words that never occur and
        # forces floating-point division.
        self.idf = numpy.log(image_count / (occurrence_count + 1.0))
        self.trainingdata = featurefiles

    def project(self, descriptors):
        """Project descriptors on the vocabulary to create a histogram of words.

        Each descriptor is assigned to its nearest codebook row by vq.vq.

        Returns:
            A float array of length self.word_count whose entry w is the
            number of descriptors assigned to word w.
        """
        words, _ = vq.vq(descriptors, self.voc)
        # bincount does the per-word counting in one vectorized call;
        # minlength pads the histogram for words that received no descriptor.
        return numpy.bincount(words, minlength=self.word_count).astype(float)