-
Notifications
You must be signed in to change notification settings - Fork 47
/
Copy pathmrpt.py
249 lines (217 loc) · 12.1 KB
/
mrpt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
import numpy as np
from os import name as os_name
import mrptlib
class MRPTIndex(object):
"""
An MRPT index object
"""
def __init__(self, data, shape=None, mmap=False):
"""
Initializes an MRPT index object.
:param data: Input data either as a NxDim numpy ndarray or as a filepath to a binary file containing the data.
:param shape: Shape of the data as a tuple (N, dim). Needs to be specified only if loading the data from a file.
:param mmap: If true, the data is mapped into memory. Has effect only if the data is loaded from a file.
:return:
"""
if isinstance(data, np.ndarray):
if len(data) == 0 or len(data.shape) != 2:
raise ValueError("The data matrix should be non-empty and two-dimensional")
if data.dtype != np.float32:
raise ValueError("The data matrix should have type float32")
if not data.flags['C_CONTIGUOUS'] or not data.flags['ALIGNED']:
raise ValueError("The data matrix has to be C_CONTIGUOUS and ALIGNED")
n_samples, dim = data.shape
elif isinstance(data, str):
if not isinstance(shape, tuple) or len(shape) != 2:
raise ValueError("You must specify the shape of the data as a tuple (N, dim) "
"when loading data from a binary file")
n_samples, dim = shape
elif data is not None:
raise ValueError("Data must be either an ndarray or a filepath")
if mmap and os_name == 'nt':
raise ValueError("Memory mapping is not available on Windows")
if data is not None:
self.index = mrptlib.MrptIndex(data, n_samples, dim, mmap)
self.dim = dim
self.built = False
self.autotuned = False
def _compute_sparsity(self, projection_sparsity):
if projection_sparsity == 'auto':
return 1. / np.sqrt(self.dim)
elif projection_sparsity is None:
return 1
elif not 0 < projection_sparsity <= 1:
raise ValueError("Sparsity should be in (0, 1]")
def build(self, depth, n_trees, projection_sparsity='auto'):
"""
Builds a normal MRPT index.
:param depth: The depth of the trees; should be in the set {1, 2, ..., floor(log2(n))}.
:param n_trees: The number of trees used in the index.
:param projection_sparsity: Expected ratio of non-zero components in a projection matrix.
:return:
"""
if self.built:
raise RuntimeError("The index has already been built")
projection_sparsity = self._compute_sparsity(projection_sparsity)
self.index.build(n_trees, depth, projection_sparsity)
self.built = True
def build_autotune(self, target_recall, Q, k, trees_max=-1, depth_min=-1, depth_max=-1,
votes_max=-1, projection_sparsity='auto', shape=None):
"""
Builds an autotuned MRPT index.
:param target_recall: The target recall level (float) or None if the target recall level
is set later using the subset function.
:param Q: A matrix of test queries used for tuning, one per row.
:param k: Number of nearest neighbors searched for.
:param trees_max: Maximum number of trees grown; can be used to control the building time
and memory usage; a default value -1 sets this to min(sqrt(n), 1000).
:param depth_min: Minimum depth of trees considered when searching for optimal parameters;
a default value -1 sets this to min(log2(n), 5).
:param depth_max: Maximum depth of trees considered when searching for optimal parameters;
a default value -1 sets this to log2(n) - 4:
:param votes_max: Maximum number of votes considered when searching for optimal parameters;
a default value -1 sets this to max(trees / 10, 10).
:param projection_sparsity: Expected ratio of non-zero components in a projection matrix
:param shape: Shape of the test query matrix as a tuple (n_test, dim). Needs to be specified
only if loading the test query matrix from a file.
:return:
"""
if self.built:
raise RuntimeError("The index has already been built")
if isinstance(Q, np.ndarray):
if len(Q) == 0 or len(Q.shape) != 2:
raise ValueError("The test query matrix should be non-empty and two-dimensional")
if Q.dtype != np.float32:
raise ValueError("The test query matrix should have type float32")
if not Q.flags['C_CONTIGUOUS'] or not Q.flags['ALIGNED']:
raise ValueError("The test query matrix has to be C_CONTIGUOUS and ALIGNED")
n_test, dim = Q.shape
elif isinstance(Q, str):
if not isinstance(shape, tuple) or len(shape) != 2:
raise ValueError("You must specify the shape of the data as a tuple (n_test, dim) "
"when loading the test query matrix from a binary file")
n_test, dim = shape
else:
raise ValueError("The test query matrix must be either an ndarray or a filepath")
if dim != self.dim:
raise ValueError("The test query matrix should have the same number of columns as the data matrix")
self.built = target_recall is not None
self.autotuned = True
if target_recall is None:
target_recall = -1
projection_sparsity = self._compute_sparsity(projection_sparsity)
self.index.build_autotune(
target_recall, Q, n_test, k, trees_max, depth_min, depth_max, votes_max, projection_sparsity)
def build_autotune_sample(self, target_recall, k, n_test=100, trees_max=-1,
depth_min=-1, depth_max=-1, votes_max=-1, projection_sparsity='auto'):
"""
Builds an autotuned MRPT index.
:param target_recall: The target recall level (float) or None if the target recall level
is set later using the subset function.
:param k: Number of nearest neighbors searched for.
:param n_test: Number of test queries to sample.
:param trees_max: Maximum number of trees grown; can be used to control the building time
and memory usage; a default value -1 sets this to min(sqrt(n), 1000).
:param depth_min: Minimum depth of trees considered when searching for optimal parameters;
a default value -1 sets this to min(log2(n), 5).
:param depth_max: Maximum depth of trees considered when searching for optimal parameters;
a default value -1 sets this to log2(n) - 4:
:param votes_max: Maximum number of votes considered when searching for optimal parameters;
a default value -1 sets this to max(trees / 10, 10).
:param projection_sparsity: Expected ratio of non-zero components in a projection matrix
:return:
"""
if self.built:
raise RuntimeError("The index has already been built")
self.built = target_recall is not None
self.autotuned = True
if target_recall is None:
target_recall = -1
projection_sparsity = self._compute_sparsity(projection_sparsity)
self.index.build_autotune_sample(
target_recall, n_test, k, trees_max, depth_min, depth_max, votes_max, projection_sparsity)
def subset(self, target_recall):
"""
Create a new index for a specified target recall level by copying trees from an autotuned index grown
without a prespecified recall level.
:param target_recall: The target recall level.
:return: A new MRPT object autotuned for the specified recall level.
"""
if not self.autotuned:
raise RuntimeError("Only an autotuned index can be subset")
new_index = MRPTIndex(None)
new_index.index = self.index.subset(target_recall)
new_index.dim = self.dim
new_index.built = True
new_index.autotuned = True
return new_index
def parameters(self):
"""
Get the parameters of the index.
:return: A dictionary of the hyperparameters of the index.
"""
n_trees, depth, votes, k, qtime, recall = self.index.parameters()
if self.index.is_autotuned():
return {'n_trees': n_trees, 'depth': depth, 'k': k, 'votes': votes,
'estimated_qtime': qtime, 'estimated_recall': recall}
return {'n_trees': n_trees, 'depth': depth}
def save(self, path):
"""
Saves the MRPT index to a file.
:param path: Filepath to the location of the saved index.
:return:
"""
if not self.built:
raise RuntimeError("Cannot save index before building")
self.index.save(path)
def load(self, path):
"""
Loads the MRPT index from a file.
:param path: Filepath to the location of the index.
:return:
"""
self.index.load(path)
self.built = True
self.autotuned = self.index.is_autotuned()
def ann(self, q, k=None, votes_required=None, return_distances=False):
"""
Performs an approximate nearest neighbor query for a single query vector or multiple query vectors
in parallel. The queries are given as a numpy vector or a numpy matrix where each row contains a query.
:param q: The query object. Can be either a single query vector or a matrix with one query vector per row.
:param k: The number of nearest neighbors to be returned, has to be specified if the index has not been autotuned.
:param votes_required: The number of votes an object has to get to be included in the linear search part of the query;
has to be specified if the index has not been autotuned.
:param return_distances: Whether the distances are also returned.
:return: If return_distances is false, returns a vector or matrix of indices of the approximate
nearest neighbors in the original input data for the corresponding query. Otherwise,
returns a tuple where the first element contains the nearest neighbors and the second
element contains their distances to the query.
"""
if not self.built:
raise RuntimeError("Cannot query before building index")
if q.dtype != np.float32:
raise ValueError("The query matrix should have type float32")
if not self.autotuned and (k is None or votes_required is None):
raise ValueError("k and votes_required must be set if the index has not been autotuned")
if k is None:
k = -1
if votes_required is None:
votes_required = -1
return self.index.ann(q, k, votes_required, return_distances)
def exact_search(self, q, k, return_distances=False):
"""
Performs an exact nearest neighbor query for a single query several queries in parallel. The queries are
given as a numpy matrix where each row contains a query. Useful for measuring accuracy.
:param q: The query object. Can be either a single query vector or a matrix with one query vector per row.
:param k: The number of nearest neighbors to return.
:param return_distances: Whether the distances are also returned.
:return: If return_distances is false, returns a vector or matrix of indices of the exact
nearest neighbors in the original input data for the corresponding query. Otherwise,
returns a tuple where the first element contains the nearest neighbors and the second
element contains their distances to the query.
"""
if q.dtype != np.float32:
raise ValueError("The query matrix should have type float32")
if k < 1:
raise ValueError("k must be positive")
return self.index.exact_search(q, k, return_distances)