-
Notifications
You must be signed in to change notification settings - Fork 66
/
Copy pathvector_space.py
98 lines (74 loc) · 3.52 KB
/
vector_space.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
import numpy as np
import numpy.typing as npt
from typing import Union
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
from small_text.classifiers import Classifier
from small_text.data import Dataset
from small_text.vector_indexes.base import VectorIndexFactory
from small_text.vector_indexes.hnsw import HNSWIndex
from small_text.query_strategies.strategies import EmbeddingBasedQueryStrategy
def _construct_graph(index, embeddings, max_distance=0.1, k=25):
import networkx as nx
graph = nx.Graph()
graph.add_nodes_from(list(range(embeddings.shape[0])))
edges = []
for i in range(embeddings.shape[0]):
indices, distances = index.search(embeddings[i].reshape(1, -1), k=k, return_distance=True)
for j, dist in zip(indices[0, 1:], distances[0, 1:]):
if dist <= max_distance:
edges.append((j, i))
graph.add_edges_from(edges)
return graph
class ProbCover(EmbeddingBasedQueryStrategy):
"""
ProbCover [YDH+22]_ queries instances by trying to maximize probability coverage of an embedding space.
For this, each labeled instance covers an area in the embedding space with a certain radius (`ball_radius`).
The strategy tries to maximize the covered area, by selecting instances with a high-density neighborhood.
.. versionadded:: 2.0.0
"""
def __init__(self, vector_index_factory=VectorIndexFactory(HNSWIndex),
k=100, ball_radius=0.1):
"""
Parameters
----------
vector_index_factory : VectorIndexFactory, default=VectorIndexFactory(HNSWIndex)
A factory that provides the vector index for nearest neighbor queries.
k : int, default=100
Number of nearest neighbors for nearest neighbor queries.
ball_radius : float, default=0.1
Radius of an embedding space ball that is given by a labeled instance at its center.
"""
self.vector_index_factory = vector_index_factory
self.k = k
self.ball_radius = ball_radius
def sample(self,
clf: Classifier,
dataset: Dataset,
indices_unlabeled: npt.NDArray[np.uint],
indices_labeled: npt.NDArray[np.uint],
y: Union[npt.NDArray[np.uint], csr_matrix],
n: int,
embeddings: npt.NDArray[np.double],
embeddings_proba: npt.NDArray[np.double] = None):
embeddings = normalize(embeddings, axis=1)
index = self.vector_index_factory.new()
index.build(embeddings)
# The paper explicitly mentions this part being implemented with sparse structures.
# For now, I prefer the readability of this solution, but this might be changed in the future.
graph = _construct_graph(index, embeddings, max_distance=self.ball_radius, k=self.k)
indices_queried = []
for i in indices_labeled:
for node in list(graph.adj[i]):
graph.remove_edge(node, i)
for _ in range(n):
index, _ = sorted(graph.degree(), key=lambda x: -x[1])[0]
for node in list(graph.adj[index]):
graph.remove_edge(node, index)
indices_queried.append(index)
return np.array(indices_queried)
def __str__(self):
# TODO: vector index factores dont have __str__ methods yet
return (f'ProbCover(vector_index_factory={self.vector_index_factory.__class__.__name__}, '
f'ball_radius={self.ball_radius}, ' \
f'k={self.k})')