-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Update TileDB benchmarks #2
Changes from 2 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,4 @@ | ||
FROM tiledb_vs | ||
FROM ann-benchmarks | ||
|
||
RUN mamba install -y ansicolors docker-py h5py matplotlib numpy pyyaml psutil scipy scikit-learn jinja2 pandas | ||
|
||
WORKDIR /home/app | ||
COPY requirements.txt run_algorithm.py ./ | ||
RUN pip3 install -r requirements.txt | ||
ENTRYPOINT ["python", "-u", "run_algorithm.py"] | ||
RUN pip install tiledb tiledb-vector-search | ||
RUN python3 -c 'import tiledb.vector_search' |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -4,7 +4,8 @@ | |
import tiledb | ||
|
||
from tiledb.vector_search.ingestion import ingest | ||
from tiledb.vector_search.index import IVFFlatIndex | ||
from tiledb.vector_search import IVFFlatIndex | ||
from tiledb.cloud.dag import Mode | ||
import numpy as np | ||
import multiprocessing | ||
|
||
|
@@ -15,17 +16,35 @@ class TileDBIVFFlat(BaseANN): | |
def __init__(self, metric, n_list): | ||
self._n_list = n_list | ||
self._metric = metric | ||
self.MAX_UINT64 = np.iinfo(np.dtype("uint64")).max | ||
|
||
def query(self, v, n): | ||
if self._metric == 'angular': | ||
raise NotImplementedError() | ||
self.res =self.index.query(np.array([v]).astype(numpy.float32), k=n, nthreads=multiprocessing.cpu_count(), nprobe=min(self._n_probe,self._n_list), use_nuv_implementation=True) | ||
return self.res[0] | ||
|
||
# query() returns a tuple of (distances, ids). | ||
ids = self.index.query( | ||
np.array([v]).astype(numpy.float32), | ||
k=n, | ||
nthreads=multiprocessing.cpu_count(), | ||
nprobe=min(self._n_probe,self._n_list), | ||
)[1][0] | ||
# Fix for 'OverflowError: Python int too large to convert to C long'. | ||
ids[ids == self.MAX_UINT64] = 0 | ||
return ids | ||
|
||
def batch_query(self, X, n): | ||
if self._metric == 'angular': | ||
raise NotImplementedError() | ||
self.res =self.index.query(X.astype(numpy.float32), k=n, nthreads=multiprocessing.cpu_count(), nprobe=min(self._n_probe, self._n_list), use_nuv_implementation=True) | ||
# query() returns a tuple of (distances, ids). | ||
self.res = self.index.query( | ||
X.astype(numpy.float32), | ||
k=n, | ||
nthreads=multiprocessing.cpu_count(), | ||
nprobe=min(self._n_probe, self._n_list), | ||
)[1] | ||
# Fix for 'OverflowError: Python int too large to convert to C long'. | ||
self.res[self.res == self.MAX_UINT64] = 0 | ||
|
||
def get_batch_results(self): | ||
return self.res | ||
|
@@ -47,15 +66,18 @@ def fit(self, X): | |
elif X.dtype == "float32": | ||
source_type = "F32BIN" | ||
maxtrain = min(50 * self._n_list, X.shape[0]) | ||
self.index = ingest(index_type="IVF_FLAT", | ||
array_uri=array_uri, | ||
source_uri=source_uri, | ||
source_type=source_type, | ||
size=X.shape[0], | ||
training_sample_size=maxtrain, | ||
partitions=self._n_list, | ||
input_vectors_per_work_item=100000000, | ||
) | ||
self.index = ingest( | ||
index_type="IVF_FLAT", | ||
index_uri=array_uri, | ||
source_uri=source_uri, | ||
source_type=source_type, | ||
size=X.shape[0], | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we can just use the defaults for most params in ingestion. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks, I added this as a TODO and will fix it the next time I set up an EC2 instance and run this. I'm deferring it because it seems better to get this PR merged than to leave it in flux until then. |
||
training_sample_size=maxtrain, | ||
partitions=self._n_list, | ||
input_vectors_per_work_item=100000000, | ||
mode=Mode.LOCAL | ||
) | ||
# memory_budget=-1 will load the data into main memory. | ||
self.index = IVFFlatIndex(uri=array_uri, dtype=X.dtype, memory_budget=-1) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Thanks, I added this as a TODO and will fix it the next time I set up an EC2 instance and run this. I'm deferring it because it seems better to get this PR merged than to leave it in flux until then. |
||
|
||
def set_query_arguments(self, n_probe): | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can update this to use the NumPy arrays directly for ingestion.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I added this as a TODO and will fix it the next time I set up an EC2 instance and run this. I'm deferring it because it seems better to get this PR merged than to leave it in flux until then.