From d4069c2d39cfaf11178d7ed2b4bf088fe735b493 Mon Sep 17 00:00:00 2001
From: Paris Morgan
Date: Fri, 5 Apr 2024 10:01:27 +0200
Subject: [PATCH 1/3] fix benchmarks

---
 ann_benchmarks/algorithms/tiledb/Dockerfile | 39 ++++++++++++++---
 ann_benchmarks/algorithms/tiledb/module.py  | 48 +++++++++++++++------
 2 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/ann_benchmarks/algorithms/tiledb/Dockerfile b/ann_benchmarks/algorithms/tiledb/Dockerfile
index 7e6e0b0d2..2c6b4aa8e 100644
--- a/ann_benchmarks/algorithms/tiledb/Dockerfile
+++ b/ann_benchmarks/algorithms/tiledb/Dockerfile
@@ -1,8 +1,35 @@
-FROM tiledb_vs
+# # Mamba approach
+# FROM continuumio/miniconda3:4.10.3
 
-RUN mamba install -y ansicolors docker-py h5py matplotlib numpy pyyaml psutil scipy scikit-learn jinja2 pandas
+# RUN conda install mamba -n base -c conda-forge
 
-WORKDIR /home/app
-COPY requirements.txt run_algorithm.py ./
-RUN pip3 install -r requirements.txt
-ENTRYPOINT ["python", "-u", "run_algorithm.py"]
+# RUN mamba install -y -c tiledb tiledb tiledb-vector-search
+
+# WORKDIR /home/app
+# COPY requirements.txt run_algorithm.py ./
+# RUN pip3 install -r requirements.txt
+# ENTRYPOINT ["python", "-u", "run_algorithm.py"]
+
+# # Conda approach
+# FROM ann-benchmarks
+
+# RUN apt update && apt install -y wget
+# RUN wget https://repo.anaconda.com/archive/Anaconda3-2020.11-Linux-x86_64.sh
+# RUN bash Anaconda3-2020.11-Linux-x86_64.sh -b
+
+# ENV PATH /root/anaconda3/bin:$PATH
+
+# RUN conda install -y -c tiledb tiledb
+# RUN conda install -y -c conda-forge tiledb-vector-search
+
+# WORKDIR /home/app
+# COPY requirements.txt run_algorithm.py ./
+# RUN pip3 install -r requirements.txt
+# ENTRYPOINT ["python", "-u", "run_algorithm.py"]
+
+# Pip approach
+FROM ann-benchmarks
+
+RUN pip install tiledb
+RUN pip install tiledb-vector-search
+RUN python3 -c 'import tiledb.vector_search'
\ No newline at end of file
diff --git a/ann_benchmarks/algorithms/tiledb/module.py b/ann_benchmarks/algorithms/tiledb/module.py
index 3183bf935..759666f38 100644
--- a/ann_benchmarks/algorithms/tiledb/module.py
+++ b/ann_benchmarks/algorithms/tiledb/module.py
@@ -4,7 +4,8 @@
 
 import tiledb
 from tiledb.vector_search.ingestion import ingest
-from tiledb.vector_search.index import IVFFlatIndex
+from tiledb.vector_search import IVFFlatIndex
+from tiledb.cloud.dag import Mode
 import numpy as np
 import multiprocessing
 
@@ -15,17 +16,35 @@ class TileDBIVFFlat(BaseANN):
     def __init__(self, metric, n_list):
         self._n_list = n_list
         self._metric = metric
+        self.MAX_UINT64 = np.iinfo(np.dtype("uint64")).max
 
     def query(self, v, n):
         if self._metric == 'angular':
             raise NotImplementedError()
-        self.res =self.index.query(np.array([v]).astype(numpy.float32), k=n, nthreads=multiprocessing.cpu_count(), nprobe=min(self._n_probe,self._n_list), use_nuv_implementation=True)
-        return self.res[0]
+
+        # query() returns a tuple of (distances, ids).
+        ids = self.index.query(
+            np.array([v]).astype(numpy.float32),
+            k=n,
+            nthreads=multiprocessing.cpu_count(),
+            nprobe=min(self._n_probe,self._n_list),
+        )[1][0]
+        # Fix for 'OverflowError: Python int too large to convert to C long'.
+        ids[ids == self.MAX_UINT64] = 0
+        return ids
 
     def batch_query(self, X, n):
         if self._metric == 'angular':
             raise NotImplementedError()
-        self.res =self.index.query(X.astype(numpy.float32), k=n, nthreads=multiprocessing.cpu_count(), nprobe=min(self._n_probe, self._n_list), use_nuv_implementation=True)
+        # query() returns a tuple of (distances, ids).
+        self.res = self.index.query(
+            X.astype(numpy.float32),
+            k=n,
+            nthreads=multiprocessing.cpu_count(),
+            nprobe=min(self._n_probe, self._n_list),
+        )[1]
+        # Fix for 'OverflowError: Python int too large to convert to C long'.
+        self.res[self.res == self.MAX_UINT64] = 0
 
     def get_batch_results(self):
         return self.res
@@ -47,15 +66,18 @@ def fit(self, X):
         elif X.dtype == "float32":
             source_type = "F32BIN"
         maxtrain = min(50 * self._n_list, X.shape[0])
-        self.index = ingest(index_type="IVF_FLAT",
-                            array_uri=array_uri,
-                            source_uri=source_uri,
-                            source_type=source_type,
-                            size=X.shape[0],
-                            training_sample_size=maxtrain,
-                            partitions=self._n_list,
-                            input_vectors_per_work_item=100000000,
-                            )
+        self.index = ingest(
+            index_type="IVF_FLAT",
+            index_uri=array_uri,
+            source_uri=source_uri,
+            source_type=source_type,
+            size=X.shape[0],
+            training_sample_size=maxtrain,
+            partitions=self._n_list,
+            input_vectors_per_work_item=100000000,
+            mode=Mode.LOCAL
+        )
+        # memory_budget=-1 will load the data into main memory.
         self.index = IVFFlatIndex(uri=array_uri, dtype=X.dtype, memory_budget=-1)
 
     def set_query_arguments(self, n_probe):

From 20652904efffa5f4aaf9487a38ded64562741add Mon Sep 17 00:00:00 2001
From: Paris Morgan
Date: Fri, 5 Apr 2024 10:03:45 +0200
Subject: [PATCH 2/3] cleanup code

---
 ann_benchmarks/algorithms/tiledb/Dockerfile | 33 +---------------------
 1 file changed, 1 insertion(+), 32 deletions(-)

diff --git a/ann_benchmarks/algorithms/tiledb/Dockerfile b/ann_benchmarks/algorithms/tiledb/Dockerfile
index 2c6b4aa8e..cefda8b2f 100644
--- a/ann_benchmarks/algorithms/tiledb/Dockerfile
+++ b/ann_benchmarks/algorithms/tiledb/Dockerfile
@@ -1,35 +1,4 @@
-# # Mamba approach
-# FROM continuumio/miniconda3:4.10.3
-
-# RUN conda install mamba -n base -c conda-forge
-
-# RUN mamba install -y -c tiledb tiledb tiledb-vector-search
-
-# WORKDIR /home/app
-# COPY requirements.txt run_algorithm.py ./
-# RUN pip3 install -r requirements.txt
-# ENTRYPOINT ["python", "-u", "run_algorithm.py"]
-
-# # Conda approach
-# FROM ann-benchmarks
-
-# RUN apt update && apt install -y wget
-# RUN wget https://repo.anaconda.com/archive/Anaconda3-2020.11-Linux-x86_64.sh
-# RUN bash Anaconda3-2020.11-Linux-x86_64.sh -b
-
-# ENV PATH /root/anaconda3/bin:$PATH
-
-# RUN conda install -y -c tiledb tiledb
-# RUN conda install -y -c conda-forge tiledb-vector-search
-
-# WORKDIR /home/app
-# COPY requirements.txt run_algorithm.py ./
-# RUN pip3 install -r requirements.txt
-# ENTRYPOINT ["python", "-u", "run_algorithm.py"]
-
-# Pip approach
 FROM ann-benchmarks
 
-RUN pip install tiledb
-RUN pip install tiledb-vector-search
+RUN pip install tiledb tiledb-vector-search
 RUN python3 -c 'import tiledb.vector_search'
\ No newline at end of file

From 0502021e83e41c360fe0192c912618bbb2591cd9 Mon Sep 17 00:00:00 2001
From: Paris Morgan
Date: Mon, 13 May 2024 16:22:52 -0400
Subject: [PATCH 3/3] add todos

---
 ann_benchmarks/algorithms/tiledb/module.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ann_benchmarks/algorithms/tiledb/module.py b/ann_benchmarks/algorithms/tiledb/module.py
index 759666f38..396ac07ad 100644
--- a/ann_benchmarks/algorithms/tiledb/module.py
+++ b/ann_benchmarks/algorithms/tiledb/module.py
@@ -61,11 +61,14 @@ def fit(self, X):
         X.tofile(f)
         f.close()
 
+        # TODO: Next time we run this, just use the numpy arrays directly for ingestion.
         if X.dtype == "uint8":
             source_type = "U8BIN"
         elif X.dtype == "float32":
             source_type = "F32BIN"
         maxtrain = min(50 * self._n_list, X.shape[0])
+        # TODO: Next time we run this, remove size, training_sample_size, partitions, and
+        # input_vectors_per_work_item and use the defaults instead.
         self.index = ingest(
             index_type="IVF_FLAT",
             index_uri=array_uri,
@@ -77,6 +80,7 @@
             input_vectors_per_work_item=100000000,
             mode=Mode.LOCAL
         )
+        # TODO: Next time we run this, remove dtype and memory_budget as these are the defaults.
         # memory_budget=-1 will load the data into main memory.
         self.index = IVFFlatIndex(uri=array_uri, dtype=X.dtype, memory_budget=-1)
 
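Note (not part of the patches above): a minimal sketch of what the sentinel-id masking added to query() and batch_query() does. It only assumes numpy; the id values below are made up for illustration. Per the patch comments, IVFFlatIndex.query() returns a (distances, ids) tuple, and any id equal to the uint64 maximum (apparently padding for result slots with no neighbour) is zeroed out to avoid the 'OverflowError: Python int too large to convert to C long' seen in the harness.

import numpy as np

# Same sentinel value the patch computes in __init__.
MAX_UINT64 = np.iinfo(np.dtype("uint64")).max

# Hypothetical ids for a batch of two queries with k=4; the second row has two
# padded slots standing in for "no neighbour returned".
ids = np.array([[3, 17, 9, 42],
                [7, 5, MAX_UINT64, MAX_UINT64]], dtype=np.uint64)

# The masking step the patch applies before the ids are handed back.
ids[ids == MAX_UINT64] = 0
print(ids)  # second row becomes [7 5 0 0]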