From d4069c2d39cfaf11178d7ed2b4bf088fe735b493 Mon Sep 17 00:00:00 2001
From: Paris Morgan
Date: Fri, 5 Apr 2024 10:01:27 +0200
Subject: [PATCH 1/3] fix benchmarks

---
 ann_benchmarks/algorithms/tiledb/Dockerfile | 39 ++++++++++++++---
 ann_benchmarks/algorithms/tiledb/module.py  | 48 +++++++++++++++------
 2 files changed, 68 insertions(+), 19 deletions(-)

diff --git a/ann_benchmarks/algorithms/tiledb/Dockerfile b/ann_benchmarks/algorithms/tiledb/Dockerfile
index 7e6e0b0d2..2c6b4aa8e 100644
--- a/ann_benchmarks/algorithms/tiledb/Dockerfile
+++ b/ann_benchmarks/algorithms/tiledb/Dockerfile
@@ -1,8 +1,35 @@
-FROM tiledb_vs
+# # Mamba approach
+# FROM continuumio/miniconda3:4.10.3
 
-RUN mamba install -y ansicolors docker-py h5py matplotlib numpy pyyaml psutil scipy scikit-learn jinja2 pandas
+# RUN conda install mamba -n base -c conda-forge
 
-WORKDIR /home/app
-COPY requirements.txt run_algorithm.py ./
-RUN pip3 install -r requirements.txt
-ENTRYPOINT ["python", "-u", "run_algorithm.py"]
+# RUN mamba install -y -c tiledb tiledb tiledb-vector-search
+
+# WORKDIR /home/app
+# COPY requirements.txt run_algorithm.py ./
+# RUN pip3 install -r requirements.txt
+# ENTRYPOINT ["python", "-u", "run_algorithm.py"]
+
+# # Conda approach
+# FROM ann-benchmarks
+
+# RUN apt update && apt install -y wget
+# RUN wget https://repo.anaconda.com/archive/Anaconda3-2020.11-Linux-x86_64.sh
+# RUN bash Anaconda3-2020.11-Linux-x86_64.sh -b
+
+# ENV PATH /root/anaconda3/bin:$PATH
+
+# RUN conda install -y -c tiledb tiledb
+# RUN conda install -y -c conda-forge tiledb-vector-search
+
+# WORKDIR /home/app
+# COPY requirements.txt run_algorithm.py ./
+# RUN pip3 install -r requirements.txt
+# ENTRYPOINT ["python", "-u", "run_algorithm.py"]
+
+# Pip approach
+FROM ann-benchmarks
+
+RUN pip install tiledb
+RUN pip install tiledb-vector-search
+RUN python3 -c 'import tiledb.vector_search'
\ No newline at end of file
diff --git a/ann_benchmarks/algorithms/tiledb/module.py b/ann_benchmarks/algorithms/tiledb/module.py
index 3183bf935..759666f38 100644
--- a/ann_benchmarks/algorithms/tiledb/module.py
+++ b/ann_benchmarks/algorithms/tiledb/module.py
@@ -4,7 +4,8 @@
 
 import tiledb
 from tiledb.vector_search.ingestion import ingest
-from tiledb.vector_search.index import IVFFlatIndex
+from tiledb.vector_search import IVFFlatIndex
+from tiledb.cloud.dag import Mode
 import numpy as np
 import multiprocessing
 
@@ -15,17 +16,35 @@ class TileDBIVFFlat(BaseANN):
     def __init__(self, metric, n_list):
         self._n_list = n_list
         self._metric = metric
+        self.MAX_UINT64 = np.iinfo(np.dtype("uint64")).max
 
     def query(self, v, n):
         if self._metric == 'angular':
             raise NotImplementedError()
-        self.res =self.index.query(np.array([v]).astype(numpy.float32), k=n, nthreads=multiprocessing.cpu_count(), nprobe=min(self._n_probe,self._n_list), use_nuv_implementation=True)
-        return self.res[0]
+
+        # query() returns a tuple of (distances, ids).
+        ids = self.index.query(
+            np.array([v]).astype(numpy.float32),
+            k=n,
+            nthreads=multiprocessing.cpu_count(),
+            nprobe=min(self._n_probe,self._n_list),
+        )[1][0]
+        # Fix for 'OverflowError: Python int too large to convert to C long'.
+        ids[ids == self.MAX_UINT64] = 0
+        return ids
 
     def batch_query(self, X, n):
         if self._metric == 'angular':
             raise NotImplementedError()
-        self.res =self.index.query(X.astype(numpy.float32), k=n, nthreads=multiprocessing.cpu_count(), nprobe=min(self._n_probe, self._n_list), use_nuv_implementation=True)
+        # query() returns a tuple of (distances, ids).
+        self.res = self.index.query(
+            X.astype(numpy.float32),
+            k=n,
+            nthreads=multiprocessing.cpu_count(),
+            nprobe=min(self._n_probe, self._n_list),
+        )[1]
+        # Fix for 'OverflowError: Python int too large to convert to C long'.
+        self.res[self.res == self.MAX_UINT64] = 0
 
     def get_batch_results(self):
         return self.res
@@ -47,15 +66,18 @@ def fit(self, X):
         elif X.dtype == "float32":
             source_type = "F32BIN"
         maxtrain = min(50 * self._n_list, X.shape[0])
-        self.index = ingest(index_type="IVF_FLAT",
-                            array_uri=array_uri,
-                            source_uri=source_uri,
-                            source_type=source_type,
-                            size=X.shape[0],
-                            training_sample_size=maxtrain,
-                            partitions=self._n_list,
-                            input_vectors_per_work_item=100000000,
-                            )
+        self.index = ingest(
+            index_type="IVF_FLAT",
+            index_uri=array_uri,
+            source_uri=source_uri,
+            source_type=source_type,
+            size=X.shape[0],
+            training_sample_size=maxtrain,
+            partitions=self._n_list,
+            input_vectors_per_work_item=100000000,
+            mode=Mode.LOCAL
+        )
+        # memory_budget=-1 will load the data into main memory.
         self.index = IVFFlatIndex(uri=array_uri, dtype=X.dtype, memory_budget=-1)
 
     def set_query_arguments(self, n_probe):

From 20652904efffa5f4aaf9487a38ded64562741add Mon Sep 17 00:00:00 2001
From: Paris Morgan
Date: Fri, 5 Apr 2024 10:03:45 +0200
Subject: [PATCH 2/3] cleanup code

---
 ann_benchmarks/algorithms/tiledb/Dockerfile | 33 +---------------------
 1 file changed, 1 insertion(+), 32 deletions(-)

diff --git a/ann_benchmarks/algorithms/tiledb/Dockerfile b/ann_benchmarks/algorithms/tiledb/Dockerfile
index 2c6b4aa8e..cefda8b2f 100644
--- a/ann_benchmarks/algorithms/tiledb/Dockerfile
+++ b/ann_benchmarks/algorithms/tiledb/Dockerfile
@@ -1,35 +1,4 @@
-# # Mamba approach
-# FROM continuumio/miniconda3:4.10.3
-
-# RUN conda install mamba -n base -c conda-forge
-
-# RUN mamba install -y -c tiledb tiledb tiledb-vector-search
-
-# WORKDIR /home/app
-# COPY requirements.txt run_algorithm.py ./
-# RUN pip3 install -r requirements.txt
-# ENTRYPOINT ["python", "-u", "run_algorithm.py"]
-
-# # Conda approach
-# FROM ann-benchmarks
-
-# RUN apt update && apt install -y wget
-# RUN wget https://repo.anaconda.com/archive/Anaconda3-2020.11-Linux-x86_64.sh
-# RUN bash Anaconda3-2020.11-Linux-x86_64.sh -b
-
-# ENV PATH /root/anaconda3/bin:$PATH
-
-# RUN conda install -y -c tiledb tiledb
-# RUN conda install -y -c conda-forge tiledb-vector-search
-
-# WORKDIR /home/app
-# COPY requirements.txt run_algorithm.py ./
-# RUN pip3 install -r requirements.txt
-# ENTRYPOINT ["python", "-u", "run_algorithm.py"]
-
-# Pip approach
 FROM ann-benchmarks
 
-RUN pip install tiledb
-RUN pip install tiledb-vector-search
+RUN pip install tiledb tiledb-vector-search
 RUN python3 -c 'import tiledb.vector_search'
\ No newline at end of file

From 0502021e83e41c360fe0192c912618bbb2591cd9 Mon Sep 17 00:00:00 2001
From: Paris Morgan
Date: Mon, 13 May 2024 16:22:52 -0400
Subject: [PATCH 3/3] add todos

---
 ann_benchmarks/algorithms/tiledb/module.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/ann_benchmarks/algorithms/tiledb/module.py b/ann_benchmarks/algorithms/tiledb/module.py
index 759666f38..396ac07ad 100644
--- a/ann_benchmarks/algorithms/tiledb/module.py
+++ b/ann_benchmarks/algorithms/tiledb/module.py
@@ -61,11 +61,14 @@ def fit(self, X):
         X.tofile(f)
         f.close()
 
+        # TODO: Next time we run this, just use the numpy arrays directly for ingestion.
         if X.dtype == "uint8":
             source_type = "U8BIN"
         elif X.dtype == "float32":
             source_type = "F32BIN"
         maxtrain = min(50 * self._n_list, X.shape[0])
+        # TODO: Next time we run this, remove size, training_sample_size, partitions, and
+        # input_vectors_per_work_item and use the defaults instead.
         self.index = ingest(
             index_type="IVF_FLAT",
             index_uri=array_uri,
@@ -77,6 +80,7 @@
             input_vectors_per_work_item=100000000,
             mode=Mode.LOCAL
         )
+        # TODO: Next time we run this, remove dtype and memory_budget as these are the defaults.
         # memory_budget=-1 will load the data into main memory.
         self.index = IVFFlatIndex(uri=array_uri, dtype=X.dtype, memory_budget=-1)
 
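Note (not part of the patches above): a minimal sketch of what the sentinel-id masking added to query() and batch_query() does. It only assumes numpy; the id values below are made up for illustration. Per the patch comments, IVFFlatIndex.query() returns a (distances, ids) tuple, and any id equal to the uint64 maximum (apparently padding for result slots with no neighbour) is zeroed out to avoid the 'OverflowError: Python int too large to convert to C long' seen in the harness.

import numpy as np

# Same sentinel value the patch computes in __init__.
MAX_UINT64 = np.iinfo(np.dtype("uint64")).max

# Hypothetical ids for a batch of two queries with k=4; the second row has two
# padded slots standing in for "no neighbour returned".
ids = np.array([[3, 17, 9, 42],
                [7, 5, MAX_UINT64, MAX_UINT64]], dtype=np.uint64)

# The masking step the patch applies before the ids are handed back.
ids[ids == MAX_UINT64] = 0
print(ids)  # second row becomes [7 5 0 0]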