Integrate IVF-PQ from RAFT (facebookresearch#3044)
Summary:
Imports the changes from facebookresearch#3133 and facebookresearch#3171, so this single PR contains all of the changes together.

- [x] Implement RaftIVFPQ class
- [x] Update gtests to test correctness with RAFT enabled
- [x] All GoogleTest cases for RAFT-enabled IVFPQ pass
- [x] Move some common functions in RaftIVFFlat and RaftIVFPQ to a helper: RaftUtils.h
- [x] Update the quantizer retroactively after building the RAFT index -- both IVFFlat and IVFPQ
- [x] Resolve the failing LargeBatch test (classical GPU)
- [x] Add checks for Pascal deprecation
- [x] Apply RMM changes from facebookresearch#3171
- [x] Apply robertmaynard's changes from facebookresearch#3133
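
For reviewers, a minimal usage sketch of how the RAFT-backed path is selected, following the `GpuMultipleClonerOptions` pattern used by the benchmark scripts added in this PR. It is not part of the diff itself; the dimensions, factory string, and `nprobe` value are illustrative only, and it assumes a build with `FAISS_ENABLE_RAFT`:

```python
import numpy as np
import faiss

d = 128                                            # illustrative dimension
xb = np.random.rand(100000, d).astype('float32')   # toy base set
xq = np.random.rand(100, d).astype('float32')      # toy queries

# CPU index trained as usual; the factory string mirrors bench_ivfpq_raft.py below
cpu_index = faiss.index_factory(d, "IVF1024,PQ64x8np")
cpu_index.train(xb)

res = faiss.StandardGpuResources()
co = faiss.GpuMultipleClonerOptions()
co.use_raft = True                # route to the RAFT-backed implementation (RaftIVFPQ)
co.useFloat16LookupTables = True  # float16 lookup tables, as in the benchmarks

gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index, co)
gpu_index.add(xb)
gpu_index.nprobe = 32
D, I = gpu_index.search(xq, 10)
```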

Pull Request resolved: facebookresearch#3044

Reviewed By: junjieqi

Differential Revision: D51074065

Pulled By: algoriddle

fbshipit-source-id: 6871257921bcaff2064a20637e2ed358acbdc363
tarang-jain authored and facebook-github-bot committed Feb 21, 2024
1 parent 87d43b9 commit 27b1055
Showing 40 changed files with 2,199 additions and 654 deletions.
6 changes: 3 additions & 3 deletions CMakeLists.txt
@@ -62,9 +62,9 @@ if(FAISS_ENABLE_GPU)
enable_language(CUDA)
endif()

-if(FAISS_ENABLE_RAFT)
-  find_package(raft COMPONENTS compiled distributed)
-endif()
+if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
+  find_package(raft COMPONENTS compiled distributed)
+endif()

add_subdirectory(faiss)

193 changes: 193 additions & 0 deletions benchs/bench_ivfflat_raft.py
@@ -0,0 +1,193 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import faiss
import time
import argparse
import rmm

######################################################
# Command-line parsing
######################################################

parser = argparse.ArgumentParser()


def aa(*args, **kwargs):
    group.add_argument(*args, **kwargs)


group = parser.add_argument_group('benchmarking options')

aa('--bm_train', default=False, action='store_true',
   help='whether to benchmark train operation on GPU index')
aa('--bm_add', default=False, action='store_true',
   help='whether to benchmark add operation on GPU index')
aa('--bm_search', default=True, action='store_true',
   help='whether to benchmark search operation on GPU index')
aa('--raft_only', default=False, action='store_true',
   help='whether to only produce RAFT enabled benchmarks')


group = parser.add_argument_group('IVF options')
aa('--n_centroids', default=256, type=int,
   help="number of IVF centroids")


group = parser.add_argument_group('searching')

aa('--k', default=100, type=int, help='nb of nearest neighbors')
aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')

args = parser.parse_args()

print("args:", args)

rs = np.random.RandomState(123)

res = faiss.StandardGpuResources()

# Use an RMM pool memory resource for device allocations
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)

def bench_train_milliseconds(index, trainVecs, use_raft):
    co = faiss.GpuMultipleClonerOptions()
    co.use_raft = use_raft
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
    t0 = time.time()
    index_gpu.train(trainVecs)
    return 1000*(time.time() - t0)


if args.bm_train:
    print("=" * 40)
    print("GPU Train Benchmarks")
    print("=" * 40)
    trainset_sizes = [5000, 10000, 100000, 1000000, 5000000]
    dataset_dims = [128, 256, 1024]
    for n_rows in trainset_sizes:
        for n_cols in dataset_dims:
            index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
            trainVecs = rs.rand(n_rows, n_cols).astype('float32')
            raft_gpu_train_time = bench_train_milliseconds(
                index, trainVecs, True)
            if args.raft_only:
                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_rows, raft_gpu_train_time))
            else:
                classical_gpu_train_time = bench_train_milliseconds(
                    index, trainVecs, False)
                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_rows, classical_gpu_train_time, raft_gpu_train_time))


def bench_add_milliseconds(index, addVecs, use_raft):
    co = faiss.GpuMultipleClonerOptions()
    co.use_raft = use_raft
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
    index_gpu.copyFrom(index)
    t0 = time.time()
    index_gpu.add(addVecs)
    return 1000*(time.time() - t0)


if args.bm_add:
    print("=" * 40)
    print("GPU Add Benchmarks")
    print("=" * 40)
    addset_sizes = [5000, 10000, 100000, 1000000]
    dataset_dims = [128, 256, 1024]
    n_train = 10000
    for n_cols in dataset_dims:
        # train a fresh index per dimensionality; the training set must match the dim
        trainVecs = rs.rand(n_train, n_cols).astype('float32')
        index = faiss.index_factory(
            n_cols, "IVF" + str(args.n_centroids) + ",Flat")
        index.train(trainVecs)
        for n_rows in addset_sizes:
            addVecs = rs.rand(n_rows, n_cols).astype('float32')
            raft_gpu_add_time = bench_add_milliseconds(index, addVecs, True)
            if args.raft_only:
                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, RAFT enabled GPU add time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_rows, raft_gpu_add_time))
            else:
                classical_gpu_add_time = bench_add_milliseconds(
                    index, addVecs, False)
                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_rows, classical_gpu_add_time, raft_gpu_add_time))


def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
    co = faiss.GpuMultipleClonerOptions()
    co.use_raft = use_raft
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
    index_gpu.copyFrom(index)
    index_gpu.add(addVecs)
    index_gpu.nprobe = nprobe
    t0 = time.time()
    index_gpu.search(queryVecs, k)
    return 1000*(time.time() - t0)


if args.bm_search:
    print("=" * 40)
    print("GPU Search Benchmarks")
    print("=" * 40)
    queryset_sizes = [5000, 10000, 100000, 500000]
    n_train = 10000
    n_add = 100000
    search_bm_dims = [8, 16, 32]
    for n_cols in search_bm_dims:
        index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
        trainVecs = rs.rand(n_train, n_cols).astype('float32')
        index.train(trainVecs)
        addVecs = rs.rand(n_add, n_cols).astype('float32')
        for n_rows in queryset_sizes:
            queryVecs = rs.rand(n_rows, n_cols).astype('float32')
            raft_gpu_search_time = bench_search_milliseconds(
                index, addVecs, queryVecs, args.nprobe, args.k, True)
            if args.raft_only:
                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
            else:
                classical_gpu_search_time = bench_search_milliseconds(
                    index, addVecs, queryVecs, args.nprobe, args.k, False)
                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))

print("=" * 40)
print("Large RAFT Enabled Benchmarks")
print("=" * 40)
# Avoid classical GPU Benchmarks for large datasets because of OOM for more than 500000 queries and/or large dims as well as for large k
queryset_sizes = [100000, 500000, 1000000]
large_search_bm_dims = [128, 256, 1024]
for n_cols in large_search_bm_dims:
trainVecs = rs.rand(n_train, n_cols).astype('float32')
index = faiss.index_factory(
n_cols, "IVF" + str(args.n_centroids) + ",Flat")
index.train(trainVecs)
addVecs = rs.rand(n_add, n_cols).astype('float32')
for n_rows in queryset_sizes:
queryVecs = rs.rand(n_rows, n_cols).astype('float32')
raft_gpu_search_time = bench_search_milliseconds(
index, addVecs, queryVecs, args.nprobe, args.k, True)
print("Method: IVFFlat, Operation: SEARCH, numTrain: %d, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
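
Both benchmark scripts route device allocations through an RMM pool before any GPU work. A small sketch of that setup, with an explicit `initial_pool_size` shown as an assumption (the scripts themselves rely on RMM's default pool sizing):

```python
import rmm
import faiss

# Pool allocator layered on the plain CUDA memory resource. The 1 GiB
# initial_pool_size is illustrative; the benchmark scripts use RMM's default.
mr = rmm.mr.PoolMemoryResource(
    rmm.mr.CudaMemoryResource(),
    initial_pool_size=2**30,
)
rmm.mr.set_current_device_resource(mr)

# FAISS GPU resources are created as usual; a RAFT-enabled build can then
# serve its temporary device allocations from the pool configured above.
res = faiss.StandardGpuResources()
```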
168 changes: 168 additions & 0 deletions benchs/bench_ivfpq_raft.py
@@ -0,0 +1,168 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
# Copyright (c) 2023, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import numpy as np
import faiss
import time
import argparse
import rmm

######################################################
# Command-line parsing
######################################################

parser = argparse.ArgumentParser()

from datasets import load_sift1M, evaluate


print("load data")
xb, xq, xt, gt = load_sift1M()

def aa(*args, **kwargs):
    group.add_argument(*args, **kwargs)


group = parser.add_argument_group('benchmarking options')
aa('--bm_search', default=True, action='store_true',
   help='whether to benchmark search operation on GPU index')
aa('--raft_only', default=False, action='store_true',
   help='whether to only produce RAFT enabled benchmarks')

group = parser.add_argument_group('IVF options')
aa('--bits_per_code', default=8, type=int, help='bits per code. Note that < 8 is only supported when RAFT is enabled')
aa('--pq_len', default=2, type=int, help='number of vector elements represented by one PQ code')
aa('--use_precomputed', default=True, type=bool, help='use precomputed codes (not with RAFT enabled)')

group = parser.add_argument_group('searching')
aa('--k', default=10, type=int, help='nb of nearest neighbors')
aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')

args = parser.parse_args()

print("args:", args)

rs = np.random.RandomState(123)

res = faiss.StandardGpuResources()

# Use an RMM pool memory resource for device allocations
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)

# A heuristic to select a suitable number of lists
def compute_nlist(numVecs):
    nlist = np.sqrt(numVecs)
    if (numVecs / nlist < 1000):
        nlist = numVecs / 1000
    return int(nlist)


def bench_train_milliseconds(index, trainVecs, use_raft):
    co = faiss.GpuMultipleClonerOptions()
    # use float 16 lookup tables to save space
    co.useFloat16LookupTables = True
    co.use_raft = use_raft
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
    t0 = time.time()
    index_gpu.train(trainVecs)
    return 1000*(time.time() - t0)

n_rows, n_cols = xb.shape
n_train, _ = xt.shape
M = n_cols // args.pq_len
nlist = compute_nlist(n_rows)
index = faiss.index_factory(n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code))

print("=" * 40)
print("GPU Train Benchmarks")
print("=" * 40)
raft_gpu_train_time = bench_train_milliseconds(index, xt, True)
if args.raft_only:
    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_train, raft_gpu_train_time))
else:
    classical_gpu_train_time = bench_train_milliseconds(
        index, xt, False)
    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_train, classical_gpu_train_time, raft_gpu_train_time))


def bench_add_milliseconds(index, addVecs, use_raft):
    co = faiss.GpuMultipleClonerOptions()
    # use float 16 lookup tables to save space
    co.useFloat16LookupTables = True
    co.use_raft = use_raft
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
    index_gpu.copyFrom(index)
    t0 = time.time()
    index_gpu.add(addVecs)
    return 1000*(time.time() - t0)

print("=" * 40)
print("GPU Add Benchmarks")
print("=" * 40)
index.train(xt)
raft_gpu_add_time = bench_add_milliseconds(index, xb, True)
if args.raft_only:
    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, RAFT enabled GPU add time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_rows, raft_gpu_add_time))
else:
    classical_gpu_add_time = bench_add_milliseconds(
        index, xb, False)
    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_rows, classical_gpu_add_time, raft_gpu_add_time))


def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
    co = faiss.GpuMultipleClonerOptions()
    co.use_raft = use_raft
    co.useFloat16LookupTables = True
    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
    index_gpu.copyFrom(index)
    index_gpu.add(addVecs)
    index_gpu.nprobe = nprobe
    t0 = time.time()
    index_gpu.search(queryVecs, k)
    return 1000*(time.time() - t0)


if args.bm_search:
    print("=" * 40)
    print("GPU Search Benchmarks")
    print("=" * 40)
    queryset_sizes = [1, 10, 100, 1000, 10000]
    n_train, n_cols = xt.shape
    n_add, _ = xb.shape
    print(xq.shape)
    M = n_cols // args.pq_len
    nlist = compute_nlist(n_add)
    index = faiss.index_factory(n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code))
    index.train(xt)
    for n_rows in queryset_sizes:
        queryVecs = xq[np.random.choice(xq.shape[0], n_rows, replace=False)]
        raft_gpu_search_time = bench_search_milliseconds(
            index, xb, queryVecs, args.nprobe, args.k, True)
        if args.raft_only:
            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
                n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
        else:
            classical_gpu_search_time = bench_search_milliseconds(
                index, xb, queryVecs, args.nprobe, args.k, False)
            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
                n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))
2 changes: 1 addition & 1 deletion cmake/thirdparty/fetch_rapids.cmake
@@ -15,7 +15,7 @@
# or implied. See the License for the specific language governing permissions and limitations under
# the License.
# =============================================================================
set(RAPIDS_VERSION "23.12")
set(RAPIDS_VERSION "24.02")

if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
(Diffs for the remaining 36 changed files in this commit are not shown here.)