From 27b1055cc6c9a70b8822ec5dbdd54b43b69a47e1 Mon Sep 17 00:00:00 2001
From: Tarang Jain
Date: Wed, 21 Feb 2024 06:41:08 -0800
Subject: [PATCH] Integrate IVF-PQ from RAFT (#3044)

Summary:
Imports the changes from https://github.com/facebookresearch/faiss/issues/3133 and https://github.com/facebookresearch/faiss/issues/3171, so this single PR contains all of the changes together.

- [x] Implement the RaftIVFPQ class
- [x] Update gtests to test correctness with RAFT enabled
- [x] All googleTests for RAFT enabled IVFPQ pass
- [x] Move some functions common to RaftIVFFlat and RaftIVFPQ into a helper: RaftUtils.h
- [x] Update the quantizer retroactively after building the RAFT index -- both IVFFlat and IVFPQ
- [x] Resolve the failing LargeBatch test (classical GPU)
- [x] Add checks for Pascal deprecation
- [x] Apply the RMM changes from https://github.com/facebookresearch/faiss/issues/3171
- [x] Apply robertmaynard's changes from https://github.com/facebookresearch/faiss/issues/3133

Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3044

Reviewed By: junjieqi

Differential Revision: D51074065

Pulled By: algoriddle

fbshipit-source-id: 6871257921bcaff2064a20637e2ed358acbdc363
---
 CMakeLists.txt | 6 +-
 benchs/bench_ivfflat_raft.py | 193 ++++++++
 benchs/bench_ivfpq_raft.py | 168 +++++++
 cmake/thirdparty/fetch_rapids.cmake | 2 +-
 faiss/gpu/CMakeLists.txt | 31 +-
 faiss/gpu/GpuCloner.cpp | 7 +-
 faiss/gpu/GpuClonerOptions.h | 4 +
 faiss/gpu/GpuDistance.cu | 21 +-
 faiss/gpu/GpuDistance.h | 7 +
 faiss/gpu/GpuIndex.cu | 15 +-
 faiss/gpu/GpuIndex.h | 8 +
 faiss/gpu/GpuIndexFlat.cu | 10 +-
 faiss/gpu/GpuIndexIVF.cu | 51 +-
 faiss/gpu/GpuIndexIVFFlat.cu | 200 +++++---
 faiss/gpu/GpuIndexIVFFlat.h | 4 +-
 faiss/gpu/GpuIndexIVFPQ.cu | 313 +++++++++---
 faiss/gpu/GpuIndexIVFPQ.h | 19 +-
 faiss/gpu/GpuResources.h | 5 +
 faiss/gpu/StandardGpuResources.cpp | 39 +-
 faiss/gpu/StandardGpuResources.h | 11 +-
 faiss/gpu/impl/IVFBase.cuh | 4 +-
 faiss/gpu/impl/IVFPQ.cuh | 4 +-
 faiss/gpu/impl/RaftFlatIndex.cu | 4 +-
 faiss/gpu/impl/RaftFlatIndex.cuh | 2 +
 faiss/gpu/impl/RaftIVFFlat.cu | 250 ++++------
 faiss/gpu/impl/RaftIVFFlat.cuh | 23 +-
 faiss/gpu/impl/RaftIVFPQ.cu | 546 +++++++++++++++++++++
 faiss/gpu/impl/RaftIVFPQ.cuh | 150 ++++++
 faiss/gpu/test/TestGpuIndexFlat.cpp | 1 +
 faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 51 +-
 faiss/gpu/test/TestGpuIndexIVFPQ.cpp | 507 +++++++++++--------
 faiss/gpu/test/TestGpuMemoryException.cpp | 1 +
 faiss/gpu/test/test_gpu_index.py | 24 +-
 faiss/gpu/test/test_gpu_index_ivfsq.py | 12 +-
 faiss/gpu/test/test_gpu_index_serialize.py | 11 +-
 faiss/gpu/test/test_multi_gpu.py | 8 +-
 faiss/gpu/utils/CopyUtils.cuh | 1 +
 faiss/gpu/utils/RaftUtils.cu | 117 +++++
 faiss/gpu/{impl => utils}/RaftUtils.h | 22 +-
 tests/CMakeLists.txt | 1 +
 40 files changed, 2199 insertions(+), 654 deletions(-)
 create mode 100644 benchs/bench_ivfflat_raft.py
 create mode 100644 benchs/bench_ivfpq_raft.py
 create mode 100644 faiss/gpu/impl/RaftIVFPQ.cu
 create mode 100644 faiss/gpu/impl/RaftIVFPQ.cuh
 create mode 100644 faiss/gpu/utils/RaftUtils.cu
 rename faiss/gpu/{impl => utils}/RaftUtils.h (73%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 85c8a820bc..445b39d59e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,9 +62,9 @@ if(FAISS_ENABLE_GPU)
   enable_language(CUDA)
 endif()

-if(FAISS_ENABLE_RAFT)
-  find_package(raft COMPONENTS compiled distributed)
-endif()
+if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
+  find_package(raft COMPONENTS compiled distributed)
+endif()

 add_subdirectory(faiss)
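With this patch, `use_raft` defaults to true whenever faiss is compiled with RAFT support, so cloning a CPU index to the GPU dispatches to the RAFT implementation automatically. A minimal sketch of exercising the new IVF-PQ path (the dataset shape, nlist, and PQ parameters here are hypothetical; the cloner options mirror the benchmark scripts added below):

import numpy as np
import faiss

res = faiss.StandardGpuResources()

# hypothetical data: d must be divisible by the number of sub-quantizers (M)
xb = np.random.rand(100000, 128).astype('float32')

# 1024 IVF lists, M = 32 sub-quantizers, 8 bits per code
index = faiss.index_factory(128, "IVF1024,PQ32x8np")

co = faiss.GpuMultipleClonerOptions()
co.use_raft = True  # explicit here; this is the default in a RAFT build
index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)

index_gpu.train(xb)
index_gpu.add(xb)
D, I = index_gpu.search(xb[:10], 5)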
diff --git a/benchs/bench_ivfflat_raft.py b/benchs/bench_ivfflat_raft.py
new file mode 100644
index 0000000000..9ebfcb3422
--- /dev/null
+++ b/benchs/bench_ivfflat_raft.py
@@ -0,0 +1,193 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import faiss
+import time
+import argparse
+import rmm
+
+######################################################
+# Command-line parsing
+######################################################
+
+parser = argparse.ArgumentParser()
+
+
+def aa(*args, **kwargs):
+    group.add_argument(*args, **kwargs)
+
+
+group = parser.add_argument_group('benchmarking options')
+
+aa('--bm_train', default=False, action='store_true',
+   help='whether to benchmark train operation on GPU index')
+aa('--bm_add', default=False, action='store_true',
+   help='whether to benchmark add operation on GPU index')
+aa('--bm_search', default=True, action='store_true',
+   help='whether to benchmark search operation on GPU index')
+aa('--raft_only', default=False, action='store_true',
+   help='whether to only produce RAFT enabled benchmarks')
+
+
+group = parser.add_argument_group('IVF options')
+aa('--n_centroids', default=256, type=int,
+   help="number of IVF centroids")
+
+
+group = parser.add_argument_group('searching')
+
+aa('--k', default=100, type=int, help='nb of nearest neighbors')
+aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')
+
+args = parser.parse_args()
+
+print("args:", args)
+
+rs = np.random.RandomState(123)
+
+res = faiss.StandardGpuResources()
+
+# Use an RMM pool memory resource for device allocations
+mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
+rmm.mr.set_current_device_resource(mr)
+
+
+def bench_train_milliseconds(index, trainVecs, use_raft):
+    co = faiss.GpuMultipleClonerOptions()
+    co.use_raft = use_raft
+    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
+    t0 = time.time()
+    index_gpu.train(trainVecs)
+    return 1000*(time.time() - t0)
+
+
+if args.bm_train:
+    print("=" * 40)
+    print("GPU Train Benchmarks")
+    print("=" * 40)
+    trainset_sizes = [5000, 10000, 100000, 1000000, 5000000]
+    dataset_dims = [128, 256, 1024]
+    for n_rows in trainset_sizes:
+        for n_cols in dataset_dims:
+            index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
+            trainVecs = rs.rand(n_rows, n_cols).astype('float32')
+            raft_gpu_train_time = bench_train_milliseconds(
+                index, trainVecs, True)
+            if args.raft_only:
+                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, raft_gpu_train_time))
+            else:
+                classical_gpu_train_time = bench_train_milliseconds(
+                    index, trainVecs, False)
+                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, classical_gpu_train_time, raft_gpu_train_time))
+
+
+def bench_add_milliseconds(index, addVecs, use_raft):
+    co = faiss.GpuMultipleClonerOptions()
+    co.use_raft = use_raft
+    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
+    index_gpu.copyFrom(index)
+    t0 = time.time()
+    index_gpu.add(addVecs)
+    return 1000*(time.time() - t0)
+
+
+if args.bm_add:
+    print("=" * 40)
+    print("GPU Add Benchmarks")
+    print("=" * 40)
+    addset_sizes = [5000, 10000, 100000, 1000000]
+    dataset_dims = [128, 256, 1024]
+    n_train = 10000
+    # train one index per dimensionality, then benchmark adds of each size
+    for n_cols in dataset_dims:
+        trainVecs = rs.rand(n_train, n_cols).astype('float32')
+        index = faiss.index_factory(
+            n_cols, "IVF" + str(args.n_centroids) + ",Flat")
+        index.train(trainVecs)
+        for n_rows in addset_sizes:
+            addVecs = rs.rand(n_rows, n_cols).astype('float32')
+            raft_gpu_add_time = bench_add_milliseconds(index, addVecs, True)
+            if args.raft_only:
+                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, RAFT enabled GPU add time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, raft_gpu_add_time))
+            else:
+                classical_gpu_add_time = bench_add_milliseconds(
+                    index, addVecs, False)
+                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, classical_gpu_add_time, raft_gpu_add_time))
+
+
+def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
+    co = faiss.GpuMultipleClonerOptions()
+    co.use_raft = use_raft
+    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
+    index_gpu.copyFrom(index)
+    index_gpu.add(addVecs)
+    index_gpu.nprobe = nprobe
+    t0 = time.time()
+    index_gpu.search(queryVecs, k)
+    return 1000*(time.time() - t0)
+
+
+if args.bm_search:
+    print("=" * 40)
+    print("GPU Search Benchmarks")
+    print("=" * 40)
+    queryset_sizes = [5000, 10000, 100000, 500000]
+    n_train = 10000
+    n_add = 100000
+    search_bm_dims = [8, 16, 32]
+    for n_cols in search_bm_dims:
+        index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
+        trainVecs = rs.rand(n_train, n_cols).astype('float32')
+        index.train(trainVecs)
+        addVecs = rs.rand(n_add, n_cols).astype('float32')
+        for n_rows in queryset_sizes:
+            queryVecs = rs.rand(n_rows, n_cols).astype('float32')
+            raft_gpu_search_time = bench_search_milliseconds(
+                index, addVecs, queryVecs, args.nprobe, args.k, True)
+            if args.raft_only:
+                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
+            else:
+                classical_gpu_search_time = bench_search_milliseconds(
+                    index, addVecs, queryVecs, args.nprobe, args.k, False)
+                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))
+
+    print("=" * 40)
+    print("Large RAFT Enabled Benchmarks")
+    print("=" * 40)
+    # Skip the classical GPU benchmarks for these large datasets: they run
+    # out of memory beyond 500000 queries, at large dims, and at large k
+    queryset_sizes = [100000, 500000, 1000000]
+    large_search_bm_dims = [128, 256, 1024]
+    for n_cols in large_search_bm_dims:
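+        # n_train and n_add carry over from the standard search benchmarks
+        # above: 10000 training and 100000 database vectors per dimensionality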
+        trainVecs = rs.rand(n_train, n_cols).astype('float32')
+        index = faiss.index_factory(
+            n_cols, "IVF" + str(args.n_centroids) + ",Flat")
+        index.train(trainVecs)
+        addVecs = rs.rand(n_add, n_cols).astype('float32')
+        for n_rows in queryset_sizes:
+            queryVecs = rs.rand(n_rows, n_cols).astype('float32')
+            raft_gpu_search_time = bench_search_milliseconds(
+                index, addVecs, queryVecs, args.nprobe, args.k, True)
+            print("Method: IVFFlat, Operation: SEARCH, numTrain: %d, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
+                n_train, n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
diff --git a/benchs/bench_ivfpq_raft.py b/benchs/bench_ivfpq_raft.py
new file mode 100644
index 0000000000..3494a18741
--- /dev/null
+++ b/benchs/bench_ivfpq_raft.py
@@ -0,0 +1,168 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import faiss
+import time
+import argparse
+import rmm
+
+from datasets import load_sift1M, evaluate
+
+######################################################
+# Command-line parsing
+######################################################
+
+parser = argparse.ArgumentParser()
+
+
+def aa(*args, **kwargs):
+    group.add_argument(*args, **kwargs)
+
+
+group = parser.add_argument_group('benchmarking options')
+aa('--bm_search', default=True,
+   help='whether to benchmark search operation on GPU index')
+aa('--raft_only', default=False, action='store_true',
+   help='whether to only produce RAFT enabled benchmarks')
+
+group = parser.add_argument_group('IVF options')
+aa('--bits_per_code', default=8, type=int, help='bits per code. Note that < 8 is only supported when RAFT is enabled')
+aa('--pq_len', default=2, type=int, help='number of vector elements represented by one PQ code')
+aa('--use_precomputed', default=True, type=bool, help='use precomputed codes (not with RAFT enabled)')
+
+group = parser.add_argument_group('searching')
+aa('--k', default=10, type=int, help='nb of nearest neighbors')
+aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')
+
+args = parser.parse_args()
+
+print("args:", args)
+
+print("load data")
+xb, xq, xt, gt = load_sift1M()
+
+rs = np.random.RandomState(123)
+
+res = faiss.StandardGpuResources()
+
+# Use an RMM pool memory resource for device allocations
+mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
+rmm.mr.set_current_device_resource(mr)
+
+
+# A heuristic to select a suitable number of lists
+def compute_nlist(numVecs):
+    nlist = np.sqrt(numVecs)
+    if (numVecs / nlist < 1000):
+        nlist = numVecs / 1000
+    return int(nlist)
+
+
+def bench_train_milliseconds(index, trainVecs, use_raft):
+    co = faiss.GpuMultipleClonerOptions()
+    # use float 16 lookup tables to save space
+    co.useFloat16LookupTables = True
+    co.use_raft = use_raft
+    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
+    t0 = time.time()
+    index_gpu.train(trainVecs)
+    return 1000*(time.time() - t0)
+
+
+n_rows, n_cols = xb.shape
+n_train, _ = xt.shape
+M = n_cols // args.pq_len
+nlist = compute_nlist(n_rows)
+index = faiss.index_factory(n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code))
+
+print("=" * 40)
+print("GPU Train Benchmarks")
+print("=" * 40)
+raft_gpu_train_time = bench_train_milliseconds(index, xt, True)
+if args.raft_only:
+    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
+        n_cols, nlist, M, args.bits_per_code, n_train, raft_gpu_train_time))
+else:
+    classical_gpu_train_time = bench_train_milliseconds(
+        index, xt, False)
+    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
+        n_cols, nlist, M, args.bits_per_code, n_train, classical_gpu_train_time, raft_gpu_train_time))
+
+
+def bench_add_milliseconds(index, addVecs, use_raft):
+    co = faiss.GpuMultipleClonerOptions()
+    # use float 16 lookup tables to save space
+    co.useFloat16LookupTables = True
+    co.use_raft = use_raft
+    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
+    index_gpu.copyFrom(index)
+    t0 = time.time()
+    index_gpu.add(addVecs)
+    return 1000*(time.time() - t0)
+
+
+print("=" * 40)
+print("GPU Add Benchmarks")
+print("=" * 40)
+index.train(xt)
+raft_gpu_add_time = bench_add_milliseconds(index, xb, True)
+if args.raft_only:
+    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, RAFT enabled GPU add time: %.3f milliseconds" % (
+        n_cols, nlist, M, args.bits_per_code, n_rows, raft_gpu_add_time))
+else:
+    classical_gpu_add_time = bench_add_milliseconds(
+        index, xb, False)
+    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
+        n_cols, nlist, M, args.bits_per_code, n_rows, classical_gpu_add_time, raft_gpu_add_time))
+
+
+def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
+    co = faiss.GpuMultipleClonerOptions()
+    co.use_raft = use_raft
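+    # use float 16 lookup tables to save space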
co.useFloat16LookupTables = True + index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co) + index_gpu.copyFrom(index) + index_gpu.add(addVecs) + index_gpu.nprobe = nprobe + t0 = time.time() + index_gpu.search(queryVecs, k) + return 1000*(time.time() - t0) + + +if args.bm_search: + print("=" * 40) + print("GPU Search Benchmarks") + print("=" * 40) + queryset_sizes = [1, 10, 100, 1000, 10000] + n_train, n_cols = xt.shape + n_add, _ = xb.shape + print(xq.shape) + M = n_cols // args.pq_len + nlist = compute_nlist(n_add) + index = faiss.index_factory(n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code)) + index.train(xt) + for n_rows in queryset_sizes: + queryVecs = xq[np.random.choice(xq.shape[0], n_rows, replace=False)] + raft_gpu_search_time = bench_search_milliseconds( + index, xb, queryVecs, args.nprobe, args.k, True) + if args.raft_only: + print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % ( + n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time)) + else: + classical_gpu_search_time = bench_search_milliseconds( + index, xb, queryVecs, args.nprobe, args.k, False) + print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % ( + n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time)) \ No newline at end of file diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake index 32ec15b6a4..3e0f6b6ac4 100644 --- a/cmake/thirdparty/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -15,7 +15,7 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. # ============================================================================= -set(RAPIDS_VERSION "23.12") +set(RAPIDS_VERSION "24.02") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 713d3a287c..126cbe5044 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -238,12 +238,15 @@ generate_ivf_interleaved_code() if(FAISS_ENABLE_RAFT) list(APPEND FAISS_GPU_HEADERS - impl/RaftUtils.h + impl/RaftFlatIndex.cuh impl/RaftIVFFlat.cuh - impl/RaftFlatIndex.cuh) + impl/RaftIVFPQ.cuh + utils/RaftUtils.h) list(APPEND FAISS_GPU_SRC impl/RaftFlatIndex.cu - impl/RaftIVFFlat.cu) + impl/RaftIVFFlat.cu + impl/RaftIVFPQ.cu + utils/RaftUtils.cu) endif() add_library(faiss_gpu STATIC ${FAISS_GPU_SRC}) @@ -258,6 +261,28 @@ if(FAISS_ENABLE_RAFT) target_compile_definitions(faiss PUBLIC USE_NVIDIA_RAFT=1) target_compile_definitions(faiss_avx2 PUBLIC USE_NVIDIA_RAFT=1) target_compile_definitions(faiss_avx512 PUBLIC USE_NVIDIA_RAFT=1) + + # Mark all functions as hidden so that we don't generate + # global 'public' functions that also exist in libraft.so + # + # This ensures that faiss functions will call the local version + # inside libfaiss.so . 
This is needed to ensure that things + # like raft cublas resources are created and used within the same + # dynamic library + CUDA runtime context which are requirements + # for valid execution + # + # To still allow these classes to be used by consumers, the + # respective classes/types in the headers are explicitly marked + # as 'public' so they can be used by consumers + set_source_files_properties( + GpuDistance.cu + StandardGpuResources.cpp + impl/RaftFlatIndex.cu + impl/RaftIVFFlat.cu + impl/RaftIVFPQ.cu + utils/RaftUtils.cu + TARGET_DIRECTORY faiss + PROPERTIES COMPILE_OPTIONS "-fvisibility=hidden") target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_RAFT=1) endif() diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp index 4dc51f9e83..20583720f3 100644 --- a/faiss/gpu/GpuCloner.cpp +++ b/faiss/gpu/GpuCloner.cpp @@ -152,8 +152,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) { config.device = device; config.indicesOptions = indicesOptions; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - FAISS_THROW_IF_NOT_MSG( - !use_raft, "this type of index is not implemented for RAFT"); + config.use_raft = use_raft; GpuIndexIVFFlat* res = new GpuIndexIVFFlat( provider, ifl->d, ifl->nlist, ifl->metric_type, config); @@ -204,8 +203,8 @@ Index* ToGpuCloner::clone_Index(const Index* index) { config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.useFloat16LookupTables = useFloat16; config.usePrecomputedTables = usePrecomputed; - FAISS_THROW_IF_NOT_MSG( - !use_raft, "this type of index is not implemented for RAFT"); + config.use_raft = use_raft; + config.interleavedLayout = use_raft; GpuIndexIVFPQ* res = new GpuIndexIVFPQ(provider, ipq, config); diff --git a/faiss/gpu/GpuClonerOptions.h b/faiss/gpu/GpuClonerOptions.h index fbde4c4ea4..197e09dc88 100644 --- a/faiss/gpu/GpuClonerOptions.h +++ b/faiss/gpu/GpuClonerOptions.h @@ -38,7 +38,11 @@ struct GpuClonerOptions { bool verbose = false; /// use the RAFT implementation +#if defined USE_NVIDIA_RAFT + bool use_raft = true; +#else bool use_raft = false; +#endif }; struct GpuMultipleClonerOptions : public GpuClonerOptions { diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index c363aa4bb8..a235404b14 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -31,7 +31,7 @@ #include #if defined USE_NVIDIA_RAFT -#include +#include #include #include #include @@ -51,6 +51,17 @@ using namespace raft::distance; using namespace raft::neighbors; #endif +bool should_use_raft(GpuDistanceParams args) { + cudaDeviceProp prop; + int dev = args.device >= 0 ? 
args.device : getCurrentDevice(); + cudaGetDeviceProperties(&prop, dev); + + if (prop.major < 7) + return false; + + return args.use_raft; +} + template void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) { // Validate the input data @@ -228,8 +239,8 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { #if defined USE_NVIDIA_RAFT // Note: For now, RAFT bfknn requires queries and vectors to be same layout - if (args.use_raft && args.queriesRowMajor == args.vectorsRowMajor) { - DistanceType distance = faiss_to_raft(args.metric, false); + if (should_use_raft(args) && args.queriesRowMajor == args.vectorsRowMajor) { + DistanceType distance = metricFaissToRaft(args.metric, false); auto resImpl = prov->getResources(); auto res = resImpl.get(); @@ -349,9 +360,9 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { RAFT_LOG_INFO("All synced."); } else #else - if (args.use_raft) { + if (should_use_raft(args)) { FAISS_THROW_IF_NOT_MSG( - !args.use_raft, + !should_use_raft(args), "RAFT has not been compiled into the current version so it cannot be used."); } else #endif diff --git a/faiss/gpu/GpuDistance.h b/faiss/gpu/GpuDistance.h index c0dde7fd8c..17dbee617b 100644 --- a/faiss/gpu/GpuDistance.h +++ b/faiss/gpu/GpuDistance.h @@ -9,6 +9,7 @@ #include +#pragma GCC visibility push(default) namespace faiss { namespace gpu { @@ -106,9 +107,14 @@ struct GpuDistanceParams { int device = -1; /// Should the index dispatch down to RAFT? + /// TODO: change default to true if RAFT is enabled bool use_raft = false; }; +/// A function that determines whether RAFT should be used based on various +/// conditions (such as unsupported architecture) +bool should_use_raft(GpuDistanceParams args); + /// A wrapper for gpu/impl/Distance.cuh to expose direct brute-force k-nearest /// neighbor searches on an externally-provided region of memory (e.g., from a /// pytorch tensor). @@ -168,3 +174,4 @@ void bruteForceKnn( } // namespace gpu } // namespace faiss +#pragma GCC visibility pop diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 89952b1121..d667ae1494 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -42,6 +42,16 @@ constexpr idx_t kAddVecSize = (idx_t)512 * 1024; // FIXME: parameterize based on algorithm need constexpr idx_t kSearchVecSize = (idx_t)32 * 1024; +bool should_use_raft(GpuIndexConfig config_) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, config_.device); + + if (prop.major < 7) + return false; + + return config_.use_raft; +} + GpuIndex::GpuIndex( std::shared_ptr resources, int dims, @@ -64,7 +74,7 @@ GpuIndex::GpuIndex( (config_.memorySpace == MemorySpace::Unified && getFullUnifiedMemSupport(config_.device)), "Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)", - config.device); + config_.device); metric_arg = metricArg; @@ -132,7 +142,8 @@ void GpuIndex::addPaged_(idx_t n, const float* x, const idx_t* ids) { if (n > 0) { idx_t totalSize = n * this->d * sizeof(float); - if (totalSize > kAddPageSize || n > kAddVecSize) { + if (!should_use_raft(config_) && + (totalSize > kAddPageSize || n > kAddVecSize)) { // How many vectors fit into kAddPageSize? 
idx_t maxNumVecsForPageSize = kAddPageSize / (this->d * sizeof(float)); diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h index 629a57583d..36de98c098 100644 --- a/faiss/gpu/GpuIndex.h +++ b/faiss/gpu/GpuIndex.h @@ -38,9 +38,17 @@ struct GpuIndexConfig { MemorySpace memorySpace = MemorySpace::Device; /// Should the index dispatch down to RAFT? +#if defined USE_NVIDIA_RAFT + bool use_raft = true; +#else bool use_raft = false; +#endif }; +/// A centralized function that determines whether RAFT should +/// be used based on various conditions (such as unsupported architecture) +bool should_use_raft(GpuIndexConfig config_); + class GpuIndex : public faiss::Index { public: GpuIndex( diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu index ef5757fbbd..d361a7182a 100644 --- a/faiss/gpu/GpuIndexFlat.cu +++ b/faiss/gpu/GpuIndexFlat.cu @@ -6,18 +6,22 @@ */ #include +#include #include #include #include #include #include #include -#include #include #include #include #include +#if defined USE_NVIDIA_RAFT +#include +#endif + namespace faiss { namespace gpu { @@ -91,7 +95,7 @@ GpuIndexFlat::~GpuIndexFlat() {} void GpuIndexFlat::resetIndex_(int dims) { #if defined USE_NVIDIA_RAFT - if (flatConfig_.use_raft) { + if (should_use_raft(config_)) { data_.reset(new RaftFlatIndex( resources_.get(), dims, @@ -99,7 +103,7 @@ void GpuIndexFlat::resetIndex_(int dims) { config_.memorySpace)); } else #else - if (flatConfig_.use_raft) { + if (should_use_raft(config_)) { FAISS_THROW_MSG( "RAFT has not been compiled into the current version so it cannot be used."); } else diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index c83008307d..0c5b8db686 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -16,11 +16,6 @@ #include #include -#if defined USE_NVIDIA_RAFT -#include -#include -#endif - namespace faiss { namespace gpu { @@ -79,9 +74,9 @@ void GpuIndexIVF::init_() { } // here we set a low # iterations because this is typically used - // for large clusterings - // (copying IndexIVF.cpp's Level1Quantizer + // for large clusterings (copying IndexIVF.cpp's Level1Quantizer cp.niter = 10; + cp.verbose = verbose; if (quantizer) { @@ -96,6 +91,7 @@ void GpuIndexIVF::init_() { GpuIndexFlatConfig config = ivfConfig_.flatConfig; // inherit our same device config.device = config_.device; + config.use_raft = config_.use_raft; if (metric_type == faiss::METRIC_L2) { quantizer = new GpuIndexFlatL2(resources_, d, config); @@ -451,43 +447,12 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { quantizer->reset(); -#if defined USE_NVIDIA_RAFT - - if (config_.use_raft) { - const raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); - - raft::neighbors::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = metric_type == faiss::METRIC_L2 - ? 
raft::distance::DistanceType::L2Expanded - : raft::distance::DistanceType::InnerProduct; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_trainset_fraction = 1.0; - raft_idx_params.kmeans_n_iters = cp.niter; - raft_idx_params.adaptive_centers = !cp.frozen_centroids; + // leverage the CPU-side k-means code, which works for the GPU + // flat index as well + Clustering clus(this->d, nlist, this->cp); + clus.verbose = verbose; + clus.train(n, x, *quantizer); - auto raft_index = raft::neighbors::ivf_flat::build( - raft_handle, raft_idx_params, x, n, (idx_t)d); - - raft_handle.sync_stream(); - - quantizer->train(nlist, raft_index.centers().data_handle()); - quantizer->add(nlist, raft_index.centers().data_handle()); - } else -#else - if (config_.use_raft) { - FAISS_THROW_MSG( - "RAFT has not been compiled into the current version so it cannot be used."); - } else -#endif - { - // leverage the CPU-side k-means code, which works for the GPU - // flat index as well - Clustering clus(this->d, nlist, this->cp); - clus.verbose = verbose; - clus.train(n, x, *quantizer); - } quantizer->is_trained = true; FAISS_ASSERT(quantizer->ntotal == nlist); } diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 750096e153..440b449a50 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -16,7 +16,9 @@ #include #if defined USE_NVIDIA_RAFT +#include #include +#include #endif #include @@ -70,11 +72,14 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( config), ivfFlatConfig_(config), reserveMemoryVecs_(0) { + FAISS_THROW_IF_NOT_MSG( + !should_use_raft(config), + "GpuIndexIVFFlat: RAFT does not support separate coarseQuantizer"); // We could have been passed an already trained coarse quantizer. There is // no other quantizer that we need to train, so this is sufficient if (this->is_trained) { FAISS_ASSERT(this->quantizer); - set_index_( + setIndex_( resources_.get(), this->d, this->nlist, @@ -92,56 +97,13 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( GpuIndexIVFFlat::~GpuIndexIVFFlat() {} -void GpuIndexIVFFlat::set_index_( - GpuResources* resources, - int dim, - int nlist, - faiss::MetricType metric, - float metricArg, - bool useResidual, - /// Optional ScalarQuantizer - faiss::ScalarQuantizer* scalarQ, - bool interleavedLayout, - IndicesOptions indicesOptions, - MemorySpace space) { -#if defined USE_NVIDIA_RAFT +void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { + DeviceScope scope(config_.device); - if (config_.use_raft) { - index_.reset(new RaftIVFFlat( - resources, - dim, - nlist, - metric, - metricArg, - useResidual, - scalarQ, - interleavedLayout, - indicesOptions, - space)); - } else -#else - if (config_.use_raft) { + if (should_use_raft(config_)) { FAISS_THROW_MSG( - "RAFT has not been compiled into the current version so it cannot be used."); - } else -#endif - { - index_.reset(new IVFFlat( - resources, - dim, - nlist, - metric, - metricArg, - useResidual, - scalarQ, - interleavedLayout, - indicesOptions, - space)); + "Pre-allocation of IVF lists is not supported with RAFT enabled."); } -} - -void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { - DeviceScope scope(config_.device); reserveMemoryVecs_ = numVecs; if (index_) { @@ -157,7 +119,11 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // Clear out our old data index_.reset(); - baseIndex_.reset(); + + // skip base class allocations if RAFT is enabled + if (!should_use_raft(config_)) { + baseIndex_.reset(); + } // The other index might not be trained if (!index->is_trained) { @@ 
-169,7 +135,7 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { FAISS_ASSERT(is_trained); // Copy our lists as well - set_index_( + setIndex_( resources_.get(), d, nlist, @@ -247,23 +213,61 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { if (this->is_trained) { FAISS_ASSERT(index_); + if (should_use_raft(config_)) { + // if RAFT is enabled, copy the IVF centroids to the RAFT index in + // case it has been reset. This is because reset clears the RAFT + // index and its centroids. + // TODO: change this once the coarse quantizer is separated from + // RAFT index + updateQuantizer(); + }; return; } FAISS_ASSERT(!index_); + if (should_use_raft(config_)) { #if defined USE_NVIDIA_RAFT - if (config_.use_raft) { - // No need to copy the data to host - trainQuantizer_(n, x); - } else + setIndex_( + resources_.get(), + this->d, + this->nlist, + this->metric_type, + this->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.interleavedLayout, + ivfFlatConfig_.indicesOptions, + config_.memorySpace); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + raft::neighbors::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = metricFaissToRaft(metric_type, false); + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_trainset_fraction = + static_cast(cp.max_points_per_centroid * nlist) / + static_cast(n); + raft_idx_params.kmeans_n_iters = cp.niter; + + auto raftIndex_ = + std::static_pointer_cast(index_); + + raft::neighbors::ivf_flat::index raft_ivfflat_index = + raft::neighbors::ivf_flat::build( + raft_handle, raft_idx_params, x, n, (idx_t)d); + + quantizer->train(nlist, raft_ivfflat_index.centers().data_handle()); + quantizer->add(nlist, raft_ivfflat_index.centers().data_handle()); + raft_handle.sync_stream(); + + raftIndex_->setRaftIndex(std::move(raft_ivfflat_index)); #else - if (config_.use_raft) { FAISS_THROW_MSG( "RAFT has not been compiled into the current version so it cannot be used."); - } else #endif - { + } else { // FIXME: GPUize more of this // First, make sure that the data is resident on the CPU, if it is not // on the CPU, as we depend upon parts of the CPU code @@ -272,29 +276,85 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { resources_->getDefaultStream(config_.device), {n, this->d}); trainQuantizer_(n, hostData.data()); + + setIndex_( + resources_.get(), + this->d, + this->nlist, + this->metric_type, + this->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.interleavedLayout, + ivfFlatConfig_.indicesOptions, + config_.memorySpace); + updateQuantizer(); } // The quantizer is now trained; construct the IVF index - set_index_( - resources_.get(), - this->d, - this->nlist, - this->metric_type, - this->metric_arg, - false, // no residual - nullptr, // no scalar quantizer - ivfFlatConfig_.interleavedLayout, - ivfFlatConfig_.indicesOptions, - config_.memorySpace); baseIndex_ = std::static_pointer_cast(index_); - updateQuantizer(); if (reserveMemoryVecs_) { - index_->reserveMemory(reserveMemoryVecs_); + if (should_use_raft(config_)) { + FAISS_THROW_MSG( + "Pre-allocation of IVF lists is not supported with RAFT enabled."); + } else + index_->reserveMemory(reserveMemoryVecs_); } this->is_trained = true; } +void GpuIndexIVFFlat::setIndex_( + GpuResources* resources, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional 
ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space) { + if (should_use_raft(config_)) { +#if defined USE_NVIDIA_RAFT + FAISS_THROW_IF_NOT_MSG( + ivfFlatConfig_.indicesOptions == INDICES_64_BIT, + "RAFT only supports INDICES_64_BIT"); + if (!ivfFlatConfig_.interleavedLayout) { + fprintf(stderr, + "WARN: interleavedLayout is set to False with RAFT enabled. This will be ignored.\n"); + } + index_.reset(new RaftIVFFlat( + resources, + dim, + nlist, + metric, + metricArg, + useResidual, + scalarQ, + interleavedLayout, + indicesOptions, + space)); +#else + FAISS_THROW_MSG( + "RAFT has not been compiled into the current version so it cannot be used."); +#endif + } else { + index_.reset(new IVFFlat( + resources, + dim, + nlist, + metric, + metricArg, + useResidual, + scalarQ, + interleavedLayout, + indicesOptions, + space)); + } +} + } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index d7508feef4..678bf8e7f4 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -88,7 +88,8 @@ class GpuIndexIVFFlat : public GpuIndexIVF { void train(idx_t n, const float* x) override; protected: - void set_index_( + /// Initialize appropriate index + void setIndex_( GpuResources* resources, int dim, int nlist, @@ -101,6 +102,7 @@ class GpuIndexIVFFlat : public GpuIndexIVF { IndicesOptions indicesOptions, MemorySpace space); + protected: /// Our configuration options const GpuIndexIVFFlatConfig ivfFlatConfig_; diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu index 69c4cf0556..d39f036b89 100644 --- a/faiss/gpu/GpuIndexIVFPQ.cu +++ b/faiss/gpu/GpuIndexIVFPQ.cu @@ -15,6 +15,13 @@ #include #include +#if defined USE_NVIDIA_RAFT +#include +#include +#include +#include +#endif + #include namespace faiss { @@ -87,6 +94,10 @@ GpuIndexIVFPQ::GpuIndexIVFPQ( // instance this->is_trained = false; + FAISS_THROW_IF_NOT_MSG( + !config.use_raft, + "GpuIndexIVFPQ: RAFT does not support separate coarseQuantizer"); + verifyPQSettings_(); } @@ -100,7 +111,11 @@ void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { // Clear out our old data index_.reset(); - baseIndex_.reset(); + + // skip base class allocations if RAFT is enabled + if (!should_use_raft(config_)) { + baseIndex_.reset(); + } pq = index->pq; subQuantizers_ = index->pq.M; @@ -127,7 +142,7 @@ void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { // Copy our lists as well // The product quantizer must have data in it FAISS_ASSERT(index->pq.centroids.size() > 0); - index_.reset(new IVFPQ( + setIndex_( resources_.get(), this->d, this->nlist, @@ -140,7 +155,7 @@ void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { ivfpqConfig_.interleavedLayout, (float*)index->pq.centroids.data(), ivfpqConfig_.indicesOptions, - config_.memorySpace)); + config_.memorySpace); baseIndex_ = std::static_pointer_cast(index_); // Doesn't make sense to reserve memory here @@ -169,7 +184,7 @@ void GpuIndexIVFPQ::copyTo(faiss::IndexIVFPQ* index) const { // index->by_residual = true; index->use_precomputed_table = 0; - index->code_size = subQuantizers_; + index->code_size = utils::divUp(subQuantizers_ * bitsPerCode_, 8); index->pq = faiss::ProductQuantizer(this->d, subQuantizers_, bitsPerCode_); index->do_polysemous_training = false; @@ -308,6 +323,7 @@ void GpuIndexIVFPQ::trainResidualQuantizer_(idx_t n, const float* x) { try { GpuIndexFlatConfig config; config.device = ivfpqConfig_.device; + 
config.use_raft = false; GpuIndexFlatL2 pqIndex(resources_, pq.dsub, config); pq.assign_index = &pqIndex; @@ -322,29 +338,6 @@ void GpuIndexIVFPQ::trainResidualQuantizer_(idx_t n, const float* x) { // use the currently assigned clustering index pq.train(n, residuals.data()); } - - index_.reset(new IVFPQ( - resources_.get(), - this->d, - this->nlist, - metric_type, - metric_arg, - subQuantizers_, - bitsPerCode_, - ivfpqConfig_.useFloat16LookupTables, - ivfpqConfig_.useMMCodeDistance, - ivfpqConfig_.interleavedLayout, - pq.centroids.data(), - ivfpqConfig_.indicesOptions, - config_.memorySpace)); - baseIndex_ = std::static_pointer_cast(index_); - updateQuantizer(); - - if (reserveMemoryVecs_) { - index_->reserveMemory(reserveMemoryVecs_); - } - - index_->setPrecomputedCodes(quantizer, usePrecomputedTables_); } void GpuIndexIVFPQ::train(idx_t n, const float* x) { @@ -356,27 +349,179 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) { if (this->is_trained) { FAISS_ASSERT(index_); + if (should_use_raft(config_)) { + // if RAFT is enabled, copy the IVF centroids to the RAFT index in + // case it has been reset. This is because reset clears the RAFT + // index and its centroids. + // TODO: change this once the coarse quantizer is separated from + // RAFT index + updateQuantizer(); + }; return; } FAISS_ASSERT(!index_); - // FIXME: GPUize more of this - // First, make sure that the data is resident on the CPU, if it is not on - // the CPU, as we depend upon parts of the CPU code - auto hostData = toHost( - (float*)x, - resources_->getDefaultStream(config_.device), - {n, this->d}); + // RAFT does not support using an external index for assignment. Fall back + // to the classical GPU impl + if (should_use_raft(config_)) { +#if defined USE_NVIDIA_RAFT + if (pq.assign_index) { + fprintf(stderr, + "WARN: The Product Quantizer's assign_index will be ignored with RAFT enabled.\n"); + } + // first initialize the index. The PQ centroids will be updated + // retroactively. 
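+        // RAFT trains its own PQ codebooks during build; they are copied
+        // back into `pq` below so the CPU-side ProductQuantizer stays in
+        // sync with the device index.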
+ setIndex_( + resources_.get(), + this->d, + this->nlist, + metric_type, + metric_arg, + subQuantizers_, + bitsPerCode_, + ivfpqConfig_.useFloat16LookupTables, + ivfpqConfig_.useMMCodeDistance, + ivfpqConfig_.interleavedLayout, + pq.centroids.data(), + ivfpqConfig_.indicesOptions, + config_.memorySpace); + // No need to copy the data to host + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + raft::neighbors::ivf_pq::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = metricFaissToRaft(metric_type, false); + raft_idx_params.kmeans_trainset_fraction = + static_cast(cp.max_points_per_centroid * nlist) / + static_cast(n); + raft_idx_params.kmeans_n_iters = cp.niter; + raft_idx_params.pq_bits = bitsPerCode_; + raft_idx_params.pq_dim = subQuantizers_; + raft_idx_params.conservative_memory_allocation = false; + raft_idx_params.add_data_on_build = false; + + auto raftIndex_ = std::static_pointer_cast(index_); + + raft::neighbors::ivf_pq::index raft_ivfpq_index = + raft::neighbors::ivf_pq::build( + raft_handle, raft_idx_params, x, n, (idx_t)d); + + auto raft_centers = raft::make_device_matrix( + raft_handle, + raft_ivfpq_index.n_lists(), + raft_ivfpq_index.dim()); + raft::neighbors::ivf_pq::helpers::extract_centers( + raft_handle, raft_ivfpq_index, raft_centers.view()); + + quantizer->train(nlist, raft_centers.data_handle()); + quantizer->add(nlist, raft_centers.data_handle()); + + raft::copy( + pq.get_centroids(0, 0), + raft_ivfpq_index.pq_centers().data_handle(), + raft_ivfpq_index.pq_centers().size(), + raft_handle.get_stream()); + raft_handle.sync_stream(); + raftIndex_->setRaftIndex(std::move(raft_ivfpq_index)); +#else + FAISS_THROW_MSG( + "RAFT has not been compiled into the current version so it cannot be used."); +#endif + } else { + // FIXME: GPUize more of this + // First, make sure that the data is resident on the CPU, if it is not + // on the CPU, as we depend upon parts of the CPU code + auto hostData = toHost( + (float*)x, + resources_->getDefaultStream(config_.device), + {n, this->d}); + + trainQuantizer_(n, hostData.data()); + trainResidualQuantizer_(n, hostData.data()); + + setIndex_( + resources_.get(), + this->d, + this->nlist, + metric_type, + metric_arg, + subQuantizers_, + bitsPerCode_, + ivfpqConfig_.useFloat16LookupTables, + ivfpqConfig_.useMMCodeDistance, + ivfpqConfig_.interleavedLayout, + pq.centroids.data(), + ivfpqConfig_.indicesOptions, + config_.memorySpace); + updateQuantizer(); + } + baseIndex_ = std::static_pointer_cast(index_); - trainQuantizer_(n, hostData.data()); - trainResidualQuantizer_(n, hostData.data()); + if (reserveMemoryVecs_) { + index_->reserveMemory(reserveMemoryVecs_); + } + + index_->setPrecomputedCodes(quantizer, usePrecomputedTables_); FAISS_ASSERT(index_); this->is_trained = true; } +void GpuIndexIVFPQ::setIndex_( + GpuResources* resources, + int dim, + idx_t nlist, + faiss::MetricType metric, + float metricArg, + int numSubQuantizers, + int bitsPerSubQuantizer, + bool useFloat16LookupTables, + bool useMMCodeDistance, + bool interleavedLayout, + float* pqCentroidData, + IndicesOptions indicesOptions, + MemorySpace space) { + if (should_use_raft(config_)) { +#if defined USE_NVIDIA_RAFT + index_.reset(new RaftIVFPQ( + resources, + dim, + nlist, + metric, + metricArg, + numSubQuantizers, + bitsPerSubQuantizer, + useFloat16LookupTables, + useMMCodeDistance, + interleavedLayout, + pqCentroidData, + indicesOptions, + space)); +#else + FAISS_THROW_MSG( + "RAFT has not 
been compiled into the current version so it cannot be used."); +#endif + } else { + index_.reset(new IVFPQ( + resources, + dim, + nlist, + metric, + metricArg, + numSubQuantizers, + bitsPerSubQuantizer, + useFloat16LookupTables, + useMMCodeDistance, + interleavedLayout, + pqCentroidData, + indicesOptions, + space)); + } +} + void GpuIndexIVFPQ::verifyPQSettings_() const { // Our implementation has these restrictions: @@ -384,28 +529,36 @@ void GpuIndexIVFPQ::verifyPQSettings_() const { FAISS_THROW_IF_NOT_MSG(nlist > 0, "nlist must be >0"); // up to a single byte per code - if (ivfpqConfig_.interleavedLayout) { + if (should_use_raft(config_)) { + if (!ivfpqConfig_.interleavedLayout) { + fprintf(stderr, + "WARN: interleavedLayout is set to False with RAFT enabled. This will be ignored.\n"); + } FAISS_THROW_IF_NOT_FMT( - bitsPerCode_ == 4 || bitsPerCode_ == 5 || bitsPerCode_ == 6 || - bitsPerCode_ == 8, - "Bits per code must be between 4, 5, 6 or 8 (passed %d)", + bitsPerCode_ >= 4 && bitsPerCode_ <= 8, + "Bits per code must be within closed range [4,8] (passed %d)", bitsPerCode_); - - } else { FAISS_THROW_IF_NOT_FMT( - bitsPerCode_ == 8, - "Bits per code must be 8 (passed %d)", - bitsPerCode_); + (bitsPerCode_ * subQuantizers_) % 8 == 0, + "`Bits per code * number of sub-quantizers must be a multiple of 8, (passed %u * %u = %u).", + bitsPerCode_, + subQuantizers_, + bitsPerCode_ * subQuantizers_); + } else { + if (ivfpqConfig_.interleavedLayout) { + FAISS_THROW_IF_NOT_FMT( + bitsPerCode_ == 4 || bitsPerCode_ == 5 || + bitsPerCode_ == 6 || bitsPerCode_ == 8, + "Bits per code must be between 4, 5, 6 or 8 (passed %d)", + bitsPerCode_); + } else { + FAISS_THROW_IF_NOT_FMT( + bitsPerCode_ == 8, + "Bits per code must be 8 (passed %d)", + bitsPerCode_); + } } - // Sub-quantizers must evenly divide dimensions available - FAISS_THROW_IF_NOT_FMT( - this->d % subQuantizers_ == 0, - "Number of sub-quantizers (%d) must be an " - "even divisor of the number of dimensions (%d)", - subQuantizers_, - this->d); - // The number of bytes per encoded vector must be one we support FAISS_THROW_IF_NOT_FMT( ivfpqConfig_.interleavedLayout || @@ -414,30 +567,40 @@ void GpuIndexIVFPQ::verifyPQSettings_() const { "is not supported", subQuantizers_); - // We must have enough shared memory on the current device to store - // our lookup distances - int lookupTableSize = sizeof(float); - if (ivfpqConfig_.useFloat16LookupTables) { - lookupTableSize = sizeof(half); - } + if (!should_use_raft(config_)) { + // Sub-quantizers must evenly divide dimensions available + FAISS_THROW_IF_NOT_FMT( + this->d % subQuantizers_ == 0, + "Number of sub-quantizers (%d) must be an " + "even divisor of the number of dimensions (%d)", + subQuantizers_, + this->d); + + // We must have enough shared memory on the current device to store + // our lookup distances + int lookupTableSize = sizeof(float); + if (ivfpqConfig_.useFloat16LookupTables) { + lookupTableSize = sizeof(half); + } - // 64 bytes per code is only supported with usage of float16, at 2^8 - // codes per subquantizer - size_t requiredSmemSize = - lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_); - size_t smemPerBlock = getMaxSharedMemPerBlock(config_.device); + // 64 bytes per code is only supported with usage of float16, at 2^8 + // codes per subquantizer + size_t requiredSmemSize = + lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_); + size_t smemPerBlock = getMaxSharedMemPerBlock(config_.device); - FAISS_THROW_IF_NOT_FMT( - requiredSmemSize <= 
getMaxSharedMemPerBlock(config_.device), - "Device %d has %zu bytes of shared memory, while " - "%d bits per code and %d sub-quantizers requires %zu " - "bytes. Consider useFloat16LookupTables and/or " - "reduce parameters", - config_.device, - smemPerBlock, - bitsPerCode_, - subQuantizers_, - requiredSmemSize); + FAISS_THROW_IF_NOT_FMT( + requiredSmemSize <= getMaxSharedMemPerBlock(config_.device), + "Device %d has %zu bytes of shared memory, while " + "%d bits per code and %d sub-quantizers requires %zu " + "bytes. Consider useFloat16LookupTables and/or " + "reduce parameters", + config_.device, + smemPerBlock, + bitsPerCode_, + subQuantizers_, + requiredSmemSize); + } } } // namespace gpu diff --git a/faiss/gpu/GpuIndexIVFPQ.h b/faiss/gpu/GpuIndexIVFPQ.h index 22e9961675..1084d4d0d2 100644 --- a/faiss/gpu/GpuIndexIVFPQ.h +++ b/faiss/gpu/GpuIndexIVFPQ.h @@ -33,7 +33,8 @@ struct GpuIndexIVFPQConfig : public GpuIndexIVFConfig { bool usePrecomputedTables = false; /// Use the alternative memory layout for the IVF lists - /// WARNING: this is a feature under development, do not use! + /// WARNING: this is a feature under development, and is only supported with + /// RAFT enabled for the index. Do not use if RAFT is not enabled. bool interleavedLayout = false; /// Use GEMM-backed computation of PQ code distances for the no precomputed @@ -133,6 +134,22 @@ class GpuIndexIVFPQ : public GpuIndexIVF { ProductQuantizer pq; protected: + /// Initialize appropriate index + void setIndex_( + GpuResources* resources, + int dim, + idx_t nlist, + faiss::MetricType metric, + float metricArg, + int numSubQuantizers, + int bitsPerSubQuantizer, + bool useFloat16LookupTables, + bool useMMCodeDistance, + bool interleavedLayout, + float* pqCentroidData, + IndicesOptions indicesOptions, + MemorySpace space); + /// Throws errors if configuration settings are improper void verifyPQSettings_() const; diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index 7d0459955b..fc6dd591b4 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -32,6 +32,7 @@ #if defined USE_NVIDIA_RAFT #include +#include #endif namespace faiss { @@ -159,6 +160,10 @@ struct AllocRequest : public AllocInfo { /// The size in bytes of the allocation size_t size = 0; + +#if defined USE_NVIDIA_RAFT + rmm::mr::device_memory_resource* mr = nullptr; +#endif }; /// A RAII object that manages a temporary memory request diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index 754025d049..004f80a27e 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -22,11 +22,10 @@ #if defined USE_NVIDIA_RAFT #include -#include #include +#include #include #include - #endif #include @@ -92,9 +91,8 @@ std::string allocsToString(const std::unordered_map& map) { StandardGpuResourcesImpl::StandardGpuResourcesImpl() : #if defined USE_NVIDIA_RAFT - cmr(new rmm::mr::cuda_memory_resource), - mmr(new rmm::mr::managed_memory_resource), - pmr(new rmm::mr::pinned_memory_resource), + mmr_(new rmm::mr::managed_memory_resource), + pmr_(new rmm::mr::pinned_memory_resource), #endif pinnedMemAlloc_(nullptr), pinnedMemAllocSize_(0), @@ -161,7 +159,7 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() { if (pinnedMemAlloc_) { #if defined USE_NVIDIA_RAFT - pmr->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_); + pmr_->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_); #else auto err = cudaFreeHost(pinnedMemAlloc_); FAISS_ASSERT_FMT( @@ -314,7 +312,7 @@ void 
StandardGpuResourcesImpl::initializeForDevice(int device) { // pinned memory allocation if (defaultStreams_.empty() && pinnedMemSize_ > 0) { try { - pinnedMemAlloc_ = pmr->allocate(pinnedMemSize_); + pinnedMemAlloc_ = pmr_->allocate(pinnedMemSize_); } catch (const std::bad_alloc& rmm_ex) { FAISS_THROW_MSG("CUDA memory allocation error"); } @@ -478,8 +476,6 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) { void* p = nullptr; if (adjReq.space == MemorySpace::Temporary) { - // If we don't have enough space in our temporary memory manager, we - // need to allocate this request separately auto& tempMem = tempMemory_[adjReq.device]; if (adjReq.size > tempMem->getSizeAvailable()) { @@ -500,11 +496,14 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) { // Otherwise, we can handle this locally p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size); - } else if (adjReq.space == MemorySpace::Device) { #if defined USE_NVIDIA_RAFT try { - p = cmr->allocate(adjReq.size, adjReq.stream); + rmm::mr::device_memory_resource* current_mr = + rmm::mr::get_per_device_resource( + rmm::cuda_device_id{adjReq.device}); + p = current_mr->allocate_async(adjReq.size, adjReq.stream); + adjReq.mr = current_mr; } catch (const std::bad_alloc& rmm_ex) { FAISS_THROW_MSG("CUDA memory allocation error"); } @@ -514,8 +513,8 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) { // Throw if we fail to allocate if (err != cudaSuccess) { // FIXME: as of CUDA 11, a memory allocation error appears to be - // presented via cudaGetLastError as well, and needs to be cleared. - // Just call the function to clear it + // presented via cudaGetLastError as well, and needs to be + // cleared. Just call the function to clear it cudaGetLastError(); std::stringstream ss; @@ -534,7 +533,12 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) { } else if (adjReq.space == MemorySpace::Unified) { #if defined USE_NVIDIA_RAFT try { - p = mmr->allocate(adjReq.size, adjReq.stream); + // for now, use our own managed MR to do Unified Memory allocations. + // TODO: change this to use the current device resource once RMM has + // a way to retrieve a "guaranteed" managed memory resource for a + // device. 
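+            // record the memory resource on the request so deallocMemory()
+            // can issue the matching deallocate_async on the same resource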
+ p = mmr_->allocate_async(adjReq.size, adjReq.stream); + adjReq.mr = mmr_.get(); } catch (const std::bad_alloc& rmm_ex) { FAISS_THROW_MSG("CUDA memory allocation error"); } @@ -593,16 +597,11 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) { if (req.space == MemorySpace::Temporary) { tempMemory_[device]->deallocMemory(device, req.stream, req.size, p); - } else if ( req.space == MemorySpace::Device || req.space == MemorySpace::Unified) { #if defined USE_NVIDIA_RAFT - if (req.space == MemorySpace::Device) { - cmr->deallocate(p, req.size, req.stream); - } else if (req.space == MemorySpace::Unified) { - mmr->deallocate(p, req.size, req.stream); - } + req.mr->deallocate_async(p, req.size, req.stream); #else auto err = cudaFree(p); FAISS_ASSERT_FMT( diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index 9113de573c..661c784aee 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -24,8 +24,6 @@ #if defined USE_NVIDIA_RAFT #include -#include -#include #include #endif @@ -37,6 +35,7 @@ #include #include +#pragma GCC visibility push(default) namespace faiss { namespace gpu { @@ -166,14 +165,11 @@ class StandardGpuResourcesImpl : public GpuResources { * to create a subclass only for the RMM memory resources. */ - // cuda_memory_resource - std::unique_ptr cmr; - // managed_memory_resource - std::unique_ptr mmr; + std::unique_ptr mmr_; // pinned_memory_resource - std::unique_ptr pmr; + std::unique_ptr pmr_; #endif /// Pinned memory allocation for use with this GPU @@ -260,3 +256,4 @@ class StandardGpuResources : public GpuResourcesProvider { } // namespace gpu } // namespace faiss +#pragma GCC visibility pop diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index 2bb319d002..6b1f2ac394 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -41,7 +41,7 @@ class IVFBase { virtual ~IVFBase(); /// Reserve GPU memory in our inverted lists for this number of vectors - void reserveMemory(idx_t numVecs); + virtual void reserveMemory(idx_t numVecs); /// Clear out all inverted lists, but retain the coarse quantizer /// and the product quantizer info @@ -52,7 +52,7 @@ class IVFBase { /// After adding vectors, one can call this to reclaim device memory /// to exactly the amount needed. Returns space reclaimed in bytes - size_t reclaimMemory(); + virtual size_t reclaimMemory(); /// Returns the number of inverted lists idx_t getNumLists() const; diff --git a/faiss/gpu/impl/IVFPQ.cuh b/faiss/gpu/impl/IVFPQ.cuh index 3670e58edf..0d17b02c9b 100644 --- a/faiss/gpu/impl/IVFPQ.cuh +++ b/faiss/gpu/impl/IVFPQ.cuh @@ -39,7 +39,7 @@ class IVFPQ : public IVFBase { /// Enable or disable pre-computed codes. The quantizer is needed to gather /// the IVF centroids for use - void setPrecomputedCodes(Index* coarseQuantizer, bool enable); + virtual void setPrecomputedCodes(Index* coarseQuantizer, bool enable); /// Returns our set of sub-quantizers of the form /// (sub q)(code id)(sub dim) @@ -134,7 +134,7 @@ class IVFPQ : public IVFBase { Tensor& outDistances, Tensor& outIndices); - private: + protected: /// Number of sub-quantizers per vector const int numSubQuantizers_; diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index 8f5c491163..24a6d39604 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -20,7 +20,7 @@ * limitations under the License. 
*/ -#include +#include #include #include @@ -91,7 +91,7 @@ void RaftFlatIndex::query( outDistances.getSize(0), outDistances.getSize(1)); - DistanceType distance = faiss_to_raft(metric, exactDistance); + DistanceType distance = metricFaissToRaft(metric, exactDistance); std::optional> norms_view = raft::make_device_vector_view( diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh b/faiss/gpu/impl/RaftFlatIndex.cuh index 010c5aebce..d3823bbf58 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cuh +++ b/faiss/gpu/impl/RaftFlatIndex.cuh @@ -28,6 +28,7 @@ #include #include +#pragma GCC visibility push(default) namespace faiss { namespace gpu { @@ -67,3 +68,4 @@ class RaftFlatIndex : public FlatIndex { } // namespace gpu } // namespace faiss +#pragma GCC visibility pop diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 2c6afb795c..1e310723d0 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -23,31 +23,19 @@ #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include -#include #include -#include -#include #include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include #include #include +#include + +#include +#include namespace faiss { namespace gpu { @@ -71,18 +59,31 @@ RaftIVFFlat::RaftIVFFlat( useResidual, scalarQ, interleavedLayout, + // skip ptr allocations in base class (handled by RAFT + // internally) indicesOptions, space) { FAISS_THROW_IF_NOT_MSG( indicesOptions == INDICES_64_BIT, "only INDICES_64_BIT is supported for RAFT index"); - reset(); } RaftIVFFlat::~RaftIVFFlat() {} -/// Find the approximate k nearest neighbors for `queries` against -/// our database +void RaftIVFFlat::reserveMemory(idx_t numVecs) { + fprintf(stderr, + "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with RAFT enabled.\n"); +} + +void RaftIVFFlat::reset() { + raft_knn_index.reset(); +} + +void RaftIVFFlat::setRaftIndex( + raft::neighbors::ivf_flat::index&& idx) { + raft_knn_index.emplace(std::move(idx)); +} + void RaftIVFFlat::search( Index* coarseQuantizer, Tensor& queries, @@ -90,7 +91,9 @@ void RaftIVFFlat::search( int k, Tensor& outDistances, Tensor& outIndices) { - // TODO: We probably don't want to ignore the coarse quantizer here... + /// NB: The coarse quantizer is ignored here. The user is assumed to have + /// called updateQuantizer() to modify the RAFT index if the quantizer was + /// modified externally uint32_t numQueries = queries.getSize(0); uint32_t cols = queries.getSize(1); @@ -125,7 +128,7 @@ void RaftIVFFlat::search( /// Identify NaN rows and mask their nearest neighbors auto nan_flag = raft::make_device_vector(raft_handle, numQueries); - validRowIndices_(queries, nan_flag.data_handle()); + validRowIndices(resources_, queries, nan_flag.data_handle()); raft::linalg::map_offset( raft_handle, @@ -154,65 +157,22 @@ void RaftIVFFlat::search( }); } -/// Classify and encode/add vectors to our IVF lists. -/// The input data must be on our current device. -/// Returns the number of vectors successfully added. Vectors may -/// not be able to be added because they contain NaNs. idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { - /// TODO: We probably don't want to ignore the coarse quantizer here + /// NB: The coarse quantizer is ignored here. 
The user is assumed to have + /// called updateQuantizer() to update the RAFT index if the quantizer was + /// modified externally - idx_t n_rows = vecs.getSize(0); + FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - /// Remove NaN values - auto nan_flag = raft::make_device_vector(raft_handle, n_rows); - - validRowIndices_(vecs, nan_flag.data_handle()); - - idx_t n_rows_valid = thrust::reduce( - raft_handle.get_thrust_policy(), - nan_flag.data_handle(), - nan_flag.data_handle() + n_rows, - 0); - - if (n_rows_valid < n_rows) { - auto gather_indices = raft::make_device_vector( - raft_handle, n_rows_valid); - - auto count = thrust::make_counting_iterator(0); + /// Remove rows containing NaNs + idx_t n_rows_valid = inplaceGatherFilteredRows(resources_, vecs, indices); - thrust::copy_if( - raft_handle.get_thrust_policy(), - count, - count + n_rows, - gather_indices.data_handle(), - [nan_flag = nan_flag.data_handle()] __device__(auto i) { - return nan_flag[i]; - }); - - raft::matrix::gather( - raft_handle, - raft::make_device_matrix_view( - vecs.data(), n_rows, dim_), - raft::make_const_mdspan(gather_indices.view()), - (idx_t)16); - - auto valid_indices = raft::make_device_vector( - raft_handle, n_rows_valid); - - raft::matrix::gather( - raft_handle, - raft::make_device_matrix_view( - indices.data(), n_rows, (idx_t)1), - raft::make_const_mdspan(gather_indices.view())); - } - - FAISS_ASSERT(raft_knn_index.has_value()); raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( raft_handle, raft::make_device_matrix_view( @@ -225,10 +185,6 @@ idx_t RaftIVFFlat::addVectors( return n_rows_valid; } -void RaftIVFFlat::reset() { - raft_knn_index.reset(); -} - idx_t RaftIVFFlat::getListLength(idx_t listId) const { FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = @@ -259,10 +215,11 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { // fetch the list indices ptr on host idx_t* list_indices_ptr; - // fetch the list indices ptr on host raft::update_host( &list_indices_ptr, - raft_knn_index.value().inds_ptrs().data_handle() + listId, + const_cast( + raft_knn_index.value().inds_ptrs().data_handle()) + + listId, 1, stream); raft_handle.sync_stream(); @@ -278,7 +235,7 @@ std::vector RaftIVFFlat::getListVectorData( idx_t listId, bool gpuFormat) const { if (gpuFormat) { - FAISS_THROW_MSG("gpuFormat is not suppported for raft indices"); + FAISS_THROW_MSG("gpuFormat should be false for RAFT indices"); } FAISS_ASSERT(raft_knn_index.has_value()); @@ -334,59 +291,71 @@ void RaftIVFFlat::searchPreassigned( } void RaftIVFFlat::updateQuantizer(Index* quantizer) { - idx_t quantizer_ntotal = quantizer->ntotal; + FAISS_THROW_IF_NOT(quantizer->is_trained); - const raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); - auto stream = raft_handle.get_stream(); + // Must match our basic IVF parameters + FAISS_THROW_IF_NOT(quantizer->d == getDim()); + FAISS_THROW_IF_NOT(quantizer->ntotal == getNumLists()); - auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); + size_t total_elems = quantizer->ntotal * quantizer->d; - raft::logger::get().set_level(RAFT_LEVEL_TRACE); + auto stream = resources_->getDefaultStreamCurrentDevice(); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); raft::neighbors::ivf_flat::index_params pams; pams.add_data_on_build = false; - - pams.n_lists = this->numLists_; - - switch (this->metric_) { - 
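// ------------------------------------------------------------------------
// [editor's sketch -- not part of the patch] The metric switch being removed
// here is folded into the shared metricFaissToRaft(metric, exactDistance)
// helper also called from RaftFlatIndex. Based on the mapping the removed
// code performed, a plausible shape is shown below; the bool's effect is
// not visible in this hunk, so it is ignored:
#include <faiss/MetricType.h>
#include <faiss/impl/FaissAssert.h>
#include <raft/distance/distance_types.hpp>

raft::distance::DistanceType metricFaissToRaftSketch(
        faiss::MetricType metric,
        bool /*exactDistance*/) {
    switch (metric) {
        case faiss::METRIC_L2:
            return raft::distance::DistanceType::L2Expanded;
        case faiss::METRIC_INNER_PRODUCT:
            return raft::distance::DistanceType::InnerProduct;
        default:
            FAISS_THROW_MSG("metric is not supported by RAFT indices");
    }
}
// ------------------------------------------------------------------------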
case faiss::METRIC_L2: - pams.metric = raft::distance::DistanceType::L2Expanded; - break; - case faiss::METRIC_INNER_PRODUCT: - pams.metric = raft::distance::DistanceType::InnerProduct; - break; - default: - FAISS_THROW_MSG("Metric is not supported."); + pams.metric = metricFaissToRaft(metric_, false); + pams.n_lists = numLists_; + raft_knn_index.emplace(raft_handle, pams, static_cast(dim_)); + + raft::neighbors::ivf_flat::helpers::reset_index( + raft_handle, &raft_knn_index.value()); + + // If the index instance is a GpuIndexFlat, then we can use direct access to + // the centroids within. + auto gpuQ = dynamic_cast(quantizer); + if (gpuQ) { + auto gpuData = gpuQ->getGpuData(); + + if (gpuData->getUseFloat16()) { + // The FlatIndex keeps its data in float16; we need to reconstruct + // as float32 and store locally + DeviceTensor centroids( + resources_, + makeSpaceAlloc(AllocType::FlatData, space_, stream), + {getNumLists(), getDim()}); + + gpuData->reconstruct(0, gpuData->getSize(), centroids); + + raft::update_device( + raft_knn_index.value().centers().data_handle(), + centroids.data(), + total_elems, + stream); + } else { + /// No reconstruct needed since the centers are already in float32 + auto centroids = gpuData->getVectorsFloat32Ref(); + + raft::update_device( + raft_knn_index.value().centers().data_handle(), + centroids.data(), + total_elems, + stream); + } + } else { + // Otherwise, we need to reconstruct all vectors from the index and copy + // them to the GPU, in order to have access as needed for residual + // computation + auto vecs = std::vector(getNumLists() * getDim()); + quantizer->reconstruct_n(0, quantizer->ntotal, vecs.data()); + + raft::update_device( + raft_knn_index.value().centers().data_handle(), + vecs.data(), + total_elems, + stream); } - - raft_knn_index.emplace(raft_handle, pams, (uint32_t)this->dim_); - - cudaMemsetAsync( - raft_knn_index.value().list_sizes().data_handle(), - 0, - raft_knn_index.value().list_sizes().size() * sizeof(uint32_t), - stream); - cudaMemsetAsync( - raft_knn_index.value().data_ptrs().data_handle(), - 0, - raft_knn_index.value().data_ptrs().size() * sizeof(float*), - stream); - cudaMemsetAsync( - raft_knn_index.value().inds_ptrs().data_handle(), - 0, - raft_knn_index.value().inds_ptrs().size() * sizeof(idx_t*), - stream); - - /// Copy (reconstructed) centroids over, rather than re-training - std::vector buf_host(total_elems); - quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); - - raft::update_device( - raft_knn_index.value().centers().data_handle(), - buf_host.data(), - total_elems, - stream); } void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { @@ -422,6 +391,9 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { // store the list size list_sizes_[i] = static_cast(listSize); + // This RAFT list must currently be empty + FAISS_ASSERT(getListLength(i) == 0); + raft::neighbors::ivf::resize_list( raft_handle, raft_lists[i], @@ -483,12 +455,6 @@ void RaftIVFFlat::addEncodedVectorsToList_( idx_t numVecs) { auto stream = resources_->getDefaultStreamCurrentDevice(); - // This list must already exist - FAISS_ASSERT(raft_knn_index.has_value()); - - // This list must currently be empty - FAISS_ASSERT(getListLength(listId) == 0); - // If there's nothing to add, then there's nothing we have to do if (numVecs == 0) { return; @@ -496,7 +462,6 @@ void RaftIVFFlat::addEncodedVectorsToList_( // The GPU might have a different layout of the memory auto gpuListSizeInBytes = 
getGpuVectorsEncodingSize_(numVecs); - auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); // We only have int32 length representations on the GPU per each // list; the length is in sizeof(char) @@ -541,27 +506,6 @@ void RaftIVFFlat::addEncodedVectorsToList_( raft::update_device(list_indices_ptr, indices, numVecs, stream); } -void RaftIVFFlat::validRowIndices_( - Tensor& vecs, - bool* nan_flag) { - raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); - idx_t n_rows = vecs.getSize(0); - - thrust::fill_n(raft_handle.get_thrust_policy(), nan_flag, n_rows, true); - raft::linalg::map_offset( - raft_handle, - raft::make_device_vector_view(nan_flag, n_rows), - [vecs = vecs.data(), dim_ = this->dim_] __device__(idx_t i) { - for (idx_t col = 0; col < dim_; col++) { - if (!isfinite(vecs[i * dim_ + col])) { - return false; - } - } - return true; - }); -} - RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved( size_t list_size, uint32_t dim, diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 3aba501c9f..4f8c89ecb0 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -22,16 +22,15 @@ #pragma once +#include #include -#include #include -#include - #include #include +#pragma GCC visibility push(default) namespace faiss { namespace gpu { @@ -52,6 +51,9 @@ class RaftIVFFlat : public IVFFlat { ~RaftIVFFlat() override; + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(idx_t numVecs) override; + /// Find the approximate k nearest neigbors for `queries` against /// our database void search( @@ -83,11 +85,7 @@ class RaftIVFFlat : public IVFFlat { Tensor& vecs, Tensor& indices) override; - /// Reserve GPU memory in our inverted lists for this number of vectors - // void reserveMemory(idx_t numVecs) override; - - /// Clear out all inverted lists, but retain the coarse quantizer - /// and the product quantizer info + /// Clear out the Raft index void reset() override; /// For debugging purposes, return the list length of a particular @@ -101,15 +99,17 @@ class RaftIVFFlat : public IVFFlat { std::vector getListVectorData(idx_t listId, bool gpuFormat) const override; + /// Update our Raft index with this quantizer instance; may be a CPU + /// or GPU quantizer void updateQuantizer(Index* quantizer) override; /// Copy all inverted lists from a CPU representation to ourselves void copyInvertedListsFrom(const InvertedLists* ivf) override; - /// Filter out matrix rows containing NaN values - void validRowIndices_(Tensor& vecs, bool* nan_flag); + /// Replace the Raft index + void setRaftIndex(raft::neighbors::ivf_flat::index&& idx); - protected: + private: /// Adds a set of codes and indices to a list, with the representation /// coming from the CPU equivalent void addEncodedVectorsToList_( @@ -147,3 +147,4 @@ struct RaftIVFFlatCodePackerInterleaved : CodePacker { } // namespace gpu } // namespace faiss +#pragma GCC visibility pop diff --git a/faiss/gpu/impl/RaftIVFPQ.cu b/faiss/gpu/impl/RaftIVFPQ.cu new file mode 100644 index 0000000000..b26025dd47 --- /dev/null +++ b/faiss/gpu/impl/RaftIVFPQ.cu @@ -0,0 +1,546 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace faiss { +namespace gpu { + +RaftIVFPQ::RaftIVFPQ( + GpuResources* resources, + int dim, + idx_t nlist, + faiss::MetricType metric, + float metricArg, + int numSubQuantizers, + int bitsPerSubQuantizer, + bool useFloat16LookupTables, + bool useMMCodeDistance, + bool interleavedLayout, + float* pqCentroidData, + IndicesOptions indicesOptions, + MemorySpace space) + : IVFPQ(resources, + dim, + nlist, + metric, + metricArg, + numSubQuantizers, + bitsPerSubQuantizer, + useFloat16LookupTables, + useMMCodeDistance, + interleavedLayout, + // skip ptr allocations in base class (handled by RAFT + // internally) false, + pqCentroidData, + indicesOptions, + space) { + FAISS_THROW_IF_NOT_MSG( + indicesOptions == INDICES_64_BIT, + "only INDICES_64_BIT is supported for RAFT index"); +} + +RaftIVFPQ::~RaftIVFPQ() {} + +void RaftIVFPQ::reserveMemory(idx_t numVecs) { + fprintf(stderr, + "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with RAFT enabled.\n"); +} + +void RaftIVFPQ::reset() { + raft_knn_index.reset(); +} + +size_t RaftIVFPQ::reclaimMemory() { + fprintf(stderr, + "WARN: reclaimMemory is NOP. reclaimMemory is not supported with RAFT enabled.\n"); + return 0; +} + +void RaftIVFPQ::setPrecomputedCodes(Index* quantizer, bool enable) {} + +idx_t RaftIVFPQ::getListLength(idx_t listId) const { + FAISS_ASSERT(raft_knn_index.has_value()); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + uint32_t size; + raft::update_host( + &size, + raft_knn_index.value().list_sizes().data_handle() + listId, + 1, + raft_handle.get_stream()); + raft_handle.sync_stream(); + + return static_cast(size); +} + +void RaftIVFPQ::updateQuantizer(Index* quantizer) { + FAISS_THROW_IF_NOT(quantizer->is_trained); + + // Must match our basic IVF parameters + FAISS_THROW_IF_NOT(quantizer->d == getDim()); + FAISS_THROW_IF_NOT(quantizer->ntotal == getNumLists()); + + auto stream = resources_->getDefaultStreamCurrentDevice(); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + raft::neighbors::ivf_pq::index_params pams; + pams.metric = metricFaissToRaft(metric_, false); + pams.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE; + pams.n_lists = numLists_; + pams.pq_bits = bitsPerSubQuantizer_; + pams.pq_dim = numSubQuantizers_; + raft_knn_index.emplace(raft_handle, pams, static_cast(dim_)); + + raft::neighbors::ivf_pq::helpers::reset_index( + raft_handle, &raft_knn_index.value()); + raft::neighbors::ivf_pq::helpers::make_rotation_matrix( + raft_handle, &(raft_knn_index.value()), false); + + // If the index instance is a GpuIndexFlat, then we can use direct access to + // the centroids within. 
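// [editor's note] The code below distinguishes three centroid sources:
// (1) a GpuIndexFlat holding float16 data, whose centroids must first be
// reconstructed to float32 on-device; (2) a GpuIndexFlat already holding
// float32 data, whose storage can be referenced directly; and (3) any other
// quantizer (e.g. a CPU IndexFlat), whose centroids are reconstructed on
// the host and copied over. All three paths hand the float32 centroids to
// raft::neighbors::ivf_pq::helpers::set_centers().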
+ auto gpuQ = dynamic_cast(quantizer); + + if (gpuQ) { + auto gpuData = gpuQ->getGpuData(); + + if (gpuData->getUseFloat16()) { + DeviceTensor centroids( + resources_, + makeSpaceAlloc(AllocType::FlatData, space_, stream), + {getNumLists(), getDim()}); + + // The FlatIndex keeps its data in float16; we need to reconstruct + // as float32 and store locally + gpuData->reconstruct(0, gpuData->getSize(), centroids); + + raft::neighbors::ivf_pq::helpers::set_centers( + raft_handle, + &(raft_knn_index.value()), + raft::make_device_matrix_view( + centroids.data(), numLists_, dim_)); + } else { + /// No reconstruct needed since the centers are already in float32 + // The FlatIndex keeps its data in float32, so we can merely + // reference it + auto centroids = gpuData->getVectorsFloat32Ref(); + + raft::neighbors::ivf_pq::helpers::set_centers( + raft_handle, + &(raft_knn_index.value()), + raft::make_device_matrix_view( + centroids.data(), numLists_, dim_)); + } + } else { + DeviceTensor centroids( + resources_, + makeSpaceAlloc(AllocType::FlatData, space_, stream), + {getNumLists(), getDim()}); + + // Otherwise, we need to reconstruct all vectors from the index and copy + // them to the GPU, in order to have access as needed for residual + // computation + auto vecs = std::vector(getNumLists() * getDim()); + quantizer->reconstruct_n(0, quantizer->ntotal, vecs.data()); + + centroids.copyFrom(vecs, stream); + + raft::neighbors::ivf_pq::helpers::set_centers( + raft_handle, + &(raft_knn_index.value()), + raft::make_device_matrix_view( + centroids.data(), numLists_, dim_)); + } + + setPQCentroids_(); +} + +/// Return the list indices of a particular list back to the CPU +std::vector RaftIVFPQ::getListIndices(idx_t listId) const { + FAISS_ASSERT(raft_knn_index.has_value()); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + auto stream = raft_handle.get_stream(); + + idx_t listSize = getListLength(listId); + + std::vector vec(listSize); + + // fetch the list indices ptr on host + idx_t* list_indices_ptr; + + raft::update_host( + &list_indices_ptr, + const_cast( + raft_knn_index.value().inds_ptrs().data_handle()) + + listId, + 1, + stream); + raft_handle.sync_stream(); + + raft::update_host(vec.data(), list_indices_ptr, listSize, stream); + raft_handle.sync_stream(); + + return vec; +} + +/// Performs search when we are already given the IVF cells to look at +/// (GpuIndexIVF::search_preassigned implementation) +void RaftIVFPQ::searchPreassigned( + Index* coarseQuantizer, + Tensor& vecs, + Tensor& ivfDistances, + Tensor& ivfAssignments, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool storePairs) { + // TODO: Fill this in! +} + +size_t RaftIVFPQ::getGpuListEncodingSize_(idx_t listId) { + return static_cast( + raft_knn_index.value().get_list_size_in_bytes(listId)); +} + +/// Return the encoded vectors of a particular list back to the CPU +std::vector RaftIVFPQ::getListVectorData(idx_t listId, bool gpuFormat) + const { + if (gpuFormat) { + FAISS_THROW_MSG( + "gpuFormat should be false for RAFT indices. 
Unpacked codes are flat."); + } + FAISS_ASSERT(raft_knn_index.has_value()); + + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + auto stream = raft_handle.get_stream(); + + idx_t listSize = getListLength(listId); + + auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(listSize); + + std::vector flat_codes( + cpuListSizeInBytes, static_cast(0)); + + idx_t maxBatchSize = 65536; + for (idx_t offset_b = 0; offset_b < listSize; offset_b += maxBatchSize) { + uint32_t batchSize = min(maxBatchSize, listSize - offset_b); + uint32_t bufferSize = getCpuVectorsEncodingSize_(batchSize); + uint32_t codesOffset = getCpuVectorsEncodingSize_(offset_b); + + // Fetch flat PQ codes for the current batch + auto codes_d = raft::make_device_vector( + raft_handle, static_cast(bufferSize)); + + raft::neighbors::ivf_pq::helpers::unpack_contiguous_list_data( + raft_handle, + raft_knn_index.value(), + codes_d.data_handle(), + batchSize, + listId, + offset_b); + + // Copy the flat PQ codes to host + raft::update_host( + flat_codes.data() + codesOffset, + codes_d.data_handle(), + bufferSize, + stream); + raft_handle.sync_stream(); + } + + return flat_codes; +} + +/// Find the approximate k nearest neighbors for `queries` against +/// our database +void RaftIVFPQ::search( + Index* coarseQuantizer, + Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices) { + uint32_t numQueries = queries.getSize(0); + uint32_t cols = queries.getSize(1); + idx_t k_ = std::min(static_cast(k), raft_knn_index.value().size()); + + // Device is already set in GpuIndex::search + FAISS_ASSERT(raft_knn_index.has_value()); + FAISS_ASSERT(numQueries > 0); + FAISS_ASSERT(cols == dim_); + FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); + + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + raft::neighbors::ivf_pq::search_params pams; + pams.n_probes = nprobe; + pams.lut_dtype = useFloat16LookupTables_ ? CUDA_R_16F : CUDA_R_32F; + + auto queries_view = raft::make_device_matrix_view( + queries.data(), (idx_t)numQueries, (idx_t)cols); + auto out_inds_view = raft::make_device_matrix_view( + outIndices.data(), (idx_t)numQueries, (idx_t)k_); + auto out_dists_view = raft::make_device_matrix_view( + outDistances.data(), (idx_t)numQueries, (idx_t)k_); + + raft::neighbors::ivf_pq::search( + raft_handle, + pams, + raft_knn_index.value(), + queries_view, + out_inds_view, + out_dists_view); + + /// Identify NaN rows and mask their nearest neighbors + auto nan_flag = raft::make_device_vector(raft_handle, numQueries); + + validRowIndices(resources_, queries, nan_flag.data_handle()); + + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(outIndices.data(), numQueries * k_), + [nan_flag = nan_flag.data_handle(), + out_inds = outIndices.data(), + k_] __device__(uint32_t i) { + uint32_t row = i / k_; + if (!nan_flag[row]) + return idx_t(-1); + return out_inds[i]; + }); + + float max_val = std::numeric_limits::max(); + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(outDistances.data(), numQueries * k_), + [nan_flag = nan_flag.data_handle(), + out_dists = outDistances.data(), + max_val, + k_] __device__(uint32_t i) { + uint32_t row = i / k_; + if (!nan_flag[row]) + return max_val; + return out_dists[i]; + }); + raft_handle.sync_stream(); +} + +idx_t RaftIVFPQ::addVectors( + Index* coarseQuantizer, + Tensor& vecs, + Tensor& indices) { + /// NB: The coarse quantizer is ignored here. 
The user is assumed to have + /// called updateQuantizer() to update the RAFT index if the quantizer was + /// modified externally + + FAISS_ASSERT(raft_knn_index.has_value()); + + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + /// Remove rows containing NaNs + idx_t n_rows_valid = inplaceGatherFilteredRows(resources_, vecs, indices); + + raft_knn_index.emplace(raft::neighbors::ivf_pq::extend( + raft_handle, + raft::make_device_matrix_view( + vecs.data(), n_rows_valid, dim_), + std::make_optional>( + raft::make_device_vector_view( + indices.data(), n_rows_valid)), + raft_knn_index.value())); + + return n_rows_valid; +} + +void RaftIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) { + size_t nlist = ivf ? ivf->nlist : 0; + size_t ntotal = ivf ? ivf->compute_ntotal() : 0; + + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + std::vector list_sizes_(nlist); + std::vector indices_(ntotal); + + // the index must already exist + FAISS_ASSERT(raft_knn_index.has_value()); + + auto& raft_lists = raft_knn_index.value().lists(); + + // conservative memory alloc for cloning cpu inverted lists + raft::neighbors::ivf_pq::list_spec raft_list_spec{ + static_cast(bitsPerSubQuantizer_), + static_cast(numSubQuantizers_), + true}; + + for (size_t i = 0; i < nlist; ++i) { + size_t listSize = ivf->list_size(i); + + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT( + listSize <= (size_t)std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t)std::numeric_limits::max(), + listSize); + + // store the list size + list_sizes_[i] = static_cast(listSize); + + // This RAFT list must currently be empty + FAISS_ASSERT(getListLength(i) == 0); + + raft::neighbors::ivf::resize_list( + raft_handle, + raft_lists[i], + raft_list_spec, + static_cast(listSize), + static_cast(0)); + } + + raft::update_device( + raft_knn_index.value().list_sizes().data_handle(), + list_sizes_.data(), + nlist, + raft_handle.get_stream()); + + // Update the pointers and the sizes + raft::neighbors::ivf_pq::helpers::recompute_internal_state( + raft_handle, &(raft_knn_index.value())); + + for (size_t i = 0; i < nlist; ++i) { + size_t listSize = ivf->list_size(i); + addEncodedVectorsToList_( + i, ivf->get_codes(i), ivf->get_ids(i), listSize); + } +} + +void RaftIVFPQ::setRaftIndex(raft::neighbors::ivf_pq::index&& idx) { + raft_knn_index.emplace(std::move(idx)); + setBasePQCentroids_(); +} + +void RaftIVFPQ::addEncodedVectorsToList_( + idx_t listId, + const void* codes, + const idx_t* indices, + idx_t numVecs) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + // If there's nothing to add, then there's nothing we have to do + if (numVecs == 0) { + return; + } + + // The GPU might have a different layout of the memory + auto gpuListSizeInBytes = getGpuListEncodingSize_(listId); + + // We only have int32 length representations on the GPU per each + // list; the length is in sizeof(char) + FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); + + idx_t maxBatchSize = 4096; + for (idx_t offset_b = 0; offset_b < numVecs; offset_b += maxBatchSize) { + uint32_t batchSize = min(maxBatchSize, numVecs - offset_b); + uint32_t bufferSize = getCpuVectorsEncodingSize_(batchSize); + uint32_t codesOffset = getCpuVectorsEncodingSize_(offset_b); + + // Translate the codes as 
needed to our preferred form + auto codes_d = raft::make_device_vector( + raft_handle, static_cast(bufferSize)); + raft::update_device( + codes_d.data_handle(), + static_cast(codes) + codesOffset, + bufferSize, + stream); + + raft::neighbors::ivf_pq::helpers::pack_contiguous_list_data( + raft_handle, + &(raft_knn_index.value()), + codes_d.data_handle(), + batchSize, + listId, + offset_b); + } + + /// Handle the indices as well + idx_t* list_indices_ptr; + + // fetch the list indices ptr on host + raft::update_host( + &list_indices_ptr, + raft_knn_index.value().inds_ptrs().data_handle() + listId, + 1, + stream); + raft_handle.sync_stream(); + + raft::update_device(list_indices_ptr, indices, numVecs, stream); +} + +void RaftIVFPQ::setPQCentroids_() { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + raft::copy( + raft_knn_index.value().pq_centers().data_handle(), + pqCentroidsInnermostCode_.data(), + pqCentroidsInnermostCode_.numElements(), + stream); +} + +void RaftIVFPQ::setBasePQCentroids_() { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + raft::copy( + pqCentroidsInnermostCode_.data(), + raft_knn_index.value().pq_centers().data_handle(), + raft_knn_index.value().pq_centers().size(), + stream); + + DeviceTensor pqCentroidsMiddleCode( + resources_, + makeDevAlloc(AllocType::Quantizer, stream), + {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_}); + + runTransposeAny( + pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode, stream); + + pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode); +} + +} // namespace gpu +} // namespace faiss \ No newline at end of file diff --git a/faiss/gpu/impl/RaftIVFPQ.cuh b/faiss/gpu/impl/RaftIVFPQ.cuh new file mode 100644 index 0000000000..a79db3c40d --- /dev/null +++ b/faiss/gpu/impl/RaftIVFPQ.cuh @@ -0,0 +1,150 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include + +#pragma GCC visibility push(default) +namespace faiss { +namespace gpu { +/// Implementing class for IVFPQ on the GPU +class RaftIVFPQ : public IVFPQ { + public: + RaftIVFPQ( + GpuResources* resources, + int dim, + idx_t nlist, + faiss::MetricType metric, + float metricArg, + int numSubQuantizers, + int bitsPerSubQuantizer, + bool useFloat16LookupTables, + bool useMMCodeDistance, + bool interleavedLayout, + float* pqCentroidData, + IndicesOptions indicesOptions, + MemorySpace space); + + ~RaftIVFPQ() override; + + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(idx_t numVecs) override; + + /// Clear out the RAFT index + void reset() override; + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. 
Returns space reclaimed in bytes
+ size_t reclaimMemory() override;
+
+ /// Enable or disable pre-computed codes. The quantizer is needed to gather
+ /// the IVF centroids for use
+ void setPrecomputedCodes(Index* coarseQuantizer, bool enable) override;
+
+ /// Find the approximate k nearest neighbors for `queries` against
+ /// our database
+ void search(
+ Index* coarseQuantizer,
+ Tensor& queries,
+ int nprobe,
+ int k,
+ Tensor& outDistances,
+ Tensor& outIndices) override;
+
+ /// Performs search when we are already given the IVF cells to look at
+ /// (GpuIndexIVF::search_preassigned implementation)
+ void searchPreassigned(
+ Index* coarseQuantizer,
+ Tensor& vecs,
+ Tensor& ivfDistances,
+ Tensor& ivfAssignments,
+ int k,
+ Tensor& outDistances,
+ Tensor& outIndices,
+ bool storePairs) override;
+
+ /// Return the encoded vectors of a particular list back to the CPU
+ std::vector getListVectorData(idx_t listId, bool gpuFormat)
+ const override;
+
+ /// Update our Raft index with this quantizer instance; may be a CPU
+ /// or GPU quantizer
+ void updateQuantizer(Index* quantizer) override;
+
+ /// Copy all inverted lists from a CPU representation to ourselves
+ void copyInvertedListsFrom(const InvertedLists* ivf) override;
+
+ /// Replace the Raft index
+ void setRaftIndex(raft::neighbors::ivf_pq::index&& idx);
+
+ /// Classify and encode/add vectors to our IVF lists.
+ /// The input data must be on our current device.
+ /// Returns the number of vectors successfully added. Vectors may
+ /// not be able to be added because they contain NaNs.
+ idx_t addVectors(
+ Index* coarseQuantizer,
+ Tensor& vecs,
+ Tensor& indices) override;
+
+ /// For debugging purposes, return the list length of a particular
+ /// list
+ idx_t getListLength(idx_t listId) const override;
+
+ /// Return the list indices of a particular list back to the CPU
+ std::vector getListIndices(idx_t listId) const override;
+
+ private:
+ /// Adds a set of codes and indices to a list, with the representation
+ /// coming from the CPU equivalent
+ void addEncodedVectorsToList_(
+ idx_t listId,
+ // resident on the host
+ const void* codes,
+ // resident on the host
+ const idx_t* indices,
+ idx_t numVecs) override;
+
+ /// Returns the encoding size for a PQ-encoded IVF list
+ size_t getGpuListEncodingSize_(idx_t listId);
+
+ /// Copy the PQ centroids to the Raft index. The data is already in the
+ /// preferred format with the transpose performed by the IVFPQ class helper.
+ void setPQCentroids_();
+
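// ------------------------------------------------------------------------
// [editor's sketch -- not part of the patch] getGpuListEncodingSize_()
// exists alongside the CPU-side getCpuVectorsEncodingSize_() used elsewhere
// in this file because the two layouts differ: the RAFT side stores each
// list interleaved and must be asked per list, while the flat CPU encoding
// is plain arithmetic -- pq_dim sub-codes of pq_bits each, rounded up to
// whole bytes per vector. Illustrative helper (name hypothetical):
#include <cstddef>

constexpr std::size_t flatPQCodeBytes(
        std::size_t numVecs,
        std::size_t pqDim,
        std::size_t pqBits) {
    // bytes per vector = ceil(pqDim * pqBits / 8)
    return numVecs * ((pqDim * pqBits + 7) / 8);
}
// e.g. 32 sub-quantizers at 8 bits each -> 32 bytes per vector
static_assert(flatPQCodeBytes(1, 32, 8) == 32, "flat PQ code size");
// ------------------------------------------------------------------------
+ /// Update the product quantizer centroids buffer held in the IVFPQ class.
+ /// Used when the RAFT index was updated externally.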
+ void setBasePQCentroids_(); + + /// optional around the Raft IVF-PQ index + std::optional> raft_knn_index{ + std::nullopt}; +}; + +} // namespace gpu +} // namespace faiss +#pragma GCC visibility pop diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp index 6d9c83e547..d7a7b45ec0 100644 --- a/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -268,6 +268,7 @@ TEST(TestGpuIndexFlat, QueryEmpty) { faiss::gpu::GpuIndexFlatConfig config; config.device = 0; config.useFloat16 = false; + config.use_raft = false; int dim = 128; faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 9fb88e2687..a90825bffb 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -30,7 +31,6 @@ #include #include #include -#include "faiss/gpu/GpuIndicesOptions.h" // FIXME: figure out a better way to test fp16 constexpr float kF16MaxRelErr = 0.3f; @@ -57,7 +57,7 @@ struct Options { device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - use_raft = false; + useRaft = false; } std::string toString() const { @@ -65,7 +65,7 @@ struct Options { str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " << dim << " numCentroids " << numCentroids << " nprobe " << nprobe << " numQuery " << numQuery << " k " << k << " indicesOpt " - << indicesOpt << " use_raft " << use_raft; + << indicesOpt << " useRaft " << useRaft; return str.str(); } @@ -79,7 +79,7 @@ struct Options { int k; int device; faiss::gpu::IndicesOptions indicesOpt; - bool use_raft; + bool useRaft; }; void queryTest( @@ -110,7 +110,7 @@ void queryTest( config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - config.use_raft = opt.use_raft; + config.use_raft = opt.useRaft; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); @@ -137,7 +137,7 @@ void queryTest( void addTest( faiss::MetricType metricType, bool useFloat16CoarseQuantizer, - bool use_raft) { + bool useRaft) { for (int tries = 0; tries < 2; ++tries) { Options opt; @@ -162,9 +162,9 @@ void addTest( faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = - use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; + useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - config.use_raft = use_raft; + config.use_raft = useRaft; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); @@ -188,7 +188,7 @@ void addTest( } } -void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { +void copyToTest(bool useFloat16CoarseQuantizer, bool useRaft) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -199,9 +199,9 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = - use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; + useRaft ? 
faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - config.use_raft = use_raft; + config.use_raft = useRaft; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); @@ -241,7 +241,7 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { compFloat16 ? 0.30f : 0.015f); } -void copyFromTest(bool useFloat16CoarseQuantizer, bool use_raft) { +void copyFromTest(bool useFloat16CoarseQuantizer, bool useRaft) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -260,9 +260,9 @@ void copyFromTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = - use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; + useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - config.use_raft = use_raft; + config.use_raft = useRaft; faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); gpuIndex.nprobe = 1; @@ -334,7 +334,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2) { queryTest(opt, faiss::METRIC_L2, false); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif @@ -345,7 +345,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP) { queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #endif @@ -358,7 +358,7 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) { queryTest(opt, faiss::METRIC_L2, false); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif @@ -371,7 +371,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_L2) { queryTest(opt, faiss::METRIC_L2, true); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, true); #endif @@ -382,7 +382,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_IP) { queryTest(opt, faiss::METRIC_INNER_PRODUCT, true); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, true); #endif @@ -399,7 +399,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_64) { queryTest(opt, faiss::METRIC_L2, false); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif @@ -411,7 +411,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_64) { queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #endif @@ -423,7 +423,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_128) { queryTest(opt, faiss::METRIC_L2, false); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif @@ -435,7 +435,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) { queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #if 
defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #endif @@ -499,6 +499,7 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; + config.use_raft = false; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); @@ -571,6 +572,7 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); + config.use_raft = false; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); @@ -639,6 +641,7 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); + config.use_raft = false; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); gpuIndex.nprobe = opt.nprobe; @@ -720,6 +723,7 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; config.memorySpace = faiss::gpu::MemorySpace::Unified; + config.use_raft = false; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); @@ -797,6 +801,7 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; + config.use_raft = false; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); diff --git a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp index 1bdef31914..9cc52bc788 100644 --- a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp @@ -35,6 +35,22 @@ void pickEncoding(int& codes, int& dim) { } } +void pickRaftEncoding(int& codes, int& dim, int bitsPerCode) { + // Above 32 doesn't work with no precomputed codes + std::vector dimSizes{4, 8, 10, 12, 16, 20, 24, 28, 32}; + + while (true) { + codes = faiss::gpu::randVal(0, 96); + dim = codes * dimSizes[faiss::gpu::randVal(0, dimSizes.size() - 1)]; + + // for such a small test, super-low or high dim is more likely to + // generate comparison errors + if (dim < 256 && dim >= 64 && (codes * bitsPerCode) % 8 == 0) { + return; + } + } +} + struct Options { Options() { numAdd = faiss::gpu::randVal(2000, 5000); @@ -43,9 +59,10 @@ struct Options { pickEncoding(codes, dim); - // TODO: Change back to `faiss::gpu::randVal(3, 7)` when we officially - // support non-multiple of 8 subcodes for IVFPQ. + // TODO: Change back to `faiss::gpu::randVal(3, 7)` when we + // officially support non-multiple of 8 subcodes for IVFPQ. 
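// [editor's note] The classical GPU path therefore pins bitsPerCode to 8.
// The RAFT-enabled tests added below draw bitsPerCode from 4..8 instead and
// rely on pickRaftEncoding(), whose (codes * bitsPerCode) % 8 == 0 check
// keeps every vector's packed code an exact number of bytes.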
bitsPerCode = 8; + nprobe = std::min(faiss::gpu::randVal(40, 1000), numCentroids); numQuery = faiss::gpu::randVal(4, 8); @@ -66,6 +83,9 @@ struct Options { } device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + interleavedLayout = false; + useRaft = false; } std::string toString() const { @@ -105,50 +125,66 @@ struct Options { faiss::gpu::IndicesOptions indicesOpt; bool useFloat16; int device; + bool interleavedLayout; + bool useRaft; }; -TEST(TestGpuIndexIVFPQ, Query_L2) { - for (int tries = 0; tries < 2; ++tries) { - Options opt; +void queryTest(Options opt, faiss::MetricType metricType) { + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + faiss::IndexFlatL2 coarseQuantizerL2(opt.dim); + faiss::IndexFlatIP coarseQuantizerIP(opt.dim); + faiss::Index* quantizer = metricType == faiss::METRIC_L2 + ? (faiss::Index*)&coarseQuantizerL2 + : (faiss::Index*)&coarseQuantizerIP; - faiss::IndexFlatL2 coarseQuantizer(opt.dim); - faiss::IndexIVFPQ cpuIndex( - &coarseQuantizer, - opt.dim, - opt.numCentroids, - opt.codes, - opt.bitsPerCode); - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); + faiss::IndexIVFPQ cpuIndex( + quantizer, opt.dim, opt.numCentroids, opt.codes, opt.bitsPerCode); + cpuIndex.metric_type = metricType; + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); - // Use the default temporary memory management to test the memory - // manager - faiss::gpu::StandardGpuResources res; + // Use the default temporary memory management to test the memory + // manager + faiss::gpu::StandardGpuResources res; - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.usePrecomputedTables = (tries % 2 == 0); - config.indicesOptions = opt.indicesOpt; - config.useFloat16LookupTables = opt.useFloat16; + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + config.interleavedLayout = opt.interleavedLayout; + config.use_raft = opt.useRaft; - faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.nprobe = opt.nprobe; + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); + gpuIndex.nprobe = opt.nprobe; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - opt.getCompareEpsilon(), - opt.getPctMaxDiff1(), - opt.getPctMaxDiffN()); + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); +} + +TEST(TestGpuIndexIVFPQ, Query_L2) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.usePrecomputed = (tries % 2 == 0); + queryTest(opt, faiss::MetricType::METRIC_L2); + } +} + +TEST(TestGpuIndexIVFPQ, Query_IP) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + queryTest(opt, faiss::MetricType::METRIC_INNER_PRODUCT); } } @@ -161,45 +197,10 @@ TEST(TestGpuIndexIVFPQ, LargeBatch) { opt.dim = 4; opt.numQuery = 100000; opt.codes = 2; + opt.usePrecomputed = usePrecomputed; + opt.useFloat16 = false; - std::vector trainVecs = - 
faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatL2 coarseQuantizer(opt.dim); - faiss::IndexIVFPQ cpuIndex( - &coarseQuantizer, - opt.dim, - opt.numCentroids, - opt.codes, - opt.bitsPerCode); - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); - - // Use the default temporary memory management to test the memory - // manager - faiss::gpu::StandardGpuResources res; - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.usePrecomputedTables = usePrecomputed; - config.indicesOptions = opt.indicesOpt; - config.useFloat16LookupTables = false; - - faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.nprobe = opt.nprobe; - - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - opt.getCompareEpsilon(), - opt.getPctMaxDiff1(), - opt.getPctMaxDiffN()); + queryTest(opt, faiss::MetricType::METRIC_L2); } } @@ -234,6 +235,7 @@ void testMMCodeDistance(faiss::MetricType mt) { config.usePrecomputedTables = false; config.useMMCodeDistance = true; config.indicesOptions = opt.indicesOpt; + config.use_raft = false; // Make sure that the float16 version works as well config.useFloat16LookupTables = (tries % 2 == 0); @@ -284,6 +286,7 @@ void testMMCodeDistance(faiss::MetricType mt) { config.device = opt.device; config.usePrecomputedTables = false; config.indicesOptions = opt.indicesOpt; + config.use_raft = false; // Make sure that the float16 version works as well config.useFloat16LookupTables = (dimPerSubQ == 7); @@ -312,53 +315,6 @@ TEST(TestGpuIndexIVFPQ, Query_IP_MMCodeDistance) { testMMCodeDistance(faiss::MetricType::METRIC_INNER_PRODUCT); } -TEST(TestGpuIndexIVFPQ, Query_IP) { - for (int tries = 0; tries < 2; ++tries) { - Options opt; - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatIP coarseQuantizer(opt.dim); - faiss::IndexIVFPQ cpuIndex( - &coarseQuantizer, - opt.dim, - opt.numCentroids, - opt.codes, - opt.bitsPerCode); - cpuIndex.metric_type = faiss::MetricType::METRIC_INNER_PRODUCT; - - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); - - // Use the default temporary memory management to test the memory - // manager - faiss::gpu::StandardGpuResources res; - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.usePrecomputedTables = false; // not supported/required for IP - config.indicesOptions = opt.indicesOpt; - config.useFloat16LookupTables = opt.useFloat16; - - faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.nprobe = opt.nprobe; - - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - opt.getCompareEpsilon(), - opt.getPctMaxDiff1(), - opt.getPctMaxDiffN()); - } -} - TEST(TestGpuIndexIVFPQ, Float16Coarse) { Options opt; @@ -384,6 +340,7 @@ TEST(TestGpuIndexIVFPQ, Float16Coarse) { config.usePrecomputedTables = opt.usePrecomputed; config.indicesOptions = opt.indicesOpt; config.useFloat16LookupTables = opt.useFloat16; + config.use_raft = false; faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); gpuIndex.nprobe = opt.nprobe; @@ -403,104 +360,68 @@ TEST(TestGpuIndexIVFPQ, Float16Coarse) { opt.getPctMaxDiffN()); } -TEST(TestGpuIndexIVFPQ, Add_L2) { - 
for (int tries = 0; tries < 2; ++tries) { - Options opt; +void addTest(Options opt, faiss::MetricType metricType) { + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + faiss::IndexFlatL2 coarseQuantizerL2(opt.dim); + faiss::IndexFlatIP coarseQuantizerIP(opt.dim); + faiss::Index* quantizer = metricType == faiss::METRIC_L2 + ? (faiss::Index*)&coarseQuantizerL2 + : (faiss::Index*)&coarseQuantizerIP; - faiss::IndexFlatL2 coarseQuantizer(opt.dim); - faiss::IndexIVFPQ cpuIndex( - &coarseQuantizer, - opt.dim, - opt.numCentroids, - opt.codes, - opt.bitsPerCode); - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); + faiss::IndexIVFPQ cpuIndex( + quantizer, opt.dim, opt.numCentroids, opt.codes, opt.bitsPerCode); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.metric_type = metricType; + cpuIndex.train(opt.numTrain, trainVecs.data()); - // Use the default temporary memory management to test the memory - // manager - faiss::gpu::StandardGpuResources res; + // Use the default temporary memory management to test the memory + // manager + faiss::gpu::StandardGpuResources res; - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.usePrecomputedTables = opt.usePrecomputed; - config.indicesOptions = opt.indicesOpt; - config.useFloat16LookupTables = opt.useFloat16; + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + config.interleavedLayout = opt.interleavedLayout; + config.use_raft = opt.useRaft; - faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.nprobe = opt.nprobe; + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); + gpuIndex.nprobe = opt.nprobe; - gpuIndex.add(opt.numAdd, addVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - opt.getCompareEpsilon(), - opt.getPctMaxDiff1(), - opt.getPctMaxDiffN()); + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); +} + +TEST(TestGpuIndexIVFPQ, Add_L2) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + addTest(opt, faiss::METRIC_L2); } } TEST(TestGpuIndexIVFPQ, Add_IP) { for (int tries = 0; tries < 2; ++tries) { Options opt; - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatIP coarseQuantizer(opt.dim); - faiss::IndexIVFPQ cpuIndex( - &coarseQuantizer, - opt.dim, - opt.numCentroids, - opt.codes, - opt.bitsPerCode); - cpuIndex.metric_type = faiss::MetricType::METRIC_INNER_PRODUCT; - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); - - // Use the default temporary memory management to test the memory - // manager - faiss::gpu::StandardGpuResources res; - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.usePrecomputedTables = opt.usePrecomputed; - config.indicesOptions = opt.indicesOpt; - 
config.useFloat16LookupTables = opt.useFloat16; - - faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.nprobe = opt.nprobe; - - gpuIndex.add(opt.numAdd, addVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); - - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - opt.getCompareEpsilon(), - opt.getPctMaxDiff1(), - opt.getPctMaxDiffN()); + addTest(opt, faiss::METRIC_INNER_PRODUCT); } } -TEST(TestGpuIndexIVFPQ, CopyTo) { +void copyToTest(Options opt) { for (int tries = 0; tries < 2; ++tries) { - Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -511,9 +432,11 @@ TEST(TestGpuIndexIVFPQ, CopyTo) { faiss::gpu::GpuIndexIVFPQConfig config; config.device = opt.device; - config.usePrecomputedTables = (tries % 2 == 0); + config.usePrecomputedTables = false; config.indicesOptions = opt.indicesOpt; config.useFloat16LookupTables = opt.useFloat16; + config.interleavedLayout = opt.interleavedLayout; + config.use_raft = opt.useRaft; faiss::gpu::GpuIndexIVFPQ gpuIndex( &res, @@ -561,8 +484,12 @@ TEST(TestGpuIndexIVFPQ, CopyTo) { } } -TEST(TestGpuIndexIVFPQ, CopyFrom) { +TEST(TestGpuIndexIVFPQ, CopyTo) { Options opt; + copyToTest(opt); +} + +void copyFromTest(Options opt) { std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -585,6 +512,8 @@ TEST(TestGpuIndexIVFPQ, CopyFrom) { config.usePrecomputedTables = opt.usePrecomputed; config.indicesOptions = opt.indicesOpt; config.useFloat16LookupTables = opt.useFloat16; + config.interleavedLayout = opt.interleavedLayout; + config.use_raft = opt.useRaft; // Use garbage values to see if we overwrite them faiss::gpu::GpuIndexIVFPQ gpuIndex( @@ -621,9 +550,12 @@ TEST(TestGpuIndexIVFPQ, CopyFrom) { opt.getPctMaxDiffN()); } -TEST(TestGpuIndexIVFPQ, QueryNaN) { +TEST(TestGpuIndexIVFPQ, CopyFrom) { Options opt; + copyFromTest(opt); +} +void queryNaNTest(Options opt) { std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -635,6 +567,8 @@ TEST(TestGpuIndexIVFPQ, QueryNaN) { config.usePrecomputedTables = opt.usePrecomputed; config.indicesOptions = opt.indicesOpt; config.useFloat16LookupTables = opt.useFloat16; + config.use_raft = opt.useRaft; + config.interleavedLayout = opt.useRaft ? 
true : opt.interleavedLayout; faiss::gpu::GpuIndexIVFPQ gpuIndex( &res, @@ -670,9 +604,13 @@ TEST(TestGpuIndexIVFPQ, QueryNaN) { } } -TEST(TestGpuIndexIVFPQ, AddNaN) { +TEST(TestGpuIndexIVFPQ, QueryNaN) { Options opt; + opt.useRaft = false; + queryNaNTest(opt); +} +void addNaNTest(Options opt) { // Use the default temporary memory management to test the memory manager faiss::gpu::StandardGpuResources res; @@ -681,6 +619,8 @@ TEST(TestGpuIndexIVFPQ, AddNaN) { config.usePrecomputedTables = opt.usePrecomputed; config.indicesOptions = opt.indicesOpt; config.useFloat16LookupTables = opt.useFloat16; + config.interleavedLayout = opt.interleavedLayout; + config.use_raft = opt.useRaft; faiss::gpu::GpuIndexIVFPQ gpuIndex( &res, @@ -722,6 +662,128 @@ TEST(TestGpuIndexIVFPQ, AddNaN) { indices.data()); } +TEST(TestGpuIndexIVFPQ, AddNaN) { + Options opt; + opt.useRaft = false; + addNaNTest(opt); +} + +#if defined USE_NVIDIA_RAFT +TEST(TestGpuIndexIVFPQ, Query_L2_Raft) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.useRaft = true; + opt.interleavedLayout = true; + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + queryTest(opt, faiss::MetricType::METRIC_L2); + } +} + +TEST(TestGpuIndexIVFPQ, Query_IP_Raft) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.useRaft = true; + opt.interleavedLayout = true; + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + queryTest(opt, faiss::MetricType::METRIC_INNER_PRODUCT); + } +} + +// Large batch sizes (>= 65536) should also work +TEST(TestGpuIndexIVFPQ, LargeBatch_Raft) { + Options opt; + + // override for large sizes + opt.dim = 4; + opt.numQuery = 100000; + opt.codes = 2; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.usePrecomputed = false; + opt.useFloat16 = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + opt.bitsPerCode = 8; + + queryTest(opt, faiss::MetricType::METRIC_L2); +} + +TEST(TestGpuIndexIVFPQ, CopyFrom_Raft) { + Options opt; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + copyFromTest(opt); +} + +TEST(TestGpuIndexIVFPQ, Add_L2_Raft) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + addTest(opt, faiss::METRIC_L2); + } +} + +TEST(TestGpuIndexIVFPQ, Add_IP_Raft) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + addTest(opt, faiss::METRIC_INNER_PRODUCT); + } +} + +TEST(TestGpuIndexIVFPQ, QueryNaN_Raft) { + Options opt; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + queryNaNTest(opt); 
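// ------------------------------------------------------------------------
// [editor's sketch -- not part of the patch] All the *_Raft tests in this
// file share the same configuration shape; shown standalone for reference.
// The field names and the 7-argument constructor appear elsewhere in this
// patch; the dimensions are illustrative:
#include <faiss/gpu/GpuIndexIVFPQ.h>
#include <faiss/gpu/StandardGpuResources.h>

void raftIVFPQSmokeSketch() {
    faiss::gpu::StandardGpuResources res;

    faiss::gpu::GpuIndexIVFPQConfig config;
    config.use_raft = true;
    config.interleavedLayout = true; // required on the RAFT path
    config.indicesOptions = faiss::gpu::INDICES_64_BIT; // only supported option
    config.usePrecomputedTables = false; // unused with RAFT

    // dim=128, nlist=256, 32 sub-quantizers of 8 bits each
    faiss::gpu::GpuIndexIVFPQ index(
            &res, 128, 256, 32, 8, faiss::METRIC_L2, config);
    // then: index.train(...), index.add(...), index.search(...)
}
// ------------------------------------------------------------------------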
+} + +TEST(TestGpuIndexIVFPQ, AddNaN_Raft) { + Options opt; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + addNaNTest(opt); +} + +TEST(TestGpuIndexIVFPQ, CopyTo_Raft) { + Options opt; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + copyToTest(opt); +} +#endif + TEST(TestGpuIndexIVFPQ, UnifiedMemory) { // Construct on a random device to test multi-device, if we have // multiple devices @@ -762,6 +824,7 @@ TEST(TestGpuIndexIVFPQ, UnifiedMemory) { faiss::gpu::GpuIndexIVFPQConfig config; config.device = device; config.memorySpace = faiss::gpu::MemorySpace::Unified; + config.use_raft = false; faiss::gpu::GpuIndexIVFPQ gpuIndex( &res, @@ -784,6 +847,34 @@ TEST(TestGpuIndexIVFPQ, UnifiedMemory) { 0.015f, 0.1f, 0.015f); + +#if defined USE_NVIDIA_RAFT + config.interleavedLayout = true; + config.use_raft = true; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; + + faiss::gpu::GpuIndexIVFPQ raftGpuIndex( + &res, + dim, + numCentroids, + codes, + bitsPerCode, + faiss::METRIC_L2, + config); + raftGpuIndex.copyFrom(&cpuIndex); + raftGpuIndex.nprobe = nprobe; + + faiss::gpu::compareIndices( + cpuIndex, + raftGpuIndex, + numQuery, + dim, + k, + "Unified Memory", + 0.015f, + 0.1f, + 0.015f); +#endif } int main(int argc, char** argv) { diff --git a/faiss/gpu/test/TestGpuMemoryException.cpp b/faiss/gpu/test/TestGpuMemoryException.cpp index c6f6e9bdeb..ff4be0893e 100644 --- a/faiss/gpu/test/TestGpuMemoryException.cpp +++ b/faiss/gpu/test/TestGpuMemoryException.cpp @@ -31,6 +31,7 @@ TEST(TestGpuMemoryException, AddException) { faiss::gpu::GpuIndexFlatConfig config; config.device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + config.use_raft = false; faiss::gpu::GpuIndexFlatL2 gpuIndexL2Broken( &res, (int)brokenAddDims, config); diff --git a/faiss/gpu/test/test_gpu_index.py b/faiss/gpu/test/test_gpu_index.py index 36a1f8a64b..620bfea198 100755 --- a/faiss/gpu/test/test_gpu_index.py +++ b/faiss/gpu/test/test_gpu_index.py @@ -24,7 +24,9 @@ def test_ivfflat_search_preassigned(self): nprobe = 10 k = 50 - idx_gpu = faiss.GpuIndexIVFFlat(res, d, nlist) + config = faiss.GpuIndexIVFFlatConfig() + config.use_raft = False + idx_gpu = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config) idx_gpu.nprobe = nprobe rs = np.random.RandomState(567) @@ -56,7 +58,9 @@ def test_ivfpq_search_preassigned(self): nprobe = 5 k = 50 - idx_gpu = faiss.GpuIndexIVFPQ(res, d, nlist, 4, 8) + config = faiss.GpuIndexIVFPQConfig() + config.use_raft = False + idx_gpu = faiss.GpuIndexIVFPQ(res, d, nlist, 4, 8, faiss.METRIC_L2, config) idx_gpu.nprobe = nprobe rs = np.random.RandomState(567) @@ -136,7 +140,9 @@ def test_ivfflat_cpu_coarse(self): # construct a GPU index using the same trained coarse quantizer # from the CPU index - idx_gpu = faiss.GpuIndexIVFFlat(res, q, d, nlist, faiss.METRIC_L2) + config = faiss.GpuIndexIVFFlatConfig() + config.use_raft = False + idx_gpu = faiss.GpuIndexIVFFlat(res, q, d, nlist, faiss.METRIC_L2, config) assert(idx_gpu.is_trained) idx_gpu.add(xb) @@ -150,7 +156,7 @@ def test_ivfflat_cpu_coarse(self): self.assertGreaterEqual((i_g == i_c).sum(), i_g.size * 0.9) self.assertTrue(np.allclose(d_g, d_c, rtol=5e-5, 
atol=5e-5)) def test_ivfsq_cpu_coarse(self): res = faiss.StandardGpuResources() d = 128 nb = 5000 @@ -226,8 +232,10 @@ def test_ivfpq_cpu_coarse(self): # construct a GPU index using the same trained coarse quantizer # from the CPU index + config = faiss.GpuIndexIVFPQConfig() + config.use_raft = False idx_gpu = faiss.GpuIndexIVFPQ( - res, idx_coarse_cpu, d, nlist_lvl_2, 4, 8) + res, idx_coarse_cpu, d, nlist_lvl_2, 4, 8, faiss.METRIC_L2, config) assert(not idx_gpu.is_trained) idx_gpu.train(xb) @@ -406,6 +414,7 @@ def test_indices_ivfflat(self): # Store values using 32-bit indices instead config.indicesOptions = faiss.INDICES_32_BIT + config.use_raft = False idx = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config) idx.train(xb) idx.add_with_ids(xb, xb_indices) @@ -430,6 +439,7 @@ def test_indices_ivfpq(self): xb_indices = (xb_indices_base + 4294967296).astype('int64') config = faiss.GpuIndexIVFPQConfig() + config.use_raft = False idx = faiss.GpuIndexIVFPQ(res, d, nlist, M, nbits, faiss.METRIC_L2, config) idx.train(xb) @@ -490,7 +500,9 @@ def test_sq_cpu_to_gpu(self): res = faiss.StandardGpuResources() index = faiss.index_factory(32, "SQfp16") index.add(np.random.rand(1000, 32).astype(np.float32)) - gpu_index = faiss.index_cpu_to_gpu(res, 0, index) + config = faiss.GpuClonerOptions() + config.use_raft = False + gpu_index = faiss.index_cpu_to_gpu(res, 0, index, config) self.assertIsInstance(gpu_index, faiss.GpuIndexFlat) diff --git a/faiss/gpu/test/test_gpu_index_ivfsq.py b/faiss/gpu/test/test_gpu_index_ivfsq.py index af56316509..09dcdae079 100755 --- a/faiss/gpu/test/test_gpu_index_ivfsq.py +++ b/faiss/gpu/test/test_gpu_index_ivfsq.py @@ -27,7 +27,9 @@ def make_indices_copy_from_cpu(nlist, d, qtype, by_residual, metric, clamp): res = faiss.StandardGpuResources() res.noTempMemory() - idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, idx_cpu) + config = faiss.GpuIndexIVFScalarQuantizerConfig() + config.use_raft = False + idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, idx_cpu, config) return idx_cpu, idx_gpu @@ -37,8 +39,10 @@ def make_indices_copy_from_gpu(nlist, d, qtype, by_residual, metric, clamp): res = faiss.StandardGpuResources() res.noTempMemory() + config = faiss.GpuIndexIVFScalarQuantizerConfig() + config.use_raft = False idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, - qtype, metric, by_residual) + qtype, metric, by_residual, config) idx_gpu.train(to_train) idx_gpu.add(to_train) @@ -63,8 +67,10 @@ def make_indices_train(nlist, d, qtype, by_residual, metric, clamp): res = faiss.StandardGpuResources() res.noTempMemory() + config = faiss.GpuIndexIVFScalarQuantizerConfig() + config.use_raft = False idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, - qtype, metric, by_residual) + qtype, metric, by_residual, config) assert(by_residual == idx_gpu.by_residual) idx_gpu.train(to_train) diff --git a/faiss/gpu/test/test_gpu_index_serialize.py b/faiss/gpu/test/test_gpu_index_serialize.py index 82cbe577c8..49e51af8b4 100644 --- a/faiss/gpu/test/test_gpu_index_serialize.py +++ b/faiss/gpu/test/test_gpu_index_serialize.py @@ -34,7 +34,9 @@ def test_serialize(self): indexes.append(faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2)) # IVFSQ - indexes.append(faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, faiss.ScalarQuantizer.QT_fp16)) + config = faiss.GpuIndexIVFScalarQuantizerConfig() + config.use_raft = False + indexes.append(faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, faiss.ScalarQuantizer.QT_fp16, faiss.METRIC_L2, True,
config)) # IVFPQ indexes.append(faiss.GpuIndexIVFPQ(res, d, nlist, 4, 8, faiss.METRIC_L2)) @@ -47,8 +49,11 @@ def test_serialize(self): ser = faiss.serialize_index(faiss.index_gpu_to_cpu(index)) cpu_index = faiss.deserialize_index(ser) - - gpu_index_restore = faiss.index_cpu_to_gpu(res, 0, cpu_index) + + gpu_cloner_options = faiss.GpuClonerOptions() + if isinstance(index, faiss.GpuIndexIVFScalarQuantizer): + gpu_cloner_options.use_raft = False + gpu_index_restore = faiss.index_cpu_to_gpu(res, 0, cpu_index, gpu_cloner_options) restore_d, restore_i = gpu_index_restore.search(query, k) diff --git a/faiss/gpu/test/test_multi_gpu.py b/faiss/gpu/test/test_multi_gpu.py index 4a63025969..e341f5715a 100644 --- a/faiss/gpu/test/test_multi_gpu.py +++ b/faiss/gpu/test/test_multi_gpu.py @@ -29,6 +29,7 @@ def test_sharded(self): co = faiss.GpuMultipleClonerOptions() co.shard = True + co.use_raft = False index = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2) index.add(xb) @@ -71,6 +72,7 @@ def do_test_sharded_ivf(self, index_key): co = faiss.GpuMultipleClonerOptions() co.shard = True co.common_ivf_quantizer = True + co.use_raft = False index = faiss.index_cpu_to_all_gpus(index, co, ngpu=2) index.quantizer # make sure there is indeed a quantizer @@ -111,6 +113,7 @@ def test_binary_clone(self, ngpu=1, shard=False): co = faiss.GpuMultipleClonerOptions() co.shard = shard + co.use_raft = False # index2 = faiss.index_cpu_to_all_gpus(index, ngpu=ngpu) res = faiss.StandardGpuResources() @@ -188,7 +191,9 @@ def do_cpu_to_gpu(self, index_key): ts.append(time.time()) res = faiss.StandardGpuResources() - gpu_index = faiss.index_cpu_to_gpu(res, 0, index) + co = faiss.GpuClonerOptions() + co.use_raft = False + gpu_index = faiss.index_cpu_to_gpu(res, 0, index, co) ts.append(time.time()) # Validate the layout of the memory info @@ -217,6 +222,7 @@ def do_cpu_to_gpu(self, index_key): res = [faiss.StandardGpuResources() for i in range(2)] co = faiss.GpuMultipleClonerOptions() co.shard = shard + co.use_raft = False gpu_index = faiss.index_cpu_to_gpu_multiple_py(res, index, co) diff --git a/faiss/gpu/utils/CopyUtils.cuh b/faiss/gpu/utils/CopyUtils.cuh index 637a46cbee..8ff600a049 100644 --- a/faiss/gpu/utils/CopyUtils.cuh +++ b/faiss/gpu/utils/CopyUtils.cuh @@ -119,6 +119,7 @@ inline void fromDevice(T* src, T* dst, size_t num, cudaStream_t stream) { if (dev == -1) { CUDA_VERIFY(cudaMemcpyAsync( dst, src, num * sizeof(T), cudaMemcpyDeviceToHost, stream)); + cudaStreamSynchronize(stream); } else { CUDA_VERIFY(cudaMemcpyAsync( dst, src, num * sizeof(T), cudaMemcpyDeviceToDevice, stream)); diff --git a/faiss/gpu/utils/RaftUtils.cu b/faiss/gpu/utils/RaftUtils.cu new file mode 100644 index 0000000000..ba40c54c26 --- /dev/null +++ b/faiss/gpu/utils/RaftUtils.cu @@ -0,0 +1,117 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace faiss { +namespace gpu { + +void validRowIndices( + GpuResources* res, + Tensor<float, 2, true>& vecs, + bool* validRows) { + idx_t n_rows = vecs.getSize(0); + idx_t dim = vecs.getSize(1); + + raft::linalg::coalescedReduction( + validRows, + vecs.data(), + dim, + n_rows, + true, + res->getDefaultStreamCurrentDevice(), + false, + [] __device__(float v, idx_t i) { return isfinite(v); }, + raft::mul_op()); +} + +idx_t inplaceGatherFilteredRows( + GpuResources* res, + Tensor<float, 2, true>& vecs, + Tensor<idx_t, 1, true>& indices) { + raft::device_resources& raft_handle = res->getRaftHandleCurrentDevice(); + idx_t n_rows = vecs.getSize(0); + idx_t dim = vecs.getSize(1); + + auto valid_rows = + raft::make_device_vector<bool, idx_t>(raft_handle, n_rows); + + validRowIndices(res, vecs, valid_rows.data_handle()); + + idx_t n_rows_valid = thrust::reduce( + raft_handle.get_thrust_policy(), + valid_rows.data_handle(), + valid_rows.data_handle() + n_rows, + 0); + + if (n_rows_valid < n_rows) { + auto gather_indices = raft::make_device_vector<idx_t, idx_t>( + raft_handle, n_rows_valid); + + auto count = thrust::make_counting_iterator(0); + + thrust::copy_if( + raft_handle.get_thrust_policy(), + count, + count + n_rows, + gather_indices.data_handle(), + [valid_rows = valid_rows.data_handle()] __device__(auto i) { + return valid_rows[i]; + }); + + raft::matrix::gather( + raft_handle, + raft::make_device_matrix_view<float, idx_t>( + vecs.data(), n_rows, dim), + raft::make_const_mdspan(gather_indices.view()), + (idx_t)16); + + auto validIndices = raft::make_device_vector<idx_t, idx_t>( + raft_handle, n_rows_valid); + + thrust::gather( + raft_handle.get_thrust_policy(), + gather_indices.data_handle(), + gather_indices.data_handle() + gather_indices.size(), + indices.data(), + validIndices.data_handle()); + thrust::copy( + raft_handle.get_thrust_policy(), + validIndices.data_handle(), + validIndices.data_handle() + n_rows_valid, + indices.data()); + } + return n_rows_valid; +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/RaftUtils.h b/faiss/gpu/utils/RaftUtils.h similarity index 73% rename from faiss/gpu/impl/RaftUtils.h rename to faiss/gpu/utils/RaftUtils.h index f1ea19ed33..4dfafa4ec5 100644 --- a/faiss/gpu/impl/RaftUtils.h +++ b/faiss/gpu/utils/RaftUtils.h @@ -23,13 +23,16 @@ #pragma once #include -#include +#include +#include + #include +#pragma GCC visibility push(default) namespace faiss { namespace gpu { -inline raft::distance::DistanceType faiss_to_raft( +inline raft::distance::DistanceType metricFaissToRaft( MetricType metric, bool exactDistance) { switch (metric) { @@ -53,5 +56,20 @@ RAFT_FAIL("Distance type not supported"); } } + +/// Identify matrix rows containing non-NaN values. validRows[i] is false if row +/// i contains a NaN value and true otherwise. +void validRowIndices( + GpuResources* res, + Tensor<float, 2, true>& vecs, + bool* validRows); + +/// Filter out matrix rows containing NaN values. The vectors and indices are +/// updated in-place.
+idx_t inplaceGatherFilteredRows( + GpuResources* res, + Tensor<float, 2, true>& vecs, + Tensor<idx_t, 1, true>& indices); } // namespace gpu } // namespace faiss +#pragma GCC visibility pop diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8522fa613d..10243b9a9c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -70,6 +70,7 @@ find_package(OpenMP REQUIRED) target_link_libraries(faiss_test PRIVATE OpenMP::OpenMP_CXX gtest_main + $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> ) # Defines `gtest_discover_tests()`.
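
Usage note: taken together, the *_Raft test cases above pin down the configuration contract of the RAFT backend: interleaved layout on, precomputed tables off, 64-bit indices only, and 4-8 bits per PQ code. A minimal Python sketch of that contract follows; it assumes a build configured with -DFAISS_ENABLE_RAFT=ON, and the dimensions, list count, and PQ parameters are illustrative rather than taken from this patch.

    import numpy as np
    import faiss

    res = faiss.StandardGpuResources()

    # Configuration invariants exercised by the *_Raft tests.
    config = faiss.GpuIndexIVFPQConfig()
    config.use_raft = True                        # select the RAFT implementation
    config.interleavedLayout = True               # RAFT uses the interleaved list layout
    config.usePrecomputedTables = False           # precomputed tables are disabled for RAFT
    config.indicesOptions = faiss.INDICES_64_BIT  # RAFT stores 64-bit indices

    d, nlist, M, nbits = 128, 256, 32, 8          # illustrative; d must be divisible by M
    index = faiss.GpuIndexIVFPQ(res, d, nlist, M, nbits, faiss.METRIC_L2, config)

    xb = np.random.rand(10000, d).astype('float32')
    index.train(xb)
    index.add(xb)
    D, I = index.search(xb[:5], 10)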
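The changes to test_gpu_index.py, test_gpu_index_serialize.py and test_multi_gpu.py show the second route into the same code path: cloning a CPU index with use_raft set on the cloner options. A sketch under the same assumption of a RAFT-enabled build; the factory string and sizes are invented for illustration, and whether the clone actually lands on the RAFT implementation is subject to the option checks in GpuCloner.cpp.

    import numpy as np
    import faiss

    # Build and populate a CPU IVFPQ index, then clone it to GPU 0.
    d, nb = 64, 20000
    cpu_index = faiss.index_factory(d, "IVF128,PQ8")
    xb = np.random.rand(nb, d).astype('float32')
    cpu_index.train(xb)
    cpu_index.add(xb)

    res = faiss.StandardGpuResources()
    co = faiss.GpuClonerOptions()
    co.use_raft = True  # False falls back to the classical GPU kernels

    gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index, co)
    gpu_index.nprobe = 10
    D, I = gpu_index.search(xb[:5], 10)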
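Finally, validRowIndices and inplaceGatherFilteredRows above give the RAFT backends their NaN handling: rows containing a non-finite value are compacted out of the batch, together with their ids, before anything reaches the IVF lists, which is the behavior AddNaN_Raft exercises. The sketch below illustrates the expected observable effect from Python; the final ntotal is an assumption about that filtering, not something this patch asserts for user code.

    import numpy as np
    import faiss

    res = faiss.StandardGpuResources()
    config = faiss.GpuIndexIVFFlatConfig()
    config.use_raft = True

    d, nlist = 32, 16
    index = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)

    xb = np.random.rand(1000, d).astype('float32')
    index.train(xb)

    # Poison a few rows; inplaceGatherFilteredRows should compact them away
    # so that only finite rows are appended to the IVF lists.
    xb[:10, 0] = np.nan
    index.add(xb)
    print(index.ntotal)  # expected 990 if the 10 NaN rows were filtered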