From 27b1055cc6c9a70b8822ec5dbdd54b43b69a47e1 Mon Sep 17 00:00:00 2001
From: Tarang Jain
Date: Wed, 21 Feb 2024 06:41:08 -0800
Subject: [PATCH] Integrate IVF-PQ from RAFT (#3044)

Summary:
Imports the changes from https://github.com/facebookresearch/faiss/issues/3133 and https://github.com/facebookresearch/faiss/issues/3171, so this single PR contains all of the changes together.

- [x] Implement the RaftIVFPQ class
- [x] Update gtests to test correctness with RAFT enabled
- [x] All googleTests for RAFT enabled IVFPQ pass
- [x] Move some functions common to RaftIVFFlat and RaftIVFPQ into a helper: RaftUtils.h
- [x] Update the quantizer retroactively after building the RAFT index -- both IVFFlat and IVFPQ
- [x] Resolve the failing LargeBatch test (classical GPU)
- [x] Add checks for Pascal deprecation
- [x] Apply the RMM changes from https://github.com/facebookresearch/faiss/issues/3171
- [x] Apply robertmaynard's changes from https://github.com/facebookresearch/faiss/issues/3133

Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3044

Reviewed By: junjieqi

Differential Revision: D51074065

Pulled By: algoriddle

fbshipit-source-id: 6871257921bcaff2064a20637e2ed358acbdc363
---
 CMakeLists.txt | 6 +-
 benchs/bench_ivfflat_raft.py | 193 ++++++++
 benchs/bench_ivfpq_raft.py | 168 +++++++
 cmake/thirdparty/fetch_rapids.cmake | 2 +-
 faiss/gpu/CMakeLists.txt | 31 +-
 faiss/gpu/GpuCloner.cpp | 7 +-
 faiss/gpu/GpuClonerOptions.h | 4 +
 faiss/gpu/GpuDistance.cu | 21 +-
 faiss/gpu/GpuDistance.h | 7 +
 faiss/gpu/GpuIndex.cu | 15 +-
 faiss/gpu/GpuIndex.h | 8 +
 faiss/gpu/GpuIndexFlat.cu | 10 +-
 faiss/gpu/GpuIndexIVF.cu | 51 +-
 faiss/gpu/GpuIndexIVFFlat.cu | 200 +++++---
 faiss/gpu/GpuIndexIVFFlat.h | 4 +-
 faiss/gpu/GpuIndexIVFPQ.cu | 313 +++++++++---
 faiss/gpu/GpuIndexIVFPQ.h | 19 +-
 faiss/gpu/GpuResources.h | 5 +
 faiss/gpu/StandardGpuResources.cpp | 39 +-
 faiss/gpu/StandardGpuResources.h | 11 +-
 faiss/gpu/impl/IVFBase.cuh | 4 +-
 faiss/gpu/impl/IVFPQ.cuh | 4 +-
 faiss/gpu/impl/RaftFlatIndex.cu | 4 +-
 faiss/gpu/impl/RaftFlatIndex.cuh | 2 +
 faiss/gpu/impl/RaftIVFFlat.cu | 250 ++++------
 faiss/gpu/impl/RaftIVFFlat.cuh | 23 +-
 faiss/gpu/impl/RaftIVFPQ.cu | 546 +++++++++++++++++++++
 faiss/gpu/impl/RaftIVFPQ.cuh | 150 ++++++
 faiss/gpu/test/TestGpuIndexFlat.cpp | 1 +
 faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 51 +-
 faiss/gpu/test/TestGpuIndexIVFPQ.cpp | 507 +++++++++++--------
 faiss/gpu/test/TestGpuMemoryException.cpp | 1 +
 faiss/gpu/test/test_gpu_index.py | 24 +-
 faiss/gpu/test/test_gpu_index_ivfsq.py | 12 +-
 faiss/gpu/test/test_gpu_index_serialize.py | 11 +-
 faiss/gpu/test/test_multi_gpu.py | 8 +-
 faiss/gpu/utils/CopyUtils.cuh | 1 +
 faiss/gpu/utils/RaftUtils.cu | 117 +++++
 faiss/gpu/{impl => utils}/RaftUtils.h | 22 +-
 tests/CMakeLists.txt | 1 +
 40 files changed, 2199 insertions(+), 654 deletions(-)
 create mode 100644 benchs/bench_ivfflat_raft.py
 create mode 100644 benchs/bench_ivfpq_raft.py
 create mode 100644 faiss/gpu/impl/RaftIVFPQ.cu
 create mode 100644 faiss/gpu/impl/RaftIVFPQ.cuh
 create mode 100644 faiss/gpu/utils/RaftUtils.cu
 rename faiss/gpu/{impl => utils}/RaftUtils.h (73%)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 85c8a820bc..445b39d59e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,9 +62,9 @@ if(FAISS_ENABLE_GPU)
   enable_language(CUDA)
 endif()

-if(FAISS_ENABLE_RAFT)
-  find_package(raft COMPONENTS compiled distributed)
-endif()
+if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
+  find_package(raft COMPONENTS compiled distributed)
+endif()

 add_subdirectory(faiss)
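With this patch, `use_raft` defaults to true whenever faiss is compiled with RAFT support, so cloning a CPU index to the GPU dispatches to the RAFT implementation automatically. A minimal sketch of exercising the new IVF-PQ path (the dataset shape, nlist, and PQ parameters here are hypothetical; the cloner options mirror the benchmark scripts added below):

import numpy as np
import faiss

res = faiss.StandardGpuResources()

# hypothetical data: d must be divisible by the number of sub-quantizers (M)
xb = np.random.rand(100000, 128).astype('float32')

# 1024 IVF lists, M = 32 sub-quantizers, 8 bits per code
index = faiss.index_factory(128, "IVF1024,PQ32x8np")

co = faiss.GpuMultipleClonerOptions()
co.use_raft = True  # explicit here; this is the default in a RAFT build
index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)

index_gpu.train(xb)
index_gpu.add(xb)
D, I = index_gpu.search(xb[:10], 5)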
diff --git a/benchs/bench_ivfflat_raft.py b/benchs/bench_ivfflat_raft.py
new file mode 100644
index 0000000000..9ebfcb3422
--- /dev/null
+++ b/benchs/bench_ivfflat_raft.py
@@ -0,0 +1,193 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import faiss
+import time
+import argparse
+import rmm
+
+######################################################
+# Command-line parsing
+######################################################
+
+parser = argparse.ArgumentParser()
+
+
+def aa(*args, **kwargs):
+    group.add_argument(*args, **kwargs)
+
+
+group = parser.add_argument_group('benchmarking options')
+
+aa('--bm_train', default=False, action='store_true',
+   help='whether to benchmark train operation on GPU index')
+aa('--bm_add', default=False, action='store_true',
+   help='whether to benchmark add operation on GPU index')
+aa('--bm_search', default=True, action='store_true',
+   help='whether to benchmark search operation on GPU index')
+aa('--raft_only', default=False, action='store_true',
+   help='whether to only produce RAFT enabled benchmarks')
+
+
+group = parser.add_argument_group('IVF options')
+aa('--n_centroids', default=256, type=int,
+   help="number of IVF centroids")
+
+
+group = parser.add_argument_group('searching')
+
+aa('--k', default=100, type=int, help='nb of nearest neighbors')
+aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')
+
+args = parser.parse_args()
+
+print("args:", args)
+
+rs = np.random.RandomState(123)
+
+res = faiss.StandardGpuResources()
+
+# Use an RMM pool memory resource for device allocations
+mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
+rmm.mr.set_current_device_resource(mr)
+
+
+def bench_train_milliseconds(index, trainVecs, use_raft):
+    co = faiss.GpuMultipleClonerOptions()
+    co.use_raft = use_raft
+    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
+    t0 = time.time()
+    index_gpu.train(trainVecs)
+    return 1000*(time.time() - t0)
+
+
+if args.bm_train:
+    print("=" * 40)
+    print("GPU Train Benchmarks")
+    print("=" * 40)
+    trainset_sizes = [5000, 10000, 100000, 1000000, 5000000]
+    dataset_dims = [128, 256, 1024]
+    for n_rows in trainset_sizes:
+        for n_cols in dataset_dims:
+            index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
+            trainVecs = rs.rand(n_rows, n_cols).astype('float32')
+            raft_gpu_train_time = bench_train_milliseconds(
+                index, trainVecs, True)
+            if args.raft_only:
+                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, raft_gpu_train_time))
+            else:
+                classical_gpu_train_time = bench_train_milliseconds(
+                    index, trainVecs, False)
+                print("Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, classical_gpu_train_time, raft_gpu_train_time))
+
+
+def bench_add_milliseconds(index, addVecs, use_raft):
+    co = faiss.GpuMultipleClonerOptions()
+    co.use_raft = use_raft
+    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
+    index_gpu.copyFrom(index)
+    t0 = time.time()
+    index_gpu.add(addVecs)
+    return 1000*(time.time() - t0)
+
+
+if args.bm_add:
+    print("=" * 40)
+    print("GPU Add Benchmarks")
+    print("=" * 40)
+    addset_sizes = [5000, 10000, 100000, 1000000]
+    dataset_dims = [128, 256, 1024]
+    n_train = 10000
+    # train one index per dimensionality, then benchmark adds of each size
+    for n_cols in dataset_dims:
+        trainVecs = rs.rand(n_train, n_cols).astype('float32')
+        index = faiss.index_factory(
+            n_cols, "IVF" + str(args.n_centroids) + ",Flat")
+        index.train(trainVecs)
+        for n_rows in addset_sizes:
+            addVecs = rs.rand(n_rows, n_cols).astype('float32')
+            raft_gpu_add_time = bench_add_milliseconds(index, addVecs, True)
+            if args.raft_only:
+                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, RAFT enabled GPU add time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, raft_gpu_add_time))
+            else:
+                classical_gpu_add_time = bench_add_milliseconds(
+                    index, addVecs, False)
+                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_rows, classical_gpu_add_time, raft_gpu_add_time))
+
+
+def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
+    co = faiss.GpuMultipleClonerOptions()
+    co.use_raft = use_raft
+    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
+    index_gpu.copyFrom(index)
+    index_gpu.add(addVecs)
+    index_gpu.nprobe = nprobe
+    t0 = time.time()
+    index_gpu.search(queryVecs, k)
+    return 1000*(time.time() - t0)
+
+
+if args.bm_search:
+    print("=" * 40)
+    print("GPU Search Benchmarks")
+    print("=" * 40)
+    queryset_sizes = [5000, 10000, 100000, 500000]
+    n_train = 10000
+    n_add = 100000
+    search_bm_dims = [8, 16, 32]
+    for n_cols in search_bm_dims:
+        index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
+        trainVecs = rs.rand(n_train, n_cols).astype('float32')
+        index.train(trainVecs)
+        addVecs = rs.rand(n_add, n_cols).astype('float32')
+        for n_rows in queryset_sizes:
+            queryVecs = rs.rand(n_rows, n_cols).astype('float32')
+            raft_gpu_search_time = bench_search_milliseconds(
+                index, addVecs, queryVecs, args.nprobe, args.k, True)
+            if args.raft_only:
+                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
+            else:
+                classical_gpu_search_time = bench_search_milliseconds(
+                    index, addVecs, queryVecs, args.nprobe, args.k, False)
+                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
+                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))
+
+    print("=" * 40)
+    print("Large RAFT Enabled Benchmarks")
+    print("=" * 40)
+    # Skip the classical GPU benchmarks for these large datasets: they run
+    # out of memory beyond 500000 queries, at large dims, and at large k
+    queryset_sizes = [100000, 500000, 1000000]
+    large_search_bm_dims = [128, 256, 1024]
+    for n_cols in large_search_bm_dims:
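+        # n_train and n_add carry over from the standard search benchmarks
+        # above: 10000 training and 100000 database vectors per dimensionality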
+        trainVecs = rs.rand(n_train, n_cols).astype('float32')
+        index = faiss.index_factory(
+            n_cols, "IVF" + str(args.n_centroids) + ",Flat")
+        index.train(trainVecs)
+        addVecs = rs.rand(n_add, n_cols).astype('float32')
+        for n_rows in queryset_sizes:
+            queryVecs = rs.rand(n_rows, n_cols).astype('float32')
+            raft_gpu_search_time = bench_search_milliseconds(
+                index, addVecs, queryVecs, args.nprobe, args.k, True)
+            print("Method: IVFFlat, Operation: SEARCH, numTrain: %d, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
+                n_train, n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
diff --git a/benchs/bench_ivfpq_raft.py b/benchs/bench_ivfpq_raft.py
new file mode 100644
index 0000000000..3494a18741
--- /dev/null
+++ b/benchs/bench_ivfpq_raft.py
@@ -0,0 +1,168 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+#
+# Copyright (c) 2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import numpy as np
+import faiss
+import time
+import argparse
+import rmm
+
+from datasets import load_sift1M, evaluate
+
+######################################################
+# Command-line parsing
+######################################################
+
+parser = argparse.ArgumentParser()
+
+
+def aa(*args, **kwargs):
+    group.add_argument(*args, **kwargs)
+
+
+group = parser.add_argument_group('benchmarking options')
+aa('--bm_search', default=True,
+   help='whether to benchmark search operation on GPU index')
+aa('--raft_only', default=False, action='store_true',
+   help='whether to only produce RAFT enabled benchmarks')
+
+group = parser.add_argument_group('IVF options')
+aa('--bits_per_code', default=8, type=int, help='bits per code. Note that < 8 is only supported when RAFT is enabled')
+aa('--pq_len', default=2, type=int, help='number of vector elements represented by one PQ code')
+aa('--use_precomputed', default=True, type=bool, help='use precomputed codes (not with RAFT enabled)')
+
+group = parser.add_argument_group('searching')
+aa('--k', default=10, type=int, help='nb of nearest neighbors')
+aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')
+
+args = parser.parse_args()
+
+print("args:", args)
+
+print("load data")
+xb, xq, xt, gt = load_sift1M()
+
+rs = np.random.RandomState(123)
+
+res = faiss.StandardGpuResources()
+
+# Use an RMM pool memory resource for device allocations
+mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
+rmm.mr.set_current_device_resource(mr)
+
+
+# A heuristic to select a suitable number of lists
+def compute_nlist(numVecs):
+    nlist = np.sqrt(numVecs)
+    if (numVecs / nlist < 1000):
+        nlist = numVecs / 1000
+    return int(nlist)
+
+
+def bench_train_milliseconds(index, trainVecs, use_raft):
+    co = faiss.GpuMultipleClonerOptions()
+    # use float 16 lookup tables to save space
+    co.useFloat16LookupTables = True
+    co.use_raft = use_raft
+    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
+    t0 = time.time()
+    index_gpu.train(trainVecs)
+    return 1000*(time.time() - t0)
+
+
+n_rows, n_cols = xb.shape
+n_train, _ = xt.shape
+M = n_cols // args.pq_len
+nlist = compute_nlist(n_rows)
+index = faiss.index_factory(n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code))
+
+print("=" * 40)
+print("GPU Train Benchmarks")
+print("=" * 40)
+raft_gpu_train_time = bench_train_milliseconds(index, xt, True)
+if args.raft_only:
+    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, RAFT enabled GPU train time: %.3f milliseconds" % (
+        n_cols, nlist, M, args.bits_per_code, n_train, raft_gpu_train_time))
+else:
+    classical_gpu_train_time = bench_train_milliseconds(
+        index, xt, False)
+    print("Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
+        n_cols, nlist, M, args.bits_per_code, n_train, classical_gpu_train_time, raft_gpu_train_time))
+
+
+def bench_add_milliseconds(index, addVecs, use_raft):
+    co = faiss.GpuMultipleClonerOptions()
+    # use float 16 lookup tables to save space
+    co.useFloat16LookupTables = True
+    co.use_raft = use_raft
+    index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co)
+    index_gpu.copyFrom(index)
+    t0 = time.time()
+    index_gpu.add(addVecs)
+    return 1000*(time.time() - t0)
+
+
+print("=" * 40)
+print("GPU Add Benchmarks")
+print("=" * 40)
+index.train(xt)
+raft_gpu_add_time = bench_add_milliseconds(index, xb, True)
+if args.raft_only:
+    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, RAFT enabled GPU add time: %.3f milliseconds" % (
+        n_cols, nlist, M, args.bits_per_code, n_rows, raft_gpu_add_time))
+else:
+    classical_gpu_add_time = bench_add_milliseconds(
+        index, xb, False)
+    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
+        n_cols, nlist, M, args.bits_per_code, n_rows, classical_gpu_add_time, raft_gpu_add_time))
+
+
+def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
+    co = faiss.GpuMultipleClonerOptions()
+    co.use_raft = use_raft
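+    # use float 16 lookup tables to save space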
co.useFloat16LookupTables = True + index_gpu = faiss.index_cpu_to_gpu(res, 0, index, co) + index_gpu.copyFrom(index) + index_gpu.add(addVecs) + index_gpu.nprobe = nprobe + t0 = time.time() + index_gpu.search(queryVecs, k) + return 1000*(time.time() - t0) + + +if args.bm_search: + print("=" * 40) + print("GPU Search Benchmarks") + print("=" * 40) + queryset_sizes = [1, 10, 100, 1000, 10000] + n_train, n_cols = xt.shape + n_add, _ = xb.shape + print(xq.shape) + M = n_cols // args.pq_len + nlist = compute_nlist(n_add) + index = faiss.index_factory(n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code)) + index.train(xt) + for n_rows in queryset_sizes: + queryVecs = xq[np.random.choice(xq.shape[0], n_rows, replace=False)] + raft_gpu_search_time = bench_search_milliseconds( + index, xb, queryVecs, args.nprobe, args.k, True) + if args.raft_only: + print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % ( + n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time)) + else: + classical_gpu_search_time = bench_search_milliseconds( + index, xb, queryVecs, args.nprobe, args.k, False) + print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % ( + n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time)) \ No newline at end of file diff --git a/cmake/thirdparty/fetch_rapids.cmake b/cmake/thirdparty/fetch_rapids.cmake index 32ec15b6a4..3e0f6b6ac4 100644 --- a/cmake/thirdparty/fetch_rapids.cmake +++ b/cmake/thirdparty/fetch_rapids.cmake @@ -15,7 +15,7 @@ # or implied. See the License for the specific language governing permissions and limitations under # the License. # ============================================================================= -set(RAPIDS_VERSION "23.12") +set(RAPIDS_VERSION "24.02") if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake) file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 713d3a287c..126cbe5044 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -238,12 +238,15 @@ generate_ivf_interleaved_code() if(FAISS_ENABLE_RAFT) list(APPEND FAISS_GPU_HEADERS - impl/RaftUtils.h + impl/RaftFlatIndex.cuh impl/RaftIVFFlat.cuh - impl/RaftFlatIndex.cuh) + impl/RaftIVFPQ.cuh + utils/RaftUtils.h) list(APPEND FAISS_GPU_SRC impl/RaftFlatIndex.cu - impl/RaftIVFFlat.cu) + impl/RaftIVFFlat.cu + impl/RaftIVFPQ.cu + utils/RaftUtils.cu) endif() add_library(faiss_gpu STATIC ${FAISS_GPU_SRC}) @@ -258,6 +261,28 @@ if(FAISS_ENABLE_RAFT) target_compile_definitions(faiss PUBLIC USE_NVIDIA_RAFT=1) target_compile_definitions(faiss_avx2 PUBLIC USE_NVIDIA_RAFT=1) target_compile_definitions(faiss_avx512 PUBLIC USE_NVIDIA_RAFT=1) + + # Mark all functions as hidden so that we don't generate + # global 'public' functions that also exist in libraft.so + # + # This ensures that faiss functions will call the local version + # inside libfaiss.so . 
This is needed to ensure that things + # like raft cublas resources are created and used within the same + # dynamic library + CUDA runtime context which are requirements + # for valid execution + # + # To still allow these classes to be used by consumers, the + # respective classes/types in the headers are explicitly marked + # as 'public' so they can be used by consumers + set_source_files_properties( + GpuDistance.cu + StandardGpuResources.cpp + impl/RaftFlatIndex.cu + impl/RaftIVFFlat.cu + impl/RaftIVFPQ.cu + utils/RaftUtils.cu + TARGET_DIRECTORY faiss + PROPERTIES COMPILE_OPTIONS "-fvisibility=hidden") target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_RAFT=1) endif() diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp index 4dc51f9e83..20583720f3 100644 --- a/faiss/gpu/GpuCloner.cpp +++ b/faiss/gpu/GpuCloner.cpp @@ -152,8 +152,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) { config.device = device; config.indicesOptions = indicesOptions; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - FAISS_THROW_IF_NOT_MSG( - !use_raft, "this type of index is not implemented for RAFT"); + config.use_raft = use_raft; GpuIndexIVFFlat* res = new GpuIndexIVFFlat( provider, ifl->d, ifl->nlist, ifl->metric_type, config); @@ -204,8 +203,8 @@ Index* ToGpuCloner::clone_Index(const Index* index) { config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.useFloat16LookupTables = useFloat16; config.usePrecomputedTables = usePrecomputed; - FAISS_THROW_IF_NOT_MSG( - !use_raft, "this type of index is not implemented for RAFT"); + config.use_raft = use_raft; + config.interleavedLayout = use_raft; GpuIndexIVFPQ* res = new GpuIndexIVFPQ(provider, ipq, config); diff --git a/faiss/gpu/GpuClonerOptions.h b/faiss/gpu/GpuClonerOptions.h index fbde4c4ea4..197e09dc88 100644 --- a/faiss/gpu/GpuClonerOptions.h +++ b/faiss/gpu/GpuClonerOptions.h @@ -38,7 +38,11 @@ struct GpuClonerOptions { bool verbose = false; /// use the RAFT implementation +#if defined USE_NVIDIA_RAFT + bool use_raft = true; +#else bool use_raft = false; +#endif }; struct GpuMultipleClonerOptions : public GpuClonerOptions { diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index c363aa4bb8..a235404b14 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -31,7 +31,7 @@ #include #if defined USE_NVIDIA_RAFT -#include +#include #include #include #include @@ -51,6 +51,17 @@ using namespace raft::distance; using namespace raft::neighbors; #endif +bool should_use_raft(GpuDistanceParams args) { + cudaDeviceProp prop; + int dev = args.device >= 0 ? 
args.device : getCurrentDevice(); + cudaGetDeviceProperties(&prop, dev); + + if (prop.major < 7) + return false; + + return args.use_raft; +} + template void bfKnnConvert(GpuResourcesProvider* prov, const GpuDistanceParams& args) { // Validate the input data @@ -228,8 +239,8 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { #if defined USE_NVIDIA_RAFT // Note: For now, RAFT bfknn requires queries and vectors to be same layout - if (args.use_raft && args.queriesRowMajor == args.vectorsRowMajor) { - DistanceType distance = faiss_to_raft(args.metric, false); + if (should_use_raft(args) && args.queriesRowMajor == args.vectorsRowMajor) { + DistanceType distance = metricFaissToRaft(args.metric, false); auto resImpl = prov->getResources(); auto res = resImpl.get(); @@ -349,9 +360,9 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { RAFT_LOG_INFO("All synced."); } else #else - if (args.use_raft) { + if (should_use_raft(args)) { FAISS_THROW_IF_NOT_MSG( - !args.use_raft, + !should_use_raft(args), "RAFT has not been compiled into the current version so it cannot be used."); } else #endif diff --git a/faiss/gpu/GpuDistance.h b/faiss/gpu/GpuDistance.h index c0dde7fd8c..17dbee617b 100644 --- a/faiss/gpu/GpuDistance.h +++ b/faiss/gpu/GpuDistance.h @@ -9,6 +9,7 @@ #include +#pragma GCC visibility push(default) namespace faiss { namespace gpu { @@ -106,9 +107,14 @@ struct GpuDistanceParams { int device = -1; /// Should the index dispatch down to RAFT? + /// TODO: change default to true if RAFT is enabled bool use_raft = false; }; +/// A function that determines whether RAFT should be used based on various +/// conditions (such as unsupported architecture) +bool should_use_raft(GpuDistanceParams args); + /// A wrapper for gpu/impl/Distance.cuh to expose direct brute-force k-nearest /// neighbor searches on an externally-provided region of memory (e.g., from a /// pytorch tensor). @@ -168,3 +174,4 @@ void bruteForceKnn( } // namespace gpu } // namespace faiss +#pragma GCC visibility pop diff --git a/faiss/gpu/GpuIndex.cu b/faiss/gpu/GpuIndex.cu index 89952b1121..d667ae1494 100644 --- a/faiss/gpu/GpuIndex.cu +++ b/faiss/gpu/GpuIndex.cu @@ -42,6 +42,16 @@ constexpr idx_t kAddVecSize = (idx_t)512 * 1024; // FIXME: parameterize based on algorithm need constexpr idx_t kSearchVecSize = (idx_t)32 * 1024; +bool should_use_raft(GpuIndexConfig config_) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, config_.device); + + if (prop.major < 7) + return false; + + return config_.use_raft; +} + GpuIndex::GpuIndex( std::shared_ptr resources, int dims, @@ -64,7 +74,7 @@ GpuIndex::GpuIndex( (config_.memorySpace == MemorySpace::Unified && getFullUnifiedMemSupport(config_.device)), "Device %d does not support full CUDA 8 Unified Memory (CC 6.0+)", - config.device); + config_.device); metric_arg = metricArg; @@ -132,7 +142,8 @@ void GpuIndex::addPaged_(idx_t n, const float* x, const idx_t* ids) { if (n > 0) { idx_t totalSize = n * this->d * sizeof(float); - if (totalSize > kAddPageSize || n > kAddVecSize) { + if (!should_use_raft(config_) && + (totalSize > kAddPageSize || n > kAddVecSize)) { // How many vectors fit into kAddPageSize? 
idx_t maxNumVecsForPageSize = kAddPageSize / (this->d * sizeof(float)); diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h index 629a57583d..36de98c098 100644 --- a/faiss/gpu/GpuIndex.h +++ b/faiss/gpu/GpuIndex.h @@ -38,9 +38,17 @@ struct GpuIndexConfig { MemorySpace memorySpace = MemorySpace::Device; /// Should the index dispatch down to RAFT? +#if defined USE_NVIDIA_RAFT + bool use_raft = true; +#else bool use_raft = false; +#endif }; +/// A centralized function that determines whether RAFT should +/// be used based on various conditions (such as unsupported architecture) +bool should_use_raft(GpuIndexConfig config_); + class GpuIndex : public faiss::Index { public: GpuIndex( diff --git a/faiss/gpu/GpuIndexFlat.cu b/faiss/gpu/GpuIndexFlat.cu index ef5757fbbd..d361a7182a 100644 --- a/faiss/gpu/GpuIndexFlat.cu +++ b/faiss/gpu/GpuIndexFlat.cu @@ -6,18 +6,22 @@ */ #include +#include #include #include #include #include #include #include -#include #include #include #include #include +#if defined USE_NVIDIA_RAFT +#include +#endif + namespace faiss { namespace gpu { @@ -91,7 +95,7 @@ GpuIndexFlat::~GpuIndexFlat() {} void GpuIndexFlat::resetIndex_(int dims) { #if defined USE_NVIDIA_RAFT - if (flatConfig_.use_raft) { + if (should_use_raft(config_)) { data_.reset(new RaftFlatIndex( resources_.get(), dims, @@ -99,7 +103,7 @@ void GpuIndexFlat::resetIndex_(int dims) { config_.memorySpace)); } else #else - if (flatConfig_.use_raft) { + if (should_use_raft(config_)) { FAISS_THROW_MSG( "RAFT has not been compiled into the current version so it cannot be used."); } else diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index c83008307d..0c5b8db686 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -16,11 +16,6 @@ #include #include -#if defined USE_NVIDIA_RAFT -#include -#include -#endif - namespace faiss { namespace gpu { @@ -79,9 +74,9 @@ void GpuIndexIVF::init_() { } // here we set a low # iterations because this is typically used - // for large clusterings - // (copying IndexIVF.cpp's Level1Quantizer + // for large clusterings (copying IndexIVF.cpp's Level1Quantizer cp.niter = 10; + cp.verbose = verbose; if (quantizer) { @@ -96,6 +91,7 @@ void GpuIndexIVF::init_() { GpuIndexFlatConfig config = ivfConfig_.flatConfig; // inherit our same device config.device = config_.device; + config.use_raft = config_.use_raft; if (metric_type == faiss::METRIC_L2) { quantizer = new GpuIndexFlatL2(resources_, d, config); @@ -451,43 +447,12 @@ void GpuIndexIVF::trainQuantizer_(idx_t n, const float* x) { quantizer->reset(); -#if defined USE_NVIDIA_RAFT - - if (config_.use_raft) { - const raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); - - raft::neighbors::ivf_flat::index_params raft_idx_params; - raft_idx_params.n_lists = nlist; - raft_idx_params.metric = metric_type == faiss::METRIC_L2 - ? 
raft::distance::DistanceType::L2Expanded - : raft::distance::DistanceType::InnerProduct; - raft_idx_params.add_data_on_build = false; - raft_idx_params.kmeans_trainset_fraction = 1.0; - raft_idx_params.kmeans_n_iters = cp.niter; - raft_idx_params.adaptive_centers = !cp.frozen_centroids; + // leverage the CPU-side k-means code, which works for the GPU + // flat index as well + Clustering clus(this->d, nlist, this->cp); + clus.verbose = verbose; + clus.train(n, x, *quantizer); - auto raft_index = raft::neighbors::ivf_flat::build( - raft_handle, raft_idx_params, x, n, (idx_t)d); - - raft_handle.sync_stream(); - - quantizer->train(nlist, raft_index.centers().data_handle()); - quantizer->add(nlist, raft_index.centers().data_handle()); - } else -#else - if (config_.use_raft) { - FAISS_THROW_MSG( - "RAFT has not been compiled into the current version so it cannot be used."); - } else -#endif - { - // leverage the CPU-side k-means code, which works for the GPU - // flat index as well - Clustering clus(this->d, nlist, this->cp); - clus.verbose = verbose; - clus.train(n, x, *quantizer); - } quantizer->is_trained = true; FAISS_ASSERT(quantizer->ntotal == nlist); } diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 750096e153..440b449a50 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -16,7 +16,9 @@ #include #if defined USE_NVIDIA_RAFT +#include #include +#include #endif #include @@ -70,11 +72,14 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( config), ivfFlatConfig_(config), reserveMemoryVecs_(0) { + FAISS_THROW_IF_NOT_MSG( + !should_use_raft(config), + "GpuIndexIVFFlat: RAFT does not support separate coarseQuantizer"); // We could have been passed an already trained coarse quantizer. There is // no other quantizer that we need to train, so this is sufficient if (this->is_trained) { FAISS_ASSERT(this->quantizer); - set_index_( + setIndex_( resources_.get(), this->d, this->nlist, @@ -92,56 +97,13 @@ GpuIndexIVFFlat::GpuIndexIVFFlat( GpuIndexIVFFlat::~GpuIndexIVFFlat() {} -void GpuIndexIVFFlat::set_index_( - GpuResources* resources, - int dim, - int nlist, - faiss::MetricType metric, - float metricArg, - bool useResidual, - /// Optional ScalarQuantizer - faiss::ScalarQuantizer* scalarQ, - bool interleavedLayout, - IndicesOptions indicesOptions, - MemorySpace space) { -#if defined USE_NVIDIA_RAFT +void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { + DeviceScope scope(config_.device); - if (config_.use_raft) { - index_.reset(new RaftIVFFlat( - resources, - dim, - nlist, - metric, - metricArg, - useResidual, - scalarQ, - interleavedLayout, - indicesOptions, - space)); - } else -#else - if (config_.use_raft) { + if (should_use_raft(config_)) { FAISS_THROW_MSG( - "RAFT has not been compiled into the current version so it cannot be used."); - } else -#endif - { - index_.reset(new IVFFlat( - resources, - dim, - nlist, - metric, - metricArg, - useResidual, - scalarQ, - interleavedLayout, - indicesOptions, - space)); + "Pre-allocation of IVF lists is not supported with RAFT enabled."); } -} - -void GpuIndexIVFFlat::reserveMemory(size_t numVecs) { - DeviceScope scope(config_.device); reserveMemoryVecs_ = numVecs; if (index_) { @@ -157,7 +119,11 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { // Clear out our old data index_.reset(); - baseIndex_.reset(); + + // skip base class allocations if RAFT is enabled + if (!should_use_raft(config_)) { + baseIndex_.reset(); + } // The other index might not be trained if (!index->is_trained) { @@ 
-169,7 +135,7 @@ void GpuIndexIVFFlat::copyFrom(const faiss::IndexIVFFlat* index) { FAISS_ASSERT(is_trained); // Copy our lists as well - set_index_( + setIndex_( resources_.get(), d, nlist, @@ -247,23 +213,61 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { if (this->is_trained) { FAISS_ASSERT(index_); + if (should_use_raft(config_)) { + // if RAFT is enabled, copy the IVF centroids to the RAFT index in + // case it has been reset. This is because reset clears the RAFT + // index and its centroids. + // TODO: change this once the coarse quantizer is separated from + // RAFT index + updateQuantizer(); + }; return; } FAISS_ASSERT(!index_); + if (should_use_raft(config_)) { #if defined USE_NVIDIA_RAFT - if (config_.use_raft) { - // No need to copy the data to host - trainQuantizer_(n, x); - } else + setIndex_( + resources_.get(), + this->d, + this->nlist, + this->metric_type, + this->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.interleavedLayout, + ivfFlatConfig_.indicesOptions, + config_.memorySpace); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + raft::neighbors::ivf_flat::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = metricFaissToRaft(metric_type, false); + raft_idx_params.add_data_on_build = false; + raft_idx_params.kmeans_trainset_fraction = + static_cast(cp.max_points_per_centroid * nlist) / + static_cast(n); + raft_idx_params.kmeans_n_iters = cp.niter; + + auto raftIndex_ = + std::static_pointer_cast(index_); + + raft::neighbors::ivf_flat::index raft_ivfflat_index = + raft::neighbors::ivf_flat::build( + raft_handle, raft_idx_params, x, n, (idx_t)d); + + quantizer->train(nlist, raft_ivfflat_index.centers().data_handle()); + quantizer->add(nlist, raft_ivfflat_index.centers().data_handle()); + raft_handle.sync_stream(); + + raftIndex_->setRaftIndex(std::move(raft_ivfflat_index)); #else - if (config_.use_raft) { FAISS_THROW_MSG( "RAFT has not been compiled into the current version so it cannot be used."); - } else #endif - { + } else { // FIXME: GPUize more of this // First, make sure that the data is resident on the CPU, if it is not // on the CPU, as we depend upon parts of the CPU code @@ -272,29 +276,85 @@ void GpuIndexIVFFlat::train(idx_t n, const float* x) { resources_->getDefaultStream(config_.device), {n, this->d}); trainQuantizer_(n, hostData.data()); + + setIndex_( + resources_.get(), + this->d, + this->nlist, + this->metric_type, + this->metric_arg, + false, // no residual + nullptr, // no scalar quantizer + ivfFlatConfig_.interleavedLayout, + ivfFlatConfig_.indicesOptions, + config_.memorySpace); + updateQuantizer(); } // The quantizer is now trained; construct the IVF index - set_index_( - resources_.get(), - this->d, - this->nlist, - this->metric_type, - this->metric_arg, - false, // no residual - nullptr, // no scalar quantizer - ivfFlatConfig_.interleavedLayout, - ivfFlatConfig_.indicesOptions, - config_.memorySpace); baseIndex_ = std::static_pointer_cast(index_); - updateQuantizer(); if (reserveMemoryVecs_) { - index_->reserveMemory(reserveMemoryVecs_); + if (should_use_raft(config_)) { + FAISS_THROW_MSG( + "Pre-allocation of IVF lists is not supported with RAFT enabled."); + } else + index_->reserveMemory(reserveMemoryVecs_); } this->is_trained = true; } +void GpuIndexIVFFlat::setIndex_( + GpuResources* resources, + int dim, + int nlist, + faiss::MetricType metric, + float metricArg, + bool useResidual, + /// Optional 
ScalarQuantizer + faiss::ScalarQuantizer* scalarQ, + bool interleavedLayout, + IndicesOptions indicesOptions, + MemorySpace space) { + if (should_use_raft(config_)) { +#if defined USE_NVIDIA_RAFT + FAISS_THROW_IF_NOT_MSG( + ivfFlatConfig_.indicesOptions == INDICES_64_BIT, + "RAFT only supports INDICES_64_BIT"); + if (!ivfFlatConfig_.interleavedLayout) { + fprintf(stderr, + "WARN: interleavedLayout is set to False with RAFT enabled. This will be ignored.\n"); + } + index_.reset(new RaftIVFFlat( + resources, + dim, + nlist, + metric, + metricArg, + useResidual, + scalarQ, + interleavedLayout, + indicesOptions, + space)); +#else + FAISS_THROW_MSG( + "RAFT has not been compiled into the current version so it cannot be used."); +#endif + } else { + index_.reset(new IVFFlat( + resources, + dim, + nlist, + metric, + metricArg, + useResidual, + scalarQ, + interleavedLayout, + indicesOptions, + space)); + } +} + } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index d7508feef4..678bf8e7f4 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -88,7 +88,8 @@ class GpuIndexIVFFlat : public GpuIndexIVF { void train(idx_t n, const float* x) override; protected: - void set_index_( + /// Initialize appropriate index + void setIndex_( GpuResources* resources, int dim, int nlist, @@ -101,6 +102,7 @@ class GpuIndexIVFFlat : public GpuIndexIVF { IndicesOptions indicesOptions, MemorySpace space); + protected: /// Our configuration options const GpuIndexIVFFlatConfig ivfFlatConfig_; diff --git a/faiss/gpu/GpuIndexIVFPQ.cu b/faiss/gpu/GpuIndexIVFPQ.cu index 69c4cf0556..d39f036b89 100644 --- a/faiss/gpu/GpuIndexIVFPQ.cu +++ b/faiss/gpu/GpuIndexIVFPQ.cu @@ -15,6 +15,13 @@ #include #include +#if defined USE_NVIDIA_RAFT +#include +#include +#include +#include +#endif + #include namespace faiss { @@ -87,6 +94,10 @@ GpuIndexIVFPQ::GpuIndexIVFPQ( // instance this->is_trained = false; + FAISS_THROW_IF_NOT_MSG( + !config.use_raft, + "GpuIndexIVFPQ: RAFT does not support separate coarseQuantizer"); + verifyPQSettings_(); } @@ -100,7 +111,11 @@ void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { // Clear out our old data index_.reset(); - baseIndex_.reset(); + + // skip base class allocations if RAFT is enabled + if (!should_use_raft(config_)) { + baseIndex_.reset(); + } pq = index->pq; subQuantizers_ = index->pq.M; @@ -127,7 +142,7 @@ void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { // Copy our lists as well // The product quantizer must have data in it FAISS_ASSERT(index->pq.centroids.size() > 0); - index_.reset(new IVFPQ( + setIndex_( resources_.get(), this->d, this->nlist, @@ -140,7 +155,7 @@ void GpuIndexIVFPQ::copyFrom(const faiss::IndexIVFPQ* index) { ivfpqConfig_.interleavedLayout, (float*)index->pq.centroids.data(), ivfpqConfig_.indicesOptions, - config_.memorySpace)); + config_.memorySpace); baseIndex_ = std::static_pointer_cast(index_); // Doesn't make sense to reserve memory here @@ -169,7 +184,7 @@ void GpuIndexIVFPQ::copyTo(faiss::IndexIVFPQ* index) const { // index->by_residual = true; index->use_precomputed_table = 0; - index->code_size = subQuantizers_; + index->code_size = utils::divUp(subQuantizers_ * bitsPerCode_, 8); index->pq = faiss::ProductQuantizer(this->d, subQuantizers_, bitsPerCode_); index->do_polysemous_training = false; @@ -308,6 +323,7 @@ void GpuIndexIVFPQ::trainResidualQuantizer_(idx_t n, const float* x) { try { GpuIndexFlatConfig config; config.device = ivfpqConfig_.device; + 
config.use_raft = false; GpuIndexFlatL2 pqIndex(resources_, pq.dsub, config); pq.assign_index = &pqIndex; @@ -322,29 +338,6 @@ void GpuIndexIVFPQ::trainResidualQuantizer_(idx_t n, const float* x) { // use the currently assigned clustering index pq.train(n, residuals.data()); } - - index_.reset(new IVFPQ( - resources_.get(), - this->d, - this->nlist, - metric_type, - metric_arg, - subQuantizers_, - bitsPerCode_, - ivfpqConfig_.useFloat16LookupTables, - ivfpqConfig_.useMMCodeDistance, - ivfpqConfig_.interleavedLayout, - pq.centroids.data(), - ivfpqConfig_.indicesOptions, - config_.memorySpace)); - baseIndex_ = std::static_pointer_cast(index_); - updateQuantizer(); - - if (reserveMemoryVecs_) { - index_->reserveMemory(reserveMemoryVecs_); - } - - index_->setPrecomputedCodes(quantizer, usePrecomputedTables_); } void GpuIndexIVFPQ::train(idx_t n, const float* x) { @@ -356,27 +349,179 @@ void GpuIndexIVFPQ::train(idx_t n, const float* x) { if (this->is_trained) { FAISS_ASSERT(index_); + if (should_use_raft(config_)) { + // if RAFT is enabled, copy the IVF centroids to the RAFT index in + // case it has been reset. This is because reset clears the RAFT + // index and its centroids. + // TODO: change this once the coarse quantizer is separated from + // RAFT index + updateQuantizer(); + }; return; } FAISS_ASSERT(!index_); - // FIXME: GPUize more of this - // First, make sure that the data is resident on the CPU, if it is not on - // the CPU, as we depend upon parts of the CPU code - auto hostData = toHost( - (float*)x, - resources_->getDefaultStream(config_.device), - {n, this->d}); + // RAFT does not support using an external index for assignment. Fall back + // to the classical GPU impl + if (should_use_raft(config_)) { +#if defined USE_NVIDIA_RAFT + if (pq.assign_index) { + fprintf(stderr, + "WARN: The Product Quantizer's assign_index will be ignored with RAFT enabled.\n"); + } + // first initialize the index. The PQ centroids will be updated + // retroactively. 
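+        // RAFT trains its own PQ codebooks during build; they are copied
+        // back into `pq` below so the CPU-side ProductQuantizer stays in
+        // sync with the device index.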
+ setIndex_( + resources_.get(), + this->d, + this->nlist, + metric_type, + metric_arg, + subQuantizers_, + bitsPerCode_, + ivfpqConfig_.useFloat16LookupTables, + ivfpqConfig_.useMMCodeDistance, + ivfpqConfig_.interleavedLayout, + pq.centroids.data(), + ivfpqConfig_.indicesOptions, + config_.memorySpace); + // No need to copy the data to host + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + raft::neighbors::ivf_pq::index_params raft_idx_params; + raft_idx_params.n_lists = nlist; + raft_idx_params.metric = metricFaissToRaft(metric_type, false); + raft_idx_params.kmeans_trainset_fraction = + static_cast(cp.max_points_per_centroid * nlist) / + static_cast(n); + raft_idx_params.kmeans_n_iters = cp.niter; + raft_idx_params.pq_bits = bitsPerCode_; + raft_idx_params.pq_dim = subQuantizers_; + raft_idx_params.conservative_memory_allocation = false; + raft_idx_params.add_data_on_build = false; + + auto raftIndex_ = std::static_pointer_cast(index_); + + raft::neighbors::ivf_pq::index raft_ivfpq_index = + raft::neighbors::ivf_pq::build( + raft_handle, raft_idx_params, x, n, (idx_t)d); + + auto raft_centers = raft::make_device_matrix( + raft_handle, + raft_ivfpq_index.n_lists(), + raft_ivfpq_index.dim()); + raft::neighbors::ivf_pq::helpers::extract_centers( + raft_handle, raft_ivfpq_index, raft_centers.view()); + + quantizer->train(nlist, raft_centers.data_handle()); + quantizer->add(nlist, raft_centers.data_handle()); + + raft::copy( + pq.get_centroids(0, 0), + raft_ivfpq_index.pq_centers().data_handle(), + raft_ivfpq_index.pq_centers().size(), + raft_handle.get_stream()); + raft_handle.sync_stream(); + raftIndex_->setRaftIndex(std::move(raft_ivfpq_index)); +#else + FAISS_THROW_MSG( + "RAFT has not been compiled into the current version so it cannot be used."); +#endif + } else { + // FIXME: GPUize more of this + // First, make sure that the data is resident on the CPU, if it is not + // on the CPU, as we depend upon parts of the CPU code + auto hostData = toHost( + (float*)x, + resources_->getDefaultStream(config_.device), + {n, this->d}); + + trainQuantizer_(n, hostData.data()); + trainResidualQuantizer_(n, hostData.data()); + + setIndex_( + resources_.get(), + this->d, + this->nlist, + metric_type, + metric_arg, + subQuantizers_, + bitsPerCode_, + ivfpqConfig_.useFloat16LookupTables, + ivfpqConfig_.useMMCodeDistance, + ivfpqConfig_.interleavedLayout, + pq.centroids.data(), + ivfpqConfig_.indicesOptions, + config_.memorySpace); + updateQuantizer(); + } + baseIndex_ = std::static_pointer_cast(index_); - trainQuantizer_(n, hostData.data()); - trainResidualQuantizer_(n, hostData.data()); + if (reserveMemoryVecs_) { + index_->reserveMemory(reserveMemoryVecs_); + } + + index_->setPrecomputedCodes(quantizer, usePrecomputedTables_); FAISS_ASSERT(index_); this->is_trained = true; } +void GpuIndexIVFPQ::setIndex_( + GpuResources* resources, + int dim, + idx_t nlist, + faiss::MetricType metric, + float metricArg, + int numSubQuantizers, + int bitsPerSubQuantizer, + bool useFloat16LookupTables, + bool useMMCodeDistance, + bool interleavedLayout, + float* pqCentroidData, + IndicesOptions indicesOptions, + MemorySpace space) { + if (should_use_raft(config_)) { +#if defined USE_NVIDIA_RAFT + index_.reset(new RaftIVFPQ( + resources, + dim, + nlist, + metric, + metricArg, + numSubQuantizers, + bitsPerSubQuantizer, + useFloat16LookupTables, + useMMCodeDistance, + interleavedLayout, + pqCentroidData, + indicesOptions, + space)); +#else + FAISS_THROW_MSG( + "RAFT has not 
been compiled into the current version so it cannot be used."); +#endif + } else { + index_.reset(new IVFPQ( + resources, + dim, + nlist, + metric, + metricArg, + numSubQuantizers, + bitsPerSubQuantizer, + useFloat16LookupTables, + useMMCodeDistance, + interleavedLayout, + pqCentroidData, + indicesOptions, + space)); + } +} + void GpuIndexIVFPQ::verifyPQSettings_() const { // Our implementation has these restrictions: @@ -384,28 +529,36 @@ void GpuIndexIVFPQ::verifyPQSettings_() const { FAISS_THROW_IF_NOT_MSG(nlist > 0, "nlist must be >0"); // up to a single byte per code - if (ivfpqConfig_.interleavedLayout) { + if (should_use_raft(config_)) { + if (!ivfpqConfig_.interleavedLayout) { + fprintf(stderr, + "WARN: interleavedLayout is set to False with RAFT enabled. This will be ignored.\n"); + } FAISS_THROW_IF_NOT_FMT( - bitsPerCode_ == 4 || bitsPerCode_ == 5 || bitsPerCode_ == 6 || - bitsPerCode_ == 8, - "Bits per code must be between 4, 5, 6 or 8 (passed %d)", + bitsPerCode_ >= 4 && bitsPerCode_ <= 8, + "Bits per code must be within closed range [4,8] (passed %d)", bitsPerCode_); - - } else { FAISS_THROW_IF_NOT_FMT( - bitsPerCode_ == 8, - "Bits per code must be 8 (passed %d)", - bitsPerCode_); + (bitsPerCode_ * subQuantizers_) % 8 == 0, + "`Bits per code * number of sub-quantizers must be a multiple of 8, (passed %u * %u = %u).", + bitsPerCode_, + subQuantizers_, + bitsPerCode_ * subQuantizers_); + } else { + if (ivfpqConfig_.interleavedLayout) { + FAISS_THROW_IF_NOT_FMT( + bitsPerCode_ == 4 || bitsPerCode_ == 5 || + bitsPerCode_ == 6 || bitsPerCode_ == 8, + "Bits per code must be between 4, 5, 6 or 8 (passed %d)", + bitsPerCode_); + } else { + FAISS_THROW_IF_NOT_FMT( + bitsPerCode_ == 8, + "Bits per code must be 8 (passed %d)", + bitsPerCode_); + } } - // Sub-quantizers must evenly divide dimensions available - FAISS_THROW_IF_NOT_FMT( - this->d % subQuantizers_ == 0, - "Number of sub-quantizers (%d) must be an " - "even divisor of the number of dimensions (%d)", - subQuantizers_, - this->d); - // The number of bytes per encoded vector must be one we support FAISS_THROW_IF_NOT_FMT( ivfpqConfig_.interleavedLayout || @@ -414,30 +567,40 @@ void GpuIndexIVFPQ::verifyPQSettings_() const { "is not supported", subQuantizers_); - // We must have enough shared memory on the current device to store - // our lookup distances - int lookupTableSize = sizeof(float); - if (ivfpqConfig_.useFloat16LookupTables) { - lookupTableSize = sizeof(half); - } + if (!should_use_raft(config_)) { + // Sub-quantizers must evenly divide dimensions available + FAISS_THROW_IF_NOT_FMT( + this->d % subQuantizers_ == 0, + "Number of sub-quantizers (%d) must be an " + "even divisor of the number of dimensions (%d)", + subQuantizers_, + this->d); + + // We must have enough shared memory on the current device to store + // our lookup distances + int lookupTableSize = sizeof(float); + if (ivfpqConfig_.useFloat16LookupTables) { + lookupTableSize = sizeof(half); + } - // 64 bytes per code is only supported with usage of float16, at 2^8 - // codes per subquantizer - size_t requiredSmemSize = - lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_); - size_t smemPerBlock = getMaxSharedMemPerBlock(config_.device); + // 64 bytes per code is only supported with usage of float16, at 2^8 + // codes per subquantizer + size_t requiredSmemSize = + lookupTableSize * subQuantizers_ * utils::pow2(bitsPerCode_); + size_t smemPerBlock = getMaxSharedMemPerBlock(config_.device); - FAISS_THROW_IF_NOT_FMT( - requiredSmemSize <= 
getMaxSharedMemPerBlock(config_.device), - "Device %d has %zu bytes of shared memory, while " - "%d bits per code and %d sub-quantizers requires %zu " - "bytes. Consider useFloat16LookupTables and/or " - "reduce parameters", - config_.device, - smemPerBlock, - bitsPerCode_, - subQuantizers_, - requiredSmemSize); + FAISS_THROW_IF_NOT_FMT( + requiredSmemSize <= getMaxSharedMemPerBlock(config_.device), + "Device %d has %zu bytes of shared memory, while " + "%d bits per code and %d sub-quantizers requires %zu " + "bytes. Consider useFloat16LookupTables and/or " + "reduce parameters", + config_.device, + smemPerBlock, + bitsPerCode_, + subQuantizers_, + requiredSmemSize); + } } } // namespace gpu diff --git a/faiss/gpu/GpuIndexIVFPQ.h b/faiss/gpu/GpuIndexIVFPQ.h index 22e9961675..1084d4d0d2 100644 --- a/faiss/gpu/GpuIndexIVFPQ.h +++ b/faiss/gpu/GpuIndexIVFPQ.h @@ -33,7 +33,8 @@ struct GpuIndexIVFPQConfig : public GpuIndexIVFConfig { bool usePrecomputedTables = false; /// Use the alternative memory layout for the IVF lists - /// WARNING: this is a feature under development, do not use! + /// WARNING: this is a feature under development, and is only supported with + /// RAFT enabled for the index. Do not use if RAFT is not enabled. bool interleavedLayout = false; /// Use GEMM-backed computation of PQ code distances for the no precomputed @@ -133,6 +134,22 @@ class GpuIndexIVFPQ : public GpuIndexIVF { ProductQuantizer pq; protected: + /// Initialize appropriate index + void setIndex_( + GpuResources* resources, + int dim, + idx_t nlist, + faiss::MetricType metric, + float metricArg, + int numSubQuantizers, + int bitsPerSubQuantizer, + bool useFloat16LookupTables, + bool useMMCodeDistance, + bool interleavedLayout, + float* pqCentroidData, + IndicesOptions indicesOptions, + MemorySpace space); + /// Throws errors if configuration settings are improper void verifyPQSettings_() const; diff --git a/faiss/gpu/GpuResources.h b/faiss/gpu/GpuResources.h index 7d0459955b..fc6dd591b4 100644 --- a/faiss/gpu/GpuResources.h +++ b/faiss/gpu/GpuResources.h @@ -32,6 +32,7 @@ #if defined USE_NVIDIA_RAFT #include +#include #endif namespace faiss { @@ -159,6 +160,10 @@ struct AllocRequest : public AllocInfo { /// The size in bytes of the allocation size_t size = 0; + +#if defined USE_NVIDIA_RAFT + rmm::mr::device_memory_resource* mr = nullptr; +#endif }; /// A RAII object that manages a temporary memory request diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index 754025d049..004f80a27e 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -22,11 +22,10 @@ #if defined USE_NVIDIA_RAFT #include -#include #include +#include #include #include - #endif #include @@ -92,9 +91,8 @@ std::string allocsToString(const std::unordered_map& map) { StandardGpuResourcesImpl::StandardGpuResourcesImpl() : #if defined USE_NVIDIA_RAFT - cmr(new rmm::mr::cuda_memory_resource), - mmr(new rmm::mr::managed_memory_resource), - pmr(new rmm::mr::pinned_memory_resource), + mmr_(new rmm::mr::managed_memory_resource), + pmr_(new rmm::mr::pinned_memory_resource), #endif pinnedMemAlloc_(nullptr), pinnedMemAllocSize_(0), @@ -161,7 +159,7 @@ StandardGpuResourcesImpl::~StandardGpuResourcesImpl() { if (pinnedMemAlloc_) { #if defined USE_NVIDIA_RAFT - pmr->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_); + pmr_->deallocate(pinnedMemAlloc_, pinnedMemAllocSize_); #else auto err = cudaFreeHost(pinnedMemAlloc_); FAISS_ASSERT_FMT( @@ -314,7 +312,7 @@ void 
StandardGpuResourcesImpl::initializeForDevice(int device) { // pinned memory allocation if (defaultStreams_.empty() && pinnedMemSize_ > 0) { try { - pinnedMemAlloc_ = pmr->allocate(pinnedMemSize_); + pinnedMemAlloc_ = pmr_->allocate(pinnedMemSize_); } catch (const std::bad_alloc& rmm_ex) { FAISS_THROW_MSG("CUDA memory allocation error"); } @@ -478,8 +476,6 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) { void* p = nullptr; if (adjReq.space == MemorySpace::Temporary) { - // If we don't have enough space in our temporary memory manager, we - // need to allocate this request separately auto& tempMem = tempMemory_[adjReq.device]; if (adjReq.size > tempMem->getSizeAvailable()) { @@ -500,11 +496,14 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) { // Otherwise, we can handle this locally p = tempMemory_[adjReq.device]->allocMemory(adjReq.stream, adjReq.size); - } else if (adjReq.space == MemorySpace::Device) { #if defined USE_NVIDIA_RAFT try { - p = cmr->allocate(adjReq.size, adjReq.stream); + rmm::mr::device_memory_resource* current_mr = + rmm::mr::get_per_device_resource( + rmm::cuda_device_id{adjReq.device}); + p = current_mr->allocate_async(adjReq.size, adjReq.stream); + adjReq.mr = current_mr; } catch (const std::bad_alloc& rmm_ex) { FAISS_THROW_MSG("CUDA memory allocation error"); } @@ -514,8 +513,8 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) { // Throw if we fail to allocate if (err != cudaSuccess) { // FIXME: as of CUDA 11, a memory allocation error appears to be - // presented via cudaGetLastError as well, and needs to be cleared. - // Just call the function to clear it + // presented via cudaGetLastError as well, and needs to be + // cleared. Just call the function to clear it cudaGetLastError(); std::stringstream ss; @@ -534,7 +533,12 @@ void* StandardGpuResourcesImpl::allocMemory(const AllocRequest& req) { } else if (adjReq.space == MemorySpace::Unified) { #if defined USE_NVIDIA_RAFT try { - p = mmr->allocate(adjReq.size, adjReq.stream); + // for now, use our own managed MR to do Unified Memory allocations. + // TODO: change this to use the current device resource once RMM has + // a way to retrieve a "guaranteed" managed memory resource for a + // device. 
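+            // record the memory resource on the request so deallocMemory()
+            // can issue the matching deallocate_async on the same resource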
+ p = mmr_->allocate_async(adjReq.size, adjReq.stream); + adjReq.mr = mmr_.get(); } catch (const std::bad_alloc& rmm_ex) { FAISS_THROW_MSG("CUDA memory allocation error"); } @@ -593,16 +597,11 @@ void StandardGpuResourcesImpl::deallocMemory(int device, void* p) { if (req.space == MemorySpace::Temporary) { tempMemory_[device]->deallocMemory(device, req.stream, req.size, p); - } else if ( req.space == MemorySpace::Device || req.space == MemorySpace::Unified) { #if defined USE_NVIDIA_RAFT - if (req.space == MemorySpace::Device) { - cmr->deallocate(p, req.size, req.stream); - } else if (req.space == MemorySpace::Unified) { - mmr->deallocate(p, req.size, req.stream); - } + req.mr->deallocate_async(p, req.size, req.stream); #else auto err = cudaFree(p); FAISS_ASSERT_FMT( diff --git a/faiss/gpu/StandardGpuResources.h b/faiss/gpu/StandardGpuResources.h index 9113de573c..661c784aee 100644 --- a/faiss/gpu/StandardGpuResources.h +++ b/faiss/gpu/StandardGpuResources.h @@ -24,8 +24,6 @@ #if defined USE_NVIDIA_RAFT #include -#include -#include #include #endif @@ -37,6 +35,7 @@ #include #include +#pragma GCC visibility push(default) namespace faiss { namespace gpu { @@ -166,14 +165,11 @@ class StandardGpuResourcesImpl : public GpuResources { * to create a subclass only for the RMM memory resources. */ - // cuda_memory_resource - std::unique_ptr cmr; - // managed_memory_resource - std::unique_ptr mmr; + std::unique_ptr mmr_; // pinned_memory_resource - std::unique_ptr pmr; + std::unique_ptr pmr_; #endif /// Pinned memory allocation for use with this GPU @@ -260,3 +256,4 @@ class StandardGpuResources : public GpuResourcesProvider { } // namespace gpu } // namespace faiss +#pragma GCC visibility pop diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index 2bb319d002..6b1f2ac394 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -41,7 +41,7 @@ class IVFBase { virtual ~IVFBase(); /// Reserve GPU memory in our inverted lists for this number of vectors - void reserveMemory(idx_t numVecs); + virtual void reserveMemory(idx_t numVecs); /// Clear out all inverted lists, but retain the coarse quantizer /// and the product quantizer info @@ -52,7 +52,7 @@ class IVFBase { /// After adding vectors, one can call this to reclaim device memory /// to exactly the amount needed. Returns space reclaimed in bytes - size_t reclaimMemory(); + virtual size_t reclaimMemory(); /// Returns the number of inverted lists idx_t getNumLists() const; diff --git a/faiss/gpu/impl/IVFPQ.cuh b/faiss/gpu/impl/IVFPQ.cuh index 3670e58edf..0d17b02c9b 100644 --- a/faiss/gpu/impl/IVFPQ.cuh +++ b/faiss/gpu/impl/IVFPQ.cuh @@ -39,7 +39,7 @@ class IVFPQ : public IVFBase { /// Enable or disable pre-computed codes. The quantizer is needed to gather /// the IVF centroids for use - void setPrecomputedCodes(Index* coarseQuantizer, bool enable); + virtual void setPrecomputedCodes(Index* coarseQuantizer, bool enable); /// Returns our set of sub-quantizers of the form /// (sub q)(code id)(sub dim) @@ -134,7 +134,7 @@ class IVFPQ : public IVFBase { Tensor& outDistances, Tensor& outIndices); - private: + protected: /// Number of sub-quantizers per vector const int numSubQuantizers_; diff --git a/faiss/gpu/impl/RaftFlatIndex.cu b/faiss/gpu/impl/RaftFlatIndex.cu index 8f5c491163..24a6d39604 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cu +++ b/faiss/gpu/impl/RaftFlatIndex.cu @@ -20,7 +20,7 @@ * limitations under the License. 
*/ -#include +#include #include #include @@ -91,7 +91,7 @@ void RaftFlatIndex::query( outDistances.getSize(0), outDistances.getSize(1)); - DistanceType distance = faiss_to_raft(metric, exactDistance); + DistanceType distance = metricFaissToRaft(metric, exactDistance); std::optional> norms_view = raft::make_device_vector_view( diff --git a/faiss/gpu/impl/RaftFlatIndex.cuh b/faiss/gpu/impl/RaftFlatIndex.cuh index 010c5aebce..d3823bbf58 100644 --- a/faiss/gpu/impl/RaftFlatIndex.cuh +++ b/faiss/gpu/impl/RaftFlatIndex.cuh @@ -28,6 +28,7 @@ #include #include +#pragma GCC visibility push(default) namespace faiss { namespace gpu { @@ -67,3 +68,4 @@ class RaftFlatIndex : public FlatIndex { } // namespace gpu } // namespace faiss +#pragma GCC visibility pop diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 2c6afb795c..1e310723d0 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -23,31 +23,19 @@ #include #include -#include -#include -#include -#include -#include -#include +#include +#include #include -#include #include -#include -#include #include -#include -#include -#include -#include -#include #include -#include -#include -#include -#include #include #include +#include + +#include +#include namespace faiss { namespace gpu { @@ -71,18 +59,31 @@ RaftIVFFlat::RaftIVFFlat( useResidual, scalarQ, interleavedLayout, + // skip ptr allocations in base class (handled by RAFT + // internally) indicesOptions, space) { FAISS_THROW_IF_NOT_MSG( indicesOptions == INDICES_64_BIT, "only INDICES_64_BIT is supported for RAFT index"); - reset(); } RaftIVFFlat::~RaftIVFFlat() {} -/// Find the approximate k nearest neighbors for `queries` against -/// our database +void RaftIVFFlat::reserveMemory(idx_t numVecs) { + fprintf(stderr, + "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with RAFT enabled.\n"); +} + +void RaftIVFFlat::reset() { + raft_knn_index.reset(); +} + +void RaftIVFFlat::setRaftIndex( + raft::neighbors::ivf_flat::index&& idx) { + raft_knn_index.emplace(std::move(idx)); +} + void RaftIVFFlat::search( Index* coarseQuantizer, Tensor& queries, @@ -90,7 +91,9 @@ void RaftIVFFlat::search( int k, Tensor& outDistances, Tensor& outIndices) { - // TODO: We probably don't want to ignore the coarse quantizer here... + /// NB: The coarse quantizer is ignored here. The user is assumed to have + /// called updateQuantizer() to modify the RAFT index if the quantizer was + /// modified externally uint32_t numQueries = queries.getSize(0); uint32_t cols = queries.getSize(1); @@ -125,7 +128,7 @@ void RaftIVFFlat::search( /// Identify NaN rows and mask their nearest neighbors auto nan_flag = raft::make_device_vector(raft_handle, numQueries); - validRowIndices_(queries, nan_flag.data_handle()); + validRowIndices(resources_, queries, nan_flag.data_handle()); raft::linalg::map_offset( raft_handle, @@ -154,65 +157,22 @@ void RaftIVFFlat::search( }); } -/// Classify and encode/add vectors to our IVF lists. -/// The input data must be on our current device. -/// Returns the number of vectors successfully added. Vectors may -/// not be able to be added because they contain NaNs. idx_t RaftIVFFlat::addVectors( Index* coarseQuantizer, Tensor& vecs, Tensor& indices) { - /// TODO: We probably don't want to ignore the coarse quantizer here + /// NB: The coarse quantizer is ignored here. 
The user is assumed to have + /// called updateQuantizer() to update the RAFT index if the quantizer was + /// modified externally - idx_t n_rows = vecs.getSize(0); + FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = resources_->getRaftHandleCurrentDevice(); - /// Remove NaN values - auto nan_flag = raft::make_device_vector(raft_handle, n_rows); - - validRowIndices_(vecs, nan_flag.data_handle()); - - idx_t n_rows_valid = thrust::reduce( - raft_handle.get_thrust_policy(), - nan_flag.data_handle(), - nan_flag.data_handle() + n_rows, - 0); - - if (n_rows_valid < n_rows) { - auto gather_indices = raft::make_device_vector( - raft_handle, n_rows_valid); - - auto count = thrust::make_counting_iterator(0); + /// Remove rows containing NaNs + idx_t n_rows_valid = inplaceGatherFilteredRows(resources_, vecs, indices); - thrust::copy_if( - raft_handle.get_thrust_policy(), - count, - count + n_rows, - gather_indices.data_handle(), - [nan_flag = nan_flag.data_handle()] __device__(auto i) { - return nan_flag[i]; - }); - - raft::matrix::gather( - raft_handle, - raft::make_device_matrix_view( - vecs.data(), n_rows, dim_), - raft::make_const_mdspan(gather_indices.view()), - (idx_t)16); - - auto valid_indices = raft::make_device_vector( - raft_handle, n_rows_valid); - - raft::matrix::gather( - raft_handle, - raft::make_device_matrix_view( - indices.data(), n_rows, (idx_t)1), - raft::make_const_mdspan(gather_indices.view())); - } - - FAISS_ASSERT(raft_knn_index.has_value()); raft_knn_index.emplace(raft::neighbors::ivf_flat::extend( raft_handle, raft::make_device_matrix_view( @@ -225,10 +185,6 @@ idx_t RaftIVFFlat::addVectors( return n_rows_valid; } -void RaftIVFFlat::reset() { - raft_knn_index.reset(); -} - idx_t RaftIVFFlat::getListLength(idx_t listId) const { FAISS_ASSERT(raft_knn_index.has_value()); const raft::device_resources& raft_handle = @@ -259,10 +215,11 @@ std::vector RaftIVFFlat::getListIndices(idx_t listId) const { // fetch the list indices ptr on host idx_t* list_indices_ptr; - // fetch the list indices ptr on host raft::update_host( &list_indices_ptr, - raft_knn_index.value().inds_ptrs().data_handle() + listId, + const_cast( + raft_knn_index.value().inds_ptrs().data_handle()) + + listId, 1, stream); raft_handle.sync_stream(); @@ -278,7 +235,7 @@ std::vector RaftIVFFlat::getListVectorData( idx_t listId, bool gpuFormat) const { if (gpuFormat) { - FAISS_THROW_MSG("gpuFormat is not suppported for raft indices"); + FAISS_THROW_MSG("gpuFormat should be false for RAFT indices"); } FAISS_ASSERT(raft_knn_index.has_value()); @@ -334,59 +291,71 @@ void RaftIVFFlat::searchPreassigned( } void RaftIVFFlat::updateQuantizer(Index* quantizer) { - idx_t quantizer_ntotal = quantizer->ntotal; + FAISS_THROW_IF_NOT(quantizer->is_trained); - const raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); - auto stream = raft_handle.get_stream(); + // Must match our basic IVF parameters + FAISS_THROW_IF_NOT(quantizer->d == getDim()); + FAISS_THROW_IF_NOT(quantizer->ntotal == getNumLists()); - auto total_elems = size_t(quantizer_ntotal) * size_t(quantizer->d); + size_t total_elems = quantizer->ntotal * quantizer->d; - raft::logger::get().set_level(RAFT_LEVEL_TRACE); + auto stream = resources_->getDefaultStreamCurrentDevice(); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); raft::neighbors::ivf_flat::index_params pams; pams.add_data_on_build = false; - - pams.n_lists = this->numLists_; - - switch (this->metric_) { - 
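// ------------------------------------------------------------------------
// [editor's sketch -- not part of the patch] The metric switch being removed
// here is folded into the shared metricFaissToRaft(metric, exactDistance)
// helper also called from RaftFlatIndex. Based on the mapping the removed
// code performed, a plausible shape is shown below; the bool's effect is
// not visible in this hunk, so it is ignored:
#include <faiss/MetricType.h>
#include <faiss/impl/FaissAssert.h>
#include <raft/distance/distance_types.hpp>

raft::distance::DistanceType metricFaissToRaftSketch(
        faiss::MetricType metric,
        bool /*exactDistance*/) {
    switch (metric) {
        case faiss::METRIC_L2:
            return raft::distance::DistanceType::L2Expanded;
        case faiss::METRIC_INNER_PRODUCT:
            return raft::distance::DistanceType::InnerProduct;
        default:
            FAISS_THROW_MSG("metric is not supported by RAFT indices");
    }
}
// ------------------------------------------------------------------------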
case faiss::METRIC_L2: - pams.metric = raft::distance::DistanceType::L2Expanded; - break; - case faiss::METRIC_INNER_PRODUCT: - pams.metric = raft::distance::DistanceType::InnerProduct; - break; - default: - FAISS_THROW_MSG("Metric is not supported."); + pams.metric = metricFaissToRaft(metric_, false); + pams.n_lists = numLists_; + raft_knn_index.emplace(raft_handle, pams, static_cast(dim_)); + + raft::neighbors::ivf_flat::helpers::reset_index( + raft_handle, &raft_knn_index.value()); + + // If the index instance is a GpuIndexFlat, then we can use direct access to + // the centroids within. + auto gpuQ = dynamic_cast(quantizer); + if (gpuQ) { + auto gpuData = gpuQ->getGpuData(); + + if (gpuData->getUseFloat16()) { + // The FlatIndex keeps its data in float16; we need to reconstruct + // as float32 and store locally + DeviceTensor centroids( + resources_, + makeSpaceAlloc(AllocType::FlatData, space_, stream), + {getNumLists(), getDim()}); + + gpuData->reconstruct(0, gpuData->getSize(), centroids); + + raft::update_device( + raft_knn_index.value().centers().data_handle(), + centroids.data(), + total_elems, + stream); + } else { + /// No reconstruct needed since the centers are already in float32 + auto centroids = gpuData->getVectorsFloat32Ref(); + + raft::update_device( + raft_knn_index.value().centers().data_handle(), + centroids.data(), + total_elems, + stream); + } + } else { + // Otherwise, we need to reconstruct all vectors from the index and copy + // them to the GPU, in order to have access as needed for residual + // computation + auto vecs = std::vector(getNumLists() * getDim()); + quantizer->reconstruct_n(0, quantizer->ntotal, vecs.data()); + + raft::update_device( + raft_knn_index.value().centers().data_handle(), + vecs.data(), + total_elems, + stream); } - - raft_knn_index.emplace(raft_handle, pams, (uint32_t)this->dim_); - - cudaMemsetAsync( - raft_knn_index.value().list_sizes().data_handle(), - 0, - raft_knn_index.value().list_sizes().size() * sizeof(uint32_t), - stream); - cudaMemsetAsync( - raft_knn_index.value().data_ptrs().data_handle(), - 0, - raft_knn_index.value().data_ptrs().size() * sizeof(float*), - stream); - cudaMemsetAsync( - raft_knn_index.value().inds_ptrs().data_handle(), - 0, - raft_knn_index.value().inds_ptrs().size() * sizeof(idx_t*), - stream); - - /// Copy (reconstructed) centroids over, rather than re-training - std::vector buf_host(total_elems); - quantizer->reconstruct_n(0, quantizer_ntotal, buf_host.data()); - - raft::update_device( - raft_knn_index.value().centers().data_handle(), - buf_host.data(), - total_elems, - stream); } void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { @@ -422,6 +391,9 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { // store the list size list_sizes_[i] = static_cast(listSize); + // This RAFT list must currently be empty + FAISS_ASSERT(getListLength(i) == 0); + raft::neighbors::ivf::resize_list( raft_handle, raft_lists[i], @@ -483,12 +455,6 @@ void RaftIVFFlat::addEncodedVectorsToList_( idx_t numVecs) { auto stream = resources_->getDefaultStreamCurrentDevice(); - // This list must already exist - FAISS_ASSERT(raft_knn_index.has_value()); - - // This list must currently be empty - FAISS_ASSERT(getListLength(listId) == 0); - // If there's nothing to add, then there's nothing we have to do if (numVecs == 0) { return; @@ -496,7 +462,6 @@ void RaftIVFFlat::addEncodedVectorsToList_( // The GPU might have a different layout of the memory auto gpuListSizeInBytes = 
getGpuVectorsEncodingSize_(numVecs); - auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(numVecs); // We only have int32 length representations on the GPU per each // list; the length is in sizeof(char) @@ -541,27 +506,6 @@ void RaftIVFFlat::addEncodedVectorsToList_( raft::update_device(list_indices_ptr, indices, numVecs, stream); } -void RaftIVFFlat::validRowIndices_( - Tensor& vecs, - bool* nan_flag) { - raft::device_resources& raft_handle = - resources_->getRaftHandleCurrentDevice(); - idx_t n_rows = vecs.getSize(0); - - thrust::fill_n(raft_handle.get_thrust_policy(), nan_flag, n_rows, true); - raft::linalg::map_offset( - raft_handle, - raft::make_device_vector_view(nan_flag, n_rows), - [vecs = vecs.data(), dim_ = this->dim_] __device__(idx_t i) { - for (idx_t col = 0; col < dim_; col++) { - if (!isfinite(vecs[i * dim_ + col])) { - return false; - } - } - return true; - }); -} - RaftIVFFlatCodePackerInterleaved::RaftIVFFlatCodePackerInterleaved( size_t list_size, uint32_t dim, diff --git a/faiss/gpu/impl/RaftIVFFlat.cuh b/faiss/gpu/impl/RaftIVFFlat.cuh index 3aba501c9f..4f8c89ecb0 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cuh +++ b/faiss/gpu/impl/RaftIVFFlat.cuh @@ -22,16 +22,15 @@ #pragma once +#include #include -#include #include -#include - #include #include +#pragma GCC visibility push(default) namespace faiss { namespace gpu { @@ -52,6 +51,9 @@ class RaftIVFFlat : public IVFFlat { ~RaftIVFFlat() override; + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(idx_t numVecs) override; + /// Find the approximate k nearest neigbors for `queries` against /// our database void search( @@ -83,11 +85,7 @@ class RaftIVFFlat : public IVFFlat { Tensor& vecs, Tensor& indices) override; - /// Reserve GPU memory in our inverted lists for this number of vectors - // void reserveMemory(idx_t numVecs) override; - - /// Clear out all inverted lists, but retain the coarse quantizer - /// and the product quantizer info + /// Clear out the Raft index void reset() override; /// For debugging purposes, return the list length of a particular @@ -101,15 +99,17 @@ class RaftIVFFlat : public IVFFlat { std::vector getListVectorData(idx_t listId, bool gpuFormat) const override; + /// Update our Raft index with this quantizer instance; may be a CPU + /// or GPU quantizer void updateQuantizer(Index* quantizer) override; /// Copy all inverted lists from a CPU representation to ourselves void copyInvertedListsFrom(const InvertedLists* ivf) override; - /// Filter out matrix rows containing NaN values - void validRowIndices_(Tensor& vecs, bool* nan_flag); + /// Replace the Raft index + void setRaftIndex(raft::neighbors::ivf_flat::index&& idx); - protected: + private: /// Adds a set of codes and indices to a list, with the representation /// coming from the CPU equivalent void addEncodedVectorsToList_( @@ -147,3 +147,4 @@ struct RaftIVFFlatCodePackerInterleaved : CodePacker { } // namespace gpu } // namespace faiss +#pragma GCC visibility pop diff --git a/faiss/gpu/impl/RaftIVFPQ.cu b/faiss/gpu/impl/RaftIVFPQ.cu new file mode 100644 index 0000000000..b26025dd47 --- /dev/null +++ b/faiss/gpu/impl/RaftIVFPQ.cu @@ -0,0 +1,546 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace faiss { +namespace gpu { + +RaftIVFPQ::RaftIVFPQ( + GpuResources* resources, + int dim, + idx_t nlist, + faiss::MetricType metric, + float metricArg, + int numSubQuantizers, + int bitsPerSubQuantizer, + bool useFloat16LookupTables, + bool useMMCodeDistance, + bool interleavedLayout, + float* pqCentroidData, + IndicesOptions indicesOptions, + MemorySpace space) + : IVFPQ(resources, + dim, + nlist, + metric, + metricArg, + numSubQuantizers, + bitsPerSubQuantizer, + useFloat16LookupTables, + useMMCodeDistance, + interleavedLayout, + // skip ptr allocations in base class (handled by RAFT + // internally) false, + pqCentroidData, + indicesOptions, + space) { + FAISS_THROW_IF_NOT_MSG( + indicesOptions == INDICES_64_BIT, + "only INDICES_64_BIT is supported for RAFT index"); +} + +RaftIVFPQ::~RaftIVFPQ() {} + +void RaftIVFPQ::reserveMemory(idx_t numVecs) { + fprintf(stderr, + "WARN: reserveMemory is NOP. Pre-allocation of IVF lists is not supported with RAFT enabled.\n"); +} + +void RaftIVFPQ::reset() { + raft_knn_index.reset(); +} + +size_t RaftIVFPQ::reclaimMemory() { + fprintf(stderr, + "WARN: reclaimMemory is NOP. reclaimMemory is not supported with RAFT enabled.\n"); + return 0; +} + +void RaftIVFPQ::setPrecomputedCodes(Index* quantizer, bool enable) {} + +idx_t RaftIVFPQ::getListLength(idx_t listId) const { + FAISS_ASSERT(raft_knn_index.has_value()); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + uint32_t size; + raft::update_host( + &size, + raft_knn_index.value().list_sizes().data_handle() + listId, + 1, + raft_handle.get_stream()); + raft_handle.sync_stream(); + + return static_cast(size); +} + +void RaftIVFPQ::updateQuantizer(Index* quantizer) { + FAISS_THROW_IF_NOT(quantizer->is_trained); + + // Must match our basic IVF parameters + FAISS_THROW_IF_NOT(quantizer->d == getDim()); + FAISS_THROW_IF_NOT(quantizer->ntotal == getNumLists()); + + auto stream = resources_->getDefaultStreamCurrentDevice(); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + raft::neighbors::ivf_pq::index_params pams; + pams.metric = metricFaissToRaft(metric_, false); + pams.codebook_kind = raft::neighbors::ivf_pq::codebook_gen::PER_SUBSPACE; + pams.n_lists = numLists_; + pams.pq_bits = bitsPerSubQuantizer_; + pams.pq_dim = numSubQuantizers_; + raft_knn_index.emplace(raft_handle, pams, static_cast(dim_)); + + raft::neighbors::ivf_pq::helpers::reset_index( + raft_handle, &raft_knn_index.value()); + raft::neighbors::ivf_pq::helpers::make_rotation_matrix( + raft_handle, &(raft_knn_index.value()), false); + + // If the index instance is a GpuIndexFlat, then we can use direct access to + // the centroids within. 
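// [editor's note] The code below distinguishes three centroid sources:
// (1) a GpuIndexFlat holding float16 data, whose centroids must first be
// reconstructed to float32 on-device; (2) a GpuIndexFlat already holding
// float32 data, whose storage can be referenced directly; and (3) any other
// quantizer (e.g. a CPU IndexFlat), whose centroids are reconstructed on
// the host and copied over. All three paths hand the float32 centroids to
// raft::neighbors::ivf_pq::helpers::set_centers().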
+ auto gpuQ = dynamic_cast(quantizer); + + if (gpuQ) { + auto gpuData = gpuQ->getGpuData(); + + if (gpuData->getUseFloat16()) { + DeviceTensor centroids( + resources_, + makeSpaceAlloc(AllocType::FlatData, space_, stream), + {getNumLists(), getDim()}); + + // The FlatIndex keeps its data in float16; we need to reconstruct + // as float32 and store locally + gpuData->reconstruct(0, gpuData->getSize(), centroids); + + raft::neighbors::ivf_pq::helpers::set_centers( + raft_handle, + &(raft_knn_index.value()), + raft::make_device_matrix_view( + centroids.data(), numLists_, dim_)); + } else { + /// No reconstruct needed since the centers are already in float32 + // The FlatIndex keeps its data in float32, so we can merely + // reference it + auto centroids = gpuData->getVectorsFloat32Ref(); + + raft::neighbors::ivf_pq::helpers::set_centers( + raft_handle, + &(raft_knn_index.value()), + raft::make_device_matrix_view( + centroids.data(), numLists_, dim_)); + } + } else { + DeviceTensor centroids( + resources_, + makeSpaceAlloc(AllocType::FlatData, space_, stream), + {getNumLists(), getDim()}); + + // Otherwise, we need to reconstruct all vectors from the index and copy + // them to the GPU, in order to have access as needed for residual + // computation + auto vecs = std::vector(getNumLists() * getDim()); + quantizer->reconstruct_n(0, quantizer->ntotal, vecs.data()); + + centroids.copyFrom(vecs, stream); + + raft::neighbors::ivf_pq::helpers::set_centers( + raft_handle, + &(raft_knn_index.value()), + raft::make_device_matrix_view( + centroids.data(), numLists_, dim_)); + } + + setPQCentroids_(); +} + +/// Return the list indices of a particular list back to the CPU +std::vector RaftIVFPQ::getListIndices(idx_t listId) const { + FAISS_ASSERT(raft_knn_index.has_value()); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + auto stream = raft_handle.get_stream(); + + idx_t listSize = getListLength(listId); + + std::vector vec(listSize); + + // fetch the list indices ptr on host + idx_t* list_indices_ptr; + + raft::update_host( + &list_indices_ptr, + const_cast( + raft_knn_index.value().inds_ptrs().data_handle()) + + listId, + 1, + stream); + raft_handle.sync_stream(); + + raft::update_host(vec.data(), list_indices_ptr, listSize, stream); + raft_handle.sync_stream(); + + return vec; +} + +/// Performs search when we are already given the IVF cells to look at +/// (GpuIndexIVF::search_preassigned implementation) +void RaftIVFPQ::searchPreassigned( + Index* coarseQuantizer, + Tensor& vecs, + Tensor& ivfDistances, + Tensor& ivfAssignments, + int k, + Tensor& outDistances, + Tensor& outIndices, + bool storePairs) { + // TODO: Fill this in! +} + +size_t RaftIVFPQ::getGpuListEncodingSize_(idx_t listId) { + return static_cast( + raft_knn_index.value().get_list_size_in_bytes(listId)); +} + +/// Return the encoded vectors of a particular list back to the CPU +std::vector RaftIVFPQ::getListVectorData(idx_t listId, bool gpuFormat) + const { + if (gpuFormat) { + FAISS_THROW_MSG( + "gpuFormat should be false for RAFT indices. 
Unpacked codes are flat."); + } + FAISS_ASSERT(raft_knn_index.has_value()); + + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + auto stream = raft_handle.get_stream(); + + idx_t listSize = getListLength(listId); + + auto cpuListSizeInBytes = getCpuVectorsEncodingSize_(listSize); + + std::vector flat_codes( + cpuListSizeInBytes, static_cast(0)); + + idx_t maxBatchSize = 65536; + for (idx_t offset_b = 0; offset_b < listSize; offset_b += maxBatchSize) { + uint32_t batchSize = min(maxBatchSize, listSize - offset_b); + uint32_t bufferSize = getCpuVectorsEncodingSize_(batchSize); + uint32_t codesOffset = getCpuVectorsEncodingSize_(offset_b); + + // Fetch flat PQ codes for the current batch + auto codes_d = raft::make_device_vector( + raft_handle, static_cast(bufferSize)); + + raft::neighbors::ivf_pq::helpers::unpack_contiguous_list_data( + raft_handle, + raft_knn_index.value(), + codes_d.data_handle(), + batchSize, + listId, + offset_b); + + // Copy the flat PQ codes to host + raft::update_host( + flat_codes.data() + codesOffset, + codes_d.data_handle(), + bufferSize, + stream); + raft_handle.sync_stream(); + } + + return flat_codes; +} + +/// Find the approximate k nearest neighbors for `queries` against +/// our database +void RaftIVFPQ::search( + Index* coarseQuantizer, + Tensor& queries, + int nprobe, + int k, + Tensor& outDistances, + Tensor& outIndices) { + uint32_t numQueries = queries.getSize(0); + uint32_t cols = queries.getSize(1); + idx_t k_ = std::min(static_cast(k), raft_knn_index.value().size()); + + // Device is already set in GpuIndex::search + FAISS_ASSERT(raft_knn_index.has_value()); + FAISS_ASSERT(numQueries > 0); + FAISS_ASSERT(cols == dim_); + FAISS_THROW_IF_NOT(nprobe > 0 && nprobe <= numLists_); + + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + raft::neighbors::ivf_pq::search_params pams; + pams.n_probes = nprobe; + pams.lut_dtype = useFloat16LookupTables_ ? CUDA_R_16F : CUDA_R_32F; + + auto queries_view = raft::make_device_matrix_view( + queries.data(), (idx_t)numQueries, (idx_t)cols); + auto out_inds_view = raft::make_device_matrix_view( + outIndices.data(), (idx_t)numQueries, (idx_t)k_); + auto out_dists_view = raft::make_device_matrix_view( + outDistances.data(), (idx_t)numQueries, (idx_t)k_); + + raft::neighbors::ivf_pq::search( + raft_handle, + pams, + raft_knn_index.value(), + queries_view, + out_inds_view, + out_dists_view); + + /// Identify NaN rows and mask their nearest neighbors + auto nan_flag = raft::make_device_vector(raft_handle, numQueries); + + validRowIndices(resources_, queries, nan_flag.data_handle()); + + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(outIndices.data(), numQueries * k_), + [nan_flag = nan_flag.data_handle(), + out_inds = outIndices.data(), + k_] __device__(uint32_t i) { + uint32_t row = i / k_; + if (!nan_flag[row]) + return idx_t(-1); + return out_inds[i]; + }); + + float max_val = std::numeric_limits::max(); + raft::linalg::map_offset( + raft_handle, + raft::make_device_vector_view(outDistances.data(), numQueries * k_), + [nan_flag = nan_flag.data_handle(), + out_dists = outDistances.data(), + max_val, + k_] __device__(uint32_t i) { + uint32_t row = i / k_; + if (!nan_flag[row]) + return max_val; + return out_dists[i]; + }); + raft_handle.sync_stream(); +} + +idx_t RaftIVFPQ::addVectors( + Index* coarseQuantizer, + Tensor& vecs, + Tensor& indices) { + /// NB: The coarse quantizer is ignored here. 
The user is assumed to have + /// called updateQuantizer() to update the RAFT index if the quantizer was + /// modified externally + + FAISS_ASSERT(raft_knn_index.has_value()); + + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + /// Remove rows containing NaNs + idx_t n_rows_valid = inplaceGatherFilteredRows(resources_, vecs, indices); + + raft_knn_index.emplace(raft::neighbors::ivf_pq::extend( + raft_handle, + raft::make_device_matrix_view( + vecs.data(), n_rows_valid, dim_), + std::make_optional>( + raft::make_device_vector_view( + indices.data(), n_rows_valid)), + raft_knn_index.value())); + + return n_rows_valid; +} + +void RaftIVFPQ::copyInvertedListsFrom(const InvertedLists* ivf) { + size_t nlist = ivf ? ivf->nlist : 0; + size_t ntotal = ivf ? ivf->compute_ntotal() : 0; + + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + std::vector list_sizes_(nlist); + std::vector indices_(ntotal); + + // the index must already exist + FAISS_ASSERT(raft_knn_index.has_value()); + + auto& raft_lists = raft_knn_index.value().lists(); + + // conservative memory alloc for cloning cpu inverted lists + raft::neighbors::ivf_pq::list_spec raft_list_spec{ + static_cast(bitsPerSubQuantizer_), + static_cast(numSubQuantizers_), + true}; + + for (size_t i = 0; i < nlist; ++i) { + size_t listSize = ivf->list_size(i); + + // GPU index can only support max int entries per list + FAISS_THROW_IF_NOT_FMT( + listSize <= (size_t)std::numeric_limits::max(), + "GPU inverted list can only support " + "%zu entries; %zu found", + (size_t)std::numeric_limits::max(), + listSize); + + // store the list size + list_sizes_[i] = static_cast(listSize); + + // This RAFT list must currently be empty + FAISS_ASSERT(getListLength(i) == 0); + + raft::neighbors::ivf::resize_list( + raft_handle, + raft_lists[i], + raft_list_spec, + static_cast(listSize), + static_cast(0)); + } + + raft::update_device( + raft_knn_index.value().list_sizes().data_handle(), + list_sizes_.data(), + nlist, + raft_handle.get_stream()); + + // Update the pointers and the sizes + raft::neighbors::ivf_pq::helpers::recompute_internal_state( + raft_handle, &(raft_knn_index.value())); + + for (size_t i = 0; i < nlist; ++i) { + size_t listSize = ivf->list_size(i); + addEncodedVectorsToList_( + i, ivf->get_codes(i), ivf->get_ids(i), listSize); + } +} + +void RaftIVFPQ::setRaftIndex(raft::neighbors::ivf_pq::index&& idx) { + raft_knn_index.emplace(std::move(idx)); + setBasePQCentroids_(); +} + +void RaftIVFPQ::addEncodedVectorsToList_( + idx_t listId, + const void* codes, + const idx_t* indices, + idx_t numVecs) { + auto stream = resources_->getDefaultStreamCurrentDevice(); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + // If there's nothing to add, then there's nothing we have to do + if (numVecs == 0) { + return; + } + + // The GPU might have a different layout of the memory + auto gpuListSizeInBytes = getGpuListEncodingSize_(listId); + + // We only have int32 length representations on the GPU per each + // list; the length is in sizeof(char) + FAISS_ASSERT(gpuListSizeInBytes <= (size_t)std::numeric_limits::max()); + + idx_t maxBatchSize = 4096; + for (idx_t offset_b = 0; offset_b < numVecs; offset_b += maxBatchSize) { + uint32_t batchSize = min(maxBatchSize, numVecs - offset_b); + uint32_t bufferSize = getCpuVectorsEncodingSize_(batchSize); + uint32_t codesOffset = getCpuVectorsEncodingSize_(offset_b); + + // Translate the codes as 
needed to our preferred form + auto codes_d = raft::make_device_vector( + raft_handle, static_cast(bufferSize)); + raft::update_device( + codes_d.data_handle(), + static_cast(codes) + codesOffset, + bufferSize, + stream); + + raft::neighbors::ivf_pq::helpers::pack_contiguous_list_data( + raft_handle, + &(raft_knn_index.value()), + codes_d.data_handle(), + batchSize, + listId, + offset_b); + } + + /// Handle the indices as well + idx_t* list_indices_ptr; + + // fetch the list indices ptr on host + raft::update_host( + &list_indices_ptr, + raft_knn_index.value().inds_ptrs().data_handle() + listId, + 1, + stream); + raft_handle.sync_stream(); + + raft::update_device(list_indices_ptr, indices, numVecs, stream); +} + +void RaftIVFPQ::setPQCentroids_() { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + raft::copy( + raft_knn_index.value().pq_centers().data_handle(), + pqCentroidsInnermostCode_.data(), + pqCentroidsInnermostCode_.numElements(), + stream); +} + +void RaftIVFPQ::setBasePQCentroids_() { + auto stream = resources_->getDefaultStreamCurrentDevice(); + + raft::copy( + pqCentroidsInnermostCode_.data(), + raft_knn_index.value().pq_centers().data_handle(), + raft_knn_index.value().pq_centers().size(), + stream); + + DeviceTensor pqCentroidsMiddleCode( + resources_, + makeDevAlloc(AllocType::Quantizer, stream), + {numSubQuantizers_, numSubQuantizerCodes_, dimPerSubQuantizer_}); + + runTransposeAny( + pqCentroidsInnermostCode_, 1, 2, pqCentroidsMiddleCode, stream); + + pqCentroidsMiddleCode_ = std::move(pqCentroidsMiddleCode); +} + +} // namespace gpu +} // namespace faiss \ No newline at end of file diff --git a/faiss/gpu/impl/RaftIVFPQ.cuh b/faiss/gpu/impl/RaftIVFPQ.cuh new file mode 100644 index 0000000000..a79db3c40d --- /dev/null +++ b/faiss/gpu/impl/RaftIVFPQ.cuh @@ -0,0 +1,150 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include + +#include + +#pragma GCC visibility push(default) +namespace faiss { +namespace gpu { +/// Implementing class for IVFPQ on the GPU +class RaftIVFPQ : public IVFPQ { + public: + RaftIVFPQ( + GpuResources* resources, + int dim, + idx_t nlist, + faiss::MetricType metric, + float metricArg, + int numSubQuantizers, + int bitsPerSubQuantizer, + bool useFloat16LookupTables, + bool useMMCodeDistance, + bool interleavedLayout, + float* pqCentroidData, + IndicesOptions indicesOptions, + MemorySpace space); + + ~RaftIVFPQ() override; + + /// Reserve GPU memory in our inverted lists for this number of vectors + void reserveMemory(idx_t numVecs) override; + + /// Clear out the RAFT index + void reset() override; + + /// After adding vectors, one can call this to reclaim device memory + /// to exactly the amount needed. 
Returns space reclaimed in bytes
+ size_t reclaimMemory() override;
+
+ /// Enable or disable pre-computed codes. The quantizer is needed to gather
+ /// the IVF centroids for use
+ void setPrecomputedCodes(Index* coarseQuantizer, bool enable) override;
+
+ /// Find the approximate k nearest neighbors for `queries` against
+ /// our database
+ void search(
+ Index* coarseQuantizer,
+ Tensor& queries,
+ int nprobe,
+ int k,
+ Tensor& outDistances,
+ Tensor& outIndices) override;
+
+ /// Performs search when we are already given the IVF cells to look at
+ /// (GpuIndexIVF::search_preassigned implementation)
+ void searchPreassigned(
+ Index* coarseQuantizer,
+ Tensor& vecs,
+ Tensor& ivfDistances,
+ Tensor& ivfAssignments,
+ int k,
+ Tensor& outDistances,
+ Tensor& outIndices,
+ bool storePairs) override;
+
+ /// Return the encoded vectors of a particular list back to the CPU
+ std::vector getListVectorData(idx_t listId, bool gpuFormat)
+ const override;
+
+ /// Update our Raft index with this quantizer instance; may be a CPU
+ /// or GPU quantizer
+ void updateQuantizer(Index* quantizer) override;
+
+ /// Copy all inverted lists from a CPU representation to ourselves
+ void copyInvertedListsFrom(const InvertedLists* ivf) override;
+
+ /// Replace the Raft index
+ void setRaftIndex(raft::neighbors::ivf_pq::index&& idx);
+
+ /// Classify and encode/add vectors to our IVF lists.
+ /// The input data must be on our current device.
+ /// Returns the number of vectors successfully added. Vectors may
+ /// not be able to be added because they contain NaNs.
+ idx_t addVectors(
+ Index* coarseQuantizer,
+ Tensor& vecs,
+ Tensor& indices) override;
+
+ /// For debugging purposes, return the list length of a particular
+ /// list
+ idx_t getListLength(idx_t listId) const override;
+
+ /// Return the list indices of a particular list back to the CPU
+ std::vector getListIndices(idx_t listId) const override;
+
+ private:
+ /// Adds a set of codes and indices to a list, with the representation
+ /// coming from the CPU equivalent
+ void addEncodedVectorsToList_(
+ idx_t listId,
+ // resident on the host
+ const void* codes,
+ // resident on the host
+ const idx_t* indices,
+ idx_t numVecs) override;
+
+ /// Returns the encoding size for a PQ-encoded IVF list
+ size_t getGpuListEncodingSize_(idx_t listId);
+
+ /// Copy the PQ centroids to the Raft index. The data is already in the
+ /// preferred format with the transpose performed by the IVFPQ class helper.
+ void setPQCentroids_();
+
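// ------------------------------------------------------------------------
// [editor's sketch -- not part of the patch] getGpuListEncodingSize_()
// exists alongside the CPU-side getCpuVectorsEncodingSize_() used elsewhere
// in this file because the two layouts differ: the RAFT side stores each
// list interleaved and must be asked per list, while the flat CPU encoding
// is plain arithmetic -- pq_dim sub-codes of pq_bits each, rounded up to
// whole bytes per vector. Illustrative helper (name hypothetical):
#include <cstddef>

constexpr std::size_t flatPQCodeBytes(
        std::size_t numVecs,
        std::size_t pqDim,
        std::size_t pqBits) {
    // bytes per vector = ceil(pqDim * pqBits / 8)
    return numVecs * ((pqDim * pqBits + 7) / 8);
}
// e.g. 32 sub-quantizers at 8 bits each -> 32 bytes per vector
static_assert(flatPQCodeBytes(1, 32, 8) == 32, "flat PQ code size");
// ------------------------------------------------------------------------
+ /// Update the product quantizer centroids buffer held in the IVFPQ class.
+ /// Used when the RAFT index was updated externally.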
+ void setBasePQCentroids_(); + + /// optional around the Raft IVF-PQ index + std::optional> raft_knn_index{ + std::nullopt}; +}; + +} // namespace gpu +} // namespace faiss +#pragma GCC visibility pop diff --git a/faiss/gpu/test/TestGpuIndexFlat.cpp b/faiss/gpu/test/TestGpuIndexFlat.cpp index 6d9c83e547..d7a7b45ec0 100644 --- a/faiss/gpu/test/TestGpuIndexFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexFlat.cpp @@ -268,6 +268,7 @@ TEST(TestGpuIndexFlat, QueryEmpty) { faiss::gpu::GpuIndexFlatConfig config; config.device = 0; config.useFloat16 = false; + config.use_raft = false; int dim = 128; faiss::gpu::GpuIndexFlatL2 gpuIndex(&res, dim, config); diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 9fb88e2687..a90825bffb 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -30,7 +31,6 @@ #include #include #include -#include "faiss/gpu/GpuIndicesOptions.h" // FIXME: figure out a better way to test fp16 constexpr float kF16MaxRelErr = 0.3f; @@ -57,7 +57,7 @@ struct Options { device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); - use_raft = false; + useRaft = false; } std::string toString() const { @@ -65,7 +65,7 @@ struct Options { str << "IVFFlat device " << device << " numVecs " << numAdd << " dim " << dim << " numCentroids " << numCentroids << " nprobe " << nprobe << " numQuery " << numQuery << " k " << k << " indicesOpt " - << indicesOpt << " use_raft " << use_raft; + << indicesOpt << " useRaft " << useRaft; return str.str(); } @@ -79,7 +79,7 @@ struct Options { int k; int device; faiss::gpu::IndicesOptions indicesOpt; - bool use_raft; + bool useRaft; }; void queryTest( @@ -110,7 +110,7 @@ void queryTest( config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - config.use_raft = opt.use_raft; + config.use_raft = opt.useRaft; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); @@ -137,7 +137,7 @@ void queryTest( void addTest( faiss::MetricType metricType, bool useFloat16CoarseQuantizer, - bool use_raft) { + bool useRaft) { for (int tries = 0; tries < 2; ++tries) { Options opt; @@ -162,9 +162,9 @@ void addTest( faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = - use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; + useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - config.use_raft = use_raft; + config.use_raft = useRaft; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); @@ -188,7 +188,7 @@ void addTest( } } -void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { +void copyToTest(bool useFloat16CoarseQuantizer, bool useRaft) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -199,9 +199,9 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = - use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; + useRaft ? 
faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - config.use_raft = use_raft; + config.use_raft = useRaft; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); @@ -241,7 +241,7 @@ void copyToTest(bool useFloat16CoarseQuantizer, bool use_raft) { compFloat16 ? 0.30f : 0.015f); } -void copyFromTest(bool useFloat16CoarseQuantizer, bool use_raft) { +void copyFromTest(bool useFloat16CoarseQuantizer, bool useRaft) { Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -260,9 +260,9 @@ void copyFromTest(bool useFloat16CoarseQuantizer, bool use_raft) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = - use_raft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; + useRaft ? faiss::gpu::INDICES_64_BIT : opt.indicesOpt; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; - config.use_raft = use_raft; + config.use_raft = useRaft; faiss::gpu::GpuIndexIVFFlat gpuIndex(&res, 1, 1, faiss::METRIC_L2, config); gpuIndex.nprobe = 1; @@ -334,7 +334,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2) { queryTest(opt, faiss::METRIC_L2, false); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif @@ -345,7 +345,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP) { queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #endif @@ -358,7 +358,7 @@ TEST(TestGpuIndexIVFFlat, LargeBatch) { queryTest(opt, faiss::METRIC_L2, false); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif @@ -371,7 +371,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_L2) { queryTest(opt, faiss::METRIC_L2, true); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, true); #endif @@ -382,7 +382,7 @@ TEST(TestGpuIndexIVFFlat, Float16_32_Query_IP) { queryTest(opt, faiss::METRIC_INNER_PRODUCT, true); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, true); #endif @@ -399,7 +399,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_64) { queryTest(opt, faiss::METRIC_L2, false); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif @@ -411,7 +411,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_64) { queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #endif @@ -423,7 +423,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_L2_128) { queryTest(opt, faiss::METRIC_L2, false); #if defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_L2, false); #endif @@ -435,7 +435,7 @@ TEST(TestGpuIndexIVFFlat, Float32_Query_IP_128) { queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #if 
defined USE_NVIDIA_RAFT - opt.use_raft = true; + opt.useRaft = true; opt.indicesOpt = faiss::gpu::INDICES_64_BIT; queryTest(opt, faiss::METRIC_INNER_PRODUCT, false); #endif @@ -499,6 +499,7 @@ TEST(TestGpuIndexIVFFlat, Float32_negative) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = opt.device; config.indicesOptions = opt.indicesOpt; + config.use_raft = false; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, cpuIndex.d, cpuIndex.nlist, cpuIndex.metric_type, config); @@ -571,6 +572,7 @@ TEST(TestGpuIndexIVFFlat, QueryNaN) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); + config.use_raft = false; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); @@ -639,6 +641,7 @@ TEST(TestGpuIndexIVFFlat, AddNaN) { config.device = opt.device; config.indicesOptions = opt.indicesOpt; config.flatConfig.useFloat16 = faiss::gpu::randBool(); + config.use_raft = false; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); gpuIndex.nprobe = opt.nprobe; @@ -720,6 +723,7 @@ TEST(TestGpuIndexIVFFlat, UnifiedMemory) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; config.memorySpace = faiss::gpu::MemorySpace::Unified; + config.use_raft = false; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); @@ -797,6 +801,7 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { faiss::gpu::GpuIndexIVFFlatConfig config; config.device = device; + config.use_raft = false; faiss::gpu::GpuIndexIVFFlat gpuIndex( &res, dim, numCentroids, faiss::METRIC_L2, config); diff --git a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp index 1bdef31914..9cc52bc788 100644 --- a/faiss/gpu/test/TestGpuIndexIVFPQ.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFPQ.cpp @@ -35,6 +35,22 @@ void pickEncoding(int& codes, int& dim) { } } +void pickRaftEncoding(int& codes, int& dim, int bitsPerCode) { + // Above 32 doesn't work with no precomputed codes + std::vector dimSizes{4, 8, 10, 12, 16, 20, 24, 28, 32}; + + while (true) { + codes = faiss::gpu::randVal(0, 96); + dim = codes * dimSizes[faiss::gpu::randVal(0, dimSizes.size() - 1)]; + + // for such a small test, super-low or high dim is more likely to + // generate comparison errors + if (dim < 256 && dim >= 64 && (codes * bitsPerCode) % 8 == 0) { + return; + } + } +} + struct Options { Options() { numAdd = faiss::gpu::randVal(2000, 5000); @@ -43,9 +59,10 @@ struct Options { pickEncoding(codes, dim); - // TODO: Change back to `faiss::gpu::randVal(3, 7)` when we officially - // support non-multiple of 8 subcodes for IVFPQ. + // TODO: Change back to `faiss::gpu::randVal(3, 7)` when we + // officially support non-multiple of 8 subcodes for IVFPQ. 
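// [editor's note] The classical GPU path therefore pins bitsPerCode to 8.
// The RAFT-enabled tests added below draw bitsPerCode from 4..8 instead and
// rely on pickRaftEncoding(), whose (codes * bitsPerCode) % 8 == 0 check
// keeps every vector's packed code an exact number of bytes.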
bitsPerCode = 8; + nprobe = std::min(faiss::gpu::randVal(40, 1000), numCentroids); numQuery = faiss::gpu::randVal(4, 8); @@ -66,6 +83,9 @@ struct Options { } device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + + interleavedLayout = false; + useRaft = false; } std::string toString() const { @@ -105,50 +125,66 @@ struct Options { faiss::gpu::IndicesOptions indicesOpt; bool useFloat16; int device; + bool interleavedLayout; + bool useRaft; }; -TEST(TestGpuIndexIVFPQ, Query_L2) { - for (int tries = 0; tries < 2; ++tries) { - Options opt; +void queryTest(Options opt, faiss::MetricType metricType) { + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + faiss::IndexFlatL2 coarseQuantizerL2(opt.dim); + faiss::IndexFlatIP coarseQuantizerIP(opt.dim); + faiss::Index* quantizer = metricType == faiss::METRIC_L2 + ? (faiss::Index*)&coarseQuantizerL2 + : (faiss::Index*)&coarseQuantizerIP; - faiss::IndexFlatL2 coarseQuantizer(opt.dim); - faiss::IndexIVFPQ cpuIndex( - &coarseQuantizer, - opt.dim, - opt.numCentroids, - opt.codes, - opt.bitsPerCode); - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); + faiss::IndexIVFPQ cpuIndex( + quantizer, opt.dim, opt.numCentroids, opt.codes, opt.bitsPerCode); + cpuIndex.metric_type = metricType; + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); - // Use the default temporary memory management to test the memory - // manager - faiss::gpu::StandardGpuResources res; + // Use the default temporary memory management to test the memory + // manager + faiss::gpu::StandardGpuResources res; - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.usePrecomputedTables = (tries % 2 == 0); - config.indicesOptions = opt.indicesOpt; - config.useFloat16LookupTables = opt.useFloat16; + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + config.interleavedLayout = opt.interleavedLayout; + config.use_raft = opt.useRaft; - faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.nprobe = opt.nprobe; + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); + gpuIndex.nprobe = opt.nprobe; - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - opt.getCompareEpsilon(), - opt.getPctMaxDiff1(), - opt.getPctMaxDiffN()); + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); +} + +TEST(TestGpuIndexIVFPQ, Query_L2) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.usePrecomputed = (tries % 2 == 0); + queryTest(opt, faiss::MetricType::METRIC_L2); + } +} + +TEST(TestGpuIndexIVFPQ, Query_IP) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + queryTest(opt, faiss::MetricType::METRIC_INNER_PRODUCT); } } @@ -161,45 +197,10 @@ TEST(TestGpuIndexIVFPQ, LargeBatch) { opt.dim = 4; opt.numQuery = 100000; opt.codes = 2; + opt.usePrecomputed = usePrecomputed; + opt.useFloat16 = false; - std::vector trainVecs = - 
faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatL2 coarseQuantizer(opt.dim); - faiss::IndexIVFPQ cpuIndex( - &coarseQuantizer, - opt.dim, - opt.numCentroids, - opt.codes, - opt.bitsPerCode); - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); - - // Use the default temporary memory management to test the memory - // manager - faiss::gpu::StandardGpuResources res; - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.usePrecomputedTables = usePrecomputed; - config.indicesOptions = opt.indicesOpt; - config.useFloat16LookupTables = false; - - faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.nprobe = opt.nprobe; - - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - opt.getCompareEpsilon(), - opt.getPctMaxDiff1(), - opt.getPctMaxDiffN()); + queryTest(opt, faiss::MetricType::METRIC_L2); } } @@ -234,6 +235,7 @@ void testMMCodeDistance(faiss::MetricType mt) { config.usePrecomputedTables = false; config.useMMCodeDistance = true; config.indicesOptions = opt.indicesOpt; + config.use_raft = false; // Make sure that the float16 version works as well config.useFloat16LookupTables = (tries % 2 == 0); @@ -284,6 +286,7 @@ void testMMCodeDistance(faiss::MetricType mt) { config.device = opt.device; config.usePrecomputedTables = false; config.indicesOptions = opt.indicesOpt; + config.use_raft = false; // Make sure that the float16 version works as well config.useFloat16LookupTables = (dimPerSubQ == 7); @@ -312,53 +315,6 @@ TEST(TestGpuIndexIVFPQ, Query_IP_MMCodeDistance) { testMMCodeDistance(faiss::MetricType::METRIC_INNER_PRODUCT); } -TEST(TestGpuIndexIVFPQ, Query_IP) { - for (int tries = 0; tries < 2; ++tries) { - Options opt; - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatIP coarseQuantizer(opt.dim); - faiss::IndexIVFPQ cpuIndex( - &coarseQuantizer, - opt.dim, - opt.numCentroids, - opt.codes, - opt.bitsPerCode); - cpuIndex.metric_type = faiss::MetricType::METRIC_INNER_PRODUCT; - - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); - - // Use the default temporary memory management to test the memory - // manager - faiss::gpu::StandardGpuResources res; - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.usePrecomputedTables = false; // not supported/required for IP - config.indicesOptions = opt.indicesOpt; - config.useFloat16LookupTables = opt.useFloat16; - - faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.nprobe = opt.nprobe; - - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - opt.getCompareEpsilon(), - opt.getPctMaxDiff1(), - opt.getPctMaxDiffN()); - } -} - TEST(TestGpuIndexIVFPQ, Float16Coarse) { Options opt; @@ -384,6 +340,7 @@ TEST(TestGpuIndexIVFPQ, Float16Coarse) { config.usePrecomputedTables = opt.usePrecomputed; config.indicesOptions = opt.indicesOpt; config.useFloat16LookupTables = opt.useFloat16; + config.use_raft = false; faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); gpuIndex.nprobe = opt.nprobe; @@ -403,104 +360,68 @@ TEST(TestGpuIndexIVFPQ, Float16Coarse) { opt.getPctMaxDiffN()); } -TEST(TestGpuIndexIVFPQ, Add_L2) { - 
for (int tries = 0; tries < 2; ++tries) { - Options opt; +void addTest(Options opt, faiss::MetricType metricType) { + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + faiss::IndexFlatL2 coarseQuantizerL2(opt.dim); + faiss::IndexFlatIP coarseQuantizerIP(opt.dim); + faiss::Index* quantizer = metricType == faiss::METRIC_L2 + ? (faiss::Index*)&coarseQuantizerL2 + : (faiss::Index*)&coarseQuantizerIP; - faiss::IndexFlatL2 coarseQuantizer(opt.dim); - faiss::IndexIVFPQ cpuIndex( - &coarseQuantizer, - opt.dim, - opt.numCentroids, - opt.codes, - opt.bitsPerCode); - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); + faiss::IndexIVFPQ cpuIndex( + quantizer, opt.dim, opt.numCentroids, opt.codes, opt.bitsPerCode); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.metric_type = metricType; + cpuIndex.train(opt.numTrain, trainVecs.data()); - // Use the default temporary memory management to test the memory - // manager - faiss::gpu::StandardGpuResources res; + // Use the default temporary memory management to test the memory + // manager + faiss::gpu::StandardGpuResources res; - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.usePrecomputedTables = opt.usePrecomputed; - config.indicesOptions = opt.indicesOpt; - config.useFloat16LookupTables = opt.useFloat16; + faiss::gpu::GpuIndexIVFPQConfig config; + config.device = opt.device; + config.usePrecomputedTables = opt.usePrecomputed; + config.indicesOptions = opt.indicesOpt; + config.useFloat16LookupTables = opt.useFloat16; + config.interleavedLayout = opt.interleavedLayout; + config.use_raft = opt.useRaft; - faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.nprobe = opt.nprobe; + faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); + gpuIndex.nprobe = opt.nprobe; - gpuIndex.add(opt.numAdd, addVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - opt.getCompareEpsilon(), - opt.getPctMaxDiff1(), - opt.getPctMaxDiffN()); + faiss::gpu::compareIndices( + cpuIndex, + gpuIndex, + opt.numQuery, + opt.dim, + opt.k, + opt.toString(), + opt.getCompareEpsilon(), + opt.getPctMaxDiff1(), + opt.getPctMaxDiffN()); +} + +TEST(TestGpuIndexIVFPQ, Add_L2) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + addTest(opt, faiss::METRIC_L2); } } TEST(TestGpuIndexIVFPQ, Add_IP) { for (int tries = 0; tries < 2; ++tries) { Options opt; - - std::vector trainVecs = - faiss::gpu::randVecs(opt.numTrain, opt.dim); - std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); - - faiss::IndexFlatIP coarseQuantizer(opt.dim); - faiss::IndexIVFPQ cpuIndex( - &coarseQuantizer, - opt.dim, - opt.numCentroids, - opt.codes, - opt.bitsPerCode); - cpuIndex.metric_type = faiss::MetricType::METRIC_INNER_PRODUCT; - cpuIndex.nprobe = opt.nprobe; - cpuIndex.train(opt.numTrain, trainVecs.data()); - - // Use the default temporary memory management to test the memory - // manager - faiss::gpu::StandardGpuResources res; - - faiss::gpu::GpuIndexIVFPQConfig config; - config.device = opt.device; - config.usePrecomputedTables = opt.usePrecomputed; - config.indicesOptions = opt.indicesOpt; - 
config.useFloat16LookupTables = opt.useFloat16; - - faiss::gpu::GpuIndexIVFPQ gpuIndex(&res, &cpuIndex, config); - gpuIndex.nprobe = opt.nprobe; - - gpuIndex.add(opt.numAdd, addVecs.data()); - cpuIndex.add(opt.numAdd, addVecs.data()); - - faiss::gpu::compareIndices( - cpuIndex, - gpuIndex, - opt.numQuery, - opt.dim, - opt.k, - opt.toString(), - opt.getCompareEpsilon(), - opt.getPctMaxDiff1(), - opt.getPctMaxDiffN()); + addTest(opt, faiss::METRIC_INNER_PRODUCT); } } -TEST(TestGpuIndexIVFPQ, CopyTo) { +void copyToTest(Options opt) { for (int tries = 0; tries < 2; ++tries) { - Options opt; std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -511,9 +432,11 @@ TEST(TestGpuIndexIVFPQ, CopyTo) { faiss::gpu::GpuIndexIVFPQConfig config; config.device = opt.device; - config.usePrecomputedTables = (tries % 2 == 0); + config.usePrecomputedTables = false; config.indicesOptions = opt.indicesOpt; config.useFloat16LookupTables = opt.useFloat16; + config.interleavedLayout = opt.interleavedLayout; + config.use_raft = opt.useRaft; faiss::gpu::GpuIndexIVFPQ gpuIndex( &res, @@ -561,8 +484,12 @@ TEST(TestGpuIndexIVFPQ, CopyTo) { } } -TEST(TestGpuIndexIVFPQ, CopyFrom) { +TEST(TestGpuIndexIVFPQ, CopyTo) { Options opt; + copyToTest(opt); +} + +void copyFromTest(Options opt) { std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -585,6 +512,8 @@ TEST(TestGpuIndexIVFPQ, CopyFrom) { config.usePrecomputedTables = opt.usePrecomputed; config.indicesOptions = opt.indicesOpt; config.useFloat16LookupTables = opt.useFloat16; + config.interleavedLayout = opt.interleavedLayout; + config.use_raft = opt.useRaft; // Use garbage values to see if we overwrite them faiss::gpu::GpuIndexIVFPQ gpuIndex( @@ -621,9 +550,12 @@ TEST(TestGpuIndexIVFPQ, CopyFrom) { opt.getPctMaxDiffN()); } -TEST(TestGpuIndexIVFPQ, QueryNaN) { +TEST(TestGpuIndexIVFPQ, CopyFrom) { Options opt; + copyFromTest(opt); +} +void queryNaNTest(Options opt) { std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); @@ -635,6 +567,8 @@ TEST(TestGpuIndexIVFPQ, QueryNaN) { config.usePrecomputedTables = opt.usePrecomputed; config.indicesOptions = opt.indicesOpt; config.useFloat16LookupTables = opt.useFloat16; + config.use_raft = opt.useRaft; + config.interleavedLayout = opt.useRaft ? 
true : opt.interleavedLayout; faiss::gpu::GpuIndexIVFPQ gpuIndex( &res, @@ -670,9 +604,13 @@ TEST(TestGpuIndexIVFPQ, QueryNaN) { } } -TEST(TestGpuIndexIVFPQ, AddNaN) { +TEST(TestGpuIndexIVFPQ, QueryNaN) { Options opt; + opt.useRaft = false; + queryNaNTest(opt); +} +void addNaNTest(Options opt) { // Use the default temporary memory management to test the memory manager faiss::gpu::StandardGpuResources res; @@ -681,6 +619,8 @@ TEST(TestGpuIndexIVFPQ, AddNaN) { config.usePrecomputedTables = opt.usePrecomputed; config.indicesOptions = opt.indicesOpt; config.useFloat16LookupTables = opt.useFloat16; + config.interleavedLayout = opt.interleavedLayout; + config.use_raft = opt.useRaft; faiss::gpu::GpuIndexIVFPQ gpuIndex( &res, @@ -722,6 +662,128 @@ TEST(TestGpuIndexIVFPQ, AddNaN) { indices.data()); } +TEST(TestGpuIndexIVFPQ, AddNaN) { + Options opt; + opt.useRaft = false; + addNaNTest(opt); +} + +#if defined USE_NVIDIA_RAFT +TEST(TestGpuIndexIVFPQ, Query_L2_Raft) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.useRaft = true; + opt.interleavedLayout = true; + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + queryTest(opt, faiss::MetricType::METRIC_L2); + } +} + +TEST(TestGpuIndexIVFPQ, Query_IP_Raft) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.useRaft = true; + opt.interleavedLayout = true; + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + queryTest(opt, faiss::MetricType::METRIC_INNER_PRODUCT); + } +} + +// Large batch sizes (>= 65536) should also work +TEST(TestGpuIndexIVFPQ, LargeBatch_Raft) { + Options opt; + + // override for large sizes + opt.dim = 4; + opt.numQuery = 100000; + opt.codes = 2; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.usePrecomputed = false; + opt.useFloat16 = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + opt.bitsPerCode = 8; + + queryTest(opt, faiss::MetricType::METRIC_L2); +} + +TEST(TestGpuIndexIVFPQ, CopyFrom_Raft) { + Options opt; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + copyFromTest(opt); +} + +TEST(TestGpuIndexIVFPQ, Add_L2_Raft) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + addTest(opt, faiss::METRIC_L2); + } +} + +TEST(TestGpuIndexIVFPQ, Add_IP_Raft) { + for (int tries = 0; tries < 2; ++tries) { + Options opt; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + addTest(opt, faiss::METRIC_INNER_PRODUCT); + } +} + +TEST(TestGpuIndexIVFPQ, QueryNaN_Raft) { + Options opt; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + queryNaNTest(opt); 
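// ------------------------------------------------------------------------
// [editor's sketch -- not part of the patch] All the *_Raft tests in this
// file share the same configuration shape; shown standalone for reference.
// The field names and the 7-argument constructor appear elsewhere in this
// patch; the dimensions are illustrative:
#include <faiss/gpu/GpuIndexIVFPQ.h>
#include <faiss/gpu/StandardGpuResources.h>

void raftIVFPQSmokeSketch() {
    faiss::gpu::StandardGpuResources res;

    faiss::gpu::GpuIndexIVFPQConfig config;
    config.use_raft = true;
    config.interleavedLayout = true; // required on the RAFT path
    config.indicesOptions = faiss::gpu::INDICES_64_BIT; // only supported option
    config.usePrecomputedTables = false; // unused with RAFT

    // dim=128, nlist=256, 32 sub-quantizers of 8 bits each
    faiss::gpu::GpuIndexIVFPQ index(
            &res, 128, 256, 32, 8, faiss::METRIC_L2, config);
    // then: index.train(...), index.add(...), index.search(...)
}
// ------------------------------------------------------------------------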
+} + +TEST(TestGpuIndexIVFPQ, AddNaN_Raft) { + Options opt; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + addNaNTest(opt); +} + +TEST(TestGpuIndexIVFPQ, CopyTo_Raft) { + Options opt; + opt.useRaft = true; + opt.interleavedLayout = true; + opt.bitsPerCode = faiss::gpu::randVal(4, 8); + opt.usePrecomputed = false; + opt.indicesOpt = faiss::gpu::INDICES_64_BIT; + pickRaftEncoding(opt.codes, opt.dim, opt.bitsPerCode); + copyToTest(opt); +} +#endif + TEST(TestGpuIndexIVFPQ, UnifiedMemory) { // Construct on a random device to test multi-device, if we have // multiple devices @@ -762,6 +824,7 @@ TEST(TestGpuIndexIVFPQ, UnifiedMemory) { faiss::gpu::GpuIndexIVFPQConfig config; config.device = device; config.memorySpace = faiss::gpu::MemorySpace::Unified; + config.use_raft = false; faiss::gpu::GpuIndexIVFPQ gpuIndex( &res, @@ -784,6 +847,34 @@ TEST(TestGpuIndexIVFPQ, UnifiedMemory) { 0.015f, 0.1f, 0.015f); + +#if defined USE_NVIDIA_RAFT + config.interleavedLayout = true; + config.use_raft = true; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; + + faiss::gpu::GpuIndexIVFPQ raftGpuIndex( + &res, + dim, + numCentroids, + codes, + bitsPerCode, + faiss::METRIC_L2, + config); + raftGpuIndex.copyFrom(&cpuIndex); + raftGpuIndex.nprobe = nprobe; + + faiss::gpu::compareIndices( + cpuIndex, + raftGpuIndex, + numQuery, + dim, + k, + "Unified Memory", + 0.015f, + 0.1f, + 0.015f); +#endif } int main(int argc, char** argv) { diff --git a/faiss/gpu/test/TestGpuMemoryException.cpp b/faiss/gpu/test/TestGpuMemoryException.cpp index c6f6e9bdeb..ff4be0893e 100644 --- a/faiss/gpu/test/TestGpuMemoryException.cpp +++ b/faiss/gpu/test/TestGpuMemoryException.cpp @@ -31,6 +31,7 @@ TEST(TestGpuMemoryException, AddException) { faiss::gpu::GpuIndexFlatConfig config; config.device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + config.use_raft = false; faiss::gpu::GpuIndexFlatL2 gpuIndexL2Broken( &res, (int)brokenAddDims, config); diff --git a/faiss/gpu/test/test_gpu_index.py b/faiss/gpu/test/test_gpu_index.py index 36a1f8a64b..620bfea198 100755 --- a/faiss/gpu/test/test_gpu_index.py +++ b/faiss/gpu/test/test_gpu_index.py @@ -24,7 +24,9 @@ def test_ivfflat_search_preassigned(self): nprobe = 10 k = 50 - idx_gpu = faiss.GpuIndexIVFFlat(res, d, nlist) + config = faiss.GpuIndexIVFFlatConfig() + config.use_raft = False + idx_gpu = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config) idx_gpu.nprobe = nprobe rs = np.random.RandomState(567) @@ -56,7 +58,9 @@ def test_ivfpq_search_preassigned(self): nprobe = 5 k = 50 - idx_gpu = faiss.GpuIndexIVFPQ(res, d, nlist, 4, 8) + config = faiss.GpuIndexIVFPQConfig() + config.use_raft = False + idx_gpu = faiss.GpuIndexIVFPQ(res, d, nlist, 4, 8, faiss.METRIC_L2, config) idx_gpu.nprobe = nprobe rs = np.random.RandomState(567) @@ -136,7 +140,9 @@ def test_ivfflat_cpu_coarse(self): # construct a GPU index using the same trained coarse quantizer # from the CPU index - idx_gpu = faiss.GpuIndexIVFFlat(res, q, d, nlist, faiss.METRIC_L2) + config = faiss.GpuIndexIVFFlatConfig() + config.use_raft = False + idx_gpu = faiss.GpuIndexIVFFlat(res, q, d, nlist, faiss.METRIC_L2, config) assert(idx_gpu.is_trained) idx_gpu.add(xb) @@ -150,7 +156,7 @@ def test_ivfflat_cpu_coarse(self): self.assertGreaterEqual((i_g == i_c).sum(), i_g.size * 0.9) self.assertTrue(np.allclose(d_g, d_c, rtol=5e-5, 
atol=5e-5)) def test_ivfsq_cpu_coarse(self): res = faiss.StandardGpuResources() d = 128 nb = 5000 @@ -226,8 +232,10 @@ def test_ivfpq_cpu_coarse(self): # construct a GPU index using the same trained coarse quantizer # from the CPU index + config = faiss.GpuIndexIVFPQConfig() + config.use_raft = False idx_gpu = faiss.GpuIndexIVFPQ( - res, idx_coarse_cpu, d, nlist_lvl_2, 4, 8) + res, idx_coarse_cpu, d, nlist_lvl_2, 4, 8, faiss.METRIC_L2, config) assert(not idx_gpu.is_trained) idx_gpu.train(xb) @@ -406,6 +414,7 @@ def test_indices_ivfflat(self): # Store values using 32-bit indices instead config.indicesOptions = faiss.INDICES_32_BIT + config.use_raft = False idx = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config) idx.train(xb) idx.add_with_ids(xb, xb_indices) @@ -430,6 +439,7 @@ def test_indices_ivfpq(self): xb_indices = (xb_indices_base + 4294967296).astype('int64') config = faiss.GpuIndexIVFPQConfig() + config.use_raft = False idx = faiss.GpuIndexIVFPQ(res, d, nlist, M, nbits, faiss.METRIC_L2, config) idx.train(xb) @@ -490,7 +500,9 @@ def test_sq_cpu_to_gpu(self): res = faiss.StandardGpuResources() index = faiss.index_factory(32, "SQfp16") index.add(np.random.rand(1000, 32).astype(np.float32)) - gpu_index = faiss.index_cpu_to_gpu(res, 0, index) + config = faiss.GpuClonerOptions() + config.use_raft = False + gpu_index = faiss.index_cpu_to_gpu(res, 0, index, config) self.assertIsInstance(gpu_index, faiss.GpuIndexFlat) diff --git a/faiss/gpu/test/test_gpu_index_ivfsq.py b/faiss/gpu/test/test_gpu_index_ivfsq.py index af56316509..09dcdae079 100755 --- a/faiss/gpu/test/test_gpu_index_ivfsq.py +++ b/faiss/gpu/test/test_gpu_index_ivfsq.py @@ -27,7 +27,9 @@ def make_indices_copy_from_cpu(nlist, d, qtype, by_residual, metric, clamp): res = faiss.StandardGpuResources() res.noTempMemory() - idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, idx_cpu) + config = faiss.GpuIndexIVFScalarQuantizerConfig() + config.use_raft = False + idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, idx_cpu, config) return idx_cpu, idx_gpu @@ -37,8 +39,10 @@ def make_indices_copy_from_gpu(nlist, d, qtype, by_residual, metric, clamp): res = faiss.StandardGpuResources() res.noTempMemory() + config = faiss.GpuIndexIVFScalarQuantizerConfig() + config.use_raft = False idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, - qtype, metric, by_residual) + qtype, metric, by_residual, config) idx_gpu.train(to_train) idx_gpu.add(to_train) @@ -63,8 +67,10 @@ def make_indices_train(nlist, d, qtype, by_residual, metric, clamp): res = faiss.StandardGpuResources() res.noTempMemory() + config = faiss.GpuIndexIVFScalarQuantizerConfig() + config.use_raft = False idx_gpu = faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, - qtype, metric, by_residual) + qtype, metric, by_residual, config) assert(by_residual == idx_gpu.by_residual) idx_gpu.train(to_train) diff --git a/faiss/gpu/test/test_gpu_index_serialize.py b/faiss/gpu/test/test_gpu_index_serialize.py index 82cbe577c8..49e51af8b4 100644 --- a/faiss/gpu/test/test_gpu_index_serialize.py +++ b/faiss/gpu/test/test_gpu_index_serialize.py @@ -34,7 +34,9 @@ def test_serialize(self): indexes.append(faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2)) # IVFSQ - indexes.append(faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, faiss.ScalarQuantizer.QT_fp16)) + config = faiss.GpuIndexIVFScalarQuantizerConfig() + config.use_raft = False + indexes.append(faiss.GpuIndexIVFScalarQuantizer(res, d, nlist, faiss.ScalarQuantizer.QT_fp16, faiss.METRIC_L2, True,
config)) # IVFPQ indexes.append(faiss.GpuIndexIVFPQ(res, d, nlist, 4, 8, faiss.METRIC_L2)) @@ -47,8 +49,11 @@ def test_serialize(self): ser = faiss.serialize_index(faiss.index_gpu_to_cpu(index)) cpu_index = faiss.deserialize_index(ser) - - gpu_index_restore = faiss.index_cpu_to_gpu(res, 0, cpu_index) + + gpu_cloner_options = faiss.GpuClonerOptions() + if isinstance(index, faiss.GpuIndexIVFScalarQuantizer): + gpu_cloner_options.use_raft = False + gpu_index_restore = faiss.index_cpu_to_gpu(res, 0, cpu_index, gpu_cloner_options) restore_d, restore_i = gpu_index_restore.search(query, k) diff --git a/faiss/gpu/test/test_multi_gpu.py b/faiss/gpu/test/test_multi_gpu.py index 4a63025969..e341f5715a 100644 --- a/faiss/gpu/test/test_multi_gpu.py +++ b/faiss/gpu/test/test_multi_gpu.py @@ -29,6 +29,7 @@ def test_sharded(self): co = faiss.GpuMultipleClonerOptions() co.shard = True + co.use_raft = False index = faiss.index_cpu_to_all_gpus(index_cpu, co, ngpu=2) index.add(xb) @@ -71,6 +72,7 @@ def do_test_sharded_ivf(self, index_key): co = faiss.GpuMultipleClonerOptions() co.shard = True co.common_ivf_quantizer = True + co.use_raft = False index = faiss.index_cpu_to_all_gpus(index, co, ngpu=2) index.quantizer # make sure there is indeed a quantizer @@ -111,6 +113,7 @@ def test_binary_clone(self, ngpu=1, shard=False): co = faiss.GpuMultipleClonerOptions() co.shard = shard + co.use_raft = False # index2 = faiss.index_cpu_to_all_gpus(index, ngpu=ngpu) res = faiss.StandardGpuResources() @@ -188,7 +191,9 @@ def do_cpu_to_gpu(self, index_key): ts.append(time.time()) res = faiss.StandardGpuResources() - gpu_index = faiss.index_cpu_to_gpu(res, 0, index) + co = faiss.GpuClonerOptions() + co.use_raft = False + gpu_index = faiss.index_cpu_to_gpu(res, 0, index, co) ts.append(time.time()) # Validate the layout of the memory info @@ -217,6 +222,7 @@ def do_cpu_to_gpu(self, index_key): res = [faiss.StandardGpuResources() for i in range(2)] co = faiss.GpuMultipleClonerOptions() co.shard = shard + co.use_raft = False gpu_index = faiss.index_cpu_to_gpu_multiple_py(res, index, co) diff --git a/faiss/gpu/utils/CopyUtils.cuh b/faiss/gpu/utils/CopyUtils.cuh index 637a46cbee..8ff600a049 100644 --- a/faiss/gpu/utils/CopyUtils.cuh +++ b/faiss/gpu/utils/CopyUtils.cuh @@ -119,6 +119,7 @@ inline void fromDevice(T* src, T* dst, size_t num, cudaStream_t stream) { if (dev == -1) { CUDA_VERIFY(cudaMemcpyAsync( dst, src, num * sizeof(T), cudaMemcpyDeviceToHost, stream)); + cudaStreamSynchronize(stream); } else { CUDA_VERIFY(cudaMemcpyAsync( dst, src, num * sizeof(T), cudaMemcpyDeviceToDevice, stream)); diff --git a/faiss/gpu/utils/RaftUtils.cu b/faiss/gpu/utils/RaftUtils.cu new file mode 100644 index 0000000000..ba40c54c26 --- /dev/null +++ b/faiss/gpu/utils/RaftUtils.cu @@ -0,0 +1,117 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace faiss { +namespace gpu { + +void validRowIndices( + GpuResources* res, + Tensor<float, 2, true>& vecs, + bool* validRows) { + idx_t n_rows = vecs.getSize(0); + idx_t dim = vecs.getSize(1); + + raft::linalg::coalescedReduction( + validRows, + vecs.data(), + dim, + n_rows, + true, + res->getDefaultStreamCurrentDevice(), + false, + [] __device__(float v, idx_t i) { return isfinite(v); }, + raft::mul_op()); +} + +idx_t inplaceGatherFilteredRows( + GpuResources* res, + Tensor<float, 2, true>& vecs, + Tensor<idx_t, 1, true>& indices) { + raft::device_resources& raft_handle = res->getRaftHandleCurrentDevice(); + idx_t n_rows = vecs.getSize(0); + idx_t dim = vecs.getSize(1); + + auto valid_rows = + raft::make_device_vector<bool, idx_t>(raft_handle, n_rows); + + validRowIndices(res, vecs, valid_rows.data_handle()); + + idx_t n_rows_valid = thrust::reduce( + raft_handle.get_thrust_policy(), + valid_rows.data_handle(), + valid_rows.data_handle() + n_rows, + 0); + + if (n_rows_valid < n_rows) { + auto gather_indices = raft::make_device_vector<idx_t, idx_t>( + raft_handle, n_rows_valid); + + auto count = thrust::make_counting_iterator(0); + + thrust::copy_if( + raft_handle.get_thrust_policy(), + count, + count + n_rows, + gather_indices.data_handle(), + [valid_rows = valid_rows.data_handle()] __device__(auto i) { + return valid_rows[i]; + }); + + raft::matrix::gather( + raft_handle, + raft::make_device_matrix_view<float, idx_t>( + vecs.data(), n_rows, dim), + raft::make_const_mdspan(gather_indices.view()), + (idx_t)16); + + auto validIndices = raft::make_device_vector<idx_t, idx_t>( + raft_handle, n_rows_valid); + + thrust::gather( + raft_handle.get_thrust_policy(), + gather_indices.data_handle(), + gather_indices.data_handle() + gather_indices.size(), + indices.data(), + validIndices.data_handle()); + thrust::copy( + raft_handle.get_thrust_policy(), + validIndices.data_handle(), + validIndices.data_handle() + n_rows_valid, + indices.data()); + } + return n_rows_valid; +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/RaftUtils.h b/faiss/gpu/utils/RaftUtils.h similarity index 73% rename from faiss/gpu/impl/RaftUtils.h rename to faiss/gpu/utils/RaftUtils.h index f1ea19ed33..4dfafa4ec5 100644 --- a/faiss/gpu/impl/RaftUtils.h +++ b/faiss/gpu/utils/RaftUtils.h @@ -23,13 +23,16 @@ #pragma once #include -#include +#include +#include + #include +#pragma GCC visibility push(default) namespace faiss { namespace gpu { -inline raft::distance::DistanceType faiss_to_raft( +inline raft::distance::DistanceType metricFaissToRaft( MetricType metric, bool exactDistance) { switch (metric) { @@ -53,5 +56,20 @@ RAFT_FAIL("Distance type not supported"); } } + +/// Identify matrix rows containing non-NaN values. validRows[i] is false if row +/// i contains a NaN value and true otherwise. +void validRowIndices( + GpuResources* res, + Tensor<float, 2, true>& vecs, + bool* validRows); + +/// Filter out matrix rows containing NaN values. The vectors and indices are +/// updated in-place.
+idx_t inplaceGatherFilteredRows( + GpuResources* res, + Tensor<float, 2, true>& vecs, + Tensor<idx_t, 1, true>& indices); } // namespace gpu } // namespace faiss +#pragma GCC visibility pop diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 8522fa613d..10243b9a9c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -70,6 +70,7 @@ find_package(OpenMP REQUIRED) target_link_libraries(faiss_test PRIVATE OpenMP::OpenMP_CXX gtest_main + $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> ) # Defines `gtest_discover_tests()`.
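
Usage note: taken together, the *_Raft test cases above pin down the configuration contract of the RAFT backend: interleaved layout on, precomputed tables off, 64-bit indices only, and 4-8 bits per PQ code. A minimal Python sketch of that contract follows; it assumes a build configured with -DFAISS_ENABLE_RAFT=ON, and the dimensions, list count, and PQ parameters are illustrative rather than taken from this patch.

    import numpy as np
    import faiss

    res = faiss.StandardGpuResources()

    # Configuration invariants exercised by the *_Raft tests.
    config = faiss.GpuIndexIVFPQConfig()
    config.use_raft = True                        # select the RAFT implementation
    config.interleavedLayout = True               # RAFT uses the interleaved list layout
    config.usePrecomputedTables = False           # precomputed tables are disabled for RAFT
    config.indicesOptions = faiss.INDICES_64_BIT  # RAFT stores 64-bit indices

    d, nlist, M, nbits = 128, 256, 32, 8          # illustrative; d must be divisible by M
    index = faiss.GpuIndexIVFPQ(res, d, nlist, M, nbits, faiss.METRIC_L2, config)

    xb = np.random.rand(10000, d).astype('float32')
    index.train(xb)
    index.add(xb)
    D, I = index.search(xb[:5], 10)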
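The changes to test_gpu_index.py, test_gpu_index_serialize.py and test_multi_gpu.py show the second route into the same code path: cloning a CPU index with use_raft set on the cloner options. A sketch under the same assumption of a RAFT-enabled build; the factory string and sizes are invented for illustration, and whether the clone actually lands on the RAFT implementation is subject to the option checks in GpuCloner.cpp.

    import numpy as np
    import faiss

    # Build and populate a CPU IVFPQ index, then clone it to GPU 0.
    d, nb = 64, 20000
    cpu_index = faiss.index_factory(d, "IVF128,PQ8")
    xb = np.random.rand(nb, d).astype('float32')
    cpu_index.train(xb)
    cpu_index.add(xb)

    res = faiss.StandardGpuResources()
    co = faiss.GpuClonerOptions()
    co.use_raft = True  # False falls back to the classical GPU kernels

    gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index, co)
    gpu_index.nprobe = 10
    D, I = gpu_index.search(xb[:5], 10)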
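Finally, validRowIndices and inplaceGatherFilteredRows above give the RAFT backends their NaN handling: rows containing a non-finite value are compacted out of the batch, together with their ids, before anything reaches the IVF lists, which is the behavior AddNaN_Raft exercises. The sketch below illustrates the expected observable effect from Python; the final ntotal is an assumption about that filtering, not something this patch asserts for user code.

    import numpy as np
    import faiss

    res = faiss.StandardGpuResources()
    config = faiss.GpuIndexIVFFlatConfig()
    config.use_raft = True

    d, nlist = 32, 16
    index = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config)

    xb = np.random.rand(1000, d).astype('float32')
    index.train(xb)

    # Poison a few rows; inplaceGatherFilteredRows should compact them away
    # so that only finite rows are appended to the IVF lists.
    xb[:10, 0] = np.nan
    index.add(xb)
    print(index.ntotal)  # expected 990 if the 10 NaN rows were filtered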