forked from facebookresearch/faiss
-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Integrate IVF-PQ from RAFT (facebookresearch#3044)
Summary: Imports changes from facebookresearch#3133 and facebookresearch#3171. So this single PR adds all the changes together. - [x] Implement RaftIVFPQ class - [x] Update gtests to test correctness with RAFT enabled - [x] All googleTests for RAFT enabled IVFPQ pass - [x] Move some common functions in RaftIVFFlat and RaftIVFPQ to helper: RaftUtils.h - [x] update Quantizer retroactively after building RAFT index -- both IVFFlat and IVFPQ - [x] resolve failing LargeBatch (classical GPU) - [x] add checks for Pascal deprecation - [x] apply RMM changes from facebookresearch#3171 - [x] apply robertmaynard's changes from facebookresearch#3133 Pull Request resolved: facebookresearch#3044 Reviewed By: junjieqi Differential Revision: D51074065 Pulled By: algoriddle fbshipit-source-id: 6871257921bcaff2064a20637e2ed358acbdc363
- Loading branch information
1 parent
87d43b9
commit 27b1055
Showing
40 changed files
with
2,199 additions
and
654 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,193 @@ | ||
# Copyright (c) Facebook, Inc. and its affiliates. | ||
# | ||
# This source code is licensed under the MIT license found in the | ||
# LICENSE file in the root directory of this source tree. | ||
# | ||
# Copyright (c) 2023, NVIDIA CORPORATION. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
import numpy as np | ||
import faiss | ||
import time | ||
import argparse | ||
import rmm | ||
|
||
###################################################### | ||
# Command-line parsing | ||
###################################################### | ||
|
||
######################################################
# Command-line parsing
######################################################

parser = argparse.ArgumentParser()


def aa(*args, **kwargs):
    """Shorthand: add an argument to the currently active argument group."""
    group.add_argument(*args, **kwargs)


group = parser.add_argument_group('benchmarking options')

aa('--bm_train', default=False, action='store_true',
   help='whether to benchmark train operation on GPU index')
aa('--bm_add', default=False, action='store_true',
   help='whether to benchmark add operation on GPU index')
# NOTE(review): unlike the flags above, this one has no action='store_true',
# so it is always on by default and passing `--bm_search` on the command line
# requires an explicit value (any non-empty string is truthy) — confirm
# whether this benchmark was meant to be toggleable.
aa('--bm_search', default=True,
   help='whether to benchmark search operation on GPU index')
aa('--raft_only', default=False, action='store_true',
   help='whether to only produce RAFT enabled benchmarks')


group = parser.add_argument_group('IVF options')
aa('--n_centroids', default=256, type=int,
   help="number of IVF centroids")


group = parser.add_argument_group('searching')

aa('--k', default=100, type=int, help='nb of nearest neighbors')
# Bug fix: without type=int a command-line value would arrive as a string,
# which is later assigned to index_gpu.nprobe.
aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')

args = parser.parse_args()

print("args:", args)
|
||
# Fixed seed so benchmark datasets are reproducible across runs.
rs = np.random.RandomState(123)

# Shared GPU resources object used by every index_cpu_to_gpu call below.
res = faiss.StandardGpuResources()

# Use an RMM pool memory resource for device allocations
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)
|
||
def bench_train_milliseconds(index, trainVecs, use_raft):
    """Clone `index` onto GPU 0 (RAFT-backed if requested) and return the
    wall-clock time of train() in milliseconds."""
    cloner_opts = faiss.GpuMultipleClonerOptions()
    cloner_opts.use_raft = use_raft
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index, cloner_opts)
    start = time.time()
    gpu_index.train(trainVecs)
    elapsed = time.time() - start
    return elapsed * 1000
|
||
|
||
if args.bm_train:
    print("=" * 40)
    print("GPU Train Benchmarks")
    print("=" * 40)
    trainset_sizes = [5000, 10000, 100000, 1000000, 5000000]
    dataset_dims = [128, 256, 1024]
    for n_rows in trainset_sizes:
        for n_cols in dataset_dims:
            # Fresh, untrained CPU index for every configuration; training is
            # timed on its GPU clone inside bench_train_milliseconds.
            index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
            trainVecs = rs.rand(n_rows, n_cols).astype('float32')
            raft_ms = bench_train_milliseconds(index, trainVecs, True)
            line = "Method: IVFFlat, Operation: TRAIN, dim: %d, n_centroids %d, numTrain: %d, " % (
                n_cols, args.n_centroids, n_rows)
            if args.raft_only:
                print(line + "RAFT enabled GPU train time: %.3f milliseconds" % raft_ms)
            else:
                classical_ms = bench_train_milliseconds(index, trainVecs, False)
                print(line + "classical GPU train time: %.3f milliseconds, RAFT enabled GPU train time: %.3f milliseconds" % (
                    classical_ms, raft_ms))
|
||
|
||
def bench_add_milliseconds(index, addVecs, use_raft):
    """Return the wall-clock time of add() in milliseconds on a GPU clone of
    `index` (GPU 0, RAFT-backed if requested)."""
    cloner_opts = faiss.GpuMultipleClonerOptions()
    cloner_opts.use_raft = use_raft
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index, cloner_opts)
    # Copy the trained CPU index state first so only add() itself is timed.
    gpu_index.copyFrom(index)
    start = time.time()
    gpu_index.add(addVecs)
    return (time.time() - start) * 1000
|
||
|
||
if args.bm_add:
    print("=" * 40)
    print("GPU Add Benchmarks")
    print("=" * 40)
    addset_sizes = [5000, 10000, 100000, 1000000]
    dataset_dims = [128, 256, 1024]
    n_train = 10000
    for n_cols in dataset_dims:
        # Bug fix: trainVecs and the index depend on n_cols, so they must be
        # (re)built inside the dimension loop. Previously n_cols was read
        # before assignment and a single index was shared across dimensions,
        # so its dimensionality never matched most addVecs.
        trainVecs = rs.rand(n_train, n_cols).astype('float32')
        index = faiss.index_factory(
            n_cols, "IVF" + str(args.n_centroids) + ",Flat")
        index.train(trainVecs)
        for n_rows in addset_sizes:
            addVecs = rs.rand(n_rows, n_cols).astype('float32')
            raft_gpu_add_time = bench_add_milliseconds(index, addVecs, True)
            if args.raft_only:
                # Bug fix: format arguments now match the placeholders
                # (dim, n_centroids, numAdd, time); previously 5 values were
                # supplied for 4 specifiers, raising TypeError.
                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, RAFT enabled GPU add time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_rows, raft_gpu_add_time))
            else:
                classical_gpu_add_time = bench_add_milliseconds(
                    index, addVecs, False)
                print("Method: IVFFlat, Operation: ADD, dim: %d, n_centroids %d, numAdd: %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_rows, classical_gpu_add_time, raft_gpu_add_time))
|
||
|
||
def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
    """Return the wall-clock time of search() in milliseconds on a GPU clone
    of `index` populated with `addVecs` (GPU 0, RAFT-backed if requested)."""
    cloner_opts = faiss.GpuMultipleClonerOptions()
    cloner_opts.use_raft = use_raft
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index, cloner_opts)
    # Populate the clone up front so only search() itself is timed.
    gpu_index.copyFrom(index)
    gpu_index.add(addVecs)
    gpu_index.nprobe = nprobe
    start = time.time()
    gpu_index.search(queryVecs, k)
    return (time.time() - start) * 1000
|
||
|
||
if args.bm_search:
    print("=" * 40)
    print("GPU Search Benchmarks")
    print("=" * 40)
    queryset_sizes = [5000, 10000, 100000, 500000]
    n_train = 10000
    n_add = 100000
    search_bm_dims = [8, 16, 32]
    for n_cols in search_bm_dims:
        # One trained index per dimension; each query-set size reuses it.
        index = faiss.index_factory(n_cols, "IVF{},Flat".format(args.n_centroids))
        trainVecs = rs.rand(n_train, n_cols).astype('float32')
        index.train(trainVecs)
        addVecs = rs.rand(n_add, n_cols).astype('float32')
        for n_rows in queryset_sizes:
            queryVecs = rs.rand(n_rows, n_cols).astype('float32')
            raft_gpu_search_time = bench_search_milliseconds(
                index, addVecs, queryVecs, args.nprobe, args.k, True)
            if args.raft_only:
                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
            else:
                classical_gpu_search_time = bench_search_milliseconds(
                    index, addVecs, queryVecs, args.nprobe, args.k, False)
                print("Method: IVFFlat, Operation: SEARCH, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
                    n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))

    print("=" * 40)
    print("Large RAFT Enabled Benchmarks")
    print("=" * 40)
    # Avoid classical GPU Benchmarks for large datasets because of OOM for more than 500000 queries and/or large dims as well as for large k
    queryset_sizes = [100000, 500000, 1000000]
    large_search_bm_dims = [128, 256, 1024]
    for n_cols in large_search_bm_dims:
        trainVecs = rs.rand(n_train, n_cols).astype('float32')
        index = faiss.index_factory(
            n_cols, "IVF" + str(args.n_centroids) + ",Flat")
        index.train(trainVecs)
        addVecs = rs.rand(n_add, n_cols).astype('float32')
        for n_rows in queryset_sizes:
            queryVecs = rs.rand(n_rows, n_cols).astype('float32')
            raft_gpu_search_time = bench_search_milliseconds(
                index, addVecs, queryVecs, args.nprobe, args.k, True)
            # Bug fix: the format string has 8 placeholders (numTrain, dim,
            # n_centroids, numVecs, numQuery, nprobe, k, time) but only 7
            # values were supplied, raising TypeError; n_train is now passed.
            print("Method: IVFFlat, Operation: SEARCH, numTrain: %d, dim: %d, n_centroids: %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
                n_train, n_cols, args.n_centroids, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
# Copyright (c) Facebook, Inc. and its affiliates. | ||
# | ||
# This source code is licensed under the MIT license found in the | ||
# LICENSE file in the root directory of this source tree. | ||
# | ||
# Copyright (c) 2023, NVIDIA CORPORATION. | ||
# | ||
# Licensed under the Apache License, Version 2.0 (the "License"); | ||
# you may not use this file except in compliance with the License. | ||
# You may obtain a copy of the License at | ||
# | ||
# http://www.apache.org/licenses/LICENSE-2.0 | ||
# | ||
# Unless required by applicable law or agreed to in writing, software | ||
# distributed under the License is distributed on an "AS IS" BASIS, | ||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
# See the License for the specific language governing permissions and | ||
# limitations under the License. | ||
# | ||
|
||
import numpy as np | ||
import faiss | ||
import time | ||
import argparse | ||
import rmm | ||
|
||
###################################################### | ||
# Command-line parsing | ||
###################################################### | ||
|
||
parser = argparse.ArgumentParser()


def aa(*args, **kwargs):
    """Shorthand: add an argument to the currently active argument group."""
    group.add_argument(*args, **kwargs)


group = parser.add_argument_group('benchmarking options')
aa('--raft_only', default=False, action='store_true',
   help='whether to only produce RAFT enabled benchmarks')

group = parser.add_argument_group('IVF options')
aa('--bits_per_code', default=8, type=int, help='bits per code. Note that < 8 is only supported when RAFT is enabled')
aa('--pq_len', default=2, type=int, help='number of vector elements represented by one PQ code')
# NOTE(review): type=bool is an argparse pitfall — bool("False") is True, so
# any non-empty command-line value enables this option. Kept as-is for
# interface compatibility; confirm whether explicit true/false parsing is wanted.
aa('--use_precomputed', default=True, type=bool, help='use precomputed codes (not with RAFT enabled)')

group = parser.add_argument_group('searching')
aa('--k', default=10, type=int, help='nb of nearest neighbors')
aa('--nprobe', default=50, type=int, help='nb of IVF lists to probe')

args = parser.parse_args()

print("args:", args)

# Bug fix: load the dataset only after argument parsing so that e.g. --help
# or an argument error does not pay the cost of reading SIFT1M.
from datasets import load_sift1M, evaluate

print("load data")
xb, xq, xt, gt = load_sift1M()
|
||
# Fixed seed so any randomly generated data is reproducible across runs.
rs = np.random.RandomState(123)

# Shared GPU resources object used by every index_cpu_to_gpu call below.
res = faiss.StandardGpuResources()

# Use an RMM pool memory resource for device allocations
mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource())
rmm.mr.set_current_device_resource(mr)
|
||
# A heuristic to select a suitable number of lists | ||
def compute_nlist(numVecs): | ||
nlist = np.sqrt(numVecs) | ||
if (numVecs / nlist < 1000): | ||
nlist = numVecs / 1000 | ||
return int(nlist) | ||
|
||
|
||
def bench_train_milliseconds(index, trainVecs, use_raft):
    """Clone `index` onto GPU 0 (RAFT-backed if requested) and return the
    wall-clock time of train() in milliseconds."""
    cloner_opts = faiss.GpuMultipleClonerOptions()
    # use float 16 lookup tables to save space
    cloner_opts.useFloat16LookupTables = True
    cloner_opts.use_raft = use_raft
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index, cloner_opts)
    start = time.time()
    gpu_index.train(trainVecs)
    return (time.time() - start) * 1000
|
||
# Derive the index geometry from the SIFT1M shapes.
n_rows, n_cols = xb.shape
n_train, _ = xt.shape
M = n_cols // args.pq_len  # number of PQ sub-quantizers
nlist = compute_nlist(n_rows)
index = faiss.index_factory(n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code))

print("=" * 40)
print("GPU Train Benchmarks")
print("=" * 40)
raft_gpu_train_time = bench_train_milliseconds(index, xt, True)
train_prefix = "Method: IVFPQ, Operation: TRAIN, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numTrain: %d, " % (
    n_cols, nlist, M, args.bits_per_code, n_train)
train_report = "RAFT enabled GPU train time: %.3f milliseconds" % raft_gpu_train_time
if not args.raft_only:
    classical_gpu_train_time = bench_train_milliseconds(index, xt, False)
    train_report = ("classical GPU train time: %.3f milliseconds, " % classical_gpu_train_time) + train_report
print(train_prefix + train_report)
|
||
|
||
def bench_add_milliseconds(index, addVecs, use_raft):
    """Return the wall-clock time of add() in milliseconds on a GPU clone of
    `index` (GPU 0, RAFT-backed if requested)."""
    cloner_opts = faiss.GpuMultipleClonerOptions()
    # use float 16 lookup tables to save space
    cloner_opts.useFloat16LookupTables = True
    cloner_opts.use_raft = use_raft
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index, cloner_opts)
    # Copy the trained CPU index state first so only add() itself is timed.
    gpu_index.copyFrom(index)
    start = time.time()
    gpu_index.add(addVecs)
    return (time.time() - start) * 1000
|
||
print("=" * 40)
print("GPU Add Benchmarks")
print("=" * 40)
# Train once on CPU; the benchmark then times add() on GPU clones.
index.train(xt)
raft_gpu_add_time = bench_add_milliseconds(index, xb, True)
if args.raft_only:
    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d numSubQuantizers %d, bitsPerCode %d, numAdd %d, RAFT enabled GPU add time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_rows, raft_gpu_add_time))
else:
    classical_gpu_add_time = bench_add_milliseconds(
        index, xb, False)
    # Bug fix: the method label read "IVFFPQ"; corrected to "IVFPQ" to match
    # every other benchmark line in this script.
    print("Method: IVFPQ, Operation: ADD, dim: %d, n_centroids %d, numSubQuantizers %d, bitsPerCode %d, numAdd %d, classical GPU add time: %.3f milliseconds, RAFT enabled GPU add time: %.3f milliseconds" % (
        n_cols, nlist, M, args.bits_per_code, n_rows, classical_gpu_add_time, raft_gpu_add_time))
|
||
|
||
def bench_search_milliseconds(index, addVecs, queryVecs, nprobe, k, use_raft):
    """Return the wall-clock time of search() in milliseconds on a GPU clone
    of `index` populated with `addVecs` (GPU 0, RAFT-backed if requested)."""
    cloner_opts = faiss.GpuMultipleClonerOptions()
    cloner_opts.use_raft = use_raft
    cloner_opts.useFloat16LookupTables = True
    gpu_index = faiss.index_cpu_to_gpu(res, 0, index, cloner_opts)
    # Populate the clone up front so only search() itself is timed.
    gpu_index.copyFrom(index)
    gpu_index.add(addVecs)
    gpu_index.nprobe = nprobe
    start = time.time()
    gpu_index.search(queryVecs, k)
    return (time.time() - start) * 1000
|
||
|
||
# Bug fix: this script never registers a --bm_search option, so the original
# `if args.bm_search:` raised AttributeError; default to running the benchmark.
if getattr(args, 'bm_search', True):
    print("=" * 40)
    print("GPU Search Benchmarks")
    print("=" * 40)
    queryset_sizes = [1, 10, 100, 1000, 10000]
    n_train, n_cols = xt.shape
    n_add, _ = xb.shape
    print(xq.shape)
    M = n_cols // args.pq_len  # number of PQ sub-quantizers
    nlist = compute_nlist(n_add)
    index = faiss.index_factory(n_cols, "IVF{},PQ{}x{}np".format(nlist, M, args.bits_per_code))
    index.train(xt)
    for n_rows in queryset_sizes:
        # Sample n_rows distinct queries from the full query set.
        queryVecs = xq[np.random.choice(xq.shape[0], n_rows, replace=False)]
        raft_gpu_search_time = bench_search_milliseconds(
            index, xb, queryVecs, args.nprobe, args.k, True)
        if args.raft_only:
            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, RAFT enabled GPU search time: %.3f milliseconds" % (
                n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, raft_gpu_search_time))
        else:
            classical_gpu_search_time = bench_search_milliseconds(
                index, xb, queryVecs, args.nprobe, args.k, False)
            print("Method: IVFPQ, Operation: SEARCH, dim: %d, n_centroids: %d, numSubQuantizers %d, bitsPerCode %d, numVecs: %d, numQuery: %d, nprobe: %d, k: %d, classical GPU search time: %.3f milliseconds, RAFT enabled GPU search time: %.3f milliseconds" % (
                n_cols, nlist, M, args.bits_per_code, n_add, n_rows, args.nprobe, args.k, classical_gpu_search_time, raft_gpu_search_time))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.