From bb884ae5e78ddc3322543419cef14b8e6cc4b3f1 Mon Sep 17 00:00:00 2001
From: pengyu <6712304+FantasyVR@users.noreply.github.com>
Date: Mon, 21 Nov 2022 13:26:26 +0800
Subject: [PATCH] [lang] Merge triplets in the same position when building GPU
 sparse matrix (#6605)

Issue: #2906

### Brief Summary

When building a GPU sparse matrix, the cuSparse API requires three separate
arrays: a row index pointer, a column index pointer, and a values pointer.
However, the sparse matrix builder stores all triplets in a single ndarray
with the interleaved layout [row, col, value, row, col, value, ...]. In this
PR, I read all triplets back from the ndarray and merge the ones that fall on
the same position of the sparse matrix by summing their values. The merged
triplets are then stored in three separate arrays, and finally these three
arrays are used to build the sparse matrix through the cuSparse API.

---
 misc/test_build_cusm_from_coo.py          | 40 ---------
 misc/test_coo_cusolver.py                 | 75 -----------------
 python/taichi/linalg/sparse_matrix.py     | 38 +++------
 python/taichi/linalg/sparse_solver.py     |  2 +-
 taichi/program/sparse_matrix.cpp          | 98 +++++++++++++++------
 taichi/program/sparse_matrix.h            |  9 +--
 taichi/python/export_lang.cpp             | 10 +--
 tests/python/test_sparse_linear_solver.py | 53 +++++++++---
 tests/python/test_sparse_matrix.py        | 96 +++++++++++-----------
 9 files changed, 181 insertions(+), 240 deletions(-)
 delete mode 100644 misc/test_build_cusm_from_coo.py
 delete mode 100644 misc/test_coo_cusolver.py

diff --git a/misc/test_build_cusm_from_coo.py b/misc/test_build_cusm_from_coo.py
deleted file mode 100644
index 620b95232f7d2..0000000000000
--- a/misc/test_build_cusm_from_coo.py
+++ /dev/null
@@ -1,40 +0,0 @@
-import numpy as np
-
-import taichi as ti
-
-ti.init(arch=ti.cuda)
-
-h_coo_row = np.asarray([0, 0, 0, 1, 2, 2, 2, 3, 3], dtype=np.int32)
-h_coo_col = np.asarray([0, 2, 3, 1, 0, 2, 3, 1, 3], dtype=np.int32)
-h_coo_val = np.asarray([1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0],
-                       dtype=np.float32)
-h_x = np.asarray([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
-h_y = np.asarray([19.0, 8.0, 51.0, 52.0], dtype=np.float32)
-
-d_coo_row = ti.ndarray(shape=9, dtype=ti.int32)
-d_coo_col = ti.ndarray(shape=9, dtype=ti.int32)
-d_coo_val = ti.ndarray(shape=9, dtype=ti.float32)
-x = ti.ndarray(shape=4, dtype=ti.float32)
-y = ti.ndarray(shape=4, dtype=ti.float32)
-
-d_coo_row.from_numpy(h_coo_row)
-d_coo_col.from_numpy(h_coo_col)
-d_coo_val.from_numpy(h_coo_val)
-x.from_numpy(h_x)
-y.fill(0.0)
-
-A = ti.linalg.SparseMatrix(n=4, m=4, dtype=ti.float32)
-A.build_coo(d_coo_row, d_coo_col, d_coo_val)
-
-y = A @ x
-
-# Check if the results are correct
-equal = True
-for i in range(4):
-    if y[i] != h_y[i]:
-        equal = False
-        break
-if equal:
-    print("Spmv Results is correct!")
-else:
-    print("Opps! Spmv Results is wrong.")
diff --git a/misc/test_coo_cusolver.py b/misc/test_coo_cusolver.py
deleted file mode 100644
index 08e30bd2c83ad..0000000000000
--- a/misc/test_coo_cusolver.py
+++ /dev/null
@@ -1,75 +0,0 @@
-import numpy as np
-from scipy.sparse import coo_matrix
-
-import taichi as ti
-
-ti.init(arch=ti.cuda)
-
-
-@ti.kernel
-def init_b(b: ti.types.ndarray(), nrows: ti.i32):
-    for i in range(nrows):
-        b[i] = 1.0 + i / nrows
-
-
-@ti.kernel
-def print_x(x: ti.types.ndarray(), ncols: ti.i32):
-    for i in range(ncols):
-        print(x[i], end=' ')
-    print()
-
-
-"""
-Generate a positive definite matrix with a given number of rows and columns.
-Reference: https://stackoverflow.com/questions/619335/a-simple-algorithm-for-generating-positive-semidefinite-matrices
-"""
-matrixSize = 10
-A = np.random.rand(matrixSize, matrixSize)
-A_psd = np.dot(A, A.transpose())
-
-A_raw_coo = coo_matrix(A_psd)
-nrows, ncols = A_raw_coo.shape
-nnz = A_raw_coo.nnz
-
-A_csr = A_raw_coo.tocsr()
-b = ti.ndarray(shape=nrows, dtype=ti.f32)
-init_b(b, nrows)
-
-print(">> solve Ax = b using Cusolver ......... ")
-A_coo = A_csr.tocoo()
-d_row_coo = ti.ndarray(shape=nnz, dtype=ti.i32)
-d_col_coo = ti.ndarray(shape=nnz, dtype=ti.i32)
-d_val_coo = ti.ndarray(shape=nnz, dtype=ti.f32)
-d_row_coo.from_numpy(A_coo.row)
-d_col_coo.from_numpy(A_coo.col)
-d_val_coo.from_numpy(A_coo.data)
-
-A_ti = ti.linalg.SparseMatrix(n=nrows, m=ncols, dtype=ti.float32)
-A_ti.build_coo(d_row_coo, d_col_coo, d_val_coo)
-x_ti = ti.ndarray(shape=ncols, dtype=ti.float32)
-solver = ti.linalg.SparseSolver()
-x_ti = solver.solve_cu(A_ti, b)
-ti.sync()
-print_x(x_ti, ncols)
-ti.sync()
-
-print(">> solve Ax = b using Numpy ......... ")
-b_np = b.to_numpy()
-x_np = np.linalg.solve(A_psd, b_np)
-print(x_np)
-
-print(
-    f"The solution is identical?: {np.allclose(x_ti.to_numpy(), x_np, atol=1e-1)}"
-)
-
-solver = ti.linalg.SparseSolver()
-solver.analyze_pattern(A_ti)
-solver.factorize(A_ti)
-x_ti = solver.solve(b)
-
-ti.sync()
-print_x(x_ti, ncols)
-ti.sync()
-print(
-    f"The cusolver rf solution and numpy solution is identical?: {np.allclose(x_ti.to_numpy(), x_np, atol=1e-1)}"
-)
diff --git a/python/taichi/linalg/sparse_matrix.py b/python/taichi/linalg/sparse_matrix.py
index 6224a9822b5b0..81bc5ef2707e8 100644
--- a/python/taichi/linalg/sparse_matrix.py
+++ b/python/taichi/linalg/sparse_matrix.py
@@ -1,12 +1,13 @@
 from functools import reduce
 
 import numpy as np
+from taichi._lib import core as _ti_core
 from taichi.lang._ndarray import Ndarray, ScalarNdarray
 from taichi.lang.exception import TaichiRuntimeError
 from taichi.lang.field import Field
 from taichi.lang.impl import get_runtime
 from taichi.lang.util import warning
-from taichi.types import annotations, f32, i32
+from taichi.types import annotations, f32
 
 
 class SparseMatrix:
@@ -206,30 +207,6 @@ def build_from_ndarray(self, ndarray):
                 'Sparse matrix only supports building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray]'
             )
 
-    def build_coo(self, row_coo, col_coo, value_coo):
-        """Build a CSR format sparse matrix from COO format inputs.
-
-        Args:
-            row_indices (ti.ndarray): the row indices of the matrix entries.
-            col_indices (ti.ndarray): the column indices of the matrix entries.
-            data (ti.ndarray): the entries of the matrix.
-
-        Raises:
-            TaichiRuntimeError: If the inputs are not ``ti.ndarray`` or the datatypes of the ndarray are not correct.
-        """
-        if not isinstance(row_coo, Ndarray) or not isinstance(
-                col_coo, Ndarray) or not isinstance(value_coo, Ndarray):
-            raise TaichiRuntimeError(
-                'Sparse matrix only supports COO format building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray].'
-            )
-        elif value_coo.dtype != f32 or row_coo.dtype != i32 or col_coo.dtype != i32:
-            raise TaichiRuntimeError(
-                'Sparse matrix only supports COO fromat building from float32 data and int32 row/col indices.'
-            )
-        else:
-            get_runtime().prog.make_sparse_matrix_from_ndarray_cusparse(
-                self.matrix, row_coo.arr, col_coo.arr, value_coo.arr)
-
 
 class SparseMatrixBuilder:
     """A python wrap around sparse matrix builder.
@@ -270,8 +247,15 @@ def print_triplets(self):
 
     def build(self, dtype=f32, _format='CSR'):
         """Create a sparse matrix using the triplets"""
-        sm = self.ptr.build()
-        return SparseMatrix(sm=sm)
+        taichi_arch = get_runtime().prog.config().arch
+        if taichi_arch == _ti_core.Arch.x64 or taichi_arch == _ti_core.Arch.arm64:
+            sm = self.ptr.build()
+            return SparseMatrix(sm=sm)
+        if taichi_arch == _ti_core.Arch.cuda:
+            sm = self.ptr.build_cuda()
+            return SparseMatrix(sm=sm)
+        raise TaichiRuntimeError(
+            'Sparse matrix only supports CPU and CUDA backends.')
 
 
 # TODO: remove this in 1.0 release
diff --git a/python/taichi/linalg/sparse_solver.py b/python/taichi/linalg/sparse_solver.py
index aa1c36c02dfc4..ddf813d81fd90 100644
--- a/python/taichi/linalg/sparse_solver.py
+++ b/python/taichi/linalg/sparse_solver.py
@@ -51,7 +51,7 @@ def compute(self, sparse_matrix):
         if isinstance(sparse_matrix, SparseMatrix):
             self.matrix = sparse_matrix
             taichi_arch = taichi.lang.impl.get_runtime().prog.config().arch
-            if taichi_arch == _ti_core.Arch.x64:
+            if taichi_arch == _ti_core.Arch.x64 or taichi_arch == _ti_core.Arch.arm64:
                 self.solver.compute(sparse_matrix.matrix)
             elif taichi_arch == _ti_core.Arch.cuda:
                 self.analyze_pattern(self.matrix)
diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp
index cc5ddf59f035d..5ed15f6b79618 100644
--- a/taichi/program/sparse_matrix.cpp
+++ b/taichi/program/sparse_matrix.cpp
@@ -1,5 +1,6 @@
 #include "taichi/program/sparse_matrix.h"
 
+#include <map>
 #include <sstream>
 #include <string>
 #include <unordered_map>
@@ -145,6 +146,58 @@ std::unique_ptr<SparseMatrix> SparseMatrixBuilder::build() {
   return sm;
 }
 
+std::unique_ptr<SparseMatrix> SparseMatrixBuilder::build_cuda() {
+  TI_ASSERT(built_ == false);
+  built_ = true;
+  auto sm = make_cu_sparse_matrix(rows_, cols_, dtype_);
+#ifdef TI_WITH_CUDA
+  num_triplets_ = ndarray_data_base_ptr_->read_int(std::vector<int>{0});
+  std::map<int, std::tuple<int, int, float32>> entries;
+  for (auto i = 0; i < num_triplets_; i++) {
+    auto idx = 3 * i + 1;
+    auto row = ndarray_data_base_ptr_->read_int(std::vector<int>{idx});
+    auto col = ndarray_data_base_ptr_->read_int(std::vector<int>{idx + 1});
+    auto val = ndarray_data_base_ptr_->read_float(std::vector<int>{idx + 2});
+    auto e_idx = row * cols_ + col;
+    if (entries.find(e_idx) == entries.end()) {
+      entries[e_idx] = std::make_tuple(row, col, val);
+    } else {
+      auto [r, c, v] = entries[e_idx];
+      entries[e_idx] = std::make_tuple(r, c, v + val);
+    }
+  }
+  auto entry_size = entries.size();
+  int *row_host = (int *)malloc(sizeof(int) * entry_size);
+  int *col_host = (int *)malloc(sizeof(int) * entry_size);
+  float32 *value_host = (float32 *)malloc(sizeof(float32) * entry_size);
+  int count = 0;
+  for (auto entry : entries) {
+    auto [row, col, value] = entry.second;
+    row_host[count] = row;
+    col_host[count] = col;
+    value_host[count] = value;
+    count++;
+  }
+  void *row_device = nullptr, *col_device = nullptr, *value_device = nullptr;
+  CUDADriver::get_instance().malloc(&row_device, entry_size * sizeof(int));
+  CUDADriver::get_instance().malloc(&col_device, entry_size * sizeof(int));
+  CUDADriver::get_instance().malloc(&value_device,
+                                    entry_size * sizeof(float32));
+  CUDADriver::get_instance().memcpy_host_to_device(row_device, (void *)row_host,
+                                                   entry_size * sizeof(int));
+  CUDADriver::get_instance().memcpy_host_to_device(col_device, (void *)col_host,
+                                                   entry_size * sizeof(int));
+  CUDADriver::get_instance().memcpy_host_to_device(
+      value_device, (void *)value_host, entry_size * sizeof(float32));
+  sm->build_csr_from_coo(row_device, col_device, value_device, entry_size);
+  clear();
+  free(row_host);
+  free(col_host);
+  free(value_host);
+#endif
+  return sm;
+}
+
 void SparseMatrixBuilder::clear() {
   built_ = false;
   ndarray_data_base_ptr_->write_int(std::vector<int>{0}, 0);
@@ -286,14 +339,20 @@ void CuSparseMatrix::build_csr_from_coo(void *coo_row_ptr,
       &matrix_, rows_, cols_, nnz, csr_row_offset_ptr, coo_col_ptr,
       coo_values_ptr, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
       CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
-  CUSPARSEDriver::get_instance().cpDestroySpVec(vec_permutation);
-  CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_values);
-  CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
-  // TODO: free csr_row_offset_ptr
-  // CUDADriver::get_instance().mem_free(csr_row_offset_ptr);
-  CUDADriver::get_instance().mem_free(d_values_sorted);
-  CUDADriver::get_instance().mem_free(d_permutation);
-  CUDADriver::get_instance().mem_free(dbuffer);
+  if (vec_permutation)
+    CUSPARSEDriver::get_instance().cpDestroySpVec(vec_permutation);
+  if (vec_values)
+    CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_values);
+  if (cusparse_handle)
+    CUSPARSEDriver::get_instance().cpDestroy(cusparse_handle);
+  if (coo_row_ptr)
+    CUDADriver::get_instance().mem_free(coo_row_ptr);
+  if (d_values_sorted)
+    CUDADriver::get_instance().mem_free(d_values_sorted);
+  if (d_permutation)
+    CUDADriver::get_instance().mem_free(d_permutation);
+  if (dbuffer)
+    CUDADriver::get_instance().mem_free(dbuffer);
   csr_row_ptr_ = csr_row_offset_ptr;
   csr_col_ind_ = coo_col_ptr;
   csr_val_ = coo_values_ptr;
@@ -303,21 +362,14 @@
 
 CuSparseMatrix::~CuSparseMatrix() {
 #if defined(TI_WITH_CUDA)
-  CUSPARSEDriver::get_instance().cpDestroySpMat(matrix_);
-#endif
-}
-void make_sparse_matrix_from_ndarray_cusparse(Program *prog,
-                                              SparseMatrix &sm,
-                                              const Ndarray &row_coo,
-                                              const Ndarray &col_coo,
-                                              const Ndarray &val_coo) {
-#if defined(TI_WITH_CUDA)
-  size_t coo_row_ptr = prog->get_ndarray_data_ptr_as_int(&row_coo);
-  size_t coo_col_ptr = prog->get_ndarray_data_ptr_as_int(&col_coo);
-  size_t coo_val_ptr = prog->get_ndarray_data_ptr_as_int(&val_coo);
-  int nnz = val_coo.get_nelement();
-  sm.build_csr_from_coo((void *)coo_row_ptr, (void *)coo_col_ptr,
-                        (void *)coo_val_ptr, nnz);
+  if (matrix_)
+    CUSPARSEDriver::get_instance().cpDestroySpMat(matrix_);
+  if (csr_row_ptr_)
+    CUDADriver::get_instance().mem_free(csr_row_ptr_);
+  if (csr_col_ind_)
+    CUDADriver::get_instance().mem_free(csr_col_ind_);
+  if (csr_val_)
+    CUDADriver::get_instance().mem_free(csr_val_);
 #endif
 }
diff --git a/taichi/program/sparse_matrix.h b/taichi/program/sparse_matrix.h
index 4b976d6f176da..6031e402441cb 100644
--- a/taichi/program/sparse_matrix.h
+++ b/taichi/program/sparse_matrix.h
@@ -28,6 +28,8 @@ class SparseMatrixBuilder {
 
   std::unique_ptr<SparseMatrix> build();
 
+  std::unique_ptr<SparseMatrix> build_cuda();
+
   void clear();
 
  private:
@@ -287,7 +289,7 @@ class CuSparseMatrix : public SparseMatrix {
   }
 
  private:
-  cusparseSpMatDescr_t matrix_;
+  cusparseSpMatDescr_t matrix_{nullptr};
   void *csr_row_ptr_{nullptr};
   void *csr_col_ind_{nullptr};
   void *csr_val_{nullptr};
@@ -310,9 +312,4 @@ std::unique_ptr<SparseMatrix> make_cu_sparse_matrix(cusparseSpMatDescr_t mat,
 void make_sparse_matrix_from_ndarray(Program *prog,
                                      SparseMatrix &sm,
                                      const Ndarray &ndarray);
-void make_sparse_matrix_from_ndarray_cusparse(Program *prog,
-                                              SparseMatrix &sm,
-                                              const Ndarray &row_indices,
-                                              const Ndarray &col_indices,
-                                              const Ndarray &values);
 }  // namespace taichi::lang
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 391f7be0d8866..7714adf922e76 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -421,15 +421,6 @@ void export_lang(py::module &m) {
                  "SparseMatrix only supports CPU and CUDA for now.");
              return make_sparse_matrix_from_ndarray(program, sm, ndarray);
            })
-      .def("make_sparse_matrix_from_ndarray_cusparse",
-           [](Program *program, CuSparseMatrix &sm, const Ndarray &row_coo,
-              const Ndarray &col_coo, const Ndarray &val_coo) {
-             TI_ERROR_IF(
-                 !arch_is_cuda(program->this_thread_config().arch),
-                 "SparseMatrix based on GPU only supports CUDA for now.");
-             return make_sparse_matrix_from_ndarray_cusparse(
-                 program, sm, row_coo, col_coo, val_coo);
-           })
       .def("no_activate",
            [](Program *program, SNode *snode) {
              // TODO(#2193): Also apply to @ti.func?
@@ -1199,6 +1190,7 @@ void export_lang(py::module &m) {
       .def("print_triplets", &SparseMatrixBuilder::print_triplets)
       .def("get_ndarray_data_ptr", &SparseMatrixBuilder::get_ndarray_data_ptr)
       .def("build", &SparseMatrixBuilder::build)
+      .def("build_cuda", &SparseMatrixBuilder::build_cuda)
      .def("get_addr", [](SparseMatrixBuilder *mat) { return uint64(mat); });
 
  py::class_<SparseMatrix>(m, "SparseMatrix")
diff --git a/tests/python/test_sparse_linear_solver.py b/tests/python/test_sparse_linear_solver.py
index 4ad1bfea38050..772cfc2382d75 100644
--- a/tests/python/test_sparse_linear_solver.py
+++ b/tests/python/test_sparse_linear_solver.py
@@ -26,7 +26,6 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(),
 
     fill(Abuilder, A_psd, b)
     A = Abuilder.build()
-    print(A)
     solver = ti.linalg.SparseSolver(dtype=dtype,
                                     solver_type=solver_type,
                                     ordering=ordering)
@@ -66,15 +65,20 @@ def init_b(b: ti.types.ndarray(), nrows: ti.i32):
 
     # solve Ax = b using cusolver
     A_coo = A_csr.tocoo()
-    d_row_coo = ti.ndarray(shape=nnz, dtype=ti.i32)
-    d_col_coo = ti.ndarray(shape=nnz, dtype=ti.i32)
-    d_val_coo = ti.ndarray(shape=nnz, dtype=ti.f32)
-    d_row_coo.from_numpy(A_coo.row)
-    d_col_coo.from_numpy(A_coo.col)
-    d_val_coo.from_numpy(A_coo.data)
-
-    A_ti = ti.linalg.SparseMatrix(n=nrows, m=ncols, dtype=ti.float32)
-    A_ti.build_coo(d_row_coo, d_col_coo, d_val_coo)
+    A_builder = ti.linalg.SparseMatrixBuilder(num_rows=nrows,
+                                              num_cols=ncols,
+                                              dtype=ti.f32,
+                                              max_num_triplets=nnz)
+
+    @ti.kernel
+    def fill(A_builder: ti.types.sparse_matrix_builder(),
+             row_coo: ti.types.ndarray(), col_coo: ti.types.ndarray(),
+             val_coo: ti.types.ndarray()):
+        for i in range(nnz):
+            A_builder[row_coo[i], col_coo[i]] += val_coo[i]
+
+    fill(A_builder, A_coo.row, A_coo.col, A_coo.data)
+    A_ti = A_builder.build()
     x_ti = ti.ndarray(shape=ncols, dtype=ti.float32)
     solver = ti.linalg.SparseSolver()
     x_ti = solver.solve_cu(A_ti, b)
@@ -99,3 +103,32 @@ def init_b(b: ti.types.ndarray(), nrows: ti.i32):
     x_cti = solver.solve(b)
     ti.sync()
     assert (np.allclose(x_cti.to_numpy(), x_np, rtol=5.0e-3))
+
+
+@pytest.mark.parametrize("dtype", [ti.f32])
+@test_utils.test(arch=ti.cuda)
+def test_gpu_sparse_solver2(dtype):
+    n = 10
+    A = np.random.rand(n, n)
+    A_psd = np.dot(A, A.transpose())
+    Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=300)
+    b = ti.ndarray(ti.f32, shape=n)
+
+    @ti.kernel
+    def fill(Abuilder: ti.types.sparse_matrix_builder(),
+             InputArray: ti.types.ndarray(), b: ti.types.ndarray()):
+        for i, j in ti.ndrange(n, n):
+            Abuilder[i, j] += InputArray[i, j]
+        for i in range(n):
+            b[i] = i + 1
+
+    fill(Abuilder, A_psd, b)
+    A = Abuilder.build()
+    solver = ti.linalg.SparseSolver(dtype=dtype)
+    solver.analyze_pattern(A)
+    solver.factorize(A)
+    x = solver.solve(b)
+
+    res = np.linalg.solve(A_psd, b.to_numpy())
+    for i in range(n):
+        assert x[i] == test_utils.approx(res[i], rel=1.0)
diff --git a/tests/python/test_sparse_matrix.py b/tests/python/test_sparse_matrix.py
index e09fd850c7375..ce90d4616e15a 100644
--- a/tests/python/test_sparse_matrix.py
+++ b/tests/python/test_sparse_matrix.py
@@ -379,36 +379,35 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(),
 @test_utils.test(arch=ti.cuda)
 def test_gpu_sparse_matrix():
     import numpy as np
-    h_coo_row = np.asarray([1, 0, 0, 0, 2, 2, 2, 3, 3], dtype=np.int32)
-    h_coo_col = np.asarray([1, 0, 2, 3, 0, 2, 3, 1, 3], dtype=np.int32)
-    h_coo_val = np.asarray([4.0, 1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0],
-                           dtype=np.float32)
-    h_X = np.asarray([1.0, 2.0, 3.0, 4.0], dtype=np.float32)
-    h_Y = np.asarray([19.0, 8.0, 51.0, 52.0], dtype=np.float32)
-
-    # Data structure for building the CSR matrix A using Taichi Sparse Matrix
-    idx_dt = ti.int32
-    val_dt = ti.f32
-    d_coo_row = ti.ndarray(shape=9, dtype=idx_dt)
-    d_coo_col = ti.ndarray(shape=9, dtype=idx_dt)
-    d_coo_val = ti.ndarray(shape=9, dtype=val_dt)
-    # Dense vector x
-    X = ti.ndarray(shape=4, dtype=val_dt)
-    # Results for A @ x
-    Y = ti.ndarray(shape=4, dtype=val_dt)
-
-    # Initialize the CSR matrix and vectors with numpy array
-    d_coo_row.from_numpy(h_coo_row)
-    d_coo_col.from_numpy(h_coo_col)
-    d_coo_val.from_numpy(h_coo_val)
+    num_triplets, num_rows, num_cols = 9, 4, 4
+    np_idx_dtype, np_val_dtype = np.int32, np.float32
+    coo_row = np.asarray([0, 0, 0, 1, 2, 2, 2, 3, 3], dtype=np_idx_dtype)
+    coo_col = np.asarray([0, 2, 3, 1, 0, 2, 3, 1, 3], dtype=np_idx_dtype)
+    coo_val = np.asarray([i + 1.0 for i in range(num_triplets)],
+                         dtype=np_val_dtype)
+    h_X = np.asarray([1.0, 2.0, 3.0, 4.0], dtype=np_val_dtype)
+    h_Y = np.asarray([19.0, 8.0, 51.0, 52.0], dtype=np_val_dtype)
+
+    ti_dtype = ti.f32
+    X = ti.ndarray(shape=num_cols, dtype=ti_dtype)
+    Y = ti.ndarray(shape=num_rows, dtype=ti_dtype)
+
     X.from_numpy(h_X)
     Y.fill(0.0)
 
-    # Define the CSR matrix A
-    A = ti.linalg.SparseMatrix(n=4, m=4, dtype=ti.f32)
+    A_builder = ti.linalg.SparseMatrixBuilder(num_rows=4,
+                                              num_cols=4,
+                                              dtype=ti_dtype,
+                                              max_num_triplets=50)
+
+    @ti.kernel
+    def fill(A: ti.types.sparse_matrix_builder(), coo_row: ti.types.ndarray(),
+             coo_col: ti.types.ndarray(), coo_val: ti.types.ndarray()):
+        for i in range(num_triplets):
+            A[coo_row[i], coo_col[i]] += coo_val[i]
 
-    # Build the CSR matrix A with Taichi ndarray
-    A.build_coo(d_coo_row, d_coo_col, d_coo_val)
+    fill(A_builder, coo_row, coo_col, coo_val)
+    A = A_builder.build()
 
     # Compute Y = A @ X
     Y = A @ X
@@ -424,12 +423,18 @@ def test_gpu_sparse_matrix_ops(N):
     from scipy import stats
     from scipy.sparse import coo_matrix, random
 
+    @ti.kernel
+    def fill(A: ti.types.sparse_matrix_builder(), coo_row: ti.types.ndarray(),
+             coo_col: ti.types.ndarray(), coo_val: ti.types.ndarray(),
+             nnz: ti.i32):
+        for i in range(nnz):
+            A[coo_row[i], coo_col[i]] += coo_val[i]
+
     seed = 2
     np.random.seed(seed)
     rng = default_rng(seed)
     rvs = stats.poisson(3, loc=1).rvs
     np_dtype = np.float32
-    idx_dt = ti.int32
     val_dt = ti.float32
 
     n_rows = N
@@ -440,28 +445,21 @@ def test_gpu_sparse_matrix_ops(N):
     S2 = random(n_rows, n_cols, density=0.5, random_state=rng,
                 data_rvs=rvs).astype(np_dtype).tocoo()
-    nnz_A = len(S1.data)
-    nnz_B = len(S2.data)
-
-    row_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
-    col_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
-    value_coo_A = ti.ndarray(shape=nnz_A, dtype=val_dt)
-    row_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
-    col_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
-    value_coo_B = ti.ndarray(shape=nnz_B, dtype=val_dt)
-
-    row_coo_A.from_numpy(S1.row)
-    col_coo_A.from_numpy(S1.col)
-    value_coo_A.from_numpy(S1.data)
-
-    row_coo_B.from_numpy(S2.row)
-    col_coo_B.from_numpy(S2.col)
-    value_coo_B.from_numpy(S2.data)
-
-    A = ti.linalg.SparseMatrix(n=n_rows, m=n_cols, dtype=ti.f32)
-    B = ti.linalg.SparseMatrix(n=n_rows, m=n_cols, dtype=ti.f32)
-    A.build_coo(row_coo_A, col_coo_A, value_coo_A)
-    B.build_coo(row_coo_B, col_coo_B, value_coo_B)
+    nnz_A = S1.nnz
+    nnz_B = S2.nnz
+
+    A_builder = ti.linalg.SparseMatrixBuilder(num_rows=n_rows,
+                                              num_cols=n_cols,
+                                              dtype=val_dt,
+                                              max_num_triplets=nnz_A)
+    B_builder = ti.linalg.SparseMatrixBuilder(num_rows=n_rows,
+                                              num_cols=n_cols,
+                                              dtype=val_dt,
+                                              max_num_triplets=nnz_B)
+    fill(A_builder, S1.row, S1.col, S1.data, nnz_A)
+    fill(B_builder, S2.row, S2.col, S2.data, nnz_B)
+    A = A_builder.build()
+    B = B_builder.build()
 
     def verify(scipy_spm, taichi_spm):
         scipy_spm = scipy_spm.tocoo()
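
### Usage sketch

For reference, a minimal end-to-end sketch of the builder-based GPU path that replaces `build_coo`, distilled from the updated `test_gpu_sparse_matrix` above. The COO data and the expected SpMV result come from that test; the snippet assumes a CUDA-capable build of Taichi, and the duplicate triplet at (0, 0) is added here purely to illustrate the merging behavior:

```python
import numpy as np

import taichi as ti

ti.init(arch=ti.cuda)

n = 4
# One duplicate position is included on purpose: the two (0, 0) triplets
# below are accumulated into a single entry (0.5 + 0.5) at build time.
coo_row = np.asarray([0, 0, 0, 0, 1, 2, 2, 2, 3, 3], dtype=np.int32)
coo_col = np.asarray([0, 0, 2, 3, 1, 0, 2, 3, 1, 3], dtype=np.int32)
coo_val = np.asarray([0.5, 0.5, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0],
                     dtype=np.float32)
num_triplets = coo_row.shape[0]

A_builder = ti.linalg.SparseMatrixBuilder(num_rows=n,
                                          num_cols=n,
                                          dtype=ti.f32,
                                          max_num_triplets=50)


@ti.kernel
def fill(A: ti.types.sparse_matrix_builder(), rows: ti.types.ndarray(),
         cols: ti.types.ndarray(), vals: ti.types.ndarray()):
    for i in range(num_triplets):
        A[rows[i], cols[i]] += vals[i]


fill(A_builder, coo_row, coo_col, coo_val)
A = A_builder.build()  # dispatches to build_cuda() on the CUDA backend

x = ti.ndarray(shape=n, dtype=ti.f32)
x.from_numpy(np.asarray([1.0, 2.0, 3.0, 4.0], dtype=np.float32))
y = A @ x  # SpMV through cuSparse; expected result: [19.0, 8.0, 51.0, 52.0]
print([y[i] for i in range(n)])
```

On the CPU backends (x64/arm64) the same code takes the Eigen path through `SparseMatrixBuilder::build()`, so user code no longer needs to special-case the backend.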