diff --git a/misc/test_gpu_sm_op.py b/misc/test_gpu_sm_op.py
deleted file mode 100644
index 2de61af1d9a48..0000000000000
--- a/misc/test_gpu_sm_op.py
+++ /dev/null
@@ -1,84 +0,0 @@
-import numpy as np
-import scipy
-from numpy.random import default_rng
-from scipy import stats
-from scipy.sparse import random
-
-import taichi as ti
-
-ti.init(arch=ti.cuda)
-
-idx_dt = ti.i32
-val_dt = ti.f32
-
-seed = 2
-np.random.seed(seed)
-
-rng = default_rng(seed)
-rvs = stats.poisson(3, loc=1).rvs
-N = 5
-np_dtype = np.float32
-rows = N
-cols = N - 1
-
-S1 = random(rows, cols, density=0.5, random_state=rng,
-            data_rvs=rvs).astype(np_dtype)
-S2 = random(rows, cols, density=0.5, random_state=rng,
-            data_rvs=rvs).astype(np_dtype)
-
-nnz_A = len(S1.data)
-nnz_B = len(S2.data)
-
-coo_S1 = scipy.sparse.coo_matrix(S1)
-coo_S2 = scipy.sparse.coo_matrix(S2)
-
-row_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
-col_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
-value_coo_A = ti.ndarray(shape=nnz_A, dtype=val_dt)
-row_coo_A.from_numpy(coo_S1.row)
-col_coo_A.from_numpy(coo_S1.col)
-value_coo_A.from_numpy(coo_S1.data)
-
-row_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
-col_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
-value_coo_B = ti.ndarray(shape=nnz_B, dtype=val_dt)
-row_coo_B.from_numpy(coo_S2.row)
-col_coo_B.from_numpy(coo_S2.col)
-value_coo_B.from_numpy(coo_S2.data)
-
-A = ti.linalg.SparseMatrix(n=rows, m=cols, dtype=ti.f32)
-B = ti.linalg.SparseMatrix(n=rows, m=cols, dtype=ti.f32)
-A.build_coo(row_coo_A, col_coo_A, value_coo_A)
-B.build_coo(row_coo_B, col_coo_B, value_coo_B)
-
-print('>>>> A:')
-print(A)
-print('>>>> B:')
-print(B)
-
-print('>>>> C = A + B:')
-C = A + B
-print(C)
-print('>>>> verify:')
-S3 = S1 + S2
-print(S3.A)
-print('>>>> C - A:')
-D = C - A
-print(D)
-print('>>>> verify:')
-print((S3 - S1).A)
-print('>>>> A * 2.5:')
-E = A * 2.5
-print(E)
-print('>>>> verify:')
-print((2.5 * S1).A)
-print('>>>> A.T:')
-F = A.transpose()
-print(F)
-print('>>>> verify:')
-print(S1.T.A)
-print('>>>> A @ B.T:')
-G = A @ B.transpose()
-print(G)
-print('>>>> verify:')
-print((S1 @ S2.T).A)
diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp
index befd15951cf40..bf096ac57528e 100644
--- a/taichi/program/sparse_matrix.cpp
+++ b/taichi/program/sparse_matrix.cpp
@@ -35,12 +35,12 @@ struct key_hash {
 };
 
 template <typename T, typename T1, typename T2>
-void print_triplet_from_csr(int64_t n_rows,
-                            int n_cols,
-                            T *row,
-                            T1 *col,
-                            T2 *value,
-                            std::ostringstream &ostr) {
+void print_triplets_from_csr(int64_t n_rows,
+                             int n_cols,
+                             T *row,
+                             T1 *col,
+                             T2 *value,
+                             std::ostringstream &ostr) {
   using Triplets = Eigen::Triplet<T2>;
   std::vector<Triplets> trips;
   for (int64_t i = 1; i <= n_rows; ++i) {
@@ -57,6 +57,20 @@ void print_triplet_from_csr(int64_t n_rows,
   ostr << Eigen::MatrixXf(m.cast<float>()).format(clean_fmt);
 }
 
+template <typename T, typename T1, typename T2>
+T2 get_element_from_csr(int row,
+                        int col,
+                        T *row_data,
+                        T1 *col_data,
+                        T2 *value) {
+  for (T i = row_data[row]; i < row_data[row + 1]; ++i) {
+    if (col == col_data[i])
+      return value[i];
+  }
+  // zero entry
+  return 0;
+}
+
 }  // namespace
 
 namespace taichi::lang {
@@ -433,7 +447,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::addition(
 std::unique_ptr<SparseMatrix> CuSparseMatrix::matmul(
     const CuSparseMatrix &other) const {
 #if defined(TI_WITH_CUDA)
-  return gemm(other, 1.0f, 1.0f);
+  return gemm(other, 1.0f, 0.0f);
 #else
   TI_NOT_IMPLEMENTED;
   return std::unique_ptr<SparseMatrix>();
@@ -446,7 +460,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
                                                    const float alpha,
                                                    const float beta) const {
 #if defined(TI_WITH_CUDA)
-  cusparseHandle_t handle;
+  cusparseHandle_t handle = nullptr;
   CUSPARSEDriver::get_instance().cpCreate(&handle);
   cusparseOperation_t op_A = CUSPARSE_OPERATION_NON_TRANSPOSE;
   cusparseOperation_t op_B = CUSPARSE_OPERATION_NON_TRANSPOSE;
@@ -468,7 +482,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
   CUSPARSEDriver::get_instance().cpCreateSpGEMM(&spgemm_desc);
 
   // 3. ask buffer_size1 bytes for external memory
-  void *d_buffer1;
+  void *d_buffer1 = nullptr;
   size_t buffer_size1 = 0;
   CUSPARSEDriver::get_instance().cpSpGEMM_workEstimation(
       handle, op_A, op_B, &alpha, this->matrix_, other.matrix_, &beta, mat_C,
@@ -486,7 +500,7 @@ std::unique_ptr<SparseMatrix> CuSparseMatrix::gemm(const CuSparseMatrix &other,
   CUSPARSEDriver::get_instance().cpSpGEMM_compute(
       handle, op_A, op_B, &alpha, mat_A, mat_B, &beta, mat_C, CUDA_R_32F,
       CUSPARSE_SPGEMM_DEFAULT, spgemm_desc, &buffer_size2, nullptr);
-  void *d_buffer2;
+  void *d_buffer2 = nullptr;
   CUDADriver::get_instance().malloc((void **)&d_buffer2, buffer_size2);
 
   // 6. compute the intermediate product of A * B
@@ -647,9 +661,48 @@ const std::string CuSparseMatrix::to_string() const {
   CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV,
                                                    (nnz) * sizeof(float));
 
-  print_triplet_from_csr(rows, cols, hR, hC, hV, ostr);
+  print_triplets_from_csr(rows, cols, hR, hC, hV, ostr);
+  delete[] hR;
+  delete[] hC;
+  delete[] hV;
 #endif
   return ostr.str();
 }
 
+float CuSparseMatrix::get_element(int row, int col) const {
+  float res = 0.0f;
+#ifdef TI_WITH_CUDA
+  // CSR layout returned by cusparseCsrGet: row offsets and column indices
+  // are int, values are float.
+  size_t rows, cols, nnz;
+  int *dR, *dC;
+  float *dV;
+  cusparseIndexType_t row_type, column_type;
+  cusparseIndexBase_t idx_base;
+  cudaDataType value_type;
+  CUSPARSEDriver::get_instance().cpCsrGet(
+      matrix_, &rows, &cols, &nnz, (void **)&dR, (void **)&dC, (void **)&dV,
+      &row_type, &column_type, &idx_base, &value_type);
+
+  TI_ASSERT(row < rows);
+  TI_ASSERT(col < cols);
+
+  auto *hR = new int[rows + 1];
+  auto *hC = new int[nnz];
+  auto *hV = new float[nnz];
+
+  CUDADriver::get_instance().memcpy_device_to_host((void *)hR, (void *)dR,
+                                                   (rows + 1) * sizeof(int));
+  CUDADriver::get_instance().memcpy_device_to_host((void *)hC, (void *)dC,
+                                                   (nnz) * sizeof(int));
+  CUDADriver::get_instance().memcpy_device_to_host((void *)hV, (void *)dV,
+                                                   (nnz) * sizeof(float));
+
+  res = get_element_from_csr(row, col, hR, hC, hV);
+
+  delete[] hR;
+  delete[] hC;
+  delete[] hV;
+#endif  // TI_WITH_CUDA
+  return res;
+}
+
 }  // namespace taichi::lang
diff --git a/taichi/program/sparse_matrix.h b/taichi/program/sparse_matrix.h
index 91e58f983d82c..781f694f3a68f 100644
--- a/taichi/program/sparse_matrix.h
+++ b/taichi/program/sparse_matrix.h
@@ -268,6 +268,8 @@ class CuSparseMatrix : public SparseMatrix {
     return &matrix_;
   };
 
+  float get_element(int row, int col) const;
+
   const std::string to_string() const override;
 
   void *get_row_ptr() const {
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 834a9a16ebc0a..774894b1f78a9 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -1262,6 +1262,7 @@ void export_lang(py::module &m) {
       .def(float32() * py::self)
      .def("matmul", &CuSparseMatrix::matmul)
       .def("transpose", &CuSparseMatrix::transpose)
+      .def("get_element", &CuSparseMatrix::get_element)
       .def("to_string", &CuSparseMatrix::to_string);
 
   py::class_<SparseSolver>(m, "SparseSolver")
diff --git a/tests/python/test_sparse_matrix.py b/tests/python/test_sparse_matrix.py
index 10cf8451a25f0..57f5e1a44a0ff 100644
--- a/tests/python/test_sparse_matrix.py
+++ b/tests/python/test_sparse_matrix.py
@@ -1,4 +1,3 @@
-import numpy as np
 import pytest
 
 import taichi as ti
@@ -379,6 +378,7 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(),
 
 @test_utils.test(arch=ti.cuda)
 def test_gpu_sparse_matrix():
+    import numpy as np
     h_coo_row = np.asarray([1, 0, 0, 0, 2, 2, 2, 3, 3], dtype=np.int32)
     h_coo_col = np.asarray([1, 0, 2, 3, 0, 2, 3, 1, 3], dtype=np.int32)
     h_coo_val = np.asarray([4.0, 1.0, 2.0, 3.0, 5.0, 6.0, 7.0, 8.0, 9.0],
@@ -414,3 +414,80 @@ def test_gpu_sparse_matrix():
     A.spmv(X, Y)
     for i in range(4):
         assert Y[i] == h_Y[i]
+
+
+@pytest.mark.parametrize('N', [5])
+@test_utils.test(arch=ti.cuda)
+def test_gpu_sparse_matrix_ops(N):
+    import numpy as np
+    from numpy.random import default_rng
+    from scipy import stats
+    from scipy.sparse import random
+
+    seed = 2
+    np.random.seed(seed)
+    rng = default_rng(seed)
+    rvs = stats.poisson(3, loc=1).rvs
+    np_dtype = np.float32
+    idx_dt = ti.int32
+    val_dt = ti.float32
+
+    n_rows = N
+    n_cols = N - 1
+
+    S1 = random(n_rows, n_cols, density=0.5, random_state=rng,
+                data_rvs=rvs).astype(np_dtype).tocoo()
+    S2 = random(n_rows, n_cols, density=0.5, random_state=rng,
+                data_rvs=rvs).astype(np_dtype).tocoo()
+
+    nnz_A = len(S1.data)
+    nnz_B = len(S2.data)
+
+    row_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
+    col_coo_A = ti.ndarray(shape=nnz_A, dtype=idx_dt)
+    value_coo_A = ti.ndarray(shape=nnz_A, dtype=val_dt)
+    row_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
+    col_coo_B = ti.ndarray(shape=nnz_B, dtype=idx_dt)
+    value_coo_B = ti.ndarray(shape=nnz_B, dtype=val_dt)
+
+    row_coo_A.from_numpy(S1.row)
+    col_coo_A.from_numpy(S1.col)
+    value_coo_A.from_numpy(S1.data)
+
+    row_coo_B.from_numpy(S2.row)
+    col_coo_B.from_numpy(S2.col)
+    value_coo_B.from_numpy(S2.data)
+
+    A = ti.linalg.SparseMatrix(n=n_rows, m=n_cols, dtype=ti.f32)
+    B = ti.linalg.SparseMatrix(n=n_rows, m=n_cols, dtype=ti.f32)
+    A.build_coo(row_coo_A, col_coo_A, value_coo_A)
+    B.build_coo(row_coo_B, col_coo_B, value_coo_B)
+
+    def verify(scipy_spm, taichi_spm):
+        # Compare every stored entry of the scipy result against the
+        # corresponding element of the Taichi sparse matrix.
+        scipy_spm = scipy_spm.tocoo()
+        for i, j, v in zip(scipy_spm.row, scipy_spm.col, scipy_spm.data):
+            assert v == test_utils.approx(taichi_spm[i, j], rel=1e-5)
+
+    C = A + B
+    S3 = S1 + S2
+    verify(S3, C)
+
+    D = C - A
+    S4 = S3 - S1
+    verify(S4, D)
+
+    E = A * 2.5
+    S5 = S1 * 2.5
+    verify(S5, E)
+
+    F = 2.5 * A
+    S6 = 2.5 * S1
+    verify(S6, F)
+
+    G = A.transpose()
+    S7 = S1.T
+    verify(S7, G)
+
+    H = A @ B.transpose()
+    S8 = S1 @ S2.T
+    verify(S8, H)