From e9e8eddb6d3c667386874b1cb93a65197aa4dd80 Mon Sep 17 00:00:00 2001 From: pengyu <6712304+FantasyVR@users.noreply.github.com> Date: Tue, 20 Dec 2022 15:30:03 +0800 Subject: [PATCH] [lang] Reorder sparse matrix before solving (#6886) Issue: #2906 ### Brief Summary When solving a sparse linear system without reordering the system matrix, `analyze_pattern` and `factorize` would consume a lot of GPU memory. For example, the `res` in the `stable fluid` example can not be larger than 500. After reordering the sparse system matrix, we can set `res=512` now. Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../examples/simulation/stable_fluid.py | 2 +- taichi/program/sparse_solver.cpp | 153 +++++++++++++++--- taichi/program/sparse_solver.h | 11 +- taichi/rhi/cuda/cusolver_functions.inc.h | 2 + taichi/rhi/cuda/cusparse_functions.inc.h | 2 + 5 files changed, 150 insertions(+), 20 deletions(-) diff --git a/python/taichi/examples/simulation/stable_fluid.py b/python/taichi/examples/simulation/stable_fluid.py index ddcdcdc2e87ba..8fdd46ea3bc29 100644 --- a/python/taichi/examples/simulation/stable_fluid.py +++ b/python/taichi/examples/simulation/stable_fluid.py @@ -27,7 +27,7 @@ help='The arch (backend) to run this example on') args, unknowns = parser.parse_known_args() -res = 256 +res = 512 dt = 0.03 p_jacobi_iters = 500 # 40 for a quicker but less accurate result f_strength = 10000.0 diff --git a/taichi/program/sparse_solver.cpp b/taichi/program/sparse_solver.cpp index caa11ac5e7d08..a0c43e32671f2 100644 --- a/taichi/program/sparse_solver.cpp +++ b/taichi/program/sparse_solver.cpp @@ -199,9 +199,11 @@ void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) { SparseMatrix *sm_no_cv = const_cast(&sm); CuSparseMatrix *A = dynamic_cast(sm_no_cv); size_t rowsA = A->num_rows(); + size_t colsA = A->num_cols(); size_t nnzA = A->get_nnz(); void *d_csrRowPtrA = A->get_row_ptr(); void *d_csrColIndA = A->get_col_ind(); + void *d_csrValA = A->get_val_ptr(); CUSOLVERDriver::get_instance().csSpCreate(&cusolver_handle_); CUSPARSEDriver::get_instance().cpCreate(&cusparse_handel_); CUSPARSEDriver::get_instance().cpCreateMatDescr(&descr_); @@ -210,12 +212,68 @@ void CuSparseSolver::analyze_pattern(const SparseMatrix &sm) { CUSPARSEDriver::get_instance().cpSetMatIndexBase(descr_, CUSPARSE_INDEX_BASE_ZERO); - // step 1: create opaque info structure + // step 1: reorder the sparse matrix + float *h_csrValA = nullptr; + h_Q = (int *)malloc(sizeof(int) * colsA); + h_csrRowPtrB = (int *)malloc(sizeof(int) * (rowsA + 1)); + h_csrColIndB = (int *)malloc(sizeof(int) * nnzA); + h_csrValB = (float *)malloc(sizeof(float) * nnzA); + h_csrValA = (float *)malloc(sizeof(float) * nnzA); + h_mapBfromA = (int *)malloc(sizeof(int) * nnzA); + assert(nullptr != h_Q); + assert(nullptr != h_csrRowPtrB); + assert(nullptr != h_csrColIndB); + assert(nullptr != h_csrValB); + assert(nullptr != h_mapBfromA); + + CUDADriver::get_instance().memcpy_device_to_host(h_csrRowPtrB, d_csrRowPtrA, + sizeof(int) * (rowsA + 1)); + CUDADriver::get_instance().memcpy_device_to_host(h_csrColIndB, d_csrColIndA, + sizeof(int) * nnzA); + CUDADriver::get_instance().memcpy_device_to_host(h_csrValA, d_csrValA, + sizeof(float) * nnzA); + + // compoute h_Q + CUSOLVERDriver::get_instance().csSpXcsrsymamdHost( + cusolver_handle_, rowsA, nnzA, descr_, h_csrRowPtrB, h_csrColIndB, h_Q); + CUDADriver::get_instance().malloc((void **)&d_Q, sizeof(int) * colsA); + CUDADriver::get_instance().memcpy_host_to_device((void *)d_Q, (void *)h_Q, + sizeof(int) * (colsA)); + size_t size_perm = 0; + CUSOLVERDriver::get_instance().csSpXcsrperm_bufferSizeHost( + cusolver_handle_, rowsA, colsA, nnzA, descr_, h_csrRowPtrB, h_csrColIndB, + h_Q, h_Q, &size_perm); + void *buffer_cpu = (void *)malloc(sizeof(char) * size_perm); + assert(nullptr != buffer_cpu); + for (int j = 0; j < nnzA; j++) { + h_mapBfromA[j] = j; + } + CUSOLVERDriver::get_instance().csSpXcsrpermHost( + cusolver_handle_, rowsA, colsA, nnzA, descr_, h_csrRowPtrB, h_csrColIndB, + h_Q, h_Q, h_mapBfromA, buffer_cpu); + // B = A( mapBfromA ) + for (int j = 0; j < nnzA; j++) { + h_csrValB[j] = h_csrValA[h_mapBfromA[j]]; + } + CUDADriver::get_instance().malloc((void **)&d_csrRowPtrB, + sizeof(int) * (rowsA + 1)); + CUDADriver::get_instance().malloc((void **)&d_csrColIndB, sizeof(int) * nnzA); + CUDADriver::get_instance().malloc((void **)&d_csrValB, sizeof(float) * nnzA); + CUDADriver::get_instance().memcpy_host_to_device( + (void *)d_csrRowPtrB, (void *)h_csrRowPtrB, sizeof(int) * (rowsA + 1)); + CUDADriver::get_instance().memcpy_host_to_device( + (void *)d_csrColIndB, (void *)h_csrColIndB, sizeof(int) * nnzA); + CUDADriver::get_instance().memcpy_host_to_device( + (void *)d_csrValB, (void *)h_csrValB, sizeof(float) * nnzA); + free(h_csrValA); + free(buffer_cpu); + + // step 2: create opaque info structure CUSOLVERDriver::get_instance().csSpCreateCsrcholInfo(&info_); - // step 2: analyze chol(A) to know structure of L + // step 3: analyze chol(A) to know structure of L CUSOLVERDriver::get_instance().csSpXcsrcholAnalysis( - cusolver_handle_, rowsA, nnzA, descr_, d_csrRowPtrA, d_csrColIndA, info_); + cusolver_handle_, rowsA, nnzA, descr_, d_csrRowPtrB, d_csrColIndB, info_); is_analyzed_ = true; #else TI_NOT_IMPLEMENTED @@ -229,26 +287,23 @@ void CuSparseSolver::factorize(const SparseMatrix &sm) { CuSparseMatrix *A = dynamic_cast(sm_no_cv); size_t rowsA = A->num_rows(); size_t nnzA = A->get_nnz(); - void *d_csrRowPtrA = A->get_row_ptr(); - void *d_csrColIndA = A->get_col_ind(); - void *d_csrValA = A->get_val_ptr(); size_t size_internal = 0; size_t size_chol = 0; // size of working space for csrlu // step 1: workspace for chol(A) CUSOLVERDriver::get_instance().csSpScsrcholBufferInfo( - cusolver_handle_, rowsA, nnzA, descr_, d_csrValA, d_csrRowPtrA, - d_csrColIndA, info_, &size_internal, &size_chol); + cusolver_handle_, rowsA, nnzA, descr_, d_csrValB, d_csrRowPtrB, + d_csrColIndB, info_, &size_internal, &size_chol); if (size_chol > 0) CUDADriver::get_instance().malloc(&gpu_buffer_, sizeof(char) * size_chol); // step 2: compute A = L*L^T CUSOLVERDriver::get_instance().csSpScsrcholFactor( - cusolver_handle_, rowsA, nnzA, descr_, d_csrValA, d_csrRowPtrA, - d_csrColIndA, info_, gpu_buffer_); + cusolver_handle_, rowsA, nnzA, descr_, d_csrValB, d_csrRowPtrB, + d_csrColIndB, info_, gpu_buffer_); // step 3: check if the matrix is singular - const double tol = 1.e-14; + const float tol = 1.e-14; int singularity = 0; CUSOLVERDriver::get_instance().csSpScsrcholZeroPivot(cusolver_handle_, info_, tol, &singularity); @@ -439,16 +494,46 @@ void CuSparseSolver::solve_rf(Program *prog, SparseMatrix *sm_no_cv = const_cast(&sm); CuSparseMatrix *A = dynamic_cast(sm_no_cv); size_t rowsA = A->num_rows(); + size_t colsA = A->num_cols(); size_t d_b = prog->get_ndarray_data_ptr_as_int(&b); size_t d_x = prog->get_ndarray_data_ptr_as_int(&x); - CUSOLVERDriver::get_instance().csSpScsrcholSolve( - cusolver_handle_, rowsA, (void *)d_b, (void *)d_x, info_, gpu_buffer_); - // TODO: free allocated memory and handles - // CUDADriver::get_instance().mem_free(gpu_buffer_); - // CUSOLVERDriver::get_instance().csSpDestory(cusolver_handle_); - // CUSPARSEDriver::get_instance().cpDestroy(cusparse_handel_); - // CUSPARSEDriver::get_instance().cpDestroyMatDescr(descrA); + // step 1: d_Qb = Q * b + void *d_Qb = nullptr; + CUDADriver::get_instance().malloc(&d_Qb, sizeof(float) * rowsA); + cusparseDnVecDescr_t vec_b; + cusparseSpVecDescr_t vec_Qb; + CUSPARSEDriver::get_instance().cpCreateDnVec(&vec_b, (int)rowsA, (void *)d_b, + CUDA_R_32F); + CUSPARSEDriver::get_instance().cpCreateSpVec( + &vec_Qb, (int)rowsA, (int)rowsA, d_Q, d_Qb, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); + CUSPARSEDriver::get_instance().cpGather(cusparse_handel_, vec_b, vec_Qb); + + // step 2: solve B*z = Q*b using cholesky solver + void *d_z = nullptr; + CUDADriver::get_instance().malloc(&d_z, sizeof(float) * colsA); + CUSOLVERDriver::get_instance().csSpScsrcholSolve( + cusolver_handle_, rowsA, (void *)d_Qb, (void *)d_z, info_, gpu_buffer_); + + // step 3: Q*x = z + cusparseSpVecDescr_t vecX; + cusparseDnVecDescr_t vecY; + CUSPARSEDriver::get_instance().cpCreateSpVec( + &vecX, (int)colsA, (int)colsA, d_Q, d_z, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F); + CUSPARSEDriver::get_instance().cpCreateDnVec(&vecY, (int)colsA, (void *)d_x, + CUDA_R_32F); + CUSPARSEDriver::get_instance().cpScatter(cusparse_handel_, vecX, vecY); + + if (d_Qb != nullptr) + CUDADriver::get_instance().mem_free(d_Qb); + if (d_z != nullptr) + CUDADriver::get_instance().mem_free(d_z); + CUSPARSEDriver::get_instance().cpDestroySpVec(vec_Qb); + CUSPARSEDriver::get_instance().cpDestroyDnVec(vec_b); + CUSPARSEDriver::get_instance().cpDestroySpVec(vecX); + CUSPARSEDriver::get_instance().cpDestroyDnVec(vecY); #else TI_NOT_IMPLEMENTED #endif @@ -492,6 +577,38 @@ std::unique_ptr make_sparse_solver(DataType dt, TI_ERROR("Not supported sparse solver type: {}", solver_type); } +CuSparseSolver::~CuSparseSolver() { +#if defined(TI_WITH_CUDA) + if (h_Q != nullptr) + free(h_Q); + if (h_csrRowPtrB != nullptr) + free(h_csrRowPtrB); + if (h_csrColIndB != nullptr) + free(h_csrColIndB); + if (h_csrValB != nullptr) + free(h_csrValB); + if (h_mapBfromA != nullptr) + free(h_mapBfromA); + if (info_ != nullptr) + CUSOLVERDriver::get_instance().csSpDestroyCsrcholInfo(info_); + if (cusolver_handle_ != nullptr) + CUSOLVERDriver::get_instance().csSpDestory(cusolver_handle_); + if (cusparse_handel_ != nullptr) + CUSPARSEDriver::get_instance().cpDestroy(cusparse_handel_); + if (descr_ != nullptr) + CUSPARSEDriver::get_instance().cpDestroyMatDescr(descr_); + if (gpu_buffer_ != nullptr) + CUDADriver::get_instance().mem_free(gpu_buffer_); + if (d_Q != nullptr) + CUDADriver::get_instance().mem_free(d_Q); + if (d_csrRowPtrB != nullptr) + CUDADriver::get_instance().mem_free(d_csrRowPtrB); + if (d_csrColIndB != nullptr) + CUDADriver::get_instance().mem_free(d_csrColIndB); + if (d_csrValB != nullptr) + CUDADriver::get_instance().mem_free(d_csrValB); +#endif +} std::unique_ptr make_cusparse_solver( DataType dt, const std::string &solver_type, diff --git a/taichi/program/sparse_solver.h b/taichi/program/sparse_solver.h index 67f7796784f1d..0b4e930dfc6d6 100644 --- a/taichi/program/sparse_solver.h +++ b/taichi/program/sparse_solver.h @@ -85,9 +85,18 @@ class CuSparseSolver : public SparseSolver { bool is_analyzed_{false}; bool is_factorized_{false}; + int *h_Q{nullptr}; /* n, B = Q*A*Q' or B = A(Q,Q) by MATLAB notation */ + int *d_Q{nullptr}; + int *h_csrRowPtrB{nullptr}; /* n+1 */ + int *h_csrColIndB{nullptr}; /* nnzA */ + float *h_csrValB{nullptr}; /* nnzA */ + int *h_mapBfromA{nullptr}; /* nnzA */ + int *d_csrRowPtrB{nullptr}; /* n+1 */ + int *d_csrColIndB{nullptr}; /* nnzA */ + float *d_csrValB{nullptr}; /* nnzA */ public: CuSparseSolver(); - ~CuSparseSolver() override = default; + ~CuSparseSolver() override; bool compute(const SparseMatrix &sm) override { TI_NOT_IMPLEMENTED; }; diff --git a/taichi/rhi/cuda/cusolver_functions.inc.h b/taichi/rhi/cuda/cusolver_functions.inc.h index fbde791da6853..b57025a1afe2b 100644 --- a/taichi/rhi/cuda/cusolver_functions.inc.h +++ b/taichi/rhi/cuda/cusolver_functions.inc.h @@ -9,12 +9,14 @@ PER_CUSOLVER_FUNCTION(csSpCreate, cusolverSpCreate, cusolverSpHandle_t * ); PER_CUSOLVER_FUNCTION(csSpDestory, cusolverSpDestroy, cusolverSpHandle_t ); PER_CUSOLVER_FUNCTION(csSpXcsrissymHost, cusolverSpXcsrissymHost, cusolverSpHandle_t,int ,int ,const cusparseMatDescr_t ,const void *,const void *,const void *,void *); PER_CUSOLVER_FUNCTION(csSpXcsrsymrcmHost, cusolverSpXcsrsymrcmHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,void *); +PER_CUSOLVER_FUNCTION(csSpXcsrsymamdHost, cusolverSpXcsrsymamdHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,void *); PER_CUSOLVER_FUNCTION(csSpXcsrperm_bufferSizeHost, cusolverSpXcsrperm_bufferSizeHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,size_t *); PER_CUSOLVER_FUNCTION(csSpXcsrpermHost, cusolverSpXcsrpermHost, cusolverSpHandle_t ,int ,int ,int ,const cusparseMatDescr_t ,void *,void *,const void *,const void *,void *,void *); PER_CUSOLVER_FUNCTION(csSpScsrlsvcholHost, cusolverSpScsrlsvcholHost, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,const void *,const void *,const void *,const void *,float ,int ,void *,void *); // cusolver preview API PER_CUSOLVER_FUNCTION(csSpCreateCsrcholInfo, cusolverSpCreateCsrcholInfo, csrcholInfo_t*); +PER_CUSOLVER_FUNCTION(csSpDestroyCsrcholInfo, cusolverSpDestroyCsrcholInfo, csrcholInfo_t); PER_CUSOLVER_FUNCTION(csSpXcsrcholAnalysis, cusolverSpXcsrcholAnalysis, cusolverSpHandle_t,int ,int ,const cusparseMatDescr_t , void *, void *,csrcholInfo_t ); PER_CUSOLVER_FUNCTION(csSpScsrcholBufferInfo, cusolverSpScsrcholBufferInfo, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,size_t *,size_t *); PER_CUSOLVER_FUNCTION(csSpScsrcholFactor, cusolverSpScsrcholFactor, cusolverSpHandle_t ,int ,int ,const cusparseMatDescr_t ,void *,void *,void *,csrcholInfo_t ,void *); diff --git a/taichi/rhi/cuda/cusparse_functions.inc.h b/taichi/rhi/cuda/cusparse_functions.inc.h index 53e7881e036b7..3bdcd647e452c 100644 --- a/taichi/rhi/cuda/cusparse_functions.inc.h +++ b/taichi/rhi/cuda/cusparse_functions.inc.h @@ -20,6 +20,8 @@ PER_CUSPARSE_FUNCTION(cpCreateIdentityPermutation, cusparseCreateIdentityPermuta PER_CUSPARSE_FUNCTION(cpXcoosort_bufferSizeExt, cusparseXcoosort_bufferSizeExt, cusparseHandle_t,int ,int,int, void* ,void* ,void*); PER_CUSPARSE_FUNCTION(cpXcoosortByRow, cusparseXcoosortByRow, cusparseHandle_t,int,int,int,void* ,void* ,void* ,void*); PER_CUSPARSE_FUNCTION(cpGather, cusparseGather, cusparseHandle_t, cusparseDnVecDescr_t, cusparseSpVecDescr_t); +PER_CUSPARSE_FUNCTION(cpScatter, cusparseScatter, cusparseHandle_t, cusparseSpVecDescr_t, cusparseDnVecDescr_t); +PER_CUSPARSE_FUNCTION(cpSsctr, cusparseSsctr, cusparseHandle_t, int, void*, void*, void*, cusparseIndexBase_t); PER_CUSPARSE_FUNCTION(cpSetPointerMode, cusparseSetPointerMode, cusparseHandle_t, cusparsePointerMode_t); PER_CUSPARSE_FUNCTION(cpCsrSetPointers, cusparseCsrSetPointers, cusparseSpMatDescr_t,void*, void*, void*);