diff --git a/CMakeLists.txt b/CMakeLists.txt index 66a4a3d6..bf8241f7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -22,7 +22,7 @@ list(APPEND CMAKE_MODULE_PATH include(cmake/SetToolchain.cmake) # rocSPARSE project -project(rocsparse VERSION 0.1.2.0 LANGUAGES CXX) +project(rocsparse VERSION 0.1.3.0 LANGUAGES CXX) set(rocsparse_SOVERSION 0) # Set a default build type if none was specified @@ -50,7 +50,7 @@ option(BUILD_VERBOSE "Output additional build information" OFF) include(cmake/Dependencies.cmake) # AMD targets -set(AMDGPU_TARGETS gfx803;gfx900 CACHE STRING "List of specific machine types for library to target") +set(AMDGPU_TARGETS gfx803;gfx900;gfx906 CACHE STRING "List of specific machine types for library to target") # rocSPARSE library add_subdirectory(library) diff --git a/Jenkinsfile b/Jenkinsfile index 36fa96eb..bee15511 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -473,7 +473,7 @@ rocm_ubuntu: node( 'docker && rocm && dkms') { def hcc_docker_args = new docker_data( - from_image:'rocm/dev-ubuntu-16.04:1.7.1', + from_image:'rocm/dev-ubuntu-16.04:1.9.0', build_docker_file:'dockerfile-build-ubuntu', install_docker_file:'dockerfile-install-ubuntu', docker_run_args:'--device=/dev/kfd --device=/dev/dri --group-add=video', diff --git a/clients/benchmarks/client.cpp b/clients/benchmarks/client.cpp index 829e02cf..b9a4521b 100644 --- a/clients/benchmarks/client.cpp +++ b/clients/benchmarks/client.cpp @@ -16,9 +16,16 @@ // Level2 #include "testing_coomv.hpp" #include "testing_csrmv.hpp" +#include "testing_csrsv.hpp" #include "testing_ellmv.hpp" #include "testing_hybmv.hpp" +// Level3 +#include "testing_csrmm.hpp" + +// Preconditioner +#include "testing_csrilu0.hpp" + // Conversion #include "testing_csr2coo.hpp" #include "testing_csr2csc.hpp" @@ -85,7 +92,9 @@ int main(int argc, char* argv[]) po::value(&function)->default_value("axpyi"), "SPARSE function to test. 
Options:\n" " Level1: axpyi, doti, gthr, gthrz, roti, sctr\n" - " Level2: coomv, csrmv, ellmv, hybmv\n" + " Level2: coomv, csrmv, csrsv, ellmv, hybmv\n" + " Level3: csrmm\n" + " Preconditioner: csrilu0\n" " Conversion: csr2coo, csr2csc, csr2ell,\n" " csr2hyb, coo2csr, ell2csr\n" " Sorting: csrsort, coosort\n" @@ -201,6 +210,13 @@ int main(int argc, char* argv[]) else if(precision == 'd') testing_csrmv(argus); } + else if(function == "csrsv") + { + if(precision == 's') + testing_csrsv(argus); + else if(precision == 'd') + testing_csrsv(argus); + } else if(function == "ellmv") { if(precision == 's') @@ -215,6 +231,20 @@ int main(int argc, char* argv[]) else if(precision == 'd') testing_hybmv(argus); } + else if(function == "csrmm") + { + if(precision == 's') + testing_csrmm(argus); + else if(precision == 'd') + testing_csrmm(argus); + } + else if(function == "csrilu0") + { + if(precision == 's') + testing_csrilu0(argus); + else if(precision == 'd') + testing_csrilu0(argus); + } else if(function == "csr2coo") { testing_csr2coo(argus); diff --git a/clients/common/arg_check.cpp b/clients/common/arg_check.cpp index 6ae844ad..60acd79c 100644 --- a/clients/common/arg_check.cpp +++ b/clients/common/arg_check.cpp @@ -64,6 +64,19 @@ void verify_rocsparse_status_invalid_value(rocsparse_status status, const char* #endif } +void verify_rocsparse_status_zero_pivot(rocsparse_status status, const char* message) +{ +#ifdef GOOGLE_TEST + ASSERT_EQ(status, rocsparse_status_zero_pivot); +#else + if(status != rocsparse_status_zero_pivot) + { + std::cerr << "rocSPARSE TEST ERROR: status != rocsparse_status_zero_pivot, "; + std::cerr << message << std::endl; + } +#endif +} + void verify_rocsparse_status_invalid_handle(rocsparse_status status) { #ifdef GOOGLE_TEST diff --git a/clients/common/rocsparse_template_specialization.cpp b/clients/common/rocsparse_template_specialization.cpp index 57d72125..e1daba4d 100644 --- a/clients/common/rocsparse_template_specialization.cpp +++ 
b/clients/common/rocsparse_template_specialization.cpp @@ -250,6 +250,158 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, y); } +template <> +rocsparse_status rocsparse_csrsv_buffer_size(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size) +{ + return rocsparse_scsrsv_buffer_size( + handle, trans, m, nnz, descr, csr_val, csr_row_ptr, csr_col_ind, info, buffer_size); +} + +template <> +rocsparse_status rocsparse_csrsv_buffer_size(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size) +{ + return rocsparse_dcsrsv_buffer_size( + handle, trans, m, nnz, descr, csr_val, csr_row_ptr, csr_col_ind, info, buffer_size); +} + +template <> +rocsparse_status rocsparse_csrsv_analysis(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer) +{ + return rocsparse_scsrsv_analysis(handle, + trans, + m, + nnz, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + analysis, + solve, + temp_buffer); +} + +template <> +rocsparse_status rocsparse_csrsv_analysis(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + 
rocsparse_solve_policy solve, + void* temp_buffer) +{ + return rocsparse_dcsrsv_analysis(handle, + trans, + m, + nnz, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + analysis, + solve, + temp_buffer); +} + +template <> +rocsparse_status rocsparse_csrsv_solve(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const float* alpha, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + const float* x, + float* y, + rocsparse_solve_policy policy, + void* temp_buffer) +{ + return rocsparse_scsrsv_solve(handle, + trans, + m, + nnz, + alpha, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + x, + y, + policy, + temp_buffer); +} + +template <> +rocsparse_status rocsparse_csrsv_solve(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const double* alpha, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + const double* x, + double* y, + rocsparse_solve_policy policy, + void* temp_buffer) +{ + return rocsparse_dcsrsv_solve(handle, + trans, + m, + nnz, + alpha, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + x, + y, + policy, + temp_buffer); +} + template <> rocsparse_status rocsparse_ellmv(rocsparse_handle handle, rocsparse_operation trans, @@ -388,6 +540,120 @@ rocsparse_status rocsparse_csrmm(rocsparse_handle handle, ldc); } +template <> +rocsparse_status rocsparse_csrilu0_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size) +{ + return rocsparse_scsrilu0_buffer_size( + handle, m, nnz, descr, csr_val, csr_row_ptr, csr_col_ind, info, 
buffer_size); +} + +template <> +rocsparse_status rocsparse_csrilu0_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size) +{ + return rocsparse_dcsrilu0_buffer_size( + handle, m, nnz, descr, csr_val, csr_row_ptr, csr_col_ind, info, buffer_size); +} + +template <> +rocsparse_status rocsparse_csrilu0_analysis(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer) +{ + return rocsparse_scsrilu0_analysis(handle, + m, + nnz, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + analysis, + solve, + temp_buffer); +} + +template <> +rocsparse_status rocsparse_csrilu0_analysis(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer) +{ + return rocsparse_dcsrilu0_analysis(handle, + m, + nnz, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + analysis, + solve, + temp_buffer); +} + +template <> +rocsparse_status rocsparse_csrilu0(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_solve_policy policy, + void* temp_buffer) +{ + return rocsparse_scsrilu0( + handle, m, nnz, descr, csr_val, csr_row_ptr, csr_col_ind, info, policy, temp_buffer); +} + +template <> +rocsparse_status 
rocsparse_csrilu0(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_solve_policy policy, + void* temp_buffer) +{ + return rocsparse_dcsrilu0( + handle, m, nnz, descr, csr_val, csr_row_ptr, csr_col_ind, info, policy, temp_buffer); +} + template <> rocsparse_status rocsparse_csr2csc(rocsparse_handle handle, rocsparse_int m, diff --git a/clients/common/unit.cpp b/clients/common/unit.cpp index 496bffd8..ef5824c3 100644 --- a/clients/common/unit.cpp +++ b/clients/common/unit.cpp @@ -138,44 +138,3 @@ void unit_check_near( } } } - -/*! \brief Template: gtest unit compare two matrices float/double/complex */ -// Do not put a wrapper over ASSERT_FLOAT_EQ, since assert exit the current function NOT the test -// case -// a wrapper will cause the loop keep going - -template <> -void unit_check_near(rocsparse_int M, rocsparse_int N, float* hCPU, float* hGPU) -{ - for(rocsparse_int j = 0; j < N; j++) - { - for(rocsparse_int i = 0; i < M; i++) - { - float compare_val = - std::max(std::abs(hCPU[i + j] * 1e-6f), 10 * std::numeric_limits::epsilon()); -#ifdef GOOGLE_TEST - ASSERT_NEAR(hCPU[i + j], hGPU[i + j], compare_val); -#else - assert(std::abs(hCPU[i + j] - hGPU[i + j]) < compare_val); -#endif - } - } -} - -template <> -void unit_check_near(rocsparse_int M, rocsparse_int N, double* hCPU, double* hGPU) -{ - for(rocsparse_int j = 0; j < N; j++) - { - for(rocsparse_int i = 0; i < M; i++) - { - double compare_val = std::max(std::abs(hCPU[i + j] * 1e-14), - 10 * std::numeric_limits::epsilon()); -#ifdef GOOGLE_TEST - ASSERT_NEAR(hCPU[i + j], hGPU[i + j], compare_val); -#else - assert(std::abs(hCPU[i + j] - hGPU[i + j]) < compare_val); -#endif - } - } -} diff --git a/clients/common/utility.cpp b/clients/common/utility.cpp index 4f870f4e..2da4e3f6 100644 --- a/clients/common/utility.cpp +++ 
b/clients/common/utility.cpp @@ -46,7 +46,8 @@ rocsparse_int query_device_property() (int)(props.clockRate / 1000), props.major, props.minor); - printf("maxGridDimX %d, sharedMemPerBlock %ldKB, maxThreadsPerBlock %d, wavefrontSize %d\n", + printf("maxGridDimX %d, sharedMemPerBlock %ldKB, maxThreadsPerBlock %d, wavefrontSize " + "%d\n", props.maxGridSize[0], props.sharedMemPerBlock >> 10, props.maxThreadsPerBlock, diff --git a/clients/include/arg_check.hpp b/clients/include/arg_check.hpp index 292e7eb7..bccd10fb 100644 --- a/clients/include/arg_check.hpp +++ b/clients/include/arg_check.hpp @@ -15,6 +15,8 @@ void verify_rocsparse_status_invalid_size(rocsparse_status status, const char* m void verify_rocsparse_status_invalid_value(rocsparse_status status, const char* message); +void verify_rocsparse_status_zero_pivot(rocsparse_status status, const char* message); + void verify_rocsparse_status_invalid_handle(rocsparse_status status); void verify_rocsparse_status_success(rocsparse_status status, const char* message); diff --git a/clients/include/rocsparse.hpp b/clients/include/rocsparse.hpp index f18c253b..8dbbc52f 100644 --- a/clients/include/rocsparse.hpp +++ b/clients/include/rocsparse.hpp @@ -92,6 +92,48 @@ rocsparse_status rocsparse_csrmv(rocsparse_handle handle, const T* beta, T* y); +template +rocsparse_status rocsparse_csrsv_buffer_size(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size); + +template +rocsparse_status rocsparse_csrsv_analysis(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + 
rocsparse_solve_policy solve, + void* temp_buffer); + +template +rocsparse_status rocsparse_csrsv_solve(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const T* alpha, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + const T* x, + T* y, + rocsparse_solve_policy policy, + void* temp_buffer); + template rocsparse_status rocsparse_ellmv(rocsparse_handle handle, rocsparse_operation trans, @@ -135,6 +177,42 @@ rocsparse_status rocsparse_csrmm(rocsparse_handle handle, T* C, rocsparse_int ldc); +template +rocsparse_status rocsparse_csrilu0_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size); + +template +rocsparse_status rocsparse_csrilu0_analysis(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer); + +template +rocsparse_status rocsparse_csrilu0(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_solve_policy policy, + void* temp_buffer); + template rocsparse_status rocsparse_csr2csc(rocsparse_handle handle, rocsparse_int m, diff --git a/clients/include/testing_csrilu0.hpp b/clients/include/testing_csrilu0.hpp new file mode 100644 index 00000000..5959c23a --- /dev/null +++ b/clients/include/testing_csrilu0.hpp @@ -0,0 +1,625 @@ +/* 
************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef TESTING_CSRILU0_HPP +#define TESTING_CSRILU0_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_csrilu0_bad_arg(void) +{ + rocsparse_int m = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + rocsparse_analysis_policy analysis = rocsparse_analysis_policy_reuse; + rocsparse_solve_policy solve = rocsparse_solve_policy_auto; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_descr(new descr_struct); + rocsparse_mat_descr descr = unique_ptr_descr->descr; + + std::unique_ptr unique_ptr_mat_info(new mat_info_struct); + rocsparse_mat_info info = unique_ptr_mat_info->info; + + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dbuffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + void* dbuffer = (void*)dbuffer_managed.get(); + + if(!dval || !dptr || !dcol || !dbuffer) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // testing rocsparse_csrilu0_buffer_size + size_t size; + + // testing for(nullptr == dptr) + { + rocsparse_int* dptr_null = nullptr; + + 
status = rocsparse_csrilu0_buffer_size( + handle, m, nnz, descr, dval, dptr_null, dcol, info, &size); + verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); + } + // testing for(nullptr == dcol) + { + rocsparse_int* dcol_null = nullptr; + + status = rocsparse_csrilu0_buffer_size( + handle, m, nnz, descr, dval, dptr, dcol_null, info, &size); + verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); + } + // testing for(nullptr == dval) + { + T* dval_null = nullptr; + + status = rocsparse_csrilu0_buffer_size( + handle, m, nnz, descr, dval_null, dptr, dcol, info, &size); + verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); + } + // testing for(nullptr == buffer_size) + { + size_t* size_null = nullptr; + + status = + rocsparse_csrilu0_buffer_size(handle, m, nnz, descr, dval, dptr, dcol, info, size_null); + verify_rocsparse_status_invalid_pointer(status, "Error: size is nullptr"); + } + // testing for(nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = rocsparse_csrilu0_buffer_size( + handle, m, nnz, descr_null, dval, dptr, dcol, info, &size); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == info) + { + rocsparse_mat_info info_null = nullptr; + + status = rocsparse_csrilu0_buffer_size( + handle, m, nnz, descr, dval, dptr, dcol, info_null, &size); + verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrilu0_buffer_size( + handle_null, m, nnz, descr, dval, dptr, dcol, info, &size); + verify_rocsparse_status_invalid_handle(status); + } + + // testing rocsparse_csrilu0_analysis + + // testing for(nullptr == dptr) + { + rocsparse_int* dptr_null = nullptr; + + status = rocsparse_csrilu0_analysis( + handle, m, nnz, descr, dval, dptr_null, dcol, info, analysis, solve, dbuffer); + 
verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); + } + // testing for(nullptr == dcol) + { + rocsparse_int* dcol_null = nullptr; + + status = rocsparse_csrilu0_analysis( + handle, m, nnz, descr, dval, dptr, dcol_null, info, analysis, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); + } + // testing for(nullptr == dval) + { + T* dval_null = nullptr; + + status = rocsparse_csrilu0_analysis( + handle, m, nnz, descr, dval_null, dptr, dcol, info, analysis, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); + } + // testing for(nullptr == dbuffer) + { + void* dbuffer_null = nullptr; + + status = rocsparse_csrilu0_analysis( + handle, m, nnz, descr, dval, dptr, dcol, info, analysis, solve, dbuffer_null); + verify_rocsparse_status_invalid_pointer(status, "Error: dbuffer is nullptr"); + } + // testing for(nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = rocsparse_csrilu0_analysis( + handle, m, nnz, descr_null, dval, dptr, dcol, info, analysis, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == info) + { + rocsparse_mat_info info_null = nullptr; + + status = rocsparse_csrilu0_analysis( + handle, m, nnz, descr, dval, dptr, dcol, info_null, analysis, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrilu0_analysis( + handle_null, m, nnz, descr, dval, dptr, dcol, info, analysis, solve, dbuffer); + verify_rocsparse_status_invalid_handle(status); + } + + // testing rocsparse_csrilu0 + + // testing for(nullptr == dptr) + { + rocsparse_int* dptr_null = nullptr; + + status = + rocsparse_csrilu0(handle, m, nnz, descr, dval, dptr_null, dcol, info, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, 
"Error: dptr is nullptr"); + } + // testing for(nullptr == dcol) + { + rocsparse_int* dcol_null = nullptr; + + status = + rocsparse_csrilu0(handle, m, nnz, descr, dval, dptr, dcol_null, info, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); + } + // testing for(nullptr == dval) + { + T* dval_null = nullptr; + + status = + rocsparse_csrilu0(handle, m, nnz, descr, dval_null, dptr, dcol, info, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); + } + // testing for(nullptr == dbuffer) + { + void* dbuffer_null = nullptr; + + status = + rocsparse_csrilu0(handle, m, nnz, descr, dval, dptr, dcol, info, solve, dbuffer_null); + verify_rocsparse_status_invalid_pointer(status, "Error: dbuffer is nullptr"); + } + // testing for(nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = + rocsparse_csrilu0(handle, m, nnz, descr_null, dval, dptr, dcol, info, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == info) + { + rocsparse_mat_info info_null = nullptr; + + status = + rocsparse_csrilu0(handle, m, nnz, descr, dval, dptr, dcol, info_null, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = + rocsparse_csrilu0(handle_null, m, nnz, descr, dval, dptr, dcol, info, solve, dbuffer); + verify_rocsparse_status_invalid_handle(status); + } + + // testing rocsparse_csrilu0_zero_pivot + rocsparse_int position; + + // testing for(nullptr == position) + { + rocsparse_int* position_null = nullptr; + + status = rocsparse_csrilu0_zero_pivot(handle, info, position_null); + verify_rocsparse_status_invalid_pointer(status, "Error: position is nullptr"); + } + // testing for(nullptr == info) + { + rocsparse_mat_info info_null = nullptr; + + status = 
rocsparse_csrilu0_zero_pivot(handle, info_null, &position); + verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrilu0_zero_pivot(handle_null, info, &position); + verify_rocsparse_status_invalid_handle(status); + } + + // testing rocsparse_csrilu0_clear + + // testing for(nullptr == info) + { + rocsparse_mat_info info_null = nullptr; + + status = rocsparse_csrilu0_clear(handle, info_null); + verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrilu0_clear(handle_null, info); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_csrilu0(Arguments argus) +{ + rocsparse_int safe_size = 100; + rocsparse_int m = argus.M; + rocsparse_index_base idx_base = argus.idx_base; + std::string binfile = ""; + std::string filename = ""; + rocsparse_status status; + size_t size; + + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && argus.timing == 0) + { + binfile = argus.filename; + m = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + std::unique_ptr test_descr(new descr_struct); + rocsparse_mat_descr descr = test_descr->descr; + + std::unique_ptr unique_ptr_mat_info(new mat_info_struct); + rocsparse_mat_info info = unique_ptr_mat_info->info; + + // Set matrix index base + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr, idx_base)); + + // Determine number of non-zero elements + double scale = 0.02; + if(m > 1000) + { + scale = 2.0 / m; + } + rocsparse_int nnz = m * scale * m; + + // Argument sanity check before allocating invalid memory + if(m <= 0 || nnz <= 
0) + { + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto buffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + void* buffer = (void*)buffer_managed.get(); + + if(!dval || !dptr || !dcol || !buffer) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dptr || !dcol || !dval || !buffer"); + return rocsparse_status_memory_error; + } + + // Test rocsparse_csrilu0_buffer_size + status = + rocsparse_csrilu0_buffer_size(handle, m, nnz, descr, dval, dptr, dcol, info, &size); + + if(m < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && nnz >= 0"); + } + + // Test rocsparse_csrilu0_analysis + status = rocsparse_csrilu0_analysis(handle, + m, + nnz, + descr, + dval, + dptr, + dcol, + info, + rocsparse_analysis_policy_reuse, + rocsparse_solve_policy_auto, + buffer); + + if(m < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && nnz >= 0"); + } + + // Test rocsparse_csrilu0 + status = rocsparse_csrilu0( + handle, m, nnz, descr, dval, dptr, dcol, info, rocsparse_solve_policy_auto, buffer); + + if(m < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && nnz >= 0"); + } + + // Test rocsparse_csrilu0_zero_pivot + rocsparse_int zero_pivot; + 
CHECK_ROCSPARSE_ERROR(rocsparse_csrilu0_zero_pivot(handle, info, &zero_pivot)); + + // Zero pivot should be -1 + rocsparse_int res = -1; + unit_check_general(1, 1, 1, &res, &zero_pivot); + + // Test rocsparse_csrilu0_clear + CHECK_ROCSPARSE_ERROR(rocsparse_csrilu0_clear(handle, info)); + + return rocsparse_status_success; + } + + // Host structures + std::vector hcsr_row_ptr; + std::vector hcsr_col_ind; + std::vector hcsr_val; + + // Initial Data on CPU + srand(12345ULL); + if(binfile != "") + { + if(read_bin_matrix( + binfile.c_str(), m, m, nnz, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + } + else if(argus.laplacian) + { + m = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base); + nnz = hcsr_row_ptr[m]; + } + else + { + std::vector hcoo_row_ind; + + if(filename != "") + { + if(read_mtx_matrix( + filename.c_str(), m, m, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base) != + 0) + { + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, m, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base); + } + + // Convert COO to CSR + hcsr_row_ptr.resize(m + 1, 0); + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; + } + + hcsr_row_ptr[0] = idx_base; + for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + } + } + + // Allocate memory on device + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto d_position_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)), device_free}; + + rocsparse_int* dptr 
= (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + rocsparse_int* d_position = (rocsparse_int*)d_position_managed.get(); + + if(!dval || !dptr || !dcol || !d_position) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dval || !dptr || !dcol || !d_position"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR(hipMemcpy( + dptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dcol, hcsr_col_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dval, hcsr_val.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + + // Obtain csrilu0 buffer size + CHECK_ROCSPARSE_ERROR( + rocsparse_csrilu0_buffer_size(handle, m, nnz, descr, dval, dptr, dcol, info, &size)); + + // Allocate buffer on the device + auto dbuffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * size), device_free}; + + void* dbuffer = (void*)dbuffer_managed.get(); + + if(!dbuffer) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dbuffer"); + return rocsparse_status_memory_error; + } + + // csrilu0 analysis + CHECK_ROCSPARSE_ERROR(rocsparse_csrilu0_analysis(handle, + m, + nnz, + descr, + dval, + dptr, + dcol, + info, + rocsparse_analysis_policy_reuse, + rocsparse_solve_policy_auto, + dbuffer)); + + if(argus.unit_check) + { + CHECK_ROCSPARSE_ERROR(rocsparse_csrilu0( + handle, m, nnz, descr, dval, dptr, dcol, info, rocsparse_solve_policy_auto, dbuffer)); + + // Pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + rocsparse_int hposition_1; + rocsparse_status pivot_status_1; + pivot_status_1 = rocsparse_csrilu0_zero_pivot(handle, info, &hposition_1); + + // Pointer mode device + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + + 
rocsparse_status pivot_status_2; + pivot_status_2 = rocsparse_csrilu0_zero_pivot(handle, info, d_position); + + // Copy output from device to CPU + rocsparse_int hposition_2; + std::vector result(nnz); + CHECK_HIP_ERROR(hipMemcpy(result.data(), dval, sizeof(T) * nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(&hposition_2, d_position, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + // Host csrilu0 + double cpu_time_used = get_time_us(); + + rocsparse_int position_gold = + csrilu0(m, hcsr_row_ptr.data(), hcsr_col_ind.data(), hcsr_val.data(), idx_base); + + cpu_time_used = get_time_us() - cpu_time_used; + + unit_check_general(1, 1, 1, &position_gold, &hposition_1); + unit_check_general(1, 1, 1, &position_gold, &hposition_2); + + if(hposition_1 != -1) + { + verify_rocsparse_status_zero_pivot(pivot_status_1, + "expected rocsparse_status_zero_pivot"); + return rocsparse_status_success; + } + + if(hposition_2 != -1) + { + verify_rocsparse_status_zero_pivot(pivot_status_2, + "expected rocsparse_status_zero_pivot"); + return rocsparse_status_success; + } + + unit_check_general(1, nnz, 1, hcsr_val.data(), result.data()); + } + + if(argus.timing) + { + int number_cold_calls = 2; + int number_hot_calls = argus.iters; + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + for(int iter = 0; iter < number_cold_calls; iter++) + { + rocsparse_csrilu0(handle, + m, + nnz, + descr, + dval, + dptr, + dcol, + info, + rocsparse_solve_policy_auto, + dbuffer); + } + + double gpu_time_used = get_time_us(); // in microseconds + + for(int iter = 0; iter < number_hot_calls; iter++) + { + rocsparse_csrilu0(handle, + m, + nnz, + descr, + dval, + dptr, + dcol, + info, + rocsparse_solve_policy_auto, + dbuffer); + } + + // Convert to miliseconds per call + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + + // Bandwidth + size_t int_data = (m + 1 + nnz) * sizeof(rocsparse_int); + size_t flt_data = (nnz + nnz) * 
sizeof(T); + double bandwidth = (int_data + flt_data) / gpu_time_used / 1e6; + + printf("m\t\tnnz\t\tGB/s\tmsec\n"); + printf("%8d\t%9d\t%0.2lf\t%0.2lf\n", m, nnz, bandwidth, gpu_time_used); + } + + CHECK_ROCSPARSE_ERROR(rocsparse_csrilu0_clear(handle, info)); + + return rocsparse_status_success; +} + +#endif // TESTING_CSRILU0_HPP diff --git a/clients/include/testing_csrilusv.hpp b/clients/include/testing_csrilusv.hpp new file mode 100644 index 00000000..75381dc1 --- /dev/null +++ b/clients/include/testing_csrilusv.hpp @@ -0,0 +1,418 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#pragma once +#ifndef TESTING_CSRILUSV_HPP +#define TESTING_CSRILUSV_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include +#include +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +template +rocsparse_status testing_csrilusv(Arguments argus) +{ + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_analysis_policy analysis = argus.analysis; + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + std::unique_ptr test_descr_M(new descr_struct); + rocsparse_mat_descr descr_M = test_descr_M->descr; + + std::unique_ptr unique_ptr_mat_info(new mat_info_struct); + rocsparse_mat_info info = unique_ptr_mat_info->info; + + // Initialize the matrix descriptor + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr_M, idx_base)); + + // Host structures + std::vector hcsr_row_ptr; + std::vector hcsr_col_ind; + std::vector hcsr_val; + + // Initial Data on CPU + rocsparse_int m; + rocsparse_int n; + rocsparse_int nnz; + + if(read_bin_matrix( + argus.filename.c_str(), m, n, nnz, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base) != 0) + { + fprintf(stderr, "Cannot 
open [read] %s\n", argus.filename.c_str()); + return rocsparse_status_internal_error; + } + + // Allocate memory on device + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto d_position_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + rocsparse_int* d_position = (rocsparse_int*)d_position_managed.get(); + + if(!dval || !dptr || !dcol || !d_position) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dval || !dptr || !dcol || !d_position"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR(hipMemcpy( + dptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dcol, hcsr_col_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dval, hcsr_val.data(), sizeof(T) * nnz, hipMemcpyHostToDevice)); + + // Obtain csrilu0 buffer size + size_t size; + CHECK_ROCSPARSE_ERROR( + rocsparse_csrilu0_buffer_size(handle, m, nnz, descr_M, dval, dptr, dcol, info, &size)); + + // Allocate buffer on the device + auto dbuffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * size), device_free}; + + void* dbuffer = (void*)dbuffer_managed.get(); + + if(!dbuffer) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dbuffer"); + return rocsparse_status_memory_error; + } + + // csrilu0 analysis + CHECK_ROCSPARSE_ERROR(rocsparse_csrilu0_analysis(handle, + m, + nnz, + descr_M, + dval, + dptr, + dcol, + info, + analysis, + rocsparse_solve_policy_auto, + 
dbuffer)); + + // Compute incomplete LU factorization + CHECK_ROCSPARSE_ERROR(rocsparse_csrilu0( + handle, m, nnz, descr_M, dval, dptr, dcol, info, rocsparse_solve_policy_auto, dbuffer)); + + // Check for zero pivot + rocsparse_int hposition_1, hposition_2; + rocsparse_status pivot_status_1, pivot_status_2; + + // Host pointer mode + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + pivot_status_1 = rocsparse_csrilu0_zero_pivot(handle, info, &hposition_1); + + // device pointer mode + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + pivot_status_2 = rocsparse_csrilu0_zero_pivot(handle, info, d_position); + + // Copy output to CPU + std::vector iluresult(nnz); + CHECK_HIP_ERROR(hipMemcpy(iluresult.data(), dval, sizeof(T) * nnz, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(&hposition_2, d_position, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + // Compute host reference csrilu0 + rocsparse_int position_gold = + csrilu0(m, hcsr_row_ptr.data(), hcsr_col_ind.data(), hcsr_val.data(), idx_base); + + // Check zero pivot results + unit_check_general(1, 1, 1, &position_gold, &hposition_1); + unit_check_general(1, 1, 1, &position_gold, &hposition_2); + + // If zero pivot was found, do not go further + if(hposition_1 != -1) + { + verify_rocsparse_status_zero_pivot(pivot_status_1, "expected rocsparse_status_zero_pivot"); + return rocsparse_status_success; + } + + if(hposition_2 != -1) + { + verify_rocsparse_status_zero_pivot(pivot_status_2, "expected rocsparse_status_zero_pivot"); + return rocsparse_status_success; + } + + // Check csrilu0 factorization + unit_check_general(1, nnz, 1, hcsr_val.data(), iluresult.data()); + + // Create matrix descriptors for csrsv + std::unique_ptr test_descr_L(new descr_struct); + rocsparse_mat_descr descr_L = test_descr_L->descr; + + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr_L, idx_base)); + 
CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_fill_mode(descr_L, rocsparse_fill_mode_lower)); + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_diag_type(descr_L, rocsparse_diag_type_unit)); + + std::unique_ptr test_descr_U(new descr_struct); + rocsparse_mat_descr descr_U = test_descr_U->descr; + + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr_U, idx_base)); + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_fill_mode(descr_U, rocsparse_fill_mode_upper)); + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_diag_type(descr_U, rocsparse_diag_type_non_unit)); + + // Obtain csrsv buffer sizes + size_t size_lower, size_upper; + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_buffer_size( + handle, rocsparse_operation_none, m, nnz, descr_L, dval, dptr, dcol, info, &size_lower)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_buffer_size( + handle, rocsparse_operation_none, m, nnz, descr_U, dval, dptr, dcol, info, &size_upper)); + + // Sizes should match with csrilu0 + unit_check_general(1, 1, 1, &size, &size_lower); + unit_check_general(1, 1, 1, &size, &size_upper); + + // csrsv analysis + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_analysis(handle, + rocsparse_operation_none, + m, + nnz, + descr_L, + dval, + dptr, + dcol, + info, + analysis, + rocsparse_solve_policy_auto, + dbuffer)); + + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_analysis(handle, + rocsparse_operation_none, + m, + nnz, + descr_U, + dval, + dptr, + dcol, + info, + analysis, + rocsparse_solve_policy_auto, + dbuffer)); + + // Initialize some more structures required for Lz = x + T h_alpha = static_cast(1); + + std::vector hx(m, static_cast(1)); + std::vector hy_gold(m); + std::vector hz_gold(m); + + // Allocate device memory + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto dz_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), 
device_free}; + auto dz_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + + T* dx = (T*)dx_managed.get(); + T* dy_1 = (T*)dy_1_managed.get(); + T* dy_2 = (T*)dy_2_managed.get(); + T* dz_1 = (T*)dz_1_managed.get(); + T* dz_2 = (T*)dz_2_managed.get(); + T* d_alpha = (T*)d_alpha_managed.get(); + + if(!dx || !dy_1 || !dy_2 || !dz_1 || !dz_2 || !d_alpha) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dx || !dy_1 || !dy_2 || !dz_1 || " + "!dz_2 || !d_alpha"); + return rocsparse_status_memory_error; + } + + // Copy data from CPU to device + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * m, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + + // Solve Lz = x + + // host pointer mode + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_solve(handle, + rocsparse_operation_none, + m, + nnz, + &h_alpha, + descr_L, + dval, + dptr, + dcol, + info, + dx, + dz_1, + rocsparse_solve_policy_auto, + dbuffer)); + + // Check for zero pivot + pivot_status_1 = rocsparse_csrsv_zero_pivot(handle, descr_L, info, &hposition_1); + + // device pointer mode + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_solve(handle, + rocsparse_operation_none, + m, + nnz, + d_alpha, + descr_L, + dval, + dptr, + dcol, + info, + dx, + dz_2, + rocsparse_solve_policy_auto, + dbuffer)); + + // Check for zero pivot + pivot_status_2 = rocsparse_csrsv_zero_pivot(handle, descr_L, info, d_position); + + // Host csrsv + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, 0); + + position_gold = lsolve(m, + hcsr_row_ptr.data(), + hcsr_col_ind.data(), + hcsr_val.data(), + h_alpha, + hx.data(), + hz_gold.data(), + idx_base, + rocsparse_diag_type_unit, + 
prop.warpSize); + + // Check zero pivot results + unit_check_general(1, 1, 1, &position_gold, &hposition_1); + unit_check_general(1, 1, 1, &position_gold, &hposition_2); + + // If zero pivot was found, do not go further + if(hposition_1 != -1) + { + verify_rocsparse_status_zero_pivot(pivot_status_1, "expected rocsparse_status_zero_pivot"); + return rocsparse_status_success; + } + + if(hposition_2 != -1) + { + verify_rocsparse_status_zero_pivot(pivot_status_2, "expected rocsparse_status_zero_pivot"); + return rocsparse_status_success; + } + + // Copy output from device to CPU + std::vector hz_1(m); + std::vector hz_2(m); + + CHECK_HIP_ERROR(hipMemcpy(hz_1.data(), dz_1, sizeof(T) * m, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hz_2.data(), dz_2, sizeof(T) * m, hipMemcpyDeviceToHost)); + + // Check z + unit_check_general(1, m, 1, hz_gold.data(), hz_1.data()); + unit_check_general(1, m, 1, hz_gold.data(), hz_2.data()); + + // Solve Uy = z + + // host pointer mode + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_solve(handle, + rocsparse_operation_none, + m, + nnz, + &h_alpha, + descr_U, + dval, + dptr, + dcol, + info, + dz_1, + dy_1, + rocsparse_solve_policy_auto, + dbuffer)); + + // Check for zero pivot + pivot_status_1 = rocsparse_csrsv_zero_pivot(handle, descr_U, info, &hposition_1); + + // device pointer mode + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_solve(handle, + rocsparse_operation_none, + m, + nnz, + d_alpha, + descr_U, + dval, + dptr, + dcol, + info, + dz_2, + dy_2, + rocsparse_solve_policy_auto, + dbuffer)); + + // Check for zero pivot + pivot_status_2 = rocsparse_csrsv_zero_pivot(handle, descr_U, info, d_position); + + // Host csrsv + position_gold = usolve(m, + hcsr_row_ptr.data(), + hcsr_col_ind.data(), + hcsr_val.data(), + h_alpha, + hz_gold.data(), + hy_gold.data(), + 
idx_base, + rocsparse_diag_type_non_unit, + prop.warpSize); + + // Check zero pivot results + unit_check_general(1, 1, 1, &position_gold, &hposition_1); + unit_check_general(1, 1, 1, &position_gold, &hposition_2); + + // If zero pivot was found, do not go further + if(hposition_1 != -1) + { + verify_rocsparse_status_zero_pivot(pivot_status_1, "expected rocsparse_status_zero_pivot"); + return rocsparse_status_success; + } + + if(hposition_2 != -1) + { + verify_rocsparse_status_zero_pivot(pivot_status_2, "expected rocsparse_status_zero_pivot"); + return rocsparse_status_success; + } + + // Copy output from device to CPU + std::vector hy_1(m); + std::vector hy_2(m); + + CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * m, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * m, hipMemcpyDeviceToHost)); + + // Check z + unit_check_near(1, m, 1, hy_gold.data(), hy_1.data()); + unit_check_near(1, m, 1, hy_gold.data(), hy_2.data()); + + return rocsparse_status_success; +} + +#endif // TESTING_CSRILUSOLVE_HPP diff --git a/clients/include/testing_csrmv.hpp b/clients/include/testing_csrmv.hpp index d477bf8f..c8a21199 100644 --- a/clients/include/testing_csrmv.hpp +++ b/clients/include/testing_csrmv.hpp @@ -656,8 +656,8 @@ rocsparse_status testing_csrmv(Arguments argus) if(adaptive) { - unit_check_near(1, m, hy_gold.data(), hy_1.data()); - unit_check_near(1, m, hy_gold.data(), hy_2.data()); + unit_check_near(1, m, 1, hy_gold.data(), hy_1.data()); + unit_check_near(1, m, 1, hy_gold.data(), hy_2.data()); } else { diff --git a/clients/include/testing_csrsv.hpp b/clients/include/testing_csrsv.hpp new file mode 100644 index 00000000..df392a50 --- /dev/null +++ b/clients/include/testing_csrsv.hpp @@ -0,0 +1,918 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#pragma once +#ifndef TESTING_CSRSV_HPP +#define TESTING_CSRSV_HPP + +#include "rocsparse_test_unique_ptr.hpp" +#include "rocsparse.hpp" +#include "utility.hpp" +#include "unit.hpp" + +#include +#include +#include +#include +#include + +using namespace rocsparse; +using namespace rocsparse_test; + +template +void testing_csrsv_bad_arg(void) +{ + rocsparse_int m = 100; + rocsparse_int nnz = 100; + rocsparse_int safe_size = 100; + T h_alpha = 0.6; + rocsparse_operation transA = rocsparse_operation_none; + rocsparse_analysis_policy analysis = rocsparse_analysis_policy_reuse; + rocsparse_solve_policy solve = rocsparse_solve_policy_auto; + rocsparse_status status; + + std::unique_ptr unique_ptr_handle(new handle_struct); + rocsparse_handle handle = unique_ptr_handle->handle; + + std::unique_ptr unique_ptr_descr(new descr_struct); + rocsparse_mat_descr descr = unique_ptr_descr->descr; + + std::unique_ptr unique_ptr_mat_info(new mat_info_struct); + rocsparse_mat_info info = unique_ptr_mat_info->info; + + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dbuffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy = (T*)dy_managed.get(); + void* dbuffer = (void*)dbuffer_managed.get(); + + if(!dval || !dptr || !dcol || 
!dx || !dy || !dbuffer) + { + PRINT_IF_HIP_ERROR(hipErrorOutOfMemory); + return; + } + + // testing rocsparse_csrsv_buffer_size + size_t size; + + // testing for(nullptr == dptr) + { + rocsparse_int* dptr_null = nullptr; + + status = rocsparse_csrsv_buffer_size( + handle, transA, m, nnz, descr, dval, dptr_null, dcol, info, &size); + verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); + } + // testing for(nullptr == dcol) + { + rocsparse_int* dcol_null = nullptr; + + status = rocsparse_csrsv_buffer_size( + handle, transA, m, nnz, descr, dval, dptr, dcol_null, info, &size); + verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); + } + // testing for(nullptr == dval) + { + T* dval_null = nullptr; + + status = rocsparse_csrsv_buffer_size( + handle, transA, m, nnz, descr, dval_null, dptr, dcol, info, &size); + verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); + } + // testing for(nullptr == buffer_size) + { + size_t* size_null = nullptr; + + status = rocsparse_csrsv_buffer_size( + handle, transA, m, nnz, descr, dval, dptr, dcol, info, size_null); + verify_rocsparse_status_invalid_pointer(status, "Error: size is nullptr"); + } + // testing for(nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = rocsparse_csrsv_buffer_size( + handle, transA, m, nnz, descr_null, dval, dptr, dcol, info, &size); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == info) + { + rocsparse_mat_info info_null = nullptr; + + status = rocsparse_csrsv_buffer_size( + handle, transA, m, nnz, descr, dval, dptr, dcol, info_null, &size); + verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrsv_buffer_size( + handle_null, transA, m, nnz, descr, dval, dptr, dcol, info, &size); + 
verify_rocsparse_status_invalid_handle(status); + } + + // testing rocsparse_csrsv_analysis + + // testing for(nullptr == dptr) + { + rocsparse_int* dptr_null = nullptr; + + status = rocsparse_csrsv_analysis( + handle, transA, m, nnz, descr, dval, dptr_null, dcol, info, analysis, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); + } + // testing for(nullptr == dcol) + { + rocsparse_int* dcol_null = nullptr; + + status = rocsparse_csrsv_analysis( + handle, transA, m, nnz, descr, dval, dptr, dcol_null, info, analysis, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); + } + // testing for(nullptr == dval) + { + T* dval_null = nullptr; + + status = rocsparse_csrsv_analysis( + handle, transA, m, nnz, descr, dval_null, dptr, dcol, info, analysis, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); + } + // testing for(nullptr == dbuffer) + { + void* dbuffer_null = nullptr; + + status = rocsparse_csrsv_analysis( + handle, transA, m, nnz, descr, dval, dptr, dcol, info, analysis, solve, dbuffer_null); + verify_rocsparse_status_invalid_pointer(status, "Error: dbuffer is nullptr"); + } + // testing for(nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = rocsparse_csrsv_analysis( + handle, transA, m, nnz, descr_null, dval, dptr, dcol, info, analysis, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == info) + { + rocsparse_mat_info info_null = nullptr; + + status = rocsparse_csrsv_analysis( + handle, transA, m, nnz, descr, dval, dptr, dcol, info_null, analysis, solve, dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrsv_analysis( + handle_null, transA, m, nnz, descr, dval, dptr, dcol, info, analysis, 
solve, dbuffer); + verify_rocsparse_status_invalid_handle(status); + } + + // testing rocsparse_csrsv + + // testing for(nullptr == dptr) + { + rocsparse_int* dptr_null = nullptr; + + status = rocsparse_csrsv_solve(handle, + transA, + m, + nnz, + &h_alpha, + descr, + dval, + dptr_null, + dcol, + info, + dx, + dy, + solve, + dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: dptr is nullptr"); + } + // testing for(nullptr == dcol) + { + rocsparse_int* dcol_null = nullptr; + + status = rocsparse_csrsv_solve(handle, + transA, + m, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol_null, + info, + dx, + dy, + solve, + dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: dcol is nullptr"); + } + // testing for(nullptr == dval) + { + T* dval_null = nullptr; + + status = rocsparse_csrsv_solve(handle, + transA, + m, + nnz, + &h_alpha, + descr, + dval_null, + dptr, + dcol, + info, + dx, + dy, + solve, + dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: dval is nullptr"); + } + // testing for(nullptr == dx) + { + T* dx_null = nullptr; + + status = rocsparse_csrsv_solve(handle, + transA, + m, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol, + info, + dx_null, + dy, + solve, + dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: dx is nullptr"); + } + // testing for(nullptr == dy) + { + T* dy_null = nullptr; + + status = rocsparse_csrsv_solve(handle, + transA, + m, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol, + info, + dx, + dy_null, + solve, + dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: dy is nullptr"); + } + // testing for(nullptr == d_alpha) + { + T* d_alpha_null = nullptr; + + status = rocsparse_csrsv_solve(handle, + transA, + m, + nnz, + d_alpha_null, + descr, + dval, + dptr, + dcol, + info, + dx, + dy, + solve, + dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: alpha is nullptr"); + } + // testing for(nullptr == dbuffer) + { + void* dbuffer_null = 
nullptr; + + status = rocsparse_csrsv_solve(handle, + transA, + m, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol, + info, + dx, + dy, + solve, + dbuffer_null); + verify_rocsparse_status_invalid_pointer(status, "Error: dbuffer is nullptr"); + } + // testing for(nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = rocsparse_csrsv_solve(handle, + transA, + m, + nnz, + &h_alpha, + descr_null, + dval, + dptr, + dcol, + info, + dx, + dy, + solve, + dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == info) + { + rocsparse_mat_info info_null = nullptr; + + status = rocsparse_csrsv_solve(handle, + transA, + m, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol, + info_null, + dx, + dy, + solve, + dbuffer); + verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrsv_solve(handle_null, + transA, + m, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol, + info, + dx, + dy, + solve, + dbuffer); + verify_rocsparse_status_invalid_handle(status); + } + + // testing rocsparse_csrsv_zero_pivot + rocsparse_int position; + + // testing for(nullptr == position) + { + rocsparse_int* position_null = nullptr; + + status = rocsparse_csrsv_zero_pivot(handle, descr, info, position_null); + verify_rocsparse_status_invalid_pointer(status, "Error: position is nullptr"); + } + // testing for(nullptr == info) + { + rocsparse_mat_info info_null = nullptr; + + status = rocsparse_csrsv_zero_pivot(handle, descr, info_null, &position); + verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrsv_zero_pivot(handle_null, descr, info, &position); + verify_rocsparse_status_invalid_handle(status); + } + + // testing rocsparse_csrsv_clear + + // testing 
for(nullptr == descr) + { + rocsparse_mat_descr descr_null = nullptr; + + status = rocsparse_csrsv_clear(handle, descr_null, info); + verify_rocsparse_status_invalid_pointer(status, "Error: descr is nullptr"); + } + // testing for(nullptr == info) + { + rocsparse_mat_info info_null = nullptr; + + status = rocsparse_csrsv_clear(handle, descr, info_null); + verify_rocsparse_status_invalid_pointer(status, "Error: info is nullptr"); + } + // testing for(nullptr == handle) + { + rocsparse_handle handle_null = nullptr; + + status = rocsparse_csrsv_clear(handle_null, descr, info); + verify_rocsparse_status_invalid_handle(status); + } +} + +template +rocsparse_status testing_csrsv(Arguments argus) +{ + rocsparse_int safe_size = 100; + rocsparse_int m = argus.M; + rocsparse_int n = argus.M; + rocsparse_index_base idx_base = argus.idx_base; + rocsparse_operation trans = argus.transA; + rocsparse_diag_type diag_type = argus.diag_type; + rocsparse_fill_mode fill_mode = argus.fill_mode; + T h_alpha = argus.alpha; + std::string binfile = ""; + std::string filename = ""; + rocsparse_status status; + size_t size; + + // When in testing mode, M == N == -99 indicates that we are testing with a real + // matrix from cise.ufl.edu + if(m == -99 && argus.timing == 0) + { + binfile = argus.filename; + m = safe_size; + } + + if(argus.timing == 1) + { + filename = argus.filename; + } + + std::unique_ptr test_handle(new handle_struct); + rocsparse_handle handle = test_handle->handle; + + std::unique_ptr test_descr(new descr_struct); + rocsparse_mat_descr descr = test_descr->descr; + + std::unique_ptr unique_ptr_mat_info(new mat_info_struct); + rocsparse_mat_info info = unique_ptr_mat_info->info; + + // Set matrix index base + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_index_base(descr, idx_base)); + + // Set matrix diag type + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_diag_type(descr, diag_type)); + + // Set matrix fill mode + CHECK_ROCSPARSE_ERROR(rocsparse_set_mat_fill_mode(descr, fill_mode)); 
+ + // Determine number of non-zero elements + double scale = 0.02; + if(m > 1000) + { + scale = 2.0 / m; + } + rocsparse_int nnz = m * scale * m; + + // Argument sanity check before allocating invalid memory + if(m <= 0 || nnz <= 0) + { + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * safe_size), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto dy_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * safe_size), device_free}; + auto buffer_managed = + rocsparse_unique_ptr{device_malloc(sizeof(char) * safe_size), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy = (T*)dy_managed.get(); + void* buffer = (void*)buffer_managed.get(); + + if(!dval || !dptr || !dcol || !dx || !dy || !buffer) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dptr || !dcol || !dval || " + "!dx || !dy || !buffer"); + return rocsparse_status_memory_error; + } + + // Test rocsparse_csrsv_buffer_size + status = rocsparse_csrsv_buffer_size( + handle, trans, m, nnz, descr, dval, dptr, dcol, info, &size); + + if(m < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && nnz >= 0"); + } + + // Test rocsparse_csrsv_analysis + status = rocsparse_csrsv_analysis(handle, + trans, + m, + nnz, + descr, + dval, + dptr, + dcol, + info, + rocsparse_analysis_policy_reuse, + rocsparse_solve_policy_auto, + buffer); + + if(m < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || nnz < 0"); + } + 
else + { + verify_rocsparse_status_success(status, "m >= 0 && nnz >= 0"); + } + + // Test rocsparse_csrsv_solve + status = rocsparse_csrsv_solve(handle, + trans, + m, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol, + info, + dx, + dy, + rocsparse_solve_policy_auto, + buffer); + + if(m < 0 || nnz < 0) + { + verify_rocsparse_status_invalid_size(status, "Error: m < 0 || nnz < 0"); + } + else + { + verify_rocsparse_status_success(status, "m >= 0 && nnz >= 0"); + } + + // Test rocsparse_csrsv_zero_pivot + rocsparse_int zero_pivot; + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_zero_pivot(handle, descr, info, &zero_pivot)); + + // Zero pivot should be -1 + rocsparse_int res = -1; + unit_check_general(1, 1, 1, &res, &zero_pivot); + + // Test rocsparse_csrsv_clear + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_clear(handle, descr, info)); + + return rocsparse_status_success; + } + + // Host structures + std::vector hcsr_row_ptr; + std::vector hcsr_col_ind; + std::vector hcsr_val; + + // Initial Data on CPU + srand(12345ULL); + if(binfile != "") + { + if(read_bin_matrix( + binfile.c_str(), m, n, nnz, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base) != 0) + { + fprintf(stderr, "Cannot open [read] %s\n", binfile.c_str()); + return rocsparse_status_internal_error; + } + } + else if(argus.laplacian) + { + m = n = gen_2d_laplacian(argus.laplacian, hcsr_row_ptr, hcsr_col_ind, hcsr_val, idx_base); + nnz = hcsr_row_ptr[m]; + } + else + { + std::vector hcoo_row_ind; + + if(filename != "") + { + if(read_mtx_matrix( + filename.c_str(), m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base) != + 0) + { + fprintf(stderr, "Cannot open [read] %s\n", filename.c_str()); + return rocsparse_status_internal_error; + } + } + else + { + gen_matrix_coo(m, n, nnz, hcoo_row_ind, hcsr_col_ind, hcsr_val, idx_base); + } + + // Convert COO to CSR + hcsr_row_ptr.resize(m + 1, 0); + for(rocsparse_int i = 0; i < nnz; ++i) + { + ++hcsr_row_ptr[hcoo_row_ind[i] + 1 - idx_base]; + } + + hcsr_row_ptr[0] = idx_base; + 
for(rocsparse_int i = 0; i < m; ++i) + { + hcsr_row_ptr[i + 1] += hcsr_row_ptr[i]; + } + } + + std::vector hx(m); + std::vector hy_1(n); + std::vector hy_2(n); + std::vector hy_gold(n); + + rocsparse_init(hx, 1, m); + + // Allocate memory on device + auto dptr_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * (m + 1)), device_free}; + auto dcol_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int) * nnz), device_free}; + auto dval_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * nnz), device_free}; + auto dx_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * m), device_free}; + auto dy_1_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * n), device_free}; + auto dy_2_managed = rocsparse_unique_ptr{device_malloc(sizeof(T) * n), device_free}; + auto d_alpha_managed = rocsparse_unique_ptr{device_malloc(sizeof(T)), device_free}; + auto d_position_managed = + rocsparse_unique_ptr{device_malloc(sizeof(rocsparse_int)), device_free}; + + rocsparse_int* dptr = (rocsparse_int*)dptr_managed.get(); + rocsparse_int* dcol = (rocsparse_int*)dcol_managed.get(); + T* dval = (T*)dval_managed.get(); + T* dx = (T*)dx_managed.get(); + T* dy_1 = (T*)dy_1_managed.get(); + T* dy_2 = (T*)dy_2_managed.get(); + T* d_alpha = (T*)d_alpha_managed.get(); + rocsparse_int* d_position = (rocsparse_int*)d_position_managed.get(); + + if(!dval || !dptr || !dcol || !dx || !dy_1 || !dy_2 || !d_alpha || !d_position) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, + "!dval || !dptr || !dcol || !dx || " + "!dy_1 || !dy_2 || !d_alpha || !d_position"); + return rocsparse_status_memory_error; + } + + // copy data from CPU to device + CHECK_HIP_ERROR(hipMemcpy( + dptr, hcsr_row_ptr.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + CHECK_HIP_ERROR( + hipMemcpy(dcol, hcsr_col_ind.data(), sizeof(rocsparse_int) * nnz, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dval, hcsr_val.data(), sizeof(T) * nnz, 
hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dx, hx.data(), sizeof(T) * m, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(dy_1, hy_1.data(), sizeof(T) * n, hipMemcpyHostToDevice)); + CHECK_HIP_ERROR(hipMemcpy(d_alpha, &h_alpha, sizeof(T), hipMemcpyHostToDevice)); + + // Obtain csrsv buffer size + CHECK_ROCSPARSE_ERROR( + rocsparse_csrsv_buffer_size(handle, trans, m, nnz, descr, dval, dptr, dcol, info, &size)); + + // Allocate buffer on the device + auto dbuffer_managed = rocsparse_unique_ptr{device_malloc(sizeof(char) * size), device_free}; + + void* dbuffer = (void*)dbuffer_managed.get(); + + if(!dbuffer) + { + verify_rocsparse_status_success(rocsparse_status_memory_error, "!dbuffer"); + return rocsparse_status_memory_error; + } + + // csrsv analysis + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_analysis(handle, + trans, + m, + nnz, + descr, + dval, + dptr, + dcol, + info, + rocsparse_analysis_policy_reuse, + rocsparse_solve_policy_auto, + dbuffer)); + + if(argus.unit_check) + { + CHECK_HIP_ERROR(hipMemcpy(dy_2, hy_2.data(), sizeof(T) * n, hipMemcpyHostToDevice)); + + // ROCSPARSE pointer mode host + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_solve(handle, + trans, + m, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol, + info, + dx, + dy_1, + rocsparse_solve_policy_auto, + dbuffer)); + + rocsparse_int hposition_1; + rocsparse_status pivot_status_1; + pivot_status_1 = rocsparse_csrsv_zero_pivot(handle, descr, info, &hposition_1); + + // ROCSPARSE pointer mode device + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_device)); + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_solve(handle, + trans, + m, + nnz, + d_alpha, + descr, + dval, + dptr, + dcol, + info, + dx, + dy_2, + rocsparse_solve_policy_auto, + dbuffer)); + + rocsparse_status pivot_status_2; + pivot_status_2 = rocsparse_csrsv_zero_pivot(handle, descr, info, d_position); + + // Copy output 
from device to CPU + rocsparse_int hposition_2; + CHECK_HIP_ERROR(hipMemcpy(hy_1.data(), dy_1, sizeof(T) * n, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR(hipMemcpy(hy_2.data(), dy_2, sizeof(T) * n, hipMemcpyDeviceToHost)); + CHECK_HIP_ERROR( + hipMemcpy(&hposition_2, d_position, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + // Host csrsv + hipDeviceProp_t prop; + hipGetDeviceProperties(&prop, 0); + + double cpu_time_used = get_time_us(); + + rocsparse_int position_gold; + if(fill_mode == rocsparse_fill_mode_lower) + { + position_gold = lsolve(m, + hcsr_row_ptr.data(), + hcsr_col_ind.data(), + hcsr_val.data(), + h_alpha, + hx.data(), + hy_gold.data(), + idx_base, + diag_type, + prop.warpSize); + } + else + { + position_gold = usolve(m, + hcsr_row_ptr.data(), + hcsr_col_ind.data(), + hcsr_val.data(), + h_alpha, + hx.data(), + hy_gold.data(), + idx_base, + diag_type, + prop.warpSize); + } + + cpu_time_used = get_time_us() - cpu_time_used; + + unit_check_general(1, 1, 1, &position_gold, &hposition_1); + unit_check_general(1, 1, 1, &position_gold, &hposition_2); + + if(hposition_1 != -1) + { + verify_rocsparse_status_zero_pivot(pivot_status_1, + "expected rocsparse_status_zero_pivot"); + return rocsparse_status_success; + } + + if(hposition_2 != -1) + { + verify_rocsparse_status_zero_pivot(pivot_status_2, + "expected rocsparse_status_zero_pivot"); + return rocsparse_status_success; + } + + unit_check_near(1, n, 1, hy_gold.data(), hy_1.data()); + unit_check_near(1, n, 1, hy_gold.data(), hy_2.data()); + } + + if(argus.timing) + { + int number_cold_calls = 2; + int number_hot_calls = argus.iters; + CHECK_ROCSPARSE_ERROR(rocsparse_set_pointer_mode(handle, rocsparse_pointer_mode_host)); + + for(int iter = 0; iter < number_cold_calls; iter++) + { + rocsparse_csrsv_solve(handle, + trans, + m, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol, + info, + dx, + dy_1, + rocsparse_solve_policy_auto, + dbuffer); + } + + double gpu_time_used = get_time_us(); // in microseconds 
+ + for(int iter = 0; iter < number_hot_calls; iter++) + { + rocsparse_csrsv_solve(handle, + trans, + m, + nnz, + &h_alpha, + descr, + dval, + dptr, + dcol, + info, + dx, + dy_1, + rocsparse_solve_policy_auto, + dbuffer); + } + + // Convert to miliseconds per call + gpu_time_used = (get_time_us() - gpu_time_used) / (number_hot_calls * 1e3); + + // GFlops + size_t flops = 2 * nnz; + + if(h_alpha != 1.0) + { + flops += m; + } + + if(diag_type == rocsparse_diag_type_non_unit) + { + flops += m; + } + + double gpu_gflops = flops / gpu_time_used / 1e6; + + // Bandwidth + size_t int_data = (m + 1 + nnz) * sizeof(rocsparse_int); + size_t flt_data = (nnz + m + m) * sizeof(T); + double bandwidth = (int_data + flt_data) / gpu_time_used / 1e6; + + printf("m\t\tnnz\t\talpha\tGFlops\tGB/s\tmsec\n"); + printf("%8d\t%9d\t%0.2lf\t%0.2lf\t%0.2lf\t%0.2lf\n", + m, + nnz, + h_alpha, + gpu_gflops, + bandwidth, + gpu_time_used); + } + + CHECK_ROCSPARSE_ERROR(rocsparse_csrsv_clear(handle, descr, info)); + + return rocsparse_status_success; +} + +#endif // TESTING_CSRSV_HPP diff --git a/clients/include/testing_ellmv.hpp b/clients/include/testing_ellmv.hpp index 59f7ab5e..3dcbbec4 100644 --- a/clients/include/testing_ellmv.hpp +++ b/clients/include/testing_ellmv.hpp @@ -367,7 +367,7 @@ rocsparse_status testing_ellmv(Arguments argus) if(col >= 0 && col < n) { - sum += hell_val[idx] * hx[col]; + sum = std::fma(hell_val[idx], hx[col], sum); } else { @@ -377,7 +377,7 @@ rocsparse_status testing_ellmv(Arguments argus) if(h_beta != static_cast(0)) { - hy_gold[i] = h_beta * hy_gold[i] + h_alpha * sum; + hy_gold[i] = std::fma(h_beta, hy_gold[i], h_alpha * sum); } else { diff --git a/clients/include/unit.hpp b/clients/include/unit.hpp index 0e697236..07d78abe 100644 --- a/clients/include/unit.hpp +++ b/clients/include/unit.hpp @@ -32,7 +32,4 @@ void unit_check_general(rocsparse_int M, rocsparse_int N, rocsparse_int lda, T* template void unit_check_near(rocsparse_int M, rocsparse_int N, rocsparse_int 
lda, T* hCPU, T* hGPU); -template -void unit_check_near(rocsparse_int M, rocsparse_int N, T* hCPU, T* hGPU); - #endif // UNIT_HPP diff --git a/clients/include/utility.hpp b/clients/include/utility.hpp index f3af6315..4556c77f 100644 --- a/clients/include/utility.hpp +++ b/clients/include/utility.hpp @@ -580,6 +580,308 @@ rocsparse_int read_bin_matrix(const char* filename, return 0; } +/* ============================================================================================ */ +/*! \brief Compute incomplete LU factorization without fill-ins and no pivoting using CSR + * matrix storage format. + */ +template +rocsparse_int csrilu0(rocsparse_int m, + const rocsparse_int* ptr, + const rocsparse_int* col, + T* val, + rocsparse_index_base idx_base) +{ + // pointer of upper part of each row + std::vector diag_offset(m); + std::vector nnz_entries(m, 0); + + // ai = 0 to N loop over all rows + for(rocsparse_int ai = 0; ai < m; ++ai) + { + // ai-th row entries + rocsparse_int row_start = ptr[ai] - idx_base; + rocsparse_int row_end = ptr[ai + 1] - idx_base; + rocsparse_int j; + + // nnz position of ai-th row in val array + for(j = row_start; j < row_end; ++j) + { + nnz_entries[col[j] - idx_base] = j; + } + + bool has_diag = false; + + // loop over ai-th row nnz entries + for(j = row_start; j < row_end; ++j) + { + // if nnz entry is in lower matrix + if(col[j] - idx_base < ai) + { + + rocsparse_int col_j = col[j] - idx_base; + rocsparse_int diag_j = diag_offset[col_j]; + + if(val[diag_j] != static_cast(0)) + { + // multiplication factor + val[j] = val[j] / val[diag_j]; + + // loop over upper offset pointer and do linear combination for nnz entry + for(rocsparse_int k = diag_j + 1; k < ptr[col_j + 1] - idx_base; ++k) + { + // if nnz at this position do linear combination + if(nnz_entries[col[k] - idx_base] != 0) + { + val[nnz_entries[col[k] - idx_base]] -= val[j] * val[k]; + } + } + } + else + { + // Numerical zero diagonal + return col_j + idx_base; + } + } + else 
if(col[j] - idx_base == ai) + { + has_diag = true; + break; + } + else + { + break; + } + } + + if(!has_diag) + { + // Structural zero digonal + return ai + idx_base; + } + + // set diagonal pointer to diagonal element + diag_offset[ai] = j; + + // clear nnz entries + for(j = row_start; j < row_end; ++j) + { + nnz_entries[col[j] - idx_base] = 0; + } + } + + return -1; +} + +/* ============================================================================================ */ +/*! \brief Sparse triangular lower solve using CSR storage format. */ +template +rocsparse_int lsolve(rocsparse_int m, + const rocsparse_int* ptr, + const rocsparse_int* col, + const T* val, + T alpha, + const T* x, + T* y, + rocsparse_index_base idx_base, + rocsparse_diag_type diag_type, + unsigned int wf_size) +{ + rocsparse_int pivot = std::numeric_limits::max(); + std::vector temp(wf_size); + + for(rocsparse_int i = 0; i < m; ++i) + { + temp.assign(wf_size, static_cast(0)); + temp[0] = alpha * x[i]; + + rocsparse_int diag = -1; + rocsparse_int row_begin = ptr[i] - idx_base; + rocsparse_int row_end = ptr[i + 1] - idx_base; + + T diag_val; + + for(rocsparse_int l = row_begin; l < row_end; l += wf_size) + { + for(rocsparse_int k = 0; k < wf_size; ++k) + { + rocsparse_int j = l + k; + + // Do not run out of bounds + if(j >= row_end) + { + break; + } + + rocsparse_int col_j = col[j] - idx_base; + T val_j = val[j]; + + if(col_j < i) + { + // Lower part + temp[k] -= val[j] * y[col_j]; + } + else if(col_j == i) + { + // Diagonal + if(diag_type == rocsparse_diag_type_non_unit) + { + // Check for numerical zero + if(val_j == static_cast(0)) + { + pivot = std::min(pivot, i + idx_base); + val_j = static_cast(1); + } + + diag = j; + diag_val = static_cast(1) / val_j; + } + + break; + } + else + { + // Upper part + break; + } + } + } + + for(rocsparse_int j = 1; j < wf_size; j <<= 1) + { + for(rocsparse_int k = 0; k < wf_size - j; ++k) + { + temp[k] += temp[k + j]; + } + } + + if(diag_type == 
rocsparse_diag_type_non_unit) + { + if(diag == -1) + { + pivot = std::min(pivot, i + idx_base); + } + + y[i] = temp[0] * diag_val; + } + else + { + y[i] = temp[0]; + } + } + + if(pivot != std::numeric_limits::max()) + { + return pivot; + } + + return -1; +} + +/* ============================================================================================ */ +/*! \brief Sparse triangular upper solve using CSR storage format. */ +template +rocsparse_int usolve(rocsparse_int m, + const rocsparse_int* ptr, + const rocsparse_int* col, + const T* val, + T alpha, + const T* x, + T* y, + rocsparse_index_base idx_base, + rocsparse_diag_type diag_type, + unsigned int wf_size) +{ + rocsparse_int pivot = std::numeric_limits::max(); + std::vector temp(wf_size); + + for(rocsparse_int i = m - 1; i >= 0; --i) + { + temp.assign(wf_size, static_cast(0)); + temp[0] = alpha * x[i]; + + rocsparse_int diag = -1; + rocsparse_int row_begin = ptr[i] - idx_base; + rocsparse_int row_end = ptr[i + 1] - idx_base; + + T diag_val; + + for(rocsparse_int l = row_begin; l < row_end; l += wf_size) + { + for(rocsparse_int k = 0; k < wf_size; ++k) + { + rocsparse_int j = l + k; + + // Do not run out of bounds + if(j >= row_end) + { + break; + } + + rocsparse_int col_j = col[j] - idx_base; + T val_j = val[j]; + + if(col_j < i) + { + // Lower part + continue; + } + else if(col_j == i) + { + // Diagonal + if(diag_type == rocsparse_diag_type_non_unit) + { + // Check for numerical zero + if(val_j == static_cast(0)) + { + pivot = std::min(pivot, i + idx_base); + val_j = static_cast(1); + } + + diag = j; + diag_val = static_cast(1) / val_j; + } + + continue; + } + else + { + // Upper part + temp[k] -= val[j] * y[col_j]; + } + } + } + + for(rocsparse_int j = 1; j < wf_size; j <<= 1) + { + for(rocsparse_int k = 0; k < wf_size - j; ++k) + { + temp[k] += temp[k + j]; + } + } + + if(diag_type == rocsparse_diag_type_non_unit) + { + if(diag == -1) + { + pivot = std::min(pivot, i + idx_base); + } + + y[i] = temp[0] 
* diag_val; + } + else + { + y[i] = temp[0]; + } + } + + if(pivot != std::numeric_limits::max()) + { + return pivot; + } + + return -1; +} + #ifdef __cplusplus extern "C" { #endif @@ -626,12 +928,15 @@ class Arguments double alpha = 1.0; double beta = 0.0; - rocsparse_operation transA = rocsparse_operation_none; - rocsparse_operation transB = rocsparse_operation_none; - rocsparse_index_base idx_base = rocsparse_index_base_zero; - rocsparse_index_base idx_base2 = rocsparse_index_base_zero; - rocsparse_action action = rocsparse_action_numeric; - rocsparse_hyb_partition part = rocsparse_hyb_partition_auto; + rocsparse_operation transA = rocsparse_operation_none; + rocsparse_operation transB = rocsparse_operation_none; + rocsparse_index_base idx_base = rocsparse_index_base_zero; + rocsparse_index_base idx_base2 = rocsparse_index_base_zero; + rocsparse_action action = rocsparse_action_numeric; + rocsparse_hyb_partition part = rocsparse_hyb_partition_auto; + rocsparse_diag_type diag_type = rocsparse_diag_type_non_unit; + rocsparse_fill_mode fill_mode = rocsparse_fill_mode_lower; + rocsparse_analysis_policy analysis = rocsparse_analysis_policy_reuse; rocsparse_int norm_check = 0; rocsparse_int unit_check = 1; @@ -664,6 +969,9 @@ class Arguments this->idx_base2 = rhs.idx_base2; this->action = rhs.action; this->part = rhs.part; + this->diag_type = rhs.diag_type; + this->fill_mode = rhs.fill_mode; + this->analysis = rhs.analysis; this->norm_check = rhs.norm_check; this->unit_check = rhs.unit_check; diff --git a/clients/tests/CMakeLists.txt b/clients/tests/CMakeLists.txt index 5c417026..bd5b901e 100644 --- a/clients/tests/CMakeLists.txt +++ b/clients/tests/CMakeLists.txt @@ -54,9 +54,11 @@ set(ROCSPARSE_TEST_SOURCES test_sctr.cpp test_coomv.cpp test_csrmv.cpp + test_csrsv.cpp test_ellmv.cpp test_hybmv.cpp test_csrmm.cpp + test_csrilu0.cpp test_csr2coo.cpp test_csr2csc.cpp test_csr2ell.cpp @@ -66,6 +68,7 @@ set(ROCSPARSE_TEST_SOURCES test_identity.cpp test_csrsort.cpp 
test_coosort.cpp + test_csrilusv.cpp ) set(ROCSPARSE_CLIENTS_COMMON diff --git a/clients/tests/test_csrilu0.cpp b/clients/tests/test_csrilu0.cpp new file mode 100644 index 00000000..a56af901 --- /dev/null +++ b/clients/tests/test_csrilu0.cpp @@ -0,0 +1,133 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "testing_csrilu0.hpp" +#include "utility.hpp" + +#include +#include +#include +#include +#include + +typedef rocsparse_index_base base; +typedef std::tuple csrilu0_tuple; +typedef std::tuple csrilu0_bin_tuple; + +int csrilu0_M_range[] = {-1, 0, 50, 647}; + +base csrilu0_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; + +std::string csrilu0_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "mc2depi.bin", + "scircuit.bin", + "ASIC_320k.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + +class parameterized_csrilu0 : public testing::TestWithParam +{ + protected: + parameterized_csrilu0() {} + virtual ~parameterized_csrilu0() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +class parameterized_csrilu0_bin : public testing::TestWithParam +{ + protected: + parameterized_csrilu0_bin() {} + virtual ~parameterized_csrilu0_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_csrilu0_arguments(csrilu0_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.idx_base = std::get<1>(tup); + arg.timing = 0; + return arg; +} + +Arguments setup_csrilu0_arguments(csrilu0_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.idx_base = std::get<0>(tup); + arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<1>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = 
readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + + return arg; +} + +TEST(csrilu0_bad_arg, csrilu0_float) { testing_csrilu0_bad_arg(); } + +TEST_P(parameterized_csrilu0, csrilu0_float) +{ + Arguments arg = setup_csrilu0_arguments(GetParam()); + + rocsparse_status status = testing_csrilu0(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csrilu0, csrilu0_double) +{ + Arguments arg = setup_csrilu0_arguments(GetParam()); + + rocsparse_status status = testing_csrilu0(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csrilu0_bin, csrilu0_bin_float) +{ + Arguments arg = setup_csrilu0_arguments(GetParam()); + + rocsparse_status status = testing_csrilu0(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csrilu0_bin, csrilu0_bin_double) +{ + Arguments arg = setup_csrilu0_arguments(GetParam()); + + rocsparse_status status = testing_csrilu0(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(csrilu0, + parameterized_csrilu0, + testing::Combine(testing::ValuesIn(csrilu0_M_range), + testing::ValuesIn(csrilu0_idxbase_range))); + +INSTANTIATE_TEST_CASE_P(csrilu0_bin, + parameterized_csrilu0_bin, + testing::Combine(testing::ValuesIn(csrilu0_idxbase_range), + testing::ValuesIn(csrilu0_bin))); diff --git a/clients/tests/test_csrilusv.cpp b/clients/tests/test_csrilusv.cpp new file mode 100644 index 00000000..3e860ecc --- /dev/null +++ b/clients/tests/test_csrilusv.cpp @@ -0,0 +1,91 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. 
+ * ************************************************************************ */ + +#include "testing_csrilusv.hpp" +#include "utility.hpp" + +#include +#include +#include +#include +#include + +typedef rocsparse_index_base base; +typedef rocsparse_analysis_policy analysis; + +typedef std::tuple csrilusv_bin_tuple; + +base csrilusv_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +analysis csrilusv_analysis_range[] = {rocsparse_analysis_policy_reuse, + rocsparse_analysis_policy_force}; + +std::string csrilusv_bin[] = {"mac_econ_fwd500.bin", + "mc2depi.bin", + "scircuit.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin", + "nos7.bin"}; + +class parameterized_csrilusv_bin : public testing::TestWithParam +{ + protected: + parameterized_csrilusv_bin() {} + virtual ~parameterized_csrilusv_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments setup_csrilusv_arguments(csrilusv_bin_tuple tup) +{ + Arguments arg; + arg.idx_base = std::get<0>(tup); + arg.analysis = std::get<1>(tup); + arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<2>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + + return arg; +} + +TEST_P(parameterized_csrilusv_bin, csrilusv_bin_float) +{ + Arguments arg = setup_csrilusv_arguments(GetParam()); + + rocsparse_status status = testing_csrilusv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csrilusv_bin, csrilusv_bin_double) +{ + Arguments arg = setup_csrilusv_arguments(GetParam()); + + rocsparse_status status = testing_csrilusv(arg); + EXPECT_EQ(status, 
rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(csrilusv_bin, + parameterized_csrilusv_bin, + testing::Combine(testing::ValuesIn(csrilusv_idxbase_range), + testing::ValuesIn(csrilusv_analysis_range), + testing::ValuesIn(csrilusv_bin))); diff --git a/clients/tests/test_csrsv.cpp b/clients/tests/test_csrsv.cpp new file mode 100644 index 00000000..54485b32 --- /dev/null +++ b/clients/tests/test_csrsv.cpp @@ -0,0 +1,157 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * ************************************************************************ */ + +#include "testing_csrsv.hpp" +#include "utility.hpp" + +#include +#include +#include +#include +#include + +typedef rocsparse_index_base base; +typedef rocsparse_operation op; +typedef rocsparse_diag_type diag; +typedef rocsparse_fill_mode fill; + +typedef std::tuple csrsv_tuple; +typedef std::tuple csrsv_bin_tuple; + +int csrsv_M_range[] = {-1, 0, 50, 647}; + +double csrsv_alpha_range[] = {1.0, 2.3, -3.7}; + +base csrsv_idxbase_range[] = {rocsparse_index_base_zero, rocsparse_index_base_one}; +op csrsv_op_range[] = {rocsparse_operation_none}; +diag csrsv_diag_range[] = {rocsparse_diag_type_non_unit}; +fill csrsv_fill_range[] = {rocsparse_fill_mode_lower, rocsparse_fill_mode_upper}; + +std::string csrsv_bin[] = {"rma10.bin", + "mac_econ_fwd500.bin", + "mc2depi.bin", + "scircuit.bin", + "ASIC_320k.bin", + "bmwcra_1.bin", + "nos1.bin", + "nos2.bin", + "nos3.bin", + "nos4.bin", + "nos5.bin", + "nos6.bin"}; + +class parameterized_csrsv : public testing::TestWithParam +{ + protected: + parameterized_csrsv() {} + virtual ~parameterized_csrsv() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +class parameterized_csrsv_bin : public testing::TestWithParam +{ + protected: + parameterized_csrsv_bin() {} + virtual ~parameterized_csrsv_bin() {} + virtual void SetUp() {} + virtual void TearDown() {} +}; + +Arguments 
setup_csrsv_arguments(csrsv_tuple tup) +{ + Arguments arg; + arg.M = std::get<0>(tup); + arg.alpha = std::get<1>(tup); + arg.idx_base = std::get<2>(tup); + arg.transA = std::get<3>(tup); + arg.diag_type = std::get<4>(tup); + arg.fill_mode = std::get<5>(tup); + arg.timing = 0; + return arg; +} + +Arguments setup_csrsv_arguments(csrsv_bin_tuple tup) +{ + Arguments arg; + arg.M = -99; + arg.alpha = std::get<0>(tup); + arg.idx_base = std::get<1>(tup); + arg.transA = std::get<2>(tup); + arg.diag_type = std::get<3>(tup); + arg.fill_mode = std::get<4>(tup); + arg.timing = 0; + + // Determine absolute path of test matrix + std::string bin_file = std::get<5>(tup); + + // Get current executables absolute path + char path_exe[PATH_MAX]; + ssize_t len = readlink("/proc/self/exe", path_exe, sizeof(path_exe) - 1); + if(len < 14) + { + path_exe[0] = '\0'; + } + else + { + path_exe[len - 14] = '\0'; + } + + // Matrices are stored at the same path in matrices directory + arg.filename = std::string(path_exe) + "matrices/" + bin_file; + + return arg; +} + +TEST(csrsv_bad_arg, csrsv_float) { testing_csrsv_bad_arg(); } + +TEST_P(parameterized_csrsv, csrsv_float) +{ + Arguments arg = setup_csrsv_arguments(GetParam()); + + rocsparse_status status = testing_csrsv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csrsv, csrsv_double) +{ + Arguments arg = setup_csrsv_arguments(GetParam()); + + rocsparse_status status = testing_csrsv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csrsv_bin, csrsv_bin_float) +{ + Arguments arg = setup_csrsv_arguments(GetParam()); + + rocsparse_status status = testing_csrsv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +TEST_P(parameterized_csrsv_bin, csrsv_bin_double) +{ + Arguments arg = setup_csrsv_arguments(GetParam()); + + rocsparse_status status = testing_csrsv(arg); + EXPECT_EQ(status, rocsparse_status_success); +} + +INSTANTIATE_TEST_CASE_P(csrsv, + parameterized_csrsv, + 
testing::Combine(testing::ValuesIn(csrsv_M_range), + testing::ValuesIn(csrsv_alpha_range), + testing::ValuesIn(csrsv_idxbase_range), + testing::ValuesIn(csrsv_op_range), + testing::ValuesIn(csrsv_diag_range), + testing::ValuesIn(csrsv_fill_range))); + +INSTANTIATE_TEST_CASE_P(csrsv_bin, + parameterized_csrsv_bin, + testing::Combine(testing::ValuesIn(csrsv_alpha_range), + testing::ValuesIn(csrsv_idxbase_range), + testing::ValuesIn(csrsv_op_range), + testing::ValuesIn(csrsv_diag_range), + testing::ValuesIn(csrsv_fill_range), + testing::ValuesIn(csrsv_bin))); diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index b6538fcc..58dcb798 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -47,7 +47,7 @@ if(HIP_PLATFORM STREQUAL "hcc") message(STATUS "Downloading rocPRIM.") download_project(PROJ rocPRIM GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocPRIM.git - GIT_TAG caef132d64b29a7d857eb68af5323fc302d26766 # TODO change back to master once rocm docker image is updated + GIT_TAG master INSTALL_DIR ${ROCPRIM_ROOT} CMAKE_ARGS -DCMAKE_BUILD_TYPE=RELEASE -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hcc LOG_DOWNLOAD TRUE diff --git a/cmake/SetToolchain.cmake b/cmake/SetToolchain.cmake index e71d00c1..83cd060d 100644 --- a/cmake/SetToolchain.cmake +++ b/cmake/SetToolchain.cmake @@ -1,5 +1,5 @@ # Find HIP package -find_package(HIP REQUIRED) +find_package(HIP 1.5.18353 REQUIRED) # ROCm 1.9 # Select toolchain if(HIP_PLATFORM STREQUAL "nvcc" OR HIP_COMPILER STREQUAL "clang") diff --git a/docs/Doxyfile b/docs/Doxyfile index ece147c6..f3c64fa6 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -150,7 +150,7 @@ INLINE_INHERITED_MEMB = NO # shortest path that makes the file name unique will be used # The default value is: YES. -FULL_PATH_NAMES = YES +FULL_PATH_NAMES = NO # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. 
# Stripping is only done if one of the specified strings matches the left-hand @@ -454,7 +454,7 @@ EXTRACT_STATIC = NO # for Java sources. # The default value is: YES. -EXTRACT_LOCAL_CLASSES = YES +EXTRACT_LOCAL_CLASSES = NO # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are @@ -487,7 +487,7 @@ HIDE_UNDOC_MEMBERS = NO # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. -HIDE_UNDOC_CLASSES = NO +HIDE_UNDOC_CLASSES = YES # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. If set to NO, these declarations will be @@ -537,7 +537,7 @@ HIDE_COMPOUND_REFERENCE= NO # the files that are included by a file in the documentation of that file. # The default value is: YES. -SHOW_INCLUDE_FILES = YES +SHOW_INCLUDE_FILES = NO # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader @@ -777,7 +777,8 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = rocsparse-functions.h \ +INPUT = modules.dox \ + rocsparse-functions.h \ rocsparse-auxiliary.h \ rocsparse-types.h \ ../ diff --git a/docs/modules.dox b/docs/modules.dox new file mode 100644 index 00000000..0ea471b2 --- /dev/null +++ b/docs/modules.dox @@ -0,0 +1,60 @@ +/* ************************************************************************ + * Copyright 2018 Advanced Micro Devices, Inc. + * + * ************************************************************************ */ + +/*! \file + * \brief doxygen group definitions + */ + +/*! \defgroup aux_module SPARSE Auxiliary functions + * \brief This module holds all sparse auxiliary functions. 
+ * + * \details + * The functions that are contained in the auxiliary module describe all available + * helper functions that are required for subsequent library calls. + */ + +/*! \defgroup types_module SPARSE Types + * \brief This module holds all sparse types. + */ + +/*! \defgroup level1_module SPARSE Level 1 routines + * \brief This module holds all sparse level 1 routines. + * + * \details + * The sparse level 1 routines describe operations between a vector in sparse format + * and a vector in dense format. + */ + +/*! \defgroup level2_module SPARSE Level 2 routines + * \brief This module holds all sparse level 2 routines. + * + * \details + * The sparse level 2 routines describe operations between a matrix in sparse format + * and a vector in dense format. + */ + +/*! \defgroup level3_module SPARSE Level 3 routines + * \brief This module holds all sparse level 3 routines. + * + * \details + * The sparse level 3 routines describe operations between a matrix in sparse format + * and multiple vectors in dense format that can also be seen as a dense matrix. + */ + +/*! \defgroup precond_module SPARSE Preconditioners + * \brief This module holds all sparse preconditioners. + * + * \details + * The sparse preconditioners describe manipulations on a matrix in sparse format to + * obtain a sparse preconditioner matrix. + */ + +/*! \defgroup conv_module SPARSE Conversion routines + * \brief This module holds all sparse conversion routines. + * + * \details + * The sparse conversion routines describe operations on a matrix in sparse format to + * obtain a matrix in a different sparse format. + */ diff --git a/docs/source/api.rst b/docs/source/api.rst index a5d3366c..6550b29f 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -104,6 +104,18 @@ Sparse Level 2 Functions .. doxygenfunction:: rocsparse_dcsrmv +.. doxygenfunction:: rocsparse_csrsv_buffer_size + +.. doxygenfunction:: rocsparse_csrsv_analysis + +.. doxygenfunction:: rocsparse_scsrsv_solve + +.. 
doxygenfunction:: rocsparse_dcsrsv_solve + +.. doxygenfunction:: rocsparse_csrsv_zero_pivot + +.. doxygenfunction:: rocsparse_csrsv_clear + .. doxygenfunction:: rocsparse_sellmv .. doxygenfunction:: rocsparse_dellmv @@ -119,6 +131,21 @@ Sparse Level 3 Functions .. doxygenfunction:: rocsparse_dcsrmm +Sparse Preconditioner Functions +------------------------------- + +.. doxygenfunction:: rocsparse_csrilu0_buffer_size + +.. doxygenfunction:: rocsparse_csrilu0_analysis + +.. doxygenfunction:: rocsparse_scsrilu0 + +.. doxygenfunction:: rocsparse_dcsrilu0 + +.. doxygenfunction:: rocsparse_csrilu0_zero_pivot + +.. doxygenfunction:: rocsparse_csrilu0_clear + Sparse Conversion Functions --------------------------- diff --git a/library/CMakeLists.txt b/library/CMakeLists.txt index 0269b361..90876822 100644 --- a/library/CMakeLists.txt +++ b/library/CMakeLists.txt @@ -84,8 +84,8 @@ rocm_export_targets(TARGETS rocsparse-targets rocm_install_symlink_subdir(rocsparse) # Package specific CPACK vars -set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc (>= 1.3)") -set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc >= 1.3") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "hip_hcc (>= 1.5.18353)") # 1.5.18353 is HIP version in ROCm 1.9 +set(CPACK_RPM_PACKAGE_REQUIRES "hip_hcc >= 1.5.18353") set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/../LICENSE.md") if(NOT CPACK_PACKAGING_INSTALL_PREFIX) diff --git a/library/include/rocsparse-auxiliary.h b/library/include/rocsparse-auxiliary.h index 09e441b6..015d2a1d 100644 --- a/library/include/rocsparse-auxiliary.h +++ b/library/include/rocsparse-auxiliary.h @@ -3,9 +3,9 @@ * * ************************************************************************ */ -/*!\file - * \brief rocsparse-auxiliary.h provides auxilary functions in rocsparse -*/ +/*! \file + * \brief rocsparse-auxiliary.h provides auxilary functions in rocsparse + */ #pragma once #ifndef _ROCSPARSE_AUXILIARY_H_ @@ -20,7 +20,8 @@ extern "C" { #endif -/*! \brief Create a rocsparse handle +/*! 
\ingroup aux_module + * \brief Create a rocsparse handle * * \details * \p rocsparse_create_handle creates the rocSPARSE library context. It must be @@ -38,7 +39,8 @@ extern "C" { ROCSPARSE_EXPORT rocsparse_status rocsparse_create_handle(rocsparse_handle* handle); -/*! \brief Destroy a rocsparse handle +/*! \ingroup aux_module + * \brief Destroy a rocsparse handle * * \details * \p rocsparse_destroy_handle destroys the rocSPARSE library context and releases all @@ -54,7 +56,8 @@ rocsparse_status rocsparse_create_handle(rocsparse_handle* handle); ROCSPARSE_EXPORT rocsparse_status rocsparse_destroy_handle(rocsparse_handle handle); -/*! \brief Specify user defined HIP stream +/*! \ingroup aux_module + * \brief Specify user defined HIP stream * * \details * \p rocsparse_set_stream specifies the stream to be used by the rocSPARSE library @@ -71,7 +74,8 @@ rocsparse_status rocsparse_destroy_handle(rocsparse_handle handle); ROCSPARSE_EXPORT rocsparse_status rocsparse_set_stream(rocsparse_handle handle, hipStream_t stream); -/*! \brief Get current stream from library context +/*! \ingroup aux_module + * \brief Get current stream from library context * * \details * \p rocsparse_get_stream gets the rocSPARSE library context stream which is currently @@ -88,7 +92,8 @@ rocsparse_status rocsparse_set_stream(rocsparse_handle handle, hipStream_t strea ROCSPARSE_EXPORT rocsparse_status rocsparse_get_stream(rocsparse_handle handle, hipStream_t* stream); -/*! \brief Specify pointer mode +/*! \ingroup aux_module + * \brief Specify pointer mode * * \details * \p rocsparse_set_pointer_mode specifies the pointer mode to be used by the rocSPARSE @@ -108,7 +113,8 @@ ROCSPARSE_EXPORT rocsparse_status rocsparse_set_pointer_mode(rocsparse_handle handle, rocsparse_pointer_mode pointer_mode); -/*! \brief Get current pointer mode from library context +/*! 
\ingroup aux_module + * \brief Get current pointer mode from library context * * \details * \p rocsparse_get_pointer_mode gets the rocSPARSE library context pointer mode which @@ -127,7 +133,8 @@ ROCSPARSE_EXPORT rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, rocsparse_pointer_mode* pointer_mode); -/*! \brief Get rocSPARSE version +/*! \ingroup aux_module + * \brief Get rocSPARSE version * * \details * rocsparse_get_version gets the rocSPARSE library version number. @@ -147,7 +154,8 @@ rocsparse_status rocsparse_get_pointer_mode(rocsparse_handle handle, ROCSPARSE_EXPORT rocsparse_status rocsparse_get_version(rocsparse_handle handle, int* version); -/*! \brief Create a matrix descriptor +/*! \ingroup aux_module + * \brief Create a matrix descriptor * \details * \p rocsparse_create_mat_descr creates a matrix descriptor. It initializes * \ref rocsparse_matrix_type to \ref rocsparse_matrix_type_general and @@ -163,7 +171,26 @@ rocsparse_status rocsparse_get_version(rocsparse_handle handle, int* version); ROCSPARSE_EXPORT rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr* descr); -/*! \brief Destroy a matrix descriptor +/*! \ingroup aux_module + * \brief Copy a matrix descriptor + * \details + * \p rocsparse_copy_mat_descr copies a matrix descriptor. Both, source and destination + * matrix descriptors must be initialized prior to calling \p rocsparse_copy_mat_descr. + * + * @param[out] + * dest the pointer to the destination matrix descriptor. + * @param[in] + * src the pointer to the source matrix descriptor. + * + * \returns \ref rocsparse_status_success the operation completed successfully.
+ * \ref rocsparse_status_invalid_pointer \p src or \p dest pointer is + * invalid. + */ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_copy_mat_descr(rocsparse_mat_descr dest, const rocsparse_mat_descr src); + +/*! \ingroup aux_module + * \brief Destroy a matrix descriptor * * \details * \p rocsparse_destroy_mat_descr destroys a matrix descriptor and releases all @@ -178,7 +205,8 @@ rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr* descr); ROCSPARSE_EXPORT rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descr); -/*! \brief Specify the index base of a matrix descriptor +/*! \ingroup aux_module + * \brief Specify the index base of a matrix descriptor * * \details * \p rocsparse_set_mat_index_base sets the index base of a matrix descriptor. Valid @@ -196,7 +224,8 @@ rocsparse_status rocsparse_destroy_mat_descr(rocsparse_mat_descr descr); ROCSPARSE_EXPORT rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descr, rocsparse_index_base base); -/*! \brief Get the index base of a matrix descriptor +/*! \ingroup aux_module + * \brief Get the index base of a matrix descriptor * * \details * \p rocsparse_get_mat_index_base returns the index base of a matrix descriptor. @@ -209,7 +238,8 @@ rocsparse_status rocsparse_set_mat_index_base(rocsparse_mat_descr descr, rocspar ROCSPARSE_EXPORT rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr descr); -/*! \brief Specify the matrix type of a matrix descriptor +/*! \ingroup aux_module + * \brief Specify the matrix type of a matrix descriptor * * \details * \p rocsparse_set_mat_type sets the matrix type of a matrix descriptor. Valid @@ -231,7 +261,8 @@ rocsparse_index_base rocsparse_get_mat_index_base(const rocsparse_mat_descr desc ROCSPARSE_EXPORT rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descr, rocsparse_matrix_type type); -/*! \brief Get the matrix type of a matrix descriptor +/*! 
\ingroup aux_module + * \brief Get the matrix type of a matrix descriptor * * \details * \p rocsparse_get_mat_type returns the matrix type of a matrix descriptor. @@ -246,7 +277,8 @@ rocsparse_status rocsparse_set_mat_type(rocsparse_mat_descr descr, rocsparse_mat ROCSPARSE_EXPORT rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr); -/*! \brief Specify the matrix fill mode of a matrix descriptor +/*! \ingroup aux_module + * \brief Specify the matrix fill mode of a matrix descriptor * * \details * \p rocsparse_set_mat_fill_mode sets the matrix fill mode of a matrix descriptor. @@ -263,9 +295,11 @@ rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr); * \ref rocsparse_status_invalid_value \p fill_mode is invalid. */ ROCSPARSE_EXPORT -rocsparse_status rocsparse_set_mat_fill_mode(rocsparse_mat_descr descr, rocsparse_fill_mode fill_mode); +rocsparse_status rocsparse_set_mat_fill_mode(rocsparse_mat_descr descr, + rocsparse_fill_mode fill_mode); -/*! \brief Get the matrix fill mode of a matrix descriptor +/*! \ingroup aux_module + * \brief Get the matrix fill mode of a matrix descriptor * * \details * \p rocsparse_get_mat_fill_mode returns the matrix fill mode of a matrix descriptor. @@ -278,7 +312,8 @@ rocsparse_status rocsparse_set_mat_fill_mode(rocsparse_mat_descr descr, rocspars ROCSPARSE_EXPORT rocsparse_fill_mode rocsparse_get_mat_fill_mode(const rocsparse_mat_descr descr); -/*! \brief Specify the matrix diagonal type of a matrix descriptor +/*! \ingroup aux_module + * \brief Specify the matrix diagonal type of a matrix descriptor * * \details * \p rocsparse_set_mat_diag_type sets the matrix diagonal type of a matrix @@ -295,9 +330,11 @@ rocsparse_fill_mode rocsparse_get_mat_fill_mode(const rocsparse_mat_descr descr) * \ref rocsparse_status_invalid_value \p diag_type is invalid. 
*/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_set_mat_diag_type(rocsparse_mat_descr descr, rocsparse_diag_type diag_type); +rocsparse_status rocsparse_set_mat_diag_type(rocsparse_mat_descr descr, + rocsparse_diag_type diag_type); -/*! \brief Get the matrix diagonal type of a matrix descriptor +/*! \ingroup aux_module + * \brief Get the matrix diagonal type of a matrix descriptor * * \details * \p rocsparse_get_mat_diag_type returns the matrix diagonal type of a matrix @@ -311,7 +348,8 @@ rocsparse_status rocsparse_set_mat_diag_type(rocsparse_mat_descr descr, rocspars ROCSPARSE_EXPORT rocsparse_diag_type rocsparse_get_mat_diag_type(const rocsparse_mat_descr descr); -/*! \brief Create a \p HYB matrix structure +/*! \ingroup aux_module + * \brief Create a \p HYB matrix structure * * \details * \p rocsparse_create_hyb_mat creates a structure that holds the matrix in \p HYB @@ -326,7 +364,8 @@ rocsparse_diag_type rocsparse_get_mat_diag_type(const rocsparse_mat_descr descr) ROCSPARSE_EXPORT rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat* hyb); -/*! \brief Destroy a \p HYB matrix structure +/*! \ingroup aux_module + * \brief Destroy a \p HYB matrix structure * * \details * \p rocsparse_destroy_hyb_mat destroys a \p HYB structure. @@ -341,7 +380,8 @@ rocsparse_status rocsparse_create_hyb_mat(rocsparse_hyb_mat* hyb); ROCSPARSE_EXPORT rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb); -/*! \brief Create a matrix info structure +/*! \ingroup aux_module + * \brief Create a matrix info structure * * \details * \p rocsparse_create_mat_info creates a structure that holds the matrix info data @@ -357,7 +397,8 @@ rocsparse_status rocsparse_destroy_hyb_mat(rocsparse_hyb_mat hyb); ROCSPARSE_EXPORT rocsparse_status rocsparse_create_mat_info(rocsparse_mat_info* info); -/*! \brief Destroy a matrix info structure +/*! \ingroup aux_module + * \brief Destroy a matrix info structure * * \details * \p rocsparse_destroy_mat_info destroys a matrix info structure. 
diff --git a/library/include/rocsparse-functions.h b/library/include/rocsparse-functions.h index a53bbc03..52d2cee0 100644 --- a/library/include/rocsparse-functions.h +++ b/library/include/rocsparse-functions.h @@ -3,8 +3,8 @@ * * ************************************************************************ */ -/*!\file - * \brief rocsparse-functions.h provides Sparse Linear Algebra Subprograms +/*! \file + * \brief rocsparse-functions.h provides Sparse Linear Algebra Subprograms * of Level 1, 2 and 3, using HIP optimized for AMD HCC-based GPU hardware. */ @@ -25,7 +25,8 @@ extern "C" { * =========================================================================== */ -/*! \brief Scale a sparse vector and add it to a dense vector. +/*! \ingroup level1_module + * \brief Scale a sparse vector and add it to a dense vector. * * \details * \p rocsparse_axpyi multiplies the sparse vector \f$x\f$ with scalar \f$\alpha\f$ and @@ -66,7 +67,7 @@ extern "C" { * \ref rocsparse_status_invalid_pointer \p alpha, \p x_val, \p x_ind or * \p y pointer is invalid. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_saxpyi(rocsparse_handle handle, rocsparse_int nnz, @@ -103,9 +104,10 @@ rocsparse_status rocsparse_zaxpyi(rocsparse_handle handle, rocsparse_double_complex* y, rocsparse_index_base idx_base); */ -///@} +/**@}*/ -/*! \brief Compute the dot product of a sparse vector with a dense vector. +/*! \ingroup level1_module + * \brief Compute the dot product of a sparse vector with a dense vector. * * \details * \p rocsparse_doti computes the dot product of the sparse vector \f$x\f$ with the @@ -147,7 +149,7 @@ rocsparse_status rocsparse_zaxpyi(rocsparse_handle handle, * reduction could not be allocated.
* \ref rocsparse_status_internal_error an internal error occurred. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_sdoti(rocsparse_handle handle, rocsparse_int nnz, @@ -184,9 +186,10 @@ rocsparse_status rocsparse_zdoti(rocsparse_handle handle, rocsparse_double_complex* result, rocsparse_index_base idx_base); */ -///@} +/**@}*/ -/*! \brief Gather elements from a dense vector and store them into a sparse vector. +/*! \ingroup level1_module + * \brief Gather elements from a dense vector and store them into a sparse vector. * * \details * \p rocsparse_gthr gathers the elements that are listed in \p x_ind from the dense @@ -221,7 +224,7 @@ rocsparse_status rocsparse_zdoti(rocsparse_handle handle, * \ref rocsparse_status_invalid_pointer \p y, \p x_val or \p x_ind pointer * is invalid. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_sgthr(rocsparse_handle handle, rocsparse_int nnz, @@ -254,9 +257,10 @@ rocsparse_status rocsparse_zgthr(rocsparse_handle handle, const rocsparse_int* x_ind, rocsparse_index_base idx_base); */ -///@} +/**@}*/ -/*! \brief Gather and zero out elements from a dense vector and store them into a sparse +/*! \ingroup level1_module + * \brief Gather and zero out elements from a dense vector and store them into a sparse * vector. * * \details @@ -294,7 +298,7 @@ rocsparse_status rocsparse_zgthr(rocsparse_handle handle, * \ref rocsparse_status_invalid_pointer \p y, \p x_val or \p x_ind pointer * is invalid. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_sgthrz(rocsparse_handle handle, rocsparse_int nnz, @@ -327,9 +331,10 @@ rocsparse_status rocsparse_zgthrz(rocsparse_handle handle, const rocsparse_int* x_ind, rocsparse_index_base idx_base); */ -///@} +/**@}*/ -/*! \brief Apply Givens rotation to a dense and a sparse vector. +/*! \ingroup level1_module + * \brief Apply Givens rotation to a dense and a sparse vector. 
* * \details * \p rocsparse_roti applies the Givens rotation matrix \f$G\f$ to the sparse vector @@ -374,7 +379,7 @@ rocsparse_status rocsparse_zgthrz(rocsparse_handle handle, * \ref rocsparse_status_invalid_pointer \p c, \p s, \p x_val, \p x_ind or * \p y pointer is invalid. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_sroti(rocsparse_handle handle, rocsparse_int nnz, @@ -394,9 +399,10 @@ rocsparse_status rocsparse_droti(rocsparse_handle handle, const double* c, const double* s, rocsparse_index_base idx_base); -///@} +/**@}*/ -/*! \brief Scatter elements from a dense vector across a sparse vector. +/*! \ingroup level1_module + * \brief Scatter elements from a dense vector across a sparse vector. * * \details * \p rocsparse_sctr scatters the elements that are listed in \p x_ind from the sparse @@ -432,7 +438,7 @@ rocsparse_status rocsparse_droti(rocsparse_handle handle, * \ref rocsparse_status_invalid_pointer \p x_val, \p x_ind or \p y pointer * is invalid. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_ssctr(rocsparse_handle handle, rocsparse_int nnz, @@ -465,7 +471,7 @@ rocsparse_status rocsparse_zsctr(rocsparse_handle handle, rocsparse_double_complex* y, rocsparse_index_base idx_base); */ -///@} +/**@}*/ /* * =========================================================================== @@ -473,14 +479,15 @@ rocsparse_status rocsparse_zsctr(rocsparse_handle handle, * =========================================================================== */ -/*! \brief Sparse matrix vector multiplication using \p COO storage format +/*! 
\ingroup level2_module + * \brief Sparse matrix vector multiplication using \p COO storage format * * \details * \p rocsparse_coomv multiplies the scalar \f$\alpha\f$ with a sparse \f$m \times n\f$ * matrix, defined in \p COO storage format, and the dense vector \f$x\f$ and adds the * result to the dense vector \f$y\f$ that is multiplied by the scalar \f$\beta\f$, * such that - * + * * \f$y := \alpha \cdot op(A) \cdot x + \beta \cdot y\f$, with * * \f$ @@ -509,7 +516,7 @@ rocsparse_status rocsparse_zsctr(rocsparse_handle handle, * y[coo_row_ind[i]] += alpha * coo_val[i] * x[coo_col_ind[i]]; * } * \endcode - * + * * @param[in] * handle handle to the rocsparse library context queue. * @param[in] @@ -554,7 +561,7 @@ rocsparse_status rocsparse_zsctr(rocsparse_handle handle, * \p trans != \ref rocsparse_operation_none or * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_scoomv(rocsparse_handle handle, rocsparse_operation trans, @@ -615,9 +622,10 @@ rocsparse_status rocsparse_zcoomv(rocsparse_handle handle, const rocsparse_double_complex* beta, rocsparse_double_complex* y); */ -///@} +/**@}*/ -/*! \brief Sparse matrix vector multiplication using \p CSR storage format +/*! \ingroup level2_module + * \brief Sparse matrix vector multiplication using \p CSR storage format * * \details * \p rocsparse_csrmv_analysis performs the analysis step for rocsparse_scsrmv() and @@ -645,8 +653,7 @@ rocsparse_status rocsparse_zcoomv(rocsparse_handle handle, * csr_col_ind array of \p nnz elements containing the column indices of the sparse * \p CSR matrix. * @param[out] - * info structure that holds the information collected during - * the analysis step. + * info structure that holds the information collected during the analysis step. * * \returns \ref rocsparse_status_success the operation completed successfully.
* \ref rocsparse_status_invalid_handle the library context was @@ -672,7 +679,8 @@ rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, const rocsparse_int* csr_col_ind, rocsparse_mat_info info); -/*! \brief Sparse matrix vector multiplication using \p CSR storage format +/*! \ingroup level2_module + * \brief Sparse matrix vector multiplication using \p CSR storage format * * \details * \p rocsparse_csrmv_clear deallocates all memory that was allocated by @@ -698,16 +706,17 @@ rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, ROCSPARSE_EXPORT rocsparse_status rocsparse_csrmv_clear(rocsparse_handle handle, rocsparse_mat_info info); -/*! \brief Sparse matrix vector multiplication using \p CSR storage format +/*! \ingroup level2_module + * \brief Sparse matrix vector multiplication using \p CSR storage format * * \details * \p rocsparse_csrmv multiplies the scalar \f$\alpha\f$ with a sparse \f$m \times n\f$ * matrix, defined in \p CSR storage format, and the dense vector \f$x\f$ and adds the * result to the dense vector \f$y\f$ that is multiplied by the scalar \f$\beta\f$, * such that - * + * * \f$y := \alpha \cdot op(A) \cdot x + \beta \cdot y\f$, with - * + * * \f$ * op(A) = \left\{ * \begin{array}{ll} @@ -784,7 +793,7 @@ rocsparse_status rocsparse_csrmv_clear(rocsparse_handle handle, rocsparse_mat_in * \p trans != \ref rocsparse_operation_none or * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. 
*/ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_scsrmv(rocsparse_handle handle, rocsparse_operation trans, @@ -849,44 +858,389 @@ rocsparse_status rocsparse_zcsrmv(rocsparse_handle handle, const rocsparse_double_complex* beta, rocsparse_double_complex* y); */ -///@} - - - - - - - - - -ROCSPARSE_EXPORT -rocsparse_status rocsparse_csrsv_buffer_size(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int nnz, - const rocsparse_mat_descr descr, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - rocsparse_mat_info info, - size_t* buffer_size); +/**@}*/ +/*! \ingroup level2_module + * \brief Sparse triangular solve using \p CSR storage format + * + * \details + * \p rocsparse_csrsv_zero_pivot returns \ref rocsparse_status_zero_pivot, if either a + * structural or numerical zero has been found during rocsparse_scsrsv_solve() or + * rocsparse_dcsrsv_solve() computation. The first zero pivot \f$j\f$ at \f$A_{j,j}\f$ + * is stored in \p position, using same index base as the \p CSR matrix. + * + * \p position can be in host or device memory. If no zero pivot has been found, + * \p position is set to -1 and \ref rocsparse_status_success is returned instead. + * + * @param[in] + * handle handle to the rocsparse library context queue. + * @param[in] + * descr descriptor of the sparse \p CSR matrix. + * @param[in] + * info structure that holds the information collected during the analysis step. + * @param[inout] + * position pointer to zero pivot \f$j\f$, can be in host or device memory. + * + * \returns \ref rocsparse_status_success the operation completed successfully.
+ * \ref rocsparse_status_invalid_handle the library context was not + * initialized.
+ * \ref rocsparse_status_invalid_pointer \p info or \p position + * pointer is invalid.
+ * \ref rocsparse_status_internal_error an internal error occurred.
+ * \ref rocsparse_status_zero_pivot zero pivot has been found. + */ ROCSPARSE_EXPORT -rocsparse_status rocsparse_csrsv_analysis(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int nnz, - const rocsparse_mat_descr descr, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - rocsparse_mat_info info, - rocsparse_solve_policy solve, - rocsparse_analysis_policy analysis, - void* temp_buffer); +rocsparse_status rocsparse_csrsv_zero_pivot(rocsparse_handle handle, + const rocsparse_mat_descr descr, + rocsparse_mat_info info, + rocsparse_int* position); +/*! \ingroup level2_module + * \brief Sparse triangular solve using \p CSR storage format + * + * \details + * \p rocsparse_csrsv_buffer_size returns the size of the temporary storage buffer that + * is required by rocsparse_scsrsv_analysis(), rocsparse_dcsrsv_analysis(), + * rocsparse_scsrsv_solve() and rocsparse_dcsrsv_solve(). The temporary storage buffer + * must be allocated by the user. The size of the temporary storage buffer is identical + * to the size returned by rocsparse_scsrilu0_buffer_size() and + * rocsparse_dcsrilu0_buffer_size() if the matrix sparsity pattern is identical. The + * user allocated buffer can thus be shared between subsequent calls to those functions. + * + * @param[in] + * handle handle to the rocsparse library context queue. + * @param[in] + * trans matrix operation type. + * @param[in] + * m number of rows of the sparse \p CSR matrix. + * @param[in] + * nnz number of non-zero entries of the sparse \p CSR matrix. + * @param[in] + * descr descriptor of the sparse \p CSR matrix. + * @param[in] + * csr_val array of \p nnz elements of the sparse \p CSR matrix. + * @param[in] + * csr_row_ptr array of \p m+1 elements that point to the start of every row of the + * sparse \p CSR matrix. + * @param[in] + * csr_col_ind array of \p nnz elements containing the column indices of the sparse + * \p CSR matrix. 
+ * @param[out] + * info structure that holds the information collected during the analysis step. + * @param[out] + * buffer_size number of bytes of the temporary storage buffer required by + * rocsparse_scsrsv_analysis(), rocsparse_dcsrsv_analysis(), + * rocsparse_scsrsv_solve() and rocsparse_dcsrsv_solve(). + * + * \returns \ref rocsparse_status_success the operation completed successfully.
+ * \ref rocsparse_status_invalid_handle the library context was + * not initialized.
+ * \ref rocsparse_status_invalid_size \p m or \p nnz is invalid.
+ * \ref rocsparse_status_invalid_pointer \p descr, \p csr_val, + * \p csr_row_ptr, \p csr_col_ind, \p info or \p buffer_size pointer is + * invalid.
+ * \ref rocsparse_status_internal_error an internal error occurred.
+ * \ref rocsparse_status_not_implemented + * \p trans != \ref rocsparse_operation_none or + * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. + */ +/**@{*/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsrsv_buffer_size(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsrsv_buffer_size(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size); +/**@}*/ + +/*! \ingroup level2_module + * \brief Sparse triangular solve using \p CSR storage format + * + * \details + * \p rocsparse_csrsv_analysis performs the analysis step for rocsparse_scsrsv_solve() + * and rocsparse_dcsrsv_solve(). It is expected that this function will be executed only + * once for a given matrix and particular operation type. Note that if the matrix + * sparsity pattern changes, the gathered information will become invalid. The analysis + * meta data can be cleared by rocsparse_csrsv_clear(). + * + * \p rocsparse_csrsv_analysis can share its meta data with + * rocsparse_scsrilu0_analysis() and rocsparse_dcsrilu0_analysis(). Selecting + * \ref rocsparse_analysis_policy_reuse policy can greatly improve computation + * performance of meta data. However, the user needs to make sure that the sparsity + * pattern remains unchanged. If this cannot be assured, + * \ref rocsparse_analysis_policy_force has to be used. + * + * @param[in] + * handle handle to the rocsparse library context queue. + * @param[in] + * trans matrix operation type. + * @param[in] + * m number of rows of the sparse \p CSR matrix.
+ * @param[in] + * nnz number of non-zero entries of the sparse \p CSR matrix. + * @param[in] + * descr descriptor of the sparse \p CSR matrix. + * @param[in] + * csr_val array of \p nnz elements of the sparse \p CSR matrix. + * @param[in] + * csr_row_ptr array of \p m+1 elements that point to the start of every row of the + * sparse \p CSR matrix. + * @param[in] + * csr_col_ind array of \p nnz elements containing the column indices of the sparse + * \p CSR matrix. + * @param[out] + * info structure that holds the information collected during + * the analysis step. + * @param[in] + * analysis \ref rocsparse_analysis_policy_reuse or + * \ref rocsparse_analysis_policy_force. + * @param[in] + * solve \ref rocsparse_solve_policy_auto. + * @param[in] + * temp_buffer temporary storage buffer allocated by the user. + * + * \returns \ref rocsparse_status_success the operation completed successfully.
+ * \ref rocsparse_status_invalid_handle the library context was + * not initialized.
+ * \ref rocsparse_status_invalid_size \p m or \p nnz is invalid.
+ * \ref rocsparse_status_invalid_pointer \p descr, \p csr_row_ptr, + * \p csr_col_ind, \p info or \p temp_buffer pointer is invalid.
+ * \ref rocsparse_status_internal_error an internal error occurred.
+ * \ref rocsparse_status_not_implemented + * \p trans != \ref rocsparse_operation_none or + * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. + */ +/**@{*/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsrsv_analysis(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsrsv_analysis(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer); +/**@}*/ + +/*! \ingroup level2_module + * \brief Sparse triangular solve using \p CSR storage format + * + * \details + * \p rocsparse_csrsv_clear deallocates all memory that was allocated by + * rocsparse_scsrsv_analysis() or rocsparse_dcsrsv_analysis(). This is especially + * useful, if memory is an issue and the analysis data is not required for further + * computation, e.g. when switching to another sparse matrix format. Calling + * \p rocsparse_csrsv_clear is optional. All allocated resources will be cleared, when + * the opaque \ref rocsparse_mat_info struct is destroyed using + * rocsparse_destroy_mat_info(). + * + * @param[in] + * handle handle to the rocsparse library context queue. + * @param[in] + * descr descriptor of the sparse \p CSR matrix. + * @param[inout] + * info structure that holds the information collected during the analysis step. + * + * \returns \ref rocsparse_status_success the operation completed successfully.
+ * \ref rocsparse_status_invalid_handle the library context was + * not initialized.
+ * \ref rocsparse_status_invalid_pointer \p info pointer is invalid.
+ * \ref rocsparse_status_memory_error the buffer holding the meta data + * could not be deallocated.
+ * \ref rocsparse_status_internal_error an internal error occurred.
+ */ ROCSPARSE_EXPORT -rocsparse_status rocsparse_csrsv_clear(const rocsparse_mat_descr descr, +rocsparse_status rocsparse_csrsv_clear(rocsparse_handle handle, + const rocsparse_mat_descr descr, rocsparse_mat_info info); +/*! \ingroup level2_module + * \brief Sparse triangular solve using \p CSR storage format + * + * \details + * \p rocsparse_csrsv_solve solves a sparse triangular linear system of a sparse + * \f$m \times m\f$ matrix, defined in \p CSR storage format, a dense solution vector + * \f$y\f$ and the right-hand side \f$x\f$ that is multiplied by \f$\alpha\f$, such that + * + * \f$op(A) * y = \alpha * x\f$, with + * + * \f$ + * op(A) = \left\{ + * \begin{array}{ll} + * A, & \text{if trans == rocsparse_operation_none} \\ + * A^T, & \text{if trans == rocsparse_operation_transpose} \\ + * A^H, & \text{if trans == rocsparse_operation_conjugate_transpose} + * \end{array} + * \right. + * \f$ + * + * Currently, only \p trans == \ref rocsparse_operation_none is supported. + * + * \p rocsparse_csrsv_solve requires a user allocated temporary buffer. Its size is + * returned by rocsparse_scsrsv_buffer_size() or rocsparse_dcsrsv_buffer_size(). + * Furthermore, analysis meta data is required. It can be obtained by + * rocsparse_scsrsv_analysis() or rocsparse_dcsrsv_analysis(). + * \p rocsparse_csrsv_solve reports the first zero pivot (either numerical or structural + * zero). The zero pivot status can be checked by calling rocsparse_csrsv_zero_pivot(). + * If \ref rocsparse_diag_type == \ref rocsparse_diag_type_unit, no zero pivot will be + * reported, even if \f$A_{j,j} = 0\f$ for some \f$j\f$. + * + * Note that the sparse \p CSR matrix has to be sorted. This can be achieved by calling + * rocsparse_csrsort(). + * + * @param[in] + * handle handle to the rocsparse library context queue. + * @param[in] + * trans matrix operation type. + * @param[in] + * m number of rows of the sparse \p CSR matrix.
+ * @param[in] + * nnz number of non-zero entries of the sparse \p CSR matrix. + * @param[in] + * alpha scalar \f$\alpha\f$. + * @param[in] + * descr descriptor of the sparse \p CSR matrix. + * @param[in] + * csr_val array of \p nnz elements of the sparse \p CSR matrix. + * @param[in] + * csr_row_ptr array of \p m+1 elements that point to the start + * of every row of the sparse \p CSR matrix. + * @param[in] + * csr_col_ind array of \p nnz elements containing the column indices of the sparse + * \p CSR matrix. + * @param[in] + * info structure that holds the information collected during the analysis step. + * @param[in] + * x array of \p m elements, holding the right-hand side. + * @param[out] + * y array of \p m elements, holding the solution. + * @param[in] + * policy \ref rocsparse_solve_policy_auto. + * @param[in] + * temp_buffer temporary storage buffer allocated by the user. + * + * \returns \ref rocsparse_status_success the operation completed successfully.
+ * \ref rocsparse_status_invalid_handle the library context was + * not initialized.
+ * \ref rocsparse_status_invalid_size \p m or \p nnz is invalid.
+ * \ref rocsparse_status_invalid_pointer \p descr, \p alpha, \p csr_val, + * \p csr_row_ptr, \p csr_col_ind, \p x or \p y pointer is invalid.
+ * \ref rocsparse_status_arch_mismatch the device is not supported.
+ * \ref rocsparse_status_internal_error an internal error occurred.
+ * \ref rocsparse_status_not_implemented + * \p trans != \ref rocsparse_operation_none or + * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. + * + * \par Example + * Consider the lower triangular \f$m \times m\f$ matrix \f$L\f$, stored in \p CSR + * storage format with unit diagonal. The following example solves \f$L \cdot y = x\f$. + * \code{.c} + * // Create rocSPARSE handle + * rocsparse_handle handle; + * rocsparse_create_handle(&handle); + * + * // Create matrix descriptor + * rocsparse_mat_descr descr; + * rocsparse_create_mat_descr(&descr); + * rocsparse_set_mat_fill_mode(descr, rocsparse_fill_mode_lower); + * rocsparse_set_mat_diag_type(descr, rocsparse_diag_type_unit); + * + * // Create matrix info structure + * rocsparse_mat_info info; + * rocsparse_create_mat_info(&info); + * + * // Obtain required buffer size + * size_t buffer_size; + * rocsparse_dcsrsv_buffer_size(handle, + * rocsparse_operation_none, + * m, + * nnz, + * descr, + * csr_val, + * csr_row_ptr, + * csr_col_ind, + * info, + * &buffer_size); + * + * // Allocate temporary buffer + * void* temp_buffer; + * hipMalloc(&temp_buffer, buffer_size); + * + * // Perform analysis step + * rocsparse_dcsrsv_analysis(handle, + * rocsparse_operation_none, + * m, + * nnz, + * descr, + * csr_val, + * csr_row_ptr, + * csr_col_ind, + * info, + * rocsparse_analysis_policy_reuse, + * rocsparse_solve_policy_auto, + * temp_buffer); + * + * // Solve Ly = x + * rocsparse_dcsrsv_solve(handle, + * rocsparse_operation_none, + * m, + * nnz, + * &alpha, + * descr, + * csr_val, + * csr_row_ptr, + * csr_col_ind, + * info, + * x, + * y, + * rocsparse_solve_policy_auto, + * temp_buffer); + * + * // No zero pivot should be found, with L having unit diagonal + * + * // Clean up + * hipFree(temp_buffer); + * rocsparse_destroy_mat_info(info); + * rocsparse_destroy_mat_descr(descr); + * rocsparse_destroy_handle(handle); + * \endcode + */ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status 
rocsparse_scsrsv_solve(rocsparse_handle handle, rocsparse_operation trans, @@ -918,29 +1272,19 @@ rocsparse_status rocsparse_dcsrsv_solve(rocsparse_handle handle, double* y, rocsparse_solve_policy policy, void* temp_buffer); +/**@}*/ - - - - - - - - - - - - -/*! \brief Sparse matrix vector multiplication using \p ELL storage format +/*! \ingroup level2_module + * \brief Sparse matrix vector multiplication using \p ELL storage format * * \details * \p rocsparse_ellmv multiplies the scalar \f$\alpha\f$ with a sparse \f$m \times n\f$ * matrix, defined in \p ELL storage format, and the dense vector \f$x\f$ and adds the * result to the dense vector \f$y\f$ that is multiplied by the scalar \f$\beta\f$, * such that - * + * * \f$y := \alpha \cdot op(A) \cdot x + \beta \cdot y\f$, with - * + * * \f$ * op(A) = \left\{ * \begin{array}{ll} @@ -1011,7 +1355,7 @@ rocsparse_status rocsparse_dcsrsv_solve(rocsparse_handle handle, * \p trans != \ref rocsparse_operation_none or * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_sellmv(rocsparse_handle handle, rocsparse_operation trans, @@ -1069,18 +1413,19 @@ rocsparse_status rocsparse_sellmv(rocsparse_handle handle, const rocsparse_double_complex* beta, rocsparse_double_complex* y); */ -///@} +/**@}*/ -/*! \brief Sparse matrix vector multiplication using \p HYB storage format +/*! 
\ingroup level2_module + * \brief Sparse matrix vector multiplication using \p HYB storage format * * \details * \p rocsparse_hybmv multiplies the scalar \f$\alpha\f$ with a sparse \f$m \times n\f$ * matrix, defined in \p HYB storage format, and the dense vector \f$x\f$ and adds the * result to the dense vector \f$y\f$ that is multiplied by the scalar \f$\beta\f$, * such that - * + * * \f$y := \alpha \cdot op(A) \cdot x + \beta \cdot y\f$, with - * + * * \f$ * op(A) = \left\{ * \begin{array}{ll} @@ -1130,7 +1475,7 @@ rocsparse_status rocsparse_sellmv(rocsparse_handle handle, * \p trans != \ref rocsparse_operation_none or * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_shybmv(rocsparse_handle handle, rocsparse_operation trans, @@ -1171,7 +1516,7 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, const rocsparse_double_complex* beta, rocsparse_double_complex* y); */ -///@} +/**@}*/ /* * =========================================================================== @@ -1179,16 +1524,17 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, * =========================================================================== */ -/*! \brief Sparse matrix dense matrix multiplication using \p CSR storage format +/*! 
\ingroup level3_module + * \brief Sparse matrix dense matrix multiplication using \p CSR storage format * * \details * \p rocsparse_csrmm multiplies the scalar \f$\alpha\f$ with a sparse \f$m \times k\f$ * matrix \f$A\f$, defined in \p CSR storage format, and the dense \f$k \times n\f$ * matrix \f$B\f$ and adds the result to the dense \f$m \times n\f$ matrix \f$C\f$ that * is multiplied by the scalar \f$\beta\f$, such that - * + * * \f$C := \alpha \cdot op(A) \cdot op(B) + \beta \cdot C\f$, with - * + * * \f$ * op(A) = \left\{ * \begin{array}{ll} @@ -1219,7 +1565,7 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, * for(j = 0; j < n; ++j) * { * C[i][j] = beta * C[i][j]; - * + * * for(k = csr_row_ptr[i]; k < csr_row_ptr[i + 1]; ++k) * { * C[i][j] += alpha * csr_val[k] * B[csr_col_ind[k]][j]; @@ -1284,7 +1630,7 @@ rocsparse_status rocsparse_dhybmv(rocsparse_handle handle, * \p trans_A != \ref rocsparse_operation_none or * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_scsrmm(rocsparse_handle handle, rocsparse_operation trans_A, @@ -1361,43 +1707,467 @@ rocsparse_status rocsparse_zcsrmm(rocsparse_handle handle, rocsparse_double_complex* C, rocsparse_int ldc); */ -///@} - - - - - +/**@}*/ +/* + * =========================================================================== + * preconditioner SPARSE + * =========================================================================== + */ -//TODO +/*! \ingroup precond_module + * \brief Incomplete LU factorization with 0 fill-ins and no pivoting using \p CSR + * storage format + * + * \details + * \p rocsparse_csrilu0_zero_pivot returns \ref rocsparse_status_zero_pivot, if either a + * structural or numerical zero has been found during rocsparse_scsrilu0() or + * rocsparse_dcsrilu0() computation. The first zero pivot \f$j\f$ at \f$A_{j,j}\f$ + * is stored in \p position, using same index base as the \p CSR matrix. 
+ * + * \p position can be in host or device memory. If no zero pivot has been found, + * \p position is set to -1 and \ref rocsparse_status_success is returned instead. + * + * @param[in] + * handle handle to the rocsparse library context queue. + * @param[in] + * info structure that holds the information collected during the analysis step. + * @param[inout] + * position pointer to zero pivot \f$j\f$, can be in host or device memory. + * + * \returns \ref rocsparse_status_success the operation completed successfully.
+ * \ref rocsparse_status_invalid_handle the library context was + * not initialized.
+ * \ref rocsparse_status_invalid_pointer \p info or \p position + * pointer is invalid.
+ * \ref rocsparse_status_internal_error an internal error occurred.
+ * \ref rocsparse_status_zero_pivot zero pivot has been found. + */ ROCSPARSE_EXPORT -rocsparse_status rocsparse_csrilu0_buffer_size(rocsparse_handle handle, - rocsparse_int m, - rocsparse_int nnz, - const rocsparse_mat_descr descr, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - rocsparse_mat_info info, - size_t* buffer_size); +rocsparse_status rocsparse_csrilu0_zero_pivot(rocsparse_handle handle, + rocsparse_mat_info info, + rocsparse_int* position); +/*! \ingroup precond_module + * \brief Incomplete LU factorization with 0 fill-ins and no pivoting using \p CSR + * storage format + * + * \details + * \p rocsparse_csrilu0_buffer_size returns the size of the temporary storage buffer + * that is required by rocsparse_scsrilu0_analysis(), rocsparse_dcsrilu0_analysis, + * rocsparse_scsrilu0() and rocsparse_dcsrilu0(). The temporary storage buffer must + * be allocated by the user. The size of the temporary storage buffer is identical to + * the size returned by rocsparse_scsrsv_buffer_size() and + * rocsparse_dcsrsv_buffer_size() if the matrix sparsity pattern is identical. The user + * allocated buffer can thus be shared between subsequent calls to those functions. + * + * @param[in] + * handle handle to the rocsparse library context queue. + * @param[in] + * m number of rows of the sparse \p CSR matrix. + * @param[in] + * nnz number of non-zero entries of the sparse \p CSR matrix. + * @param[in] + * descr descriptor of the sparse \p CSR matrix. + * @param[in] + * csr_val array of \p nnz elements of the sparse \p CSR matrix. + * @param[in] + * csr_row_ptr array of \p m+1 elements that point to the start of every row of the + * sparse \p CSR matrix. + * @param[in] + * csr_col_ind array of \p nnz elements containing the column indices of the sparse + * \p CSR matrix. + * @param[out] + * info structure that holds the information collected during the analysis step. 
+ * @param[out] + * buffer_size number of bytes of the temporary storage buffer required by + * rocsparse_scsrilu0_analysis(), rocsparse_dcsrilu0_analysis(), + * rocsparse_scsrilu0() and rocsparse_dcsrilu0(). + * + * \returns \ref rocsparse_status_success the operation completed successfully.
+ * \ref rocsparse_status_invalid_handle the library context was + * not initialized.
+ * \ref rocsparse_status_invalid_size \p m or \p nnz is invalid.
+ * \ref rocsparse_status_invalid_pointer \p descr, \p csr_val, + * \p csr_row_ptr, \p csr_col_ind, \p info or \p buffer_size pointer is + * invalid.
+ * \ref rocsparse_status_internal_error an internal error occurred.
+ * \ref rocsparse_status_not_implemented + * \p trans != \ref rocsparse_operation_none or + * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. + */ +/**@{*/ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_scsrilu0_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size); + +ROCSPARSE_EXPORT +rocsparse_status rocsparse_dcsrilu0_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size); +/**@}*/ + +/*! \ingroup precond_module + * \brief Incomplete LU factorization with 0 fill-ins and no pivoting using \p CSR + * storage format + * + * \details + * \p rocsparse_csrilu0_analysis performs the analysis step for rocsparse_scsrilu0() + * and rocsparse_dcsrilu0(). It is expected that this function will be executed only + * once for a given matrix and particular operation type. Note that if the matrix + * sparsity pattern changes, the gathered information will become invalid. The analysis + * meta data can be cleared by rocsparse_csrilu0_clear(). + * + * \p rocsparse_csrilu0_analysis can share its meta data with + * rocsparse_scsrsv_analysis() and rocsparse_dcsrsv_analysis(). Selecting + * \ref rocsparse_analysis_policy_reuse policy can greatly improve computation + * performance of meta data. However, the user need to make sure that the sparsity + * pattern remains unchanged. If this cannot be assured, + * \ref rocsparse_analysis_policy_force has to be used. + * + * @param[in] + * handle handle to the rocsparse library context queue. + * @param[in] + * m number of rows of the sparse \p CSR matrix. 
+ * @param[in] + * nnz number of non-zero entries of the sparse \p CSR matrix. + * @param[in] + * descr descriptor of the sparse \p CSR matrix. + * @param[in] + * csr_val array of \p nnz elements of the sparse \p CSR matrix. + * @param[in] + * csr_row_ptr array of \p m+1 elements that point to the start of every row of the + * sparse \p CSR matrix. + * @param[in] + * csr_col_ind array of \p nnz elements containing the column indices of the sparse + * \p CSR matrix. + * @param[out] + * info structure that holds the information collected during + * the analysis step. + * @param[in] + * analysis \ref rocsparse_analysis_policy_reuse or + * \ref rocsparse_analysis_policy_force. + * @param[in] + * solve \ref rocsparse_solve_policy_auto. + * @param[in] + * temp_buffer temporary storage buffer allocated by the user. + * + * \returns \ref rocsparse_status_success the operation completed successfully.
+ * \ref rocsparse_status_invalid_handle the library context was + * not initialized.
+ * \ref rocsparse_status_invalid_size \p m or \p nnz is invalid.
+ * \ref rocsparse_status_invalid_pointer \p descr, \p csr_val, + * \p csr_row_ptr, \p csr_col_ind, \p info or \p temp_buffer pointer is + * invalid.
+ * \ref rocsparse_status_internal_error an internal error occurred.
+ * \ref rocsparse_status_not_implemented + * \p trans != \ref rocsparse_operation_none or + * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. + */ +/**@{*/ ROCSPARSE_EXPORT -rocsparse_status rocsparse_csrilu0_analysis(rocsparse_handle handle, - rocsparse_int m, - rocsparse_int nnz, - const rocsparse_mat_descr descr, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - rocsparse_mat_info info, - rocsparse_solve_policy solve, - rocsparse_analysis_policy analysis, - void* temp_buffer); - +rocsparse_status rocsparse_scsrilu0_analysis(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer); ROCSPARSE_EXPORT -rocsparse_status rocsparse_csrilu0_clear(rocsparse_mat_info info); - +rocsparse_status rocsparse_dcsrilu0_analysis(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer); +/**@}*/ +/*! \ingroup precond_module + * \brief Incomplete LU factorization with 0 fill-ins and no pivoting using \p CSR + * storage format + * + * \details + * \p rocsparse_csrilu0_clear deallocates all memory that was allocated by + * rocsparse_scsrilu0_analysis() or rocsparse_dcsrilu0_analysis(). This is especially + * useful, if memory is an issue and the analysis data is not required for further + * computation. Calling \p rocsparse_csrilu0_clear is optional. All allocated resources + * will be cleared, when the opaque \ref rocsparse_mat_info struct is destroyed using + * rocsparse_destroy_mat_info(). 
+ * + * @param[in] + * handle handle to the rocsparse library context queue. + * @param[inout] + * info structure that holds the information collected during the analysis step. + * + * \returns \ref rocsparse_status_success the operation completed successfully.
+ * \ref rocsparse_status_invalid_handle the library context was + * not initialized.
+ * \ref rocsparse_status_invalid_pointer \p info pointer is invalid.
+ * \ref rocsparse_status_memory_error the buffer holding the meta data + * could not be deallocated.
+ * \ref rocsparse_status_internal_error an internal error occurred.
+ */ +ROCSPARSE_EXPORT +rocsparse_status rocsparse_csrilu0_clear(rocsparse_handle handle, rocsparse_mat_info info); +/*! \ingroup precond_module + * \brief Incomplete LU factorization with 0 fill-ins and no pivoting using \p CSR + * storage format + * + * \details + * \p rocsparse_csrilu0 computes the incomplete LU factorization with 0 fill-ins and no + * pivoting of a sparse \f$m \times m\f$ \p CSR matrix \f$A\f$, such that + * + * \f$A \approx LU\f$ + * + * \p rocsparse_csrilu0 requires a user allocated temporary buffer. Its size is returned + * by rocsparse_scsrilu0_buffer_size() or rocsparse_dcsrilu0_buffer_size(). Furthermore, + * analysis meta data is required. It can be obtained by rocsparse_scsrilu0_analysis() + * or rocsparse_dcsrilu0_analysis(). \p rocsparse_csrilu0 reports the first zero pivot + * (either numerical or structural zero). The zero pivot status can be obtained by + * calling rocsparse_csrilu0_zero_pivot(). + * + * Note that the sparse \p CSR matrix has to be sorted. This can be achieved by calling + * rocsparse_csrsort(). + * + * @param[in] + * handle handle to the rocsparse library context queue. + * @param[in] + * m number of rows of the sparse \p CSR matrix. + * @param[in] + * nnz number of non-zero entries of the sparse \p CSR matrix. + * @param[in] + * descr descriptor of the sparse \p CSR matrix. + * @param[inout] + * csr_val array of \p nnz elements of the sparse \p CSR matrix. + * @param[in] + * csr_row_ptr array of \p m+1 elements that point to the start + * of every row of the sparse \p CSR matrix. + * @param[in] + * csr_col_ind array of \p nnz elements containing the column indices of the sparse + * \p CSR matrix. + * @param[in] + * info structure that holds the information collected during the analysis step. + * @param[in] + * policy \ref rocsparse_solve_policy_auto. + * @param[in] + * temp_buffer temporary storage buffer allocated by the user. + * + * \returns \ref rocsparse_status_success the operation completed successfully.
+ * \ref rocsparse_status_invalid_handle the library context was + * not initialized.
+ * \ref rocsparse_status_invalid_size \p m or \p nnz is invalid.
+ * \ref rocsparse_status_invalid_pointer \p descr, \p csr_val, + * \p csr_row_ptr or \p csr_col_ind pointer is invalid.
+ * \ref rocsparse_status_arch_mismatch the device is not supported.
+ * \ref rocsparse_status_internal_error an internal error occurred.
+ * \ref rocsparse_status_not_implemented + * \p trans != \ref rocsparse_operation_none or + * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. + * + * \par Example + * Consider the sparse \f$m \times m\f$ matrix \f$A\f$, stored in \p CSR + * storage format. The following example computes the incomplete LU factorization + * \f$M \approx LU\f$ and solves the preconditioned system \f$My = x\f$. + * \code{.c} + * // Create rocSPARSE handle + * rocsparse_handle handle; + * rocsparse_create_handle(&handle); + * + * // Create matrix descriptor for M + * rocsparse_mat_descr descr_M; + * rocsparse_create_mat_descr(&descr_M); + * + * // Create matrix descriptor for L + * rocsparse_mat_descr descr_L; + * rocsparse_create_mat_descr(&descr_L); + * rocsparse_set_mat_fill_mode(descr_L, rocsparse_fill_mode_lower); + * rocsparse_set_mat_diag_type(descr_L, rocsparse_diag_type_unit); + * + * // Create matrix descriptor for U + * rocsparse_mat_descr descr_U; + * rocsparse_create_mat_descr(&descr_U); + * rocsparse_set_mat_fill_mode(descr_U, rocsparse_fill_mode_upper); + * rocsparse_set_mat_diag_type(descr_U, rocsparse_diag_type_non_unit); + * + * // Create matrix info structure + * rocsparse_mat_info info; + * rocsparse_create_mat_info(&info); + * + * // Obtain required buffer size + * size_t buffer_size_M; + * size_t buffer_size_L; + * size_t buffer_size_U; + * rocsparse_dcsrilu0_buffer_size(handle, + * m, + * nnz, + * descr_M, + * csr_val, + * csr_row_ptr, + * csr_col_ind, + * info, + * &buffer_size_M); + * rocsparse_dcsrsv_buffer_size(handle, + * rocsparse_operation_none, + * m, + * nnz, + * descr_L, + * csr_val, + * csr_row_ptr, + * csr_col_ind, + * info, + * &buffer_size_L); + * rocsparse_dcsrsv_buffer_size(handle, + * rocsparse_operation_none, + * m, + * nnz, + * descr_U, + * csr_val, + * csr_row_ptr, + * csr_col_ind, + * info, + * &buffer_size_U); + * + * size_t buffer_size = max(buffer_size_M, max(buffer_size_L, buffer_size_U)); + * + * // Allocate temporary 
buffer + * void* temp_buffer; + * hipMalloc(&temp_buffer, buffer_size); + * + * // Perform analysis steps, using rocsparse_analysis_policy_reuse to improve + * // computation performance + * rocsparse_dcsrilu0_analysis(handle, + * m, + * nnz, + * descr_M, + * csr_val, + * csr_row_ptr, + * csr_col_ind, + * info, + * rocsparse_analysis_policy_reuse, + * rocsparse_solve_policy_auto, + * temp_buffer); + * rocsparse_dcsrsv_analysis(handle, + * rocsparse_operation_none, + * m, + * nnz, + * descr_L, + * csr_val, + * csr_row_ptr, + * csr_col_ind, + * info, + * rocsparse_analysis_policy_reuse, + * rocsparse_solve_policy_auto, + * temp_buffer); + * rocsparse_dcsrsv_analysis(handle, + * rocsparse_operation_none, + * m, + * nnz, + * descr_U, + * csr_val, + * csr_row_ptr, + * csr_col_ind, + * info, + * rocsparse_analysis_policy_reuse, + * rocsparse_solve_policy_auto, + * temp_buffer); + * + * // Check for zero pivot + * rocsparse_int position; + * if(rocsparse_status_zero_pivot == rocsparse_csrilu0_zero_pivot(handle, + * info, + * &position)) + * { + * printf("A has structural zero at A(%d,%d)\n", position, position); + * } + * + * // Compute incomplete LU factorization + * rocsparse_dcsrilu0(handle, + * m, + * nnz, + * descr_M, + * csr_val, + * csr_row_ptr, + * csr_col_ind, + * info, + * rocsparse_solve_policy_auto, + * temp_buffer); + * + * // Check for zero pivot + * if(rocsparse_status_zero_pivot == rocsparse_csrilu0_zero_pivot(handle, + * info, + * &position)) + * { + * printf("U has structural and/or numerical zero at U(%d,%d)\n", + * position, + * position); + * } + * + * // Solve Lz = x + * rocsparse_dcsrsv_solve(handle, + * rocsparse_operation_none, + * m, + * nnz, + * &alpha, + * descr_L, + * csr_val, + * csr_row_ptr, + * csr_col_ind, + * info, + * x, + * z, + * rocsparse_solve_policy_auto, + * temp_buffer); + * + * // Solve Uy = z + * rocsparse_dcsrsv_solve(handle, + * rocsparse_operation_none, + * m, + * nnz, + * &alpha, + * descr_U, + * csr_val, + * csr_row_ptr, + 
* csr_col_ind, + * info, + * z, + * y, + * rocsparse_solve_policy_auto, + * temp_buffer); + * + * // Clean up + * hipFree(temp_buffer); + * rocsparse_destroy_mat_info(info); + * rocsparse_destroy_mat_descr(descr_M); + * rocsparse_destroy_mat_descr(descr_L); + * rocsparse_destroy_mat_descr(descr_U); + * rocsparse_destroy_handle(handle); + * \endcode + */ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_scsrilu0(rocsparse_handle handle, rocsparse_int m, @@ -1410,7 +2180,6 @@ rocsparse_status rocsparse_scsrilu0(rocsparse_handle handle, rocsparse_solve_policy policy, void* temp_buffer); - ROCSPARSE_EXPORT rocsparse_status rocsparse_dcsrilu0(rocsparse_handle handle, rocsparse_int m, @@ -1422,18 +2191,7 @@ rocsparse_status rocsparse_dcsrilu0(rocsparse_handle handle, rocsparse_mat_info info, rocsparse_solve_policy policy, void* temp_buffer); - - - - - - - - - - - - +/**@}*/ /* * =========================================================================== @@ -1441,7 +2199,8 @@ rocsparse_status rocsparse_dcsrilu0(rocsparse_handle handle, * =========================================================================== */ -/*! \brief Convert a sparse \p CSR matrix into sparse \p COO matrix +/*! \ingroup conv_module + * \brief Convert a sparse \p CSR matrix into sparse \p COO matrix * * \details * \p rocsparse_csr2coo converts the \p CSR array containing the row offsets, that point @@ -1480,7 +2239,8 @@ rocsparse_status rocsparse_csr2coo(rocsparse_handle handle, rocsparse_int* coo_row_ind, rocsparse_index_base idx_base); -/*! \brief Convert a sparse \p CSR matrix into sparse \p CSC matrix +/*! \ingroup conv_module + * \brief Convert a sparse \p CSR matrix into sparse \p CSC matrix * * \details * \p rocsparse_csr2csc_buffer_size returns the size of the temporary storage buffer @@ -1525,7 +2285,8 @@ rocsparse_status rocsparse_csr2csc_buffer_size(rocsparse_handle handle, rocsparse_action copy_values, size_t* buffer_size); -/*! 
\brief Convert a sparse \p CSR matrix into sparse \p CSC matrix +/*! \ingroup conv_module + * \brief Convert a sparse \p CSR matrix into sparse \p CSC matrix * * \details * \p rocsparse_csr2csc converts a \p CSR matrix info a \p CSC matrix. The resulting @@ -1580,7 +2341,7 @@ rocsparse_status rocsparse_csr2csc_buffer_size(rocsparse_handle handle, * \ref rocsparse_status_arch_mismatch the device is not supported.
* \ref rocsparse_status_internal_error an internal error occurred. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_scsr2csc(rocsparse_handle handle, rocsparse_int m, @@ -1610,9 +2371,10 @@ rocsparse_status rocsparse_dcsr2csc(rocsparse_handle handle, rocsparse_action copy_values, rocsparse_index_base idx_base, void* temp_buffer); -///@} +/**@}*/ -/*! \brief Convert a sparse \p CSR matrix into sparse \p ELL matrix +/*! \ingroup conv_module + * \brief Convert a sparse \p CSR matrix into sparse \p ELL matrix * * \details * \p rocsparse_csr2ell_width computes the maximum of the per row non-zero elements @@ -1653,7 +2415,8 @@ rocsparse_status rocsparse_csr2ell_width(rocsparse_handle handle, const rocsparse_mat_descr ell_descr, rocsparse_int* ell_width); -/*! \brief Convert a sparse \p CSR matrix into sparse \p ELL matrix +/*! \ingroup conv_module + * \brief Convert a sparse \p CSR matrix into sparse \p ELL matrix * * \details * \p rocsparse_csr2ell converts a \p CSR matrix into an \p ELL matrix. It is assumed, @@ -1697,7 +2460,7 @@ rocsparse_status rocsparse_csr2ell_width(rocsparse_handle handle, * \ref rocsparse_status_not_implemented * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_scsr2ell(rocsparse_handle handle, rocsparse_int m, @@ -1747,9 +2510,10 @@ rocsparse_status rocsparse_scsr2ell(rocsparse_handle handle, rocsparse_double_complex* ell_val, rocsparse_int* ell_col_ind); */ -///@} +/**@}*/ -/*! \brief Convert a sparse \p CSR matrix into sparse \p HYB matrix +/*! \ingroup conv_module + * \brief Convert a sparse \p CSR matrix into sparse \p HYB matrix * * \details * \p rocsparse_csr2hyb converts a \p CSR matrix into a \p HYB matrix. It is assumed @@ -1798,7 +2562,7 @@ rocsparse_status rocsparse_scsr2ell(rocsparse_handle handle, * \ref rocsparse_status_not_implemented * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. 
*/ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_scsr2hyb(rocsparse_handle handle, rocsparse_int m, @@ -1847,9 +2611,10 @@ rocsparse_status rocsparse_dcsr2hyb(rocsparse_handle handle, rocsparse_int user_ell_width, rocsparse_hyb_partition partition_type); */ -///@} +/**@}*/ -/*! \brief Convert a sparse \p COO matrix into sparse \p CSR matrix +/*! \ingroup conv_module + * \brief Convert a sparse \p COO matrix into sparse \p CSR matrix * * \details * \p rocsparse_coo2csr converts the \p COO array containing the row indices into a @@ -1889,7 +2654,8 @@ rocsparse_status rocsparse_coo2csr(rocsparse_handle handle, rocsparse_int* csr_row_ptr, rocsparse_index_base idx_base); -/*! \brief Convert a sparse \p ELL matrix into sparse \p CSR matrix +/*! \ingroup conv_module + * \brief Convert a sparse \p ELL matrix into sparse \p CSR matrix * * \details * \p rocsparse_ell2csr_nnz computes the total \p CSR non-zero elements and the \p CSR @@ -1942,7 +2708,8 @@ rocsparse_status rocsparse_ell2csr_nnz(rocsparse_handle handle, rocsparse_int* csr_row_ptr, rocsparse_int* csr_nnz); -/*! \brief Convert a sparse \p ELL matrix into sparse \p CSR matrix +/*! \ingroup conv_module + * \brief Convert a sparse \p ELL matrix into sparse \p CSR matrix * * \details * \p rocsparse_ell2csr converts an \p ELL matrix into a \p CSR matrix. It is assumed @@ -1989,7 +2756,7 @@ rocsparse_status rocsparse_ell2csr_nnz(rocsparse_handle handle, * \ref rocsparse_status_not_implemented * \ref rocsparse_matrix_type != \ref rocsparse_matrix_type_general. */ -///@{ +/**@{*/ ROCSPARSE_EXPORT rocsparse_status rocsparse_sell2csr(rocsparse_handle handle, rocsparse_int m, @@ -2042,9 +2809,10 @@ rocsparse_status rocsparse_zell2csr(rocsparse_handle handle, const rocsparse_int* csr_row_ptr, rocsparse_int* csr_col_ind); */ -///@} +/**@}*/ -/*! \brief Create the identity map +/*! 
\ingroup conv_module + * \brief Create the identity map * * \details * \p rocsparse_create_identity_permutation stores the identity map in \p p, such that @@ -2074,7 +2842,8 @@ ROCSPARSE_EXPORT rocsparse_status rocsparse_create_identity_permutation(rocsparse_handle handle, rocsparse_int n, rocsparse_int* p); -/*! \brief Sort a sparse \p CSR matrix +/*! \ingroup conv_module + * \brief Sort a sparse \p CSR matrix * * \details * \p rocsparse_csrsort_buffer_size returns the size of the temporary storage buffer @@ -2115,7 +2884,8 @@ rocsparse_status rocsparse_csrsort_buffer_size(rocsparse_handle handle, const rocsparse_int* csr_col_ind, size_t* buffer_size); -/*! \brief Sort a sparse \p CSR matrix +/*! \ingroup conv_module + * \brief Sort a sparse \p CSR matrix * * \details * \p rocsparse_csrsort sorts a matrix in \p CSR format. The sorted permutation vector @@ -2167,30 +2937,30 @@ rocsparse_status rocsparse_csrsort_buffer_size(rocsparse_handle handle, * rocsparse_int m = 3; * rocsparse_int n = 3; * rocsparse_int nnz = 9; - * - * csr_row_ptr[m + 1] = {0, 3, 6, 9} // device memory + * + * csr_row_ptr[m + 1] = {0, 3, 6, 9}; // device memory * csr_col_ind[nnz] = {2, 0, 1, 0, 1, 2, 0, 2, 1}; // device memory * csr_val[nnz] = {3, 1, 2, 4, 5, 6, 7, 9, 8}; // device memory - * + * * // Allocate temporary buffer * size_t buffer_size = 0; * void* temp_buffer = NULL; * rocsparse_csrsort_buffer_size(handle, m, n, nnz, csr_row_ptr, csr_col_ind, &buffer_size); * hipMalloc(&temp_buffer, sizeof(char) * buffer_size); - * + * * // Create permutation vector perm as the identity map * rocsparse_int* perm = NULL; * hipMalloc((void**)&perm, sizeof(rocsparse_int) * nnz); * rocsparse_create_identity_permutation(handle, nnz, perm); - * + * * // Sort the CSR matrix * rocsparse_csrsort(handle, m, n, nnz, descr, csr_row_ptr, csr_col_ind, perm, temp_buffer); - * + * * // Gather sorted csr_val array * float* csr_val_sorted = NULL; * hipMalloc((void**)&csr_val_sorted, sizeof(float) * nnz); * 
rocsparse_sgthr(handle, nnz, csr_val, csr_val_sorted, perm, rocsparse_index_base_zero); - * + * * // Clean up * hipFree(temp_buffer); * hipFree(perm); @@ -2208,7 +2978,8 @@ rocsparse_status rocsparse_csrsort(rocsparse_handle handle, rocsparse_int* perm, void* temp_buffer); -/*! \brief Sort a sparse \p COO matrix +/*! \ingroup conv_module + * \brief Sort a sparse \p COO matrix * * \details * coosort_buffer_size returns the size of the temporary storage buffer @@ -2250,7 +3021,8 @@ rocsparse_status rocsparse_coosort_buffer_size(rocsparse_handle handle, const rocsparse_int* coo_col_ind, size_t* buffer_size); -/*! \brief Sort a sparse \p COO matrix by row +/*! \ingroup conv_module + * \brief Sort a sparse \p COO matrix by row * * \details * \p rocsparse_coosort_by_row sorts a matrix in \p COO format by row. The sorted @@ -2301,7 +3073,8 @@ rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, rocsparse_int* perm, void* temp_buffer); -/*! \brief Sort a sparse \p COO matrix by column +/*! \ingroup conv_module + * \brief Sort a sparse \p COO matrix by column * * \details * \p rocsparse_coosort_by_column sorts a matrix in \p COO format by column. The sorted diff --git a/library/include/rocsparse-types.h b/library/include/rocsparse-types.h index 4d3b30db..2addcc17 100644 --- a/library/include/rocsparse-types.h +++ b/library/include/rocsparse-types.h @@ -13,42 +13,100 @@ #include -/*! \brief Specifies whether int32 or int64 is used. */ +/*! \ingroup types_module + * \brief Specifies whether int32 or int64 is used. + */ #if defined(rocsparse_ILP64) typedef int64_t rocsparse_int; #else typedef int32_t rocsparse_int; #endif - - -/*! \brief Handle to the rocSPARSE library context queue. */ +/*! \ingroup types_module + * \brief Handle to the rocSPARSE library context queue. + * + * \details + * The rocSPARSE handle is a structure holding the rocSPARSE library context. 
It must + * be initialized using rocsparse_create_handle() and the returned handle must be + * passed to all subsequent library function calls. It should be destroyed at the end + * using rocsparse_destroy_handle(). + */ typedef struct _rocsparse_handle* rocsparse_handle; -/*! \brief Descriptor of the matrix. */ + +/*! \ingroup types_module + * \brief Descriptor of the matrix. + * + * \details + * The rocSPARSE matrix descriptor is a structure holding all properties of a matrix. + * It must be initialized using rocsparse_create_mat_descr() and the returned descriptor + * must be passed to all subsequent library calls that involve the matrix. It should be + * destroyed at the end using rocsparse_destroy_mat_descr(). + */ typedef struct _rocsparse_mat_descr* rocsparse_mat_descr; -/*! \brief HYB matrix storage format. */ + +/*! \ingroup types_module + * \brief HYB matrix storage format. + * + * \details + * The rocSPARSE HYB matrix structure holds the HYB matrix. It must be initialized using + * rocsparse_create_hyb_mat() and the returned HYB matrix must be passed to all + * subsequent library calls that involve the matrix. It should be destroyed at the end + * using rocsparse_destroy_hyb_mat(). TODO For more details on the HYB format, see HYB storage + * format. + */ typedef struct _rocsparse_hyb_mat* rocsparse_hyb_mat; -/*! \brief Info structure to hold all matrix meta data. */ + +/*! \ingroup types_module + * \brief Info structure to hold all matrix meta data. + * + * \details + * The rocSPARSE matrix info is a structure holding all matrix information that is + * gathered during analysis routines. It must be initialized using + * rocsparse_create_mat_info() and the returned info structure must be passed to all + * subsequent library calls that require additional matrix information. It should be + * destroyed at the end using rocsparse_destroy_mat_info(). + */ typedef struct _rocsparse_mat_info* rocsparse_mat_info; #ifdef __cplusplus extern "C" { #endif -/*! 
\brief Specify whether the matrix is to be transposed or not. */ +/*! \ingroup types_module + * \brief Specify whether the matrix is to be transposed or not. + * + * \details + * The \ref rocsparse_operation indicates the operation performed with the given matrix. + */ typedef enum rocsparse_operation_ { rocsparse_operation_none = 111, /**< Operate with matrix. */ rocsparse_operation_transpose = 112, /**< Operate with transpose. */ rocsparse_operation_conjugate_transpose = 113 /**< Operate with conj. transpose. */ } rocsparse_operation; -/*! \brief Specify the matrix index base. */ +/*! \ingroup types_module + * \brief Specify the matrix index base. + * + * \details + * The \ref rocsparse_index_base indicates the index base of the indices. For a + * given \ref rocsparse_mat_descr, the \ref rocsparse_index_base can be set using + * rocsparse_set_mat_index_base(). The current \ref rocsparse_index_base of a matrix + * can be obtained by rocsparse_get_mat_index_base(). + */ typedef enum rocsparse_index_base_ { rocsparse_index_base_zero = 0, /**< zero based indexing. */ rocsparse_index_base_one = 1 /**< one based indexing. */ } rocsparse_index_base; -/*! \brief Specify the matrix type. */ +/*! \ingroup types_module + * \brief Specify the matrix type. + * + * \details + * The \ref rocsparse_matrix_type indices the type of a matrix. For a given + * \ref rocsparse_mat_descr, the \ref rocsparse_matrix_type can be set using + * rocsparse_set_mat_type(). The current \ref rocsparse_matrix_type of a matrix can be + * obtained by rocsparse_get_mat_type(). + */ typedef enum rocsparse_matrix_type_ { rocsparse_matrix_type_general = 0, /**< general matrix type. */ rocsparse_matrix_type_symmetric = 1, /**< symmetric matrix type. */ @@ -56,56 +114,120 @@ typedef enum rocsparse_matrix_type_ { rocsparse_matrix_type_triangular = 3 /**< triangular matrix type. */ } rocsparse_matrix_type; -/*! \brief Indicates if the diagonal entries are unity. */ +/*! 
\ingroup types_module + * \brief Indicates if the diagonal entries are unity. + * + * \details + * The \ref rocsparse_diag_type indicates whether the diagonal entries of a matrix are + * unity or not. If \ref rocsparse_diag_type_unit is specified, all present diagonal + * values will be ignored. For a given \ref rocsparse_mat_descr, the + * \ref rocsparse_diag_type can be set using rocsparse_set_mat_diag_type(). The current + * \ref rocsparse_diag_type of a matrix can be obtained by + * rocsparse_get_mat_diag_type(). + */ typedef enum rocsparse_diag_type_ { rocsparse_diag_type_non_unit = 0, /**< diagonal entries are non-unity. */ rocsparse_diag_type_unit = 1 /**< diagonal entries are unity */ } rocsparse_diag_type; -/*! \brief Specify the matrix fill mode. */ +/*! \ingroup types_module + * \brief Specify the matrix fill mode. + * + * \details + * The \ref rocsparse_fill_mode indicates whether the lower or the upper part is stored + * in a sparse triangular matrix. For a given \ref rocsparse_mat_descr, the + * \ref rocsparse_fill_mode can be set using rocsparse_set_mat_fill_mode(). The current + * \ref rocsparse_fill_mode of a matrix can be obtained by + * rocsparse_get_mat_fill_mode(). + */ typedef enum rocsparse_fill_mode_ { rocsparse_fill_mode_lower = 0, /**< lower triangular part is stored. */ rocsparse_fill_mode_upper = 1 /**< upper triangular part is stored. */ } rocsparse_fill_mode; -/*! \brief Specify where the operation is performed on. */ +/*! \ingroup types_module + * \brief Specify where the operation is performed on. + * + * \details + * The \ref rocsparse_action indicates whether the operation is performed on the full + * matrix, or only on the sparsity pattern of the matrix. + */ typedef enum rocsparse_action_ { rocsparse_action_symbolic = 0, /**< Operate only on indices. */ rocsparse_action_numeric = 1 /**< Operate on data and indices. */ } rocsparse_action; -/*! \brief HYB matrix partitioning type. */ +/*! 
\ingroup types_module + * \brief HYB matrix partitioning type. + * + * \details + * The \ref rocsparse_hyb_partition type indicates how the hybrid format partitioning + * between COO and ELL storage formats is performed. + */ typedef enum rocsparse_hyb_partition_ { rocsparse_hyb_partition_auto = 0, /**< automatically decide on ELL nnz per row. */ rocsparse_hyb_partition_user = 1, /**< user given ELL nnz per row. */ rocsparse_hyb_partition_max = 2 /**< max ELL nnz per row, no COO part. */ } rocsparse_hyb_partition; -/*! \brief Specify policy in triangular solvers and factorizations. */ -typedef enum rocsparse_solve_policy_ { - rocsparse_solve_policy_auto = 0 /**< automatically decide on level information. */ -} rocsparse_solve_policy; - -/*! \brief Specify policy in analysis functions. */ +/*! \ingroup types_module + * \brief Specify policy in analysis functions. + * + * \details + * The \ref rocsparse_analysis_policy specifies whether gathered analysis data should be + * re-used or not. If meta data from a previous e.g. rocsparse_csrilu0_analysis() call + * is available, it can be re-used for subsequent calls to e.g. + * rocsparse_csrsv_analysis() and greatly improve performance of the analysis function. + */ typedef enum rocsparse_analysis_policy_ { rocsparse_analysis_policy_reuse = 0, /**< try to re-use meta data. */ rocsparse_analysis_policy_force = 1 /**< force to re-build meta data. */ } rocsparse_analysis_policy; -/*! \brief Indicates if the pointer is device pointer or host pointer. */ +/*! \ingroup types_module + * \brief Specify policy in triangular solvers and factorizations. + * + * \details + * This is a placeholder. + */ +typedef enum rocsparse_solve_policy_ { + rocsparse_solve_policy_auto = 0 /**< automatically decide on level information. */ +} rocsparse_solve_policy; + +/*! \ingroup types_module + * \brief Indicates if the pointer is device pointer or host pointer. 
+ * + * \details + * The \ref rocsparse_pointer_mode indicates whether scalar values are passed by + * reference on the host or device. The \ref rocsparse_pointer_mode can be changed by + * rocsparse_set_pointer_mode(). The currently used pointer mode can be obtained by + * rocsparse_get_pointer_mode(). + */ typedef enum rocsparse_pointer_mode_ { rocsparse_pointer_mode_host = 0, /**< scalar pointers are in host memory. */ rocsparse_pointer_mode_device = 1 /**< scalar pointers are in device memory. */ } rocsparse_pointer_mode; -/*! \brief Indicates if layer is active with bitmask. */ +/*! \ingroup types_module + * \brief Indicates if layer is active with bitmask. + * + * \details + * The \ref rocsparse_layer_mode bit mask indicates the logging characteristics. See + * TODO rocsparse_logging for more information. + */ typedef enum rocsparse_layer_mode { - rocsparse_layer_mode_none = 0b0000000000, /**< layer is not active. */ - rocsparse_layer_mode_log_trace = 0b0000000001, /**< layer is in logging mode. */ - rocsparse_layer_mode_log_bench = 0b0000000010, /**< layer is in benchmarking mode. */ + rocsparse_layer_mode_none = 0x0, /**< layer is not active. */ + rocsparse_layer_mode_log_trace = 0x1, /**< layer is in logging mode. */ + rocsparse_layer_mode_log_bench = 0x2 /**< layer is in benchmarking mode. */ } rocsparse_layer_mode; -/*! \brief rocsparse status codes definition. */ +/*! \ingroup types_module + * \brief List of rocsparse status code definitions. + * + * \details + * This is a list of the \ref rocsparse_status types that are used by the rocSPARSE + * library. + */ typedef enum rocsparse_status_ { rocsparse_status_success = 0, /**< success. */ rocsparse_status_invalid_handle = 1, /**< handle not initialized, invalid or null. */ @@ -115,7 +237,8 @@ typedef enum rocsparse_status_ { rocsparse_status_memory_error = 5, /**< failed memory allocation, copy, dealloc. */ rocsparse_status_internal_error = 6, /**< other internal library failure.
*/ rocsparse_status_invalid_value = 7, /**< invalid value parameter. */ - rocsparse_status_arch_mismatch = 8 /**< device arch is not supported. */ + rocsparse_status_arch_mismatch = 8, /**< device arch is not supported. */ + rocsparse_status_zero_pivot = 9 /**< encountered zero pivot. */ } rocsparse_status; #ifdef __cplusplus diff --git a/library/src/conversion/rocsparse_coosort.cpp b/library/src/conversion/rocsparse_coosort.cpp index f2610465..0c1fee5a 100644 --- a/library/src/conversion/rocsparse_coosort.cpp +++ b/library/src/conversion/rocsparse_coosort.cpp @@ -230,8 +230,8 @@ extern "C" rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, // Copy sorted rows, if stored in buffer if(output != coo_row_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy( - coo_row_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync( + coo_row_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice, stream)); } // Obtain segments for segmented sort by columns @@ -375,15 +375,15 @@ extern "C" rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, // Copy sorted columns, if stored in buffer if(output != coo_col_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy( - coo_col_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync( + coo_col_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice, stream)); } // Copy reordered permutation, if stored in buffer if(mapping != perm) { - RETURN_IF_HIP_ERROR( - hipMemcpy(perm, mapping, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync( + perm, mapping, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice, stream)); } } else @@ -414,8 +414,8 @@ extern "C" rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, // Copy sorted rows, if stored in buffer if(output != coo_row_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy( - coo_row_ind, output, sizeof(rocsparse_int) * nnz, 
hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync( + coo_row_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice, stream)); } // Obtain segments for segmented sort by columns @@ -474,8 +474,8 @@ extern "C" rocsparse_status rocsparse_coosort_by_row(rocsparse_handle handle, // Copy sorted columns, if stored in buffer if(output != coo_col_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy( - coo_col_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync( + coo_col_ind, output, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice, stream)); } } diff --git a/library/src/conversion/rocsparse_csr2csc.hpp b/library/src/conversion/rocsparse_csr2csc.hpp index b2eb4ad5..ecb81611 100644 --- a/library/src/conversion/rocsparse_csr2csc.hpp +++ b/library/src/conversion/rocsparse_csr2csc.hpp @@ -128,8 +128,8 @@ rocsparse_status rocsparse_csr2csc_template(rocsparse_handle handle, void* tmp_hipcub = reinterpret_cast(ptr); // Load CSR column indices into work1 buffer - RETURN_IF_HIP_ERROR( - hipMemcpy(tmp_work1, csr_col_ind, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync( + tmp_work1, csr_col_ind, sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice, stream)); if(copy_values == rocsparse_action_symbolic) { @@ -156,8 +156,11 @@ rocsparse_status rocsparse_csr2csc_template(rocsparse_handle handle, // Copy csc_row_ind if not current if(vals.Current() != csc_row_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy( - csc_row_ind, vals.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync(csc_row_ind, + vals.Current(), + sizeof(rocsparse_int) * nnz, + hipMemcpyDeviceToDevice, + stream)); } } else diff --git a/library/src/conversion/rocsparse_csr2ell.cpp b/library/src/conversion/rocsparse_csr2ell.cpp index 40efe439..0dae13ca 100644 --- a/library/src/conversion/rocsparse_csr2ell.cpp +++ b/library/src/conversion/rocsparse_csr2ell.cpp @@ -79,12 
+79,14 @@ extern "C" rocsparse_status rocsparse_csr2ell_width(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } + hipStream_t stream = handle->stream; + // Quick return if possible if(m == 0) { if(handle->pointer_mode == rocsparse_pointer_mode_device) { - RETURN_IF_HIP_ERROR(hipMemset(ell_width, 0, sizeof(rocsparse_int))); + RETURN_IF_HIP_ERROR(hipMemsetAsync(ell_width, 0, sizeof(rocsparse_int), stream)); } else { @@ -93,8 +95,6 @@ extern "C" rocsparse_status rocsparse_csr2ell_width(rocsparse_handle handle, return rocsparse_status_success; } - hipStream_t stream = handle->stream; - // Determine ELL width #define CSR2ELL_DIM 512 @@ -126,8 +126,8 @@ extern "C" rocsparse_status rocsparse_csr2ell_width(rocsparse_handle handle, // Copy ELL width back to host, if handle says so if(handle->pointer_mode == rocsparse_pointer_mode_device) { - RETURN_IF_HIP_ERROR( - hipMemcpy(ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync( + ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToDevice, stream)); } else { @@ -135,6 +135,9 @@ extern "C" rocsparse_status rocsparse_csr2ell_width(rocsparse_handle handle, hipMemcpy(ell_width, workspace, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); } + // Free workspace + RETURN_IF_HIP_ERROR(hipFree(workspace)); + return rocsparse_status_success; } diff --git a/library/src/conversion/rocsparse_csrsort.cpp b/library/src/conversion/rocsparse_csrsort.cpp index 7c5e2f14..8fd354c8 100644 --- a/library/src/conversion/rocsparse_csrsort.cpp +++ b/library/src/conversion/rocsparse_csrsort.cpp @@ -345,13 +345,19 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, } if(keys.current() != csr_col_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy( - csr_col_ind, keys.current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync(csr_col_ind, + keys.current(), + sizeof(rocsparse_int) * nnz, + hipMemcpyDeviceToDevice, + 
stream)); } if(vals.current() != perm) { - RETURN_IF_HIP_ERROR(hipMemcpy( - perm, vals.current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync(perm, + vals.current(), + sizeof(rocsparse_int) * nnz, + hipMemcpyDeviceToDevice, + stream)); } #elif defined(__HIP_PLATFORM_NVCC__) hipcub::DoubleBuffer keys(csr_col_ind, tmp_cols); @@ -361,13 +367,19 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, tmp_rocprim, size, keys, vals, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); if(keys.Current() != csr_col_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy( - csr_col_ind, keys.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync(csr_col_ind, + keys.Current(), + sizeof(rocsparse_int) * nnz, + hipMemcpyDeviceToDevice, + stream)); } if(vals.Current() != perm) { - RETURN_IF_HIP_ERROR(hipMemcpy( - perm, vals.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync(perm, + vals.Current(), + sizeof(rocsparse_int) * nnz, + hipMemcpyDeviceToDevice, + stream)); } #endif } @@ -408,8 +420,11 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, } if(keys.current() != csr_col_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy( - csr_col_ind, keys.current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync(csr_col_ind, + keys.current(), + sizeof(rocsparse_int) * nnz, + hipMemcpyDeviceToDevice, + stream)); } #elif defined(__HIP_PLATFORM_NVCC__) hipcub::DoubleBuffer keys(csr_col_ind, tmp_cols); @@ -418,8 +433,11 @@ extern "C" rocsparse_status rocsparse_csrsort(rocsparse_handle handle, tmp_rocprim, size, keys, nnz, m, offsets, offsets + 1, startbit, endbit, stream)); if(keys.Current() != csr_col_ind) { - RETURN_IF_HIP_ERROR(hipMemcpy( - csr_col_ind, keys.Current(), sizeof(rocsparse_int) * nnz, hipMemcpyDeviceToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpyAsync(csr_col_ind, + 
keys.Current(), + sizeof(rocsparse_int) * nnz, + hipMemcpyDeviceToDevice, + stream)); } #endif } diff --git a/library/src/conversion/rocsparse_ell2csr.cpp b/library/src/conversion/rocsparse_ell2csr.cpp index 33c8b28d..5593f128 100644 --- a/library/src/conversion/rocsparse_ell2csr.cpp +++ b/library/src/conversion/rocsparse_ell2csr.cpp @@ -96,12 +96,14 @@ extern "C" rocsparse_status rocsparse_ell2csr_nnz(rocsparse_handle handle, return rocsparse_status_invalid_pointer; } + hipStream_t stream = handle->stream; + // Quick return if possible if(m == 0 || n == 0 || ell_width == 0) { if(handle->pointer_mode == rocsparse_pointer_mode_device) { - RETURN_IF_HIP_ERROR(hipMemset(csr_nnz, 0, sizeof(rocsparse_int))); + RETURN_IF_HIP_ERROR(hipMemsetAsync(csr_nnz, 0, sizeof(rocsparse_int), stream)); } else { @@ -110,8 +112,6 @@ extern "C" rocsparse_status rocsparse_ell2csr_nnz(rocsparse_handle handle, return rocsparse_status_success; } - hipStream_t stream = handle->stream; - // Count nnz per row #define ELL2CSR_DIM 256 dim3 ell2csr_blocks((m + 1) / ELL2CSR_DIM + 1); @@ -146,26 +146,42 @@ extern "C" rocsparse_status rocsparse_ell2csr_nnz(rocsparse_handle handle, RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum( d_temp_storage, temp_storage_bytes, csr_row_ptr, csr_row_ptr, m + 1)); - // Free hipcub buffer - RETURN_IF_HIP_ERROR(hipFree(d_temp_storage)); - // Extract and adjust nnz according to index base - rocsparse_int nnz; - RETURN_IF_HIP_ERROR( - hipMemcpy(&nnz, csr_row_ptr + m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + if(csr_descr->base == rocsparse_index_base_one) + { + rocsparse_int nnz; + RETURN_IF_HIP_ERROR( + hipMemcpy(&nnz, csr_row_ptr + m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); - nnz -= csr_descr->base; + nnz -= csr_descr->base; - // Set nnz - if(handle->pointer_mode == rocsparse_pointer_mode_device) - { - RETURN_IF_HIP_ERROR(hipMemcpy(csr_nnz, &nnz, sizeof(rocsparse_int), hipMemcpyHostToDevice)); + if(handle->pointer_mode == 
rocsparse_pointer_mode_device) + { + RETURN_IF_HIP_ERROR( + hipMemcpy(csr_nnz, &nnz, sizeof(rocsparse_int), hipMemcpyHostToDevice)); + } + else + { + *csr_nnz = nnz; + } } else { - *csr_nnz = nnz; + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + RETURN_IF_HIP_ERROR(hipMemcpyAsync( + csr_nnz, csr_row_ptr + m, sizeof(rocsparse_int), hipMemcpyDeviceToDevice, stream)); + } + else + { + RETURN_IF_HIP_ERROR( + hipMemcpy(csr_nnz, csr_row_ptr + m, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + } } + // Free hipcub buffer + RETURN_IF_HIP_ERROR(hipFree(d_temp_storage)); + return rocsparse_status_success; } diff --git a/library/src/handle.cpp b/library/src/handle.cpp index 5810dd8d..8323c917 100644 --- a/library/src/handle.cpp +++ b/library/src/handle.cpp @@ -32,6 +32,26 @@ _rocsparse_handle::_rocsparse_handle() layer_mode = (rocsparse_layer_mode)(atoi(str_layer_mode)); } + // Allocating small device buffer + rocsparse_int nthreads = properties.maxThreadsPerBlock; + rocsparse_int nprocs = properties.multiProcessorCount; + rocsparse_int nblocks = (nprocs * nthreads - 1) / 128 + 1; + rocsparse_int nwfs = nblocks * (128 / properties.warpSize); + + size_t size = (((sizeof(rocsparse_int) + 16) * nwfs - 1) / 256 + 1) * 256; + + THROW_IF_HIP_ERROR(hipMalloc(&buffer, size)); + + // Device one + THROW_IF_HIP_ERROR(hipMalloc(&sone, sizeof(float))); + THROW_IF_HIP_ERROR(hipMalloc(&done, sizeof(double))); + + float hsone = 1.0f; + double hdone = 1.0; + + THROW_IF_HIP_ERROR(hipMemcpy(sone, &hsone, sizeof(float), hipMemcpyHostToDevice)); + THROW_IF_HIP_ERROR(hipMemcpy(done, &hdone, sizeof(double), hipMemcpyHostToDevice)); + // Open log file if(layer_mode & rocsparse_layer_mode_log_trace) { @@ -50,6 +70,10 @@ _rocsparse_handle::_rocsparse_handle() ******************************************************************************/ _rocsparse_handle::~_rocsparse_handle() { + PRINT_IF_HIP_ERROR(hipFree(buffer)); + PRINT_IF_HIP_ERROR(hipFree(sone)); + 
PRINT_IF_HIP_ERROR(hipFree(done)); + // Close log files if(log_trace_ofs.is_open()) { @@ -90,7 +114,7 @@ rocsparse_status _rocsparse_handle::get_stream(hipStream_t* user_stream) const * \brief rocsparse_csrmv_info is a structure holding the rocsparse csrmv info * data gathered during csrmv_analysis. It must be initialized using the * rocsparse_create_csrmv_info() routine. It should be destroyed at the end - * rocsparse_destroy_csrmv_info(). + * using rocsparse_destroy_csrmv_info(). *******************************************************************************/ rocsparse_status rocsparse_create_csrmv_info(rocsparse_csrmv_info* info) { @@ -141,19 +165,12 @@ rocsparse_status rocsparse_destroy_csrmv_info(rocsparse_csrmv_info info) return rocsparse_status_success; } - - - - - - - - - - - - - +/******************************************************************************** + * \brief rocsparse_csrtr_info is a structure holding the rocsparse csrsv and + * csrilu0 data gathered during csrsv_analysis and csrilu0_analysis. It must be + * initialized using the rocsparse_create_csrtr_info() routine. It should be + * destroyed at the end using rocsparse_destroy_csrtr_info(). + *******************************************************************************/ rocsparse_status rocsparse_create_csrtr_info(rocsparse_csrtr_info* info) { if(info == nullptr) @@ -175,6 +192,9 @@ rocsparse_status rocsparse_create_csrtr_info(rocsparse_csrtr_info* info) } } +/******************************************************************************** + * \brief Destroy csrtr info.
+ *******************************************************************************/ rocsparse_status rocsparse_destroy_csrtr_info(rocsparse_csrtr_info info) { if(info == nullptr) @@ -195,6 +215,12 @@ rocsparse_status rocsparse_destroy_csrtr_info(rocsparse_csrtr_info info) info->csr_diag_ind = nullptr; } + if(info->zero_pivot != nullptr) + { + RETURN_IF_HIP_ERROR(hipFree(info->zero_pivot)); + info->zero_pivot = nullptr; + } + // Destruct try { diff --git a/library/src/include/handle.h b/library/src/include/handle.h index 48a32f9d..5fc0c792 100644 --- a/library/src/include/handle.h +++ b/library/src/include/handle.h @@ -27,14 +27,14 @@ typedef struct _rocsparse_csrtr_info* rocsparse_csrtr_info; *******************************************************************************/ struct _rocsparse_handle { - // Constructor + // constructor _rocsparse_handle(); - // Destructor + // destructor ~_rocsparse_handle(); - // Set stream + // set stream rocsparse_status set_stream(hipStream_t user_stream); - // Get stream + // get stream rocsparse_status get_stream(hipStream_t* user_stream) const; // device id @@ -49,6 +49,11 @@ struct _rocsparse_handle rocsparse_pointer_mode pointer_mode = rocsparse_pointer_mode_host; // logging mode rocsparse_layer_mode layer_mode; + // device buffer + void* buffer; + // device one + float* sone; + double* done; // logging streams std::ofstream log_trace_ofs; @@ -66,13 +71,13 @@ struct _rocsparse_handle *******************************************************************************/ struct _rocsparse_mat_descr { - // Matrix type + // matrix type rocsparse_matrix_type type = rocsparse_matrix_type_general; - // Fill mode + // fill mode rocsparse_fill_mode fill_mode = rocsparse_fill_mode_lower; - // Diagonal type + // diagonal type rocsparse_diag_type diag_type = rocsparse_diag_type_non_unit; - // Index base + // index base rocsparse_index_base base = rocsparse_index_base_zero; }; @@ -149,7 +154,7 @@ struct _rocsparse_csrmv_info * \brief 
rocsparse_csrmv_info is a structure holding the rocsparse csrmv info * data gathered during csrmv_analysis. It must be initialized using the * rocsparse_create_csrmv_info() routine. It should be destroyed at the end - * rocsparse_destroy_csrmv_info(). + * using rocsparse_destroy_csrmv_info(). *******************************************************************************/ rocsparse_status rocsparse_create_csrmv_info(rocsparse_csrmv_info* info); @@ -158,20 +163,23 @@ rocsparse_status rocsparse_create_csrmv_info(rocsparse_csrmv_info* info); *******************************************************************************/ rocsparse_status rocsparse_destroy_csrmv_info(rocsparse_csrmv_info info); - - - - - struct _rocsparse_csrtr_info { + // maximum depth rocsparse_int max_depth; + // total number of spin loops unsigned long long total_spin; + // maximum non-zero entries of a single row rocsparse_int max_nnz; + // host array to hold number of rows per level std::vector rows_per_level; + // device array to hold row permutation rocsparse_int* row_map = nullptr; + // device array to hold pointer to diagonal entry rocsparse_int* csr_diag_ind = nullptr; + // device pointer to hold zero pivot + rocsparse_int* zero_pivot = nullptr; // some data to verify correct execution rocsparse_int m; @@ -181,14 +189,19 @@ struct _rocsparse_csrtr_info const rocsparse_int* csr_col_ind; }; +/******************************************************************************** + * \brief rocsparse_csrtr_info is a structure holding the rocsparse csrsv and + * csrilu0 data gathered during csrsv_analysis and csrilu0_analysis. It must be + * initialized using the rocsparse_create_csrtr_info() routine. It should be + * destroyed at the end using rocsparse_destroy_csrtr_info(). 
+ *******************************************************************************/ rocsparse_status rocsparse_create_csrtr_info(rocsparse_csrtr_info* info); +/******************************************************************************** + * \brief Destroy csrtr info. + *******************************************************************************/ rocsparse_status rocsparse_destroy_csrtr_info(rocsparse_csrtr_info info); - - - - /******************************************************************************** * \brief ELL format indexing *******************************************************************************/ diff --git a/library/src/include/utility.h b/library/src/include/utility.h index 9843a24e..de2c2fff 100644 --- a/library/src/include/utility.h +++ b/library/src/include/utility.h @@ -22,6 +22,17 @@ static inline rocsparse_int rocsparse_clz(rocsparse_int n) { return 64 - __built static inline rocsparse_int rocsparse_clz(rocsparse_int n) { return 32 - __builtin_clz(n); } #endif +// Return one on the device +static inline void rocsparse_one(const rocsparse_handle handle, float** one) +{ + *one = handle->sone; +} + +static inline void rocsparse_one(const rocsparse_handle handle, double** one) +{ + *one = handle->done; +} + // if trace logging is turned on with // (handle->layer_mode & rocsparse_layer_mode_log_trace) == true // then diff --git a/library/src/level1/rocsparse_doti.hpp b/library/src/level1/rocsparse_doti.hpp index d3b9e781..56dd603d 100644 --- a/library/src/level1/rocsparse_doti.hpp +++ b/library/src/level1/rocsparse_doti.hpp @@ -142,9 +142,10 @@ rocsparse_status rocsparse_doti_template(rocsparse_handle handle, result); } RETURN_IF_HIP_ERROR(hipMemcpy(result, workspace, sizeof(T), hipMemcpyDeviceToHost)); - RETURN_IF_HIP_ERROR(hipFree(workspace)); } + RETURN_IF_HIP_ERROR(hipFree(workspace)); + return rocsparse_status_success; } diff --git a/library/src/level2/coomv_device.h b/library/src/level2/coomv_device.h index 70887311..9c95b9cc 100644
--- a/library/src/level2/coomv_device.h +++ b/library/src/level2/coomv_device.h @@ -11,7 +11,7 @@ // Scale kernel for beta != 1.0 template -__global__ void coomv_scale(rocsparse_int size, T scalar, T* __restrict__ data) +__device__ void coomv_scale_device(rocsparse_int size, T beta, T* __restrict__ data) { rocsparse_int gid = hipBlockIdx_x * hipBlockDim_x + hipThreadIdx_x; @@ -20,7 +20,7 @@ __global__ void coomv_scale(rocsparse_int size, T scalar, T* __restrict__ data) return; } - data[gid] *= scalar; + data[gid] *= beta; } // Implementation motivated by papers 'Efficient Sparse Matrix-Vector Multiplication on CUDA', diff --git a/library/src/level2/csrmv_device.h b/library/src/level2/csrmv_device.h index c788c8f3..6173efb9 100644 --- a/library/src/level2/csrmv_device.h +++ b/library/src/level2/csrmv_device.h @@ -226,7 +226,8 @@ __device__ static __inline__ int64_t rocsparse_mul24(int64_t x, int64_t y) return ((x << 40) >> 40) * ((y << 40) >> 40); } -__device__ static __inline__ rocsparse_int rocsparse_mad24(rocsparse_int x, rocsparse_int y, rocsparse_int z) +__device__ static __inline__ rocsparse_int +rocsparse_mad24(rocsparse_int x, rocsparse_int y, rocsparse_int z) { return rocsparse_mul24(x, y) + z; } @@ -296,7 +297,8 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // Any workgroup only calculates, at most, BLOCK_MULTIPLIER*BLOCKSIZE items in a row. // If there are more items in this row, we assign more workgroups. - rocsparse_int vecStart = rocsparse_mad24(wg, BLOCK_MULTIPLIER * BLOCKSIZE, csr_row_ptr[row] - idx_base); + rocsparse_int vecStart = + rocsparse_mad24(wg, BLOCK_MULTIPLIER * BLOCKSIZE, csr_row_ptr[row] - idx_base); rocsparse_int vecEnd = ((csr_row_ptr[row + 1] - idx_base) > vecStart + BLOCK_MULTIPLIER * BLOCKSIZE) ? vecStart + BLOCK_MULTIPLIER * BLOCKSIZE @@ -417,7 +419,9 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // If so, just do a write rather than a read-write. 
Measured to be a slight (~5%) // performance improvement. if(beta != 0.) - temp_sum += beta * y[local_row]; + { + temp_sum = fma(beta, y[local_row], temp_sum); + } y[local_row] = temp_sum; } } @@ -445,7 +449,7 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // put that into the output for each row. if(beta != 0.) { - temp_sum += beta * y[local_row]; + temp_sum = fma(beta, y[local_row], temp_sum); } y[local_row] = temp_sum; @@ -480,7 +484,7 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, for(unsigned long long j = vecStart + lid; j < vecEnd; j += WG_SIZE) { rocsparse_int col = csr_col_ind[(unsigned int)j] - idx_base; - temp_sum += alpha * csr_val[(unsigned int)j] * x[col]; + temp_sum = fma(alpha, csr_val[(unsigned int)j] * x[col], temp_sum); } partialSums[lid] = temp_sum; @@ -496,7 +500,7 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, { if(beta != 0.) { - temp_sum += beta * y[row]; + temp_sum = fma(beta, y[row], temp_sum); } y[row] = temp_sum; @@ -558,11 +562,13 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, // That increases register pressure and reduces occupancy. for(rocsparse_int j = 0; j < vecEnd - col; j += WG_SIZE) { - temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j] - idx_base]; + temp_sum = + fma(alpha, csr_val[col + j] * x[csr_col_ind[col + j] - idx_base], temp_sum); #if 2 * WG_SIZE <= BLOCK_MULTIPLIER * BLOCKSIZE // If you can, unroll this loop once. It somewhat helps performance. 
j += WG_SIZE; - temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j] - idx_base]; + temp_sum = + fma(alpha, csr_val[col + j] * x[csr_col_ind[col + j] - idx_base], temp_sum); #endif } } @@ -570,7 +576,8 @@ __device__ void csrmvn_adaptive_device(unsigned long long* row_blocks, { for(rocsparse_int j = 0; j < vecEnd - col; j += WG_SIZE) { - temp_sum += alpha * csr_val[col + j] * x[csr_col_ind[col + j] - idx_base]; + temp_sum = + fma(alpha, csr_val[col + j] * x[csr_col_ind[col + j] - idx_base], temp_sum); } } diff --git a/library/src/level2/csrsv_device.h b/library/src/level2/csrsv_device.h index 7069f1ad..48b3d7ba 100644 --- a/library/src/level2/csrsv_device.h +++ b/library/src/level2/csrsv_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once @@ -8,8 +9,9 @@ #include +// Compute intra wavefront maximum and spin summation template -static __device__ __inline__ void two_reduce(int* local_max, int *local_spin) +static __device__ __inline__ void two_reduce(int* local_max, int* local_spin) { #if defined(__HIP_PLATFORM_HCC__) int max_depth = *local_max; @@ -80,6 +82,7 @@ __global__ void csrsv_analysis_kernel(rocsparse_int m, rocsparse_int* __restrict__ max_depth, unsigned long long* __restrict__ total_spin, rocsparse_int* __restrict__ max_nnz, + rocsparse_int* __restrict__ zero_pivot, rocsparse_index_base idx_base) { rocsparse_int tid = hipThreadIdx_x; @@ -87,11 +90,13 @@ __global__ void csrsv_analysis_kernel(rocsparse_int m, rocsparse_int lid = tid & (WF_SIZE - 1); rocsparse_int row = gid / WF_SIZE; + // Do not run out of bounds if(row >= m) { return; } + // If we process upper triangular, we need to access with reverse index if(FILL_MODE == rocsparse_fill_mode_upper) { // Processing upper triangular matrix @@ -104,7 +109,8 @@ __global__ void csrsv_analysis_kernel(rocsparse_int 
m, csr_diag_ind[row] = -1; } - rocsparse_int local_max = 0; + // Local depth and spin + rocsparse_int local_max = 0; rocsparse_int local_spin = 0; int row_begin = csr_row_ptr[row] - idx_base; @@ -126,8 +132,11 @@ __global__ void csrsv_analysis_kernel(rocsparse_int m, csr_diag_ind[row] = j; } + // Differentiate fill mode if(FILL_MODE == rocsparse_fill_mode_upper) { + // If upper triangular, skip all entries that are not in the upper part + // of the matrix if(local_col <= row) { continue; @@ -136,7 +145,7 @@ __global__ void csrsv_analysis_kernel(rocsparse_int m, else if(FILL_MODE == rocsparse_fill_mode_lower) { // Diagonal and above, skip this. - if (local_col >= row) + if(local_col >= row) { break; } @@ -146,27 +155,29 @@ __global__ void csrsv_analysis_kernel(rocsparse_int m, // While there are threads in this workgroup that have been unable to // get their input, loop and wait for the flag to exist. - while (!local_done) + while(!local_done) { #if defined(__HIP_PLATFORM_HCC__) - local_done = __atomic_load_n(&done_array[local_col], __ATOMIC_RELAXED); + local_done = __atomic_load_n(&done_array[local_col], __ATOMIC_ACQUIRE); #elif defined(__HIP_PLATFORM_NVCC__) local_done = atomicOr(&done_array[local_col], 0); #endif ++local_spin; } + // Local maximum local_max = max(local_done, local_max); } - // Determine maximum local depth and local spin loops + // Determine maximum local depth and local spin loops within the wavefront two_reduce(&local_max, &local_spin); ++local_max; - if (lid == WF_SIZE - 1) + if(lid == WF_SIZE - 1) { +// The last lane (lid == WF_SIZE - 1) writes the "row is done" flag #if defined(__HIP_PLATFORM_HCC__) - __atomic_store_n(&done_array[row], local_max, __ATOMIC_RELAXED); + __atomic_store_n(&done_array[row], local_max, __ATOMIC_RELEASE); #elif defined(__HIP_PLATFORM_NVCC__) atomicOr(&done_array[row], local_max); #endif @@ -175,84 +186,142 @@ __global__ void csrsv_analysis_kernel(rocsparse_int m, // We're sending out "local_max - 1" because of 0-based indexing.
// However, we needed to put a non-zero value into the done_array up above // when we crammed local_depth in, so these two will be off by one. - atomicAdd(&rows_per_level[local_max-1], 1); + atomicAdd(&rows_per_level[local_max - 1], 1); atomicMax(max_depth, local_max); atomicAdd(total_spin, local_spin); atomicMax(max_nnz, row_end - row_begin); + + if(csr_diag_ind[row] == -1) + { + // We are looking for the first zero pivot + atomicMin(zero_pivot, row + idx_base); + } } } - - - - - #if defined(__HIP_PLATFORM_HCC__) // While HIP does not contain llvm intrinsics __device__ int __llvm_amdgcn_readlane(int index, int offset) __asm("llvm.amdgcn.readlane"); +// Swizzle based intra wavefront reduction sum +template static __device__ __inline__ float wf_reduce(float temp_sum) { - typedef union flt_b32 { + typedef union flt_b32 + { float val; int b32; } flt_b32_t; flt_b32_t upper_sum, t_temp_sum; t_temp_sum.val = temp_sum; - upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x80b1); - t_temp_sum.val += upper_sum.val; - upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x804e); - t_temp_sum.val += upper_sum.val; - upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x101f); - t_temp_sum.val += upper_sum.val; - upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x201f); - t_temp_sum.val += upper_sum.val; - upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x401f); - t_temp_sum.val += upper_sum.val; - upper_sum.b32 = __llvm_amdgcn_readlane(t_temp_sum.b32, 32); - t_temp_sum.val += upper_sum.val; + + if(WF_SIZE > 1) + { + upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x80b1); + t_temp_sum.val += upper_sum.val; + } + + if(WF_SIZE > 2) + { + upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x804e); + t_temp_sum.val += upper_sum.val; + } + + if(WF_SIZE > 4) + { + upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x101f); + t_temp_sum.val += upper_sum.val; + } + + if(WF_SIZE > 8) + { + upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x201f); + t_temp_sum.val += upper_sum.val; + } + + 
if(WF_SIZE > 16) + { + upper_sum.b32 = __hip_ds_swizzle(t_temp_sum.b32, 0x401f); + t_temp_sum.val += upper_sum.val; + } + + if(WF_SIZE > 32) + { + upper_sum.b32 = __llvm_amdgcn_readlane(t_temp_sum.b32, 32); + t_temp_sum.val += upper_sum.val; + } + temp_sum = t_temp_sum.val; return temp_sum; } +// Swizzle based intra wavefront reduction sum +template static __device__ __inline__ double wf_reduce(double temp_sum) { - typedef union dbl_b32 { + typedef union dbl_b32 + { double val; int b32[2]; } dbl_b32_t; dbl_b32_t upper_sum, t_temp_sum; t_temp_sum.val = temp_sum; - upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x80b1); - upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x80b1); - t_temp_sum.val += upper_sum.val; - upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x804e); - upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x804e); - t_temp_sum.val += upper_sum.val; - upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x101f); - upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x101f); - t_temp_sum.val += upper_sum.val; - upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x201f); - upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x201f); - t_temp_sum.val += upper_sum.val; - upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x401f); - upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x401f); - t_temp_sum.val += upper_sum.val; - upper_sum.b32[0] = __llvm_amdgcn_readlane(t_temp_sum.b32[0], 32); - upper_sum.b32[1] = __llvm_amdgcn_readlane(t_temp_sum.b32[1], 32); - t_temp_sum.val += upper_sum.val; + + if(WF_SIZE > 1) + { + upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x80b1); + upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x80b1); + t_temp_sum.val += upper_sum.val; + } + + if(WF_SIZE > 2) + { + upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x804e); + upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x804e); + t_temp_sum.val += upper_sum.val; + } + + if(WF_SIZE > 4) + { + 
upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x101f); + upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x101f); + t_temp_sum.val += upper_sum.val; + } + + if(WF_SIZE > 8) + { + upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x201f); + upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x201f); + t_temp_sum.val += upper_sum.val; + } + + if(WF_SIZE > 16) + { + upper_sum.b32[0] = __hip_ds_swizzle(t_temp_sum.b32[0], 0x401f); + upper_sum.b32[1] = __hip_ds_swizzle(t_temp_sum.b32[1], 0x401f); + t_temp_sum.val += upper_sum.val; + } + + if(WF_SIZE > 32) + { + upper_sum.b32[0] = __llvm_amdgcn_readlane(t_temp_sum.b32[0], 32); + upper_sum.b32[1] = __llvm_amdgcn_readlane(t_temp_sum.b32[1], 32); + t_temp_sum.val += upper_sum.val; + } + temp_sum = t_temp_sum.val; return temp_sum; } #elif defined(__HIP_PLATFORM_NVCC__) -template +template static __device__ __inline__ T wf_reduce(T temp_sum) { - for(int i = 16; i >= 1; i >>= 1) + // Perform wavefront reduction sum + for(int i = WF_SIZE >> 1; i >= 1; i >>= 1) { temp_sum += __shfl_down_sync(0xffffffff, temp_sum, i); } @@ -272,6 +341,7 @@ __device__ void csrsv_device(rocsparse_int m, rocsparse_int* __restrict__ done_array, rocsparse_int* __restrict__ map, rocsparse_int offset, + rocsparse_int* __restrict__ zero_pivot, rocsparse_index_base idx_base, rocsparse_fill_mode fill_mode, rocsparse_diag_type diag_type) @@ -280,59 +350,92 @@ __device__ void csrsv_device(rocsparse_int m, rocsparse_int gid = hipBlockIdx_x * BLOCKSIZE + tid; rocsparse_int lid = tid & (WF_SIZE - 1); rocsparse_int wid = tid / WF_SIZE; + + // Index into the row map rocsparse_int idx = gid / WF_SIZE; + // LDS to hold diagonal entry __shared__ T diagonal[BLOCKSIZE / WF_SIZE]; + // Do not run out of bounds if(idx >= m) { return; } - rocsparse_int row = map[idx + offset]; + // Get the row this warp will operate on + rocsparse_int row = map[idx + offset]; + + // Current row entry point and exit point rocsparse_int row_begin = csr_row_ptr[row] 
- idx_base; rocsparse_int row_end = csr_row_ptr[row + 1] - idx_base; + // Local summation variable. T local_sum = static_cast(0); if(lid == 0) { + // Lane 0 initializes its local sum with alpha and x local_sum = alpha * x[row]; } for(rocsparse_int j = row_begin + lid; j < row_end; j += WF_SIZE) { + // Current column this lane operates on rocsparse_int local_col = csr_col_ind[j] - idx_base; + + // Local value this lane operates with T local_val = csr_val[j]; + // Check for numerical zero + if(local_val == static_cast(0) && local_col == row && + diag_type == rocsparse_diag_type_non_unit) + { + // Numerical zero pivot found, avoid division by 0 + // and store index for later use. + atomicMin(zero_pivot, row + idx_base); + local_val = static_cast(1); + } + + // Differentiate upper and lower triangular mode if(fill_mode == rocsparse_fill_mode_upper) { // Processing upper triangular + + // Ignore all entries that are below the diagonal if(local_col < row) { continue; } + // Diagonal entry if(local_col == row) { + // If diagonal type is non unit, do division by diagonal entry + // This is not required for unit diagonal for obvious reasons if(diag_type == rocsparse_diag_type_non_unit) { diagonal[wid] = static_cast(1) / local_val; } - + continue; } } else if(fill_mode == rocsparse_fill_mode_lower) { // Processing lower triangular + + // Ignore all entries that are above the diagonal if(local_col > row) { break; } + // Diagonal entry if(local_col == row) { + // If diagonal type is non unit, do division by diagonal entry + // This is not required for unit diagonal for obvious reasons if(diag_type == rocsparse_diag_type_non_unit) { diagonal[wid] = static_cast(1) / local_val; @@ -342,36 +445,45 @@ __device__ void csrsv_device(rocsparse_int m, } } +// Spin loop until dependency has been resolved #if defined(__HIP_PLATFORM_HCC__) - while(!__atomic_load_n(&done_array[local_col], __ATOMIC_RELAXED)); + while(!__atomic_load_n(&done_array[local_col], __ATOMIC_ACQUIRE)) + ; #elif 
defined(__HIP_PLATFORM_NVCC__) - while(!atomicOr(&done_array[local_col], 0)); + while(!atomicOr(&done_array[local_col], 0)) + ; #endif +// Load y value bypassing caches #if defined(__HIP_PLATFORM_HCC__) T out_val; - __atomic_load(&y[local_col], &out_val, __ATOMIC_RELAXED); + __atomic_load(&y[local_col], &out_val, __ATOMIC_ACQUIRE); #elif defined(__HIP_PLATFORM_NVCC__) T out_val = y[local_col]; #endif + // Local sum computation for each lane local_sum -= local_val * out_val; } - local_sum = wf_reduce(local_sum); + // Gather all local sums for each lane + local_sum = wf_reduce(local_sum); + // If we have non unit diagonal, take the diagonal into account + // For unit diagonal, this would be multiplication with one if(diag_type == rocsparse_diag_type_non_unit) { local_sum *= diagonal[wid]; } - if (lid == 0) + if(lid == 0) { +// Lane 0 writes the "row is done" flag and stores the rows result in y #if defined(__HIP_PLATFORM_HCC__) - __atomic_store(&y[row], &local_sum, __ATOMIC_RELAXED); - __atomic_store_n(&done_array[row], 1, __ATOMIC_RELAXED); + __atomic_store(&y[row], &local_sum, __ATOMIC_RELEASE); + __atomic_store_n(&done_array[row], 1, __ATOMIC_RELEASE); #elif defined(__HIP_PLATFORM_NVCC__) - y[row] = local_sum; + y[row] = local_sum; atomicOr(&done_array[row], 1); #endif } diff --git a/library/src/level2/ellmv_device.h b/library/src/level2/ellmv_device.h index e4e79699..e717ef28 100644 --- a/library/src/level2/ellmv_device.h +++ b/library/src/level2/ellmv_device.h @@ -39,7 +39,7 @@ static __device__ void ellmvn_device(rocsparse_int m, if(col >= 0 && col < n) { - sum += ell_val[idx] * __ldg(x + col); + sum = fma(ell_val[idx], __ldg(x + col), sum); } else { @@ -49,7 +49,7 @@ static __device__ void ellmvn_device(rocsparse_int m, if(beta != static_cast(0)) { - y[ai] = beta * y[ai] + alpha * sum; + y[ai] = fma(beta, y[ai], alpha * sum); } else { diff --git a/library/src/level2/rocsparse_coomv.hpp b/library/src/level2/rocsparse_coomv.hpp index 2261716a..ed1fc4e3 100644 
--- a/library/src/level2/rocsparse_coomv.hpp +++ b/library/src/level2/rocsparse_coomv.hpp @@ -15,11 +15,56 @@ #include +template +__global__ void coomv_scale_host_pointer(rocsparse_int size, T beta, T* __restrict__ data) +{ + coomv_scale_device(size, beta, data); +} + +template +__global__ void +coomv_scale_device_pointer(rocsparse_int size, const T* __restrict__ beta, T* __restrict__ data) +{ + if(*beta == static_cast(1)) + { + return; + } + + coomv_scale_device(size, *beta, data); +} + template __launch_bounds__(128) __global__ void coomvn_wf_host_pointer(rocsparse_int nnz, + rocsparse_int loops, + T alpha, + const rocsparse_int* __restrict__ coo_row_ind, + const rocsparse_int* __restrict__ coo_col_ind, + const T* __restrict__ coo_val, + const T* __restrict__ x, + T* __restrict__ y, + rocsparse_int* __restrict__ row_block_red, + T* __restrict__ val_block_red, + rocsparse_index_base idx_base) +{ + coomvn_general_wf_reduce(nnz, + loops, + alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + idx_base); +} + +template +__launch_bounds__(128) __global__ + void coomvn_wf_device_pointer(rocsparse_int nnz, rocsparse_int loops, - T alpha, + const T* alpha, const rocsparse_int* __restrict__ coo_row_ind, const rocsparse_int* __restrict__ coo_col_ind, const T* __restrict__ coo_val, @@ -30,43 +75,16 @@ __launch_bounds__(128) __global__ rocsparse_index_base idx_base) { coomvn_general_wf_reduce(nnz, - loops, - alpha, - coo_row_ind, - coo_col_ind, - coo_val, - x, - y, - row_block_red, - val_block_red, - idx_base); -} - -template -__launch_bounds__(128) __global__ - void coomvn_wf_device_pointer(rocsparse_int nnz, - rocsparse_int loops, - const T* alpha, - const rocsparse_int* __restrict__ coo_row_ind, - const rocsparse_int* __restrict__ coo_col_ind, - const T* __restrict__ coo_val, - const T* __restrict__ x, - T* __restrict__ y, - rocsparse_int* __restrict__ row_block_red, - T* __restrict__ val_block_red, - rocsparse_index_base idx_base) 
-{ - coomvn_general_wf_reduce(nnz, - loops, - *alpha, - coo_row_ind, - coo_col_ind, - coo_val, - x, - y, - row_block_red, - val_block_red, - idx_base); + loops, + *alpha, + coo_row_ind, + coo_col_ind, + coo_val, + x, + y, + row_block_red, + val_block_red, + idx_base); } template @@ -220,34 +238,28 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, dim3 coomvn_blocks(nblocks); dim3 coomvn_threads(COOMVN_DIM); - rocsparse_int* row_block_red = NULL; - T* val_block_red = NULL; + // Buffer + char* ptr = reinterpret_cast(handle->buffer); + ptr += 256; + + // row block reduction buffer + rocsparse_int* row_block_red = reinterpret_cast(ptr); + ptr += ((sizeof(rocsparse_int) * nwfs - 1) / 256 + 1) * 256; - // Allocating a maximum of 8 kByte - RETURN_IF_HIP_ERROR(hipMalloc((void**)&row_block_red, sizeof(rocsparse_int) * nwfs)); - RETURN_IF_HIP_ERROR(hipMalloc((void**)&val_block_red, sizeof(T) * nwfs)); + // val block reduction buffer + T* val_block_red = reinterpret_cast(ptr); if(handle->pointer_mode == rocsparse_pointer_mode_device) { - // We need a host copy of beta to avoid unneccessary kernel launch - T h_beta; - RETURN_IF_HIP_ERROR(hipMemcpy(&h_beta, beta, sizeof(T), hipMemcpyDeviceToHost)); - - if(h_beta == static_cast(0)) - { - RETURN_IF_HIP_ERROR(hipMemset(y, 0, sizeof(T) * m)); - } - else if(h_beta != static_cast(1)) - { - hipLaunchKernelGGL((coomv_scale), - dim3((m - 1) / COOMVN_DIM + 1), - coomvn_threads, - 0, - stream, - m, - h_beta, - y); - } + // Scale y with beta + hipLaunchKernelGGL((coomv_scale_device_pointer), + dim3((m - 1) / 1024 + 1), + dim3(1024), + 0, + stream, + m, + beta, + y); if(handle->wavefront_size == 32) { @@ -302,13 +314,13 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, // If beta == 0.0 we need to set y to 0 if(*beta == static_cast(0)) { - RETURN_IF_HIP_ERROR(hipMemset(y, 0, sizeof(T) * m)); + RETURN_IF_HIP_ERROR(hipMemsetAsync(y, 0, sizeof(T) * m, stream)); } else if(*beta != static_cast(1)) { - 
hipLaunchKernelGGL((coomv_scale), - dim3((m - 1) / COOMVN_DIM + 1), - coomvn_threads, + hipLaunchKernelGGL((coomv_scale_host_pointer), + dim3((m - 1) / 1024 + 1), + dim3(1024), 0, stream, m, @@ -369,9 +381,6 @@ rocsparse_status rocsparse_coomv_template(rocsparse_handle handle, row_block_red, val_block_red, y); - - RETURN_IF_HIP_ERROR(hipFree(row_block_red)); - RETURN_IF_HIP_ERROR(hipFree(val_block_red)); #undef COOMVN_DIM } else diff --git a/library/src/level2/rocsparse_csrmv.cpp b/library/src/level2/rocsparse_csrmv.cpp index cb076c70..8a21d43d 100644 --- a/library/src/level2/rocsparse_csrmv.cpp +++ b/library/src/level2/rocsparse_csrmv.cpp @@ -391,8 +391,7 @@ extern "C" rocsparse_status rocsparse_csrmv_analysis(rocsparse_handle handle, return rocsparse_status_success; } -extern "C" rocsparse_status rocsparse_csrmv_clear(rocsparse_handle handle, - rocsparse_mat_info info) +extern "C" rocsparse_status rocsparse_csrmv_clear(rocsparse_handle handle, rocsparse_mat_info info) { // Check for valid handle and matrix descriptor if(handle == nullptr) diff --git a/library/src/level2/rocsparse_csrsv.cpp b/library/src/level2/rocsparse_csrsv.cpp index 97ee8c97..a9ce62b5 100644 --- a/library/src/level2/rocsparse_csrsv.cpp +++ b/library/src/level2/rocsparse_csrsv.cpp @@ -5,127 +5,101 @@ #include "rocsparse.h" #include "rocsparse_csrsv.hpp" +#include + /* * =========================================================================== * C wrapper * =========================================================================== */ -extern "C" rocsparse_status rocsparse_csrsv_buffer_size(rocsparse_handle handle, - rocsparse_operation trans, - rocsparse_int m, - rocsparse_int nnz, - const rocsparse_mat_descr descr, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - rocsparse_mat_info info, - size_t* buffer_size) +extern "C" rocsparse_status rocsparse_scsrsv_buffer_size(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const 
rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size) { - // Check for valid handle and matrix descriptor - if(handle == nullptr) - { - return rocsparse_status_invalid_handle; - } - else if(descr == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(info == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Logging - log_trace(handle, - "rocsparse_csrsv_buffer_size", - trans, - m, - nnz, - (const void*&)descr, - (const void*&)csr_row_ptr, - (const void*&)csr_col_ind, - (const void*&)info, - (const void*&)buffer_size); - - // Check index base - if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) - { - return rocsparse_status_invalid_value; - } - if(descr->type != rocsparse_matrix_type_general) - { - // TODO - return rocsparse_status_not_implemented; - } - - // Check sizes - if(m < 0) - { - return rocsparse_status_invalid_size; - } - else if(nnz < 0) - { - return rocsparse_status_invalid_size; - } - - // Check pointer arguments - if(csr_row_ptr == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(csr_col_ind == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(buffer_size == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Quick return if possible - if(m == 0 || nnz == 0) - { - // Do not return 0 as buffer size - *buffer_size = 4; - return rocsparse_status_success; - } - - // rocsparse_int max depth - *buffer_size = 256; - - // unsigned long long total_spin - *buffer_size += 256; - - // rocsparse_int max_nnz - *buffer_size += 256; - - // rocsparse_int done_array[m] - *buffer_size += sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256; - - // rocsparse_int rows_per_level[m] - *buffer_size += sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256; + return rocsparse_csrsv_buffer_size_template( + handle, trans, m, nnz, descr, 
csr_val, csr_row_ptr, csr_col_ind, info, buffer_size); +} - size_t hipcub_size = 0; - rocsparse_int* ptr = nullptr; - RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum(nullptr, hipcub_size, ptr, ptr, m)); +extern "C" rocsparse_status rocsparse_dcsrsv_buffer_size(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size) +{ + return rocsparse_csrsv_buffer_size_template( + handle, trans, m, nnz, descr, csr_val, csr_row_ptr, csr_col_ind, info, buffer_size); +} - // hipcub buffer - *buffer_size += hipcub_size; +extern "C" rocsparse_status rocsparse_scsrsv_analysis(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer) +{ + return rocsparse_csrsv_analysis_template(handle, + trans, + m, + nnz, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + analysis, + solve, + temp_buffer); +} - return rocsparse_status_success; +extern "C" rocsparse_status rocsparse_dcsrsv_analysis(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer) +{ + return rocsparse_csrsv_analysis_template(handle, + trans, + m, + nnz, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + analysis, + solve, + temp_buffer); } -extern "C" rocsparse_status rocsparse_csrsv_analysis(rocsparse_handle handle, - 
rocsparse_operation trans, - rocsparse_int m, - rocsparse_int nnz, - const rocsparse_mat_descr descr, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - rocsparse_mat_info info, - rocsparse_solve_policy solve, - rocsparse_analysis_policy analysis, - void* temp_buffer) +extern "C" rocsparse_status rocsparse_csrsv_clear(rocsparse_handle handle, + const rocsparse_mat_descr descr, + rocsparse_mat_info info) { // Check for valid handle and matrix descriptor if(handle == nullptr) @@ -142,162 +116,8 @@ extern "C" rocsparse_status rocsparse_csrsv_analysis(rocsparse_handle handle, } // Logging - log_trace(handle, - "rocsparse_csrsv_analysis", - trans, - m, - nnz, - (const void*&)descr, - (const void*&)csr_row_ptr, - (const void*&)csr_col_ind, - (const void*&)info, - solve, - analysis, - (const void*&)temp_buffer); - - // Check index base - if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) - { - return rocsparse_status_invalid_value; - } - - // Check matrix type - if(descr->type != rocsparse_matrix_type_general) - { - // TODO - return rocsparse_status_not_implemented; - } - - // Check analysis policy - if(analysis != rocsparse_analysis_policy_reuse && analysis != rocsparse_analysis_policy_force) - { - return rocsparse_status_invalid_value; - } - - // Check solve policy - if(solve != rocsparse_solve_policy_auto) - { - return rocsparse_status_invalid_value; - } - - // Check sizes - if(m < 0) - { - return rocsparse_status_invalid_size; - } - else if(nnz < 0) - { - return rocsparse_status_invalid_size; - } - - // Check pointer arguments - if(csr_row_ptr == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(csr_col_ind == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(temp_buffer == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Quick return if possible - if(m == 0 || nnz == 0) - { - return rocsparse_status_success; - } - - - // Switch between lower and upper 
triangular analysis - if(descr->fill_mode == rocsparse_fill_mode_upper) - { - // This is currently the only case where we need upper triangular analysis, - // therefore we ignore the analysis policy - - // Clear csrsv info - RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrsv_upper_info)); + log_trace(handle, "rocsparse_csrsv_clear", (const void*&)descr, (const void*&)info); - // Create csrsv info - RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_csrtr_info(&info->csrsv_upper_info)); - - // Perform analysis - RETURN_IF_ROCSPARSE_ERROR(rocsparse_csrtr_analysis(handle, - trans, - m, - nnz, - descr, - csr_row_ptr, - csr_col_ind, - info->csrsv_upper_info, - temp_buffer)); - } - else - { - // Differentiate the analysis policies - if(analysis == rocsparse_analysis_policy_reuse) - { - // We try to re-use already analyzed lower part, if available. - // It is the user's responsibility that this data is still valid, - // since he passed the 'reuse' flag. - - // If csrsv meta data is already available, do nothing - if(info->csrsv_lower_info != nullptr) - { - return rocsparse_status_success; - } - - // Check for other lower analysis meta data - rocsparse_csrtr_info reuse = nullptr; - - // csrilu0 meta data - if(info->csrilu0_info != nullptr) - { - reuse = info->csrilu0_info; - } - - // TODO add more crossover data here - - - - // If data has been found, use it - if(reuse != nullptr) - { - info->csrsv_lower_info = reuse; - - return rocsparse_status_success; - } - } - - // User is explicitly asking to force a re-analysis, or no valid data has been - // found to be re-used. 
- - // Clear csrsv info - RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrsv_lower_info)); - - // Create csrsv info - RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_csrtr_info(&info->csrsv_lower_info)); - - // Perform analysis - RETURN_IF_ROCSPARSE_ERROR(rocsparse_csrtr_analysis(handle, - trans, - m, - nnz, - descr, - csr_row_ptr, - csr_col_ind, - info->csrsv_lower_info, - temp_buffer)); - } - - return rocsparse_status_success; -} - -extern "C" rocsparse_status rocsparse_csrsv_clear(const rocsparse_mat_descr descr, - rocsparse_mat_info info) -{ // Determine which info meta data should be deleted if(descr->fill_mode == rocsparse_fill_mode_lower) { @@ -337,8 +157,20 @@ extern "C" rocsparse_status rocsparse_scsrsv_solve(rocsparse_handle handle, rocsparse_solve_policy policy, void* temp_buffer) { - return rocsparse_csrsv_solve_template( - handle, trans, m, nnz, alpha, descr, csr_val, csr_row_ind, csr_col_ind, info, x, y, policy, temp_buffer); + return rocsparse_csrsv_solve_template(handle, + trans, + m, + nnz, + alpha, + descr, + csr_val, + csr_row_ind, + csr_col_ind, + info, + x, + y, + policy, + temp_buffer); } extern "C" rocsparse_status rocsparse_dcsrsv_solve(rocsparse_handle handle, @@ -356,6 +188,132 @@ extern "C" rocsparse_status rocsparse_dcsrsv_solve(rocsparse_handle handle, rocsparse_solve_policy policy, void* temp_buffer) { - return rocsparse_csrsv_solve_template( - handle, trans, m, nnz, alpha, descr, csr_val, csr_row_ind, csr_col_ind, info, x, y, policy, temp_buffer); + return rocsparse_csrsv_solve_template(handle, + trans, + m, + nnz, + alpha, + descr, + csr_val, + csr_row_ind, + csr_col_ind, + info, + x, + y, + policy, + temp_buffer); +} + +extern "C" rocsparse_status rocsparse_csrsv_zero_pivot(rocsparse_handle handle, + const rocsparse_mat_descr descr, + rocsparse_mat_info info, + rocsparse_int* position) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else 
if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging + log_trace(handle, "rocsparse_csrsv_zero_pivot", (const void*&)info, (const void*&)position); + + // Check pointer arguments + if(position == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Stream + hipStream_t stream = handle->stream; + + // Determine the info meta data place + rocsparse_csrtr_info csrsv = nullptr; + + // For hipSPARSE compatibility mode, we allow descr == nullptr + // In this case, only lower OR upper is populated and we can use the right + // info meta data + if(descr == nullptr) + { + if(info->csrsv_lower_info != nullptr) + { + csrsv = info->csrsv_lower_info; + } + else + { + csrsv = info->csrsv_upper_info; + } + } + else + { + // Switch between upper and lower triangular + if(descr->fill_mode == rocsparse_fill_mode_lower) + { + csrsv = info->csrsv_lower_info; + } + else + { + csrsv = info->csrsv_upper_info; + } + } + + // If m == 0 || nnz == 0 it can happen, that info structure is not created. + // In this case, always return -1. 
+ if(csrsv == nullptr) + { + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + RETURN_IF_HIP_ERROR(hipMemsetAsync(position, 255, sizeof(rocsparse_int), stream)); + } + else + { + *position = -1; + } + + return rocsparse_status_success; + } + + // Differentiate between pointer modes + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + // rocsparse_pointer_mode_device + rocsparse_int pivot; + + RETURN_IF_HIP_ERROR( + hipMemcpy(&pivot, csrsv->zero_pivot, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + if(pivot == std::numeric_limits::max()) + { + RETURN_IF_HIP_ERROR(hipMemsetAsync(position, 255, sizeof(rocsparse_int), stream)); + } + else + { + RETURN_IF_HIP_ERROR(hipMemcpy( + position, csrsv->zero_pivot, sizeof(rocsparse_int), hipMemcpyDeviceToDevice)); + + return rocsparse_status_zero_pivot; + } + } + else + { + // rocsparse_pointer_mode_host + RETURN_IF_HIP_ERROR( + hipMemcpy(position, csrsv->zero_pivot, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + // If no zero pivot is found, set -1 + if(*position == std::numeric_limits::max()) + { + *position = -1; + } + else + { + return rocsparse_status_zero_pivot; + } + } + + return rocsparse_status_success; } diff --git a/library/src/level2/rocsparse_csrsv.hpp b/library/src/level2/rocsparse_csrsv.hpp index 812a24c6..ae16db60 100644 --- a/library/src/level2/rocsparse_csrsv.hpp +++ b/library/src/level2/rocsparse_csrsv.hpp @@ -12,9 +12,121 @@ #include "utility.h" #include "csrsv_device.h" +#include #include #include +template +rocsparse_status rocsparse_csrsv_buffer_size_template(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == 
nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging + log_trace(handle, + replaceX("rocsparse_Xcsrsv_buffer_size"), + trans, + m, + nnz, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)info, + (const void*&)buffer_size); + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(buffer_size == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || nnz == 0) + { + // Do not return 0 as buffer size + *buffer_size = 4; + return rocsparse_status_success; + } + + // rocsparse_int max depth + *buffer_size = 256; + + // unsigned long long total_spin + *buffer_size += 256; + + // rocsparse_int max_nnz + *buffer_size += 256; + + // rocsparse_int done_array[m] + *buffer_size += sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256; + + // rocsparse_int rows_per_level[m] + *buffer_size += sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256; + + size_t hipcub_size = 0; + rocsparse_int* ptr = nullptr; + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum(nullptr, hipcub_size, ptr, ptr, m)); + + // hipcub buffer + *buffer_size += hipcub_size; + + return rocsparse_status_success; +} + static rocsparse_status 
rocsparse_csrtr_analysis(rocsparse_handle handle, rocsparse_operation trans, rocsparse_int m, @@ -32,13 +144,11 @@ static rocsparse_status rocsparse_csrtr_analysis(rocsparse_handle handle, char* ptr = reinterpret_cast(temp_buffer); // Initialize temporary buffer - size_t buffer_size = 256 - + 256 - + 256 - + sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256 - + sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256; + size_t buffer_size = 256 + 256 + 256 + sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256 + + sizeof(rocsparse_int) * ((m - 1) / 256 + 1) * 256; - RETURN_IF_HIP_ERROR(hipMemset(ptr, 0, sizeof(char) * buffer_size)); + // Set temporary buffer to 0 + RETURN_IF_HIP_ERROR(hipMemsetAsync(ptr, 0, sizeof(char) * buffer_size, stream)); // max_depth rocsparse_int* d_max_depth = reinterpret_cast(ptr); @@ -66,10 +176,15 @@ static rocsparse_status rocsparse_csrtr_analysis(rocsparse_handle handle, // Allocate buffer to hold diagonal entry point RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->csr_diag_ind, sizeof(rocsparse_int) * m)); + // Allocate buffer to hold zero pivot + RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->zero_pivot, sizeof(rocsparse_int))); + // Initialize zero pivot + rocsparse_int max = std::numeric_limits::max(); + RETURN_IF_HIP_ERROR( + hipMemcpy(info->zero_pivot, &max, sizeof(rocsparse_int), hipMemcpyHostToDevice)); - - // Run analysis +// Run analysis #define CSRILU0_DIM 1024 dim3 csrsv_blocks((handle->wavefront_size * m - 1) / CSRILU0_DIM + 1); dim3 csrsv_threads(CSRILU0_DIM); @@ -93,6 +208,7 @@ static rocsparse_status rocsparse_csrtr_analysis(rocsparse_handle handle, d_max_depth, d_total_spin, d_max_nnz, + info->zero_pivot, descr->base); } else if(descr->fill_mode == rocsparse_fill_mode_lower) @@ -111,6 +227,7 @@ static rocsparse_status rocsparse_csrtr_analysis(rocsparse_handle handle, d_max_depth, d_total_spin, d_max_nnz, + info->zero_pivot, descr->base); } } @@ -132,6 +249,7 @@ static rocsparse_status rocsparse_csrtr_analysis(rocsparse_handle 
handle, d_max_depth, d_total_spin, d_max_nnz, + info->zero_pivot, descr->base); } else if(descr->fill_mode == rocsparse_fill_mode_lower) @@ -150,6 +268,7 @@ static rocsparse_status rocsparse_csrtr_analysis(rocsparse_handle handle, d_max_depth, d_total_spin, d_max_nnz, + info->zero_pivot, descr->base); } } @@ -159,22 +278,31 @@ static rocsparse_status rocsparse_csrtr_analysis(rocsparse_handle handle, } // Post processing - RETURN_IF_HIP_ERROR(hipMemcpy(&info->max_depth, d_max_depth, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); - RETURN_IF_HIP_ERROR(hipMemcpy(&info->total_spin, d_total_spin, sizeof(unsigned long long), hipMemcpyDeviceToHost)); - RETURN_IF_HIP_ERROR(hipMemcpy(&info->max_nnz, d_max_nnz, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR( + hipMemcpy(&info->max_depth, d_max_depth, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR(hipMemcpy( + &info->total_spin, d_total_spin, sizeof(unsigned long long), hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR( + hipMemcpy(&info->max_nnz, d_max_nnz, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); // Inclusive sum to obtain rows per level size_t hipcub_size = 0; - RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum(nullptr, hipcub_size, d_rows_per_level, d_rows_per_level, info->max_depth)); - RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum(hipcub_buffer, hipcub_size, d_rows_per_level, d_rows_per_level, info->max_depth)); + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum( + nullptr, hipcub_size, d_rows_per_level, d_rows_per_level, info->max_depth)); + RETURN_IF_HIP_ERROR(hipcub::DeviceScan::InclusiveSum( + hipcub_buffer, hipcub_size, d_rows_per_level, d_rows_per_level, info->max_depth)); // Allocate host memory for meta data info->rows_per_level.resize(info->max_depth); std::vector done_array(m); // Move meta data to host (required for kernel launching) - RETURN_IF_HIP_ERROR(hipMemcpy(info->rows_per_level.data(), d_rows_per_level, sizeof(rocsparse_int) * 
info->max_depth, hipMemcpyDeviceToHost)); - RETURN_IF_HIP_ERROR(hipMemcpy(done_array.data(), d_done_array, sizeof(rocsparse_int) * m, hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR(hipMemcpy(info->rows_per_level.data(), + d_rows_per_level, + sizeof(rocsparse_int) * info->max_depth, + hipMemcpyDeviceToHost)); + RETURN_IF_HIP_ERROR(hipMemcpy( + done_array.data(), d_done_array, sizeof(rocsparse_int) * m, hipMemcpyDeviceToHost)); std::vector row_map(m + 1, 0); std::vector counter(info->max_depth, 0); @@ -182,8 +310,8 @@ static rocsparse_status rocsparse_csrtr_analysis(rocsparse_handle handle, // Create row map for(rocsparse_int i = 0; i < m; ++i) { - rocsparse_int level = done_array[i] - 1; - rocsparse_int prev_level = level - 1; + rocsparse_int level = done_array[i] - 1; + rocsparse_int prev_level = level - 1; rocsparse_int depth_offset = (level == 0) ? 0 : info->rows_per_level[prev_level]; row_map[depth_offset + counter[level]] = i; @@ -192,44 +320,219 @@ static rocsparse_status rocsparse_csrtr_analysis(rocsparse_handle handle, // Copy row map to device RETURN_IF_HIP_ERROR(hipMalloc((void**)&info->row_map, sizeof(rocsparse_int) * (m + 1))); - RETURN_IF_HIP_ERROR(hipMemcpy(info->row_map, row_map.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + RETURN_IF_HIP_ERROR(hipMemcpy( + info->row_map, row_map.data(), sizeof(rocsparse_int) * (m + 1), hipMemcpyHostToDevice)); + + // Store some pointers to verify correct execution + info->m = m; + info->nnz = nnz; + info->descr = descr; + info->csr_row_ptr = csr_row_ptr; + info->csr_col_ind = csr_col_ind; + return rocsparse_status_success; +} +template +rocsparse_status rocsparse_csrsv_analysis_template(rocsparse_handle handle, + rocsparse_operation trans, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy 
solve, + void* temp_buffer) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + // Logging + log_trace(handle, + replaceX("rocsparse_Xcsrsv_analysis"), + trans, + m, + nnz, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)info, + solve, + analysis, + (const void*&)temp_buffer); + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + // Check matrix type + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + // Check analysis policy + if(analysis != rocsparse_analysis_policy_reuse && analysis != rocsparse_analysis_policy_force) + { + return rocsparse_status_invalid_value; + } + // Check solve policy + if(solve != rocsparse_solve_policy_auto) + { + return rocsparse_status_invalid_value; + } + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + // Check pointer arguments + if(csr_row_ptr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(temp_buffer == nullptr) + { + return rocsparse_status_invalid_pointer; + } + // Quick return if possible + if(m == 0 || nnz == 0) + { + return rocsparse_status_success; + } + // Switch between lower and upper triangular analysis + if(descr->fill_mode == rocsparse_fill_mode_upper) + { + // This is currently the only case where we need upper triangular analysis, + // therefore we ignore the 
analysis policy + + // Clear csrsv info + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrsv_upper_info)); + + // Create csrsv info + RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_csrtr_info(&info->csrsv_upper_info)); + + // Perform analysis + RETURN_IF_ROCSPARSE_ERROR(rocsparse_csrtr_analysis(handle, + trans, + m, + nnz, + descr, + csr_row_ptr, + csr_col_ind, + info->csrsv_upper_info, + temp_buffer)); + } + else + { + // Differentiate the analysis policies + if(analysis == rocsparse_analysis_policy_reuse) + { + // We try to re-use already analyzed lower part, if available. + // It is the user's responsibility that this data is still valid, + // since he passed the 'reuse' flag. + + // If csrsv meta data is already available, do nothing + if(info->csrsv_lower_info != nullptr) + { + return rocsparse_status_success; + } + + // Check for other lower analysis meta data + rocsparse_csrtr_info reuse = nullptr; + + // csrilu0 meta data + if(info->csrilu0_info != nullptr) + { + reuse = info->csrilu0_info; + } + + // TODO add more crossover data here + + // If data has been found, use it + if(reuse != nullptr) + { + info->csrsv_lower_info = reuse; + + return rocsparse_status_success; + } + } - // Store some pointers to verify correct execution - info->m = m; - info->nnz = nnz; - info->descr = descr; - info->csr_row_ptr = csr_row_ptr; - info->csr_col_ind = csr_col_ind; + // User is explicitly asking to force a re-analysis, or no valid data has been + // found to be re-used. 
+ + // Clear csrsv info + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrsv_lower_info)); + + // Create csrsv info + RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_csrtr_info(&info->csrsv_lower_info)); + + // Perform analysis + RETURN_IF_ROCSPARSE_ERROR(rocsparse_csrtr_analysis(handle, + trans, + m, + nnz, + descr, + csr_row_ptr, + csr_col_ind, + info->csrsv_lower_info, + temp_buffer)); + } return rocsparse_status_success; } template -__launch_bounds__(BLOCKSIZE) -__global__ void csrsv_host_pointer(rocsparse_int m, - T alpha, - const rocsparse_int* __restrict__ csr_row_ptr, - const rocsparse_int* __restrict__ csr_col_ind, - const T* __restrict__ csr_val, - const T* __restrict__ x, - T* __restrict__ y, - rocsparse_int* __restrict__ done_array, - rocsparse_int* __restrict__ map, - rocsparse_int offset, - rocsparse_index_base idx_base, - rocsparse_fill_mode fill_mode, - rocsparse_diag_type diag_type) +__launch_bounds__(BLOCKSIZE) __global__ + void csrsv_host_pointer(rocsparse_int m, + T alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + T* __restrict__ y, + rocsparse_int* __restrict__ done_array, + rocsparse_int* __restrict__ map, + rocsparse_int offset, + rocsparse_int* __restrict__ zero_pivot, + rocsparse_index_base idx_base, + rocsparse_fill_mode fill_mode, + rocsparse_diag_type diag_type) { csrsv_device(m, alpha, @@ -241,32 +544,29 @@ __global__ void csrsv_host_pointer(rocsparse_int m, done_array, map, offset, + zero_pivot, idx_base, fill_mode, diag_type); } template -__launch_bounds__(BLOCKSIZE) -__global__ void csrsv_device_pointer(rocsparse_int m, - const T* alpha, - const rocsparse_int* __restrict__ csr_row_ptr, - const rocsparse_int* __restrict__ csr_col_ind, - const T* __restrict__ csr_val, - const T* __restrict__ x, - T* __restrict__ y, - rocsparse_int* __restrict__ done_array, - rocsparse_int* __restrict__ map, - rocsparse_int 
offset, - rocsparse_index_base idx_base, - rocsparse_fill_mode fill_mode, - rocsparse_diag_type diag_type) +__launch_bounds__(BLOCKSIZE) __global__ + void csrsv_device_pointer(rocsparse_int m, + const T* alpha, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ x, + T* __restrict__ y, + rocsparse_int* __restrict__ done_array, + rocsparse_int* __restrict__ map, + rocsparse_int offset, + rocsparse_int* __restrict__ zero_pivot, + rocsparse_index_base idx_base, + rocsparse_fill_mode fill_mode, + rocsparse_diag_type diag_type) { - if(*alpha == static_cast(0)) - { - return; - } - csrsv_device(m, *alpha, csr_row_ptr, @@ -277,6 +577,7 @@ __global__ void csrsv_device_pointer(rocsparse_int m, done_array, map, offset, + zero_pivot, idx_base, fill_mode, diag_type); @@ -417,83 +718,383 @@ rocsparse_status rocsparse_csrsv_solve_template(rocsparse_handle handle, // Stream hipStream_t stream = handle->stream; - - - - - - // Buffer char* ptr = reinterpret_cast(temp_buffer); + ptr += 256; + ptr += 256; + ptr += 256; + // done array rocsparse_int* d_done_array = reinterpret_cast(ptr); // Initialize buffers - RETURN_IF_HIP_ERROR(hipMemset(d_done_array, 0, sizeof(rocsparse_int) * m)); - - + RETURN_IF_HIP_ERROR(hipMemsetAsync(d_done_array, 0, sizeof(rocsparse_int) * m, stream)); + rocsparse_csrtr_info csrsv = (descr->fill_mode == rocsparse_fill_mode_upper) + ? info->csrsv_upper_info + : info->csrsv_lower_info; + // If diag type is unit, re-initialize zero pivot to remove structural zeros + if(descr->diag_type == rocsparse_diag_type_unit) + { + rocsparse_int max = std::numeric_limits::max(); + RETURN_IF_HIP_ERROR( + hipMemcpy(csrsv->zero_pivot, &max, sizeof(rocsparse_int), hipMemcpyHostToDevice)); + } - rocsparse_csrtr_info csrsv = (descr->fill_mode == rocsparse_fill_mode_upper) ? 
- info->csrsv_upper_info : - info->csrsv_lower_info; +/* +#define CSRSV_DIM 1024 + rocsparse_int depth_offset = 0; + rocsparse_int running_total = 0; + rocsparse_int wf_size = handle->wavefront_size; + rocsparse_int wf_per_wg = CSRSV_DIM / wf_size; + rocsparse_int cutoff = (m / csrsv->max_depth < 32) ? 2560 : 81920; + for(rocsparse_int level = 0; level < csrsv->max_depth; ++level) + { + if(level != 0 && running_total == 0) + { + depth_offset = csrsv->rows_per_level[level - 1]; + } + running_total = csrsv->rows_per_level[level] - depth_offset; + if(running_total >= cutoff) + { + dim3 csrsv_blocks(((running_total + (running_total % wf_per_wg)) * wf_size - 1) / +CSRSV_DIM + 1); + dim3 csrsv_threads(CSRSV_DIM); + + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + // rocsparse_pointer_mode_device + if(wf_size == 32) + { + hipLaunchKernelGGL((csrsv_device_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + depth_offset, + csrsv->zero_pivot, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else if(wf_size == 64) + { + hipLaunchKernelGGL((csrsv_device_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + depth_offset, + csrsv->zero_pivot, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else + { + return rocsparse_status_arch_mismatch; + } + } + else + { + // rocsparse_pointer_mode_host + if(wf_size == 32) + { + hipLaunchKernelGGL((csrsv_host_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + depth_offset, + csrsv->zero_pivot, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else if(wf_size == 64) + { + hipLaunchKernelGGL((csrsv_host_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + *alpha, 
+ csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + depth_offset, + csrsv->zero_pivot, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else + { + return rocsparse_status_arch_mismatch; + } + } + + running_total = 0; + } + } + if(running_total) + { + dim3 csrsv_blocks(((running_total + (running_total % wf_per_wg)) * wf_size - 1) / CSRSV_DIM ++ 1); + dim3 csrsv_threads(CSRSV_DIM); + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + // rocsparse_pointer_mode_device + if(wf_size == 32) + { + hipLaunchKernelGGL((csrsv_device_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + depth_offset, + csrsv->zero_pivot, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else if(wf_size == 64) + { + hipLaunchKernelGGL((csrsv_device_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + depth_offset, + csrsv->zero_pivot, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else + { + return rocsparse_status_arch_mismatch; + } + } + else + { + // rocsparse_pointer_mode_host + if(wf_size == 32) + { + hipLaunchKernelGGL((csrsv_host_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + depth_offset, + csrsv->zero_pivot, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else if(wf_size == 64) + { + hipLaunchKernelGGL((csrsv_host_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + depth_offset, + csrsv->zero_pivot, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else + { + return rocsparse_status_arch_mismatch; + } + } + } +#undef CSRSV_DIM +*/ #define CSRSV_DIM 
1024 dim3 csrsv_blocks((handle->wavefront_size * m - 1) / CSRSV_DIM + 1); dim3 csrsv_threads(CSRSV_DIM); -// TODO host dev ptr - if(handle->wavefront_size == 32) + if(handle->pointer_mode == rocsparse_pointer_mode_device) { - hipLaunchKernelGGL((csrsv_host_pointer), - csrsv_blocks, - csrsv_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - y, - d_done_array, - csrsv->row_map, - 0, - descr->base, - descr->fill_mode, - descr->diag_type); - } - else if(handle->wavefront_size == 64) - { - hipLaunchKernelGGL((csrsv_host_pointer), - csrsv_blocks, - csrsv_threads, - 0, - stream, - m, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - x, - y, - d_done_array, - csrsv->row_map, - 0, - descr->base, - descr->fill_mode, - descr->diag_type); + // rocsparse_pointer_mode_device + if(handle->wavefront_size == 32) + { + hipLaunchKernelGGL((csrsv_device_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + 0, + csrsv->zero_pivot, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else if(handle->wavefront_size == 64) + { + hipLaunchKernelGGL((csrsv_device_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + 0, + csrsv->zero_pivot, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else + { + return rocsparse_status_arch_mismatch; + } } else { - return rocsparse_status_arch_mismatch; + // rocsparse_pointer_mode_host + if(handle->wavefront_size == 32) + { + hipLaunchKernelGGL((csrsv_host_pointer), + csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + 0, + csrsv->zero_pivot, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else if(handle->wavefront_size == 64) + { + hipLaunchKernelGGL((csrsv_host_pointer), + 
csrsv_blocks, + csrsv_threads, + 0, + stream, + m, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + x, + y, + d_done_array, + csrsv->row_map, + 0, + csrsv->zero_pivot, + descr->base, + descr->fill_mode, + descr->diag_type); + } + else + { + return rocsparse_status_arch_mismatch; + } } #undef CSRSV_DIM diff --git a/library/src/level2/rocsparse_hybmv.hpp b/library/src/level2/rocsparse_hybmv.hpp index 5d79eff7..50cded38 100644 --- a/library/src/level2/rocsparse_hybmv.hpp +++ b/library/src/level2/rocsparse_hybmv.hpp @@ -194,12 +194,9 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, // Beta is applied by ELL part, IF ell_nnz > 0 if(hyb->ell_nnz > 0) { - T one = static_cast(1); - T* coo_beta; + T* coo_beta = NULL; + rocsparse_one(handle, &coo_beta); - RETURN_IF_HIP_ERROR(hipMalloc((void**)&coo_beta, sizeof(T))); - RETURN_IF_HIP_ERROR( - hipMemcpy(coo_beta, &one, sizeof(T), hipMemcpyHostToDevice)); RETURN_IF_ROCSPARSE_ERROR(rocsparse_coomv_template(handle, trans, hyb->m, @@ -213,7 +210,6 @@ rocsparse_status rocsparse_hybmv_template(rocsparse_handle handle, x, coo_beta, y)); - RETURN_IF_HIP_ERROR(hipFree(coo_beta)); } else { diff --git a/library/src/level3/csrmm_device.h b/library/src/level3/csrmm_device.h index 504bf6f3..e5514f2a 100644 --- a/library/src/level3/csrmm_device.h +++ b/library/src/level3/csrmm_device.h @@ -15,13 +15,13 @@ static __device__ void csrmmnn_general_device(rocsparse_int M, rocsparse_int K, rocsparse_int nnz, T alpha, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const T* csr_val, - const T* B, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ B, rocsparse_int ldb, T beta, - T* C, + T* __restrict__ C, rocsparse_int ldc, rocsparse_index_base idx_base) { @@ -40,8 +40,8 @@ static __device__ void csrmmnn_general_device(rocsparse_int M, for(rocsparse_int row = gid / WF_SIZE; row < M; row += nwf) { - 
rocsparse_int row_start = __ldg(csr_row_ptr + row) - idx_base; - rocsparse_int row_end = __ldg(csr_row_ptr + row + 1) - idx_base; + rocsparse_int row_start = csr_row_ptr[row] - idx_base; + rocsparse_int row_end = csr_row_ptr[row + 1] - idx_base; T sum = static_cast(0); @@ -51,15 +51,14 @@ static __device__ void csrmmnn_general_device(rocsparse_int M, __syncthreads(); - shared_col[wid][lid] = (k < row_end) ? __ldg(csr_col_ind + k) - idx_base : 0; - shared_val[wid][lid] = - (k < row_end) ? alpha * __ldg(csr_val + k) : static_cast(0); + shared_col[wid][lid] = (k < row_end) ? csr_col_ind[k] - idx_base : 0; + shared_val[wid][lid] = (k < row_end) ? alpha * csr_val[k] : static_cast(0); __syncthreads(); for(rocsparse_int i = 0; i < WF_SIZE && col < N; ++i) { - sum += shared_val[wid][i] * __ldg(&B[shared_col[wid][i] + colB]); + sum = fma(shared_val[wid][i], B[shared_col[wid][i] + colB], sum); } } @@ -71,7 +70,7 @@ static __device__ void csrmmnn_general_device(rocsparse_int M, } else { - C[row + colC] = __ldg(&C[row + colC]) * beta + sum; + C[row + colC] = fma(beta, C[row + colC], sum); } } } @@ -85,13 +84,13 @@ static __device__ void csrmmnt_general_device(rocsparse_int offset, rocsparse_int K, rocsparse_int nnz, T alpha, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - const T* csr_val, - const T* B, + const rocsparse_int* __restrict__ csr_row_ptr, + const rocsparse_int* __restrict__ csr_col_ind, + const T* __restrict__ csr_val, + const T* __restrict__ B, rocsparse_int ldb, T beta, - T* C, + T* __restrict__ C, rocsparse_int ldc, rocsparse_index_base idx_base) { @@ -109,8 +108,8 @@ static __device__ void csrmmnt_general_device(rocsparse_int offset, __shared__ rocsparse_int shared_col[BLOCKSIZE / WF_SIZE][WF_SIZE]; __shared__ T shared_val[BLOCKSIZE / WF_SIZE][WF_SIZE]; - rocsparse_int row_start = __ldg(csr_row_ptr + row) - idx_base; - rocsparse_int row_end = __ldg(csr_row_ptr + row + 1) - idx_base; + rocsparse_int row_start = csr_row_ptr[row] - 
idx_base; + rocsparse_int row_end = csr_row_ptr[row + 1] - idx_base; for(rocsparse_int l = offset; l < ncol; l += WF_SIZE) { @@ -123,16 +122,15 @@ static __device__ void csrmmnt_general_device(rocsparse_int offset, __syncthreads(); - shared_col[wid][lid] = (k < row_end) ? N * (__ldg(csr_col_ind + k) - idx_base) : 0; - shared_val[wid][lid] = - (k < row_end) ? alpha * __ldg(csr_val + k) : static_cast(0); + shared_col[wid][lid] = (k < row_end) ? N * (csr_col_ind[k] - idx_base) : 0; + shared_val[wid][lid] = (k < row_end) ? alpha * csr_val[k] : static_cast(0); __syncthreads(); for(rocsparse_int i = 0; i < WF_SIZE; ++i) { T val_B = (col < ncol) ? __ldg(B + col + shared_col[wid][i]) : static_cast(0); - sum += shared_val[wid][i] * val_B; + sum = fma(shared_val[wid][i], val_B, sum); } } @@ -144,7 +142,7 @@ static __device__ void csrmmnt_general_device(rocsparse_int offset, } else { - C[row + col * ldc] = beta * __ldg(C + row + col * ldc) + sum; + C[row + col * ldc] = fma(beta, C[row + col * ldc], sum); } } } diff --git a/library/src/level3/rocsparse_csrmm.hpp b/library/src/level3/rocsparse_csrmm.hpp index 3f66e1c3..d99bcd29 100644 --- a/library/src/level3/rocsparse_csrmm.hpp +++ b/library/src/level3/rocsparse_csrmm.hpp @@ -81,21 +81,21 @@ __launch_bounds__(256) __global__ rocsparse_index_base idx_base) { csrmmnt_general_device(offset, - ncol, - m, - n, - k, - nnz, - alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - B, - ldb, - beta, - C, - ldc, - idx_base); + ncol, + m, + n, + k, + nnz, + alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + beta, + C, + ldc, + idx_base); } template @@ -123,21 +123,21 @@ __launch_bounds__(256) __global__ } csrmmnt_general_device(offset, - ncol, - m, - n, - k, - nnz, - *alpha, - csr_row_ptr, - csr_col_ind, - csr_val, - B, - ldb, - *beta, - C, - ldc, - idx_base); + ncol, + m, + n, + k, + nnz, + *alpha, + csr_row_ptr, + csr_col_ind, + csr_val, + B, + ldb, + *beta, + C, + ldc, + idx_base); } template diff --git 
a/library/src/precond/csrilu0_device.h b/library/src/precond/csrilu0_device.h index 5f6c9363..1e7677e6 100644 --- a/library/src/precond/csrilu0_device.h +++ b/library/src/precond/csrilu0_device.h @@ -1,5 +1,6 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #pragma once @@ -24,140 +25,141 @@ __global__ void csrilu0_hash_kernel(rocsparse_int m, rocsparse_int lid = tid & (WF_SIZE - 1); rocsparse_int idx = gid / WF_SIZE; -int wid = tid / WF_SIZE; + __shared__ rocsparse_int stable[BLOCKSIZE * HASH]; + __shared__ rocsparse_int sdata[BLOCKSIZE * HASH]; - __shared__ rocsparse_int stable[BLOCKSIZE / WF_SIZE][WF_SIZE * HASH];//[BLOCKSIZE * HASH]; - __shared__ rocsparse_int sdata[BLOCKSIZE / WF_SIZE][WF_SIZE * HASH]; + // Pointer to each wavefronts shared data + rocsparse_int* table = &stable[(tid / WF_SIZE) * WF_SIZE * HASH]; + rocsparse_int* data = &sdata[(tid / WF_SIZE) * WF_SIZE * HASH]; - for(rocsparse_int j = 0; j < HASH; ++j) + // Initialize hash table with -1 + for(rocsparse_int j = lid; j < WF_SIZE * HASH; j += WF_SIZE) { - stable[wid][lid + j * WF_SIZE] = -1; + table[j] = -1; } - if (idx >= m) + // Do not run out of bounds + if(idx >= m) { - return; + return; } + // Current row this wavefront is working on rocsparse_int row = map[idx]; + // Diagonal entry point of the current row rocsparse_int row_diag = csr_diag_ind[row]; - - // Row has structural zero diagonal, skip - if(row_diag == -1) - { - if(lid == 0) - { - atomicMin(zero_pivot, row); -#if defined(__HIP_PLATFORM_HCC__) - __atomic_store_n(&done[row], 1, __ATOMIC_RELAXED); -#elif defined(__HIP_PLATFORM_NVCC__) - atomicOr(&done[row], 1); -#endif - } - - return; - } - rocsparse_int row_begin = csr_row_ptr[row] - idx_base; rocsparse_int row_end = csr_row_ptr[row + 1] - idx_base; // Fill hash table -// rocsparse_int* table = &stable[(tid / WF_SIZE) * WF_SIZE * 
HASH]; -// rocsparse_int* data = &sdata[(tid / WF_SIZE) * WF_SIZE * HASH]; - rocsparse_int* table = stable[wid]; - rocsparse_int* data = sdata[wid]; - + // Loop over columns of current row and fill hash table with row dependencies + // Each lane processes one entry for(rocsparse_int j = row_begin + lid; j < row_end; j += WF_SIZE) { // Insert key into hash table - int key = csr_col_ind[j]; - int hash = (key * 103) & (WF_SIZE * HASH - 1); - + rocsparse_int key = csr_col_ind[j]; + // Compute hash + rocsparse_int hash = (key * 103) & (WF_SIZE * HASH - 1); + + // Hash operation while(true) { if(table[hash] == key) { + // key is already inserted, done break; } else if(atomicCAS(&table[hash], -1, key) == -1) { + // inserted key into the table, done data[hash] = j; break; } else { + // collision, compute new hash hash = (hash + 1) & (WF_SIZE * HASH - 1); } } } + // Loop over column of current row for(rocsparse_int j = row_begin; j < row_diag; ++j) { - rocsparse_int local_col = csr_col_ind[j] - idx_base; + // Column index currently being processes + rocsparse_int local_col = csr_col_ind[j] - idx_base; + // Corresponding value T local_val = csr_val[j]; - rocsparse_int local_end = csr_row_ptr[local_col + 1] - idx_base; + // End of the row that corresponds to local_col + rocsparse_int local_end = csr_row_ptr[local_col + 1] - idx_base; + // Diagonal entry point of row local_col rocsparse_int local_diag = csr_diag_ind[local_col]; - // Row depends on structural zero diagonal + // Structural zero pivot, do not process this row if(local_diag == -1) { - if(lid == 0) - { - atomicMin(zero_pivot, local_col); - } - - break; + local_diag = local_end - 1; } + // Spin loop until dependency has been resolved rocsparse_int local_done = 0; while(!local_done) { #if defined(__HIP_PLATFORM_HCC__) - local_done = __atomic_load_n(&done[local_col], __ATOMIC_RELAXED); + local_done = __atomic_load_n(&done[local_col], __ATOMIC_ACQUIRE); #elif defined(__HIP_PLATFORM_NVCC__) - local_done = 
atomicOr(&done[local_col], 0x0); + local_done = atomicOr(&done[local_col], 0); #endif } +// Load diagonal entry #if defined(__HIP_PLATFORM_HCC__) T diag_val; - __atomic_load(&csr_val[local_diag], &diag_val, __ATOMIC_RELAXED); + __atomic_load(&csr_val[local_diag], &diag_val, __ATOMIC_ACQUIRE); #elif defined(__HIP_PLATFORM_NVCC__) - T diag_val = csr_val[local_diag]; + T diag_val = csr_val[local_diag]; #endif // Row has numerical zero diagonal - if(diag_val == 0.0) + if(diag_val == static_cast(0)) { if(lid == 0) { + // We are looking for the first zero pivot atomicMin(zero_pivot, local_col); } + // Skip this row if it has a zero pivot break; } csr_val[j] = local_val /= diag_val; + // Loop over the row the current column index depends on + // Each lane processes one entry for(rocsparse_int k = local_diag + 1 + lid; k < local_end; k += WF_SIZE) { // Get value from hash table - int key = csr_col_ind[k]; - int hash = (key * 103) & (WF_SIZE * HASH - 1); + rocsparse_int key = csr_col_ind[k]; + // Compute hash + rocsparse_int hash = (key * 103) & (WF_SIZE * HASH - 1); + // Hash operation while(true) { - int val = table[hash]; + rocsparse_int val = table[hash]; if(val == -1) { + // No entry for the key, done break; } else if(val == key) { +// Entry found, do ILU computation #if defined(__HIP_PLATFORM_HCC__) T val_k; - __atomic_load(&csr_val[k], &val_k, __ATOMIC_RELAXED); + __atomic_load(&csr_val[k], &val_k, __ATOMIC_ACQUIRE); #elif defined(__HIP_PLATFORM_NVCC__) T val_k = csr_val[k]; #endif @@ -165,6 +167,7 @@ int wid = tid / WF_SIZE; break; } + // Collision, compute new hash hash = (hash + 1) & (WF_SIZE * HASH - 1); } } @@ -172,8 +175,9 @@ int wid = tid / WF_SIZE; if(lid == 0) { +// Lane 0 write "we are done" flag #if defined(__HIP_PLATFORM_HCC__) - __atomic_store_n(&done[row], 1, __ATOMIC_RELAXED); + __atomic_store_n(&done[row], 1, __ATOMIC_RELEASE); #elif defined(__HIP_PLATFORM_NVCC__) atomicOr(&done[row], 1); #endif @@ -196,91 +200,85 @@ __global__ void 
csrilu0_binsearch_kernel(rocsparse_int m, rocsparse_int lid = tid & (WF_SIZE - 1); rocsparse_int idx = gid / WF_SIZE; - if (idx >= m) + // Do not run out of bounds + if(idx >= m) { - return; + return; } + // Current row this wavefront is working on rocsparse_int row = map[idx]; + // Diagonal entry point of the current row rocsparse_int row_diag = csr_diag_ind[row]; - - // Row has structural zero diagonal, skip - if(row_diag == -1) - { - if(lid == 0) - { - atomicMin(zero_pivot, row); -#if defined(__HIP_PLATFORM_HCC__) - __atomic_store_n(&done[row], 1, __ATOMIC_RELAXED); -#elif defined(__HIP_PLATFORM_NVCC__) - atomicOr(&done[row], 1); -#endif - } - - return; - } - rocsparse_int row_begin = csr_row_ptr[row] - idx_base; rocsparse_int row_end = csr_row_ptr[row + 1] - idx_base; + // Loop over column of current row for(rocsparse_int j = row_begin; j < row_diag; ++j) { - rocsparse_int local_col = csr_col_ind[j] - idx_base; + // Column index currently being processes + rocsparse_int local_col = csr_col_ind[j] - idx_base; + // Corresponding value T local_val = csr_val[j]; - rocsparse_int local_end = csr_row_ptr[local_col + 1] - idx_base; + // End of the row that corresponds to local_col + rocsparse_int local_end = csr_row_ptr[local_col + 1] - idx_base; + // Diagonal entry point of row local_col rocsparse_int local_diag = csr_diag_ind[local_col]; - // Row depends on structural zero diagonal + // Structural zero pivot, do not process this row if(local_diag == -1) { - if(lid == 0) - { - atomicMin(zero_pivot, local_col); - } - - break; + local_diag = local_end - 1; } + // Spin loop until dependency has been resolved rocsparse_int local_done = 0; while(!local_done) { #if defined(__HIP_PLATFORM_HCC__) - local_done = __atomic_load_n(&done[local_col], __ATOMIC_RELAXED); + local_done = __atomic_load_n(&done[local_col], __ATOMIC_ACQUIRE); #elif defined(__HIP_PLATFORM_NVCC__) - local_done = atomicOr(&done[local_col], 0x0); + local_done = atomicOr(&done[local_col], 0); #endif } +// Load 
diagonal entry #if defined(__HIP_PLATFORM_HCC__) T diag_val; - __atomic_load(&csr_val[local_diag], &diag_val, __ATOMIC_RELAXED); + __atomic_load(&csr_val[local_diag], &diag_val, __ATOMIC_ACQUIRE); #elif defined(__HIP_PLATFORM_NVCC__) // TODO - volatile T diag_val = csr_val[local_diag]; + volatile T diag_val = csr_val[local_diag]; #endif // Row has numerical zero diagonal - if(diag_val == 0.0) + if(diag_val == static_cast(0)) { if(lid == 0) { + // We are looking for the first zero pivot atomicMin(zero_pivot, local_col); } + // Skip this row if it has a zero pivot break; } csr_val[j] = local_val /= diag_val; + // Loop over the row the current column index depends on + // Each lane processes one entry rocsparse_int l = j + 1; for(rocsparse_int k = local_diag + 1 + lid; k < local_end; k += WF_SIZE) { - rocsparse_int r = row_end - 1; - rocsparse_int m = (r + l) >> 1; + // Perform a binary search to find matching columns + rocsparse_int r = row_end - 1; + rocsparse_int m = (r + l) >> 1; rocsparse_int col_j = csr_col_ind[m]; - + rocsparse_int col_k = csr_col_ind[k]; - + + // Binary search while(l < r) { if(col_j < col_k) @@ -291,16 +289,18 @@ __global__ void csrilu0_binsearch_kernel(rocsparse_int m, { r = m; } - - m = (r + l) >> 1; + + m = (r + l) >> 1; col_j = csr_col_ind[m]; } - + + // Check if a match has been found if(col_j == col_k) { +// If a match has been found, do ILU computation #if defined(__HIP_PLATFORM_HCC__) T val_k; - __atomic_load(&csr_val[k], &val_k, __ATOMIC_RELAXED); + __atomic_load(&csr_val[k], &val_k, __ATOMIC_ACQUIRE); #elif defined(__HIP_PLATFORM_NVCC__) volatile T val_k = csr_val[k]; #endif @@ -312,12 +312,13 @@ __global__ void csrilu0_binsearch_kernel(rocsparse_int m, if(lid == 0) { +// Lane 0 write "we are done" flag #if defined(__HIP_PLATFORM_HCC__) - __atomic_store_n(&done[row], 1, __ATOMIC_RELAXED); + __atomic_store_n(&done[row], 1, __ATOMIC_RELEASE); #elif defined(__HIP_PLATFORM_NVCC__) atomicOr(&done[row], 1); #endif } } -#endif // 
CSRMV_DEVICE_H +#endif // CSRILU0_DEVICE_H diff --git a/library/src/precond/rocsparse_csrilu0.cpp b/library/src/precond/rocsparse_csrilu0.cpp index 29d75d93..a1754985 100644 --- a/library/src/precond/rocsparse_csrilu0.cpp +++ b/library/src/precond/rocsparse_csrilu0.cpp @@ -1,194 +1,130 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. + * * ************************************************************************ */ #include "definitions.h" #include "rocsparse.h" #include "rocsparse_csrilu0.hpp" -#include "../level2/rocsparse_csrsv.hpp" - -#include - /* * =========================================================================== * C wrapper * =========================================================================== */ -extern "C" rocsparse_status rocsparse_csrilu0_buffer_size(rocsparse_handle handle, - rocsparse_int m, - rocsparse_int nnz, - const rocsparse_mat_descr descr, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - rocsparse_mat_info info, - size_t* buffer_size) +extern "C" rocsparse_status rocsparse_scsrilu0_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size) +{ + return rocsparse_scsrsv_buffer_size(handle, + rocsparse_operation_none, + m, + nnz, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + buffer_size); +} + +extern "C" rocsparse_status rocsparse_dcsrilu0_buffer_size(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + size_t* buffer_size) +{ + return rocsparse_dcsrsv_buffer_size(handle, + rocsparse_operation_none, + m, + nnz, + descr, + csr_val, + csr_row_ptr, + 
csr_col_ind, + info, + buffer_size); +} + +extern "C" rocsparse_status rocsparse_scsrilu0_analysis(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const float* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer) { - return rocsparse_csrsv_buffer_size(handle, - rocsparse_operation_none, - m, - nnz, - descr, - csr_row_ptr, - csr_col_ind, - info, - buffer_size); + return rocsparse_csrilu0_analysis_template(handle, + m, + nnz, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + analysis, + solve, + temp_buffer); +} + +extern "C" rocsparse_status rocsparse_dcsrilu0_analysis(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const double* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer) +{ + return rocsparse_csrilu0_analysis_template(handle, + m, + nnz, + descr, + csr_val, + csr_row_ptr, + csr_col_ind, + info, + analysis, + solve, + temp_buffer); } -extern "C" rocsparse_status rocsparse_csrilu0_analysis(rocsparse_handle handle, - rocsparse_int m, - rocsparse_int nnz, - const rocsparse_mat_descr descr, - const rocsparse_int* csr_row_ptr, - const rocsparse_int* csr_col_ind, - rocsparse_mat_info info, - rocsparse_solve_policy solve, - rocsparse_analysis_policy analysis, - void* temp_buffer) +extern "C" rocsparse_status rocsparse_csrilu0_clear(rocsparse_handle handle, + rocsparse_mat_info info) { - // Check for valid handle + // Check for valid handle and matrix descriptor if(handle == nullptr) { return rocsparse_status_invalid_handle; } - else if(descr == nullptr) - { - return rocsparse_status_invalid_pointer; - } else if(info == nullptr) { return 
rocsparse_status_invalid_pointer; } // Logging - log_trace(handle, - "rocsparse_csrilu0_analysis", - m, - nnz, - (const void*&)descr, - (const void*&)csr_row_ptr, - (const void*&)csr_col_ind, - (const void*&)info, - solve, - analysis); - - // Check index base - if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) - { - return rocsparse_status_invalid_value; - } - if(descr->type != rocsparse_matrix_type_general) - { - // TODO - return rocsparse_status_not_implemented; - } - - // Check analysis policy - if(analysis != rocsparse_analysis_policy_reuse && analysis != rocsparse_analysis_policy_force) - { - return rocsparse_status_invalid_value; - } - - // Check solve policy - if(solve != rocsparse_solve_policy_auto) - { - return rocsparse_status_invalid_value; - } - - // Check sizes - if(m < 0) - { - return rocsparse_status_invalid_size; - } - else if(nnz < 0) - { - return rocsparse_status_invalid_size; - } - - // Check pointer arguments - if(csr_row_ptr == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(csr_col_ind == nullptr) - { - return rocsparse_status_invalid_pointer; - } - else if(temp_buffer == nullptr) - { - return rocsparse_status_invalid_pointer; - } - - // Quick return if possible - if(m == 0 || nnz == 0) - { - return rocsparse_status_success; - } - - // Differentiate the analysis policies - if(analysis == rocsparse_analysis_policy_reuse) - { - // We try to re-use already analyzed lower part, if available. - // It is the user's responsibility that this data is still valid, - // since he passed the 'reuse' flag. 
- - // If csrilu0 meta data is already available, do nothing - if(info->csrilu0_info != nullptr) - { - return rocsparse_status_success; - } - - // Check for other lower analysis meta data - rocsparse_csrtr_info reuse = nullptr; - - // csrsv_lower meta data - if(info->csrsv_lower_info != nullptr) - { - reuse = info->csrsv_lower_info; - } - - // TODO add more crossover data here - - - - - // If data has been found, use it - if(reuse != nullptr) - { - info->csrilu0_info = reuse; + log_trace(handle, "rocsparse_csrilu0_clear", (const void*&)info); - return rocsparse_status_success; - } - } - - // User is explicitly asking to force a re-analysis, or no valid data has been - // found to be re-used. - - // Clear csrilu0 info - RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrilu0_info)); - - // Create csrilu0 info - RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_csrtr_info(&info->csrilu0_info)); - - // Perform analysis - RETURN_IF_ROCSPARSE_ERROR(rocsparse_csrtr_analysis(handle, - rocsparse_operation_none, - m, - nnz, - descr, - csr_row_ptr, - csr_col_ind, - info->csrilu0_info, - temp_buffer)); - - return rocsparse_status_success; -} - -extern "C" rocsparse_status rocsparse_csrilu0_clear(rocsparse_mat_info info) -{ // If meta data is shared, do not delete anything - if(info->csrilu0_info == info->csrsv_lower_info) + if(info->csrilu0_info == info->csrsv_lower_info || info->csrilu0_info == info->csrsv_upper_info) { info->csrilu0_info = nullptr; @@ -212,16 +148,8 @@ extern "C" rocsparse_status rocsparse_scsrilu0(rocsparse_handle handle, rocsparse_solve_policy policy, void* temp_buffer) { - return rocsparse_csrilu0_template(handle, - m, - nnz, - descr, - csr_val, - csr_row_ptr, - csr_col_ind, - info, - policy, - temp_buffer); + return rocsparse_csrilu0_template( + handle, m, nnz, descr, csr_val, csr_row_ptr, csr_col_ind, info, policy, temp_buffer); } extern "C" rocsparse_status rocsparse_dcsrilu0(rocsparse_handle handle, @@ -235,14 +163,93 @@ extern "C" 
rocsparse_status rocsparse_dcsrilu0(rocsparse_handle handle, rocsparse_solve_policy policy, void* temp_buffer) { - return rocsparse_csrilu0_template(handle, - m, - nnz, - descr, - csr_val, - csr_row_ptr, - csr_col_ind, - info, - policy, - temp_buffer); + return rocsparse_csrilu0_template( + handle, m, nnz, descr, csr_val, csr_row_ptr, csr_col_ind, info, policy, temp_buffer); +} + +extern "C" rocsparse_status rocsparse_csrilu0_zero_pivot(rocsparse_handle handle, + rocsparse_mat_info info, + rocsparse_int* position) +{ + // Check for valid handle and matrix descriptor + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging + log_trace(handle, "rocsparse_csrilu0_zero_pivot", (const void*&)info, (const void*&)position); + + // Check pointer arguments + if(position == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Stream + hipStream_t stream = handle->stream; + + // If m == 0 || nnz == 0 it can happen, that info structure is not created. + // In this case, always return -1. 
+ if(info->csrilu0_info == nullptr) + { + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + RETURN_IF_HIP_ERROR(hipMemsetAsync(position, 255, sizeof(rocsparse_int), stream)); + } + else + { + *position = -1; + } + + return rocsparse_status_success; + } + + // Differentiate between pointer modes + if(handle->pointer_mode == rocsparse_pointer_mode_device) + { + // rocsparse_pointer_mode_device + rocsparse_int pivot; + + RETURN_IF_HIP_ERROR(hipMemcpy( + &pivot, info->csrilu0_info->zero_pivot, sizeof(rocsparse_int), hipMemcpyDeviceToHost)); + + if(pivot == std::numeric_limits::max()) + { + RETURN_IF_HIP_ERROR(hipMemsetAsync(position, 255, sizeof(rocsparse_int), stream)); + } + else + { + RETURN_IF_HIP_ERROR(hipMemcpy(position, + info->csrilu0_info->zero_pivot, + sizeof(rocsparse_int), + hipMemcpyDeviceToDevice)); + + return rocsparse_status_zero_pivot; + } + } + else + { + // rocsparse_pointer_mode_host + RETURN_IF_HIP_ERROR(hipMemcpy(position, + info->csrilu0_info->zero_pivot, + sizeof(rocsparse_int), + hipMemcpyDeviceToHost)); + + // If no zero pivot is found, set -1 + if(*position == std::numeric_limits::max()) + { + *position = -1; + } + else + { + return rocsparse_status_zero_pivot; + } + } + + return rocsparse_status_success; } diff --git a/library/src/precond/rocsparse_csrilu0.hpp b/library/src/precond/rocsparse_csrilu0.hpp index 54c4bebb..05f0ef75 100644 --- a/library/src/precond/rocsparse_csrilu0.hpp +++ b/library/src/precond/rocsparse_csrilu0.hpp @@ -1,17 +1,173 @@ /* ************************************************************************ * Copyright 2018 Advanced Micro Devices, Inc. 
+ * * ************************************************************************ */ #pragma once #ifndef ROCSPARSE_CSRILU0_HPP #define ROCSPARSE_CSRILU0_HPP +#include "definitions.h" #include "rocsparse.h" #include "utility.h" #include "csrilu0_device.h" +#include "../level2/rocsparse_csrsv.hpp" #include +template +rocsparse_status rocsparse_csrilu0_analysis_template(rocsparse_handle handle, + rocsparse_int m, + rocsparse_int nnz, + const rocsparse_mat_descr descr, + const T* csr_val, + const rocsparse_int* csr_row_ptr, + const rocsparse_int* csr_col_ind, + rocsparse_mat_info info, + rocsparse_analysis_policy analysis, + rocsparse_solve_policy solve, + void* temp_buffer) +{ + // Check for valid handle + if(handle == nullptr) + { + return rocsparse_status_invalid_handle; + } + else if(descr == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(info == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Logging + log_trace(handle, + replaceX("rocsparse_Xcsrilu0_analysis"), + m, + nnz, + (const void*&)descr, + (const void*&)csr_val, + (const void*&)csr_row_ptr, + (const void*&)csr_col_ind, + (const void*&)info, + solve, + analysis); + + // Check index base + if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) + { + return rocsparse_status_invalid_value; + } + if(descr->type != rocsparse_matrix_type_general) + { + // TODO + return rocsparse_status_not_implemented; + } + + // Check analysis policy + if(analysis != rocsparse_analysis_policy_reuse && analysis != rocsparse_analysis_policy_force) + { + return rocsparse_status_invalid_value; + } + + // Check solve policy + if(solve != rocsparse_solve_policy_auto) + { + return rocsparse_status_invalid_value; + } + + // Check sizes + if(m < 0) + { + return rocsparse_status_invalid_size; + } + else if(nnz < 0) + { + return rocsparse_status_invalid_size; + } + + // Check pointer arguments + if(csr_row_ptr == nullptr) + { + return 
rocsparse_status_invalid_pointer; + } + else if(csr_col_ind == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(csr_val == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(temp_buffer == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + // Quick return if possible + if(m == 0 || nnz == 0) + { + return rocsparse_status_success; + } + + // Differentiate the analysis policies + if(analysis == rocsparse_analysis_policy_reuse) + { + // We try to re-use already analyzed lower part, if available. + // It is the user's responsibility that this data is still valid, + // since he passed the 'reuse' flag. + + // If csrilu0 meta data is already available, do nothing + if(info->csrilu0_info != nullptr) + { + return rocsparse_status_success; + } + + // Check for other lower analysis meta data + rocsparse_csrtr_info reuse = nullptr; + + // csrsv_lower meta data + if(info->csrsv_lower_info != nullptr) + { + reuse = info->csrsv_lower_info; + } + + // TODO add more crossover data here + + // If data has been found, use it + if(reuse != nullptr) + { + info->csrilu0_info = reuse; + + return rocsparse_status_success; + } + } + + // User is explicitly asking to force a re-analysis, or no valid data has been + // found to be re-used. 
+ + // Clear csrilu0 info + RETURN_IF_ROCSPARSE_ERROR(rocsparse_destroy_csrtr_info(info->csrilu0_info)); + + // Create csrilu0 info + RETURN_IF_ROCSPARSE_ERROR(rocsparse_create_csrtr_info(&info->csrilu0_info)); + + // Perform analysis + RETURN_IF_ROCSPARSE_ERROR(rocsparse_csrtr_analysis(handle, + rocsparse_operation_none, + m, + nnz, + descr, + csr_row_ptr, + csr_col_ind, + info->csrilu0_info, + temp_buffer)); + + return rocsparse_status_success; +} + template rocsparse_status rocsparse_csrilu0_template(rocsparse_handle handle, rocsparse_int m, @@ -51,10 +207,7 @@ rocsparse_status rocsparse_csrilu0_template(rocsparse_handle handle, policy, (const void*&)temp_buffer); - log_bench(handle, - "./rocsparse-bench -f csrilu0 -r", - replaceX("X"), - "--mtx "); + log_bench(handle, "./rocsparse-bench -f csrilu0 -r", replaceX("X"), "--mtx "); // Check index base if(descr->base != rocsparse_index_base_zero && descr->base != rocsparse_index_base_one) @@ -104,21 +257,18 @@ rocsparse_status rocsparse_csrilu0_template(rocsparse_handle handle, // Stream hipStream_t stream = handle->stream; - - // Buffer char* ptr = reinterpret_cast(temp_buffer); - // zero pivot - rocsparse_int* d_zero_pivot = reinterpret_cast(ptr); + ptr += 256; + ptr += 256; ptr += 256; // done array rocsparse_int* d_done_array = reinterpret_cast(ptr); // Initialize buffers - RETURN_IF_HIP_ERROR(hipMemcpy(d_zero_pivot, &m, sizeof(rocsparse_int), hipMemcpyHostToDevice)); - RETURN_IF_HIP_ERROR(hipMemset(d_done_array, 0, sizeof(rocsparse_int) * m)); + RETURN_IF_HIP_ERROR(hipMemsetAsync(d_done_array, 0, sizeof(rocsparse_int) * m, stream)); #define CSRILU0_DIM 256 dim3 csrilu0_blocks((m * handle->wavefront_size - 1) / CSRILU0_DIM + 1); @@ -126,20 +276,108 @@ rocsparse_status rocsparse_csrilu0_template(rocsparse_handle handle, if(handle->wavefront_size == 32) { - hipLaunchKernelGGL((csrilu0_binsearch_kernel), - csrilu0_blocks, - csrilu0_threads, - 0, - stream, - m, - csr_row_ptr, - csr_col_ind, - csr_val, - 
info->csrilu0_info->csr_diag_ind, - d_done_array, - info->csrilu0_info->row_map, - d_zero_pivot, - descr->base); + if(info->csrilu0_info->max_nnz <= 32) + { + hipLaunchKernelGGL((csrilu0_hash_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + info->csrilu0_info->zero_pivot, + descr->base); + } + else if(info->csrilu0_info->max_nnz <= 64) + { + hipLaunchKernelGGL((csrilu0_hash_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + info->csrilu0_info->zero_pivot, + descr->base); + } + else if(info->csrilu0_info->max_nnz <= 128) + { + hipLaunchKernelGGL((csrilu0_hash_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + info->csrilu0_info->zero_pivot, + descr->base); + } + else if(info->csrilu0_info->max_nnz <= 256) + { + hipLaunchKernelGGL((csrilu0_hash_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + info->csrilu0_info->zero_pivot, + descr->base); + } + else if(info->csrilu0_info->max_nnz <= 512) + { + hipLaunchKernelGGL((csrilu0_hash_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + info->csrilu0_info->zero_pivot, + descr->base); + } + else + { + hipLaunchKernelGGL((csrilu0_binsearch_kernel), + csrilu0_blocks, + csrilu0_threads, + 0, + stream, + m, + csr_row_ptr, + csr_col_ind, + csr_val, + info->csrilu0_info->csr_diag_ind, + d_done_array, + info->csrilu0_info->row_map, + 
info->csrilu0_info->zero_pivot, + descr->base); + } } else if(handle->wavefront_size == 64) { @@ -157,7 +395,7 @@ rocsparse_status rocsparse_csrilu0_template(rocsparse_handle handle, info->csrilu0_info->csr_diag_ind, d_done_array, info->csrilu0_info->row_map, - d_zero_pivot, + info->csrilu0_info->zero_pivot, descr->base); } else if(info->csrilu0_info->max_nnz <= 128) @@ -174,7 +412,7 @@ rocsparse_status rocsparse_csrilu0_template(rocsparse_handle handle, info->csrilu0_info->csr_diag_ind, d_done_array, info->csrilu0_info->row_map, - d_zero_pivot, + info->csrilu0_info->zero_pivot, descr->base); } else if(info->csrilu0_info->max_nnz <= 256) @@ -191,7 +429,7 @@ rocsparse_status rocsparse_csrilu0_template(rocsparse_handle handle, info->csrilu0_info->csr_diag_ind, d_done_array, info->csrilu0_info->row_map, - d_zero_pivot, + info->csrilu0_info->zero_pivot, descr->base); } else if(info->csrilu0_info->max_nnz <= 512) @@ -208,7 +446,7 @@ rocsparse_status rocsparse_csrilu0_template(rocsparse_handle handle, info->csrilu0_info->csr_diag_ind, d_done_array, info->csrilu0_info->row_map, - d_zero_pivot, + info->csrilu0_info->zero_pivot, descr->base); } else if(info->csrilu0_info->max_nnz <= 1024) @@ -225,12 +463,11 @@ rocsparse_status rocsparse_csrilu0_template(rocsparse_handle handle, info->csrilu0_info->csr_diag_ind, d_done_array, info->csrilu0_info->row_map, - d_zero_pivot, + info->csrilu0_info->zero_pivot, descr->base); } else { - printf("standard kernel\n"); hipLaunchKernelGGL((csrilu0_binsearch_kernel), csrilu0_blocks, csrilu0_threads, @@ -243,7 +480,7 @@ rocsparse_status rocsparse_csrilu0_template(rocsparse_handle handle, info->csrilu0_info->csr_diag_ind, d_done_array, info->csrilu0_info->row_map, - d_zero_pivot, + info->csrilu0_info->zero_pivot, descr->base); } } @@ -252,14 +489,7 @@ rocsparse_status rocsparse_csrilu0_template(rocsparse_handle handle, return rocsparse_status_arch_mismatch; } #undef CSRILU0_DIM -/* -// TODO this is blocking somehow - int zero; - 
hipMemcpyAsync(&zero, d_zero_pivot, sizeof(int), hipMemcpyDeviceToHost, stream); - - if(zero != m) - printf("Zero pivot: %d\n", zero); -*/ + return rocsparse_status_success; } diff --git a/library/src/rocsparse_auxiliary.cpp b/library/src/rocsparse_auxiliary.cpp index 1aa1c097..ecb43ea3 100644 --- a/library/src/rocsparse_auxiliary.cpp +++ b/library/src/rocsparse_auxiliary.cpp @@ -172,6 +172,28 @@ rocsparse_status rocsparse_create_mat_descr(rocsparse_mat_descr* descr) } } +/******************************************************************************** + * \brief copy matrix descriptor + *******************************************************************************/ +rocsparse_status rocsparse_copy_mat_descr(rocsparse_mat_descr dest, const rocsparse_mat_descr src) +{ + if(dest == nullptr) + { + return rocsparse_status_invalid_pointer; + } + else if(src == nullptr) + { + return rocsparse_status_invalid_pointer; + } + + dest->type = src->type; + dest->fill_mode = src->fill_mode; + dest->diag_type = src->diag_type; + dest->base = src->base; + + return rocsparse_status_success; +} + /******************************************************************************** * \brief destroy matrix descriptor *******************************************************************************/ @@ -252,18 +274,8 @@ rocsparse_matrix_type rocsparse_get_mat_type(const rocsparse_mat_descr descr) return descr->type; } - - - - - - - - - - - -rocsparse_status rocsparse_set_mat_fill_mode(rocsparse_mat_descr descr, rocsparse_fill_mode fill_mode) +rocsparse_status rocsparse_set_mat_fill_mode(rocsparse_mat_descr descr, + rocsparse_fill_mode fill_mode) { // Check if descriptor is valid if(descr == nullptr) @@ -288,8 +300,8 @@ rocsparse_fill_mode rocsparse_get_mat_fill_mode(const rocsparse_mat_descr descr) return descr->fill_mode; } - -rocsparse_status rocsparse_set_mat_diag_type(rocsparse_mat_descr descr, rocsparse_diag_type diag_type) +rocsparse_status 
rocsparse_set_mat_diag_type(rocsparse_mat_descr descr, + rocsparse_diag_type diag_type) { // Check if descriptor is valid if(descr == nullptr) @@ -314,19 +326,6 @@ rocsparse_diag_type rocsparse_get_mat_diag_type(const rocsparse_mat_descr descr) return descr->diag_type; } - - - - - - - - - - - - - /******************************************************************************** * \brief rocsparse_create_hyb_mat is a structure holding the rocsparse HYB * matrix. It must be initialized using rocsparse_create_hyb_mat()