diff --git a/CMakeLists.txt b/CMakeLists.txt index 781b9587..02731e3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -85,7 +85,7 @@ option(BUILD_VERBOSE "Output additional build information" OFF) include(cmake/Dependencies.cmake) # Setup version -rocm_setup_version(VERSION 1.12.2) +rocm_setup_version(VERSION 1.12.3) set(rocsparse_SOVERSION 0.1) # AMD targets diff --git a/library/src/conversion/csr2csr_compress_device.h b/library/src/conversion/csr2csr_compress_device.h index 173eeec3..6406058b 100644 --- a/library/src/conversion/csr2csr_compress_device.h +++ b/library/src/conversion/csr2csr_compress_device.h @@ -98,7 +98,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ // Check if value in matrix will be kept const bool predicate - = rocsparse_abs(value) > std::real(tol) + = rocsparse_abs(value) > rocsparse_real(tol) && rocsparse_abs(value) > std::numeric_limits::min() ? true : false; diff --git a/library/src/conversion/nnz_compress_device.h b/library/src/conversion/nnz_compress_device.h index c86062e9..7af4860b 100644 --- a/library/src/conversion/nnz_compress_device.h +++ b/library/src/conversion/nnz_compress_device.h @@ -60,7 +60,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ for(rocsparse_int i = start_A + segment_lane_id; i < end_A; i += SEGMENT_SIZE) { const T value = csr_val_A[i]; - if(rocsparse_abs(value) > std::real(tol) + if(rocsparse_abs(value) > rocsparse_real(tol) && rocsparse_abs(value) > std::numeric_limits::min()) { count++; diff --git a/library/src/conversion/rocsparse_csr2bsr.cpp b/library/src/conversion/rocsparse_csr2bsr.cpp index 28af220a..bf966ad5 100644 --- a/library/src/conversion/rocsparse_csr2bsr.cpp +++ b/library/src/conversion/rocsparse_csr2bsr.cpp @@ -33,20 +33,20 @@ * =========================================================================== */ -#define launch_csr2bsr_nnz_fast_kernel(block_size, bsr_block_dim, segment_size) \ - hipLaunchKernelGGL(csr2bsr_nnz_fast_kernel, \ - dim3(grid_size), \ - dim3(block_size), \ - 0, \ - handle->stream, \ - m, \ - n, \ - mb, \ - nb, \ - csr_descr->base, \ - csr_row_ptr, \ - csr_col_ind, \ - bsr_descr->base, \ +#define launch_csr2bsr_nnz_fast_kernel(block_size, bsr_block_dim, segment_size) \ + hipLaunchKernelGGL((csr2bsr_nnz_fast_kernel), \ + dim3(grid_size), \ + dim3(block_size), \ + 0, \ + handle->stream, \ + m, \ + n, \ + mb, \ + nb, \ + csr_descr->base, \ + csr_row_ptr, \ + csr_col_ind, \ + bsr_descr->base, \ bsr_row_ptr); extern "C" rocsparse_status rocsparse_csr2bsr_nnz(rocsparse_handle handle, diff --git a/library/src/include/common.h b/library/src/include/common.h index 1ec07ef6..40b943fe 100644 --- a/library/src/include/common.h +++ b/library/src/include/common.h @@ -51,6 +51,11 @@ __device__ __forceinline__ double rocsparse_conj(const double& x) { return x; } __device__ __forceinline__ rocsparse_float_complex rocsparse_conj(const rocsparse_float_complex& x) { return std::conj(x); } __device__ __forceinline__ rocsparse_double_complex rocsparse_conj(const rocsparse_double_complex& x) { return std::conj(x); } +__device__ __forceinline__ float rocsparse_real(const float& x) { return x; } +__device__ __forceinline__ double rocsparse_real(const double& x) { return x; } +__device__ __forceinline__ float rocsparse_real(const rocsparse_float_complex& x) { return std::real(x); } +__device__ __forceinline__ double rocsparse_real(const rocsparse_double_complex& x) { return std::real(x); } + __device__ __forceinline__ float rocsparse_nontemporal_load(const float* ptr) { return __builtin_nontemporal_load(ptr); } __device__ __forceinline__ double rocsparse_nontemporal_load(const double* ptr) { return __builtin_nontemporal_load(ptr); } __device__ __forceinline__ rocsparse_float_complex rocsparse_nontemporal_load(const rocsparse_float_complex* ptr) { return rocsparse_float_complex(__builtin_nontemporal_load((const float*)ptr), __builtin_nontemporal_load((const float*)ptr + 1)); }