diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 66a81dda312..c1bda6bc2f3 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -4,16 +4,27 @@ on: [push] jobs: windows_cuda: - name: cuda102/release/shared (only compile) + strategy: + fail-fast: false + matrix: + config: + - {version: "10.2.89.20191206", name: "cuda102/release/shared"} + - {version: "latest", name: "cuda-latest/release/shared"} + name: msvc/${{ matrix.config.name }} (only compile) runs-on: [windows-latest] steps: - uses: actions/checkout@v2 - - name: setup + - name: setup (versioned) + if: matrix.config.version != 'latest' + run: | + choco install cuda --version=${{ matrix.config.version }} -y + - name: setup (latest) + if: matrix.config.version == 'latest' run: | - choco install cuda --version=10.2.89.20191206 -y + choco install cuda -y - name: configure run: | - $env:ChocolateyInstall = Convert-Path "$((Get-Command choco).Path)\..\.." + $env:ChocolateyInstall = Convert-Path "$((Get-Command choco).Path)\..\.." Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" refreshenv mkdir build @@ -21,7 +32,7 @@ jobs: $env:PATH="$pwd\windows_shared_library;$env:PATH" cmake -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF .. cmake --build . -j4 --config Release - + windows_ref: strategy: fail-fast: false diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 77598e32fe6..c2ab6d813a1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -339,17 +339,17 @@ build/cuda101/clang/all/release/static: - cuda - gpu -build/cuda101/intel/cuda/debug/static: +# clang-cuda with cuda 10.1 and friends +build/clang-cuda101/gcc/all/release/shared: <<: *default_build_with_test - image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019 + image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019 variables: <<: *default_variables - C_COMPILER: "icc" - CXX_COMPILER: "icpc" + CUDA_COMPILER: "clang++" BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: "Debug" - BUILD_SHARED_LIBS: "OFF" + BUILD_HIP: "ON" + BUILD_TYPE: "Release" CUDA_ARCH: 35 only: variables: @@ -359,17 +359,58 @@ build/cuda101/intel/cuda/debug/static: - cuda - gpu -# clang-cuda with cuda 10.1 and friends -build/clang-cuda101/gcc/all/release/shared: +build/clang-cuda101/clang/cuda/debug/static: <<: *default_build_with_test image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019 variables: <<: *default_variables + C_COMPILER: "clang" + CXX_COMPILER: "clang++" CUDA_COMPILER: "clang++" BUILD_OMP: "ON" BUILD_CUDA: "ON" + BUILD_TYPE: "Debug" + BUILD_SHARED_LIBS: "OFF" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - cuda + - gpu + +# cuda 10.2 and friends +build/cuda102/gcc/all/debug/shared: + <<: *default_build_with_test + image: localhost:5000/gko-cuda102-gnu8-llvm8-intel2019 + variables: + <<: *default_variables + BUILD_OMP: "ON" + BUILD_CUDA: "ON" + BUILD_HIP: "ON" + BUILD_TYPE: "Debug" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - cuda + - gpu + +build/cuda102/clang/all/release/static: + <<: *default_build_with_test + image: localhost:5000/gko-cuda102-gnu8-llvm8-intel2019 + variables: + <<: *default_variables + C_COMPILER: "clang" + CXX_COMPILER: "clang++" + BUILD_OMP: "ON" + BUILD_CUDA: "ON" BUILD_HIP: "ON" BUILD_TYPE: "Release" + BUILD_SHARED_LIBS: "OFF" CUDA_ARCH: 35 only: variables: @@ -379,14 +420,71 @@ build/clang-cuda101/gcc/all/release/shared: - cuda - gpu -build/clang-cuda101/clang/cuda/debug/static: 
+build/cuda102/intel/cuda/debug/static: <<: *default_build_with_test - image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019 + image: localhost:5000/gko-cuda102-gnu8-llvm8-intel2019 + variables: + <<: *default_variables + C_COMPILER: "icc" + CXX_COMPILER: "icpc" + BUILD_OMP: "ON" + BUILD_CUDA: "ON" + BUILD_TYPE: "Debug" + BUILD_SHARED_LIBS: "OFF" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - cuda + - gpu + +# cuda 11.0 and friends +build/cuda110/gcc/cuda/debug/shared: + <<: *default_build_with_test + image: localhost:5000/gko-cuda110-gnu9-llvm9-intel2020 + variables: + <<: *default_variables + BUILD_OMP: "ON" + BUILD_CUDA: "ON" + BUILD_TYPE: "Debug" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - cuda + - gpu + +build/cuda110/clang/cuda/release/static: + <<: *default_build_with_test + image: localhost:5000/gko-cuda110-gnu9-llvm9-intel2020 variables: <<: *default_variables C_COMPILER: "clang" CXX_COMPILER: "clang++" - CUDA_COMPILER: "clang++" + BUILD_OMP: "ON" + BUILD_CUDA: "ON" + BUILD_TYPE: "Release" + BUILD_SHARED_LIBS: "OFF" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - cuda + - gpu + +build/cuda110/intel/cuda/debug/static: + <<: *default_build_with_test + image: localhost:5000/gko-cuda110-gnu9-llvm9-intel2020 + variables: + <<: *default_variables + C_COMPILER: "icc" + CXX_COMPILER: "icpc" BUILD_OMP: "ON" BUILD_CUDA: "ON" BUILD_TYPE: "Debug" diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e9a76fdd6f..ff54c555a1f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,12 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.12) cmake_policy(SET CMP0074 NEW) endif() +# Let CAS handle the CUDA architecture flags (for now) +# Windows still gives CMP0104 warning if putting it in cuda. 
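+# (CMP0104, introduced in CMake 3.18, would otherwise require
+# CMAKE_CUDA_ARCHITECTURES to be set; keeping the policy at OLD leaves the
+# architecture flags entirely to CAS.)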
+if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + cmake_policy(SET CMP0104 OLD) +endif() + project(Ginkgo LANGUAGES C CXX VERSION 1.2.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures") set(Ginkgo_VERSION_TAG "develop") set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG}) diff --git a/benchmark/utils/cuda_linops.hpp b/benchmark/utils/cuda_linops.hpp index 7762a2439d2..08f5ea7b61d 100644 --- a/benchmark/utils/cuda_linops.hpp +++ b/benchmark/utils/cuda_linops.hpp @@ -117,6 +117,9 @@ class CuspBase : public gko::LinOp { }; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + template class CuspCsrmp @@ -298,6 +301,9 @@ class CuspCsrmm }; +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + template class CuspCsrEx @@ -388,7 +394,7 @@ class CuspCsrEx { #ifdef ALLOWMP algmode_ = CUSPARSE_ALG_MERGE_PATH; -#endif +#endif // ALLOWMP } private: @@ -400,6 +406,9 @@ class CuspCsrEx }; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + template = 10010) && \ - !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + +#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) template @@ -512,7 +525,7 @@ void cusp_generic_spmv(std::shared_ptr gpu_exec, &vecb, dense_b->get_num_stored_elements(), as_culibs_type(const_cast(db)), cu_value)); - size_t buffer_size = 0; + gko::size_type buffer_size = 0; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize( gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0], mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, @@ -680,22 +693,25 @@ class CuspGenericCoo }; -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && - // !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) } // namespace detail // Some shortcuts -using cusp_csr = detail::CuspCsr<>; using cusp_csrex = detail::CuspCsrEx<>; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +using cusp_csr = detail::CuspCsr<>; using cusp_csrmp = detail::CuspCsrmp<>; using cusp_csrmm = detail::CuspCsrmm<>; +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ - !(defined(_WIN32) || defined(__CYGWIN__)) +#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) using cusp_gcsr = detail::CuspGenericCsr<>; @@ -704,14 +720,17 @@ using cusp_gcsr2 = using cusp_gcoo = detail::CuspGenericCoo<>; -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && - // !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) using cusp_coo = detail::CuspHybrid; using cusp_ell = detail::CuspHybrid; using cusp_hybrid = detail::CuspHybrid<>; +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + #endif // GKO_BENCHMARK_UTILS_CUDA_LINOPS_HPP_ diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index c4379f834f7..ca5ad5f58be 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -60,8 +60,16 @@ std::string available_format = "hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, " "hybridminstorage" #ifdef HAS_CUDA - ", cusp_csr, cusp_csrex, 
cusp_csrmp, cusp_csrmm, cusp_coo, cusp_ell, " - "cusp_hybrid" + ", cusp_csr, cusp_csrex, cusp_coo" +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + ", cusp_csrmp, cusp_csrmm, cusp_ell, cusp_hybrid" +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) + ", cusp_gcsr, cusp_gcsr2, cusp_gcoo" +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) #endif // HAS_CUDA #ifdef HAS_HIP ", hipsp_csr, hipsp_csrmm, hipsp_coo, hipsp_ell, hipsp_hybrid" @@ -88,17 +96,23 @@ std::string format_description = "hybridminstorage: Hybrid uses the minimal storage to store the matrix." #ifdef HAS_CUDA "\n" - "cusp_hybrid: benchmark CuSPARSE spmv with cusparseXhybmv and an automatic " - "partition.\n" +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) "cusp_coo: use cusparseXhybmv with a CUSPARSE_HYB_PARTITION_USER " "partition.\n" - "cusp_ell: use cusparseXhybmv with CUSPARSE_HYB_PARTITION_MAX partition.\n" "cusp_csr: benchmark CuSPARSE with the cusparseXcsrmv function.\n" - "cusp_csrex: benchmark CuSPARSE with the cusparseXcsrmvEx function.\n" + "cusp_ell: use cusparseXhybmv with CUSPARSE_HYB_PARTITION_MAX partition.\n" "cusp_csrmp: benchmark CuSPARSE with the cusparseXcsrmv_mp function.\n" - "cusp_csrmm: benchmark CuSPARSE with the cusparseXcsrmv_mm function." -#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ - !(defined(_WIN32) || defined(__CYGWIN__)) + "cusp_csrmm: benchmark CuSPARSE with the cusparseXcsrmv_mm function.\n" + "cusp_hybrid: benchmark CuSPARSE spmv with cusparseXhybmv and an automatic " + "partition.\n" +#else // CUDA_VERSION >= 11000 + "cusp_csr: is an alias of cusp_gcsr.\n" + "cusp_coo: is an alias of cusp_gcoo.\n" +#endif + "cusp_csrex: benchmark CuSPARSE with the cusparseXcsrmvEx function." 
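+    // the generic cuSPARSE API (cusp_g*) is available from CUDA 10.1 on
+    // non-Windows platforms and unconditionally from CUDA 11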
+#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) "\n" "cusp_gcsr: benchmark CuSPARSE with the generic csr with default " "algorithm.\n" @@ -106,8 +120,8 @@ std::string format_description = "CUSPARSE_CSRMV_ALG2.\n" "cusp_gcoo: benchmark CuSPARSE with the generic coo with default " "algorithm.\n" -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && - // !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) #endif // HAS_CUDA #ifdef HAS_HIP "\n" @@ -187,20 +201,27 @@ const std::map( {"coo", read_matrix_from_data>}, {"ell", read_matrix_from_data>}, #ifdef HAS_CUDA +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) {"cusp_csr", read_matrix_from_data}, {"cusp_csrmp", read_matrix_from_data}, - {"cusp_csrex", read_matrix_from_data}, {"cusp_csrmm", read_matrix_from_data}, {"cusp_hybrid", read_matrix_from_data}, {"cusp_coo", read_matrix_from_data}, {"cusp_ell", read_matrix_from_data}, -#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ - !(defined(_WIN32) || defined(__CYGWIN__)) +#else // CUDA_VERSION >= 11000 + // cusp_csr, cusp_coo use the generic ones from CUDA 11 + {"cusp_csr", read_matrix_from_data}, + {"cusp_coo", read_matrix_from_data}, +#endif + {"cusp_csrex", read_matrix_from_data}, +#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) {"cusp_gcsr", read_matrix_from_data}, {"cusp_gcsr2", read_matrix_from_data}, {"cusp_gcoo", read_matrix_from_data}, -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && - // !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) #endif // HAS_CUDA #ifdef HAS_HIP {"hipsp_csr", read_matrix_from_data}, diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index d968f781609..f2ff8496e64 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -68,6 +68,9 @@ function(ginkgo_create_cuda_test test_name) PRIVATE "$" ) + cas_target_cuda_architectures(${TEST_TARGET_NAME} + ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES} + UNSUPPORTED "20" "21") set_target_properties(${TEST_TARGET_NAME} PROPERTIES OUTPUT_NAME ${test_name}) diff --git a/common/base/executor.hpp.inc b/common/base/executor.hpp.inc index adbe4d420f8..705404b314e 100644 --- a/common/base/executor.hpp.inc +++ b/common/base/executor.hpp.inc @@ -78,9 +78,9 @@ inline int convert_sm_ver_to_cores(int major, int minor) << "is undefined. The default value of " << nGpuArchCoresPerSM[index - 1].Cores << " Cores/SM is used." 
<< std::endl; -#endif +#endif // GKO_VERBOSE_LEVEL >= 1 return nGpuArchCoresPerSM[index - 1].Cores; } -} // namespace \ No newline at end of file +} // namespace diff --git a/common/components/atomic.hpp.inc b/common/components/atomic.hpp.inc index 20ed9b12c06..6ae0ac5e751 100644 --- a/common/components/atomic.hpp.inc +++ b/common/components/atomic.hpp.inc @@ -88,7 +88,7 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); #if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) // CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int); -#endif +#endif // !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE @@ -125,14 +125,16 @@ GKO_BIND_ATOMIC_ADD(float); // CUDA 8.0 starts suppoting 64-bit double atomicAdd on devices of compute // capability 6.x and higher GKO_BIND_ATOMIC_ADD(double); -#endif +#endif // !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || + // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) #if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \ (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) // CUDA 10.0 starts supporting 16-bit __half floating-point atomicAdd on devices // of compute capability 7.x and higher. GKO_BIND_ATOMIC_ADD(__half); -#endif +#endif // !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || + // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) #if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \ (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) @@ -142,7 +144,8 @@ GKO_BIND_ATOMIC_ADD(__half); // elements; the entire __half2 is not guaranteed to be atomic as a single // 32-bit access. GKO_BIND_ATOMIC_ADD(__half2); -#endif +#endif // !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || + // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) #endif // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) && diff --git a/common/components/segment_scan.hpp.inc b/common/components/segment_scan.hpp.inc index 3aac34832dc..9de87b6c702 100644 --- a/common/components/segment_scan.hpp.inc +++ b/common/components/segment_scan.hpp.inc @@ -38,7 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * performs suffix sum. Works on the source array and returns whether the thread * is the first element of its segment with same `ind`. 
*/ -template +template __device__ __forceinline__ bool segment_scan( const group::thread_block_tile &group, const IndexType ind, ValueType *__restrict__ val) @@ -60,4 +60,4 @@ __device__ __forceinline__ bool segment_scan( } } return head; -} \ No newline at end of file +} diff --git a/common/components/sorting.hpp.inc b/common/components/sorting.hpp.inc index cc042a08d90..611b99c5538 100644 --- a/common/components/sorting.hpp.inc +++ b/common/components/sorting.hpp.inc @@ -128,7 +128,8 @@ struct bitonic_warp { __forceinline__ __device__ static void merge(ValueType *els, bool reverse) { - auto tile = group::thread_block_tile{}; + auto tile = + group::tiled_partition(group::this_thread_block()); auto new_reverse = reverse != upper_half(); for (auto i = 0; i < num_local; ++i) { auto other = tile.shfl_xor(els[i], num_threads / 2); diff --git a/common/factorization/par_ilut_filter_kernels.hpp.inc b/common/factorization/par_ilut_filter_kernels.hpp.inc index 43addc2504b..2d520f33019 100644 --- a/common/factorization/par_ilut_filter_kernels.hpp.inc +++ b/common/factorization/par_ilut_filter_kernels.hpp.inc @@ -42,7 +42,8 @@ __device__ void abstract_filter_impl(const IndexType *row_ptrs, StepCallback step_cb, FinishCallback finish_cb) { - auto subwarp = group::thread_block_tile(); + auto subwarp = + group::tiled_partition(group::this_thread_block()); auto row = thread::get_subwarp_id_flat(); auto lane = subwarp.thread_rank(); auto lane_prefix_mask = (config::lane_mask_type(1) << lane) - 1; diff --git a/common/matrix/csr_kernels.hpp.inc b/common/matrix/csr_kernels.hpp.inc index 0ee4c34dad6..64b17a1016a 100644 --- a/common/matrix/csr_kernels.hpp.inc +++ b/common/matrix/csr_kernels.hpp.inc @@ -87,7 +87,7 @@ __device__ __forceinline__ void find_next_row( } -template __device__ __forceinline__ void warp_atomic_add( const group::thread_block_tile &group, bool force_write, @@ -105,7 +105,7 @@ __device__ __forceinline__ void warp_atomic_add( } -template __device__ __forceinline__ void process_window( const group::thread_block_tile &group, diff --git a/common/preconditioner/jacobi_generate_kernel.hpp.inc b/common/preconditioner/jacobi_generate_kernel.hpp.inc index da8fe668aa0..b402c94e5db 100644 --- a/common/preconditioner/jacobi_generate_kernel.hpp.inc +++ b/common/preconditioner/jacobi_generate_kernel.hpp.inc @@ -53,12 +53,12 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility( } // compute the condition number - auto perm = group.thread_rank(); - auto trans_perm = perm; + uint32 perm = group.thread_rank(); + uint32 trans_perm = perm; auto block_cond = compute_infinity_norm(group, block_size, block_size, row); - auto succeeded = - invert_block(group, block_size, row, perm, trans_perm); + auto succeeded = invert_block( + group, static_cast(block_size), row, perm, trans_perm); block_cond *= compute_infinity_norm(group, block_size, block_size, row); @@ -99,10 +99,10 @@ __global__ void __launch_bounds__(warps_per_block *config::warp_size) generate( const auto subwarp = group::tiled_partition(block); if (block_id < num_blocks) { const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; - auto perm = subwarp.thread_rank(); - auto trans_perm = subwarp.thread_rank(); - invert_block(subwarp, block_size, row, perm, - trans_perm); + uint32 perm = subwarp.thread_rank(); + uint32 trans_perm = subwarp.thread_rank(); + invert_block(subwarp, static_cast(block_size), + row, perm, trans_perm); copy_matrix( subwarp, block_size, row, 1, perm, trans_perm, block_data + 
storage_scheme.get_global_block_offset(block_id), @@ -138,11 +138,11 @@ __launch_bounds__(warps_per_block *config::warp_size) adaptive_generate( // compute inverse and figure out the correct precision const auto subwarp = group::tiled_partition(block); - const auto block_size = + const uint32 block_size = block_id < num_blocks ? block_ptrs[block_id + 1] - block_ptrs[block_id] : 0; - auto perm = subwarp.thread_rank(); - auto trans_perm = subwarp.thread_rank(); + uint32 perm = subwarp.thread_rank(); + uint32 trans_perm = subwarp.thread_rank(); auto prec_descriptor = ~uint32{}; if (block_id < num_blocks) { auto block_cond = compute_infinity_norm( diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 0822dee0174..6b5c623e856 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -151,11 +151,6 @@ target_link_libraries(ginkgo_cuda PRIVATE ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPA # Need to link against ginkgo_hip for the `raw_copy_to(HipExecutor ...)` method target_link_libraries(ginkgo_cuda PUBLIC ginkgo_hip) -# Let CAS handle the CUDA architecture flags (for now) -if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - cmake_policy(SET CMP0104 OLD) -endif() - cas_target_cuda_architectures(ginkgo_cuda ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES} UNSUPPORTED "20" "21") diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp index ed9f043f9ef..0cbb962bb15 100644 --- a/cuda/base/cusparse_bindings.hpp +++ b/cuda/base/cusparse_bindings.hpp @@ -87,6 +87,9 @@ template <> struct is_supported, int32> : std::true_type {}; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + #define GKO_BIND_CUSPARSE32_SPMV(ValueType, CusparseName) \ inline void spmv(cusparseHandle_t handle, cusparseOperation_t transA, \ int32 m, int32 n, int32 nnz, const ValueType *alpha, \ @@ -133,6 +136,41 @@ GKO_BIND_CUSPARSE64_SPMV(ValueType, detail::not_implemented); #undef GKO_BIND_CUSPARSE64_SPMV +#else // CUDA_VERSION >= 11000 + + +template +inline void spmv_buffersize(cusparseHandle_t handle, cusparseOperation_t opA, + const ValueType *alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const ValueType *beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, size_type *bufferSize) +{ + constexpr auto value_type = cuda_data_type(); + cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, + value_type, alg, bufferSize); +} + +template +inline void spmv(cusparseHandle_t handle, cusparseOperation_t opA, + const ValueType *alpha, const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, const ValueType *beta, + const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + void *externalBuffer) +{ + constexpr auto value_type = cuda_data_type(); + cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, value_type, alg, + externalBuffer); +} + + +#endif + + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + #define GKO_BIND_CUSPARSE32_SPMV(ValueType, CusparseName) \ inline void spmv_mp(cusparseHandle_t handle, cusparseOperation_t transA, \ int32 m, int32 n, int32 nnz, const ValueType *alpha, \ @@ -227,6 +265,9 @@ GKO_BIND_CUSPARSE64_SPMM(ValueType, detail::not_implemented); #undef GKO_BIND_CUSPARSE64_SPMM +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + template inline void spmv(cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, IndexType m, IndexType n, @@ -277,7 +318,7 @@ inline void spmv_buffersize(cusparseHandle_t handle, cusparseAlgMode_t alg, const IndexType *csrRowPtrA, const IndexType *csrColIndA, 
const ValueType *x, const ValueType *beta, ValueType *y, - size_t *bufferSizeInBytes) GKO_NOT_IMPLEMENTED; + size_type *bufferSizeInBytes) GKO_NOT_IMPLEMENTED; #define GKO_BIND_CUSPARSE_SPMV_BUFFERSIZE(ValueType) \ template <> \ @@ -287,7 +328,7 @@ inline void spmv_buffersize(cusparseHandle_t handle, cusparseAlgMode_t alg, const ValueType *alpha, const cusparseMatDescr_t descrA, \ const ValueType *csrValA, const int32 *csrRowPtrA, \ const int32 *csrColIndA, const ValueType *x, const ValueType *beta, \ - ValueType *y, size_t *bufferSizeInBytes) \ + ValueType *y, size_type *bufferSizeInBytes) \ { \ auto data_type = gko::kernels::cuda::cuda_data_type(); \ if (data_type == CUDA_C_8U) { \ @@ -311,6 +352,9 @@ GKO_BIND_CUSPARSE_SPMV_BUFFERSIZE(std::complex); #undef GKO_BIND_CUSPARSE_SPMV_BUFFERSIZE +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + #define GKO_BIND_CUSPARSE32_SPMV(ValueType, CusparseName) \ inline void spmv(cusparseHandle_t handle, cusparseOperation_t transA, \ const ValueType *alpha, const cusparseMatDescr_t descrA, \ @@ -336,6 +380,12 @@ GKO_BIND_CUSPARSE32_SPMV(ValueType, detail::not_implemented); #undef GKO_BIND_CUSPARSE32_SPMV +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + template void spgemm_buffer_size( cusparseHandle_t handle, IndexType m, IndexType n, IndexType k, @@ -459,6 +509,78 @@ GKO_BIND_CUSPARSE_SPGEMM(std::complex, cusparseZcsrgemm2); #undef GKO_BIND_CUSPARSE_SPGEMM +#else // CUDA_VERSION >= 11000 + + +template +void spgemm_work_estimation(cusparseHandle_t handle, const ValueType *alpha, + cusparseSpMatDescr_t a_descr, + cusparseSpMatDescr_t b_descr, const ValueType *beta, + cusparseSpMatDescr_t c_descr, + cusparseSpGEMMDescr_t spgemm_descr, + size_type &buffer1_size, void *buffer1) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta, + c_descr, cuda_data_type(), CUSPARSE_SPGEMM_DEFAULT, + spgemm_descr, &buffer1_size, buffer1)); +} + + +template +void spgemm_compute(cusparseHandle_t handle, const ValueType *alpha, + cusparseSpMatDescr_t a_descr, cusparseSpMatDescr_t b_descr, + const ValueType *beta, cusparseSpMatDescr_t c_descr, + cusparseSpGEMMDescr_t spgemm_descr, void *buffer1, + size_type &buffer2_size, void *buffer2) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_compute( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta, + c_descr, cuda_data_type(), CUSPARSE_SPGEMM_DEFAULT, + spgemm_descr, &buffer2_size, buffer2)); +} + + +template +void spgemm_copy(cusparseHandle_t handle, const ValueType *alpha, + cusparseSpMatDescr_t a_descr, cusparseSpMatDescr_t b_descr, + const ValueType *beta, cusparseSpMatDescr_t c_descr, + cusparseSpGEMMDescr_t spgemm_descr) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSpGEMM_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, + b_descr, beta, c_descr, cuda_data_type(), + CUSPARSE_SPGEMM_DEFAULT, spgemm_descr)); +} + + +inline size_type sparse_matrix_nnz(cusparseSpMatDescr_t descr) +{ + int64_t dummy1{}; + int64_t dummy2{}; + int64_t nnz{}; + cusparseSpMatGetSize(descr, &dummy1, &dummy2, &nnz); + return static_cast(nnz); +} + + +template +void csr_set_pointers(cusparseSpMatDescr_t descr, IndexType *row_ptrs, + IndexType *col_idxs, ValueType *vals) +{ + cusparseCsrSetPointers(descr, row_ptrs, col_idxs, vals); +} + + +#endif 
// CUDA_VERSION >= 11000 + + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + #define GKO_BIND_CUSPARSE32_CSR2HYB(ValueType, CusparseName) \ inline void csr2hyb(cusparseHandle_t handle, int32 m, int32 n, \ const cusparseMatDescr_t descrA, \ @@ -504,101 +626,119 @@ GKO_BIND_CUSPARSE64_CSR2HYB(ValueType, detail::not_implemented); #undef GKO_BIND_CUSPARSE64_CSR2HYB +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + +template +inline void transpose(cusparseHandle_t handle, size_type m, size_type n, + size_type nnz, const ValueType *OrigValA, + const IndexType *OrigRowPtrA, + const IndexType *OrigColIndA, ValueType *TransValA, + IndexType *TransRowPtrA, IndexType *TransColIndA, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED; + +// Cusparse csr2csc use the order (row_inx, col_ptr) for csc, so we need to +// switch row_ptr and col_idx of transposed csr here #define GKO_BIND_CUSPARSE_TRANSPOSE32(ValueType, CusparseName) \ - inline void transpose(cusparseHandle_t handle, size_type m, size_type n, \ - size_type nnz, const ValueType *OrigValA, \ - const int32 *OrigRowPtrA, const int32 *OrigColIndA, \ - ValueType *TransValA, int32 *TransRowPtrA, \ - int32 *TransColIndA, cusparseAction_t copyValues, \ - cusparseIndexBase_t idxBase) \ + template <> \ + inline void transpose( \ + cusparseHandle_t handle, size_type m, size_type n, size_type nnz, \ + const ValueType *OrigValA, const int32 *OrigRowPtrA, \ + const int32 *OrigColIndA, ValueType *TransValA, int32 *TransRowPtrA, \ + int32 *TransColIndA, cusparseAction_t copyValues, \ + cusparseIndexBase_t idxBase) \ { \ GKO_ASSERT_NO_CUSPARSE_ERRORS( \ CusparseName(handle, m, n, nnz, as_culibs_type(OrigValA), \ OrigRowPtrA, OrigColIndA, as_culibs_type(TransValA), \ - TransRowPtrA, TransColIndA, copyValues, idxBase)); \ + TransColIndA, TransRowPtrA, copyValues, idxBase)); \ } \ static_assert(true, \ "This assert is used to counter the false positive extra " \ "semi-colon warnings") -#define GKO_BIND_CUSPARSE_TRANSPOSE64(ValueType, CusparseName) \ - inline void transpose(cusparseHandle_t handle, size_type m, size_type n, \ - size_type nnz, const ValueType *OrigValA, \ - const int64 *OrigRowPtrA, const int64 *OrigColIndA, \ - ValueType *TransValA, int64 *TransRowPtrA, \ - int64 *TransColIndA, cusparseAction_t copyValues, \ - cusparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED; \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") - GKO_BIND_CUSPARSE_TRANSPOSE32(float, cusparseScsr2csc); GKO_BIND_CUSPARSE_TRANSPOSE32(double, cusparseDcsr2csc); -GKO_BIND_CUSPARSE_TRANSPOSE64(float, cusparseScsr2csc); -GKO_BIND_CUSPARSE_TRANSPOSE64(double, cusparseDcsr2csc); GKO_BIND_CUSPARSE_TRANSPOSE32(std::complex, cusparseCcsr2csc); GKO_BIND_CUSPARSE_TRANSPOSE32(std::complex, cusparseZcsr2csc); -GKO_BIND_CUSPARSE_TRANSPOSE64(std::complex, cusparseCcsr2csc); -GKO_BIND_CUSPARSE_TRANSPOSE64(std::complex, cusparseZcsr2csc); -template -GKO_BIND_CUSPARSE_TRANSPOSE32(ValueType, detail::not_implemented); -template -GKO_BIND_CUSPARSE_TRANSPOSE64(ValueType, detail::not_implemented); -#undef GKO_BIND_CUSPARSE_TRANSPOSE +#undef GKO_BIND_CUSPARSE_TRANSPOSE32 -#define GKO_BIND_CUSPARSE_CONJ_TRANSPOSE32(ValueType, CusparseName) \ - inline void conj_transpose( \ - cusparseHandle_t handle, size_type m, size_type n, size_type nnz, \ - const ValueType *OrigValA, const int32 *OrigRowPtrA, \ - const int32 *OrigColIndA, ValueType 
*TransValA, int32 *TransRowPtrA, \ - int32 *TransColIndA, cusparseAction_t copyValues, \ - cusparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED; \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") -#define GKO_BIND_CUSPARSE_CONJ_TRANSPOSE64(ValueType, CusparseName) \ - inline void conj_transpose( \ - cusparseHandle_t handle, size_type m, size_type n, size_type nnz, \ - const ValueType *OrigValA, const int64 *OrigRowPtrA, \ - const int64 *OrigColIndA, ValueType *TransValA, int64 *TransRowPtrA, \ - int64 *TransColIndA, cusparseAction_t copyValues, \ - cusparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED; \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") +#else // CUDA_VERSION >= 11000 -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE32(float, cusparseScsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE32(double, cusparseDcsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE64(float, cusparseScsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE64(double, cusparseDcsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE32(std::complex, cusparseCcsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE32(std::complex, cusparseZcsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE64(std::complex, cusparseCcsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE64(std::complex, cusparseZcsr2csc); -template -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE32(ValueType, detail::not_implemented); -template -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE64(ValueType, detail::not_implemented); +template +inline void transpose_buffersize( + cusparseHandle_t handle, size_type m, size_type n, size_type nnz, + const ValueType *OrigValA, const IndexType *OrigRowPtrA, + const IndexType *OrigColIndA, ValueType *TransValA, IndexType *TransRowPtrA, + IndexType *TransColIndA, cudaDataType_t valType, + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, size_type *buffer_size) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_TRANSPOSE_BUFFERSIZE32(ValueType) \ + template <> \ + inline void transpose_buffersize( \ + cusparseHandle_t handle, size_type m, size_type n, size_type nnz, \ + const ValueType *OrigValA, const int32 *OrigRowPtrA, \ + const int32 *OrigColIndA, ValueType *TransValA, int32 *TransRowPtrA, \ + int32 *TransColIndA, cudaDataType_t valType, \ + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, \ + cusparseCsr2CscAlg_t alg, size_type *buffer_size) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCsr2cscEx2_bufferSize( \ + handle, m, n, nnz, OrigValA, OrigRowPtrA, OrigColIndA, TransValA, \ + TransRowPtrA, TransColIndA, valType, copyValues, idxBase, alg, \ + buffer_size)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") -#undef GKO_BIND_CUSPARSE_CONJ_TRANSPOSE +GKO_BIND_CUSPARSE_TRANSPOSE_BUFFERSIZE32(float); +GKO_BIND_CUSPARSE_TRANSPOSE_BUFFERSIZE32(double); +GKO_BIND_CUSPARSE_TRANSPOSE_BUFFERSIZE32(std::complex); +GKO_BIND_CUSPARSE_TRANSPOSE_BUFFERSIZE32(std::complex); +template +inline void transpose(cusparseHandle_t handle, size_type m, size_type n, + size_type nnz, const ValueType *OrigValA, + const IndexType *OrigRowPtrA, + const IndexType *OrigColIndA, ValueType *TransValA, + IndexType *TransRowPtrA, IndexType *TransColIndA, + cudaDataType_t valType, cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg, + void *buffer) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_TRANSPOSE32(ValueType) \ + template <> \ + inline void transpose( \ + cusparseHandle_t 
handle, size_type m, size_type n, size_type nnz, \ + const ValueType *OrigValA, const int32 *OrigRowPtrA, \ + const int32 *OrigColIndA, ValueType *TransValA, int32 *TransRowPtrA, \ + int32 *TransColIndA, cudaDataType_t valType, \ + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, \ + cusparseCsr2CscAlg_t alg, void *buffer) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCsr2cscEx2( \ + handle, m, n, nnz, OrigValA, OrigRowPtrA, OrigColIndA, TransValA, \ + TransRowPtrA, TransColIndA, valType, copyValues, idxBase, alg, \ + buffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") -inline cusparseHandle_t init() -{ - cusparseHandle_t handle{}; - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreate(&handle)); - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE)); - return handle; -} +GKO_BIND_CUSPARSE_TRANSPOSE32(float); +GKO_BIND_CUSPARSE_TRANSPOSE32(double); +GKO_BIND_CUSPARSE_TRANSPOSE32(std::complex); +GKO_BIND_CUSPARSE_TRANSPOSE32(std::complex); -inline void destroy(cusparseHandle_t handle) -{ - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroy(handle)); -} +#endif inline cusparseMatDescr_t create_mat_descr() @@ -615,6 +755,9 @@ inline void destroy(cusparseMatDescr_t descr) } +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + inline csrgemm2Info_t create_spgemm_info() { csrgemm2Info_t info{}; @@ -629,6 +772,84 @@ inline void destroy(csrgemm2Info_t info) } +#else // CUDA_VERSION >= 11000 + + +inline cusparseSpGEMMDescr_t create_spgemm_descr() +{ + cusparseSpGEMMDescr_t descr{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_createDescr(&descr)); + return descr; +} + + +inline void destroy(cusparseSpGEMMDescr_t info) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_destroyDescr(info)); +} + + +template +inline cusparseDnVecDescr_t create_dnvec(int64_t size, ValueType *values) +{ + cusparseDnVecDescr_t descr{}; + constexpr auto value_type = cuda_data_type(); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseCreateDnVec(&descr, size, values, value_type)); + return descr; +} + + +inline void destroy(cusparseDnVecDescr_t descr) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(descr)); +} + + +template +inline cusparseSpVecDescr_t create_spvec(int64_t size, int64_t nnz, + IndexType *indices, ValueType *values) +{ + cusparseSpVecDescr_t descr{}; + constexpr auto index_type = cusparse_index_type(); + constexpr auto value_type = cuda_data_type(); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseCreateSpVec(&descr, size, nnz, indices, values, index_type, + CUSPARSE_INDEX_BASE_ZERO, value_type)); + return descr; +} + + +inline void destroy(cusparseSpVecDescr_t descr) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpVec(descr)); +} + + +template +inline cusparseSpMatDescr_t create_csr(int64_t rows, int64_t cols, int64_t nnz, + IndexType *csrRowOffsets, + IndexType *csrColInd, + ValueType *csrValues) +{ + cusparseSpMatDescr_t descr{}; + constexpr auto index_type = cusparse_index_type(); + constexpr auto value_type = cuda_data_type(); + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsr( + &descr, rows, cols, nnz, csrRowOffsets, csrColInd, csrValues, + index_type, index_type, CUSPARSE_INDEX_BASE_ZERO, value_type)); + return descr; +} + + +inline void destroy(cusparseSpMatDescr_t descr) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(descr)); +} + +#endif + + // CUDA versions 9.2 and above have csrsm2. 
#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020)) @@ -694,7 +915,7 @@ inline void destroy(csrilu02Info_t info) const ValueType *csrVal, const int32 *csrRowPtr, \ const int32 *csrColInd, const ValueType *rhs, int32 sol_size, \ csrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ - size_t *factor_work_size) \ + size_type *factor_work_size) \ { \ GKO_ASSERT_NO_CUSPARSE_ERRORS( \ CusparseName(handle, algo, trans1, trans2, m, n, nnz, \ @@ -714,7 +935,7 @@ inline void destroy(csrilu02Info_t info) const ValueType *csrVal, const int64 *csrRowPtr, \ const int64 *csrColInd, const ValueType *rhs, int64 sol_size, \ csrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ - size_t *factor_work_size) GKO_NOT_IMPLEMENTED; \ + size_type *factor_work_size) GKO_NOT_IMPLEMENTED; \ static_assert(true, \ "This assert is used to counter the false positive extra " \ "semi-colon warnings") @@ -989,6 +1210,9 @@ inline void csrsort(cusparseHandle_t handle, int32 m, int32 n, int32 nnz, } +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + template void gather(cusparseHandle_t handle, IndexType nnz, const ValueType *in, ValueType *out, const IndexType *permutation) GKO_NOT_IMPLEMENTED; @@ -1015,6 +1239,19 @@ GKO_BIND_CUSPARSE_GATHER(std::complex, cusparseZgthr); #undef GKO_BIND_CUSPARSE_GATHER +#else // CUDA_VERSION >= 11000 + + +inline void gather(cusparseHandle_t handle, cusparseDnVecDescr_t in, + cusparseSpVecDescr_t out) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseGather(handle, in, out)); +} + + +#endif + + template void ilu0_buffer_size(cusparseHandle_t handle, IndexType m, IndexType nnz, const cusparseMatDescr_t descr, const ValueType *vals, diff --git a/cuda/base/cusparse_handle.hpp b/cuda/base/cusparse_handle.hpp new file mode 100644 index 00000000000..0f2733c0ed4 --- /dev/null +++ b/cuda/base/cusparse_handle.hpp @@ -0,0 +1,77 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CUDA_BASE_CUSPARSE_HANDLE_HPP_ +#define GKO_CUDA_BASE_CUSPARSE_HANDLE_HPP_ + + +#include +#include + + +#include + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The CUSPARSE namespace. + * + * @ingroup cusparse + */ +namespace cusparse { + + +inline cusparseHandle_t init() +{ + cusparseHandle_t handle{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreate(&handle)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE)); + return handle; +} + + +inline void destroy(cusparseHandle_t handle) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroy(handle)); +} + + +} // namespace cusparse +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_BASE_CUSPARSE_HANDLE_HPP_ diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index a72ecef7591..98d24c2e00f 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -45,7 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/cusparse_bindings.hpp" +#include "cuda/base/cusparse_handle.hpp" #include "cuda/base/device_guard.hpp" @@ -93,7 +93,7 @@ void CudaExecutor::raw_free(void *ptr) const noexcept << " in " << __func__ << ": " << cudaGetErrorName(error_code) << ": " << cudaGetErrorString(error_code) << std::endl << "Exiting program" << std::endl; -#endif +#endif // GKO_VERBOSE_LEVEL >= 1 std::exit(error_code); } } diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 36190658d57..580976b265f 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -198,8 +198,9 @@ constexpr cudaDataType_t cuda_data_type_impl() } -#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ - !(defined(_WIN32) || defined(__CYGWIN__)) +#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) template @@ -221,8 +222,8 @@ constexpr cusparseIndexType_t cusparse_index_type_impl() } -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && - // !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) } // namespace detail @@ -243,8 +244,9 @@ constexpr cudaDataType_t cuda_data_type() } -#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ - !(defined(_WIN32) || defined(__CYGWIN__)) +#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) /** @@ -262,8 +264,8 @@ constexpr cusparseIndexType_t cusparse_index_type() } -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && - // !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) /** diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh index 25af6135ced..d48815c0c6f 100644 --- a/cuda/components/cooperative_groups.cuh +++ b/cuda/components/cooperative_groups.cuh @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include "cuda/base/config.hpp" @@ -242,7 +243,7 @@ private: (threadIdx.y + blockDim.y * (threadIdx.z + blockDim.z * (blockIdx.x + gridDim.x * - (blockIdx.y + gridDim.y * blockIdx.z))))} + (blockIdx.y + gridDim.y * blockIdx.z))))} {} // clang-format on @@ -312,6 +313,9 @@ struct is_synchronizable_group_impl : std::true_type {}; namespace detail { +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + // Adds generalized shuffles that support any type to the group. template class enable_extended_shuffle : public Group { @@ -363,17 +367,33 @@ private: }; +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + } // namespace detail +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + // Implementing this as a using directive messes up with SFINAE for some reason, // probably a bug in NVCC. If it is a complete type, everything works fine. -template +template struct thread_block_tile : detail::enable_extended_shuffle< cooperative_groups::thread_block_tile> { using detail::enable_extended_shuffle< cooperative_groups::thread_block_tile>::enable_extended_shuffle; }; + + +#else // CUDA_VERSION >= 11000 + + +// Cuda11 cooperative group's shuffle supports complex +using cooperative_groups::thread_block_tile; + + +#endif // inherits thread_group // // public API: @@ -393,20 +413,44 @@ struct thread_block_tile : detail::enable_extended_shuffle< // unsigned match_all(T) const // TODO: implement for all types namespace detail { -template + + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + +template struct is_group_impl> : std::true_type {}; -template +template struct is_synchronizable_group_impl> : std::true_type { }; -template +template struct is_communicator_group_impl> : std::true_type {}; // make sure the original CUDA group is recognized whenever possible -template +template struct is_group_impl> : std::true_type {}; -template +template struct is_synchronizable_group_impl> : std::true_type {}; + + +#else // CUDA_VERSION >= 11000 + + +// thread_block_tile is same as cuda11's +template +struct is_group_impl> : std::true_type {}; +template +struct is_synchronizable_group_impl> + : std::true_type {}; +template +struct is_communicator_group_impl> + : std::true_type {}; + + +#endif + + } // namespace detail @@ -457,6 +501,10 @@ __device__ __forceinline__ auto tiled_partition(const Group &g) // Only support tile_partition with 1, 2, 4, 8, 16, 32. // Reference: // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-notes +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + +// cooperative group before cuda11 does not contain parent group in template template __device__ __forceinline__ std::enable_if_t<(Size <= kernels::cuda::config::warp_size) && (Size > 0) && @@ -468,6 +516,23 @@ __device__ __forceinline__ } +#else // CUDA_VERSION >= 11000 + + +// cooperative group after cuda11 contain parent group in template. +// we remove the information because we do not restrict cooperative group by its +// parent group type. 
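+// (the ParentT-tagged tile returned by CUDA 11's tiled_partition converts to
+// the parent-agnostic thread_block_tile<Size>)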
+template +__device__ __forceinline__ thread_block_tile tiled_partition( + const Group &g) +{ + return cooperative_groups::tiled_partition(g); +} + + +#endif + + } // namespace group } // namespace cuda } // namespace kernels diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh index d7235414f37..17b20438524 100644 --- a/cuda/components/format_conversion.cuh +++ b/cuda/components/format_conversion.cuh @@ -100,7 +100,11 @@ __host__ size_type calculate_nwarps(std::shared_ptr exec, exec->get_num_warps_per_sm() * config::warp_size / subwarp_size; size_type nwarps_in_cuda = exec->get_num_multiprocessor() * warps_per_sm; size_type multiple = 8; - if (nnz >= 2e6) { + if (nnz >= 2e8) { + multiple = 2048; + } else if (nnz >= 2e7) { + multiple = 512; + } else if (nnz >= 2e6) { multiple = 128; } else if (nnz >= 2e5) { multiple = 32; diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 8c189b99673..f618bcd9e8f 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -307,23 +307,56 @@ void spmv(std::shared_ptr exec, if (cusparse::is_supported::value) { // TODO: add implementation for int64 and multiple RHS auto handle = exec->get_cusparse_handle(); - auto descr = cusparse::create_mat_descr(); { cusparse::pointer_mode_guard pm_guard(handle); + const auto alpha = one(); + const auto beta = zero(); + // TODO: add implementation for int64 and multiple RHS + if (b->get_stride() != 1 || c->get_stride() != 1) + GKO_NOT_IMPLEMENTED; + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + auto descr = cusparse::create_mat_descr(); auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto alpha = one(); - auto beta = zero(); - if (b->get_stride() != 1 || c->get_stride() != 1) { - GKO_NOT_IMPLEMENTED; - } cusparse::spmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0], a->get_size()[1], a->get_num_stored_elements(), &alpha, descr, a->get_const_values(), row_ptrs, col_idxs, b->get_const_values(), &beta, c->get_values()); + + cusparse::destroy(descr); +#else // CUDA_VERSION >= 11000 + cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseSpMVAlg_t alg = CUSPARSE_CSRMV_ALG1; + auto row_ptrs = + const_cast(a->get_const_row_ptrs()); + auto col_idxs = + const_cast(a->get_const_col_idxs()); + auto values = const_cast(a->get_const_values()); + auto mat = cusparse::create_csr( + a->get_size()[0], a->get_size()[1], + a->get_num_stored_elements(), row_ptrs, col_idxs, values); + auto b_val = const_cast(b->get_const_values()); + auto c_val = c->get_values(); + auto vecb = + cusparse::create_dnvec(b->get_num_stored_elements(), b_val); + auto vecc = + cusparse::create_dnvec(c->get_num_stored_elements(), c_val); + size_type buffer_size = 0; + cusparse::spmv_buffersize(handle, trans, &alpha, mat, + vecb, &beta, vecc, alg, + &buffer_size); + + gko::Array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + cusparse::spmv(handle, trans, &alpha, mat, vecb, + &beta, vecc, alg, buffer); + cusparse::destroy(vecb); + cusparse::destroy(vecc); + cusparse::destroy(mat); +#endif } - cusparse::destroy(descr); } else { GKO_NOT_IMPLEMENTED; } @@ -368,14 +401,13 @@ void advanced_spmv(std::shared_ptr exec, a->get_strategy()->get_name() == "cusparse") { if (cusparse::is_supported::value) { // TODO: add implementation for int64 and multiple RHS - auto descr = cusparse::create_mat_descr(); - - auto row_ptrs = a->get_const_row_ptrs(); - auto col_idxs = a->get_const_col_idxs(); - if 
(b->get_stride() != 1 || c->get_stride() != 1) GKO_NOT_IMPLEMENTED; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + auto descr = cusparse::create_mat_descr(); + auto row_ptrs = a->get_const_row_ptrs(); + auto col_idxs = a->get_const_col_idxs(); cusparse::spmv(exec->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0], a->get_size()[1], a->get_num_stored_elements(), @@ -385,6 +417,34 @@ void advanced_spmv(std::shared_ptr exec, c->get_values()); cusparse::destroy(descr); +#else // CUDA_VERSION >= 11000 + cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseSpMVAlg_t alg = CUSPARSE_CSRMV_ALG1; + auto row_ptrs = const_cast(a->get_const_row_ptrs()); + auto col_idxs = const_cast(a->get_const_col_idxs()); + auto values = const_cast(a->get_const_values()); + auto mat = cusparse::create_csr(a->get_size()[0], a->get_size()[1], + a->get_num_stored_elements(), + row_ptrs, col_idxs, values); + auto b_val = const_cast(b->get_const_values()); + auto c_val = c->get_values(); + auto vecb = + cusparse::create_dnvec(b->get_num_stored_elements(), b_val); + auto vecc = + cusparse::create_dnvec(c->get_num_stored_elements(), c_val); + size_type buffer_size = 0; + cusparse::spmv_buffersize( + exec->get_cusparse_handle(), trans, alpha->get_const_values(), + mat, vecb, beta->get_const_values(), vecc, alg, &buffer_size); + gko::Array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + cusparse::spmv( + exec->get_cusparse_handle(), trans, alpha->get_const_values(), + mat, vecb, beta->get_const_values(), vecc, alg, buffer); + cusparse::destroy(vecb); + cusparse::destroy(vecc); + cusparse::destroy(mat); +#endif } else { GKO_NOT_IMPLEMENTED; } @@ -433,35 +493,38 @@ void spgemm(std::shared_ptr exec, const matrix::Csr *b, matrix::Csr *c) { + auto a_nnz = IndexType(a->get_num_stored_elements()); + auto a_vals = a->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto b_vals = b->get_const_values(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + auto c_row_ptrs = c->get_row_ptrs(); + if (cusparse::is_supported::value) { auto handle = exec->get_cusparse_handle(); cusparse::pointer_mode_guard pm_guard(handle); - auto a_descr = cusparse::create_mat_descr(); - auto b_descr = cusparse::create_mat_descr(); - auto c_descr = cusparse::create_mat_descr(); - auto d_descr = cusparse::create_mat_descr(); - auto info = cusparse::create_spgemm_info(); auto alpha = one(); - auto a_nnz = IndexType(a->get_num_stored_elements()); - auto a_vals = a->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto b_nnz = IndexType(b->get_num_stored_elements()); - auto b_vals = b->get_const_values(); - auto b_row_ptrs = b->get_const_row_ptrs(); - auto b_col_idxs = b->get_const_col_idxs(); + auto a_nnz = static_cast(a->get_num_stored_elements()); + auto b_nnz = static_cast(b->get_num_stored_elements()); auto null_value = static_cast(nullptr); auto null_index = static_cast(nullptr); auto zero_nnz = IndexType{}; auto m = IndexType(a->get_size()[0]); auto n = IndexType(b->get_size()[1]); auto k = IndexType(a->get_size()[1]); - auto c_row_ptrs = c->get_row_ptrs(); matrix::CsrBuilder c_builder{c}; auto &c_col_idxs_array = c_builder.get_col_idx_array(); auto &c_vals_array = c_builder.get_value_array(); +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + auto a_descr = cusparse::create_mat_descr(); + auto b_descr = 
cusparse::create_mat_descr(); + auto c_descr = cusparse::create_mat_descr(); + auto d_descr = cusparse::create_mat_descr(); + auto info = cusparse::create_spgemm_info(); // allocate buffer size_type buffer_size{}; cusparse::spgemm_buffer_size( @@ -494,6 +557,57 @@ void spgemm(std::shared_ptr exec, cusparse::destroy(c_descr); cusparse::destroy(b_descr); cusparse::destroy(a_descr); + +#else // CUDA_VERSION >= 11000 + const auto beta = zero(); + auto spgemm_descr = cusparse::create_spgemm_descr(); + auto a_descr = cusparse::create_csr(m, k, a_nnz, + const_cast(a_row_ptrs), + const_cast(a_col_idxs), + const_cast(a_vals)); + auto b_descr = cusparse::create_csr(k, n, b_nnz, + const_cast(b_row_ptrs), + const_cast(b_col_idxs), + const_cast(b_vals)); + auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index, + null_index, null_value); + + // estimate work + size_type buffer1_size{}; + cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, + &beta, c_descr, spgemm_descr, + buffer1_size, nullptr); + Array buffer1{exec, buffer1_size}; + cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, + &beta, c_descr, spgemm_descr, + buffer1_size, buffer1.get_data()); + + // compute spgemm + size_type buffer2_size{}; + cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, nullptr); + Array buffer2{exec, buffer2_size}; + cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, buffer2.get_data()); + + // copy data to result + auto c_nnz = cusparse::sparse_matrix_nnz(c_descr); + c_col_idxs_array.resize_and_reset(c_nnz); + c_vals_array.resize_and_reset(c_nnz); + cusparse::csr_set_pointers(c_descr, c_row_ptrs, + c_col_idxs_array.get_data(), + c_vals_array.get_data()); + + cusparse::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr, + spgemm_descr); + + cusparse::destroy(c_descr); + cusparse::destroy(b_descr); + cusparse::destroy(a_descr); + cusparse::destroy(spgemm_descr); +#endif // CUDA_VERSION >= 11000 } else { GKO_NOT_IMPLEMENTED; } @@ -502,6 +616,47 @@ void spgemm(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); +namespace { + + +template +void spgeam(syn::value_list, + std::shared_ptr exec, const ValueType *alpha, + const IndexType *a_row_ptrs, const IndexType *a_col_idxs, + const ValueType *a_vals, const ValueType *beta, + const IndexType *b_row_ptrs, const IndexType *b_col_idxs, + const ValueType *b_vals, matrix::Csr *c) +{ + auto m = static_cast(c->get_size()[0]); + auto c_row_ptrs = c->get_row_ptrs(); + // count nnz for alpha * A + beta * B + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(m, subwarps_per_block); + kernel::spgeam_nnz<<>>( + a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs); + + // build row pointers + components::prefix_sum(exec, c_row_ptrs, m + 1); + + // accumulate non-zeros for alpha * A + beta * B + matrix::CsrBuilder c_builder{c}; + auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m); + c_builder.get_col_idx_array().resize_and_reset(c_nnz); + c_builder.get_value_array().resize_and_reset(c_nnz); + auto c_col_idxs = c->get_col_idxs(); + auto c_vals = c->get_values(); + kernel::spgeam<<>>( + as_cuda_type(alpha), a_row_ptrs, a_col_idxs, as_cuda_type(a_vals), + as_cuda_type(beta), b_row_ptrs, b_col_idxs, as_cuda_type(b_vals), m, + c_row_ptrs, c_col_idxs, as_cuda_type(c_vals)); +} + 
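+// generates the select_spgeam() dispatcher, which invokes the spgeam
+// instantiation for the first compiled subwarp size accepted by the predicate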
@@ -502,6 +616,47 @@ void spgemm(std::shared_ptr<const CudaExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
 
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void spgeam(syn::value_list<int, subwarp_size>,
+            std::shared_ptr<const CudaExecutor> exec, const ValueType *alpha,
+            const IndexType *a_row_ptrs, const IndexType *a_col_idxs,
+            const ValueType *a_vals, const ValueType *beta,
+            const IndexType *b_row_ptrs, const IndexType *b_col_idxs,
+            const ValueType *b_vals, matrix::Csr<ValueType, IndexType> *c)
+{
+    auto m = static_cast<IndexType>(c->get_size()[0]);
+    auto c_row_ptrs = c->get_row_ptrs();
+    // count nnz for alpha * A + beta * B
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(m, subwarps_per_block);
+    kernel::spgeam_nnz<subwarp_size><<<num_blocks, default_block_size>>>(
+        a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs);
+
+    // build row pointers
+    components::prefix_sum(exec, c_row_ptrs, m + 1);
+
+    // accumulate non-zeros for alpha * A + beta * B
+    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+    auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m);
+    c_builder.get_col_idx_array().resize_and_reset(c_nnz);
+    c_builder.get_value_array().resize_and_reset(c_nnz);
+    auto c_col_idxs = c->get_col_idxs();
+    auto c_vals = c->get_values();
+    kernel::spgeam<subwarp_size><<<num_blocks, default_block_size>>>(
+        as_cuda_type(alpha), a_row_ptrs, a_col_idxs, as_cuda_type(a_vals),
+        as_cuda_type(beta), b_row_ptrs, b_col_idxs, as_cuda_type(b_vals), m,
+        c_row_ptrs, c_col_idxs, as_cuda_type(c_vals));
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam);
+
+
+}  // namespace
+
+
 template <typename ValueType, typename IndexType>
 void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
                      const matrix::Dense<ValueType> *alpha,
@@ -514,11 +669,6 @@ void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
     if (cusparse::is_supported<ValueType, IndexType>::value) {
         auto handle = exec->get_cusparse_handle();
         cusparse::pointer_mode_guard pm_guard(handle);
-        auto a_descr = cusparse::create_mat_descr();
-        auto b_descr = cusparse::create_mat_descr();
-        auto c_descr = cusparse::create_mat_descr();
-        auto d_descr = cusparse::create_mat_descr();
-        auto info = cusparse::create_spgemm_info();
 
         auto valpha = exec->copy_val_to_host(alpha->get_const_values());
         auto a_nnz = IndexType(a->get_num_stored_elements());
@@ -538,10 +688,16 @@ void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
         auto n = IndexType(b->get_size()[1]);
         auto k = IndexType(a->get_size()[1]);
         auto c_row_ptrs = c->get_row_ptrs();
+
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
         auto &c_col_idxs_array = c_builder.get_col_idx_array();
         auto &c_vals_array = c_builder.get_value_array();
-
+        auto a_descr = cusparse::create_mat_descr();
+        auto b_descr = cusparse::create_mat_descr();
+        auto c_descr = cusparse::create_mat_descr();
+        auto d_descr = cusparse::create_mat_descr();
+        auto info = cusparse::create_spgemm_info();
         // allocate buffer
         size_type buffer_size{};
         cusparse::spgemm_buffer_size(
@@ -574,6 +730,75 @@ void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
         cusparse::destroy(c_descr);
         cusparse::destroy(b_descr);
         cusparse::destroy(a_descr);
+#else  // CUDA_VERSION >= 11000
+        auto null_value = static_cast<ValueType *>(nullptr);
+        auto null_index = static_cast<IndexType *>(nullptr);
+        auto one_val = one<ValueType>();
+        auto zero_val = zero<ValueType>();
+        auto zero_nnz = IndexType{};
+        auto spgemm_descr = cusparse::create_spgemm_descr();
+        auto a_descr = cusparse::create_csr(
+            m, k, a_nnz, const_cast<IndexType *>(a_row_ptrs),
+            const_cast<IndexType *>(a_col_idxs),
+            const_cast<ValueType *>(a_vals));
+        auto b_descr = cusparse::create_csr(
+            k, n, b_nnz, const_cast<IndexType *>(b_row_ptrs),
+            const_cast<IndexType *>(b_col_idxs),
+            const_cast<ValueType *>(b_vals));
+        auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index,
+                                            null_index, null_value);
+
+        // estimate work
+        size_type buffer1_size{};
+        cusparse::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
+                                         &zero_val, c_descr, spgemm_descr,
+                                         buffer1_size, nullptr);
+        Array<char> buffer1{exec, buffer1_size};
+        cusparse::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
+                                         &zero_val, c_descr, spgemm_descr,
+                                         buffer1_size, buffer1.get_data());
+
+        // compute spgemm
+        size_type buffer2_size{};
+        cusparse::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                                 c_descr, spgemm_descr, buffer1.get_data(),
+                                 buffer2_size, nullptr);
+        Array<char> buffer2{exec, buffer2_size};
+        cusparse::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                                 c_descr, spgemm_descr, buffer1.get_data(),
+                                 buffer2_size, buffer2.get_data());
+
+        // write result to temporary storage
+        auto c_tmp_nnz = cusparse::sparse_matrix_nnz(c_descr);
+        Array<IndexType> c_tmp_row_ptrs_array(exec, m + 1);
+        Array<IndexType> c_tmp_col_idxs_array(exec, c_tmp_nnz);
+        Array<ValueType> c_tmp_vals_array(exec, c_tmp_nnz);
+        cusparse::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(),
+                                   c_tmp_col_idxs_array.get_data(),
+                                   c_tmp_vals_array.get_data());
+
+        cusparse::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val,
+                              c_descr, spgemm_descr);
+
+        cusparse::destroy(c_descr);
+        cusparse::destroy(b_descr);
+        cusparse::destroy(a_descr);
+        cusparse::destroy(spgemm_descr);
+
+        auto spgeam_total_nnz = c_tmp_nnz + d->get_num_stored_elements();
+        auto nnz_per_row = spgeam_total_nnz / m;
+        select_spgeam(
+            spgeam_kernels(),
+            [&](int compiled_subwarp_size) {
+                return compiled_subwarp_size >= nnz_per_row ||
+                       compiled_subwarp_size == config::warp_size;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec,
+            alpha->get_const_values(), c_tmp_row_ptrs_array.get_const_data(),
+            c_tmp_col_idxs_array.get_const_data(),
+            c_tmp_vals_array.get_const_data(), beta->get_const_values(),
+            d_row_ptrs, d_col_idxs, d_vals, c);
+#endif  // CUDA_VERSION >= 11000
     } else {
         GKO_NOT_IMPLEMENTED;
     }
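Note: cusparseSpGEMM only evaluates C = alpha * A * B; unlike the old csrgemm2 it cannot fold a third matrix D into the product. The CUDA 11 branch of advanced_spgemm above therefore writes A * B into temporary row-pointer/column-index/value arrays and then computes alpha * (A * B) + beta * D with Ginkgo's own spgeam kernel, picking a subwarp size from the average number of nonzeros per row. An illustrative mirror of that selection predicate follows; the compiled size list {1, 2, 4, 8, 16, 32} and the helper name pick_subwarp_size are assumptions of this sketch.

    #include <array>

    // returns the first compiled subwarp size that covers the average row,
    // falling back to a full warp for very dense rows
    int pick_subwarp_size(long long avg_nnz_per_row, int warp_size = 32)
    {
        const std::array<int, 6> compiled{{1, 2, 4, 8, 16, 32}};
        for (auto size : compiled) {
            if (size >= avg_nnz_per_row || size == warp_size) {
                return size;
            }
        }
        return warp_size;
    }

select_spgeam performs the same first-match scan at compile time: the lambda accepts a candidate size, and the matching spgeam instantiation is launched.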
@@ -583,47 +808,6 @@ void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 
 
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void spgeam(syn::value_list<int, subwarp_size>,
-            std::shared_ptr<const CudaExecutor> exec, const ValueType *alpha,
-            const IndexType *a_row_ptrs, const IndexType *a_col_idxs,
-            const ValueType *a_vals, const ValueType *beta,
-            const IndexType *b_row_ptrs, const IndexType *b_col_idxs,
-            const ValueType *b_vals, matrix::Csr<ValueType, IndexType> *c)
-{
-    auto m = static_cast<IndexType>(c->get_size()[0]);
-    auto c_row_ptrs = c->get_row_ptrs();
-    // count nnz for alpha * A + beta * B
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(m, subwarps_per_block);
-    kernel::spgeam_nnz<subwarp_size><<<num_blocks, default_block_size>>>(
-        a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs);
-
-    // build row pointers
-    components::prefix_sum(exec, c_row_ptrs, m + 1);
-
-    // accumulate non-zeros for alpha * A + beta * B
-    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
-    auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m);
-    c_builder.get_col_idx_array().resize_and_reset(c_nnz);
-    c_builder.get_value_array().resize_and_reset(c_nnz);
-    auto c_col_idxs = c->get_col_idxs();
-    auto c_vals = c->get_values();
-    kernel::spgeam<subwarp_size><<<num_blocks, default_block_size>>>(
-        as_cuda_type(alpha), a_row_ptrs, a_col_idxs, as_cuda_type(a_vals),
-        as_cuda_type(beta), b_row_ptrs, b_col_idxs, as_cuda_type(b_vals), m,
-        c_row_ptrs, c_col_idxs, as_cuda_type(c_vals));
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam);
-
-
-}  // namespace
-
-
 template <typename ValueType, typename IndexType>
 void spgeam(std::shared_ptr<const CudaExecutor> exec,
             const matrix::Dense<ValueType> *alpha,
@@ -860,6 +1044,7 @@ void transpose(std::shared_ptr<const CudaExecutor> exec,
               matrix::Csr<ValueType, IndexType> *trans)
 {
     if (cusparse::is_supported<ValueType, IndexType>::value) {
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
         cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
 
@@ -868,7 +1053,31 @@ void transpose(std::shared_ptr<const CudaExecutor> exec,
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
-            trans->get_col_idxs(), trans->get_row_ptrs(), copyValues, idxBase);
+            trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
+#else  // CUDA_VERSION >= 11000
+        cudaDataType_t cu_value =
+            gko::kernels::cuda::cuda_data_type<ValueType>();
+        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
+        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
+        cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1;
+        size_type buffer_size = 0;
+        cusparse::transpose_buffersize(
+            exec->get_cusparse_handle(), orig->get_size()[0],
+            orig->get_size()[1], orig->get_num_stored_elements(),
+            orig->get_const_values(), orig->get_const_row_ptrs(),
+            orig->get_const_col_idxs(), trans->get_values(),
+            trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
+            idxBase, alg, &buffer_size);
+        Array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+        cusparse::transpose(
+            exec->get_cusparse_handle(), orig->get_size()[0],
+            orig->get_size()[1], orig->get_num_stored_elements(),
+            orig->get_const_values(), orig->get_const_row_ptrs(),
+            orig->get_const_col_idxs(), trans->get_values(),
+            trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
+            idxBase, alg, buffer);
+#endif
     } else {
         GKO_NOT_IMPLEMENTED;
     }
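Note: two things change in the transpose hunk above (and again in conj_transpose below). First, the call now passes trans->get_row_ptrs() before trans->get_col_idxs(), the order the bindings expect; the new is_sorted_by_column_index() assertions in the test files guard this fix. Second, CUDA 11 drops cusparse{S,D,C,Z}csr2csc in favor of cusparseCsr2cscEx2, which requires an explicit workspace query. A sketch of the raw Ex2 pair, assuming double precision, 32-bit indices and the illustrative helper name csr_transpose, with error checks omitted; since the CSC form of A is the CSR form of A^T, cscColPtr receives the transposed row pointers and cscRowInd the transposed column indices.

    #include <cuda_runtime.h>
    #include <cusparse.h>

    void csr_transpose(cusparseHandle_t handle, int m, int n, int nnz,
                       const double* a_vals, const int* a_row_ptrs,
                       const int* a_col_idxs, double* t_vals,
                       int* t_row_ptrs, int* t_col_idxs)
    {
        size_t buffer_size{};
        cusparseCsr2cscEx2_bufferSize(
            handle, m, n, nnz, a_vals, a_row_ptrs, a_col_idxs, t_vals,
            t_row_ptrs, t_col_idxs, CUDA_R_64F, CUSPARSE_ACTION_NUMERIC,
            CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &buffer_size);
        void* buffer{};
        cudaMalloc(&buffer, buffer_size);
        cusparseCsr2cscEx2(handle, m, n, nnz, a_vals, a_row_ptrs, a_col_idxs,
                           t_vals, t_row_ptrs, t_col_idxs, CUDA_R_64F,
                           CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
                           CUSPARSE_CSR2CSC_ALG1, buffer);
        cudaFree(buffer);
    }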
@@ -887,6 +1096,7 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
     const dim3 grid_size(
         ceildiv(trans->get_num_stored_elements(), block_size.x), 1, 1);
 
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
     cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
     cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
 
@@ -895,7 +1105,31 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
         orig->get_size()[1], orig->get_num_stored_elements(),
         orig->get_const_values(), orig->get_const_row_ptrs(),
         orig->get_const_col_idxs(), trans->get_values(),
-        trans->get_col_idxs(), trans->get_row_ptrs(), copyValues, idxBase);
+        trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
+#else  // CUDA_VERSION >= 11000
+    cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();
+    cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
+    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
+    cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1;
+    size_type buffer_size = 0;
+    cusparse::transpose_buffersize(
+        exec->get_cusparse_handle(), orig->get_size()[0],
+        orig->get_size()[1], orig->get_num_stored_elements(),
+        orig->get_const_values(), orig->get_const_row_ptrs(),
+        orig->get_const_col_idxs(), trans->get_values(),
+        trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
+        idxBase, alg, &buffer_size);
+    Array<char> buffer_array(exec, buffer_size);
+    auto buffer = buffer_array.get_data();
+    cusparse::transpose(
+        exec->get_cusparse_handle(), orig->get_size()[0],
+        orig->get_size()[1], orig->get_num_stored_elements(),
+        orig->get_const_values(), orig->get_const_row_ptrs(),
+        orig->get_const_col_idxs(), trans->get_values(),
+        trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
+        idxBase, alg, buffer);
+#endif
 
     conjugate_kernel<<<grid_size, block_size, 0, 0>>>(
         trans->get_num_stored_elements(),
@@ -1081,7 +1315,14 @@ void sort_by_column_index(std::shared_ptr<const CudaExecutor> exec,
                           permutation, buffer);
 
         // sort values
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         cusparse::gather(handle, nnz, tmp_vals, vals, permutation);
+#else  // CUDA_VERSION >= 11000
+        auto val_vec = cusparse::create_spvec(nnz, nnz, permutation, vals);
+        auto tmp_vec =
+            cusparse::create_dnvec(nnz, const_cast<ValueType *>(tmp_vals));
+        cusparse::gather(handle, tmp_vec, val_vec);
+#endif
 
         cusparse::destroy(descr);
     } else {
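Note: cusparseXgthr is likewise gone in CUDA 11; the generic replacement models the gather as a sparse vector (indices = permutation, values = the sort destination) reading from a dense vector, i.e. vals[i] = tmp_vals[permutation[i]], the same semantics as the legacy call in the #if branch. A sketch against the raw API, assuming double precision and 32-bit indices; the helper name gather_permuted_values is illustrative and error checks are omitted.

    #include <cstdint>
    #include <cusparse.h>

    void gather_permuted_values(cusparseHandle_t handle, int64_t nnz,
                                const double* tmp_vals,  // dense source
                                double* vals,            // gathered output
                                const int* permutation)  // device indices
    {
        cusparseSpVecDescr_t x;  // sparse destination: indices + values
        cusparseDnVecDescr_t y;  // dense source
        cusparseCreateSpVec(&x, nnz, nnz, const_cast<int*>(permutation), vals,
                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_64F);
        cusparseCreateDnVec(&y, nnz, const_cast<double*>(tmp_vals),
                            CUDA_R_64F);
        // performs x.values[i] = y[x.indices[i]] for all i
        cusparseGather(handle, y, x);
        cusparseDestroyDnVec(y);
        cusparseDestroySpVec(x);
    }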
diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp
index 39608505cbd..898bd4ca106 100644
--- a/cuda/test/matrix/csr_kernels.cpp
+++ b/cuda/test/matrix/csr_kernels.cpp
@@ -416,6 +416,7 @@ TEST_F(Csr, TransposeIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(d_trans.get()),
                         static_cast<Mtx *>(trans.get()), 0.0);
+    ASSERT_TRUE(static_cast<Mtx *>(d_trans.get())->is_sorted_by_column_index());
 }
 
 
@@ -428,6 +429,8 @@ TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(d_trans.get()),
                         static_cast<Mtx *>(trans.get()), 0.0);
+    ASSERT_TRUE(
+        static_cast<Mtx *>(d_trans.get())->is_sorted_by_column_index());
 }
 
 
diff --git a/cuda/test/solver/lower_trs_kernels.cpp b/cuda/test/solver/lower_trs_kernels.cpp
index a2cac176e8c..b677b9eb10d 100644
--- a/cuda/test/solver/lower_trs_kernels.cpp
+++ b/cuda/test/solver/lower_trs_kernels.cpp
@@ -131,7 +131,7 @@ TEST_F(LowerTrs, CudaLowerTrsFlagCheckIsCorrect)
     bool expected_flag = false;
 
 #if (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
     expected_flag = true;
-#endif
+#endif  // (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
 
     gko::kernels::cuda::lower_trs::should_perform_transpose(cuda, trans_flag);
     ASSERT_EQ(expected_flag, trans_flag);
diff --git a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cpp
index 92a76b1e47b..9da9d33930c 100644
--- a/cuda/test/solver/upper_trs_kernels.cpp
+++ b/cuda/test/solver/upper_trs_kernels.cpp
@@ -132,7 +132,7 @@ TEST_F(UpperTrs, CudaUpperTrsFlagCheckIsCorrect)
 
 #if (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
     expected_flag = true;
-#endif
+#endif  // (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
 
     gko::kernels::cuda::upper_trs::should_perform_transpose(cuda, trans_flag);
     ASSERT_EQ(expected_flag, trans_flag);
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 9592bc20b8d..4df120cde9f 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -93,7 +93,7 @@ void HipExecutor::raw_free(void *ptr) const noexcept
                   << " in " << __func__ << ": " << hipGetErrorName(error_code)
                   << ": " << hipGetErrorString(error_code) << std::endl
                   << "Exiting program" << std::endl;
-#endif
+#endif  // GKO_VERBOSE_LEVEL >= 1
         std::exit(error_code);
     }
 }
diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp
index 3b7c8a978a4..95be1bfe129 100644
--- a/hip/base/hipsparse_bindings.hip.hpp
+++ b/hip/base/hipsparse_bindings.hip.hpp
@@ -372,8 +372,8 @@ GKO_BIND_HIPSPARSE64_CSR2HYB(ValueType, detail::not_implemented);
     {                                                                        \
         GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                       \
             handle, m, n, nnz, as_hiplibs_type(OrigValA), OrigRowPtrA,      \
-            OrigColIndA, as_hiplibs_type(TransValA), TransRowPtrA,          \
-            TransColIndA, copyValues, idxBase));                            \
+            OrigColIndA, as_hiplibs_type(TransValA), TransColIndA,          \
+            TransRowPtrA, copyValues, idxBase));                            \
     }                                                                        \
     static_assert(true,                                                      \
                   "This assert is used to counter the false positive extra " \
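Note: the hipsparse binding above fixes the same transpose bug on the HIP side. hipsparse's csr2csc writes its outputs in the order (cscVal, cscRowInd, cscColPtr), so the macro now forwards TransColIndA before TransRowPtrA; callers such as hip/matrix/csr_kernels.hip.cpp below can then uniformly pass the transposed matrix's row pointers followed by its column indices. A direct-call sketch for double precision (the helper name csr_transpose_hip is an assumption, error checks omitted):

    #include <hipsparse.h>

    void csr_transpose_hip(hipsparseHandle_t handle, int m, int n, int nnz,
                           const double* a_vals, const int* a_row_ptrs,
                           const int* a_col_idxs, double* t_vals,
                           int* t_row_ptrs, int* t_col_idxs)
    {
        // the CSC output of A is the CSR representation of A^T
        hipsparseDcsr2csc(handle, m, n, nnz, a_vals, a_row_ptrs, a_col_idxs,
                          t_vals,
                          t_col_idxs,  // cscRowInd: column indices of A^T
                          t_row_ptrs,  // cscColPtr: row pointers of A^T
                          HIPSPARSE_ACTION_NUMERIC,
                          HIPSPARSE_INDEX_BASE_ZERO);
    }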
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index 704513391ee..5f818dca9be 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -199,7 +199,7 @@ class thread_block_tile {
     {
 #if GINKGO_HIP_PLATFORM_NVCC
         __syncwarp(data_.mask);
-#endif
+#endif  // GINKGO_HIP_PLATFORM_NVCC
     }
 
 #if GINKGO_HIP_PLATFORM_HCC
@@ -375,7 +375,7 @@ class enable_extended_shuffle : public Group {
 
 // Implementing this as a using directive messes up with SFINAE for some reason,
 // probably a bug in NVCC. If it is a complete type, everything works fine.
-template
+template
 struct thread_block_tile : detail::enable_extended_shuffle> {
     using detail::enable_extended_shuffle<
@@ -398,12 +398,12 @@ __device__ __forceinline__
 namespace detail {
 
-template
+template
 struct is_group_impl> : std::true_type {};
-template
+template
 struct is_synchronizable_group_impl> : std::true_type {
 };
-template
+template
 struct is_communicator_group_impl> : std::true_type {};
 
@@ -485,7 +485,7 @@ class grid_group {
           (threadIdx.y +
            blockDim.y * (threadIdx.z +
                          blockDim.z * (blockIdx.x +
                                        gridDim.x *
-                                           (blockIdx.y + gridDim.y * blockIdx.z))))}
+                                       (blockIdx.y + gridDim.y * blockIdx.z))))}
     {}
     // clang-format on
diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp
index bdea7613b63..f97e70ef438 100644
--- a/hip/components/format_conversion.hip.hpp
+++ b/hip/components/format_conversion.hip.hpp
@@ -104,7 +104,11 @@ __host__ size_type calculate_nwarps(std::shared_ptr<const HipExecutor> exec,
                subwarp_size;
 #if GINKGO_HIP_PLATFORM_NVCC
     size_type multiple = 8;
-    if (nnz >= 2e6) {
+    if (nnz >= 2e8) {
+        multiple = 2048;
+    } else if (nnz >= 2e7) {
+        multiple = 512;
+    } else if (nnz >= 2e6) {
         multiple = 128;
     } else if (nnz >= 2e5) {
         multiple = 32;
diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp
index 82aee4891ec..1303a4e08da 100644
--- a/hip/matrix/csr_kernels.hip.cpp
+++ b/hip/matrix/csr_kernels.hip.cpp
@@ -926,7 +926,7 @@ void transpose(std::shared_ptr<const HipExecutor> exec,
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
-            trans->get_col_idxs(), trans->get_row_ptrs(), copyValues, idxBase);
+            trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp
index a1b2adfd794..677e886c36c 100644
--- a/hip/test/matrix/csr_kernels.hip.cpp
+++ b/hip/test/matrix/csr_kernels.hip.cpp
@@ -401,6 +401,7 @@ TEST_F(Csr, TransposeIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(d_trans.get()),
                         static_cast<Mtx *>(trans.get()), 0.0);
+    ASSERT_TRUE(static_cast<Mtx *>(d_trans.get())->is_sorted_by_column_index());
 }
 
 
@@ -482,18 +483,6 @@ TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef)
 }
 
 
-TEST_F(Csr, ConvertsEmptyToSellp)
-{
-    auto dempty_mtx = Mtx::create(hip);
-    auto dsellp_mtx = gko::matrix::Sellp<>::create(hip);
-
-    dempty_mtx->convert_to(dsellp_mtx.get());
-
-    ASSERT_EQ(hip->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0);
-    ASSERT_FALSE(dsellp_mtx->get_size());
-}
-
-
 TEST_F(Csr, CalculateMaxNnzPerRowIsEquivalentToRef)
 {
     set_up_apply_data(std::make_shared());
@@ -561,6 +550,18 @@ TEST_F(Csr, MoveToSellpIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, ConvertsEmptyToSellp)
+{
+    auto dempty_mtx = Mtx::create(hip);
+    auto dsellp_mtx = gko::matrix::Sellp<>::create(hip);
+
+    dempty_mtx->convert_to(dsellp_mtx.get());
+
+    ASSERT_EQ(hip->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0);
+    ASSERT_FALSE(dsellp_mtx->get_size());
+}
+
+
 TEST_F(Csr, CalculateTotalColsIsEquivalentToRef)
 {
     set_up_apply_data(std::make_shared());
@@ -625,7 +626,7 @@ TEST_F(Csr, MoveToHybridIsEquivalentToRef)
 
 TEST_F(Csr, RecognizeSortedMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared());
+    set_up_apply_data(std::make_shared(hip));
 
     bool is_sorted_hip{};
     bool is_sorted_ref{};
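Note: the format_conversion.hip.hpp hunk above and the csr.hpp hunk below extend the same oversubscription ladder: the multiple factor scales how many warps the load-balanced kernels launch, and the new 2e7 and 2e8 steps keep very large matrices from undersubscribing the GPU. A standalone mirror of the ladder (the function name is illustrative):

    #include <cstdint>

    int oversubscription_multiple(std::int64_t nnz)
    {
        if (nnz >= 200000000) return 2048;  // nnz >= 2e8
        if (nnz >= 20000000) return 512;    // nnz >= 2e7
        if (nnz >= 2000000) return 128;     // nnz >= 2e6
        if (nnz >= 200000) return 32;       // nnz >= 2e5
        return 8;
    }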
diff --git a/hip/test/preconditioner/jacobi_kernels.cpp b/hip/test/preconditioner/jacobi_kernels.cpp
index f1863a6b42f..868e10fbbad 100644
--- a/hip/test/preconditioner/jacobi_kernels.cpp
+++ b/hip/test/preconditioner/jacobi_kernels.cpp
@@ -315,7 +315,7 @@ TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithBlockSize64)
     GKO_ASSERT_MTX_NEAR(gko::as<Bj>(d_bj.get()), gko::as<Bj>(bj.get()), 1e-13);
 }
 
-#endif
+#endif  // GINKGO_HIP_PLATFORM_HCC
 
 
 TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithDifferentBlockSize)
@@ -395,7 +395,7 @@ TEST_F(Jacobi, HipApplyEquivalentToRefWithBlockSize64)
     GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
 }
 
-#endif
+#endif  // GINKGO_HIP_PLATFORM_HCC
 
 
 TEST_F(Jacobi, HipApplyEquivalentToRefWithDifferentBlockSize)
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 6c2ab2a50d2..10ef4d5d4cf 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -45,7 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifdef __HIPCC__
 #include <hip/hip_runtime.h>
-#endif
+#endif  // __HIPCC__
 
 
 // Macros for handling different compilers / architectures uniformly
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index d463d1004ec..759a602db5f 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -431,7 +431,11 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
         {
             if (warp_size_ > 0) {
                 int multiple = 8;
-                if (nnz >= 2e6) {
+                if (nnz >= 2e8) {
+                    multiple = 2048;
+                } else if (nnz >= 2e7) {
+                    multiple = 512;
+                } else if (nnz >= 2e6) {
                     multiple = 128;
                 } else if (nnz >= 2e5) {
                     multiple = 32;
diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp
index bb607efd615..9eafec2efad 100644
--- a/omp/test/matrix/csr_kernels.cpp
+++ b/omp/test/matrix/csr_kernels.cpp
@@ -299,6 +299,7 @@ TEST_F(Csr, TransposeIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(d_trans.get()),
                         static_cast<Mtx *>(trans.get()), 0.0);
+    ASSERT_TRUE(static_cast<Mtx *>(d_trans.get())->is_sorted_by_column_index());
 }
 
 
@@ -311,6 +312,8 @@ TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(d_trans.get()),
                         static_cast<Mtx *>(trans.get()), 0.0);
+    ASSERT_TRUE(
+        static_cast<Mtx *>(d_trans.get())->is_sorted_by_column_index());
 }
 
 
diff --git a/reference/test/stop/combined.cpp b/reference/test/stop/combined.cpp
index 08939c64392..00eed7107bd 100644
--- a/reference/test/stop/combined.cpp
+++ b/reference/test/stop/combined.cpp
@@ -37,7 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <thread>
 #if defined(_WIN32) || defined(__CYGWIN__)
 #include <windows.h>
-#endif
+#endif  // defined(_WIN32) || defined(__CYGWIN__)
 
 #include <gtest/gtest.h>
diff --git a/reference/test/stop/time.cpp b/reference/test/stop/time.cpp
index 258db5d2854..70da8138ae7 100644
--- a/reference/test/stop/time.cpp
+++ b/reference/test/stop/time.cpp
@@ -37,7 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <thread>
 #if defined(_WIN32) || defined(__CYGWIN__)
 #include <windows.h>
-#endif
+#endif  // defined(_WIN32) || defined(__CYGWIN__)
 
 #include <gtest/gtest.h>