diff --git a/.github/workflows/windows-build.yml b/.github/workflows/windows-build.yml index 66a81dda312..c1bda6bc2f3 100644 --- a/.github/workflows/windows-build.yml +++ b/.github/workflows/windows-build.yml @@ -4,16 +4,27 @@ on: [push] jobs: windows_cuda: - name: cuda102/release/shared (only compile) + strategy: + fail-fast: false + matrix: + config: + - {version: "10.2.89.20191206", name: "cuda102/release/shared"} + - {version: "latest", name: "cuda-latest/release/shared"} + name: msvc/${{ matrix.config.name }} (only compile) runs-on: [windows-latest] steps: - uses: actions/checkout@v2 - - name: setup + - name: setup (versioned) + if: matrix.config.version != 'latest' + run: | + choco install cuda --version=${{ matrix.config.version }} -y + - name: setup (latest) + if: matrix.config.version == 'latest' run: | - choco install cuda --version=10.2.89.20191206 -y + choco install cuda -y - name: configure run: | - $env:ChocolateyInstall = Convert-Path "$((Get-Command choco).Path)\..\.." + $env:ChocolateyInstall = Convert-Path "$((Get-Command choco).Path)\..\.." Import-Module "$env:ChocolateyInstall\helpers\chocolateyProfile.psm1" refreshenv mkdir build @@ -21,7 +32,7 @@ jobs: $env:PATH="$pwd\windows_shared_library;$env:PATH" cmake -DGINKGO_BUILD_CUDA=ON -DGINKGO_BUILD_OMP=OFF .. cmake --build . -j4 --config Release - + windows_ref: strategy: fail-fast: false diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 77598e32fe6..c2ab6d813a1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -339,17 +339,17 @@ build/cuda101/clang/all/release/static: - cuda - gpu -build/cuda101/intel/cuda/debug/static: +# clang-cuda with cuda 10.1 and friends +build/clang-cuda101/gcc/all/release/shared: <<: *default_build_with_test - image: localhost:5000/gko-cuda101-gnu8-llvm7-intel2019 + image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019 variables: <<: *default_variables - C_COMPILER: "icc" - CXX_COMPILER: "icpc" + CUDA_COMPILER: "clang++" BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_TYPE: "Debug" - BUILD_SHARED_LIBS: "OFF" + BUILD_HIP: "ON" + BUILD_TYPE: "Release" CUDA_ARCH: 35 only: variables: @@ -359,17 +359,58 @@ build/cuda101/intel/cuda/debug/static: - cuda - gpu -# clang-cuda with cuda 10.1 and friends -build/clang-cuda101/gcc/all/release/shared: +build/clang-cuda101/clang/cuda/debug/static: <<: *default_build_with_test image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019 variables: <<: *default_variables + C_COMPILER: "clang" + CXX_COMPILER: "clang++" CUDA_COMPILER: "clang++" BUILD_OMP: "ON" BUILD_CUDA: "ON" + BUILD_TYPE: "Debug" + BUILD_SHARED_LIBS: "OFF" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - cuda + - gpu + +# cuda 10.2 and friends +build/cuda102/gcc/all/debug/shared: + <<: *default_build_with_test + image: localhost:5000/gko-cuda102-gnu8-llvm8-intel2019 + variables: + <<: *default_variables + BUILD_OMP: "ON" + BUILD_CUDA: "ON" + BUILD_HIP: "ON" + BUILD_TYPE: "Debug" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - cuda + - gpu + +build/cuda102/clang/all/release/static: + <<: *default_build_with_test + image: localhost:5000/gko-cuda102-gnu8-llvm8-intel2019 + variables: + <<: *default_variables + C_COMPILER: "clang" + CXX_COMPILER: "clang++" + BUILD_OMP: "ON" + BUILD_CUDA: "ON" BUILD_HIP: "ON" BUILD_TYPE: "Release" + BUILD_SHARED_LIBS: "OFF" CUDA_ARCH: 35 only: variables: @@ -379,14 +420,71 @@ build/clang-cuda101/gcc/all/release/shared: - cuda - gpu -build/clang-cuda101/clang/cuda/debug/static: 
+build/cuda102/intel/cuda/debug/static: <<: *default_build_with_test - image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019 + image: localhost:5000/gko-cuda102-gnu8-llvm8-intel2019 + variables: + <<: *default_variables + C_COMPILER: "icc" + CXX_COMPILER: "icpc" + BUILD_OMP: "ON" + BUILD_CUDA: "ON" + BUILD_TYPE: "Debug" + BUILD_SHARED_LIBS: "OFF" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - cuda + - gpu + +# cuda 11.0 and friends +build/cuda110/gcc/cuda/debug/shared: + <<: *default_build_with_test + image: localhost:5000/gko-cuda110-gnu9-llvm9-intel2020 + variables: + <<: *default_variables + BUILD_OMP: "ON" + BUILD_CUDA: "ON" + BUILD_TYPE: "Debug" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - cuda + - gpu + +build/cuda110/clang/cuda/release/static: + <<: *default_build_with_test + image: localhost:5000/gko-cuda110-gnu9-llvm9-intel2020 variables: <<: *default_variables C_COMPILER: "clang" CXX_COMPILER: "clang++" - CUDA_COMPILER: "clang++" + BUILD_OMP: "ON" + BUILD_CUDA: "ON" + BUILD_TYPE: "Release" + BUILD_SHARED_LIBS: "OFF" + CUDA_ARCH: 35 + only: + variables: + - $RUN_CI_TAG + tags: + - private_ci + - cuda + - gpu + +build/cuda110/intel/cuda/debug/static: + <<: *default_build_with_test + image: localhost:5000/gko-cuda110-gnu9-llvm9-intel2020 + variables: + <<: *default_variables + C_COMPILER: "icc" + CXX_COMPILER: "icpc" BUILD_OMP: "ON" BUILD_CUDA: "ON" BUILD_TYPE: "Debug" diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e9a76fdd6f..ff54c555a1f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,6 +5,12 @@ if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.12) cmake_policy(SET CMP0074 NEW) endif() +# Let CAS handle the CUDA architecture flags (for now) +# Windows still gives CMP0104 warning if putting it in cuda. 
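+# (CMP0104, introduced in CMake 3.18, would otherwise require
+# CMAKE_CUDA_ARCHITECTURES to be set; keeping the policy at OLD leaves the
+# architecture flags entirely to CAS.)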
+if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) + cmake_policy(SET CMP0104 OLD) +endif() + project(Ginkgo LANGUAGES C CXX VERSION 1.2.0 DESCRIPTION "A numerical linear algebra library targeting many-core architectures") set(Ginkgo_VERSION_TAG "develop") set(PROJECT_VERSION_TAG ${Ginkgo_VERSION_TAG}) diff --git a/benchmark/utils/cuda_linops.hpp b/benchmark/utils/cuda_linops.hpp index 7762a2439d2..08f5ea7b61d 100644 --- a/benchmark/utils/cuda_linops.hpp +++ b/benchmark/utils/cuda_linops.hpp @@ -117,6 +117,9 @@ class CuspBase : public gko::LinOp { }; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + template class CuspCsrmp @@ -298,6 +301,9 @@ class CuspCsrmm }; +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + template class CuspCsrEx @@ -388,7 +394,7 @@ class CuspCsrEx { #ifdef ALLOWMP algmode_ = CUSPARSE_ALG_MERGE_PATH; -#endif +#endif // ALLOWMP } private: @@ -400,6 +406,9 @@ class CuspCsrEx }; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + template = 10010) && \ - !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + +#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) template @@ -512,7 +525,7 @@ void cusp_generic_spmv(std::shared_ptr gpu_exec, &vecb, dense_b->get_num_stored_elements(), as_culibs_type(const_cast(db)), cu_value)); - size_t buffer_size = 0; + gko::size_type buffer_size = 0; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpMV_bufferSize( gpu_exec->get_cusparse_handle(), trans, &scalars.get_const_data()[0], mat, vecb, &scalars.get_const_data()[1], vecx, cu_value, alg, @@ -680,22 +693,25 @@ class CuspGenericCoo }; -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && - // !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) } // namespace detail // Some shortcuts -using cusp_csr = detail::CuspCsr<>; using cusp_csrex = detail::CuspCsrEx<>; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +using cusp_csr = detail::CuspCsr<>; using cusp_csrmp = detail::CuspCsrmp<>; using cusp_csrmm = detail::CuspCsrmm<>; +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ - !(defined(_WIN32) || defined(__CYGWIN__)) +#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) using cusp_gcsr = detail::CuspGenericCsr<>; @@ -704,14 +720,17 @@ using cusp_gcsr2 = using cusp_gcoo = detail::CuspGenericCoo<>; -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && - // !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) using cusp_coo = detail::CuspHybrid; using cusp_ell = detail::CuspHybrid; using cusp_hybrid = detail::CuspHybrid<>; +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + #endif // GKO_BENCHMARK_UTILS_CUDA_LINOPS_HPP_ diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index c4379f834f7..ca5ad5f58be 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -60,8 +60,16 @@ std::string available_format = "hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, " "hybridminstorage" #ifdef HAS_CUDA - ", cusp_csr, cusp_csrex, 
cusp_csrmp, cusp_csrmm, cusp_coo, cusp_ell, " - "cusp_hybrid" + ", cusp_csr, cusp_csrex, cusp_coo" +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + ", cusp_csrmp, cusp_csrmm, cusp_ell, cusp_hybrid" +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) + ", cusp_gcsr, cusp_gcsr2, cusp_gcoo" +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) #endif // HAS_CUDA #ifdef HAS_HIP ", hipsp_csr, hipsp_csrmm, hipsp_coo, hipsp_ell, hipsp_hybrid" @@ -88,17 +96,23 @@ std::string format_description = "hybridminstorage: Hybrid uses the minimal storage to store the matrix." #ifdef HAS_CUDA "\n" - "cusp_hybrid: benchmark CuSPARSE spmv with cusparseXhybmv and an automatic " - "partition.\n" +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) "cusp_coo: use cusparseXhybmv with a CUSPARSE_HYB_PARTITION_USER " "partition.\n" - "cusp_ell: use cusparseXhybmv with CUSPARSE_HYB_PARTITION_MAX partition.\n" "cusp_csr: benchmark CuSPARSE with the cusparseXcsrmv function.\n" - "cusp_csrex: benchmark CuSPARSE with the cusparseXcsrmvEx function.\n" + "cusp_ell: use cusparseXhybmv with CUSPARSE_HYB_PARTITION_MAX partition.\n" "cusp_csrmp: benchmark CuSPARSE with the cusparseXcsrmv_mp function.\n" - "cusp_csrmm: benchmark CuSPARSE with the cusparseXcsrmv_mm function." -#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ - !(defined(_WIN32) || defined(__CYGWIN__)) + "cusp_csrmm: benchmark CuSPARSE with the cusparseXcsrmv_mm function.\n" + "cusp_hybrid: benchmark CuSPARSE spmv with cusparseXhybmv and an automatic " + "partition.\n" +#else // CUDA_VERSION >= 11000 + "cusp_csr: is an alias of cusp_gcsr.\n" + "cusp_coo: is an alias of cusp_gcoo.\n" +#endif + "cusp_csrex: benchmark CuSPARSE with the cusparseXcsrmvEx function." 
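+    // the generic cuSPARSE API (cusp_g*) is available from CUDA 10.1 on
+    // non-Windows platforms and unconditionally from CUDA 11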
+#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) "\n" "cusp_gcsr: benchmark CuSPARSE with the generic csr with default " "algorithm.\n" @@ -106,8 +120,8 @@ std::string format_description = "CUSPARSE_CSRMV_ALG2.\n" "cusp_gcoo: benchmark CuSPARSE with the generic coo with default " "algorithm.\n" -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && - // !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) #endif // HAS_CUDA #ifdef HAS_HIP "\n" @@ -187,20 +201,27 @@ const std::map( {"coo", read_matrix_from_data>}, {"ell", read_matrix_from_data>}, #ifdef HAS_CUDA +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) {"cusp_csr", read_matrix_from_data}, {"cusp_csrmp", read_matrix_from_data}, - {"cusp_csrex", read_matrix_from_data}, {"cusp_csrmm", read_matrix_from_data}, {"cusp_hybrid", read_matrix_from_data}, {"cusp_coo", read_matrix_from_data}, {"cusp_ell", read_matrix_from_data}, -#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ - !(defined(_WIN32) || defined(__CYGWIN__)) +#else // CUDA_VERSION >= 11000 + // cusp_csr, cusp_coo use the generic ones from CUDA 11 + {"cusp_csr", read_matrix_from_data}, + {"cusp_coo", read_matrix_from_data}, +#endif + {"cusp_csrex", read_matrix_from_data}, +#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) {"cusp_gcsr", read_matrix_from_data}, {"cusp_gcsr2", read_matrix_from_data}, {"cusp_gcoo", read_matrix_from_data}, -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && - // !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) #endif // HAS_CUDA #ifdef HAS_HIP {"hipsp_csr", read_matrix_from_data}, diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index d968f781609..f2ff8496e64 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -68,6 +68,9 @@ function(ginkgo_create_cuda_test test_name) PRIVATE "$" ) + cas_target_cuda_architectures(${TEST_TARGET_NAME} + ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES} + UNSUPPORTED "20" "21") set_target_properties(${TEST_TARGET_NAME} PROPERTIES OUTPUT_NAME ${test_name}) diff --git a/common/base/executor.hpp.inc b/common/base/executor.hpp.inc index adbe4d420f8..705404b314e 100644 --- a/common/base/executor.hpp.inc +++ b/common/base/executor.hpp.inc @@ -78,9 +78,9 @@ inline int convert_sm_ver_to_cores(int major, int minor) << "is undefined. The default value of " << nGpuArchCoresPerSM[index - 1].Cores << " Cores/SM is used." 
<< std::endl; -#endif +#endif // GKO_VERBOSE_LEVEL >= 1 return nGpuArchCoresPerSM[index - 1].Cores; } -} // namespace \ No newline at end of file +} // namespace diff --git a/common/components/atomic.hpp.inc b/common/components/atomic.hpp.inc index 20ed9b12c06..6ae0ac5e751 100644 --- a/common/components/atomic.hpp.inc +++ b/common/components/atomic.hpp.inc @@ -88,7 +88,7 @@ GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned int); #if !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) // CUDA 10.1 starts supporting 16-bit unsigned short int atomicCAS GKO_BIND_ATOMIC_HELPER_STRUCTURE(unsigned short int); -#endif +#endif // !(defined(CUDA_VERSION) && (CUDA_VERSION < 10010)) #undef GKO_BIND_ATOMIC_HELPER_STRUCTURE @@ -125,14 +125,16 @@ GKO_BIND_ATOMIC_ADD(float); // CUDA 8.0 starts suppoting 64-bit double atomicAdd on devices of compute // capability 6.x and higher GKO_BIND_ATOMIC_ADD(double); -#endif +#endif // !((defined(CUDA_VERSION) && (CUDA_VERSION < 8000)) || + // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) #if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \ (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) // CUDA 10.0 starts supporting 16-bit __half floating-point atomicAdd on devices // of compute capability 7.x and higher. GKO_BIND_ATOMIC_ADD(__half); -#endif +#endif // !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || + // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) #if !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || \ (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) @@ -142,7 +144,8 @@ GKO_BIND_ATOMIC_ADD(__half); // elements; the entire __half2 is not guaranteed to be atomic as a single // 32-bit access. GKO_BIND_ATOMIC_ADD(__half2); -#endif +#endif // !((defined(CUDA_VERSION) && (CUDA_VERSION < 10000)) || + // (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600))) #endif // !defined(__HIPCC__) || (defined(__HIP_DEVICE_COMPILE__) && diff --git a/common/components/segment_scan.hpp.inc b/common/components/segment_scan.hpp.inc index 3aac34832dc..9de87b6c702 100644 --- a/common/components/segment_scan.hpp.inc +++ b/common/components/segment_scan.hpp.inc @@ -38,7 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * performs suffix sum. Works on the source array and returns whether the thread * is the first element of its segment with same `ind`. 
*/ -template +template __device__ __forceinline__ bool segment_scan( const group::thread_block_tile &group, const IndexType ind, ValueType *__restrict__ val) @@ -60,4 +60,4 @@ __device__ __forceinline__ bool segment_scan( } } return head; -} \ No newline at end of file +} diff --git a/common/components/sorting.hpp.inc b/common/components/sorting.hpp.inc index cc042a08d90..611b99c5538 100644 --- a/common/components/sorting.hpp.inc +++ b/common/components/sorting.hpp.inc @@ -128,7 +128,8 @@ struct bitonic_warp { __forceinline__ __device__ static void merge(ValueType *els, bool reverse) { - auto tile = group::thread_block_tile{}; + auto tile = + group::tiled_partition(group::this_thread_block()); auto new_reverse = reverse != upper_half(); for (auto i = 0; i < num_local; ++i) { auto other = tile.shfl_xor(els[i], num_threads / 2); diff --git a/common/factorization/par_ilut_filter_kernels.hpp.inc b/common/factorization/par_ilut_filter_kernels.hpp.inc index 43addc2504b..2d520f33019 100644 --- a/common/factorization/par_ilut_filter_kernels.hpp.inc +++ b/common/factorization/par_ilut_filter_kernels.hpp.inc @@ -42,7 +42,8 @@ __device__ void abstract_filter_impl(const IndexType *row_ptrs, StepCallback step_cb, FinishCallback finish_cb) { - auto subwarp = group::thread_block_tile(); + auto subwarp = + group::tiled_partition(group::this_thread_block()); auto row = thread::get_subwarp_id_flat(); auto lane = subwarp.thread_rank(); auto lane_prefix_mask = (config::lane_mask_type(1) << lane) - 1; diff --git a/common/matrix/csr_kernels.hpp.inc b/common/matrix/csr_kernels.hpp.inc index 0ee4c34dad6..64b17a1016a 100644 --- a/common/matrix/csr_kernels.hpp.inc +++ b/common/matrix/csr_kernels.hpp.inc @@ -87,7 +87,7 @@ __device__ __forceinline__ void find_next_row( } -template __device__ __forceinline__ void warp_atomic_add( const group::thread_block_tile &group, bool force_write, @@ -105,7 +105,7 @@ __device__ __forceinline__ void warp_atomic_add( } -template __device__ __forceinline__ void process_window( const group::thread_block_tile &group, diff --git a/common/preconditioner/jacobi_generate_kernel.hpp.inc b/common/preconditioner/jacobi_generate_kernel.hpp.inc index da8fe668aa0..b402c94e5db 100644 --- a/common/preconditioner/jacobi_generate_kernel.hpp.inc +++ b/common/preconditioner/jacobi_generate_kernel.hpp.inc @@ -53,12 +53,12 @@ __device__ __forceinline__ bool validate_precision_reduction_feasibility( } // compute the condition number - auto perm = group.thread_rank(); - auto trans_perm = perm; + uint32 perm = group.thread_rank(); + uint32 trans_perm = perm; auto block_cond = compute_infinity_norm(group, block_size, block_size, row); - auto succeeded = - invert_block(group, block_size, row, perm, trans_perm); + auto succeeded = invert_block( + group, static_cast(block_size), row, perm, trans_perm); block_cond *= compute_infinity_norm(group, block_size, block_size, row); @@ -99,10 +99,10 @@ __global__ void __launch_bounds__(warps_per_block *config::warp_size) generate( const auto subwarp = group::tiled_partition(block); if (block_id < num_blocks) { const auto block_size = block_ptrs[block_id + 1] - block_ptrs[block_id]; - auto perm = subwarp.thread_rank(); - auto trans_perm = subwarp.thread_rank(); - invert_block(subwarp, block_size, row, perm, - trans_perm); + uint32 perm = subwarp.thread_rank(); + uint32 trans_perm = subwarp.thread_rank(); + invert_block(subwarp, static_cast(block_size), + row, perm, trans_perm); copy_matrix( subwarp, block_size, row, 1, perm, trans_perm, block_data + 
storage_scheme.get_global_block_offset(block_id), @@ -138,11 +138,11 @@ __launch_bounds__(warps_per_block *config::warp_size) adaptive_generate( // compute inverse and figure out the correct precision const auto subwarp = group::tiled_partition(block); - const auto block_size = + const uint32 block_size = block_id < num_blocks ? block_ptrs[block_id + 1] - block_ptrs[block_id] : 0; - auto perm = subwarp.thread_rank(); - auto trans_perm = subwarp.thread_rank(); + uint32 perm = subwarp.thread_rank(); + uint32 trans_perm = subwarp.thread_rank(); auto prec_descriptor = ~uint32{}; if (block_id < num_blocks) { auto block_cond = compute_infinity_norm( diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 0822dee0174..6b5c623e856 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -151,11 +151,6 @@ target_link_libraries(ginkgo_cuda PRIVATE ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPA # Need to link against ginkgo_hip for the `raw_copy_to(HipExecutor ...)` method target_link_libraries(ginkgo_cuda PUBLIC ginkgo_hip) -# Let CAS handle the CUDA architecture flags (for now) -if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) - cmake_policy(SET CMP0104 OLD) -endif() - cas_target_cuda_architectures(ginkgo_cuda ARCHITECTURES ${GINKGO_CUDA_ARCHITECTURES} UNSUPPORTED "20" "21") diff --git a/cuda/base/cusparse_bindings.hpp b/cuda/base/cusparse_bindings.hpp index ed9f043f9ef..0cbb962bb15 100644 --- a/cuda/base/cusparse_bindings.hpp +++ b/cuda/base/cusparse_bindings.hpp @@ -87,6 +87,9 @@ template <> struct is_supported, int32> : std::true_type {}; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + #define GKO_BIND_CUSPARSE32_SPMV(ValueType, CusparseName) \ inline void spmv(cusparseHandle_t handle, cusparseOperation_t transA, \ int32 m, int32 n, int32 nnz, const ValueType *alpha, \ @@ -133,6 +136,41 @@ GKO_BIND_CUSPARSE64_SPMV(ValueType, detail::not_implemented); #undef GKO_BIND_CUSPARSE64_SPMV +#else // CUDA_VERSION >= 11000 + + +template +inline void spmv_buffersize(cusparseHandle_t handle, cusparseOperation_t opA, + const ValueType *alpha, + const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, + const ValueType *beta, + const cusparseDnVecDescr_t vecY, + cusparseSpMVAlg_t alg, size_type *bufferSize) +{ + constexpr auto value_type = cuda_data_type(); + cusparseSpMV_bufferSize(handle, opA, alpha, matA, vecX, beta, vecY, + value_type, alg, bufferSize); +} + +template +inline void spmv(cusparseHandle_t handle, cusparseOperation_t opA, + const ValueType *alpha, const cusparseSpMatDescr_t matA, + const cusparseDnVecDescr_t vecX, const ValueType *beta, + const cusparseDnVecDescr_t vecY, cusparseSpMVAlg_t alg, + void *externalBuffer) +{ + constexpr auto value_type = cuda_data_type(); + cusparseSpMV(handle, opA, alpha, matA, vecX, beta, vecY, value_type, alg, + externalBuffer); +} + + +#endif + + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + #define GKO_BIND_CUSPARSE32_SPMV(ValueType, CusparseName) \ inline void spmv_mp(cusparseHandle_t handle, cusparseOperation_t transA, \ int32 m, int32 n, int32 nnz, const ValueType *alpha, \ @@ -227,6 +265,9 @@ GKO_BIND_CUSPARSE64_SPMM(ValueType, detail::not_implemented); #undef GKO_BIND_CUSPARSE64_SPMM +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + template inline void spmv(cusparseHandle_t handle, cusparseAlgMode_t alg, cusparseOperation_t transA, IndexType m, IndexType n, @@ -277,7 +318,7 @@ inline void spmv_buffersize(cusparseHandle_t handle, cusparseAlgMode_t alg, const IndexType *csrRowPtrA, const IndexType *csrColIndA, 
const ValueType *x, const ValueType *beta, ValueType *y, - size_t *bufferSizeInBytes) GKO_NOT_IMPLEMENTED; + size_type *bufferSizeInBytes) GKO_NOT_IMPLEMENTED; #define GKO_BIND_CUSPARSE_SPMV_BUFFERSIZE(ValueType) \ template <> \ @@ -287,7 +328,7 @@ inline void spmv_buffersize(cusparseHandle_t handle, cusparseAlgMode_t alg, const ValueType *alpha, const cusparseMatDescr_t descrA, \ const ValueType *csrValA, const int32 *csrRowPtrA, \ const int32 *csrColIndA, const ValueType *x, const ValueType *beta, \ - ValueType *y, size_t *bufferSizeInBytes) \ + ValueType *y, size_type *bufferSizeInBytes) \ { \ auto data_type = gko::kernels::cuda::cuda_data_type(); \ if (data_type == CUDA_C_8U) { \ @@ -311,6 +352,9 @@ GKO_BIND_CUSPARSE_SPMV_BUFFERSIZE(std::complex); #undef GKO_BIND_CUSPARSE_SPMV_BUFFERSIZE +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + #define GKO_BIND_CUSPARSE32_SPMV(ValueType, CusparseName) \ inline void spmv(cusparseHandle_t handle, cusparseOperation_t transA, \ const ValueType *alpha, const cusparseMatDescr_t descrA, \ @@ -336,6 +380,12 @@ GKO_BIND_CUSPARSE32_SPMV(ValueType, detail::not_implemented); #undef GKO_BIND_CUSPARSE32_SPMV +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + template void spgemm_buffer_size( cusparseHandle_t handle, IndexType m, IndexType n, IndexType k, @@ -459,6 +509,78 @@ GKO_BIND_CUSPARSE_SPGEMM(std::complex, cusparseZcsrgemm2); #undef GKO_BIND_CUSPARSE_SPGEMM +#else // CUDA_VERSION >= 11000 + + +template +void spgemm_work_estimation(cusparseHandle_t handle, const ValueType *alpha, + cusparseSpMatDescr_t a_descr, + cusparseSpMatDescr_t b_descr, const ValueType *beta, + cusparseSpMatDescr_t c_descr, + cusparseSpGEMMDescr_t spgemm_descr, + size_type &buffer1_size, void *buffer1) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_workEstimation( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta, + c_descr, cuda_data_type(), CUSPARSE_SPGEMM_DEFAULT, + spgemm_descr, &buffer1_size, buffer1)); +} + + +template +void spgemm_compute(cusparseHandle_t handle, const ValueType *alpha, + cusparseSpMatDescr_t a_descr, cusparseSpMatDescr_t b_descr, + const ValueType *beta, cusparseSpMatDescr_t c_descr, + cusparseSpGEMMDescr_t spgemm_descr, void *buffer1, + size_type &buffer2_size, void *buffer2) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_compute( + handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, b_descr, beta, + c_descr, cuda_data_type(), CUSPARSE_SPGEMM_DEFAULT, + spgemm_descr, &buffer2_size, buffer2)); +} + + +template +void spgemm_copy(cusparseHandle_t handle, const ValueType *alpha, + cusparseSpMatDescr_t a_descr, cusparseSpMatDescr_t b_descr, + const ValueType *beta, cusparseSpMatDescr_t c_descr, + cusparseSpGEMMDescr_t spgemm_descr) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSpGEMM_copy(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_NON_TRANSPOSE, alpha, a_descr, + b_descr, beta, c_descr, cuda_data_type(), + CUSPARSE_SPGEMM_DEFAULT, spgemm_descr)); +} + + +inline size_type sparse_matrix_nnz(cusparseSpMatDescr_t descr) +{ + int64_t dummy1{}; + int64_t dummy2{}; + int64_t nnz{}; + cusparseSpMatGetSize(descr, &dummy1, &dummy2, &nnz); + return static_cast(nnz); +} + + +template +void csr_set_pointers(cusparseSpMatDescr_t descr, IndexType *row_ptrs, + IndexType *col_idxs, ValueType *vals) +{ + cusparseCsrSetPointers(descr, row_ptrs, col_idxs, vals); +} + + +#endif 
// CUDA_VERSION >= 11000 + + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + #define GKO_BIND_CUSPARSE32_CSR2HYB(ValueType, CusparseName) \ inline void csr2hyb(cusparseHandle_t handle, int32 m, int32 n, \ const cusparseMatDescr_t descrA, \ @@ -504,101 +626,119 @@ GKO_BIND_CUSPARSE64_CSR2HYB(ValueType, detail::not_implemented); #undef GKO_BIND_CUSPARSE64_CSR2HYB +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + +template +inline void transpose(cusparseHandle_t handle, size_type m, size_type n, + size_type nnz, const ValueType *OrigValA, + const IndexType *OrigRowPtrA, + const IndexType *OrigColIndA, ValueType *TransValA, + IndexType *TransRowPtrA, IndexType *TransColIndA, + cusparseAction_t copyValues, + cusparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED; + +// Cusparse csr2csc use the order (row_inx, col_ptr) for csc, so we need to +// switch row_ptr and col_idx of transposed csr here #define GKO_BIND_CUSPARSE_TRANSPOSE32(ValueType, CusparseName) \ - inline void transpose(cusparseHandle_t handle, size_type m, size_type n, \ - size_type nnz, const ValueType *OrigValA, \ - const int32 *OrigRowPtrA, const int32 *OrigColIndA, \ - ValueType *TransValA, int32 *TransRowPtrA, \ - int32 *TransColIndA, cusparseAction_t copyValues, \ - cusparseIndexBase_t idxBase) \ + template <> \ + inline void transpose( \ + cusparseHandle_t handle, size_type m, size_type n, size_type nnz, \ + const ValueType *OrigValA, const int32 *OrigRowPtrA, \ + const int32 *OrigColIndA, ValueType *TransValA, int32 *TransRowPtrA, \ + int32 *TransColIndA, cusparseAction_t copyValues, \ + cusparseIndexBase_t idxBase) \ { \ GKO_ASSERT_NO_CUSPARSE_ERRORS( \ CusparseName(handle, m, n, nnz, as_culibs_type(OrigValA), \ OrigRowPtrA, OrigColIndA, as_culibs_type(TransValA), \ - TransRowPtrA, TransColIndA, copyValues, idxBase)); \ + TransColIndA, TransRowPtrA, copyValues, idxBase)); \ } \ static_assert(true, \ "This assert is used to counter the false positive extra " \ "semi-colon warnings") -#define GKO_BIND_CUSPARSE_TRANSPOSE64(ValueType, CusparseName) \ - inline void transpose(cusparseHandle_t handle, size_type m, size_type n, \ - size_type nnz, const ValueType *OrigValA, \ - const int64 *OrigRowPtrA, const int64 *OrigColIndA, \ - ValueType *TransValA, int64 *TransRowPtrA, \ - int64 *TransColIndA, cusparseAction_t copyValues, \ - cusparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED; \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") - GKO_BIND_CUSPARSE_TRANSPOSE32(float, cusparseScsr2csc); GKO_BIND_CUSPARSE_TRANSPOSE32(double, cusparseDcsr2csc); -GKO_BIND_CUSPARSE_TRANSPOSE64(float, cusparseScsr2csc); -GKO_BIND_CUSPARSE_TRANSPOSE64(double, cusparseDcsr2csc); GKO_BIND_CUSPARSE_TRANSPOSE32(std::complex, cusparseCcsr2csc); GKO_BIND_CUSPARSE_TRANSPOSE32(std::complex, cusparseZcsr2csc); -GKO_BIND_CUSPARSE_TRANSPOSE64(std::complex, cusparseCcsr2csc); -GKO_BIND_CUSPARSE_TRANSPOSE64(std::complex, cusparseZcsr2csc); -template -GKO_BIND_CUSPARSE_TRANSPOSE32(ValueType, detail::not_implemented); -template -GKO_BIND_CUSPARSE_TRANSPOSE64(ValueType, detail::not_implemented); -#undef GKO_BIND_CUSPARSE_TRANSPOSE +#undef GKO_BIND_CUSPARSE_TRANSPOSE32 -#define GKO_BIND_CUSPARSE_CONJ_TRANSPOSE32(ValueType, CusparseName) \ - inline void conj_transpose( \ - cusparseHandle_t handle, size_type m, size_type n, size_type nnz, \ - const ValueType *OrigValA, const int32 *OrigRowPtrA, \ - const int32 *OrigColIndA, ValueType 
*TransValA, int32 *TransRowPtrA, \ - int32 *TransColIndA, cusparseAction_t copyValues, \ - cusparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED; \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") -#define GKO_BIND_CUSPARSE_CONJ_TRANSPOSE64(ValueType, CusparseName) \ - inline void conj_transpose( \ - cusparseHandle_t handle, size_type m, size_type n, size_type nnz, \ - const ValueType *OrigValA, const int64 *OrigRowPtrA, \ - const int64 *OrigColIndA, ValueType *TransValA, int64 *TransRowPtrA, \ - int64 *TransColIndA, cusparseAction_t copyValues, \ - cusparseIndexBase_t idxBase) GKO_NOT_IMPLEMENTED; \ - static_assert(true, \ - "This assert is used to counter the false positive extra " \ - "semi-colon warnings") +#else // CUDA_VERSION >= 11000 -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE32(float, cusparseScsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE32(double, cusparseDcsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE64(float, cusparseScsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE64(double, cusparseDcsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE32(std::complex, cusparseCcsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE32(std::complex, cusparseZcsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE64(std::complex, cusparseCcsr2csc); -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE64(std::complex, cusparseZcsr2csc); -template -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE32(ValueType, detail::not_implemented); -template -GKO_BIND_CUSPARSE_CONJ_TRANSPOSE64(ValueType, detail::not_implemented); +template +inline void transpose_buffersize( + cusparseHandle_t handle, size_type m, size_type n, size_type nnz, + const ValueType *OrigValA, const IndexType *OrigRowPtrA, + const IndexType *OrigColIndA, ValueType *TransValA, IndexType *TransRowPtrA, + IndexType *TransColIndA, cudaDataType_t valType, + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, + cusparseCsr2CscAlg_t alg, size_type *buffer_size) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_TRANSPOSE_BUFFERSIZE32(ValueType) \ + template <> \ + inline void transpose_buffersize( \ + cusparseHandle_t handle, size_type m, size_type n, size_type nnz, \ + const ValueType *OrigValA, const int32 *OrigRowPtrA, \ + const int32 *OrigColIndA, ValueType *TransValA, int32 *TransRowPtrA, \ + int32 *TransColIndA, cudaDataType_t valType, \ + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, \ + cusparseCsr2CscAlg_t alg, size_type *buffer_size) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCsr2cscEx2_bufferSize( \ + handle, m, n, nnz, OrigValA, OrigRowPtrA, OrigColIndA, TransValA, \ + TransRowPtrA, TransColIndA, valType, copyValues, idxBase, alg, \ + buffer_size)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") -#undef GKO_BIND_CUSPARSE_CONJ_TRANSPOSE +GKO_BIND_CUSPARSE_TRANSPOSE_BUFFERSIZE32(float); +GKO_BIND_CUSPARSE_TRANSPOSE_BUFFERSIZE32(double); +GKO_BIND_CUSPARSE_TRANSPOSE_BUFFERSIZE32(std::complex); +GKO_BIND_CUSPARSE_TRANSPOSE_BUFFERSIZE32(std::complex); +template +inline void transpose(cusparseHandle_t handle, size_type m, size_type n, + size_type nnz, const ValueType *OrigValA, + const IndexType *OrigRowPtrA, + const IndexType *OrigColIndA, ValueType *TransValA, + IndexType *TransRowPtrA, IndexType *TransColIndA, + cudaDataType_t valType, cusparseAction_t copyValues, + cusparseIndexBase_t idxBase, cusparseCsr2CscAlg_t alg, + void *buffer) GKO_NOT_IMPLEMENTED; + +#define GKO_BIND_CUSPARSE_TRANSPOSE32(ValueType) \ + template <> \ + inline void transpose( \ + cusparseHandle_t 
handle, size_type m, size_type n, size_type nnz, \ + const ValueType *OrigValA, const int32 *OrigRowPtrA, \ + const int32 *OrigColIndA, ValueType *TransValA, int32 *TransRowPtrA, \ + int32 *TransColIndA, cudaDataType_t valType, \ + cusparseAction_t copyValues, cusparseIndexBase_t idxBase, \ + cusparseCsr2CscAlg_t alg, void *buffer) \ + { \ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCsr2cscEx2( \ + handle, m, n, nnz, OrigValA, OrigRowPtrA, OrigColIndA, TransValA, \ + TransRowPtrA, TransColIndA, valType, copyValues, idxBase, alg, \ + buffer)); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") -inline cusparseHandle_t init() -{ - cusparseHandle_t handle{}; - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreate(&handle)); - GKO_ASSERT_NO_CUSPARSE_ERRORS( - cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE)); - return handle; -} +GKO_BIND_CUSPARSE_TRANSPOSE32(float); +GKO_BIND_CUSPARSE_TRANSPOSE32(double); +GKO_BIND_CUSPARSE_TRANSPOSE32(std::complex); +GKO_BIND_CUSPARSE_TRANSPOSE32(std::complex); -inline void destroy(cusparseHandle_t handle) -{ - GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroy(handle)); -} +#endif inline cusparseMatDescr_t create_mat_descr() @@ -615,6 +755,9 @@ inline void destroy(cusparseMatDescr_t descr) } +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + inline csrgemm2Info_t create_spgemm_info() { csrgemm2Info_t info{}; @@ -629,6 +772,84 @@ inline void destroy(csrgemm2Info_t info) } +#else // CUDA_VERSION >= 11000 + + +inline cusparseSpGEMMDescr_t create_spgemm_descr() +{ + cusparseSpGEMMDescr_t descr{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_createDescr(&descr)); + return descr; +} + + +inline void destroy(cusparseSpGEMMDescr_t info) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseSpGEMM_destroyDescr(info)); +} + + +template +inline cusparseDnVecDescr_t create_dnvec(int64_t size, ValueType *values) +{ + cusparseDnVecDescr_t descr{}; + constexpr auto value_type = cuda_data_type(); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseCreateDnVec(&descr, size, values, value_type)); + return descr; +} + + +inline void destroy(cusparseDnVecDescr_t descr) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyDnVec(descr)); +} + + +template +inline cusparseSpVecDescr_t create_spvec(int64_t size, int64_t nnz, + IndexType *indices, ValueType *values) +{ + cusparseSpVecDescr_t descr{}; + constexpr auto index_type = cusparse_index_type(); + constexpr auto value_type = cuda_data_type(); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseCreateSpVec(&descr, size, nnz, indices, values, index_type, + CUSPARSE_INDEX_BASE_ZERO, value_type)); + return descr; +} + + +inline void destroy(cusparseSpVecDescr_t descr) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpVec(descr)); +} + + +template +inline cusparseSpMatDescr_t create_csr(int64_t rows, int64_t cols, int64_t nnz, + IndexType *csrRowOffsets, + IndexType *csrColInd, + ValueType *csrValues) +{ + cusparseSpMatDescr_t descr{}; + constexpr auto index_type = cusparse_index_type(); + constexpr auto value_type = cuda_data_type(); + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateCsr( + &descr, rows, cols, nnz, csrRowOffsets, csrColInd, csrValues, + index_type, index_type, CUSPARSE_INDEX_BASE_ZERO, value_type)); + return descr; +} + + +inline void destroy(cusparseSpMatDescr_t descr) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(descr)); +} + +#endif + + // CUDA versions 9.2 and above have csrsm2. 
#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 9020)) @@ -694,7 +915,7 @@ inline void destroy(csrilu02Info_t info) const ValueType *csrVal, const int32 *csrRowPtr, \ const int32 *csrColInd, const ValueType *rhs, int32 sol_size, \ csrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ - size_t *factor_work_size) \ + size_type *factor_work_size) \ { \ GKO_ASSERT_NO_CUSPARSE_ERRORS( \ CusparseName(handle, algo, trans1, trans2, m, n, nnz, \ @@ -714,7 +935,7 @@ inline void destroy(csrilu02Info_t info) const ValueType *csrVal, const int64 *csrRowPtr, \ const int64 *csrColInd, const ValueType *rhs, int64 sol_size, \ csrsm2Info_t factor_info, cusparseSolvePolicy_t policy, \ - size_t *factor_work_size) GKO_NOT_IMPLEMENTED; \ + size_type *factor_work_size) GKO_NOT_IMPLEMENTED; \ static_assert(true, \ "This assert is used to counter the false positive extra " \ "semi-colon warnings") @@ -989,6 +1210,9 @@ inline void csrsort(cusparseHandle_t handle, int32 m, int32 n, int32 nnz, } +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + template void gather(cusparseHandle_t handle, IndexType nnz, const ValueType *in, ValueType *out, const IndexType *permutation) GKO_NOT_IMPLEMENTED; @@ -1015,6 +1239,19 @@ GKO_BIND_CUSPARSE_GATHER(std::complex, cusparseZgthr); #undef GKO_BIND_CUSPARSE_GATHER +#else // CUDA_VERSION >= 11000 + + +inline void gather(cusparseHandle_t handle, cusparseDnVecDescr_t in, + cusparseSpVecDescr_t out) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseGather(handle, in, out)); +} + + +#endif + + template void ilu0_buffer_size(cusparseHandle_t handle, IndexType m, IndexType nnz, const cusparseMatDescr_t descr, const ValueType *vals, diff --git a/cuda/base/cusparse_handle.hpp b/cuda/base/cusparse_handle.hpp new file mode 100644 index 00000000000..0f2733c0ed4 --- /dev/null +++ b/cuda/base/cusparse_handle.hpp @@ -0,0 +1,77 @@ +/************************************************************* +Copyright (c) 2017-2020, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CUDA_BASE_CUSPARSE_HANDLE_HPP_ +#define GKO_CUDA_BASE_CUSPARSE_HANDLE_HPP_ + + +#include +#include + + +#include + + +namespace gko { +namespace kernels { +namespace cuda { +/** + * @brief The CUSPARSE namespace. + * + * @ingroup cusparse + */ +namespace cusparse { + + +inline cusparseHandle_t init() +{ + cusparseHandle_t handle{}; + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreate(&handle)); + GKO_ASSERT_NO_CUSPARSE_ERRORS( + cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE)); + return handle; +} + + +inline void destroy(cusparseHandle_t handle) +{ + GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroy(handle)); +} + + +} // namespace cusparse +} // namespace cuda +} // namespace kernels +} // namespace gko + + +#endif // GKO_CUDA_BASE_CUSPARSE_HANDLE_HPP_ diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index a72ecef7591..98d24c2e00f 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -45,7 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" -#include "cuda/base/cusparse_bindings.hpp" +#include "cuda/base/cusparse_handle.hpp" #include "cuda/base/device_guard.hpp" @@ -93,7 +93,7 @@ void CudaExecutor::raw_free(void *ptr) const noexcept << " in " << __func__ << ": " << cudaGetErrorName(error_code) << ": " << cudaGetErrorString(error_code) << std::endl << "Exiting program" << std::endl; -#endif +#endif // GKO_VERBOSE_LEVEL >= 1 std::exit(error_code); } } diff --git a/cuda/base/types.hpp b/cuda/base/types.hpp index 36190658d57..580976b265f 100644 --- a/cuda/base/types.hpp +++ b/cuda/base/types.hpp @@ -198,8 +198,9 @@ constexpr cudaDataType_t cuda_data_type_impl() } -#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ - !(defined(_WIN32) || defined(__CYGWIN__)) +#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) template @@ -221,8 +222,8 @@ constexpr cusparseIndexType_t cusparse_index_type_impl() } -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && - // !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) } // namespace detail @@ -243,8 +244,9 @@ constexpr cudaDataType_t cuda_data_type() } -#if defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && \ - !(defined(_WIN32) || defined(__CYGWIN__)) +#if defined(CUDA_VERSION) && \ + (CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) /** @@ -262,8 +264,8 @@ constexpr cusparseIndexType_t cusparse_index_type() } -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 10010) && - // !(defined(_WIN32) || defined(__CYGWIN__)) +#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= + // 10010) && !(defined(_WIN32) || defined(__CYGWIN__)))) /** diff --git a/cuda/components/cooperative_groups.cuh b/cuda/components/cooperative_groups.cuh index 25af6135ced..d48815c0c6f 100644 --- a/cuda/components/cooperative_groups.cuh +++ b/cuda/components/cooperative_groups.cuh @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include #include "cuda/base/config.hpp" @@ -242,7 +243,7 @@ private: (threadIdx.y + blockDim.y * (threadIdx.z + blockDim.z * (blockIdx.x + gridDim.x * - (blockIdx.y + gridDim.y * blockIdx.z))))} + (blockIdx.y + gridDim.y * blockIdx.z))))} {} // clang-format on @@ -312,6 +313,9 @@ struct is_synchronizable_group_impl : std::true_type {}; namespace detail { +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + // Adds generalized shuffles that support any type to the group. template class enable_extended_shuffle : public Group { @@ -363,17 +367,33 @@ private: }; +#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + } // namespace detail +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + // Implementing this as a using directive messes up with SFINAE for some reason, // probably a bug in NVCC. If it is a complete type, everything works fine. -template +template struct thread_block_tile : detail::enable_extended_shuffle< cooperative_groups::thread_block_tile> { using detail::enable_extended_shuffle< cooperative_groups::thread_block_tile>::enable_extended_shuffle; }; + + +#else // CUDA_VERSION >= 11000 + + +// Cuda11 cooperative group's shuffle supports complex +using cooperative_groups::thread_block_tile; + + +#endif // inherits thread_group // // public API: @@ -393,20 +413,44 @@ struct thread_block_tile : detail::enable_extended_shuffle< // unsigned match_all(T) const // TODO: implement for all types namespace detail { -template + + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + +template struct is_group_impl> : std::true_type {}; -template +template struct is_synchronizable_group_impl> : std::true_type { }; -template +template struct is_communicator_group_impl> : std::true_type {}; // make sure the original CUDA group is recognized whenever possible -template +template struct is_group_impl> : std::true_type {}; -template +template struct is_synchronizable_group_impl> : std::true_type {}; + + +#else // CUDA_VERSION >= 11000 + + +// thread_block_tile is same as cuda11's +template +struct is_group_impl> : std::true_type {}; +template +struct is_synchronizable_group_impl> + : std::true_type {}; +template +struct is_communicator_group_impl> + : std::true_type {}; + + +#endif + + } // namespace detail @@ -457,6 +501,10 @@ __device__ __forceinline__ auto tiled_partition(const Group &g) // Only support tile_partition with 1, 2, 4, 8, 16, 32. // Reference: // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#warp-notes +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + + +// cooperative group before cuda11 does not contain parent group in template template __device__ __forceinline__ std::enable_if_t<(Size <= kernels::cuda::config::warp_size) && (Size > 0) && @@ -468,6 +516,23 @@ __device__ __forceinline__ } +#else // CUDA_VERSION >= 11000 + + +// cooperative group after cuda11 contain parent group in template. +// we remove the information because we do not restrict cooperative group by its +// parent group type. 
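+// (the ParentT-tagged tile returned by CUDA 11's tiled_partition converts to
+// the parent-agnostic thread_block_tile<Size>)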
+template +__device__ __forceinline__ thread_block_tile tiled_partition( + const Group &g) +{ + return cooperative_groups::tiled_partition(g); +} + + +#endif + + } // namespace group } // namespace cuda } // namespace kernels diff --git a/cuda/components/format_conversion.cuh b/cuda/components/format_conversion.cuh index d7235414f37..17b20438524 100644 --- a/cuda/components/format_conversion.cuh +++ b/cuda/components/format_conversion.cuh @@ -100,7 +100,11 @@ __host__ size_type calculate_nwarps(std::shared_ptr exec, exec->get_num_warps_per_sm() * config::warp_size / subwarp_size; size_type nwarps_in_cuda = exec->get_num_multiprocessor() * warps_per_sm; size_type multiple = 8; - if (nnz >= 2e6) { + if (nnz >= 2e8) { + multiple = 2048; + } else if (nnz >= 2e7) { + multiple = 512; + } else if (nnz >= 2e6) { multiple = 128; } else if (nnz >= 2e5) { multiple = 32; diff --git a/cuda/matrix/csr_kernels.cu b/cuda/matrix/csr_kernels.cu index 8c189b99673..f618bcd9e8f 100644 --- a/cuda/matrix/csr_kernels.cu +++ b/cuda/matrix/csr_kernels.cu @@ -307,23 +307,56 @@ void spmv(std::shared_ptr exec, if (cusparse::is_supported::value) { // TODO: add implementation for int64 and multiple RHS auto handle = exec->get_cusparse_handle(); - auto descr = cusparse::create_mat_descr(); { cusparse::pointer_mode_guard pm_guard(handle); + const auto alpha = one(); + const auto beta = zero(); + // TODO: add implementation for int64 and multiple RHS + if (b->get_stride() != 1 || c->get_stride() != 1) + GKO_NOT_IMPLEMENTED; + +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + auto descr = cusparse::create_mat_descr(); auto row_ptrs = a->get_const_row_ptrs(); auto col_idxs = a->get_const_col_idxs(); - auto alpha = one(); - auto beta = zero(); - if (b->get_stride() != 1 || c->get_stride() != 1) { - GKO_NOT_IMPLEMENTED; - } cusparse::spmv(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0], a->get_size()[1], a->get_num_stored_elements(), &alpha, descr, a->get_const_values(), row_ptrs, col_idxs, b->get_const_values(), &beta, c->get_values()); + + cusparse::destroy(descr); +#else // CUDA_VERSION >= 11000 + cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseSpMVAlg_t alg = CUSPARSE_CSRMV_ALG1; + auto row_ptrs = + const_cast(a->get_const_row_ptrs()); + auto col_idxs = + const_cast(a->get_const_col_idxs()); + auto values = const_cast(a->get_const_values()); + auto mat = cusparse::create_csr( + a->get_size()[0], a->get_size()[1], + a->get_num_stored_elements(), row_ptrs, col_idxs, values); + auto b_val = const_cast(b->get_const_values()); + auto c_val = c->get_values(); + auto vecb = + cusparse::create_dnvec(b->get_num_stored_elements(), b_val); + auto vecc = + cusparse::create_dnvec(c->get_num_stored_elements(), c_val); + size_type buffer_size = 0; + cusparse::spmv_buffersize(handle, trans, &alpha, mat, + vecb, &beta, vecc, alg, + &buffer_size); + + gko::Array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + cusparse::spmv(handle, trans, &alpha, mat, vecb, + &beta, vecc, alg, buffer); + cusparse::destroy(vecb); + cusparse::destroy(vecc); + cusparse::destroy(mat); +#endif } - cusparse::destroy(descr); } else { GKO_NOT_IMPLEMENTED; } @@ -368,14 +401,13 @@ void advanced_spmv(std::shared_ptr exec, a->get_strategy()->get_name() == "cusparse") { if (cusparse::is_supported::value) { // TODO: add implementation for int64 and multiple RHS - auto descr = cusparse::create_mat_descr(); - - auto row_ptrs = a->get_const_row_ptrs(); - auto col_idxs = a->get_const_col_idxs(); - if 
(b->get_stride() != 1 || c->get_stride() != 1) GKO_NOT_IMPLEMENTED; +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + auto descr = cusparse::create_mat_descr(); + auto row_ptrs = a->get_const_row_ptrs(); + auto col_idxs = a->get_const_col_idxs(); cusparse::spmv(exec->get_cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, a->get_size()[0], a->get_size()[1], a->get_num_stored_elements(), @@ -385,6 +417,34 @@ void advanced_spmv(std::shared_ptr exec, c->get_values()); cusparse::destroy(descr); +#else // CUDA_VERSION >= 11000 + cusparseOperation_t trans = CUSPARSE_OPERATION_NON_TRANSPOSE; + cusparseSpMVAlg_t alg = CUSPARSE_CSRMV_ALG1; + auto row_ptrs = const_cast(a->get_const_row_ptrs()); + auto col_idxs = const_cast(a->get_const_col_idxs()); + auto values = const_cast(a->get_const_values()); + auto mat = cusparse::create_csr(a->get_size()[0], a->get_size()[1], + a->get_num_stored_elements(), + row_ptrs, col_idxs, values); + auto b_val = const_cast(b->get_const_values()); + auto c_val = c->get_values(); + auto vecb = + cusparse::create_dnvec(b->get_num_stored_elements(), b_val); + auto vecc = + cusparse::create_dnvec(c->get_num_stored_elements(), c_val); + size_type buffer_size = 0; + cusparse::spmv_buffersize( + exec->get_cusparse_handle(), trans, alpha->get_const_values(), + mat, vecb, beta->get_const_values(), vecc, alg, &buffer_size); + gko::Array buffer_array(exec, buffer_size); + auto buffer = buffer_array.get_data(); + cusparse::spmv( + exec->get_cusparse_handle(), trans, alpha->get_const_values(), + mat, vecb, beta->get_const_values(), vecc, alg, buffer); + cusparse::destroy(vecb); + cusparse::destroy(vecc); + cusparse::destroy(mat); +#endif } else { GKO_NOT_IMPLEMENTED; } @@ -433,35 +493,38 @@ void spgemm(std::shared_ptr exec, const matrix::Csr *b, matrix::Csr *c) { + auto a_nnz = IndexType(a->get_num_stored_elements()); + auto a_vals = a->get_const_values(); + auto a_row_ptrs = a->get_const_row_ptrs(); + auto a_col_idxs = a->get_const_col_idxs(); + auto b_vals = b->get_const_values(); + auto b_row_ptrs = b->get_const_row_ptrs(); + auto b_col_idxs = b->get_const_col_idxs(); + auto c_row_ptrs = c->get_row_ptrs(); + if (cusparse::is_supported::value) { auto handle = exec->get_cusparse_handle(); cusparse::pointer_mode_guard pm_guard(handle); - auto a_descr = cusparse::create_mat_descr(); - auto b_descr = cusparse::create_mat_descr(); - auto c_descr = cusparse::create_mat_descr(); - auto d_descr = cusparse::create_mat_descr(); - auto info = cusparse::create_spgemm_info(); auto alpha = one(); - auto a_nnz = IndexType(a->get_num_stored_elements()); - auto a_vals = a->get_const_values(); - auto a_row_ptrs = a->get_const_row_ptrs(); - auto a_col_idxs = a->get_const_col_idxs(); - auto b_nnz = IndexType(b->get_num_stored_elements()); - auto b_vals = b->get_const_values(); - auto b_row_ptrs = b->get_const_row_ptrs(); - auto b_col_idxs = b->get_const_col_idxs(); + auto a_nnz = static_cast(a->get_num_stored_elements()); + auto b_nnz = static_cast(b->get_num_stored_elements()); auto null_value = static_cast(nullptr); auto null_index = static_cast(nullptr); auto zero_nnz = IndexType{}; auto m = IndexType(a->get_size()[0]); auto n = IndexType(b->get_size()[1]); auto k = IndexType(a->get_size()[1]); - auto c_row_ptrs = c->get_row_ptrs(); matrix::CsrBuilder c_builder{c}; auto &c_col_idxs_array = c_builder.get_col_idx_array(); auto &c_vals_array = c_builder.get_value_array(); +#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) + auto a_descr = cusparse::create_mat_descr(); + auto b_descr = 
cusparse::create_mat_descr(); + auto c_descr = cusparse::create_mat_descr(); + auto d_descr = cusparse::create_mat_descr(); + auto info = cusparse::create_spgemm_info(); // allocate buffer size_type buffer_size{}; cusparse::spgemm_buffer_size( @@ -494,6 +557,57 @@ void spgemm(std::shared_ptr exec, cusparse::destroy(c_descr); cusparse::destroy(b_descr); cusparse::destroy(a_descr); + +#else // CUDA_VERSION >= 11000 + const auto beta = zero(); + auto spgemm_descr = cusparse::create_spgemm_descr(); + auto a_descr = cusparse::create_csr(m, k, a_nnz, + const_cast(a_row_ptrs), + const_cast(a_col_idxs), + const_cast(a_vals)); + auto b_descr = cusparse::create_csr(k, n, b_nnz, + const_cast(b_row_ptrs), + const_cast(b_col_idxs), + const_cast(b_vals)); + auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index, + null_index, null_value); + + // estimate work + size_type buffer1_size{}; + cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, + &beta, c_descr, spgemm_descr, + buffer1_size, nullptr); + Array buffer1{exec, buffer1_size}; + cusparse::spgemm_work_estimation(handle, &alpha, a_descr, b_descr, + &beta, c_descr, spgemm_descr, + buffer1_size, buffer1.get_data()); + + // compute spgemm + size_type buffer2_size{}; + cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, nullptr); + Array buffer2{exec, buffer2_size}; + cusparse::spgemm_compute(handle, &alpha, a_descr, b_descr, &beta, + c_descr, spgemm_descr, buffer1.get_data(), + buffer2_size, buffer2.get_data()); + + // copy data to result + auto c_nnz = cusparse::sparse_matrix_nnz(c_descr); + c_col_idxs_array.resize_and_reset(c_nnz); + c_vals_array.resize_and_reset(c_nnz); + cusparse::csr_set_pointers(c_descr, c_row_ptrs, + c_col_idxs_array.get_data(), + c_vals_array.get_data()); + + cusparse::spgemm_copy(handle, &alpha, a_descr, b_descr, &beta, c_descr, + spgemm_descr); + + cusparse::destroy(c_descr); + cusparse::destroy(b_descr); + cusparse::destroy(a_descr); + cusparse::destroy(spgemm_descr); +#endif // CUDA_VERSION >= 11000 } else { GKO_NOT_IMPLEMENTED; } @@ -502,6 +616,47 @@ void spgemm(std::shared_ptr exec, GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL); +namespace { + + +template +void spgeam(syn::value_list, + std::shared_ptr exec, const ValueType *alpha, + const IndexType *a_row_ptrs, const IndexType *a_col_idxs, + const ValueType *a_vals, const ValueType *beta, + const IndexType *b_row_ptrs, const IndexType *b_col_idxs, + const ValueType *b_vals, matrix::Csr *c) +{ + auto m = static_cast(c->get_size()[0]); + auto c_row_ptrs = c->get_row_ptrs(); + // count nnz for alpha * A + beta * B + auto subwarps_per_block = default_block_size / subwarp_size; + auto num_blocks = ceildiv(m, subwarps_per_block); + kernel::spgeam_nnz<<>>( + a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs); + + // build row pointers + components::prefix_sum(exec, c_row_ptrs, m + 1); + + // accumulate non-zeros for alpha * A + beta * B + matrix::CsrBuilder c_builder{c}; + auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m); + c_builder.get_col_idx_array().resize_and_reset(c_nnz); + c_builder.get_value_array().resize_and_reset(c_nnz); + auto c_col_idxs = c->get_col_idxs(); + auto c_vals = c->get_values(); + kernel::spgeam<<>>( + as_cuda_type(alpha), a_row_ptrs, a_col_idxs, as_cuda_type(a_vals), + as_cuda_type(beta), b_row_ptrs, b_col_idxs, as_cuda_type(b_vals), m, + c_row_ptrs, c_col_idxs, as_cuda_type(c_vals)); +} + 
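+// generates the select_spgeam() dispatcher, which invokes the spgeam
+// instantiation for the first compiled subwarp size accepted by the predicate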
@@ -502,6 +616,47 @@ void spgemm(std::shared_ptr<const CudaExecutor> exec,
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CSR_SPGEMM_KERNEL);
 
+
+namespace {
+
+
+template <int subwarp_size, typename ValueType, typename IndexType>
+void spgeam(syn::value_list<int, subwarp_size>,
+            std::shared_ptr<const CudaExecutor> exec, const ValueType *alpha,
+            const IndexType *a_row_ptrs, const IndexType *a_col_idxs,
+            const ValueType *a_vals, const ValueType *beta,
+            const IndexType *b_row_ptrs, const IndexType *b_col_idxs,
+            const ValueType *b_vals, matrix::Csr<ValueType, IndexType> *c)
+{
+    auto m = static_cast<IndexType>(c->get_size()[0]);
+    auto c_row_ptrs = c->get_row_ptrs();
+    // count nnz for alpha * A + beta * B
+    auto subwarps_per_block = default_block_size / subwarp_size;
+    auto num_blocks = ceildiv(m, subwarps_per_block);
+    kernel::spgeam_nnz<subwarp_size><<<num_blocks, default_block_size>>>(
+        a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs);
+
+    // build row pointers
+    components::prefix_sum(exec, c_row_ptrs, m + 1);
+
+    // accumulate non-zeros for alpha * A + beta * B
+    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
+    auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m);
+    c_builder.get_col_idx_array().resize_and_reset(c_nnz);
+    c_builder.get_value_array().resize_and_reset(c_nnz);
+    auto c_col_idxs = c->get_col_idxs();
+    auto c_vals = c->get_values();
+    kernel::spgeam<subwarp_size><<<num_blocks, default_block_size>>>(
+        as_cuda_type(alpha), a_row_ptrs, a_col_idxs, as_cuda_type(a_vals),
+        as_cuda_type(beta), b_row_ptrs, b_col_idxs, as_cuda_type(b_vals), m,
+        c_row_ptrs, c_col_idxs, as_cuda_type(c_vals));
+}
+
+GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam);
+
+
+}  // namespace
+
+
 template <typename ValueType, typename IndexType>
 void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
                      const matrix::Dense<ValueType> *alpha,
@@ -514,11 +669,6 @@ void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
     if (cusparse::is_supported<ValueType, IndexType>::value) {
         auto handle = exec->get_cusparse_handle();
         cusparse::pointer_mode_guard pm_guard(handle);
-        auto a_descr = cusparse::create_mat_descr();
-        auto b_descr = cusparse::create_mat_descr();
-        auto c_descr = cusparse::create_mat_descr();
-        auto d_descr = cusparse::create_mat_descr();
-        auto info = cusparse::create_spgemm_info();
 
         auto valpha = exec->copy_val_to_host(alpha->get_const_values());
         auto a_nnz = IndexType(a->get_num_stored_elements());
@@ -538,10 +688,16 @@ void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
         auto n = IndexType(b->get_size()[1]);
         auto k = IndexType(a->get_size()[1]);
         auto c_row_ptrs = c->get_row_ptrs();
+
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
         auto &c_col_idxs_array = c_builder.get_col_idx_array();
         auto &c_vals_array = c_builder.get_value_array();
-
+        auto a_descr = cusparse::create_mat_descr();
+        auto b_descr = cusparse::create_mat_descr();
+        auto c_descr = cusparse::create_mat_descr();
+        auto d_descr = cusparse::create_mat_descr();
+        auto info = cusparse::create_spgemm_info();
         // allocate buffer
         size_type buffer_size{};
         cusparse::spgemm_buffer_size(
@@ -574,6 +730,75 @@ void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
         cusparse::destroy(c_descr);
         cusparse::destroy(b_descr);
         cusparse::destroy(a_descr);
+#else  // CUDA_VERSION >= 11000
+        auto null_value = static_cast<ValueType *>(nullptr);
+        auto null_index = static_cast<IndexType *>(nullptr);
+        auto one_val = one<ValueType>();
+        auto zero_val = zero<ValueType>();
+        auto zero_nnz = IndexType{};
+        auto spgemm_descr = cusparse::create_spgemm_descr();
+        auto a_descr = cusparse::create_csr(
+            m, k, a_nnz, const_cast<IndexType *>(a_row_ptrs),
+            const_cast<IndexType *>(a_col_idxs),
+            const_cast<ValueType *>(a_vals));
+        auto b_descr = cusparse::create_csr(
+            k, n, b_nnz, const_cast<IndexType *>(b_row_ptrs),
+            const_cast<IndexType *>(b_col_idxs),
+            const_cast<ValueType *>(b_vals));
+        auto c_descr = cusparse::create_csr(m, n, zero_nnz, null_index,
+                                            null_index, null_value);
+
+        // estimate work
+        size_type buffer1_size{};
+        cusparse::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
+                                         &zero_val, c_descr, spgemm_descr,
+                                         buffer1_size, nullptr);
+        Array<char> buffer1{exec, buffer1_size};
+        cusparse::spgemm_work_estimation(handle, &one_val, a_descr, b_descr,
+                                         &zero_val, c_descr, spgemm_descr,
+                                         buffer1_size, buffer1.get_data());
+
+        // compute spgemm
+        size_type buffer2_size{};
+        cusparse::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                                 c_descr, spgemm_descr, buffer1.get_data(),
+                                 buffer2_size, nullptr);
+        Array<char> buffer2{exec, buffer2_size};
+        cusparse::spgemm_compute(handle, &one_val, a_descr, b_descr, &zero_val,
+                                 c_descr, spgemm_descr, buffer1.get_data(),
+                                 buffer2_size, buffer2.get_data());
+
+        // write result to temporary storage
+        auto c_tmp_nnz = cusparse::sparse_matrix_nnz(c_descr);
+        Array<IndexType> c_tmp_row_ptrs_array(exec, m + 1);
+        Array<IndexType> c_tmp_col_idxs_array(exec, c_tmp_nnz);
+        Array<ValueType> c_tmp_vals_array(exec, c_tmp_nnz);
+        cusparse::csr_set_pointers(c_descr, c_tmp_row_ptrs_array.get_data(),
+                                   c_tmp_col_idxs_array.get_data(),
+                                   c_tmp_vals_array.get_data());
+
+        cusparse::spgemm_copy(handle, &one_val, a_descr, b_descr, &zero_val,
+                              c_descr, spgemm_descr);
+
+        cusparse::destroy(c_descr);
+        cusparse::destroy(b_descr);
+        cusparse::destroy(a_descr);
+        cusparse::destroy(spgemm_descr);
+
+        auto spgeam_total_nnz = c_tmp_nnz + d->get_num_stored_elements();
+        auto nnz_per_row = spgeam_total_nnz / m;
+        select_spgeam(
+            spgeam_kernels(),
+            [&](int compiled_subwarp_size) {
+                return compiled_subwarp_size >= nnz_per_row ||
+                       compiled_subwarp_size == config::warp_size;
+            },
+            syn::value_list<int>(), syn::type_list<>(), exec,
+            alpha->get_const_values(), c_tmp_row_ptrs_array.get_const_data(),
+            c_tmp_col_idxs_array.get_const_data(),
+            c_tmp_vals_array.get_const_data(), beta->get_const_values(),
+            d_row_ptrs, d_col_idxs, d_vals, c);
+#endif  // CUDA_VERSION >= 11000
     } else {
         GKO_NOT_IMPLEMENTED;
     }
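Note: cusparseSpGEMM only evaluates C = alpha * A * B; unlike the old csrgemm2 it cannot fold a third matrix D into the product. The CUDA 11 branch of advanced_spgemm above therefore writes A * B into temporary row-pointer/column-index/value arrays and then computes alpha * (A * B) + beta * D with Ginkgo's own spgeam kernel, picking a subwarp size from the average number of nonzeros per row. An illustrative mirror of that selection predicate follows; the compiled size list {1, 2, 4, 8, 16, 32} and the helper name pick_subwarp_size are assumptions of this sketch.

    #include <array>

    // returns the first compiled subwarp size that covers the average row,
    // falling back to a full warp for very dense rows
    int pick_subwarp_size(long long avg_nnz_per_row, int warp_size = 32)
    {
        const std::array<int, 6> compiled{{1, 2, 4, 8, 16, 32}};
        for (auto size : compiled) {
            if (size >= avg_nnz_per_row || size == warp_size) {
                return size;
            }
        }
        return warp_size;
    }

select_spgeam performs the same first-match scan at compile time: the lambda accepts a candidate size, and the matching spgeam instantiation is launched.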
@@ -583,47 +808,6 @@ void advanced_spgemm(std::shared_ptr<const CudaExecutor> exec,
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
     GKO_DECLARE_CSR_ADVANCED_SPGEMM_KERNEL);
 
 
-namespace {
-
-
-template <int subwarp_size, typename ValueType, typename IndexType>
-void spgeam(syn::value_list<int, subwarp_size>,
-            std::shared_ptr<const CudaExecutor> exec, const ValueType *alpha,
-            const IndexType *a_row_ptrs, const IndexType *a_col_idxs,
-            const ValueType *a_vals, const ValueType *beta,
-            const IndexType *b_row_ptrs, const IndexType *b_col_idxs,
-            const ValueType *b_vals, matrix::Csr<ValueType, IndexType> *c)
-{
-    auto m = static_cast<IndexType>(c->get_size()[0]);
-    auto c_row_ptrs = c->get_row_ptrs();
-    // count nnz for alpha * A + beta * B
-    auto subwarps_per_block = default_block_size / subwarp_size;
-    auto num_blocks = ceildiv(m, subwarps_per_block);
-    kernel::spgeam_nnz<subwarp_size><<<num_blocks, default_block_size>>>(
-        a_row_ptrs, a_col_idxs, b_row_ptrs, b_col_idxs, m, c_row_ptrs);
-
-    // build row pointers
-    components::prefix_sum(exec, c_row_ptrs, m + 1);
-
-    // accumulate non-zeros for alpha * A + beta * B
-    matrix::CsrBuilder<ValueType, IndexType> c_builder{c};
-    auto c_nnz = exec->copy_val_to_host(c_row_ptrs + m);
-    c_builder.get_col_idx_array().resize_and_reset(c_nnz);
-    c_builder.get_value_array().resize_and_reset(c_nnz);
-    auto c_col_idxs = c->get_col_idxs();
-    auto c_vals = c->get_values();
-    kernel::spgeam<subwarp_size><<<num_blocks, default_block_size>>>(
-        as_cuda_type(alpha), a_row_ptrs, a_col_idxs, as_cuda_type(a_vals),
-        as_cuda_type(beta), b_row_ptrs, b_col_idxs, as_cuda_type(b_vals), m,
-        c_row_ptrs, c_col_idxs, as_cuda_type(c_vals));
-}
-
-GKO_ENABLE_IMPLEMENTATION_SELECTION(select_spgeam, spgeam);
-
-
-}  // namespace
-
-
 template <typename ValueType, typename IndexType>
 void spgeam(std::shared_ptr<const CudaExecutor> exec,
             const matrix::Dense<ValueType> *alpha,
@@ -860,6 +1044,7 @@ void transpose(std::shared_ptr<const CudaExecutor> exec,
               matrix::Csr<ValueType, IndexType> *trans)
 {
     if (cusparse::is_supported<ValueType, IndexType>::value) {
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
         cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
 
@@ -868,7 +1053,31 @@ void transpose(std::shared_ptr<const CudaExecutor> exec,
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
-            trans->get_col_idxs(), trans->get_row_ptrs(), copyValues, idxBase);
+            trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
+#else  // CUDA_VERSION >= 11000
+        cudaDataType_t cu_value =
+            gko::kernels::cuda::cuda_data_type<ValueType>();
+        cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
+        cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
+        cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1;
+        size_type buffer_size = 0;
+        cusparse::transpose_buffersize(
+            exec->get_cusparse_handle(), orig->get_size()[0],
+            orig->get_size()[1], orig->get_num_stored_elements(),
+            orig->get_const_values(), orig->get_const_row_ptrs(),
+            orig->get_const_col_idxs(), trans->get_values(),
+            trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
+            idxBase, alg, &buffer_size);
+        Array<char> buffer_array(exec, buffer_size);
+        auto buffer = buffer_array.get_data();
+        cusparse::transpose(
+            exec->get_cusparse_handle(), orig->get_size()[0],
+            orig->get_size()[1], orig->get_num_stored_elements(),
+            orig->get_const_values(), orig->get_const_row_ptrs(),
+            orig->get_const_col_idxs(), trans->get_values(),
+            trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
+            idxBase, alg, buffer);
+#endif
     } else {
         GKO_NOT_IMPLEMENTED;
     }
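Note: two things change in the transpose hunk above (and again in conj_transpose below). First, the call now passes trans->get_row_ptrs() before trans->get_col_idxs(), the order the bindings expect; the new is_sorted_by_column_index() assertions in the test files guard this fix. Second, CUDA 11 drops cusparse{S,D,C,Z}csr2csc in favor of cusparseCsr2cscEx2, which requires an explicit workspace query. A sketch of the raw Ex2 pair, assuming double precision, 32-bit indices and the illustrative helper name csr_transpose, with error checks omitted; since the CSC form of A is the CSR form of A^T, cscColPtr receives the transposed row pointers and cscRowInd the transposed column indices.

    #include <cuda_runtime.h>
    #include <cusparse.h>

    void csr_transpose(cusparseHandle_t handle, int m, int n, int nnz,
                       const double* a_vals, const int* a_row_ptrs,
                       const int* a_col_idxs, double* t_vals,
                       int* t_row_ptrs, int* t_col_idxs)
    {
        size_t buffer_size{};
        cusparseCsr2cscEx2_bufferSize(
            handle, m, n, nnz, a_vals, a_row_ptrs, a_col_idxs, t_vals,
            t_row_ptrs, t_col_idxs, CUDA_R_64F, CUSPARSE_ACTION_NUMERIC,
            CUSPARSE_INDEX_BASE_ZERO, CUSPARSE_CSR2CSC_ALG1, &buffer_size);
        void* buffer{};
        cudaMalloc(&buffer, buffer_size);
        cusparseCsr2cscEx2(handle, m, n, nnz, a_vals, a_row_ptrs, a_col_idxs,
                           t_vals, t_row_ptrs, t_col_idxs, CUDA_R_64F,
                           CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO,
                           CUSPARSE_CSR2CSC_ALG1, buffer);
        cudaFree(buffer);
    }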
@@ -887,6 +1096,7 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
     const dim3 grid_size(
         ceildiv(trans->get_num_stored_elements(), block_size.x), 1, 1);
 
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
     cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
     cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
 
@@ -895,7 +1105,31 @@ void conj_transpose(std::shared_ptr<const CudaExecutor> exec,
         orig->get_size()[1], orig->get_num_stored_elements(),
         orig->get_const_values(), orig->get_const_row_ptrs(),
         orig->get_const_col_idxs(), trans->get_values(),
-        trans->get_col_idxs(), trans->get_row_ptrs(), copyValues, idxBase);
+        trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
+#else  // CUDA_VERSION >= 11000
+    cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();
+    cusparseAction_t copyValues = CUSPARSE_ACTION_NUMERIC;
+    cusparseIndexBase_t idxBase = CUSPARSE_INDEX_BASE_ZERO;
+    cusparseCsr2CscAlg_t alg = CUSPARSE_CSR2CSC_ALG1;
+    size_type buffer_size = 0;
+    cusparse::transpose_buffersize(
+        exec->get_cusparse_handle(), orig->get_size()[0],
+        orig->get_size()[1], orig->get_num_stored_elements(),
+        orig->get_const_values(), orig->get_const_row_ptrs(),
+        orig->get_const_col_idxs(), trans->get_values(),
+        trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
+        idxBase, alg, &buffer_size);
+    Array<char> buffer_array(exec, buffer_size);
+    auto buffer = buffer_array.get_data();
+    cusparse::transpose(
+        exec->get_cusparse_handle(), orig->get_size()[0],
+        orig->get_size()[1], orig->get_num_stored_elements(),
+        orig->get_const_values(), orig->get_const_row_ptrs(),
+        orig->get_const_col_idxs(), trans->get_values(),
+        trans->get_row_ptrs(), trans->get_col_idxs(), cu_value, copyValues,
+        idxBase, alg, buffer);
+#endif
 
     conjugate_kernel<<<grid_size, block_size, 0, 0>>>(
         trans->get_num_stored_elements(),
@@ -1081,7 +1315,14 @@ void sort_by_column_index(std::shared_ptr<const CudaExecutor> exec,
                           permutation, buffer);
 
         // sort values
+#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000)
         cusparse::gather(handle, nnz, tmp_vals, vals, permutation);
+#else  // CUDA_VERSION >= 11000
+        auto val_vec = cusparse::create_spvec(nnz, nnz, permutation, vals);
+        auto tmp_vec =
+            cusparse::create_dnvec(nnz, const_cast<ValueType *>(tmp_vals));
+        cusparse::gather(handle, tmp_vec, val_vec);
+#endif
 
         cusparse::destroy(descr);
     } else {
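Note: cusparseXgthr is likewise gone in CUDA 11; the generic replacement models the gather as a sparse vector (indices = permutation, values = the sort destination) reading from a dense vector, i.e. vals[i] = tmp_vals[permutation[i]], the same semantics as the legacy call in the #if branch. A sketch against the raw API, assuming double precision and 32-bit indices; the helper name gather_permuted_values is illustrative and error checks are omitted.

    #include <cstdint>
    #include <cusparse.h>

    void gather_permuted_values(cusparseHandle_t handle, int64_t nnz,
                                const double* tmp_vals,  // dense source
                                double* vals,            // gathered output
                                const int* permutation)  // device indices
    {
        cusparseSpVecDescr_t x;  // sparse destination: indices + values
        cusparseDnVecDescr_t y;  // dense source
        cusparseCreateSpVec(&x, nnz, nnz, const_cast<int*>(permutation), vals,
                            CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO,
                            CUDA_R_64F);
        cusparseCreateDnVec(&y, nnz, const_cast<double*>(tmp_vals),
                            CUDA_R_64F);
        // performs x.values[i] = y[x.indices[i]] for all i
        cusparseGather(handle, y, x);
        cusparseDestroyDnVec(y);
        cusparseDestroySpVec(x);
    }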
diff --git a/cuda/test/matrix/csr_kernels.cpp b/cuda/test/matrix/csr_kernels.cpp
index 39608505cbd..898bd4ca106 100644
--- a/cuda/test/matrix/csr_kernels.cpp
+++ b/cuda/test/matrix/csr_kernels.cpp
@@ -416,6 +416,7 @@ TEST_F(Csr, TransposeIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(d_trans.get()),
                         static_cast<Mtx *>(trans.get()), 0.0);
+    ASSERT_TRUE(static_cast<Mtx *>(d_trans.get())->is_sorted_by_column_index());
 }
 
 
@@ -428,6 +429,8 @@ TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(d_trans.get()),
                         static_cast<Mtx *>(trans.get()), 0.0);
+    ASSERT_TRUE(
+        static_cast<Mtx *>(d_trans.get())->is_sorted_by_column_index());
 }
 
 
diff --git a/cuda/test/solver/lower_trs_kernels.cpp b/cuda/test/solver/lower_trs_kernels.cpp
index a2cac176e8c..b677b9eb10d 100644
--- a/cuda/test/solver/lower_trs_kernels.cpp
+++ b/cuda/test/solver/lower_trs_kernels.cpp
@@ -131,7 +131,7 @@ TEST_F(LowerTrs, CudaLowerTrsFlagCheckIsCorrect)
     bool expected_flag = false;
 
 #if (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
     expected_flag = true;
-#endif
+#endif  // (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
 
     gko::kernels::cuda::lower_trs::should_perform_transpose(cuda, trans_flag);
     ASSERT_EQ(expected_flag, trans_flag);
diff --git a/cuda/test/solver/upper_trs_kernels.cpp b/cuda/test/solver/upper_trs_kernels.cpp
index 92a76b1e47b..9da9d33930c 100644
--- a/cuda/test/solver/upper_trs_kernels.cpp
+++ b/cuda/test/solver/upper_trs_kernels.cpp
@@ -132,7 +132,7 @@ TEST_F(UpperTrs, CudaUpperTrsFlagCheckIsCorrect)
 
 #if (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
     expected_flag = true;
-#endif
+#endif  // (defined(CUDA_VERSION) && (CUDA_VERSION < 9020))
 
     gko::kernels::cuda::upper_trs::should_perform_transpose(cuda, trans_flag);
     ASSERT_EQ(expected_flag, trans_flag);
diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp
index 9592bc20b8d..4df120cde9f 100644
--- a/hip/base/executor.hip.cpp
+++ b/hip/base/executor.hip.cpp
@@ -93,7 +93,7 @@ void HipExecutor::raw_free(void *ptr) const noexcept
                   << " in " << __func__ << ": " << hipGetErrorName(error_code)
                   << ": " << hipGetErrorString(error_code) << std::endl
                   << "Exiting program" << std::endl;
-#endif
+#endif  // GKO_VERBOSE_LEVEL >= 1
         std::exit(error_code);
     }
 }
diff --git a/hip/base/hipsparse_bindings.hip.hpp b/hip/base/hipsparse_bindings.hip.hpp
index 3b7c8a978a4..95be1bfe129 100644
--- a/hip/base/hipsparse_bindings.hip.hpp
+++ b/hip/base/hipsparse_bindings.hip.hpp
@@ -372,8 +372,8 @@ GKO_BIND_HIPSPARSE64_CSR2HYB(ValueType, detail::not_implemented);
     {                                                                        \
         GKO_ASSERT_NO_HIPSPARSE_ERRORS(HipsparseName(                       \
             handle, m, n, nnz, as_hiplibs_type(OrigValA), OrigRowPtrA,      \
-            OrigColIndA, as_hiplibs_type(TransValA), TransRowPtrA,          \
-            TransColIndA, copyValues, idxBase));                            \
+            OrigColIndA, as_hiplibs_type(TransValA), TransColIndA,          \
+            TransRowPtrA, copyValues, idxBase));                            \
     }                                                                        \
     static_assert(true,                                                      \
                   "This assert is used to counter the false positive extra " \
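Note: the hipsparse binding above fixes the same transpose bug on the HIP side. hipsparse's csr2csc writes its outputs in the order (cscVal, cscRowInd, cscColPtr), so the macro now forwards TransColIndA before TransRowPtrA; callers such as hip/matrix/csr_kernels.hip.cpp below can then uniformly pass the transposed matrix's row pointers followed by its column indices. A direct-call sketch for double precision (the helper name csr_transpose_hip is an assumption, error checks omitted):

    #include <hipsparse.h>

    void csr_transpose_hip(hipsparseHandle_t handle, int m, int n, int nnz,
                           const double* a_vals, const int* a_row_ptrs,
                           const int* a_col_idxs, double* t_vals,
                           int* t_row_ptrs, int* t_col_idxs)
    {
        // the CSC output of A is the CSR representation of A^T
        hipsparseDcsr2csc(handle, m, n, nnz, a_vals, a_row_ptrs, a_col_idxs,
                          t_vals,
                          t_col_idxs,  // cscRowInd: column indices of A^T
                          t_row_ptrs,  // cscColPtr: row pointers of A^T
                          HIPSPARSE_ACTION_NUMERIC,
                          HIPSPARSE_INDEX_BASE_ZERO);
    }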
diff --git a/hip/components/cooperative_groups.hip.hpp b/hip/components/cooperative_groups.hip.hpp
index 704513391ee..5f818dca9be 100644
--- a/hip/components/cooperative_groups.hip.hpp
+++ b/hip/components/cooperative_groups.hip.hpp
@@ -199,7 +199,7 @@ class thread_block_tile {
     {
 #if GINKGO_HIP_PLATFORM_NVCC
         __syncwarp(data_.mask);
-#endif
+#endif  // GINKGO_HIP_PLATFORM_NVCC
     }
 
 #if GINKGO_HIP_PLATFORM_HCC
@@ -375,7 +375,7 @@ class enable_extended_shuffle : public Group {
 
 // Implementing this as a using directive messes up with SFINAE for some reason,
 // probably a bug in NVCC. If it is a complete type, everything works fine.
-template
+template
 struct thread_block_tile : detail::enable_extended_shuffle> {
     using detail::enable_extended_shuffle<
@@ -398,12 +398,12 @@ __device__ __forceinline__
 namespace detail {
 
-template
+template
 struct is_group_impl> : std::true_type {};
-template
+template
 struct is_synchronizable_group_impl> : std::true_type {
 };
-template
+template
 struct is_communicator_group_impl> : std::true_type {};
 
@@ -485,7 +485,7 @@ class grid_group {
           (threadIdx.y +
            blockDim.y * (threadIdx.z +
                          blockDim.z * (blockIdx.x +
                                        gridDim.x *
-                                           (blockIdx.y + gridDim.y * blockIdx.z))))}
+                                       (blockIdx.y + gridDim.y * blockIdx.z))))}
     {}
     // clang-format on
diff --git a/hip/components/format_conversion.hip.hpp b/hip/components/format_conversion.hip.hpp
index bdea7613b63..f97e70ef438 100644
--- a/hip/components/format_conversion.hip.hpp
+++ b/hip/components/format_conversion.hip.hpp
@@ -104,7 +104,11 @@ __host__ size_type calculate_nwarps(std::shared_ptr<const HipExecutor> exec,
                subwarp_size;
 #if GINKGO_HIP_PLATFORM_NVCC
     size_type multiple = 8;
-    if (nnz >= 2e6) {
+    if (nnz >= 2e8) {
+        multiple = 2048;
+    } else if (nnz >= 2e7) {
+        multiple = 512;
+    } else if (nnz >= 2e6) {
         multiple = 128;
     } else if (nnz >= 2e5) {
         multiple = 32;
diff --git a/hip/matrix/csr_kernels.hip.cpp b/hip/matrix/csr_kernels.hip.cpp
index 82aee4891ec..1303a4e08da 100644
--- a/hip/matrix/csr_kernels.hip.cpp
+++ b/hip/matrix/csr_kernels.hip.cpp
@@ -926,7 +926,7 @@ void transpose(std::shared_ptr<const HipExecutor> exec,
             orig->get_size()[1], orig->get_num_stored_elements(),
             orig->get_const_values(), orig->get_const_row_ptrs(),
             orig->get_const_col_idxs(), trans->get_values(),
-            trans->get_col_idxs(), trans->get_row_ptrs(), copyValues, idxBase);
+            trans->get_row_ptrs(), trans->get_col_idxs(), copyValues, idxBase);
     } else {
         GKO_NOT_IMPLEMENTED;
     }
diff --git a/hip/test/matrix/csr_kernels.hip.cpp b/hip/test/matrix/csr_kernels.hip.cpp
index a1b2adfd794..677e886c36c 100644
--- a/hip/test/matrix/csr_kernels.hip.cpp
+++ b/hip/test/matrix/csr_kernels.hip.cpp
@@ -401,6 +401,7 @@ TEST_F(Csr, TransposeIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(d_trans.get()),
                         static_cast<Mtx *>(trans.get()), 0.0);
+    ASSERT_TRUE(static_cast<Mtx *>(d_trans.get())->is_sorted_by_column_index());
 }
 
 
@@ -482,18 +483,6 @@ TEST_F(Csr, MoveToSparsityCsrIsEquivalentToRef)
 }
 
 
-TEST_F(Csr, ConvertsEmptyToSellp)
-{
-    auto dempty_mtx = Mtx::create(hip);
-    auto dsellp_mtx = gko::matrix::Sellp<>::create(hip);
-
-    dempty_mtx->convert_to(dsellp_mtx.get());
-
-    ASSERT_EQ(hip->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0);
-    ASSERT_FALSE(dsellp_mtx->get_size());
-}
-
-
 TEST_F(Csr, CalculateMaxNnzPerRowIsEquivalentToRef)
 {
     set_up_apply_data(std::make_shared());
@@ -561,6 +550,18 @@ TEST_F(Csr, MoveToSellpIsEquivalentToRef)
 }
 
 
+TEST_F(Csr, ConvertsEmptyToSellp)
+{
+    auto dempty_mtx = Mtx::create(hip);
+    auto dsellp_mtx = gko::matrix::Sellp<>::create(hip);
+
+    dempty_mtx->convert_to(dsellp_mtx.get());
+
+    ASSERT_EQ(hip->copy_val_to_host(dsellp_mtx->get_const_slice_sets()), 0);
+    ASSERT_FALSE(dsellp_mtx->get_size());
+}
+
+
 TEST_F(Csr, CalculateTotalColsIsEquivalentToRef)
 {
     set_up_apply_data(std::make_shared());
@@ -625,7 +626,7 @@ TEST_F(Csr, MoveToHybridIsEquivalentToRef)
 
 TEST_F(Csr, RecognizeSortedMatrixIsEquivalentToRef)
 {
-    set_up_apply_data(std::make_shared());
+    set_up_apply_data(std::make_shared(hip));
 
     bool is_sorted_hip{};
     bool is_sorted_ref{};
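Note: the format_conversion.hip.hpp hunk above and the csr.hpp hunk below extend the same oversubscription ladder: the multiple factor scales how many warps the load-balanced kernels launch, and the new 2e7 and 2e8 steps keep very large matrices from undersubscribing the GPU. A standalone mirror of the ladder (the function name is illustrative):

    #include <cstdint>

    int oversubscription_multiple(std::int64_t nnz)
    {
        if (nnz >= 200000000) return 2048;  // nnz >= 2e8
        if (nnz >= 20000000) return 512;    // nnz >= 2e7
        if (nnz >= 2000000) return 128;     // nnz >= 2e6
        if (nnz >= 200000) return 32;       // nnz >= 2e5
        return 8;
    }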
diff --git a/hip/test/preconditioner/jacobi_kernels.cpp b/hip/test/preconditioner/jacobi_kernels.cpp
index f1863a6b42f..868e10fbbad 100644
--- a/hip/test/preconditioner/jacobi_kernels.cpp
+++ b/hip/test/preconditioner/jacobi_kernels.cpp
@@ -315,7 +315,7 @@ TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithBlockSize64)
     GKO_ASSERT_MTX_NEAR(gko::as<Bj>(d_bj.get()), gko::as<Bj>(bj.get()), 1e-13);
 }
 
-#endif
+#endif  // GINKGO_HIP_PLATFORM_HCC
 
 
 TEST_F(Jacobi, HipPreconditionerEquivalentToRefWithDifferentBlockSize)
@@ -395,7 +395,7 @@ TEST_F(Jacobi, HipApplyEquivalentToRefWithBlockSize64)
     GKO_ASSERT_MTX_NEAR(d_x, x, 1e-12);
 }
 
-#endif
+#endif  // GINKGO_HIP_PLATFORM_HCC
 
 
 TEST_F(Jacobi, HipApplyEquivalentToRefWithDifferentBlockSize)
diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp
index 6c2ab2a50d2..10ef4d5d4cf 100644
--- a/include/ginkgo/core/base/types.hpp
+++ b/include/ginkgo/core/base/types.hpp
@@ -45,7 +45,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifdef __HIPCC__
 #include <hip/hip_runtime.h>
-#endif
+#endif  // __HIPCC__
 
 
 // Macros for handling different compilers / architectures uniformly
diff --git a/include/ginkgo/core/matrix/csr.hpp b/include/ginkgo/core/matrix/csr.hpp
index d463d1004ec..759a602db5f 100644
--- a/include/ginkgo/core/matrix/csr.hpp
+++ b/include/ginkgo/core/matrix/csr.hpp
@@ -431,7 +431,11 @@ class Csr : public EnableLinOp<Csr<ValueType, IndexType>>,
         {
             if (warp_size_ > 0) {
                 int multiple = 8;
-                if (nnz >= 2e6) {
+                if (nnz >= 2e8) {
+                    multiple = 2048;
+                } else if (nnz >= 2e7) {
+                    multiple = 512;
+                } else if (nnz >= 2e6) {
                     multiple = 128;
                 } else if (nnz >= 2e5) {
                     multiple = 32;
diff --git a/omp/test/matrix/csr_kernels.cpp b/omp/test/matrix/csr_kernels.cpp
index bb607efd615..9eafec2efad 100644
--- a/omp/test/matrix/csr_kernels.cpp
+++ b/omp/test/matrix/csr_kernels.cpp
@@ -299,6 +299,7 @@ TEST_F(Csr, TransposeIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(d_trans.get()),
                         static_cast<Mtx *>(trans.get()), 0.0);
+    ASSERT_TRUE(static_cast<Mtx *>(d_trans.get())->is_sorted_by_column_index());
 }
 
 
@@ -311,6 +312,8 @@ TEST_F(Csr, ConjugateTransposeIsEquivalentToRef)
 
     GKO_ASSERT_MTX_NEAR(static_cast<Mtx *>(d_trans.get()),
                         static_cast<Mtx *>(trans.get()), 0.0);
+    ASSERT_TRUE(
+        static_cast<Mtx *>(d_trans.get())->is_sorted_by_column_index());
 }
 
 
diff --git a/reference/test/stop/combined.cpp b/reference/test/stop/combined.cpp
index 08939c64392..00eed7107bd 100644
--- a/reference/test/stop/combined.cpp
+++ b/reference/test/stop/combined.cpp
@@ -37,7 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <thread>
 #if defined(_WIN32) || defined(__CYGWIN__)
 #include <windows.h>
-#endif
+#endif  // defined(_WIN32) || defined(__CYGWIN__)
 
 #include <gtest/gtest.h>
diff --git a/reference/test/stop/time.cpp b/reference/test/stop/time.cpp
index 258db5d2854..70da8138ae7 100644
--- a/reference/test/stop/time.cpp
+++ b/reference/test/stop/time.cpp
@@ -37,7 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <thread>
 #if defined(_WIN32) || defined(__CYGWIN__)
 #include <windows.h>
-#endif
+#endif  // defined(_WIN32) || defined(__CYGWIN__)
 
 #include <gtest/gtest.h>