Add cuda-memcheck CI job #592

Merged: 5 commits, Jul 16, 2020
Changes from all commits
19 changes: 19 additions & 0 deletions .gitlab-ci.yml
@@ -839,6 +839,25 @@ undefinedsanitizer:
     - cuda
     - gpu
 
+cudamemcheck:
+  stage: QoS_tools
+  image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019
+  before_script: *default_before_script
+  script:
+    - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=RelWithDebInfo
+      -DCTEST_MEMORYCHECK_TYPE=CudaMemcheck
+  dependencies: []
+  only:
+    refs:
+      - master
+      - develop
+      - tags
+    variables:
+      - $RUN_CI_TAG
+  tags:
+    - private_ci
+    - cuda
+    - gpu
 
 # Benchmark build
 .benchmark_before_script_template: &default_benchmark_before_script
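The new job mirrors the existing sanitizer jobs: it runs the CTest script in RelWithDebInfo mode with cuda-memcheck as the memory checker, only on master, develop, and tag refs when $RUN_CI_TAG is set, and only on runners with GPU access. For context, cuda-memcheck instruments device code at runtime and catches defects an ordinary test run can pass through silently. A minimal illustration of the defect class it reports (hypothetical kernel and sizes, not code from this PR):

// Illustration only: 128 threads write into a 100-element allocation. A plain
// run often "passes"; under cuda-memcheck each stray thread is reported as an
// "Invalid __global__ write of size 4", with source lines when the code was
// compiled with -lineinfo (which the CTest script changes below add).
#include <cstdio>

__global__ void oob_write(int *data, int n)
{
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    data[i] = i;  // bug: missing `if (i < n)` bounds guard
}

int main()
{
    int *data;
    cudaMalloc(&data, 100 * sizeof(int));
    oob_write<<<2, 64>>>(data, 100);  // grid of 2 x 64 = 128 threads
    cudaDeviceSynchronize();
    cudaFree(data);
    return 0;
}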
19 changes: 15 additions & 4 deletions cmake/CTestScript.cmake
@@ -119,11 +119,16 @@ if(CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind")
     set(CTEST_MEMORYCHECK_SUPPRESSIONS_FILE "${CTEST_SOURCE_DIRECTORY}/dev_tools/valgrind/suppressions.supp")
 endif()
 
+if(CTEST_MEMORYCHECK_TYPE STREQUAL "CudaMemcheck")
+    find_program(CTEST_MEMORYCHECK_COMMAND cuda-memcheck)
+    set(CTEST_BUILD_NAME "CudaMemcheck")
+endif()
+
 if(CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
     find_program(CTEST_COVERAGE_COMMAND gcov)
 endif()
 
-if(NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind")
+if(NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind" AND NOT CTEST_MEMORYCHECK_TYPE STREQUAL "CudaMemcheck")
     set(CTEST_MEMORYCHECK_SANITIZER_OPTIONS "${CTEST_MEMORYCHECK_SANITIZER_OPTIONS}:allocator_may_return_null=1:verbosity=1")
 endif()
 
@@ -146,7 +151,10 @@ endif()
 ctest_start("${CTEST_TEST_MODEL}")
 ctest_submit(PARTS Start)
 
-if((NOT CTEST_MEMORYCHECK_TYPE STREQUAL "NONE" AND NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind") OR CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
+if (CTEST_MEMORYCHECK_TYPE STREQUAL "CudaMemcheck")
+    # generate line number information for CUDA
+    set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=OFF;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION};-DCMAKE_CUDA_FLAGS=-lineinfo")
+elseif((NOT CTEST_MEMORYCHECK_TYPE STREQUAL "NONE" AND NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind") OR CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
     set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=OFF;-DGINKGO_BUILD_HIP=OFF;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}")
 else()
     set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}")
@@ -180,7 +188,10 @@ if (CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
 endif()
 
 if(NOT CTEST_MEMORYCHECK_TYPE STREQUAL "NONE")
-    ctest_memcheck(BUILD "${CTEST_BINARY_DIRECTORY}" APPEND)
+    if(CTEST_MEMORYCHECK_TYPE STREQUAL "CudaMemcheck")
+        ctest_memcheck(BUILD "${CTEST_BINARY_DIRECTORY}" INCLUDE "^(cuda|hip).*" APPEND)
+    else()
+        ctest_memcheck(BUILD "${CTEST_BINARY_DIRECTORY}" APPEND)
+    endif()
     ctest_submit(PARTS MemCheck)
 endif()
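Two design choices here are worth spelling out. The INCLUDE "^(cuda|hip).*" filter restricts the memcheck stage to tests whose names begin with cuda or hip, since cuda-memcheck only observes device-side execution; re-running the reference and OpenMP tests under it would cost time without adding coverage. And -DCMAKE_CUDA_FLAGS=-lineinfo embeds source-line tables in the device code, so reports can point at source lines without the performance penalty of a full device debug build (-G).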

4 changes: 2 additions & 2 deletions common/matrix/sellp_kernels.hpp.inc
@@ -95,7 +95,7 @@ namespace kernel {
 
 
 template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void initialize_zero_dense(
+__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense(
     size_type num_rows, size_type num_cols, size_type stride,
     ValueType *__restrict__ result)
 {
@@ -196,4 +196,4 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(
 }
 
 
-}  // namespace kernel
\ No newline at end of file
+}  // namespace kernel
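Why the one-word change above matters: callers launch initialize_zero_dense with a (config::warp_size, config::max_block_size / config::warp_size) block, i.e. config::max_block_size threads in total, while the old annotation promised at most default_block_size. __launch_bounds__ is a contract with the compiler; launching more threads per block than it states makes the launch fail, which cuda-memcheck surfaces even if the host code never checks the error. A minimal reproduction with hypothetical sizes:

#include <cstdio>

// the kernel promises the compiler at most 256 threads per block
__global__ __launch_bounds__(256) void bounded_kernel() {}

int main()
{
    // launching 512 threads breaks the promise: the kernel never runs and
    // the launch error is visible via cudaGetLastError()
    bounded_kernel<<<1, 512>>>();
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}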
2 changes: 1 addition & 1 deletion cuda/matrix/coo_kernels.cu
@@ -229,7 +229,7 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
 
     const dim3 block_size(config::warp_size,
                           config::max_block_size / config::warp_size, 1);
-    const dim3 init_grid_dim(ceildiv(stride, block_size.x),
+    const dim3 init_grid_dim(ceildiv(num_cols, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
     kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
         num_rows, num_cols, stride, as_cuda_type(result->get_values()));
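This grid-sizing fix recurs in the csr, ell, sellp, and hip convert_to_dense paths below: the grid's x dimension is now derived from the logical column count instead of the padded row stride. Since stride >= num_cols, the old sizing launched whole thread blocks that covered only padding and, assuming the kernel masks out-of-range threads as sketched here, did no useful work:

// Hedged sketch of the zero-initialization kernel's shape (an assumption for
// illustration; the real kernel lives in common/matrix/*_kernels.hpp.inc).
template <typename ValueType>
__global__ void initialize_zero_dense_sketch(int num_rows, int num_cols,
                                             int stride,
                                             ValueType *__restrict__ result)
{
    const auto col = threadIdx.x + blockDim.x * blockIdx.x;
    const auto row = threadIdx.y + blockDim.y * blockIdx.y;
    // threads beyond the logical matrix bounds return immediately, so a grid
    // sized by stride rather than num_cols only launches wasted blocks
    if (row < num_rows && col < num_cols) {
        result[row * stride + col] = ValueType{};
    }
}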
38 changes: 25 additions & 13 deletions cuda/matrix/csr_kernels.cu
@@ -694,7 +694,7 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
 
     const dim3 block_size(config::warp_size,
                           config::max_block_size / config::warp_size, 1);
-    const dim3 init_grid_dim(ceildiv(stride, block_size.x),
+    const dim3 init_grid_dim(ceildiv(num_cols, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
     kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
         num_rows, num_cols, stride, as_cuda_type(result->get_values()));
@@ -737,25 +737,31 @@ void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
     auto nnz_per_row = Array<size_type>(exec, num_rows);
     auto grid_dim = ceildiv(num_rows, default_block_size);
 
-    kernel::calculate_nnz_per_row<<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(source_row_ptrs),
-        as_cuda_type(nnz_per_row.get_data()));
+    if (grid_dim > 0) {
+        kernel::calculate_nnz_per_row<<<grid_dim, default_block_size>>>(
+            num_rows, as_cuda_type(source_row_ptrs),
+            as_cuda_type(nnz_per_row.get_data()));
+    }
 
     grid_dim = slice_num;
 
-    kernel::calculate_slice_lengths<<<grid_dim, config::warp_size>>>(
-        num_rows, slice_size, stride_factor,
-        as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(slice_lengths),
-        as_cuda_type(slice_sets));
+    if (grid_dim > 0) {
+        kernel::calculate_slice_lengths<<<grid_dim, config::warp_size>>>(
+            num_rows, slice_size, stride_factor,
+            as_cuda_type(nnz_per_row.get_const_data()),
+            as_cuda_type(slice_lengths), as_cuda_type(slice_sets));
+    }
 
     components::prefix_sum(exec, slice_sets, slice_num + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
-    kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
-        num_rows, slice_size, as_cuda_type(source_values),
-        as_cuda_type(source_row_ptrs), as_cuda_type(source_col_idxs),
-        as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
-        as_cuda_type(result_col_idxs), as_cuda_type(result_values));
+    if (grid_dim > 0) {
+        kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
+            num_rows, slice_size, as_cuda_type(source_values),
+            as_cuda_type(source_row_ptrs), as_cuda_type(source_col_idxs),
+            as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
+            as_cuda_type(result_col_idxs), as_cuda_type(result_values));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -805,6 +811,12 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
                           size_type slice_size)
 {
     const auto num_rows = source->get_size()[0];
+
+    if (num_rows == 0) {
+        *result = 0;
+        return;
+    }
+
     const auto slice_num = ceildiv(num_rows, slice_size);
     const auto row_ptrs = source->get_const_row_ptrs();
 
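All the if (grid_dim > 0) guards in this file, and the matching ones in dense_kernels.cu and sellp_kernels.cu below, address one failure mode: for an empty matrix, ceildiv produces a grid dimension of zero, and a <<<0, n>>> launch is an invalid configuration that cuda-memcheck reports. A minimal sketch of the pattern (hypothetical kernel and block size):

#include <cstdio>

__global__ void noop() {}

int main()
{
    const int num_rows = 0;                       // empty matrix
    const int grid_dim = (num_rows + 255) / 256;  // ceildiv(num_rows, 256) == 0
    if (grid_dim > 0) {  // guarded: the launch is skipped entirely
        noop<<<grid_dim, 256>>>();
    }
    noop<<<grid_dim, 256>>>();  // unguarded: invalid configuration argument
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}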
37 changes: 25 additions & 12 deletions cuda/matrix/dense_kernels.cu
@@ -378,18 +378,23 @@ void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
 
     auto grid_dim = slice_num;
 
-    kernel::calculate_slice_lengths<<<grid_dim, config::warp_size>>>(
-        num_rows, slice_size, slice_num, stride_factor,
-        as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(slice_lengths),
-        as_cuda_type(slice_sets));
+    if (grid_dim > 0) {
+        kernel::calculate_slice_lengths<<<grid_dim, config::warp_size>>>(
+            num_rows, slice_size, slice_num, stride_factor,
+            as_cuda_type(nnz_per_row.get_const_data()),
+            as_cuda_type(slice_lengths), as_cuda_type(slice_sets));
+    }
 
     components::prefix_sum(exec, slice_sets, slice_num + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
-    kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
-        num_rows, num_cols, slice_size, stride,
-        as_cuda_type(source->get_const_values()), as_cuda_type(slice_lengths),
-        as_cuda_type(slice_sets), as_cuda_type(col_idxs), as_cuda_type(vals));
+    if (grid_dim > 0) {
+        kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
+            num_rows, num_cols, slice_size, stride,
+            as_cuda_type(source->get_const_values()),
+            as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
+            as_cuda_type(col_idxs), as_cuda_type(vals));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -465,10 +470,12 @@ void calculate_nonzeros_per_row(std::shared_ptr<const CudaExecutor> exec,
     auto rows_per_block = ceildiv(default_block_size, config::warp_size);
     const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block);
     const dim3 grid_size(grid_x, 1, 1);
-    kernel::count_nnz_per_row<<<grid_size, block_size>>>(
-        source->get_size()[0], source->get_size()[1], source->get_stride(),
-        as_cuda_type(source->get_const_values()),
-        as_cuda_type(result->get_data()));
+    if (grid_x > 0) {
+        kernel::count_nnz_per_row<<<grid_size, block_size>>>(
+            source->get_size()[0], source->get_size()[1], source->get_stride(),
+            as_cuda_type(source->get_const_values()),
+            as_cuda_type(result->get_data()));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -482,6 +489,12 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
                           size_type slice_size)
 {
     const auto num_rows = source->get_size()[0];
+
+    if (num_rows == 0) {
+        *result = 0;
+        return;
+    }
+
     const auto num_cols = source->get_size()[1];
     const auto slice_num = ceildiv(num_rows, slice_size);
 
2 changes: 1 addition & 1 deletion cuda/matrix/ell_kernels.cu
@@ -271,7 +271,7 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
 
     const dim3 block_size(config::warp_size,
                           config::max_block_size / config::warp_size, 1);
-    const dim3 init_grid_dim(ceildiv(result_stride, block_size.x),
+    const dim3 init_grid_dim(ceildiv(num_cols, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
     kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
         num_rows, num_cols, result_stride, as_cuda_type(result->get_values()));
49 changes: 32 additions & 17 deletions cuda/matrix/sellp_kernels.cu
@@ -126,22 +126,27 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
 
     const dim3 block_size(config::warp_size,
                           config::max_block_size / config::warp_size, 1);
-    const dim3 init_grid_dim(ceildiv(result->get_stride(), block_size.x),
+    const dim3 init_grid_dim(ceildiv(num_cols, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
 
-    kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
-        num_rows, num_cols, result->get_stride(),
-        as_cuda_type(result->get_values()));
+    if (num_rows > 0 && result->get_stride() > 0) {
+        kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
+            num_rows, num_cols, result->get_stride(),
+            as_cuda_type(result->get_values()));
+    }
 
     constexpr auto threads_per_row = config::warp_size;
     const auto grid_dim =
         ceildiv(slice_size * slice_num * threads_per_row, default_block_size);
 
-    kernel::fill_in_dense<threads_per_row><<<grid_dim, default_block_size>>>(
-        num_rows, num_cols, result->get_stride(), slice_size,
-        as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
-        as_cuda_type(col_idxs), as_cuda_type(vals),
-        as_cuda_type(result->get_values()));
+    if (grid_dim > 0) {
+        kernel::fill_in_dense<threads_per_row>
+            <<<grid_dim, default_block_size>>>(
+                num_rows, num_cols, result->get_stride(), slice_size,
+                as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
+                as_cuda_type(col_idxs), as_cuda_type(vals),
+                as_cuda_type(result->get_values()));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -168,9 +173,11 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
 
     auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size);
 
-    kernel::count_nnz_per_row<<<grid_dim, default_block_size>>>(
-        num_rows, slice_size, as_cuda_type(source_slice_sets),
-        as_cuda_type(source_values), as_cuda_type(result_row_ptrs));
+    if (grid_dim > 0) {
+        kernel::count_nnz_per_row<<<grid_dim, default_block_size>>>(
+            num_rows, slice_size, as_cuda_type(source_slice_sets),
+            as_cuda_type(source_values), as_cuda_type(result_row_ptrs));
+    }
 
     grid_dim = ceildiv(num_rows + 1, default_block_size);
     auto add_values = Array<IndexType>(exec, grid_dim);
@@ -179,11 +186,13 @@
 
     grid_dim = ceildiv(num_rows, default_block_size);
 
-    kernel::fill_in_csr<<<grid_dim, default_block_size>>>(
-        num_rows, slice_size, as_cuda_type(source_slice_sets),
-        as_cuda_type(source_col_idxs), as_cuda_type(source_values),
-        as_cuda_type(result_row_ptrs), as_cuda_type(result_col_idxs),
-        as_cuda_type(result_values));
+    if (grid_dim > 0) {
+        kernel::fill_in_csr<<<grid_dim, default_block_size>>>(
+            num_rows, slice_size, as_cuda_type(source_slice_sets),
+            as_cuda_type(source_col_idxs), as_cuda_type(source_values),
+            as_cuda_type(result_row_ptrs), as_cuda_type(result_col_idxs),
+            as_cuda_type(result_values));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -196,6 +205,12 @@ void count_nonzeros(std::shared_ptr<const CudaExecutor> exec,
                     size_type *result)
 {
     const auto num_rows = source->get_size()[0];
+
+    if (num_rows <= 0) {
+        *result = 0;
+        return;
+    }
+
     const auto slice_size = source->get_slice_size();
     const auto slice_sets = source->get_const_slice_sets();
     const auto values = source->get_const_values();
32 changes: 32 additions & 0 deletions cuda/test/matrix/sellp_kernels.cpp
@@ -112,12 +112,14 @@ class Sellp : public ::testing::Test {
     {
         mtx = Mtx::create(ref);
         mtx->copy_from(gen_mtx(532, 231));
+        empty = Mtx::create(ref);
         expected = gen_mtx(532, 64);
         y = gen_mtx(231, 64);
         alpha = gko::initialize<Vec>({2.0}, ref);
         beta = gko::initialize<Vec>({-1.0}, ref);
         dmtx = Mtx::create(cuda);
         dmtx->copy_from(mtx.get());
+        dempty = Mtx::create(cuda);
         dresult = Vec::create(cuda);
         dresult->copy_from(expected.get());
         dy = Vec::create(cuda);
@@ -134,12 +136,14 @@ class Sellp : public ::testing::Test {
     std::ranlux48 rand_engine;
 
     std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> empty;
    std::unique_ptr<Vec> expected;
     std::unique_ptr<Vec> y;
     std::unique_ptr<Vec> alpha;
     std::unique_ptr<Vec> beta;
 
     std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Mtx> dempty;
     std::unique_ptr<Vec> dresult;
     std::unique_ptr<Vec> dy;
     std::unique_ptr<Vec> dalpha;
@@ -281,6 +285,34 @@ TEST_F(Sellp, ConvertToCsrIsEquivalentToRef)
 }
 
 
+TEST_F(Sellp, ConvertEmptyToDenseIsEquivalentToRef)
+{
+    set_up_apply_matrix();
+
+    auto dense_mtx = gko::matrix::Dense<>::create(ref);
+    auto ddense_mtx = gko::matrix::Dense<>::create(cuda);
+
+    empty->convert_to(dense_mtx.get());
+    dempty->convert_to(ddense_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0);
+}
+
+
+TEST_F(Sellp, ConvertEmptyToCsrIsEquivalentToRef)
+{
+    set_up_apply_matrix();
+
+    auto csr_mtx = gko::matrix::Csr<>::create(ref);
+    auto dcsr_mtx = gko::matrix::Csr<>::create(cuda);
+
+    empty->convert_to(csr_mtx.get());
+    dempty->convert_to(dcsr_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0);
+}
+
+
 TEST_F(Sellp, CountNonzerosIsEquivalentToRef)
 {
     set_up_apply_matrix();
2 changes: 1 addition & 1 deletion hip/matrix/coo_kernels.hip.cpp
@@ -239,7 +239,7 @@ void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
 
     const dim3 block_size(config::warp_size,
                           config::max_block_size / config::warp_size, 1);
-    const dim3 init_grid_dim(ceildiv(stride, block_size.x),
+    const dim3 init_grid_dim(ceildiv(num_cols, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
     hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid_dim),
                        dim3(block_size), 0, 0, num_rows, num_cols, stride,