Add cuda-memcheck CI job #592

Merged: 5 commits, Jul 16, 2020
Changes from all commits
19 changes: 19 additions & 0 deletions .gitlab-ci.yml
@@ -839,6 +839,25 @@ undefinedsanitizer:
     - cuda
     - gpu
 
+cudamemcheck:
+  stage: QoS_tools
+  image: localhost:5000/gko-cuda101-gnu8-llvm10-intel2019
+  before_script: *default_before_script
+  script:
+    - ctest -V -S cmake/CTestScript.cmake -DCTEST_BUILD_CONFIGURATION=RelWithDebInfo
+      -DCTEST_MEMORYCHECK_TYPE=CudaMemcheck
+  dependencies: []
+  only:
+    refs:
+      - master
+      - develop
+      - tags
+    variables:
+      - $RUN_CI_TAG
+  tags:
+    - private_ci
+    - cuda
+    - gpu
 
 # Benchmark build
 .benchmark_before_script_template: &default_benchmark_before_script
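The new job mirrors the existing sanitizer jobs: it runs the CTest script in RelWithDebInfo mode with cuda-memcheck as the memory checker, only on master, develop, and tag refs when $RUN_CI_TAG is set, and only on runners with GPU access. For context, cuda-memcheck instruments device code at runtime and catches defects an ordinary test run can pass through silently. A minimal illustration of the defect class it reports (hypothetical kernel and sizes, not code from this PR):

// Illustration only: 128 threads write into a 100-element allocation. A plain
// run often "passes"; under cuda-memcheck each stray thread is reported as an
// "Invalid __global__ write of size 4", with source lines when the code was
// compiled with -lineinfo (which the CTest script changes below add).
#include <cstdio>

__global__ void oob_write(int *data, int n)
{
    const int i = threadIdx.x + blockIdx.x * blockDim.x;
    data[i] = i;  // bug: missing `if (i < n)` bounds guard
}

int main()
{
    int *data;
    cudaMalloc(&data, 100 * sizeof(int));
    oob_write<<<2, 64>>>(data, 100);  // grid of 2 x 64 = 128 threads
    cudaDeviceSynchronize();
    cudaFree(data);
    return 0;
}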
19 changes: 15 additions & 4 deletions cmake/CTestScript.cmake
@@ -119,11 +119,16 @@ if(CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind")
     set(CTEST_MEMORYCHECK_SUPPRESSIONS_FILE "${CTEST_SOURCE_DIRECTORY}/dev_tools/valgrind/suppressions.supp")
 endif()
 
+if(CTEST_MEMORYCHECK_TYPE STREQUAL "CudaMemcheck")
+    find_program(CTEST_MEMORYCHECK_COMMAND cuda-memcheck)
+    set(CTEST_BUILD_NAME "CudaMemcheck")
+endif()
+
 if(CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
     find_program(CTEST_COVERAGE_COMMAND gcov)
 endif()
 
-if(NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind")
+if(NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind" AND NOT CTEST_MEMORYCHECK_TYPE STREQUAL "CudaMemcheck")
     set(CTEST_MEMORYCHECK_SANITIZER_OPTIONS "${CTEST_MEMORYCHECK_SANITIZER_OPTIONS}:allocator_may_return_null=1:verbosity=1")
 endif()
 
@@ -146,7 +151,10 @@ endif()
 ctest_start("${CTEST_TEST_MODEL}")
 ctest_submit(PARTS Start)
 
-if((NOT CTEST_MEMORYCHECK_TYPE STREQUAL "NONE" AND NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind") OR CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
+if (CTEST_MEMORYCHECK_TYPE STREQUAL "CudaMemcheck")
+    # generate line number information for CUDA
+    set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=OFF;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION};-DCMAKE_CUDA_FLAGS=-lineinfo")
+elseif((NOT CTEST_MEMORYCHECK_TYPE STREQUAL "NONE" AND NOT CTEST_MEMORYCHECK_TYPE STREQUAL "Valgrind") OR CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
     set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=OFF;-DGINKGO_BUILD_HIP=OFF;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}")
 else()
     set(GINKGO_CONFIGURE_OPTIONS "-DGINKGO_DEVEL_TOOLS=OFF;-DGINKGO_BUILD_REFERENCE=ON;-DGINKGO_BUILD_OMP=ON;-DGINKGO_BUILD_CUDA=ON;-DGINKGO_BUILD_HIP=ON;-DCMAKE_BUILD_TYPE=${CTEST_BUILD_CONFIGURATION}")
@@ -180,7 +188,10 @@ if (CTEST_BUILD_CONFIGURATION STREQUAL "COVERAGE")
 endif()
 
 if(NOT CTEST_MEMORYCHECK_TYPE STREQUAL "NONE")
-    ctest_memcheck(BUILD "${CTEST_BINARY_DIRECTORY}" APPEND)
+    if(CTEST_MEMORYCHECK_TYPE STREQUAL "CudaMemcheck")
+        ctest_memcheck(BUILD "${CTEST_BINARY_DIRECTORY}" INCLUDE "^(cuda|hip).*" APPEND)
+    else()
+        ctest_memcheck(BUILD "${CTEST_BINARY_DIRECTORY}" APPEND)
+    endif()
     ctest_submit(PARTS MemCheck)
 endif()
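Two design choices here are worth spelling out. The INCLUDE "^(cuda|hip).*" filter restricts the memcheck stage to tests whose names begin with cuda or hip, since cuda-memcheck only observes device-side execution; re-running the reference and OpenMP tests under it would cost time without adding coverage. And -DCMAKE_CUDA_FLAGS=-lineinfo embeds source-line tables in the device code, so reports can point at source lines without the performance penalty of a full device debug build (-G).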

4 changes: 2 additions & 2 deletions common/matrix/sellp_kernels.hpp.inc
@@ -95,7 +95,7 @@ namespace kernel {
 
 
 template <typename ValueType>
-__global__ __launch_bounds__(default_block_size) void initialize_zero_dense(
+__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense(
     size_type num_rows, size_type num_cols, size_type stride,
     ValueType *__restrict__ result)
 {
@@ -196,4 +196,4 @@ __global__ __launch_bounds__(default_block_size) void fill_in_csr(
 }
 
 
-}  // namespace kernel
\ No newline at end of file
+}  // namespace kernel
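Why the one-word change above matters: callers launch initialize_zero_dense with a (config::warp_size, config::max_block_size / config::warp_size) block, i.e. config::max_block_size threads in total, while the old annotation promised at most default_block_size. __launch_bounds__ is a contract with the compiler; launching more threads per block than it states makes the launch fail, which cuda-memcheck surfaces even if the host code never checks the error. A minimal reproduction with hypothetical sizes:

#include <cstdio>

// the kernel promises the compiler at most 256 threads per block
__global__ __launch_bounds__(256) void bounded_kernel() {}

int main()
{
    // launching 512 threads breaks the promise: the kernel never runs and
    // the launch error is visible via cudaGetLastError()
    bounded_kernel<<<1, 512>>>();
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}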
2 changes: 1 addition & 1 deletion cuda/matrix/coo_kernels.cu
@@ -229,7 +229,7 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
 
     const dim3 block_size(config::warp_size,
                           config::max_block_size / config::warp_size, 1);
-    const dim3 init_grid_dim(ceildiv(stride, block_size.x),
+    const dim3 init_grid_dim(ceildiv(num_cols, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
     kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
         num_rows, num_cols, stride, as_cuda_type(result->get_values()));
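This grid-sizing fix recurs in the csr, ell, sellp, and hip convert_to_dense paths below: the grid's x dimension is now derived from the logical column count instead of the padded row stride. Since stride >= num_cols, the old sizing launched whole thread blocks that covered only padding and, assuming the kernel masks out-of-range threads as sketched here, did no useful work:

// Hedged sketch of the zero-initialization kernel's shape (an assumption for
// illustration; the real kernel lives in common/matrix/*_kernels.hpp.inc).
template <typename ValueType>
__global__ void initialize_zero_dense_sketch(int num_rows, int num_cols,
                                             int stride,
                                             ValueType *__restrict__ result)
{
    const auto col = threadIdx.x + blockDim.x * blockIdx.x;
    const auto row = threadIdx.y + blockDim.y * blockIdx.y;
    // threads beyond the logical matrix bounds return immediately, so a grid
    // sized by stride rather than num_cols only launches wasted blocks
    if (row < num_rows && col < num_cols) {
        result[row * stride + col] = ValueType{};
    }
}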
38 changes: 25 additions & 13 deletions cuda/matrix/csr_kernels.cu
@@ -694,7 +694,7 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
 
     const dim3 block_size(config::warp_size,
                           config::max_block_size / config::warp_size, 1);
-    const dim3 init_grid_dim(ceildiv(stride, block_size.x),
+    const dim3 init_grid_dim(ceildiv(num_cols, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
     kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
         num_rows, num_cols, stride, as_cuda_type(result->get_values()));
@@ -737,25 +737,31 @@ void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
     auto nnz_per_row = Array<size_type>(exec, num_rows);
     auto grid_dim = ceildiv(num_rows, default_block_size);
 
-    kernel::calculate_nnz_per_row<<<grid_dim, default_block_size>>>(
-        num_rows, as_cuda_type(source_row_ptrs),
-        as_cuda_type(nnz_per_row.get_data()));
+    if (grid_dim > 0) {
+        kernel::calculate_nnz_per_row<<<grid_dim, default_block_size>>>(
+            num_rows, as_cuda_type(source_row_ptrs),
+            as_cuda_type(nnz_per_row.get_data()));
+    }
 
     grid_dim = slice_num;
 
-    kernel::calculate_slice_lengths<<<grid_dim, config::warp_size>>>(
-        num_rows, slice_size, stride_factor,
-        as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(slice_lengths),
-        as_cuda_type(slice_sets));
+    if (grid_dim > 0) {
+        kernel::calculate_slice_lengths<<<grid_dim, config::warp_size>>>(
+            num_rows, slice_size, stride_factor,
+            as_cuda_type(nnz_per_row.get_const_data()),
+            as_cuda_type(slice_lengths), as_cuda_type(slice_sets));
+    }
 
     components::prefix_sum(exec, slice_sets, slice_num + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
-    kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
-        num_rows, slice_size, as_cuda_type(source_values),
-        as_cuda_type(source_row_ptrs), as_cuda_type(source_col_idxs),
-        as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
-        as_cuda_type(result_col_idxs), as_cuda_type(result_values));
+    if (grid_dim > 0) {
+        kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
+            num_rows, slice_size, as_cuda_type(source_values),
+            as_cuda_type(source_row_ptrs), as_cuda_type(source_col_idxs),
+            as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
+            as_cuda_type(result_col_idxs), as_cuda_type(result_values));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -805,6 +811,12 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
                           size_type slice_size)
 {
     const auto num_rows = source->get_size()[0];
+
+    if (num_rows == 0) {
+        *result = 0;
+        return;
+    }
+
     const auto slice_num = ceildiv(num_rows, slice_size);
     const auto row_ptrs = source->get_const_row_ptrs();
 
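All the if (grid_dim > 0) guards in this file, and the matching ones in dense_kernels.cu and sellp_kernels.cu below, address one failure mode: for an empty matrix, ceildiv produces a grid dimension of zero, and a <<<0, n>>> launch is an invalid configuration that cuda-memcheck reports. A minimal sketch of the pattern (hypothetical kernel and block size):

#include <cstdio>

__global__ void noop() {}

int main()
{
    const int num_rows = 0;                       // empty matrix
    const int grid_dim = (num_rows + 255) / 256;  // ceildiv(num_rows, 256) == 0
    if (grid_dim > 0) {  // guarded: the launch is skipped entirely
        noop<<<grid_dim, 256>>>();
    }
    noop<<<grid_dim, 256>>>();  // unguarded: invalid configuration argument
    printf("%s\n", cudaGetErrorString(cudaGetLastError()));
    return 0;
}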
37 changes: 25 additions & 12 deletions cuda/matrix/dense_kernels.cu
@@ -378,18 +378,23 @@ void convert_to_sellp(std::shared_ptr<const CudaExecutor> exec,
 
     auto grid_dim = slice_num;
 
-    kernel::calculate_slice_lengths<<<grid_dim, config::warp_size>>>(
-        num_rows, slice_size, slice_num, stride_factor,
-        as_cuda_type(nnz_per_row.get_const_data()), as_cuda_type(slice_lengths),
-        as_cuda_type(slice_sets));
+    if (grid_dim > 0) {
+        kernel::calculate_slice_lengths<<<grid_dim, config::warp_size>>>(
+            num_rows, slice_size, slice_num, stride_factor,
+            as_cuda_type(nnz_per_row.get_const_data()),
+            as_cuda_type(slice_lengths), as_cuda_type(slice_sets));
+    }
 
     components::prefix_sum(exec, slice_sets, slice_num + 1);
 
     grid_dim = ceildiv(num_rows, default_block_size);
-    kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
-        num_rows, num_cols, slice_size, stride,
-        as_cuda_type(source->get_const_values()), as_cuda_type(slice_lengths),
-        as_cuda_type(slice_sets), as_cuda_type(col_idxs), as_cuda_type(vals));
+    if (grid_dim > 0) {
+        kernel::fill_in_sellp<<<grid_dim, default_block_size>>>(
+            num_rows, num_cols, slice_size, stride,
+            as_cuda_type(source->get_const_values()),
+            as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
+            as_cuda_type(col_idxs), as_cuda_type(vals));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -465,10 +470,12 @@ void calculate_nonzeros_per_row(std::shared_ptr<const CudaExecutor> exec,
     auto rows_per_block = ceildiv(default_block_size, config::warp_size);
     const size_t grid_x = ceildiv(source->get_size()[0], rows_per_block);
     const dim3 grid_size(grid_x, 1, 1);
-    kernel::count_nnz_per_row<<<grid_size, block_size>>>(
-        source->get_size()[0], source->get_size()[1], source->get_stride(),
-        as_cuda_type(source->get_const_values()),
-        as_cuda_type(result->get_data()));
+    if (grid_x > 0) {
+        kernel::count_nnz_per_row<<<grid_size, block_size>>>(
+            source->get_size()[0], source->get_size()[1], source->get_stride(),
+            as_cuda_type(source->get_const_values()),
+            as_cuda_type(result->get_data()));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(
@@ -482,6 +489,12 @@ void calculate_total_cols(std::shared_ptr<const CudaExecutor> exec,
                           size_type slice_size)
 {
     const auto num_rows = source->get_size()[0];
+
+    if (num_rows == 0) {
+        *result = 0;
+        return;
+    }
+
     const auto num_cols = source->get_size()[1];
     const auto slice_num = ceildiv(num_rows, slice_size);
 
2 changes: 1 addition & 1 deletion cuda/matrix/ell_kernels.cu
@@ -271,7 +271,7 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
 
     const dim3 block_size(config::warp_size,
                           config::max_block_size / config::warp_size, 1);
-    const dim3 init_grid_dim(ceildiv(result_stride, block_size.x),
+    const dim3 init_grid_dim(ceildiv(num_cols, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
     kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
         num_rows, num_cols, result_stride, as_cuda_type(result->get_values()));
49 changes: 32 additions & 17 deletions cuda/matrix/sellp_kernels.cu
@@ -126,22 +126,27 @@ void convert_to_dense(std::shared_ptr<const CudaExecutor> exec,
 
     const dim3 block_size(config::warp_size,
                           config::max_block_size / config::warp_size, 1);
-    const dim3 init_grid_dim(ceildiv(result->get_stride(), block_size.x),
+    const dim3 init_grid_dim(ceildiv(num_cols, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
 
-    kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
-        num_rows, num_cols, result->get_stride(),
-        as_cuda_type(result->get_values()));
+    if (num_rows > 0 && result->get_stride() > 0) {
+        kernel::initialize_zero_dense<<<init_grid_dim, block_size>>>(
+            num_rows, num_cols, result->get_stride(),
+            as_cuda_type(result->get_values()));
+    }
 
     constexpr auto threads_per_row = config::warp_size;
     const auto grid_dim =
         ceildiv(slice_size * slice_num * threads_per_row, default_block_size);
 
-    kernel::fill_in_dense<threads_per_row><<<grid_dim, default_block_size>>>(
-        num_rows, num_cols, result->get_stride(), slice_size,
-        as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
-        as_cuda_type(col_idxs), as_cuda_type(vals),
-        as_cuda_type(result->get_values()));
+    if (grid_dim > 0) {
+        kernel::fill_in_dense<threads_per_row>
+            <<<grid_dim, default_block_size>>>(
+                num_rows, num_cols, result->get_stride(), slice_size,
+                as_cuda_type(slice_lengths), as_cuda_type(slice_sets),
+                as_cuda_type(col_idxs), as_cuda_type(vals),
+                as_cuda_type(result->get_values()));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -168,9 +173,11 @@ void convert_to_csr(std::shared_ptr<const CudaExecutor> exec,
 
     auto grid_dim = ceildiv(num_rows * config::warp_size, default_block_size);
 
-    kernel::count_nnz_per_row<<<grid_dim, default_block_size>>>(
-        num_rows, slice_size, as_cuda_type(source_slice_sets),
-        as_cuda_type(source_values), as_cuda_type(result_row_ptrs));
+    if (grid_dim > 0) {
+        kernel::count_nnz_per_row<<<grid_dim, default_block_size>>>(
+            num_rows, slice_size, as_cuda_type(source_slice_sets),
+            as_cuda_type(source_values), as_cuda_type(result_row_ptrs));
+    }
 
     grid_dim = ceildiv(num_rows + 1, default_block_size);
     auto add_values = Array<IndexType>(exec, grid_dim);
@@ -179,11 +186,13 @@
 
     grid_dim = ceildiv(num_rows, default_block_size);
 
-    kernel::fill_in_csr<<<grid_dim, default_block_size>>>(
-        num_rows, slice_size, as_cuda_type(source_slice_sets),
-        as_cuda_type(source_col_idxs), as_cuda_type(source_values),
-        as_cuda_type(result_row_ptrs), as_cuda_type(result_col_idxs),
-        as_cuda_type(result_values));
+    if (grid_dim > 0) {
+        kernel::fill_in_csr<<<grid_dim, default_block_size>>>(
+            num_rows, slice_size, as_cuda_type(source_slice_sets),
+            as_cuda_type(source_col_idxs), as_cuda_type(source_values),
+            as_cuda_type(result_row_ptrs), as_cuda_type(result_col_idxs),
+            as_cuda_type(result_values));
+    }
 }
 
 GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
@@ -196,6 +205,12 @@ void count_nonzeros(std::shared_ptr<const CudaExecutor> exec,
                     size_type *result)
 {
     const auto num_rows = source->get_size()[0];
+
+    if (num_rows <= 0) {
+        *result = 0;
+        return;
+    }
+
     const auto slice_size = source->get_slice_size();
     const auto slice_sets = source->get_const_slice_sets();
     const auto values = source->get_const_values();
32 changes: 32 additions & 0 deletions cuda/test/matrix/sellp_kernels.cpp
@@ -112,12 +112,14 @@ class Sellp : public ::testing::Test {
     {
         mtx = Mtx::create(ref);
         mtx->copy_from(gen_mtx(532, 231));
+        empty = Mtx::create(ref);
         expected = gen_mtx(532, 64);
         y = gen_mtx(231, 64);
         alpha = gko::initialize<Vec>({2.0}, ref);
         beta = gko::initialize<Vec>({-1.0}, ref);
         dmtx = Mtx::create(cuda);
         dmtx->copy_from(mtx.get());
+        dempty = Mtx::create(cuda);
         dresult = Vec::create(cuda);
         dresult->copy_from(expected.get());
         dy = Vec::create(cuda);
@@ -134,12 +136,14 @@ class Sellp : public ::testing::Test {
     std::ranlux48 rand_engine;
 
     std::unique_ptr<Mtx> mtx;
+    std::unique_ptr<Mtx> empty;
    std::unique_ptr<Vec> expected;
     std::unique_ptr<Vec> y;
     std::unique_ptr<Vec> alpha;
     std::unique_ptr<Vec> beta;
 
     std::unique_ptr<Mtx> dmtx;
+    std::unique_ptr<Mtx> dempty;
     std::unique_ptr<Vec> dresult;
     std::unique_ptr<Vec> dy;
     std::unique_ptr<Vec> dalpha;
@@ -281,6 +285,34 @@ TEST_F(Sellp, ConvertToCsrIsEquivalentToRef)
 }
 
 
+TEST_F(Sellp, ConvertEmptyToDenseIsEquivalentToRef)
+{
+    set_up_apply_matrix();
+
+    auto dense_mtx = gko::matrix::Dense<>::create(ref);
+    auto ddense_mtx = gko::matrix::Dense<>::create(cuda);
+
+    empty->convert_to(dense_mtx.get());
+    dempty->convert_to(ddense_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(dense_mtx.get(), ddense_mtx.get(), 0);
+}
+
+
+TEST_F(Sellp, ConvertEmptyToCsrIsEquivalentToRef)
+{
+    set_up_apply_matrix();
+
+    auto csr_mtx = gko::matrix::Csr<>::create(ref);
+    auto dcsr_mtx = gko::matrix::Csr<>::create(cuda);
+
+    empty->convert_to(csr_mtx.get());
+    dempty->convert_to(dcsr_mtx.get());
+
+    GKO_ASSERT_MTX_NEAR(csr_mtx.get(), dcsr_mtx.get(), 0);
+}
+
+
 TEST_F(Sellp, CountNonzerosIsEquivalentToRef)
 {
     set_up_apply_matrix();
2 changes: 1 addition & 1 deletion hip/matrix/coo_kernels.hip.cpp
@@ -239,7 +239,7 @@ void convert_to_dense(std::shared_ptr<const HipExecutor> exec,
 
     const dim3 block_size(config::warp_size,
                           config::max_block_size / config::warp_size, 1);
-    const dim3 init_grid_dim(ceildiv(stride, block_size.x),
+    const dim3 init_grid_dim(ceildiv(num_cols, block_size.x),
                              ceildiv(num_rows, block_size.y), 1);
     hipLaunchKernelGGL(kernel::initialize_zero_dense, dim3(init_grid_dim),
                        dim3(block_size), 0, 0, num_rows, num_cols, stride,