Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Batch of fixes for index overflows in grid stride loops. #10448

Merged
merged 10 commits into from
Mar 23, 2022
5 changes: 3 additions & 2 deletions cpp/include/cudf/detail/valid_if.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,8 @@ __global__ void valid_if_kernel(
{
constexpr size_type leader_lane{0};
auto const lane_id{threadIdx.x % warp_size};
size_type i = threadIdx.x + blockIdx.x * blockDim.x;
thread_index_type i = threadIdx.x + blockIdx.x * blockDim.x;
thread_index_type const stride = blockDim.x * gridDim.x;
size_type warp_valid_count{0};

auto active_mask = __ballot_sync(0xFFFF'FFFF, i < size);
Expand All @@ -58,7 +59,7 @@ __global__ void valid_if_kernel(
output[cudf::word_index(i)] = ballot;
warp_valid_count += __popc(ballot);
}
i += blockDim.x * gridDim.x;
i += stride;
active_mask = __ballot_sync(active_mask, i < size);
}

Expand Down
9 changes: 5 additions & 4 deletions cpp/include/cudf/types.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,10 +83,11 @@ class mutable_table_view;
* @file
*/

using size_type = int32_t;
using bitmask_type = uint32_t;
using valid_type = uint8_t;
using offset_type = int32_t;
using size_type = int32_t;
using bitmask_type = uint32_t;
using valid_type = uint8_t;
using offset_type = int32_t;
using thread_index_type = int64_t;

/**
* @brief Similar to `std::distance` but returns `cudf::size_type` and performs `static_cast`
Expand Down
14 changes: 10 additions & 4 deletions cpp/src/copying/scatter.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -47,7 +47,8 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
MapIterator scatter_map,
size_type num_scatter_rows)
{
size_type row = threadIdx.x + blockIdx.x * blockDim.x;
thread_index_type row = threadIdx.x + blockIdx.x * blockDim.x;
thread_index_type const stride = blockDim.x * gridDim.x;

while (row < num_scatter_rows) {
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
size_type const output_row = scatter_map[row];
Expand All @@ -58,7 +59,7 @@ __global__ void marking_bitmask_kernel(mutable_column_device_view destination,
destination.set_null(output_row);
}

row += blockDim.x * gridDim.x;
row += stride;
}
}

Expand Down Expand Up @@ -351,8 +352,13 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>>

// Transform negative indices to index + target size
auto scatter_rows = indices.size();
// note: the intermediate ((in % n_rows) + n_rows) will overflow a size_type for any value of `in`
// > (2^31)/2, but the end result after the final (% n_rows) will fit. so we'll do the computation
// using a signed 64 bit value.
auto scatter_iter = thrust::make_transform_iterator(
map_begin, [n_rows] __device__(size_type in) { return ((in % n_rows) + n_rows) % n_rows; });
map_begin, [n_rows = static_cast<int64_t>(n_rows)] __device__(size_type in) -> size_type {
return ((static_cast<int64_t>(in) % n_rows) + n_rows) % n_rows;
});

// Dispatch over data type per column
auto result = std::vector<std::unique_ptr<column>>(target.num_columns());
Expand Down
15 changes: 9 additions & 6 deletions cpp/src/replace/nulls.cu
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,9 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
char* chars,
cudf::size_type* valid_counter)
{
cudf::size_type nrows = input.size();
cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::size_type nrows = input.size();
cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::thread_index_type const stride = blockDim.x * gridDim.x;

uint32_t active_mask = 0xffffffff;
active_mask = __ballot_sync(active_mask, i < nrows);
Expand Down Expand Up @@ -98,7 +99,7 @@ __global__ void replace_nulls_strings(cudf::column_device_view input,
if (nonzero_output) std::memcpy(chars + offsets[i], out.data(), out.size_bytes());
}

i += blockDim.x * gridDim.x;
i += stride;
active_mask = __ballot_sync(active_mask, i < nrows);
}

Expand All @@ -114,8 +115,9 @@ __global__ void replace_nulls(cudf::column_device_view input,
cudf::mutable_column_device_view output,
cudf::size_type* output_valid_count)
{
cudf::size_type nrows = input.size();
cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::size_type nrows = input.size();
cudf::thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::thread_index_type const stride = blockDim.x * gridDim.x;

uint32_t active_mask = 0xffffffff;
active_mask = __ballot_sync(active_mask, i < nrows);
Expand All @@ -141,7 +143,7 @@ __global__ void replace_nulls(cudf::column_device_view input,
}
}

i += blockDim.x * gridDim.x;
i += stride;
active_mask = __ballot_sync(active_mask, i < nrows);
}
if (replacement_has_nulls) {
Expand Down Expand Up @@ -247,6 +249,7 @@ std::unique_ptr<cudf::column> replace_nulls_column_kernel_forwarder::operator()<

std::unique_ptr<cudf::column> offsets = cudf::strings::detail::make_offsets_child_column(
sizes_view.begin<int32_t>(), sizes_view.end<int32_t>(), stream, mr);

auto offsets_view = offsets->mutable_view();

auto const bytes =
Expand Down
10 changes: 5 additions & 5 deletions cpp/src/rolling/jit/kernel.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -51,8 +51,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
FollowingWindowType following_window_begin,
cudf::size_type min_periods)
{
cudf::size_type i = blockIdx.x * blockDim.x + threadIdx.x;
cudf::size_type stride = blockDim.x * gridDim.x;
thread_index_type i = blockIdx.x * blockDim.x + threadIdx.x;
thread_index_type const stride = blockDim.x * gridDim.x;

cudf::size_type warp_valid_count{0};

Expand All @@ -66,8 +66,8 @@ __global__ void gpu_rolling_new(cudf::size_type nrows,
cudf::size_type following_window = get_window(following_window_begin, i);

// compute bounds
cudf::size_type start = min(nrows, max(0, i - preceding_window + 1));
cudf::size_type end = min(nrows, max(0, i + following_window + 1));
cudf::size_type start = min(nrows, max(0, static_cast<size_type>(i) - preceding_window + 1));
cudf::size_type end = min(nrows, max(0, static_cast<size_type>(i) + following_window + 1));
bdice marked this conversation as resolved.
Show resolved Hide resolved
cudf::size_type start_index = min(start, end);
cudf::size_type end_index = max(start, end);

Expand Down
12 changes: 6 additions & 6 deletions cpp/src/rolling/rolling_detail.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -1008,8 +1008,8 @@ __launch_bounds__(block_size) __global__
PrecedingWindowIterator preceding_window_begin,
FollowingWindowIterator following_window_begin)
{
size_type i = blockIdx.x * block_size + threadIdx.x;
size_type stride = block_size * gridDim.x;
thread_index_type i = blockIdx.x * block_size + threadIdx.x;
thread_index_type const stride = block_size * gridDim.x;

size_type warp_valid_count{0};

Expand All @@ -1020,10 +1020,10 @@ __launch_bounds__(block_size) __global__
int64_t following_window = following_window_begin[i];

// compute bounds
auto start = static_cast<size_type>(
min(static_cast<int64_t>(input.size()), max(0L, i - preceding_window + 1)));
auto end = static_cast<size_type>(
min(static_cast<int64_t>(input.size()), max(0L, i + following_window + 1)));
auto start = static_cast<size_type>(min(
static_cast<int64_t>(input.size()), max(0L, static_cast<int64_t>(i) - preceding_window + 1)));
auto end = static_cast<size_type>(min(static_cast<int64_t>(input.size()),
max(0L, static_cast<int64_t>(i) + following_window + 1)));
size_type start_index = min(start, end);
size_type end_index = max(start, end);

Expand Down
9 changes: 5 additions & 4 deletions cpp/src/transform/compute_column.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2021, NVIDIA CORPORATION.
* Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -67,12 +67,13 @@ __launch_bounds__(max_block_size) __global__

auto thread_intermediate_storage =
&intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
auto const start_idx = static_cast<cudf::size_type>(threadIdx.x + blockIdx.x * blockDim.x);
auto const stride = static_cast<cudf::size_type>(blockDim.x * gridDim.x);
auto const start_idx =
static_cast<cudf::thread_index_type>(threadIdx.x + blockIdx.x * blockDim.x);
auto const stride = static_cast<cudf::thread_index_type>(blockDim.x * gridDim.x);
auto evaluator =
cudf::ast::detail::expression_evaluator<has_nulls>(table, device_expression_data);

for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
nvdbaranec marked this conversation as resolved.
Show resolved Hide resolved
for (thread_index_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
auto output_dest = ast::detail::mutable_column_expression_result<has_nulls>(output_column);
evaluator.evaluate(output_dest, row_index, thread_intermediate_storage);
}
Expand Down