Skip to content

Commit f38445a

Browse files
authored
Drop CUB_QUOTIENT_CEILING (rapidsai#18179)
It can easily overflow, so use `cuda::ceil_div` instead Authors: - Michael Schellenberger Costa (https://github.com/miscco) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: rapidsai#18179
1 parent bf864ae commit f38445a

File tree

2 files changed

+12
-11
lines changed

2 files changed

+12
-11
lines changed

cpp/src/io/fst/agent_dfa.cuh

+5-6
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@
1818
#include "in_reg_array.cuh"
1919

2020
#include <cub/cub.cuh>
21+
#include <cuda/functional>
2122
#include <cuda/std/array>
22-
#include <cuda/std/functional>
2323
#include <cuda/std/type_traits>
2424
#include <thrust/execution_policy.h>
2525
#include <thrust/iterator/discard_iterator.h>
@@ -412,10 +412,9 @@ struct AgentDFA {
412412
static constexpr uint32_t SYMBOLS_PER_BLOCK = BLOCK_THREADS * SYMBOLS_PER_THREAD;
413413

414414
static constexpr uint32_t MIN_UINTS_PER_BLOCK =
415-
CUB_QUOTIENT_CEILING(SYMBOLS_PER_BLOCK, sizeof(AliasedLoadT));
416-
static constexpr uint32_t UINTS_PER_THREAD =
417-
CUB_QUOTIENT_CEILING(MIN_UINTS_PER_BLOCK, BLOCK_THREADS);
418-
static constexpr uint32_t UINTS_PER_BLOCK = UINTS_PER_THREAD * BLOCK_THREADS;
415+
cuda::ceil_div<uint32_t>(SYMBOLS_PER_BLOCK, sizeof(AliasedLoadT));
416+
static constexpr uint32_t UINTS_PER_THREAD = cuda::ceil_div(MIN_UINTS_PER_BLOCK, BLOCK_THREADS);
417+
static constexpr uint32_t UINTS_PER_BLOCK = UINTS_PER_THREAD * BLOCK_THREADS;
419418
static constexpr uint32_t SYMBOLS_PER_UINT_BLOCK = UINTS_PER_BLOCK * sizeof(AliasedLoadT);
420419

421420
//------------------------------------------------------------------------------
@@ -564,7 +563,7 @@ struct AgentDFA {
564563

565564
// Last unit to be loaded is IDIV_CEIL(#SYM, SYMBOLS_PER_UNIT)
566565
OffsetT num_total_units =
567-
CUB_QUOTIENT_CEILING(num_total_symbols - block_offset, sizeof(AliasedLoadT));
566+
cuda::ceil_div(num_total_symbols - block_offset, sizeof(AliasedLoadT));
568567

569568
AliasedLoadT const* d_block_symbols =
570569
reinterpret_cast<AliasedLoadT const*>(d_chars + block_offset);

cpp/src/io/fst/dispatch_dfa.cuh

+7-5
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "in_reg_array.cuh"
2020

2121
#include <cub/cub.cuh>
22+
#include <cuda/functional>
2223

2324
#include <cstdint>
2425

@@ -213,7 +214,7 @@ struct DispatchFSM : DeviceFSMPolicy {
213214

214215
// Kernel invocation
215216
uint32_t grid_size = std::max(
216-
1u, CUB_QUOTIENT_CEILING(num_chars, PolicyT::BLOCK_THREADS * PolicyT::ITEMS_PER_THREAD));
217+
1u, cuda::ceil_div<uint32_t>(num_chars, PolicyT::BLOCK_THREADS * PolicyT::ITEMS_PER_THREAD));
217218

218219
dfa_kernel<<<grid_size, PolicyT::BLOCK_THREADS, 0, stream>>>(dfa,
219220
d_chars_in,
@@ -349,8 +350,9 @@ struct DispatchFSM : DeviceFSMPolicy {
349350
NUM_SYMBOLS_PER_BLOCK = BLOCK_THREADS * SYMBOLS_PER_THREAD
350351
};
351352

352-
BlockOffsetT num_blocks = std::max(1u, CUB_QUOTIENT_CEILING(num_chars, NUM_SYMBOLS_PER_BLOCK));
353-
size_t num_threads = num_blocks * BLOCK_THREADS;
353+
BlockOffsetT num_blocks =
354+
std::max<uint32_t>(1u, cuda::ceil_div<uint32_t>(num_chars, NUM_SYMBOLS_PER_BLOCK));
355+
size_t num_threads = num_blocks * BLOCK_THREADS;
354356

355357
//------------------------------------------------------------------------------
356358
// TEMPORARY MEMORY REQUIREMENTS
@@ -416,7 +418,7 @@ struct DispatchFSM : DeviceFSMPolicy {
416418
num_blocks, allocations[MEM_FST_OFFSET], allocation_sizes[MEM_FST_OFFSET]);
417419
if (error != cudaSuccess) return error;
418420
constexpr uint32_t FST_INIT_TPB = 256;
419-
uint32_t num_fst_init_blocks = CUB_QUOTIENT_CEILING(num_blocks, FST_INIT_TPB);
421+
uint32_t num_fst_init_blocks = cuda::ceil_div(num_blocks, FST_INIT_TPB);
420422
initialization_pass_kernel<<<num_fst_init_blocks, FST_INIT_TPB, 0, stream>>>(
421423
fst_offset_tile_state, num_blocks);
422424
}
@@ -431,7 +433,7 @@ struct DispatchFSM : DeviceFSMPolicy {
431433
num_blocks, allocations[MEM_SINGLE_PASS_STV], allocation_sizes[MEM_SINGLE_PASS_STV]);
432434
if (error != cudaSuccess) return error;
433435
constexpr uint32_t STV_INIT_TPB = 256;
434-
uint32_t num_stv_init_blocks = CUB_QUOTIENT_CEILING(num_blocks, STV_INIT_TPB);
436+
uint32_t num_stv_init_blocks = cuda::ceil_div(num_blocks, STV_INIT_TPB);
435437
initialization_pass_kernel<<<num_stv_init_blocks, STV_INIT_TPB, 0, stream>>>(stv_tile_state,
436438
num_blocks);
437439
} else {

0 commit comments

Comments
 (0)