|
#include <cuda.h>
#include <cuda_runtime.h>

#include <algorithm>
#include <cstdint>
#include <stdexcept>

#include "nvgpu_dev_info.h"
6 | 8 | namespace colossalAI {
|
7 | 9 | namespace cuda {
|
8 | 10 | namespace utils {
|
9 | 11 |
|
10 |
| -GPULaunchConfig GPUGetGPULaunchConfig1D(int64_t numel, int vec_size); |
| 12 | +struct GPULaunchConfig { |
| 13 | + dim3 block{1, 1, 1}; |
| 14 | + dim3 grid{1, 1, 1}; |
| 15 | +}; |
| 16 | + |
| 17 | +static GPULaunchConfig GetGPULaunchConfig1D(const NVGPUDevInfo& dev_info, |
| 18 | + int64_t numel, int64_t vec_size) { |
| 19 | + const int64_t max_threads_per_block = dev_info.GetMaxThreadsPerBlock(); |
| 20 | + const int64_t max_blocks_per_grid = dev_info.GetMaxGridDims()[0]; |
| 21 | + const int64_t kMinimumSize = 64; |
| 22 | + const int64_t kMaximumSize = 512; |
| 23 | + int64_t active_threads = (numel + vec_size - 1) / vec_size; |
| 24 | + int64_t sm_num = dev_info.GetMultiProcessorCount(); |
| 25 | + |
| 26 | + // Note(LiuYang): expected threads should be in [64, 128, 256, 512] generally |
| 27 | + int64_t expected_threads_per_block = kMaximumSize; |
11 | 28 |
|
12 |
| -// TODO(LiuYang): to be implemented |
13 |
| -GPULaunchConfig GPUGetGPULaunchConfig2D(int64_t numel, int vec_size); |
| 29 | + auto RoundUpToPowerOfTwo = [](int64_t x) { |
| 30 | + bool is_power_of_two = false; |
| 31 | + int64_t ret = 1; |
| 32 | + int64_t y = x; |
| 33 | + while (y > 0) { |
| 34 | + is_power_of_two = ((ret ^ x) == 0); |
| 35 | + y = (x >> 1); |
| 36 | + ret = (ret << 1); |
| 37 | + if (y > 0) is_power_of_two = false; |
| 38 | + } |
| 39 | + if (is_power_of_two) return x; |
| 40 | + return ret; |
| 41 | + }; |
14 | 42 |
|
15 |
| -// TODO(LiuYang): to be implemented |
16 |
| -GPULaunchConfig GPUGetGPULaunchConfig3D(int64_t numel, int vec_size); |
| 43 | + if ((active_threads / (sm_num << 1)) < max_threads_per_block) { |
| 44 | + expected_threads_per_block = |
| 45 | + RoundUpToPowerOfTwo(active_threads / (sm_num << 1)); |
| 46 | + } else if ((active_threads / (sm_num << 2)) < max_threads_per_block) { |
| 47 | + expected_threads_per_block = |
| 48 | + RoundUpToPowerOfTwo(active_threads / (sm_num << 2)); |
| 49 | + } |
17 | 50 |
|
18 |
| -class GPULaunchConfig { |
19 |
| - public: |
20 |
| - GPULaunchConfig(){}; |
21 |
| - GPULaunchConfig(const dim3& block, const dim3& grid) |
22 |
| - : block_(block), grid_(grid) {} |
23 |
| - friend GPULaunchConfig GPUGetGPULaunchConfig1D(int64_t numel, int vec_size); |
| 51 | + expected_threads_per_block = |
| 52 | + std::max(expected_threads_per_block, kMinimumSize); |
| 53 | + int64_t expect_block_per_grid = |
| 54 | + ((active_threads + expected_threads_per_block - 1) / |
| 55 | + expected_threads_per_block); |
24 | 56 |
|
25 |
| - protected: |
26 |
| - void set_block(const dim3& dim) { block_ = dim; } |
27 |
| - void set_grid(const dim3& dim) { grid_ = dim; } |
| 57 | + if (expect_block_per_grid > max_blocks_per_grid) { |
| 58 | + expect_block_per_grid = max_blocks_per_grid; |
| 59 | + expected_threads_per_block = |
| 60 | + (active_threads + expect_block_per_grid - 1) / expect_block_per_grid; |
| 61 | + if (expected_threads_per_block > max_threads_per_block) |
| 62 | + throw std::invalid_argument( |
| 63 | + "Threads required for current input exceed for current GPU!"); |
| 64 | + expected_threads_per_block = |
| 65 | + RoundUpToPowerOfTwo(expected_threads_per_block); |
| 66 | + expect_block_per_grid = ((active_threads + expected_threads_per_block - 1) / |
| 67 | + expected_threads_per_block); |
| 68 | + } |
28 | 69 |
|
29 |
| - private: |
30 |
| - dim3 block_(1, 1, 1); |
31 |
| - dim3 grid_(1, 1, 1); |
| 70 | + GPULaunchConfig config; |
| 71 | + config.block.x = expected_threads_per_block; |
| 72 | + config.grid.x = expect_block_per_grid; |
| 73 | + return config; |
32 | 74 | }
|
33 | 75 |
|
34 | 76 | } // namespace utils
|
|
0 commit comments