cudart_utils.h
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <raft/error.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <cuda_runtime.h>
#include <execinfo.h>
#include <algorithm>
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <iomanip>
#include <iostream>
#include <mutex>
#include <sstream>
#include <unordered_map>
///@todo: enable once logging has been enabled in raft
//#include "logger.hpp"
namespace raft {
/**
* @brief Exception thrown when a CUDA error is encountered.
*/
struct cuda_error : public raft::exception {
explicit cuda_error(char const* const message) : raft::exception(message) {}
explicit cuda_error(std::string const& message) : raft::exception(message) {}
};
} // namespace raft
/**
* @brief Error checking macro for CUDA runtime API functions.
*
* Invokes a CUDA runtime API function call; if the call does not return
* cudaSuccess, invokes cudaGetLastError() to clear the error and throws an
* exception detailing the CUDA error that occurred.
*
*/
#define CUDA_TRY(call) \
do { \
cudaError_t const status = call; \
if (status != cudaSuccess) { \
cudaGetLastError(); \
std::string msg{}; \
SET_ERROR_MSG( \
msg, "CUDA error encountered at: ", "call='%s', Reason=%s:%s", #call, \
cudaGetErrorName(status), cudaGetErrorString(status)); \
throw raft::cuda_error(msg); \
} \
} while (0)
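/*
* Example usage (illustrative sketch; `d_ptr` and `n` are hypothetical placeholders):
*
*   int* d_ptr = nullptr;
*   CUDA_TRY(cudaMalloc(&d_ptr, n * sizeof(int)));  // throws raft::cuda_error on failure
*/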
/**
* @brief Debug macro to check for CUDA errors
*
* In a non-release build, this macro will synchronize the specified stream
* before error checking. In both release and non-release builds, this macro
* checks for any pending CUDA errors from previous calls. If an error is
* reported, an exception is thrown detailing the CUDA error that occurred.
*
* The intent of this macro is to provide a mechanism for synchronous and
* deterministic execution for debugging asynchronous CUDA execution. It should
* be used after any asynchronous CUDA call, e.g., cudaMemcpyAsync, or an
* asynchronous kernel launch.
*/
#ifndef NDEBUG
#define CHECK_CUDA(stream) CUDA_TRY(cudaStreamSynchronize(stream))
#else
#define CHECK_CUDA(stream) CUDA_TRY(cudaPeekAtLastError())
#endif
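/*
* Example usage (illustrative sketch; `my_kernel`, `grid`, `block`, `d_out`, `n` and `stream`
* are hypothetical placeholders): check for errors right after an asynchronous launch.
*
*   my_kernel<<<grid, block, 0, stream>>>(d_out, n);
*   CHECK_CUDA(stream);  // synchronizes in debug builds, peeks at the last error otherwise
*/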
/** FIXME: temporary alias for cuML compatibility */
#define CUDA_CHECK(call) CUDA_TRY(call)
///@todo: enable this only after we have added logging support in raft
// /**
// * @brief check for cuda runtime API errors but log error instead of raising
// * exception.
// */
#define CUDA_CHECK_NO_THROW(call) \
do { \
cudaError_t const status = call; \
if (cudaSuccess != status) { \
printf("CUDA call='%s' at file=%s line=%d failed with %s\n", #call, \
__FILE__, __LINE__, cudaGetErrorString(status)); \
} \
} while (0)
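/*
* Example usage (illustrative sketch): non-throwing cleanup in a destructor, where raising an
* exception would be unsafe. `buffer_holder` and its member `d_buf_` are hypothetical.
*
*   ~buffer_holder() { CUDA_CHECK_NO_THROW(cudaFree(d_buf_)); }
*/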
namespace raft {
/** Helper method to get the warp size in device code */
__host__ __device__ constexpr inline int warp_size() { return 32; }
__host__ __device__ constexpr inline unsigned int warp_full_mask() {
return 0xffffffff;
}
/**
* @brief A kernel grid configuration construction gadget for simple one-dimensional mapping of
* elements to threads.
*/
class grid_1d_thread_t {
public:
int const block_size{0};
int const num_blocks{0};
/**
* @param overall_num_elements The number of elements the kernel needs to handle/process
* @param num_threads_per_block The grid block size, determined according to the kernel's
* specific features (amount of shared memory necessary, SM functional units use pattern etc.);
* this can't be determined generically/automatically (as opposed to the number of blocks)
* @param max_num_blocks_1d The maximum number of blocks the 1D grid is allowed to contain
* @param elements_per_thread Typically, a single kernel thread processes more than a single
* element; this affects the number of threads the grid must contain
*/
grid_1d_thread_t(size_t overall_num_elements, size_t num_threads_per_block,
size_t max_num_blocks_1d, size_t elements_per_thread = 1)
: block_size(num_threads_per_block),
num_blocks(std::min((overall_num_elements +
(elements_per_thread * num_threads_per_block) - 1) /
(elements_per_thread * num_threads_per_block),
max_num_blocks_1d)) {
RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
"num_threads_per_block / warp_size() must be > 0");
RAFT_EXPECTS(elements_per_thread > 0, "elements_per_thread must be > 0");
}
};
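/*
* Example usage (illustrative sketch, not part of this header): configuring a 1D launch with
* grid_1d_thread_t. The kernel `scale_kernel`, the buffer `d_data`, `n` and `stream` are
* hypothetical placeholders.
*
*   __global__ void scale_kernel(float* data, size_t n, float factor) {
*     size_t i = blockIdx.x * blockDim.x + threadIdx.x;
*     if (i < n) data[i] *= factor;
*   }
*
*   raft::grid_1d_thread_t cfg(n, 256, 65535);  // 256 threads/block, at most 65535 blocks
*   scale_kernel<<<cfg.num_blocks, cfg.block_size, 0, stream>>>(d_data, n, 2.0f);
*   CHECK_CUDA(stream);
*/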
/**
* @brief A kernel grid configuration construction gadget for simple one-dimensional mapping of
* elements to warps.
*/
class grid_1d_warp_t {
public:
int const block_size{0};
int const num_blocks{0};
/**
* @param overall_num_elements The number of elements the kernel needs to handle/process
* @param num_threads_per_block The grid block size, determined according to the kernel's
* specific features (amount of shared memory necessary, SM functional units use pattern etc.);
* this can't be determined generically/automatically (as opposed to the number of blocks)
* @param max_num_blocks_1d The maximum number of blocks the 1D grid is allowed to contain
*/
grid_1d_warp_t(size_t overall_num_elements, size_t num_threads_per_block,
size_t max_num_blocks_1d)
: block_size(num_threads_per_block),
num_blocks(std::min(
(overall_num_elements + (num_threads_per_block / warp_size()) - 1) /
(num_threads_per_block / warp_size()),
max_num_blocks_1d)) {
RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
"num_threads_per_block / warp_size() must be > 0");
}
};
/**
* @brief A kernel grid configuration construction gadget for simple one-dimensional mapping of
* elements to blocks.
*/
class grid_1d_block_t {
public:
int const block_size{0};
int const num_blocks{0};
/**
* @param overall_num_elements The number of elements the kernel needs to handle/process
* @param num_threads_per_block The grid block size, determined according to the kernel's
* specific features (amount of shared memory necessary, SM functional units use pattern etc.);
* this can't be determined generically/automatically (as opposed to the number of blocks)
* @param max_num_blocks_1d The maximum number of blocks the 1D grid is allowed to contain
*/
grid_1d_block_t(size_t overall_num_elements, size_t num_threads_per_block,
size_t max_num_blocks_1d)
: block_size(num_threads_per_block),
num_blocks(std::min(overall_num_elements, max_num_blocks_1d)) {
RAFT_EXPECTS(overall_num_elements > 0, "overall_num_elements must be > 0");
RAFT_EXPECTS(num_threads_per_block / warp_size() > 0,
"num_threads_per_block / warp_size() must be > 0");
}
};
/**
* @brief Generic copy method for all kinds of transfers
* @tparam Type data type
* @param dst destination pointer
* @param src source pointer
* @param len length of the src/dst buffers in terms of number of elements
* @param stream cuda stream
*/
template <typename Type>
void copy(Type* dst, const Type* src, size_t len,
rmm::cuda_stream_view stream) {
CUDA_CHECK(
cudaMemcpyAsync(dst, src, len * sizeof(Type), cudaMemcpyDefault, stream));
}
/**
* @defgroup Copy Copy methods
* These are here along with the generic 'copy' method in order to improve
* code readability using explicitly specified function names
* @{
*/
/** performs a host to device copy */
template <typename Type>
void update_device(Type* d_ptr, const Type* h_ptr, size_t len,
rmm::cuda_stream_view stream) {
copy(d_ptr, h_ptr, len, stream);
}
/** performs a device to host copy */
template <typename Type>
void update_host(Type* h_ptr, const Type* d_ptr, size_t len,
rmm::cuda_stream_view stream) {
copy(h_ptr, d_ptr, len, stream);
}
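/** performs a device to device copy */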
template <typename Type>
void copy_async(Type* d_ptr1, const Type* d_ptr2, size_t len,
rmm::cuda_stream_view stream) {
CUDA_CHECK(cudaMemcpyAsync(d_ptr1, d_ptr2, len * sizeof(Type),
cudaMemcpyDeviceToDevice, stream));
}
/** @} */
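/*
* Example usage of the copy helpers (illustrative sketch; the buffers, `n` and `stream` are
* hypothetical placeholders):
*
*   std::vector<float> h_vec(n, 1.0f);
*   float* d_vec;  // device buffer holding at least n floats (allocation omitted)
*   raft::update_device(d_vec, h_vec.data(), n, stream);  // host -> device
*   raft::update_host(h_vec.data(), d_vec, n, stream);    // device -> host
*   CUDA_CHECK(cudaStreamSynchronize(stream));            // wait before reading h_vec
*/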
/**
* @defgroup Debug Utils for debugging host/device buffers
* @{
*/
template <class T, class OutStream>
void print_host_vector(const char* variable_name, const T* host_mem,
size_t componentsCount, OutStream& out) {
out << variable_name << "=[";
for (size_t i = 0; i < componentsCount; ++i) {
if (i != 0) out << ",";
out << host_mem[i];
}
out << "];\n";
}
template <class T, class OutStream>
void print_device_vector(const char* variable_name, const T* devMem,
size_t componentsCount, OutStream& out) {
T* host_mem = new T[componentsCount];
CUDA_CHECK(cudaMemcpy(host_mem, devMem, componentsCount * sizeof(T),
cudaMemcpyDeviceToHost));
print_host_vector(variable_name, host_mem, componentsCount, out);
delete[] host_mem;
}
/** @} */
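/*
* Example usage (illustrative sketch; `d_weights` and `n` are hypothetical placeholders):
*
*   raft::print_device_vector("weights", d_weights, n, std::cout);
*   // prints, e.g.: weights=[0.1,0.2,0.3];
*/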
static std::mutex mutex_;
static std::unordered_map<void*, size_t> allocations;
template <typename Type>
void allocate(Type*& ptr, size_t len, rmm::cuda_stream_view stream,
bool setZero = false) {
size_t size = len * sizeof(Type);
ptr = (Type*)rmm::mr::get_current_device_resource()->allocate(size, stream);
if (setZero) CUDA_CHECK(cudaMemsetAsync((void*)ptr, 0, size, stream));
std::lock_guard<std::mutex> _(mutex_);
allocations[ptr] = size;
}
template <typename Type>
void deallocate(Type*& ptr, rmm::cuda_stream_view stream) {
std::lock_guard<std::mutex> _(mutex_);
size_t size = allocations[ptr];
allocations.erase(ptr);
rmm::mr::get_current_device_resource()->deallocate((void*)ptr, size, stream);
}
inline void deallocate_all(rmm::cuda_stream_view stream) {
std::lock_guard<std::mutex> _(mutex_);
for (auto& alloc : allocations) {
void* ptr = alloc.first;
size_t size = alloc.second;
rmm::mr::get_current_device_resource()->deallocate(ptr, size, stream);
}
allocations.clear();
}
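/*
* Example usage (illustrative sketch; `n` and `stream` are hypothetical placeholders):
*
*   float* d_scores = nullptr;
*   raft::allocate(d_scores, n, stream, true);  // zero-initialized device buffer of n floats
*   // ... launch kernels that read/write d_scores ...
*   raft::deallocate(d_scores, stream);
*/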
/** helper method to get max usable shared mem per block parameter */
inline int getSharedMemPerBlock() {
int devId;
CUDA_CHECK(cudaGetDevice(&devId));
int smemPerBlk;
CUDA_CHECK(cudaDeviceGetAttribute(&smemPerBlk,
cudaDevAttrMaxSharedMemoryPerBlock, devId));
return smemPerBlk;
}
/** helper method to get multi-processor count parameter */
inline int getMultiProcessorCount() {
int devId;
CUDA_CHECK(cudaGetDevice(&devId));
int mpCount;
CUDA_CHECK(
cudaDeviceGetAttribute(&mpCount, cudaDevAttrMultiProcessorCount, devId));
return mpCount;
}
/** helper method to convert an array on device to a string on host */
template <typename T>
std::string arr2Str(const T* arr, int size, std::string name,
cudaStream_t stream, int width = 4) {
std::stringstream ss;
T* arr_h = (T*)malloc(size * sizeof(T));
update_host(arr_h, arr, size, stream);
CUDA_CHECK(cudaStreamSynchronize(stream));
ss << name << " = [ ";
for (int i = 0; i < size; i++) {
ss << std::setw(width) << arr_h[i];
if (i < size - 1) ss << ", ";
}
ss << " ]" << std::endl;
free(arr_h);
return ss.str();
}
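/*
* Example usage (illustrative sketch; `d_labels` and `stream` are hypothetical placeholders):
*
*   std::cout << raft::arr2Str(d_labels, 10, "labels", stream);
*   // prints, e.g.: labels = [    0,    1,    1, ... ]
*/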
/** this seems to be unused, but may be useful in the future */
template <typename T>
void ASSERT_DEVICE_MEM(T* ptr, std::string name) {
cudaPointerAttributes s_att;
cudaError_t s_err = cudaPointerGetAttributes(&s_att, ptr);
if (s_err != 0 || s_att.device == -1)
std::cout << "Invalid device pointer encountered in " << name
<< ". device=" << s_att.device << ", err=" << s_err << std::endl;
}
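/** returns the current wall-clock time in milliseconds (note: the value is truncated to 32 bits) */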
inline uint32_t curTimeMillis() {
auto now = std::chrono::high_resolution_clock::now();
auto duration = now.time_since_epoch();
return std::chrono::duration_cast<std::chrono::milliseconds>(duration)
.count();
}
/** Helper function to calculate the amount of memory needed to store a dense matrix.
* @param rows number of rows in the matrix
* @param columns number of columns in the matrix
* @return number of elements to allocate via allocate()
* @sa allocate()
*/
inline size_t allocLengthForMatrix(size_t rows, size_t columns) {
return rows * columns;
}
/** Helper function to check the alignment of a pointer.
* @param ptr the pointer to check
* @param alignment the alignment (in bytes) to check against
* @return true if the address is a multiple of alignment
*/
template <typename Type>
bool is_aligned(Type* ptr, size_t alignment) {
return reinterpret_cast<uintptr_t>(ptr) % alignment == 0;
}
/** calculate the greatest common divisor of two numbers
* @param a integer
* @param b integer
* @return gcd of a and b
*/
template <typename IntType>
IntType gcd(IntType a, IntType b) {
while (b != 0) {
IntType tmp = b;
b = a % b;
a = tmp;
}
return a;
}
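/*
* Example usage (illustrative sketch):
*
*   int g = raft::gcd(12, 18);  // g == 6
*/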
} // namespace raft