transpose.cu
/* A simple 2D CUDA kernel that transposes a matrix.
   It is somewhat pointless from a computation standpoint, because matrices here are
   just 1D memory in row-major format. Real libraries like PyTorch do NOT physically
   move data for a transpose; they implement it in terms of the dimensions and strides
   of the array (see the stride-view sketch after the kernel below).
   The point of this toy code is to understand 2D grids in CUDA. */
#include <iostream>
#include <iomanip>
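/* Row-major refresher: element (r, c) of a rows x cols matrix lives at flat offset
   r * cols + c. E.g. with cols = 4, element (2, 1) sits at 2 * 4 + 1 = 9. */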
__global__
void transpose(float *in, float *out, unsigned int rows, unsigned int cols) {
    // Map thread x -> column and thread y -> row of the input matrix.
    unsigned int row = (blockDim.y * blockIdx.y) + threadIdx.y;
    unsigned int col = (blockDim.x * blockIdx.x) + threadIdx.x;
    // Element (row, col) of the input lands at (col, row) of the output.
    unsigned int out_row = col;
    unsigned int out_col = row;
    if ((row < rows) && (col < cols)) {
        unsigned int in_offset = (row * cols) + col;          // input is rows x cols
        unsigned int out_offset = (out_row * rows) + out_col; // output is cols x rows
        out[out_offset] = in[in_offset];
    }
}
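/* As the header comment says, libraries like PyTorch implement transpose as a metadata
   change: swap the shape and strides and leave the buffer untouched. A minimal host-side
   sketch of that idea (View2D and transposed_view are hypothetical helpers for
   illustration, not part of the original code or any library API): */
struct View2D {
    const float *data;       // underlying row-major buffer, never moved
    unsigned int shape[2];   // {rows, cols}
    unsigned int stride[2];  // elements to skip per step along each dimension
    float at(unsigned int r, unsigned int c) const {
        return data[(r * stride[0]) + (c * stride[1])];
    }
};
// Transpose with zero data movement: swap the shape and stride entries.
// A fresh rows x cols matrix has strides {cols, 1}; the view flips them to {1, cols},
// so view.at(r, c) reads the original element (c, r).
View2D transposed_view(const View2D &v) {
    return View2D{v.data, {v.shape[1], v.shape[0]}, {v.stride[1], v.stride[0]}};
}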
int main(void)
{
    // create a simple matrix of size 768 x 1024
    const unsigned int rows = 768;
    const unsigned int cols = 1024;
    constexpr unsigned int total_size = rows * cols;
    float *in = new float[total_size];
    float *out = new float[total_size];
    // fill it with data so we can verify the transpose: each element holds its column index
    for (unsigned int r = 0; r < rows; r++)
    {
        for (unsigned int c = 0; c < cols; c++)
        {
            auto offset = (r * cols) + c;
            in[offset] = (float)c;
        }
    }
    // print the first few rows and columns of the input
    for (unsigned int r = 0; r < 8; r++)
    {
        for (unsigned int c = 0; c < 16; c++)
        {
            auto offset = (r * cols) + c;
            std::cout << std::setfill('0') << std::setw(4) << in[offset] << " ";
        }
        std::cout << std::endl << std::endl;
    }
    dim3 blockDim {16, 16, 1}; // 16 x 16 threads in each block
    unsigned int col_blocks = cols / 16;
    unsigned int row_blocks = rows / 16;
    dim3 gridDim {col_blocks, row_blocks, 1}; // note: x first, then y
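    /* Note: 1024 and 768 are exact multiples of 16, so plain division yields the right
       block counts here. For arbitrary sizes you would round up instead, e.g. (a minimal
       sketch, not in the original code):
           unsigned int col_blocks = (cols + blockDim.x - 1) / blockDim.x;
           unsigned int row_blocks = (rows + blockDim.y - 1) / blockDim.y;
       The bounds check inside the kernel already discards the extra threads. */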
    // moving host mem to cuda. @TODO explore cudaManaged for this
    // (a managed-memory sketch appears after main below)
    float *d_in, *d_out;
    cudaMalloc((void **)&d_in, total_size * sizeof(float));
    cudaMalloc((void **)&d_out, total_size * sizeof(float));
    cudaMemcpy(d_in, in, total_size * sizeof(float), cudaMemcpyHostToDevice);
    transpose<<<gridDim, blockDim>>>(d_in, d_out, rows, cols);
    cudaDeviceSynchronize();
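    /* Not in the original code: checking for CUDA errors is cheap and catches bad launch
       configurations early. cudaGetLastError reports launch errors; the preceding
       cudaDeviceSynchronize surfaces errors from the kernel execution itself. */
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA error: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }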
    // copy the result back
    cudaMemcpy(out, d_out, total_size * sizeof(float), cudaMemcpyDeviceToHost);
    // print the result; out is cols x rows (1024 x 768), so its row stride is `rows`
    std::cout << "Printing result: " << std::endl;
    for (unsigned int out_r = 0; out_r < 16; out_r++)
    {
        for (unsigned int out_c = 0; out_c < 8; out_c++)
        {
            auto offset = (out_r * rows) + out_c;
            std::cout << std::setfill('0') << std::setw(4) << out[offset] << " ";
        }
        std::cout << std::endl << std::endl;
    }
    // cleanup
    cudaFree(d_in);
    cudaFree(d_out);
    delete[] in;
    delete[] out;
}
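/* The @TODO above mentions unified (managed) memory. A minimal sketch of that
   alternative, assuming the same sizes and kernel as main: cudaMallocManaged returns a
   pointer that both host and device can dereference, removing the explicit cudaMemcpy
   calls. This is illustrative only, not part of the original program. */
/*
float *buf_in, *buf_out;
cudaMallocManaged(&buf_in,  total_size * sizeof(float));
cudaMallocManaged(&buf_out, total_size * sizeof(float));
// ... fill buf_in on the host, then launch the kernel on buf_in / buf_out ...
transpose<<<gridDim, blockDim>>>(buf_in, buf_out, rows, cols);
cudaDeviceSynchronize();  // make sure the GPU is done before the host reads buf_out
cudaFree(buf_in);         // managed allocations are also released with cudaFree
cudaFree(buf_out);
*/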