forked from cms-sw/cmssw
-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcudaMathTest.cu
200 lines (172 loc) · 6.96 KB
/
cudaMathTest.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
/**
* Derived from the nVIDIA CUDA 8.0 samples by
*
* Eyal Rozenberg <E.Rozenberg@cwi.nl>
*
* The derivation is specifically permitted in the nVIDIA CUDA Samples EULA
* and the deriver is the owner of this code according to the EULA.
*
* Use this reasonably. If you want to discuss licensing formalities, please
* contact the author.
*
* Modified by VinInn for testing math funcs
*/
/* to run test
foreach f ( $CMSSW_BASE/test/$SCRAM_ARCH/DFM_Vector* )
echo $f; $f
end
*/
#include <algorithm>
#include <cassert>
#include <chrono>
#include <iomanip>
#include <iostream>
#include <memory>
#include <random>
#include <stdexcept>
#ifdef __CUDACC__
#define inline __host__ __device__ inline
#include <vdt/sin.h>
#undef inline
#else
#include <vdt/sin.h>
#endif
#include "DataFormats/Math/interface/approx_log.h"
#include "DataFormats/Math/interface/approx_exp.h"
#include "DataFormats/Math/interface/approx_atan2.h"
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/launch.h"
std::mt19937 eng;
std::mt19937 eng2;
std::uniform_real_distribution<float> rgen(0., 1.);
constexpr float myExp(float x) { return unsafe_expf<6>(x); }
constexpr float myLog(float x) { return unsafe_logf<6>(x); }
__host__ __device__ inline float mySin(float x) { return vdt::fast_sinf(x); }
constexpr int USEEXP = 0, USESIN = 1, USELOG = 2;
template <int USE, bool ADDY = false>
// __host__ __device__
constexpr float testFunc(float x, float y) {
float ret = 0;
if (USE == USEEXP)
ret = myExp(x);
else if (USE == USESIN)
ret = mySin(x);
else
ret = myLog(x);
return ADDY ? ret + y : ret;
}
template <int USE, bool ADDY>
__global__ void vectorOp(const float *A, const float *B, float *C, int numElements) {
int i = blockDim.x * blockIdx.x + threadIdx.x;
if (i < numElements) {
C[i] = testFunc<USE, ADDY>(A[i], B[i]);
}
}
template <int USE, bool ADDY>
void vectorOpH(const float *A, const float *B, float *C, int numElements) {
for (int i = 0; i < numElements; ++i) {
C[i] = testFunc<USE, ADDY>(A[i], B[i]);
}
}
template <int USE, bool ADDY = false>
void go() {
auto start = std::chrono::high_resolution_clock::now();
auto delta = start - start;
int numElements = 200000;
size_t size = numElements * sizeof(float);
std::cout << "[Vector of " << numElements << " elements]\n";
auto h_A = std::make_unique<float[]>(numElements);
auto h_B = std::make_unique<float[]>(numElements);
auto h_C = std::make_unique<float[]>(numElements);
auto h_C2 = std::make_unique<float[]>(numElements);
std::generate(h_A.get(), h_A.get() + numElements, [&]() { return rgen(eng); });
std::generate(h_B.get(), h_B.get() + numElements, [&]() { return rgen(eng); });
delta -= (std::chrono::high_resolution_clock::now() - start);
auto d_A = cms::cuda::make_device_unique<float[]>(numElements, nullptr);
auto d_B = cms::cuda::make_device_unique<float[]>(numElements, nullptr);
auto d_C = cms::cuda::make_device_unique<float[]>(numElements, nullptr);
cudaCheck(cudaMemcpy(d_A.get(), h_A.get(), size, cudaMemcpyHostToDevice));
cudaCheck(cudaMemcpy(d_B.get(), h_B.get(), size, cudaMemcpyHostToDevice));
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda alloc+copy took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;
// Launch the Vector OP CUDA Kernel
int threadsPerBlock = 256;
int blocksPerGrid = (numElements + threadsPerBlock - 1) / threadsPerBlock;
std::cout << "CUDA kernel launch with " << blocksPerGrid << " blocks of " << threadsPerBlock << " threads\n";
delta -= (std::chrono::high_resolution_clock::now() - start);
cms::cuda::launch(
vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;
delta -= (std::chrono::high_resolution_clock::now() - start);
cms::cuda::launch(
vectorOp<USE, ADDY>, {blocksPerGrid, threadsPerBlock}, d_A.get(), d_B.get(), d_C.get(), numElements);
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;
delta -= (std::chrono::high_resolution_clock::now() - start);
cudaCheck(cudaMemcpy(h_C.get(), d_C.get(), size, cudaMemcpyDeviceToHost));
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "cuda copy back took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;
// on host now...
delta -= (std::chrono::high_resolution_clock::now() - start);
vectorOpH<USE, ADDY>(h_A.get(), h_B.get(), h_C2.get(), numElements);
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "host computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;
delta -= (std::chrono::high_resolution_clock::now() - start);
vectorOpH<USE, ADDY>(h_A.get(), h_B.get(), h_C2.get(), numElements);
delta += (std::chrono::high_resolution_clock::now() - start);
std::cout << "host computation took " << std::chrono::duration_cast<std::chrono::milliseconds>(delta).count() << " ms"
<< std::endl;
// Verify that the result vector is correct
double ave = 0;
int maxDiff = 0;
long long ndiff = 0;
double fave = 0;
float fmaxDiff = 0;
for (int i = 0; i < numElements; ++i) {
approx_math::binary32 g, c;
g.f = testFunc<USE, ADDY>(h_A[i], h_B[i]);
c.f = h_C[i];
auto diff = std::abs(g.i32 - c.i32);
maxDiff = std::max(diff, maxDiff);
ave += diff;
if (diff != 0)
++ndiff;
auto fdiff = std::abs(g.f - c.f);
fave += fdiff;
fmaxDiff = std::max(fdiff, fmaxDiff);
// if (diff>7)
// std::cerr << "Large diff at element " << i << ' ' << diff << ' ' << std::hexfloat
// << g.f << "!=" << c.f << "\n";
}
std::cout << "ndiff ave, max " << ndiff << ' ' << ave / numElements << ' ' << maxDiff << std::endl;
std::cout << "float ave, max " << fave / numElements << ' ' << fmaxDiff << std::endl;
if (!ndiff) {
std::cout << "Test PASSED\n";
std::cout << "SUCCESS" << std::endl;
}
cudaDeviceSynchronize();
}
int main() {
cms::cudatest::requireDevices();
try {
go<USEEXP>();
go<USESIN>();
go<USELOG>();
go<USELOG, true>();
} catch (std::runtime_error &ex) {
std::cerr << "CUDA or std runtime error: " << ex.what() << std::endl;
exit(EXIT_FAILURE);
} catch (...) {
std::cerr << "A non-CUDA error occurred" << std::endl;
exit(EXIT_FAILURE);
}
return EXIT_SUCCESS;
}