-
Notifications
You must be signed in to change notification settings - Fork 1
/
hello.cu
80 lines (57 loc) · 1.72 KB
/
hello.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#include <cuda_runtime.h>
#include <math.h>
#include<stdio.h>
#include <stdlib.h>
#include "timer.h"
// Element-wise vector addition: c[i] = a[i] + b[i] for every i in [0, N).
//
// Parameters:
//   a, b - input arrays; each must hold at least N floats and be readable
//          from device code.
//   c    - output array; must hold at least N floats and be writable from
//          device code.
//   N    - number of elements to process; values <= 0 make the kernel a no-op.
//
// Launch: indexing uses only the x dimension of the grid and block. The loop
// below strides by the total number of launched x-threads, so the result is
// correct for any grid size, including grids with fewer than N threads.
// No shared memory is used.
__global__ void vector_add(float *a,float *b, float *c, int N){
    // Treat a non-positive N as "nothing to do" before widening to size_t.
    const size_t count = N > 0 ? static_cast<size_t>(N) : 0;

    // Index math is done in size_t: the 32-bit product blockIdx.x * blockDim.x
    // can exceed INT_MAX in the tail of a large grid, which would turn into a
    // negative int index that still passes an `i < N` test.
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first  = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < count; i += stride){
        c[i] = a[i] + b[i];
    }
}
// Prints a diagnostic and terminates the program if a CUDA runtime call did
// not return cudaSuccess. `what` names the failing call in the message.
static void check_cuda(cudaError_t status, const char *what){
    if (status != cudaSuccess){
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Benchmarks element-wise addition of two float vectors on the CPU and on the
// GPU (via the vector_add kernel), prints the measured times, and checks that
// both implementations produced the same result.
//
// Returns 0 on success and 1 if the GPU result differs from the CPU reference.
// Any failing CUDA runtime call terminates the program with EXIT_FAILURE.
//
// NOTE(review): core::timer comes from "timer.h", which is not visible here.
// This code assumes nanoseconds() returns the time elapsed since start() as a
// floating-point value compatible with "%f" - confirm against timer.h.
int main(){
    // Presumably here so that one-time CUDA initialisation happens before any
    // timer is started and is not charged to the GPU measurements.
    check_cuda(cudaDeviceSynchronize(), "initial cudaDeviceSynchronize");

    const int N = 1000000;
    const size_t Size = N * sizeof(float);

    // Host buffers: two inputs, the CPU reference result, and the GPU result.
    float *a = new float[N];
    float *b = new float[N];
    float *out = new float[N];
    float *out_p = new float[N];

    for (int i = 0; i < N; i++){
        a[i] = static_cast<float>(rand());
        b[i] = static_cast<float>(rand());
    }

    // ---- CPU reference ----
    core::timer cpu_t;
    cpu_t.start();
    for (int i = 0; i < N; i++){
        out[i] = a[i] + b[i];
    }
    printf("Cpu time taken :- %f ns\n",cpu_t.nanoseconds());

    // ---- GPU: allocation + transfers + kernel + free are all inside gpu_total_t ----
    core::timer gpu_total_t;
    gpu_total_t.start();

    float *d_a = NULL, *d_b = NULL, *d_out = NULL;
    check_cuda(cudaMalloc(&d_a, Size), "cudaMalloc(d_a)");
    check_cuda(cudaMalloc(&d_b, Size), "cudaMalloc(d_b)");
    check_cuda(cudaMalloc(&d_out, Size), "cudaMalloc(d_out)");
    check_cuda(cudaMemcpy(d_a,a,Size,cudaMemcpyHostToDevice), "cudaMemcpy(a -> d_a)");
    check_cuda(cudaMemcpy(d_b,b,Size,cudaMemcpyHostToDevice), "cudaMemcpy(b -> d_b)");

    // Launch configuration: each block runs threadsPerBlock threads, and we
    // need at least N threads in total (one per element). The expression
    // below is integer "ceiling division": it rounds N / threadsPerBlock up,
    // so a final, partially used block covers the leftover elements when N is
    // not a multiple of threadsPerBlock.
    int threadsPerBlock = 256;
    int blocksPerGrid = (N + threadsPerBlock - 1) / threadsPerBlock;

    core::timer gpu_t;
    gpu_t.start();
    vector_add<<<blocksPerGrid,threadsPerBlock>>>(d_a,d_b,d_out,N);
    // A kernel launch returns no status itself; an invalid configuration is
    // reported through cudaGetLastError().
    check_cuda(cudaGetLastError(), "vector_add launch");
    // Kernel launches are asynchronous: control returns to the host before the
    // kernel has finished. Wait for completion so the timer measures kernel
    // execution rather than just the cost of enqueuing the launch.
    check_cuda(cudaDeviceSynchronize(), "vector_add execution");
    printf("gpu time taken :- %f ns\n",gpu_t.nanoseconds());

    check_cuda(cudaMemcpy(out_p,d_out,Size,cudaMemcpyDeviceToHost), "cudaMemcpy(d_out -> out_p)");

    check_cuda(cudaFree(d_a), "cudaFree(d_a)");
    check_cuda(cudaFree(d_b), "cudaFree(d_b)");
    check_cuda(cudaFree(d_out), "cudaFree(d_out)");
    // Labelled "total" to distinguish it from the kernel-only time printed above.
    printf("gpu total time taken :- %f ns\n",gpu_total_t.nanoseconds());

    // ---- Verify the GPU result against the CPU reference ----
    // A small relative tolerance is used instead of exact equality.
    int mismatches = 0;
    for (int i = 0; i < N; i++){
        if (fabsf(out_p[i] - out[i]) > 1e-5f * fabsf(out[i])){
            if (mismatches == 0){
                fprintf(stderr, "mismatch at index %d: cpu %f, gpu %f\n", i, out[i], out_p[i]);
            }
            mismatches++;
        }
    }
    if (mismatches != 0){
        fprintf(stderr, "%d of %d elements differ between CPU and GPU results\n", mismatches, N);
    }

    delete[] a;
    delete[] b;
    delete[] out;
    delete[] out_p;

    return mismatches == 0 ? 0 : 1;
}