matrixCUDA.cuh
#pragma once
#include "../cpu/vectorCPU.hpp"
#include "kernels.cuh"
#include "vectorCUDA.cuh"
#include <algorithm>
#include <cmath>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <vector>
class MatrixGPU {
protected:
  unsigned h_rows, h_columns, h_nnz, *d_rows, *d_columns;
  int *d_csrRowPtr, *d_csrColInd;
  double *d_csrVal;
  // cuSPARSE descriptor and work buffer (used for transpose)
  cusparseMatDescr_t descr = NULL;
  void *dBuffer = NULL;
  size_t bufferSize = 0;
  // allocate device storage; sizes are taken from the already-set members h_rows and h_nnz
  void allocator(unsigned r, unsigned c, unsigned n) {
    cudaErrCheck(cudaMalloc((void **)&d_rows, sizeof(unsigned)));
    cudaErrCheck(cudaMalloc((void **)&d_columns, sizeof(unsigned)));
    cudaErrCheck(cudaMalloc((void **)&d_csrRowPtr, (h_rows + 1) * sizeof(int)));
    cudaErrCheck(cudaMalloc((void **)&d_csrColInd, h_nnz * sizeof(int)));
    cudaErrCheck(cudaMalloc((void **)&d_csrVal, h_nnz * sizeof(double)));
    cudaErrCheck(cudaMalloc(&dBuffer, bufferSize));
    // cuSPARSE matrix descriptor (used for transpose)
    cusparseErrCheck(cusparseCreateMatDescr(&descr));
    cusparseErrCheck(cusparseSetMatType(descr, CUSPARSE_MATRIX_TYPE_GENERAL));
    cusparseErrCheck(cusparseSetMatIndexBase(descr, CUSPARSE_INDEX_BASE_ZERO));
  };
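  /* CSR layout used throughout this class (a small illustrative example, not part of the API):
   * the 3x4 dense matrix
   *   [ 5 0 0 2 ]
   *   [ 0 0 3 0 ]
   *   [ 1 0 0 4 ]
   * is stored as
   *   csrRowPtr = [0, 2, 3, 5]     // row i occupies entries csrRowPtr[i] .. csrRowPtr[i+1]-1
   *   csrColInd = [0, 3, 2, 0, 3]  // column index of each stored value
   *   csrVal    = [5, 2, 3, 1, 4]  // the nonzero values, row by row
   * denseToCSR below builds exactly these three arrays from a row-major dense input. */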
  // convert a row-major dense host matrix into the device-side CSR arrays above
  void denseToCSR(double *m) {
    int row, col;
    col = row = 0;
    std::vector<int> temp_rowPtr, temp_colIdx;
    temp_rowPtr.push_back(0);
    std::vector<double> temp_vals;
    // scan the dense matrix and build CSR on the host
    for (int i = 0; i < h_rows * h_columns; ++i) {
      if (((int)(i / h_columns)) > row) {
        temp_rowPtr.push_back(h_nnz);
        row = i / h_columns;
      }
      col = i - (row * h_columns);
      if (std::fabs(m[i]) > 1e-15) { // compare magnitudes so negative entries are kept as well
        h_nnz += 1;
        temp_colIdx.push_back(col);
        temp_vals.push_back(m[i]);
      }
    }
    temp_rowPtr.push_back(h_nnz);
    // allocate device storage now that h_nnz is known
    allocator(h_rows, h_columns, h_nnz);
    // copy to device
    cudaErrCheck(cudaMemcpy(d_rows, &h_rows, sizeof(unsigned), cudaMemcpyHostToDevice));
    cudaErrCheck(cudaMemcpy(d_columns, &h_columns, sizeof(unsigned), cudaMemcpyHostToDevice));
    cudaErrCheck(cudaMemcpy(d_csrRowPtr, temp_rowPtr.data(), sizeof(int) * (h_rows + 1), cudaMemcpyHostToDevice));
    cudaErrCheck(cudaMemcpy(d_csrColInd, temp_colIdx.data(), sizeof(int) * h_nnz, cudaMemcpyHostToDevice));
    cudaErrCheck(cudaMemcpy(d_csrVal, temp_vals.data(), sizeof(double) * h_nnz, cudaMemcpyHostToDevice));
  };
public:
  /** Constructors */
  MatrixGPU() : MatrixGPU(0, 0, 0u){}; // Default Constr.
  MatrixGPU(unsigned r, unsigned c) : MatrixGPU(r, c, 0u){}; // Constr. #1
  MatrixGPU(unsigned r, unsigned c, unsigned n) : h_rows(r), h_columns(c), h_nnz(n) { // Constr. #2
    allocator(h_rows, h_columns, h_nnz);
    // copy to device
    cudaErrCheck(cudaMemcpy(d_rows, &h_rows, sizeof(unsigned), cudaMemcpyHostToDevice));
    cudaErrCheck(cudaMemcpy(d_columns, &h_columns, sizeof(unsigned), cudaMemcpyHostToDevice));
    // zero initialize
    cudaErrCheck(cudaMemset(d_csrRowPtr, ZERO, (h_rows + 1) * sizeof(int)));
    cudaErrCheck(cudaMemset(d_csrColInd, ZERO, h_nnz * sizeof(int)));
    cudaErrCheck(cudaMemset(d_csrVal, ZERO, h_nnz * sizeof(double)));
  };
  MatrixGPU(unsigned r, unsigned c, double *m) : h_rows(r), h_columns(c), h_nnz(0) { // Constr. #3
    denseToCSR(m);
  };
  MatrixGPU(const MatrixGPU &m) : h_rows(m.h_rows), h_columns(m.h_columns), h_nnz(m.h_nnz) { // Copy Constr.
    // allocate
    allocator(h_rows, h_columns, h_nnz);
    // copy to device
    cudaErrCheck(cudaMemcpy(d_rows, &h_rows, sizeof(unsigned), cudaMemcpyHostToDevice));
    cudaErrCheck(cudaMemcpy(d_columns, &h_columns, sizeof(unsigned), cudaMemcpyHostToDevice));
    cudaErrCheck(cudaMemcpy(d_csrVal, m.d_csrVal, h_nnz * sizeof(double), cudaMemcpyDeviceToDevice));
    cudaErrCheck(cudaMemcpy(d_csrColInd, m.d_csrColInd, h_nnz * sizeof(int), cudaMemcpyDeviceToDevice));
    cudaErrCheck(cudaMemcpy(d_csrRowPtr, m.d_csrRowPtr, (h_rows + 1) * sizeof(int), cudaMemcpyDeviceToDevice));
  };
  MatrixGPU(MatrixGPU &&m) noexcept : MatrixGPU(m) { // Move Constr.: delegates to the copy constructor, then resets the source to an empty but valid state
    // free the source's old resources
    cudaErrCheck(cudaFree(m.d_csrVal));
    cudaErrCheck(cudaFree(m.d_csrRowPtr));
    cudaErrCheck(cudaFree(m.d_csrColInd));
    // zero initialize the source
    m.h_rows = ZERO;
    m.h_nnz = ZERO;
    m.h_columns = ZERO;
    cudaErrCheck(cudaMalloc((void **)&m.d_csrVal, h_nnz * sizeof(double)));
    cudaErrCheck(cudaMalloc((void **)&m.d_csrColInd, h_nnz * sizeof(int)));
    cudaErrCheck(cudaMalloc((void **)&m.d_csrRowPtr, (h_rows + 1) * sizeof(int)));
    cudaErrCheck(cudaMemcpy(m.d_rows, &m.h_rows, sizeof(unsigned), cudaMemcpyHostToDevice));
    cudaErrCheck(cudaMemcpy(m.d_columns, &m.h_columns, sizeof(unsigned), cudaMemcpyHostToDevice));
    cudaErrCheck(cudaMemset(m.d_csrVal, ZERO, m.h_nnz * sizeof(double)));
    cudaErrCheck(cudaMemset(m.d_csrColInd, ZERO, m.h_nnz * sizeof(int)));
    cudaErrCheck(cudaMemset(m.d_csrRowPtr, ZERO, (m.h_rows + 1) * sizeof(int)));
  };
  /** Destructor */
  virtual ~MatrixGPU() { // virtual: the class is abstract (pure virtual Dnrm2) and may be deleted through a base pointer
    cusparseErrCheck(cusparseDestroyMatDescr(descr));
    cudaErrCheck(cudaFree(dBuffer));
    cudaErrCheck(cudaFree(d_rows));
    cudaErrCheck(cudaFree(d_columns));
    cudaErrCheck(cudaFree(d_csrColInd));
    cudaErrCheck(cudaFree(d_csrRowPtr));
    cudaErrCheck(cudaFree(d_csrVal));
  };
  /** Assignments */
  MatrixGPU &operator=(const MatrixGPU &m) { // Copy Assignment
    if (this == &m) { return *this; } // self-assignment guard
    // free + reallocate device memory only where the sizes changed
    if (h_rows != m.h_rows) {
      h_rows = m.h_rows;
      // reset memory based on rows
      cudaErrCheck(cudaMemcpy(d_rows, m.d_rows, sizeof(unsigned), cudaMemcpyDeviceToDevice));
      cudaErrCheck(cudaFree(d_csrRowPtr));
      cudaErrCheck(cudaMalloc((void **)&d_csrRowPtr, sizeof(int) * (m.h_rows + 1)));
    }
    if (h_nnz != m.h_nnz) {
      h_nnz = m.h_nnz;
      // reset memory based on nnz
      cudaErrCheck(cudaFree(d_csrVal));
      cudaErrCheck(cudaFree(d_csrColInd));
      cudaErrCheck(cudaMalloc((void **)&d_csrVal, sizeof(double) * h_nnz));
      cudaErrCheck(cudaMalloc((void **)&d_csrColInd, sizeof(int) * h_nnz));
    }
    if (h_columns != m.h_columns) {
      h_columns = m.h_columns;
      cudaErrCheck(cudaMemcpy(d_columns, m.d_columns, sizeof(unsigned), cudaMemcpyDeviceToDevice));
    }
    // copy the CSR arrays device-to-device
    cudaErrCheck(cudaMemcpy(d_csrVal, m.d_csrVal, h_nnz * sizeof(double), cudaMemcpyDeviceToDevice));
    cudaErrCheck(cudaMemcpy(d_csrColInd, m.d_csrColInd, h_nnz * sizeof(int), cudaMemcpyDeviceToDevice));
    cudaErrCheck(cudaMemcpy(d_csrRowPtr, m.d_csrRowPtr, (h_rows + 1) * sizeof(int), cudaMemcpyDeviceToDevice));
    return *this;
  };
  MatrixGPU &operator=(MatrixGPU &&m) noexcept { // Move Assignment
    // reuse the copy assignment, then reset the source's host-side sizes
    *this = m;
    // zero initialize the source
    m.h_rows = ZERO;
    m.h_nnz = ZERO;
    m.h_columns = ZERO;
    // freeing the source's memory is handled by its destructor; errors are avoided because rows = cols = 0
    return *this;
  };
  // VectorCUDA operator*(VectorCUDA &v); // Multiplication
  // MatrixGPU transpose();
  int getRows() { return h_rows; };
  int getColumns() { return h_columns; };
  int getNnz() { return h_nnz; };
  virtual double Dnrm2() = 0;
};
class MatrixCUDA : public MatrixGPU {
public:
  /** Inherit the MatrixGPU constructors */
  using MatrixGPU::MatrixGPU;
  /** Operator overloads */
  VectorCUDA operator*(VectorCUDA &v); // SpMV
  /** Member Functions */
  MatrixCUDA transpose(); // Transpose
  double Dnrm2() override; // Euclidean norm
};
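/* Minimal usage sketch (illustration only, not part of the header). It assumes the dense input
 * is row-major on the host; VectorCUDA, needed for the SpMV overload, is declared in
 * vectorCUDA.cuh and is only referenced here, not constructed.
 *
 *   double h_A[4] = {4.0, 1.0, 1.0, 3.0};  // 2x2 dense matrix, row-major
 *   MatrixCUDA A(2, 2, h_A);               // builds the CSR representation on the device
 *   MatrixCUDA At = A.transpose();         // cuSPARSE-backed transpose
 *   double nrm = A.Dnrm2();                // Euclidean norm of the stored values
 *   // A * x (SpMV) additionally needs a VectorCUDA x of length A.getColumns().
 */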