Low mem 2D convolution for CPU and CUDA (#277)
antodo authored Apr 15, 2021
1 parent 5a7592b commit ff1cc76
Showing 6 changed files with 351 additions and 67 deletions.
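In short: when the convolution descriptor's mem_level is greater than 1, the forward, weight-gradient and input-gradient passes are now computed directly from the input tensor with nested loops instead of lowering it with im2col and multiplying matrices, so the large ptrI/gpuIB lowering buffers are never allocated. The sketch below shows how the low-memory mode would typically be requested from user code; the CS_CPU/CS_GPU signatures and the "low_mem" profile name are assumptions about EDDL's computing-service API, not part of this diff.

// Hypothetical usage sketch (assumed EDDL API): select the "low_mem" computing
// service so that ConvolDescriptor::mem_level ends up > 1 and the direct
// convolution kernels added in this commit are used instead of the im2col path.
#include <eddl/apis/eddl.h>
using namespace eddl;

void build_low_mem(model net) {
    build(net,
          sgd(0.01f),                  // optimizer
          {"soft_cross_entropy"},      // losses
          {"categorical_accuracy"},    // metrics
          CS_CPU(-1, "low_mem"));      // assumed: all threads, low-memory profile
    // CS_GPU({1}, "low_mem") would select the new CUDA kernels instead.
}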
4 changes: 4 additions & 0 deletions include/eddl/hardware/gpu/nn/gpu_tensor_nn_kernels.h
@@ -81,6 +81,10 @@ __global__ void gpu_deltabias_k(float *D, int batch, int r,int c,int nk,float *b
__global__ void gpu_im2col_k(float* I, float *ptrI, int b,int irows,int icols, int idepth, float* K, int nk, int kr,int kc, float* O,int orows,int ocols,int sr,int sc,int padrt,int padrb,int padcl,int padcr,int col2im);
__global__ void gpu_im2col_k_low(float* I, int b, float *ptrI, int irows,int icols, int idepth, float* K, int nk, int kr,int kc, float* O,int orows,int ocols,int sr,int sc,int padrt,int padrb,int padcl,int padcr,int col2im);

const int low_mem_block_size = 256;
__global__ void gpu_low_mem_conv3D(int batch_size, int channels, int image_depth, int image_rows, int image_cols, const float *image, int num_kernels, int kernel_depth, int kernel_rows, int kernel_cols, const float *kernel, int out_depth, int out_rows, int out_cols, float *output, int pad_depth, int pad_row, int pad_col, int stride_depth, int stride_rows, int stride_cols);
__global__ void gpu_low_mem_conv2D_grad(int batch_size, int channels, int image_rows, int image_cols, const float *image, int num_kernels, int kernel_rows, int kernel_cols, float *kernel, int out_rows, int out_cols, const float *delta, int pad_row, int pad_col, int stride_rows, int stride_cols);
__global__ void gpu_low_mem_conv2D_back(int batch_size, int channels, int image_rows, int image_cols, float *image, int num_kernels, int kernel_rows, int kernel_cols, const float *kernel, int out_rows, int out_cols, const float *delta, int pad_row, int pad_col, int stride_rows, int stride_cols);

// GPU: Pool
// MaxPool
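The corresponding kernel definitions and launch code are in one of the changed files not expanded on this page. A minimal sketch of how the declared gpu_low_mem_conv3D kernel could be launched, assuming one thread per output element and the low_mem_block_size constant above, is shown below; the wrapper name and grid computation are illustrative, not copied from the commit.

// Illustrative launch wrapper (hypothetical): one CUDA thread per output
// element, rounded up to whole blocks of low_mem_block_size threads.
void launch_low_mem_conv3D(int batch, int channels,
                           int idepth, int irows, int icols, const float *image,
                           int nk, int kdepth, int krows, int kcols, const float *kernel,
                           int odepth, int orows, int ocols, float *output,
                           int padd, int padr, int padc,
                           int sd, int sr, int sc)
{
    long long total = (long long)batch * nk * odepth * orows * ocols;
    int blocks = (int)((total + low_mem_block_size - 1) / low_mem_block_size);
    gpu_low_mem_conv3D<<<blocks, low_mem_block_size>>>(
        batch, channels, idepth, irows, icols, image,
        nk, kdepth, krows, kcols, kernel,
        odepth, orows, ocols, output,
        padd, padr, padc, sd, sr, sc);
}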
30 changes: 14 additions & 16 deletions src/descriptors/descriptor_conv.cpp
@@ -61,15 +61,13 @@ ConvolDescriptor::~ConvolDescriptor(){
#ifndef cCUDNN
else if (O->isGPU()) {

- if (mem_level>1) {
+ if (mem_level == 1) {
// Lowering
delete gpuIB;
- }
- else {
+ } else if (mem_level == 0) {
// Big tensor with all the batch for lowering
delete gpuIB;
- if (mem_level==0)
- delete gpuOB;
+ delete gpuOB;
}
}
#endif
@@ -161,24 +159,24 @@ void ConvolDescriptor::build(Tensor *A) {
gbias = new Tensor(vector<int>{nk}, I->device);

if (I->isCPU()) {
- // mem for ptr, lowering im2col
- unsigned long int l_size = (unsigned long)(A->shape[0] * r * c) * (unsigned long)(kr * kc * kz);
- ptrI=get_fmem(l_size,"ConvolDescriptor::build");
- matI=Eigen::Map<Eigen::MatrixXf>(ptrI, r*c,kz*kr*kc);
- _profile_add_tensor(A->shape[0] * r * c * kr * kc * kz);
+ if (mem_level < 2) {
+ // mem for ptr, lowering im2col
+ unsigned long int l_size = (unsigned long)(A->shape[0] * r * c) * (unsigned long)(kr * kc * kz);
+ ptrI=get_fmem(l_size,"ConvolDescriptor::build");
+ matI=Eigen::Map<Eigen::MatrixXf>(ptrI, r*c,kz*kr*kc);
+ _profile_add_tensor(A->shape[0] * r * c * kr * kc * kz);
+ }
}
#ifdef cGPU
else if (I->isGPU()) {
#ifndef cCUDNN
- if (mem_level>1) {
+ if (mem_level == 1) {
// Lowering
gpuIB=new Tensor(vector<int>{r*c,kc*kr*kz}, I->device);
- }
- else {
+ } else if (mem_level== 0) {
// Big tensor with all the batch for lowering
gpuIB=new Tensor(vector<int>{A->shape[0]*r*c,kc*kr*kz}, I->device);
- if (mem_level==0)
- gpuOB=new Tensor(vector<int>{z,A->shape[0]*r*c}, I->device);
+ gpuOB=new Tensor(vector<int>{z,A->shape[0]*r*c}, I->device);
}
#endif
// Tensor with variable shared ptr, delete create ptr
@@ -261,7 +259,7 @@ void ConvolDescriptor::resize(int b)
#ifdef cGPU
else if (I->isGPU()) {
#ifndef cCUDNN
- if (mem_level<2)
+ if (mem_level < 1)
gpuIB->resize(b*r*c);
if (mem_level==0) {
delete gpuOB;
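Summarizing the descriptor changes: mem_level 0 keeps batch-wide lowering buffers (gpuIB plus gpuOB on GPU), mem_level 1 keeps a smaller lowering buffer (per-image gpuIB on GPU, the CPU ptrI), and mem_level above 1 now allocates no lowering buffer at all, which is what the new direct-convolution kernels rely on. As a rough illustration of the saving, the batch-wide im2col buffer created in build() holds A->shape[0] * r * c * kr * kc * kz floats; the hypothetical helper below just makes that footprint concrete.

#include <cstdio>

// Hypothetical helper: bytes occupied by the batch-wide im2col lowering buffer
// (l_size floats in ConvolDescriptor::build) that mem_level > 1 avoids.
static size_t im2col_bytes(size_t batch, size_t r, size_t c,
                           size_t kr, size_t kc, size_t kz) {
    return batch * r * c * kr * kc * kz * sizeof(float);
}

int main() {
    // e.g. batch 32, 224x224 output, 3x3 kernels over 64 input channels
    double gib = im2col_bytes(32, 224, 224, 3, 3, 64) / (1024.0 * 1024.0 * 1024.0);
    std::printf("%.1f GiB\n", gib);   // ~3.4 GiB of lowering buffer avoided
    return 0;
}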
195 changes: 164 additions & 31 deletions src/hardware/cpu/nn/cpu_conv.cpp
@@ -106,8 +106,7 @@ void im2col(int b,ConvolDescriptor *D,float *ptrI,int col2im)

}


- void cpu_conv2D(ConvolDescriptor *D)
+ void cpu_im2col_conv2D(ConvolDescriptor *D)
{
_profile(_CPU_CONV2D, 0);
int osize=D->z*D->r*D->c;
@@ -133,23 +132,10 @@ void cpu_conv2D(ConvolDescriptor *D)

matO=matI*matK;
}// batch

- //bias
- if (D->use_bias) {
- #pragma omp parallel for
- for(int b=0;b<D->O->shape[0];b++) {
- float *ptrO=D->O->ptr+(b*osize);
- for(int z=0;z<D->O->shape[1];z++)
- for(int r=0;r<D->O->shape[2];r++)
- for(int c=0;c<D->O->shape[3];c++,ptrO++)
- (*ptrO)+=D->bias->ptr[z];
- }
- }
_profile(_CPU_CONV2D, 1);

}

- void cpu_conv2D_grad(ConvolDescriptor *D)
+ void cpu_im2col_conv2D_grad(ConvolDescriptor *D)
{
_profile(_CPU_CONV2D_GRAD, 0);
//return;
@@ -171,24 +157,10 @@ void cpu_conv2D_grad(ConvolDescriptor *D)

matgK+=matI.transpose()*matD;
}// batch

- //bias

- //#pragma omp parallel for
- if (D->use_bias) {
- for(int b=0;b<D->D->shape[0];b++) {
- float *ptrD=D->D->ptr+(b*osize);
- for(int z=0;z<D->D->shape[1];z++)
- for(int r=0;r<D->D->shape[2];r++)
- for(int c=0;c<D->D->shape[3];c++,ptrD++)
- D->gbias->ptr[z]+=(*ptrD);

- }
- }
_profile(_CPU_CONV2D_GRAD, 1);
}

- void cpu_conv2D_back(ConvolDescriptor *D)
+ void cpu_im2col_conv2D_back(ConvolDescriptor *D)
{
_profile(_CPU_CONV2D_BACK, 0);
int osize=D->z*D->r*D->c;
@@ -217,6 +189,167 @@ void cpu_conv2D_back(ConvolDescriptor *D)
_profile(_CPU_CONV2D_BACK, 1);
}

void cpu_low_mem_conv3D(int batch_size,
int channels, int image_depth, int image_rows, int image_cols, const float *image,
int num_kernels, int kernel_depth, int kernel_rows, int kernel_cols, const float *kernel,
int out_depth, int out_rows, int out_cols, float *output,
int pad_depth, int pad_row, int pad_col,
int stride_depth, int stride_rows, int stride_cols)
{
#pragma omp parallel for
for (int b = 0; b < batch_size; b++)
for (int nk = 0; nk < num_kernels; nk++)
for (int k = 0; k < out_depth; k++)
for (int i = 0; i < out_rows; i++)
for (int j = 0; j < out_cols; j++) {
float s = 0;
for (int z = 0; z < kernel_depth; z++) {
int pz = k * stride_depth + z - pad_depth;
if (pz >= 0 && pz < image_depth)
for (int x = 0; x < kernel_rows; x++) {
int px = i * stride_rows + x - pad_row;
if (px >= 0 && px < image_rows)
for (int y = 0; y < kernel_cols; y++) {
int py = j * stride_cols + y - pad_col;
if (py >= 0 && py < image_cols) {
for (int c = 0; c < channels; c++)
s += kernel[(((nk * channels + c) * kernel_depth + z) * kernel_rows + x) * kernel_cols + y]
* image[(((b * channels + c) * image_depth + pz) * image_rows + px) * image_cols + py];
}
}
}
}
output[(((b * num_kernels + nk) * out_depth + k) * out_rows + i) * out_cols + j] = s;
}
}

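As a quick sanity check of the indexing above, the toy driver below runs one 3x3 image through a single all-ones 2x2 kernel with depth 1, stride 1 and no padding (the same parameters cpu_conv2D passes further down) and should print 12 16 24 28. It is only a sketch that assumes this translation unit is linked in.

#include <cstdio>

// Matches the definition above.
void cpu_low_mem_conv3D(int batch_size,
    int channels, int image_depth, int image_rows, int image_cols, const float *image,
    int num_kernels, int kernel_depth, int kernel_rows, int kernel_cols, const float *kernel,
    int out_depth, int out_rows, int out_cols, float *output,
    int pad_depth, int pad_row, int pad_col,
    int stride_depth, int stride_rows, int stride_cols);

int main() {
    float image[9]  = {1, 2, 3, 4, 5, 6, 7, 8, 9};  // 1x1x1x3x3 input
    float kernel[4] = {1, 1, 1, 1};                 // one 1x1x2x2 all-ones kernel
    float out[4]    = {0, 0, 0, 0};                 // 1x1x1x2x2 output
    cpu_low_mem_conv3D(/*batch*/1, /*channels*/1, /*depth*/1, 3, 3, image,
                       /*kernels*/1, 1, 2, 2, kernel,
                       /*out*/1, 2, 2, out,
                       /*pad*/0, 0, 0, /*stride*/1, 1, 1);
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 12 16 24 28
    return 0;
}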
void cpu_conv2D(ConvolDescriptor *D)
{
if (D->mem_level > 1) cpu_low_mem_conv3D(D->I->shape[0],
D->iz, 1, D->ir, D->ic, D->I->ptr,
D->nk, 1, D->kr, D->kc, D->K->ptr,
1, D->r, D->c, D->O->ptr,
0, D->padrt, D->padcl,
1, D->sr, D->sc);
else cpu_im2col_conv2D(D);

int osize=D->z*D->r*D->c;
//bias
if (D->use_bias) {
#pragma omp parallel for
for(int b=0;b<D->O->shape[0];b++) {
float *ptrO=D->O->ptr+(b*osize);
for(int z=0;z<D->O->shape[1];z++)
for(int r=0;r<D->O->shape[2];r++)
for(int c=0;c<D->O->shape[3];c++,ptrO++)
(*ptrO)+=D->bias->ptr[z];
}
}
}

void cpu_low_mem_conv2D_grad(int batch_size,
int channels, int image_rows, int image_cols, const float *image,
int num_kernels, int kernel_rows, int kernel_cols, float *kernel,
int out_rows, int out_cols, const float *delta,
int pad_row, int pad_col,
int stride_rows, int stride_cols)
{
int kernel_size = num_kernels * channels * kernel_rows * kernel_cols;
for (int b = 0; b < batch_size; b++) {
#pragma omp parallel for
/* for (int nk = 0; nk < num_kernels; nk++)
for (int c = 0; c < channels; c++)
for (int x = 0; x < kernel_rows; x++)
for (int y = 0; y < kernel_cols; y++) { */
for (int tid = 0; tid < kernel_size; tid++) {
int nk = tid;
int y = nk % kernel_cols; nk /= kernel_cols;
int x = nk % kernel_rows; nk /= kernel_rows;
int c = nk % channels; nk /= channels;

float s = 0.0;
for (int i = 0; i < out_rows; i++) {
int px = i * stride_rows - pad_row + x;
if (px < 0) continue;
if (px >= image_rows) continue;
for (int j = 0; j < out_cols; j++) {
int py = j * stride_cols - pad_col + y;
if (py < 0) continue;
if (py >= image_cols) continue;
s += image[((b * channels + c) * image_rows + px) * image_cols + py] *
delta[((b * num_kernels + nk) * out_rows + i) * out_cols + j];
}
}
// kernel[(((nk * channels + c) * kernel_rows + x) * kernel_cols) + y] = s;
kernel[tid] += s;
}
}
}

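The flattened tid loop above replaces the four commented-out nested loops: each tid selects one kernel weight, decoded in (y, x, c, nk) order, and the value left in nk after the modulo chain is the kernel index used to address delta. The small round-trip check below is only an illustration of that index math, not part of the commit.

#include <cassert>

// Illustrative decode/encode round trip matching cpu_low_mem_conv2D_grad.
int main() {
    const int num_kernels = 4, channels = 3, kernel_rows = 5, kernel_cols = 5;
    const int kernel_size = num_kernels * channels * kernel_rows * kernel_cols;
    for (int tid = 0; tid < kernel_size; tid++) {
        int t = tid;
        int y = t % kernel_cols; t /= kernel_cols;
        int x = t % kernel_rows; t /= kernel_rows;
        int c = t % channels;    t /= channels;
        int nk = t;              // kernel index, as in the code above
        assert(tid == ((nk * channels + c) * kernel_rows + x) * kernel_cols + y);
    }
    return 0;
}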
void cpu_conv2D_grad(ConvolDescriptor *D)
{
if (D->mem_level > 1) cpu_low_mem_conv2D_grad(D->I->shape[0],
D->iz, D->ir, D->ic, D->I->ptr,
D->nk, D->kr, D->kc, D->gK->ptr,
D->r, D->c, D->D->ptr,
D->padrt, D->padcl,
D->sr, D->sc);
else cpu_im2col_conv2D_grad(D);

//bias
int osize=D->z*D->r*D->c;
//#pragma omp parallel for
if (D->use_bias) {
for(int b=0;b<D->D->shape[0];b++) {
float *ptrD=D->D->ptr+(b*osize);
for(int z=0;z<D->D->shape[1];z++)
for(int r=0;r<D->D->shape[2];r++)
for(int c=0;c<D->D->shape[3];c++,ptrD++)
D->gbias->ptr[z]+=(*ptrD);
}
}
}

void cpu_low_mem_conv2D_back(int batch_size,
int channels, int image_rows, int image_cols, float *image,
int num_kernels, int kernel_rows, int kernel_cols, const float *kernel,
int out_rows, int out_cols, const float *delta,
int pad_row, int pad_col,
int stride_rows, int stride_cols)
{
#pragma omp parallel for
for (int b = 0; b < batch_size; b++)
for (int c = 0; c < channels; c++)
for (int i = 0; i < out_rows; i++)
for (int j = 0; j < out_cols; j++)
for (int x = 0; x < kernel_rows; x++) {
int px = i * stride_rows - pad_row + x;
if (px < 0) continue;
if (px >= image_rows) continue;
for (int y = 0; y < kernel_cols; y++) {
int py = j * stride_cols - pad_col + y;
if (py < 0) continue;
if (py >= image_cols) continue;
float s = 0.0;
for (int nk = 0; nk < num_kernels; nk++)
s += delta[((b * num_kernels + nk) * out_rows + i) * out_cols + j]
* kernel[((nk * channels + c) * kernel_rows + x) * kernel_cols + y];
image[((b * channels + c) * image_rows + px) * image_cols + py] += s;
}
}
}

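Note that cpu_low_mem_conv2D_back scatters into the input-gradient tensor: for every output position and kernel offset it adds the dot product over output channels of delta and the kernel to the corresponding input pixel, and the OpenMP parallelism is only over the batch index, so different threads write disjoint slices of image. The toy check below (a sketch assuming this file is linked in) reuses the 3x3/2x2 example from the forward pass with an all-ones delta and should print the window-coverage counts 1 2 1 / 2 4 2 / 1 2 1.

#include <cstdio>

// Matches the definition above.
void cpu_low_mem_conv2D_back(int batch_size,
    int channels, int image_rows, int image_cols, float *image,
    int num_kernels, int kernel_rows, int kernel_cols, const float *kernel,
    int out_rows, int out_cols, const float *delta,
    int pad_row, int pad_col,
    int stride_rows, int stride_cols);

int main() {
    float grad_in[9] = {0};           // dL/dI, 1x1x3x3, accumulated in place
    float kernel[4]  = {1, 1, 1, 1};  // same all-ones 2x2 kernel
    float delta[4]   = {1, 1, 1, 1};  // dL/dO all ones, 1x1x2x2
    cpu_low_mem_conv2D_back(1, 1, 3, 3, grad_in,
                            1, 2, 2, kernel,
                            2, 2, delta,
                            0, 0, 1, 1);
    for (int r = 0; r < 3; r++)       // expected: 1 2 1 / 2 4 2 / 1 2 1
        std::printf("%g %g %g\n", grad_in[3*r], grad_in[3*r+1], grad_in[3*r+2]);
    return 0;
}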
void cpu_conv2D_back(ConvolDescriptor *D)
{
if (D->mem_level > 1) cpu_low_mem_conv2D_back(D->I->shape[0],
D->iz, D->ir, D->ic, D->ID->ptr,
D->nk, D->kr, D->kc, D->K->ptr,
D->r, D->c, D->D->ptr,
D->padrt, D->padcl,
D->sr, D->sc);
else cpu_im2col_conv2D_back(D);
}


void cpu_conv3D(ConvolDescriptor3D *D){

(Diffs for the remaining changed files are not expanded here.)
