Low mem 2D convolution for CPU and CUDA (#277)
antodo authored Apr 15, 2021
1 parent 5a7592b commit ff1cc76
Showing 6 changed files with 351 additions and 67 deletions.
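In short: when the convolution descriptor's mem_level is greater than 1, the forward, weight-gradient and input-gradient passes are now computed directly from the input tensor with nested loops instead of lowering it with im2col and multiplying matrices, so the large ptrI/gpuIB lowering buffers are never allocated. The sketch below shows how the low-memory mode would typically be requested from user code; the CS_CPU/CS_GPU signatures and the "low_mem" profile name are assumptions about EDDL's computing-service API, not part of this diff.

// Hypothetical usage sketch (assumed EDDL API): select the "low_mem" computing
// service so that ConvolDescriptor::mem_level ends up > 1 and the direct
// convolution kernels added in this commit are used instead of the im2col path.
#include <eddl/apis/eddl.h>
using namespace eddl;

void build_low_mem(model net) {
    build(net,
          sgd(0.01f),                  // optimizer
          {"soft_cross_entropy"},      // losses
          {"categorical_accuracy"},    // metrics
          CS_CPU(-1, "low_mem"));      // assumed: all threads, low-memory profile
    // CS_GPU({1}, "low_mem") would select the new CUDA kernels instead.
}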
4 changes: 4 additions & 0 deletions include/eddl/hardware/gpu/nn/gpu_tensor_nn_kernels.h
@@ -81,6 +81,10 @@ __global__ void gpu_deltabias_k(float *D, int batch, int r,int c,int nk,float *b
__global__ void gpu_im2col_k(float* I, float *ptrI, int b,int irows,int icols, int idepth, float* K, int nk, int kr,int kc, float* O,int orows,int ocols,int sr,int sc,int padrt,int padrb,int padcl,int padcr,int col2im);
__global__ void gpu_im2col_k_low(float* I, int b, float *ptrI, int irows,int icols, int idepth, float* K, int nk, int kr,int kc, float* O,int orows,int ocols,int sr,int sc,int padrt,int padrb,int padcl,int padcr,int col2im);

const int low_mem_block_size = 256;
__global__ void gpu_low_mem_conv3D(int batch_size, int channels, int image_depth, int image_rows, int image_cols, const float *image, int num_kernels, int kernel_depth, int kernel_rows, int kernel_cols, const float *kernel, int out_depth, int out_rows, int out_cols, float *output, int pad_depth, int pad_row, int pad_col, int stride_depth, int stride_rows, int stride_cols);
__global__ void gpu_low_mem_conv2D_grad(int batch_size, int channels, int image_rows, int image_cols, const float *image, int num_kernels, int kernel_rows, int kernel_cols, float *kernel, int out_rows, int out_cols, const float *delta, int pad_row, int pad_col, int stride_rows, int stride_cols);
__global__ void gpu_low_mem_conv2D_back(int batch_size, int channels, int image_rows, int image_cols, float *image, int num_kernels, int kernel_rows, int kernel_cols, const float *kernel, int out_rows, int out_cols, const float *delta, int pad_row, int pad_col, int stride_rows, int stride_cols);

// GPU: Pool
// MaxPool
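The corresponding kernel definitions and launch code are in one of the changed files not expanded on this page. A minimal sketch of how the declared gpu_low_mem_conv3D kernel could be launched, assuming one thread per output element and the low_mem_block_size constant above, is shown below; the wrapper name and grid computation are illustrative, not copied from the commit.

// Illustrative launch wrapper (hypothetical): one CUDA thread per output
// element, rounded up to whole blocks of low_mem_block_size threads.
void launch_low_mem_conv3D(int batch, int channels,
                           int idepth, int irows, int icols, const float *image,
                           int nk, int kdepth, int krows, int kcols, const float *kernel,
                           int odepth, int orows, int ocols, float *output,
                           int padd, int padr, int padc,
                           int sd, int sr, int sc)
{
    long long total = (long long)batch * nk * odepth * orows * ocols;
    int blocks = (int)((total + low_mem_block_size - 1) / low_mem_block_size);
    gpu_low_mem_conv3D<<<blocks, low_mem_block_size>>>(
        batch, channels, idepth, irows, icols, image,
        nk, kdepth, krows, kcols, kernel,
        odepth, orows, ocols, output,
        padd, padr, padc, sd, sr, sc);
}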
30 changes: 14 additions & 16 deletions src/descriptors/descriptor_conv.cpp
@@ -61,15 +61,13 @@ ConvolDescriptor::~ConvolDescriptor(){
#ifndef cCUDNN
else if (O->isGPU()) {

- if (mem_level>1) {
+ if (mem_level == 1) {
// Lowering
delete gpuIB;
- }
- else {
+ } else if (mem_level == 0) {
// Big tensor with all the batch for lowering
delete gpuIB;
- if (mem_level==0)
- delete gpuOB;
+ delete gpuOB;
}
}
#endif
@@ -161,24 +159,24 @@ void ConvolDescriptor::build(Tensor *A) {
gbias = new Tensor(vector<int>{nk}, I->device);

if (I->isCPU()) {
- // mem for ptr, lowering im2col
- unsigned long int l_size = (unsigned long)(A->shape[0] * r * c) * (unsigned long)(kr * kc * kz);
- ptrI=get_fmem(l_size,"ConvolDescriptor::build");
- matI=Eigen::Map<Eigen::MatrixXf>(ptrI, r*c,kz*kr*kc);
- _profile_add_tensor(A->shape[0] * r * c * kr * kc * kz);
+ if (mem_level < 2) {
+ // mem for ptr, lowering im2col
+ unsigned long int l_size = (unsigned long)(A->shape[0] * r * c) * (unsigned long)(kr * kc * kz);
+ ptrI=get_fmem(l_size,"ConvolDescriptor::build");
+ matI=Eigen::Map<Eigen::MatrixXf>(ptrI, r*c,kz*kr*kc);
+ _profile_add_tensor(A->shape[0] * r * c * kr * kc * kz);
+ }
}
#ifdef cGPU
else if (I->isGPU()) {
#ifndef cCUDNN
- if (mem_level>1) {
+ if (mem_level == 1) {
// Lowering
gpuIB=new Tensor(vector<int>{r*c,kc*kr*kz}, I->device);
- }
- else {
+ } else if (mem_level== 0) {
// Big tensor with all the batch for lowering
gpuIB=new Tensor(vector<int>{A->shape[0]*r*c,kc*kr*kz}, I->device);
- if (mem_level==0)
- gpuOB=new Tensor(vector<int>{z,A->shape[0]*r*c}, I->device);
+ gpuOB=new Tensor(vector<int>{z,A->shape[0]*r*c}, I->device);
}
#endif
// Tensor with variable shared ptr, delete create ptr
@@ -261,7 +259,7 @@ void ConvolDescriptor::resize(int b)
#ifdef cGPU
else if (I->isGPU()) {
#ifndef cCUDNN
- if (mem_level<2)
+ if (mem_level < 1)
gpuIB->resize(b*r*c);
if (mem_level==0) {
delete gpuOB;
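Summarizing the descriptor changes: mem_level 0 keeps batch-wide lowering buffers (gpuIB plus gpuOB on GPU), mem_level 1 keeps a smaller lowering buffer (per-image gpuIB on GPU, the CPU ptrI), and mem_level above 1 now allocates no lowering buffer at all, which is what the new direct-convolution kernels rely on. As a rough illustration of the saving, the batch-wide im2col buffer created in build() holds A->shape[0] * r * c * kr * kc * kz floats; the hypothetical helper below just makes that footprint concrete.

#include <cstdio>

// Hypothetical helper: bytes occupied by the batch-wide im2col lowering buffer
// (l_size floats in ConvolDescriptor::build) that mem_level > 1 avoids.
static size_t im2col_bytes(size_t batch, size_t r, size_t c,
                           size_t kr, size_t kc, size_t kz) {
    return batch * r * c * kr * kc * kz * sizeof(float);
}

int main() {
    // e.g. batch 32, 224x224 output, 3x3 kernels over 64 input channels
    double gib = im2col_bytes(32, 224, 224, 3, 3, 64) / (1024.0 * 1024.0 * 1024.0);
    std::printf("%.1f GiB\n", gib);   // ~3.4 GiB of lowering buffer avoided
    return 0;
}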
195 changes: 164 additions & 31 deletions src/hardware/cpu/nn/cpu_conv.cpp
@@ -106,8 +106,7 @@ void im2col(int b,ConvolDescriptor *D,float *ptrI,int col2im)

}


- void cpu_conv2D(ConvolDescriptor *D)
+ void cpu_im2col_conv2D(ConvolDescriptor *D)
{
_profile(_CPU_CONV2D, 0);
int osize=D->z*D->r*D->c;
@@ -133,23 +132,10 @@ void cpu_conv2D(ConvolDescriptor *D)

matO=matI*matK;
}// batch

- //bias
- if (D->use_bias) {
- #pragma omp parallel for
- for(int b=0;b<D->O->shape[0];b++) {
- float *ptrO=D->O->ptr+(b*osize);
- for(int z=0;z<D->O->shape[1];z++)
- for(int r=0;r<D->O->shape[2];r++)
- for(int c=0;c<D->O->shape[3];c++,ptrO++)
- (*ptrO)+=D->bias->ptr[z];
- }
- }
_profile(_CPU_CONV2D, 1);

}

- void cpu_conv2D_grad(ConvolDescriptor *D)
+ void cpu_im2col_conv2D_grad(ConvolDescriptor *D)
{
_profile(_CPU_CONV2D_GRAD, 0);
//return;
@@ -171,24 +157,10 @@ void cpu_conv2D_grad(ConvolDescriptor *D)

matgK+=matI.transpose()*matD;
}// batch

- //bias

- //#pragma omp parallel for
- if (D->use_bias) {
- for(int b=0;b<D->D->shape[0];b++) {
- float *ptrD=D->D->ptr+(b*osize);
- for(int z=0;z<D->D->shape[1];z++)
- for(int r=0;r<D->D->shape[2];r++)
- for(int c=0;c<D->D->shape[3];c++,ptrD++)
- D->gbias->ptr[z]+=(*ptrD);

- }
- }
_profile(_CPU_CONV2D_GRAD, 1);
}

- void cpu_conv2D_back(ConvolDescriptor *D)
+ void cpu_im2col_conv2D_back(ConvolDescriptor *D)
{
_profile(_CPU_CONV2D_BACK, 0);
int osize=D->z*D->r*D->c;
@@ -217,6 +189,167 @@ void cpu_conv2D_back(ConvolDescriptor *D)
_profile(_CPU_CONV2D_BACK, 1);
}

void cpu_low_mem_conv3D(int batch_size,
int channels, int image_depth, int image_rows, int image_cols, const float *image,
int num_kernels, int kernel_depth, int kernel_rows, int kernel_cols, const float *kernel,
int out_depth, int out_rows, int out_cols, float *output,
int pad_depth, int pad_row, int pad_col,
int stride_depth, int stride_rows, int stride_cols)
{
#pragma omp parallel for
for (int b = 0; b < batch_size; b++)
for (int nk = 0; nk < num_kernels; nk++)
for (int k = 0; k < out_depth; k++)
for (int i = 0; i < out_rows; i++)
for (int j = 0; j < out_cols; j++) {
float s = 0;
for (int z = 0; z < kernel_depth; z++) {
int pz = k * stride_depth + z - pad_depth;
if (pz >= 0 && pz < image_depth)
for (int x = 0; x < kernel_rows; x++) {
int px = i * stride_rows + x - pad_row;
if (px >= 0 && px < image_rows)
for (int y = 0; y < kernel_cols; y++) {
int py = j * stride_cols + y - pad_col;
if (py >= 0 && py < image_cols) {
for (int c = 0; c < channels; c++)
s += kernel[(((nk * channels + c) * kernel_depth + z) * kernel_rows + x) * kernel_cols + y]
* image[(((b * channels + c) * image_depth + pz) * image_rows + px) * image_cols + py];
}
}
}
}
output[(((b * num_kernels + nk) * out_depth + k) * out_rows + i) * out_cols + j] = s;
}
}

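As a quick sanity check of the indexing above, the toy driver below runs one 3x3 image through a single all-ones 2x2 kernel with depth 1, stride 1 and no padding (the same parameters cpu_conv2D passes further down) and should print 12 16 24 28. It is only a sketch that assumes this translation unit is linked in.

#include <cstdio>

// Matches the definition above.
void cpu_low_mem_conv3D(int batch_size,
    int channels, int image_depth, int image_rows, int image_cols, const float *image,
    int num_kernels, int kernel_depth, int kernel_rows, int kernel_cols, const float *kernel,
    int out_depth, int out_rows, int out_cols, float *output,
    int pad_depth, int pad_row, int pad_col,
    int stride_depth, int stride_rows, int stride_cols);

int main() {
    float image[9]  = {1, 2, 3, 4, 5, 6, 7, 8, 9};  // 1x1x1x3x3 input
    float kernel[4] = {1, 1, 1, 1};                 // one 1x1x2x2 all-ones kernel
    float out[4]    = {0, 0, 0, 0};                 // 1x1x1x2x2 output
    cpu_low_mem_conv3D(/*batch*/1, /*channels*/1, /*depth*/1, 3, 3, image,
                       /*kernels*/1, 1, 2, 2, kernel,
                       /*out*/1, 2, 2, out,
                       /*pad*/0, 0, 0, /*stride*/1, 1, 1);
    std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 12 16 24 28
    return 0;
}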
void cpu_conv2D(ConvolDescriptor *D)
{
if (D->mem_level > 1) cpu_low_mem_conv3D(D->I->shape[0],
D->iz, 1, D->ir, D->ic, D->I->ptr,
D->nk, 1, D->kr, D->kc, D->K->ptr,
1, D->r, D->c, D->O->ptr,
0, D->padrt, D->padcl,
1, D->sr, D->sc);
else cpu_im2col_conv2D(D);

int osize=D->z*D->r*D->c;
//bias
if (D->use_bias) {
#pragma omp parallel for
for(int b=0;b<D->O->shape[0];b++) {
float *ptrO=D->O->ptr+(b*osize);
for(int z=0;z<D->O->shape[1];z++)
for(int r=0;r<D->O->shape[2];r++)
for(int c=0;c<D->O->shape[3];c++,ptrO++)
(*ptrO)+=D->bias->ptr[z];
}
}
}

void cpu_low_mem_conv2D_grad(int batch_size,
int channels, int image_rows, int image_cols, const float *image,
int num_kernels, int kernel_rows, int kernel_cols, float *kernel,
int out_rows, int out_cols, const float *delta,
int pad_row, int pad_col,
int stride_rows, int stride_cols)
{
int kernel_size = num_kernels * channels * kernel_rows * kernel_cols;
for (int b = 0; b < batch_size; b++) {
#pragma omp parallel for
/* for (int nk = 0; nk < num_kernels; nk++)
for (int c = 0; c < channels; c++)
for (int x = 0; x < kernel_rows; x++)
for (int y = 0; y < kernel_cols; y++) { */
for (int tid = 0; tid < kernel_size; tid++) {
int nk = tid;
int y = nk % kernel_cols; nk /= kernel_cols;
int x = nk % kernel_rows; nk /= kernel_rows;
int c = nk % channels; nk /= channels;

float s = 0.0;
for (int i = 0; i < out_rows; i++) {
int px = i * stride_rows - pad_row + x;
if (px < 0) continue;
if (px >= image_rows) continue;
for (int j = 0; j < out_cols; j++) {
int py = j * stride_cols - pad_col + y;
if (py < 0) continue;
if (py >= image_cols) continue;
s += image[((b * channels + c) * image_rows + px) * image_cols + py] *
delta[((b * num_kernels + nk) * out_rows + i) * out_cols + j];
}
}
// kernel[(((nk * channels + c) * kernel_rows + x) * kernel_cols) + y] = s;
kernel[tid] += s;
}
}
}

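The flattened tid loop above replaces the four commented-out nested loops: each tid selects one kernel weight, decoded in (y, x, c, nk) order, and the value left in nk after the modulo chain is the kernel index used to address delta. The small round-trip check below is only an illustration of that index math, not part of the commit.

#include <cassert>

// Illustrative decode/encode round trip matching cpu_low_mem_conv2D_grad.
int main() {
    const int num_kernels = 4, channels = 3, kernel_rows = 5, kernel_cols = 5;
    const int kernel_size = num_kernels * channels * kernel_rows * kernel_cols;
    for (int tid = 0; tid < kernel_size; tid++) {
        int t = tid;
        int y = t % kernel_cols; t /= kernel_cols;
        int x = t % kernel_rows; t /= kernel_rows;
        int c = t % channels;    t /= channels;
        int nk = t;              // kernel index, as in the code above
        assert(tid == ((nk * channels + c) * kernel_rows + x) * kernel_cols + y);
    }
    return 0;
}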
void cpu_conv2D_grad(ConvolDescriptor *D)
{
if (D->mem_level > 1) cpu_low_mem_conv2D_grad(D->I->shape[0],
D->iz, D->ir, D->ic, D->I->ptr,
D->nk, D->kr, D->kc, D->gK->ptr,
D->r, D->c, D->D->ptr,
D->padrt, D->padcl,
D->sr, D->sc);
else cpu_im2col_conv2D_grad(D);

//bias
int osize=D->z*D->r*D->c;
//#pragma omp parallel for
if (D->use_bias) {
for(int b=0;b<D->D->shape[0];b++) {
float *ptrD=D->D->ptr+(b*osize);
for(int z=0;z<D->D->shape[1];z++)
for(int r=0;r<D->D->shape[2];r++)
for(int c=0;c<D->D->shape[3];c++,ptrD++)
D->gbias->ptr[z]+=(*ptrD);
}
}
}

void cpu_low_mem_conv2D_back(int batch_size,
int channels, int image_rows, int image_cols, float *image,
int num_kernels, int kernel_rows, int kernel_cols, const float *kernel,
int out_rows, int out_cols, const float *delta,
int pad_row, int pad_col,
int stride_rows, int stride_cols)
{
#pragma omp parallel for
for (int b = 0; b < batch_size; b++)
for (int c = 0; c < channels; c++)
for (int i = 0; i < out_rows; i++)
for (int j = 0; j < out_cols; j++)
for (int x = 0; x < kernel_rows; x++) {
int px = i * stride_rows - pad_row + x;
if (px < 0) continue;
if (px >= image_rows) continue;
for (int y = 0; y < kernel_cols; y++) {
int py = j * stride_cols - pad_col + y;
if (py < 0) continue;
if (py >= image_cols) continue;
float s = 0.0;
for (int nk = 0; nk < num_kernels; nk++)
s += delta[((b * num_kernels + nk) * out_rows + i) * out_cols + j]
* kernel[((nk * channels + c) * kernel_rows + x) * kernel_cols + y];
image[((b * channels + c) * image_rows + px) * image_cols + py] += s;
}
}
}

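Note that cpu_low_mem_conv2D_back scatters into the input-gradient tensor: for every output position and kernel offset it adds the dot product over output channels of delta and the kernel to the corresponding input pixel, and the OpenMP parallelism is only over the batch index, so different threads write disjoint slices of image. The toy check below (a sketch assuming this file is linked in) reuses the 3x3/2x2 example from the forward pass with an all-ones delta and should print the window-coverage counts 1 2 1 / 2 4 2 / 1 2 1.

#include <cstdio>

// Matches the definition above.
void cpu_low_mem_conv2D_back(int batch_size,
    int channels, int image_rows, int image_cols, float *image,
    int num_kernels, int kernel_rows, int kernel_cols, const float *kernel,
    int out_rows, int out_cols, const float *delta,
    int pad_row, int pad_col,
    int stride_rows, int stride_cols);

int main() {
    float grad_in[9] = {0};           // dL/dI, 1x1x3x3, accumulated in place
    float kernel[4]  = {1, 1, 1, 1};  // same all-ones 2x2 kernel
    float delta[4]   = {1, 1, 1, 1};  // dL/dO all ones, 1x1x2x2
    cpu_low_mem_conv2D_back(1, 1, 3, 3, grad_in,
                            1, 2, 2, kernel,
                            2, 2, delta,
                            0, 0, 1, 1);
    for (int r = 0; r < 3; r++)       // expected: 1 2 1 / 2 4 2 / 1 2 1
        std::printf("%g %g %g\n", grad_in[3*r], grad_in[3*r+1], grad_in[3*r+2]);
    return 0;
}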
void cpu_conv2D_back(ConvolDescriptor *D)
{
if (D->mem_level > 1) cpu_low_mem_conv2D_back(D->I->shape[0],
D->iz, D->ir, D->ic, D->ID->ptr,
D->nk, D->kr, D->kc, D->K->ptr,
D->r, D->c, D->D->ptr,
D->padrt, D->padcl,
D->sr, D->sc);
else cpu_im2col_conv2D_back(D);
}


void cpu_conv3D(ConvolDescriptor3D *D){

(Diffs for the remaining changed files are not expanded here.)
