diff --git a/models/ops/depthavgpooling/build.py b/models/ops/depthavgpooling/build.py
index 8d71e00..0b3272a 100644
--- a/models/ops/depthavgpooling/build.py
+++ b/models/ops/depthavgpooling/build.py
@@ -28,7 +28,8 @@
     define_macros=defines,
     relative_to=__file__,
     with_cuda=with_cuda,
-    extra_objects=extra_objects
+    extra_objects=extra_objects,
+    extra_compile_args=["-std=c++11"]
 )
 
 if __name__ == '__main__':
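Both build.py hunks in this patch make the same change: extra_compile_args is handed through to the compiler by torch.utils.ffi's create_extension, most likely because the TH/THC headers these sources pull in use C++11 constructs. The .c hunks that follow are one mechanical migration: against newer THC headers the THCudaTensor struct is opaque, so every direct t->nDimension / t->size[i] field read becomes an accessor call that takes the THCState handle. A minimal sketch of the pattern, assuming the usual THC include layout; describe() is a made-up function, not part of the patch:

    /* Sketch of the access-pattern migration applied throughout this
     * patch. Assumes THC headers where THCudaTensor's fields are
     * opaque; describe() is illustrative only. */
    #include <stdio.h>
    #include <THC/THC.h>

    extern THCState *state;  /* the extension's global THC handle */

    static void describe(THCudaTensor *t) {
      /* old style, reads struct fields directly (no longer compiles):
       *   int ndim  = t->nDimension;
       *   long rows = t->size[1];
       */
      int ndim  = THCudaTensor_nDimension(state, t); /* tensor rank */
      long rows = THCudaTensor_size(state, t, 1);    /* extent of dim 1 */
      printf("ndim=%d rows=%ld\n", ndim, rows);
    }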
diff --git a/models/ops/depthavgpooling/src/depthavgpooling_cuda.c b/models/ops/depthavgpooling/src/depthavgpooling_cuda.c
index c9b9d36..1ad8bab 100644
--- a/models/ops/depthavgpooling/src/depthavgpooling_cuda.c
+++ b/models/ops/depthavgpooling/src/depthavgpooling_cuda.c
@@ -14,7 +14,7 @@ void shape_check(THCState *state,
   THArgCheck(dW > 0 && dH > 0, 8,
              "stride should be greater than zero, but got dH: %d dW: %d", dH, dW);
 
-  int ndim = input->nDimension;
+  int ndim = THCudaTensor_nDimension(state, input);
   int dimf = 0;
   int dimh = 1;
   int dimw = 2;
@@ -36,16 +36,16 @@
 //            "padW = %d, padH = %d, kW = %d, kH = %d",
 //            padW, padH, kW, kH);
 
-  long nInputPlane = input->size[dimh-1];
-  long nInputRows = input->size[dimh];
-  long nInputCols = input->size[dimw];
+  long nInputPlane = THCudaTensor_size(state, input, dimh-1);
+  long nInputRows = THCudaTensor_size(state, input, dimh);
+  long nInputCols = THCudaTensor_size(state, input, dimw);
   long nOutputRows, nOutputCols;
   long nOutputPlane = nInputPlane;
 
 
 /////////check depth map shape /////////
 
-  int ndim_depth = input_depth->nDimension;
+  int ndim_depth = THCudaTensor_nDimension(state, input_depth);
   int dimf_depth = 0;
   int dimh_depth = 1;
   int dimw_depth = 2;
@@ -59,25 +59,25 @@
   THArgCheck(ndim_depth == 3 || ndim_depth == 4, 3,
              "3D input depth tensor expected but got: %s", ndim);
 
-  long inputHeight_depth = input_depth->size[dimh_depth];
-  long inputWidth_depth = input_depth->size[dimw_depth];
+  long inputHeight_depth = THCudaTensor_size(state, input_depth, dimh_depth);
+  long inputWidth_depth = THCudaTensor_size(state, input_depth, dimw_depth);
 
-  THArgCheck(input_depth->size[1] == 1, 3,
+  THArgCheck(THCudaTensor_size(state, input_depth, 1) == 1, 3,
              "input depth should have only 1 channel",
-             nInputPlane, input->size[1]);
+             nInputPlane, THCudaTensor_size(state, input, 1));
 
   THArgCheck((nInputRows == inputHeight_depth && nInputCols == inputWidth_depth), 3,
              "input image and input depth should be the same size, but got: weightcount(%d,%d), depth(%d,%d)",
              nInputRows, inputHeight_depth, nInputCols, inputWidth_depth);
 
   if (depthweightcount!=NULL){
-    THArgCheck(depthweightcount->size[1] == 1, 3,
+    THArgCheck(THCudaTensor_size(state, depthweightcount, 1) == 1, 3,
               "input depth should have only 1 channel",
-              nInputPlane, input->size[1]);
+              nInputPlane, THCudaTensor_size(state, input, 1));
 
-    THArgCheck((inputHeight_depth == depthweightcount->size[2] && inputWidth_depth == depthweightcount->size[3]), 3,
+    THArgCheck((inputHeight_depth == THCudaTensor_size(state, depthweightcount, 2) && inputWidth_depth == THCudaTensor_size(state, depthweightcount, 3)), 3,
       "input depth and input depthweightcount should be the same size, but got: weightcount(%d,%d), depth(%d,%d)",
-      depthweightcount->size[dimh_depth], depthweightcount->size[dimw_depth], inputHeight_depth, inputWidth_depth);
+      THCudaTensor_size(state, depthweightcount, dimh_depth), THCudaTensor_size(state, depthweightcount, dimw_depth), inputHeight_depth, inputWidth_depth);
   }
 
 //////////////////////////////////////////
@@ -103,14 +103,14 @@ void shape_check(THCState *state,
 //    THCUNN_check_dim_size(state, gradOutput, ndim, dimh, nOutputRows);
 //    THCUNN_check_dim_size(state, gradOutput, ndim, dimw, nOutputCols);
 
-    THArgCheck(gradOutput->size[dimf] == nOutputPlane, 4,
+    THArgCheck(THCudaTensor_size(state, gradOutput, dimf) == nOutputPlane, 4,
                "invalid number of gradOutput planes, expected: %d, but got: %d",
-               nOutputPlane, gradOutput->size[dimf]);
+               nOutputPlane, THCudaTensor_size(state, gradOutput, dimf));
 
-    THArgCheck((gradOutput->size[dimh] == nOutputRows &&
-                gradOutput->size[dimw] == nOutputCols),
+    THArgCheck((THCudaTensor_size(state, gradOutput, dimh) == nOutputRows &&
+                THCudaTensor_size(state, gradOutput, dimw) == nOutputCols),
                4, "invalid size of gradOutput, expected height: %d width: %d , but got height: %d width: %d",
                nOutputRows, nOutputCols,
-               gradOutput->size[dimh], gradOutput->size[dimw]);
+               THCudaTensor_size(state, gradOutput, dimh), THCudaTensor_size(state, gradOutput, dimw));
   }
 }
@@ -133,21 +133,21 @@ int depthavgpooling_forward_cuda(THCudaTensor *input,
 
   long nInputCols, nInputRows, nInputPlane, batchSize;
   long nOutputCols, nOutputRows;
 
-  if (input->nDimension == 3) {
-    nInputCols = input->size[2];
-    nInputRows = input->size[1];
-    nInputPlane = input->size[0];
+  if (THCudaTensor_nDimension(state, input) == 3) {
+    nInputCols = THCudaTensor_size(state, input, 2);
+    nInputRows = THCudaTensor_size(state, input, 1);
+    nInputPlane = THCudaTensor_size(state, input, 0);
     batchSize = 1;
     batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1], input->size[2]);
-    THCudaTensor_resize4d(state, input_depth, 1, input_depth->size[0], input_depth->size[1], input_depth->size[2]);
+    THCudaTensor_resize4d(state, input, 1, THCudaTensor_size(state, input, 0), THCudaTensor_size(state, input, 1), THCudaTensor_size(state, input, 2));
+    THCudaTensor_resize4d(state, input_depth, 1, THCudaTensor_size(state, input_depth, 0), THCudaTensor_size(state, input_depth, 1), THCudaTensor_size(state, input_depth, 2));
   } else {
-    nInputCols = input->size[3];
-    nInputRows = input->size[2];
-    nInputPlane = input->size[1];
-    batchSize = input->size[0];
+    nInputCols = THCudaTensor_size(state, input, 3);
+    nInputRows = THCudaTensor_size(state, input, 2);
+    nInputPlane = THCudaTensor_size(state, input, 1);
+    batchSize = THCudaTensor_size(state, input, 0);
   }
 
   nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
 
@@ -237,22 +237,22 @@ int depthavgpooling_backward_input_cuda(
   int dimRow = 1;
   int batch = 1;
 
-  if (input->nDimension == 3) {
-    nInputPlane = input->size[0];
+  if (THCudaTensor_nDimension(state, input) == 3) {
+    nInputPlane = THCudaTensor_size(state, input, 0);
     batchSize = 1;
     batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1],input->size[2]);
-    THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0], gradOutput->size[1], gradOutput->size[2]);
+    THCudaTensor_resize4d(state, input, 1, THCudaTensor_size(state, input, 0), THCudaTensor_size(state, input, 1),THCudaTensor_size(state, input, 2));
+    THCudaTensor_resize4d(state, gradOutput, 1, THCudaTensor_size(state, gradOutput, 0), THCudaTensor_size(state, gradOutput, 1), THCudaTensor_size(state, gradOutput, 2));
   } else {
     dimCol = 3;
     dimRow = 2;
-    nInputPlane = input->size[1];
-    batchSize = input->size[0];
+    nInputPlane = THCudaTensor_size(state, input, 1);
+    batchSize = THCudaTensor_size(state, input, 0);
   }
 
-  nInputCols = input->size[dimCol];
-  nInputRows = input->size[dimRow];
+  nInputCols = THCudaTensor_size(state, input, dimCol);
+  nInputRows = THCudaTensor_size(state, input, dimRow);
 
   nOutputCols = floor(float(nInputCols - kW + 2*padW) / float(dW)) + 1;
   nOutputRows = floor(float(nInputRows - kH + 2*padH) / float(dH)) + 1;
@@ -269,7 +269,7 @@ int depthavgpooling_backward_input_cuda(
 //  THCUNN_check_dim_size(state, gradOutput, input->nDimension, dimRow, nOutputRows);
 //  THCUNN_check_dim_size(state, gradOutput, input->nDimension, dimCol, nOutputCols);
 
-  THArgCheck((input_depth->size[0] == batchSize), 3, "invalid batch size of input depth");
+  THArgCheck((THCudaTensor_size(state, input_depth, 0) == batchSize), 3, "invalid batch size of input depth");
   THCudaTensor_resizeAs(state, gradInput, input);
 
 //  float* input_depth_data = THCudaTensor_data(state, input_depth);
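The two floor(...) context lines above are the whole of the pooling shape logic: outputSize = floor((inputSize - kernel + 2*pad) / stride) + 1. A standalone worked check with made-up sizes (illustrative, not part of the patch):

    /* Standalone check of the pooling output-size formula used in
     * depthavgpooling_cuda.c; the sizes are illustrative. */
    #include <math.h>
    #include <stdio.h>

    int main(void) {
      long nInputCols = 65, kW = 7, padW = 3, dW = 2;
      long nOutputCols =
          (long)floorf((float)(nInputCols - kW + 2 * padW) / (float)dW) + 1;
      printf("nOutputCols = %ld\n", nOutputCols); /* (65-7+6)/2 + 1 = 33 */
      return 0;
    }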
diff --git a/models/ops/depthconv/build.py b/models/ops/depthconv/build.py
index 1f6aaea..7899ad1 100644
--- a/models/ops/depthconv/build.py
+++ b/models/ops/depthconv/build.py
@@ -28,7 +28,8 @@
     define_macros=defines,
     relative_to=__file__,
     with_cuda=with_cuda,
-    extra_objects=extra_objects
+    extra_objects=extra_objects,
+    extra_compile_args=["-std=c++11"]
 )
 
 if __name__ == '__main__':
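depthconv_cuda.c below repeatedly recomputes the dilated-convolution output extent, outputSize = (inputSize + 2*pad - (dilation*(kernel-1)+1))/stride + 1, where dilation*(kernel-1)+1 is the effective kernel extent. A standalone worked check with illustrative numbers:

    /* Standalone check of the dilated output-size formula used in
     * depthconv_cuda.c; the numbers are illustrative. */
    #include <stdio.h>

    int main(void) {
      long inputHeight = 32, kH = 3, padH = 1, dH = 1, dilationH = 2;
      long effectiveKH = dilationH * (kH - 1) + 1;   /* 2*2 + 1 = 5 */
      long outputHeight = (inputHeight + 2 * padH - effectiveKH) / dH + 1;
      printf("outputHeight = %ld\n", outputHeight); /* (32+2-5)/1 + 1 = 30 */
      return 0;
    }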
diff --git a/models/ops/depthconv/src/depthconv_cuda.c b/models/ops/depthconv/src/depthconv_cuda.c
index 51a78eb..12e6872 100644
--- a/models/ops/depthconv/src/depthconv_cuda.c
+++ b/models/ops/depthconv/src/depthconv_cuda.c
@@ -9,10 +9,10 @@ void shape_check(THCState *state, THCudaTensor *input, THCudaTensor *input_depth
                  int dH, int dW, int padH, int padW, int dilationH, int dilationW) {
 
-  THArgCheck(weight->nDimension == 4, 5,
+  THArgCheck(THCudaTensor_nDimension(state, weight) == 4, 5,
              "4D weight tensor (nOutputPlane,nInputPlane,kH,kW) expected, "
              "but got: %s",
-             weight->nDimension);
+             THCudaTensor_nDimension(state, weight));
 
   THArgCheck(THCudaTensor_isContiguous(state, weight), 5,
              "weight tensor has to be contiguous");
@@ -21,9 +21,9 @@ void shape_check(THCState *state, THCudaTensor *input, THCudaTensor *input_depth
              "kernel size should be greater than zero, but got kH: %d kW: %d", kH,
              kW);
 
-  THArgCheck((weight->size[2] == kH && weight->size[3] == kW), 9,
+  THArgCheck((THCudaTensor_size(state, weight, 2) == kH && THCudaTensor_size(state, weight, 3) == kW), 9,
              "kernel size should be consistent with weight, but got kH: %d kW: %d weight.size(2): %d, weight.size(3): %d", kH,
-             kW, weight->size[2], weight->size[3]);
+             kW, THCudaTensor_size(state, weight, 2), THCudaTensor_size(state, weight, 3));
 
   THArgCheck(dW > 0 && dH > 0, 11,
              "stride should be greater than zero, but got dH: %d dW: %d", dH,
@@ -41,14 +41,14 @@ void shape_check(THCState *state, THCudaTensor *input, THCudaTensor *input_depth
   if (bias != NULL) {
 //    THCUNN_check_dim_size(state, bias, 1, 0, weight->size[0]);
-    THArgCheck(bias->nDimension==1, 6,
-      "Need bias of dimension %d but got %d", 1, bias->nDimension);
-    THArgCheck(bias->size[0]==weight->size[0], 6,
-      "Need bias of size %d but got %d", weight->size[0], bias->size[0]);
+    THArgCheck(THCudaTensor_nDimension(state, bias) == 1, 6,
+      "Need bias of dimension %d but got %d", 1, THCudaTensor_nDimension(state, bias));
+    THArgCheck(THCudaTensor_size(state, bias, 0) == THCudaTensor_size(state, weight, 0), 6,
+      "Need bias of size %d but got %d", THCudaTensor_size(state, weight, 0), THCudaTensor_size(state, bias, 0));
   }
 
 //////////////////////////////////////////
 
-  int ndim = input->nDimension;
+  int ndim = THCudaTensor_nDimension(state, input);
   int dimf = 0;
   int dimh = 1;
   int dimw = 2;
@@ -62,10 +62,11 @@ void shape_check(THCState *state, THCudaTensor *input, THCudaTensor *input_depth
   THArgCheck(ndim == 3 || ndim == 4, 2, "3D or 4D input tensor expected but got: %s",
              ndim);
 
-  long nInputPlane = weight->size[1];
-  long inputHeight = input->size[dimh];
-  long inputWidth = input->size[dimw];
-  long nOutputPlane = weight->size[0];
+  long nInputPlane = THCudaTensor_size(state, weight, 1);
+  long inputHeight = THCudaTensor_size(state, input, dimh);
+  long inputWidth = THCudaTensor_size(state, input, dimw);
+  long nOutputPlane = THCudaTensor_size(state, weight, 0);
+
   long outputHeight =
       (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
   long outputWidth =
@@ -83,7 +84,7 @@ void shape_check(THCState *state, THCudaTensor *input, THCudaTensor *input_depth
 
 /////////check depth map shape /////////
 
-  int ndim_depth = input_depth->nDimension;
+  int ndim_depth = THCudaTensor_nDimension(state, input_depth);
   int dimf_depth = 0;
   int dimh_depth = 1;
   int dimw_depth = 2;
@@ -97,26 +98,27 @@ void shape_check(THCState *state, THCudaTensor *input, THCudaTensor *input_depth
   THArgCheck(ndim_depth == 3 || ndim_depth == 4, 3,
              "3D input depth tensor expected but got: %s", ndim);
 
-  long inputHeight_depth = input_depth->size[dimh_depth];
-  long inputWidth_depth = input_depth->size[dimw_depth];
+  //long inputHeight_depth = input_depth->size[dimh_depth];
+  //long inputWidth_depth = input_depth->size[dimw_depth];
+  long inputHeight_depth = THCudaTensor_size(state, input_depth, dimh_depth);
+  long inputWidth_depth = THCudaTensor_size(state, input_depth, dimw_depth);
 
-  THArgCheck(input_depth->size[1] == 1, 3,
+  THArgCheck(THCudaTensor_size(state, input_depth, 1) == 1, 3,
              "input depth should have only 1 channel",
-             nInputPlane, input->size[1]);
+             nInputPlane, THCudaTensor_size(state, input, 1));
 
   THArgCheck((inputHeight == inputHeight_depth && inputWidth == inputWidth_depth), 3,
              "input image and input depth should be the same size");
 
 //////////////////////////////////////////
 
   if (gradOutput != NULL) {
-    THArgCheck(gradOutput->size[dimf] == nOutputPlane, 4,
+    THArgCheck(THCudaTensor_size(state, gradOutput, dimf) == nOutputPlane, 4,
                "invalid number of gradOutput planes, expected: %d, but got: %d",
-               nOutputPlane, gradOutput->size[dimf]);
-
-    THArgCheck((gradOutput->size[dimh] == outputHeight &&
-                gradOutput->size[dimw] == outputWidth),
+               nOutputPlane, THCudaTensor_size(state, gradOutput, dimf));
+    THArgCheck((THCudaTensor_size(state, gradOutput, dimh) == outputHeight &&
+                THCudaTensor_size(state, gradOutput, dimw) == outputWidth),
                4, "invalid size of gradOutput, expected height: %d width: %d , but got height: %d width: %d",
                outputHeight, outputWidth,
-               gradOutput->size[dimh], gradOutput->size[dimw]);
+               THCudaTensor_size(state, gradOutput, dimh), THCudaTensor_size(state, gradOutput, dimw));
   }
 }
@@ -135,21 +137,21 @@ int depthconv_forward_cuda(THCudaTensor *input, THCudaTensor *input_depth, THCud
   weight = THCudaTensor_newContiguous(state, weight);
 
   int batch = 1;
-  if (input->nDimension == 3) {
+  if (THCudaTensor_nDimension(state, input) == 3) {
     // Force batch
     batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1],
-                          input->size[2]);
-    THCudaTensor_resize4d(state, input_depth, 1, input_depth->size[0], input_depth->size[1],
-                          input_depth->size[2]);
+    THCudaTensor_resize4d(state, input, 1, THCudaTensor_size(state, input, 0), THCudaTensor_size(state, input, 1),
+                          THCudaTensor_size(state, input, 2));
+    THCudaTensor_resize4d(state, input_depth, 1, THCudaTensor_size(state, input_depth, 0), THCudaTensor_size(state, input_depth, 1),
+                          THCudaTensor_size(state, input_depth, 2));
   }
 
-  long batchSize = input->size[0];
-  long nInputPlane = input->size[1];
-  long inputHeight = input->size[2];
-  long inputWidth = input->size[3];
+  long batchSize = THCudaTensor_size(state, input, 0);
+  long nInputPlane = THCudaTensor_size(state, input, 1);
+  long inputHeight = THCudaTensor_size(state, input, 2);
+  long inputWidth = THCudaTensor_size(state, input, 3);
 
-  long nOutputPlane = weight->size[0];
+  long nOutputPlane = THCudaTensor_size(state, weight, 0);
 
   long outputWidth =
       (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
@@ -163,8 +165,8 @@ int depthconv_forward_cuda(THCudaTensor *input, THCudaTensor *input_depth, THCud
   THCudaTensor_resize2d(state, columns, nInputPlane * kW * kH,
                         outputHeight * outputWidth);
 
-  if (ones->nDimension != 2 ||
-      ones->size[0] * ones->size[1] < outputHeight * outputWidth) {
+  if (THCudaTensor_nDimension(state, ones) != 2 ||
+      THCudaTensor_size(state, ones, 0) * THCudaTensor_size(state, ones, 1) < outputHeight * outputWidth) {
     THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
     THCudaTensor_fill(state, ones, 1);
   }
@@ -199,7 +201,7 @@ int depthconv_forward_cuda(THCudaTensor *input, THCudaTensor *input_depth, THCud
         inputWidth, kH, kW, padH, padW, dH, dW, dilationH, dilationW,
         THCudaTensor_data(state, columns));
     long m = nOutputPlane;
-    long n = columns->size[1];
+    long n = THCudaTensor_size(state, columns, 1);
    long k = nInputPlane * kH * kW;
 
     THCudaBlas_Sgemm(state, 'n', 'n', n, m, k, 1.0f,
@@ -244,28 +246,28 @@ int depthconv_backward_input_cuda(
   weight = THCudaTensor_newContiguous(state, weight);
 
   int batch = 1;
-  if (input->nDimension == 3) {
+  if (THCudaTensor_nDimension(state, input) == 3) {
     // Force batch
     batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1],
-                          input->size[2]);
-    THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0],
-                          gradOutput->size[1], gradOutput->size[2]);
+    THCudaTensor_resize4d(state, input, 1, THCudaTensor_size(state, input, 0), THCudaTensor_size(state, input, 1),
+                          THCudaTensor_size(state, input, 2));
+    THCudaTensor_resize4d(state, gradOutput, 1, THCudaTensor_size(state, gradOutput, 0),
+                          THCudaTensor_size(state, gradOutput, 1), THCudaTensor_size(state, gradOutput, 2));
   }
 
-  long batchSize = input->size[0];
-  long nInputPlane = input->size[1];
-  long inputHeight = input->size[2];
-  long inputWidth = input->size[3];
+  long batchSize = THCudaTensor_size(state, input, 0);
+  long nInputPlane = THCudaTensor_size(state, input, 1);
+  long inputHeight = THCudaTensor_size(state, input, 2);
+  long inputWidth = THCudaTensor_size(state, input, 3);
 
-  long nOutputPlane = weight->size[0];
+  long nOutputPlane = THCudaTensor_size(state, weight, 0);
 
   long outputWidth =
       (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
   long outputHeight =
       (inputHeight + 2 * padH - (dilationH * (kH - 1) + 1)) / dH + 1;
 
-  THArgCheck((input_depth->size[0] == batchSize), 3, "invalid batch size of input depth");
+  THArgCheck((THCudaTensor_size(state, input_depth, 0) == batchSize), 3, "invalid batch size of input depth");
   THCudaTensor_resize4d(state, gradInput, batchSize, nInputPlane, inputHeight,
                         inputWidth);
 
@@ -285,7 +287,7 @@ int depthconv_backward_input_cuda(
     THCudaTensor_select(state, gradOutput_n, gradOutput, 0, elt);
 
     long m = nInputPlane * kW * kH;
-    long n = columns->size[1];
+    long n = THCudaTensor_size(state, columns, 1);
     long k = nOutputPlane;
 
     THCudaBlas_Sgemm(state, 'n', 't', n, m, k, 1.0f,
@@ -338,21 +340,21 @@ int depthconv_backward_parameters_cuda(
   gradOutput = THCudaTensor_newContiguous(state, gradOutput);
 
   int batch = 1;
-  if (input->nDimension == 3) {
+  if (THCudaTensor_nDimension(state, input) == 3) {
     // Force batch
     batch = 0;
-    THCudaTensor_resize4d(state, input, 1, input->size[0], input->size[1],
-                          input->size[2]);
-    THCudaTensor_resize4d(state, gradOutput, 1, gradOutput->size[0],
-                          gradOutput->size[1], gradOutput->size[2]);
+    THCudaTensor_resize4d(state, input, 1, THCudaTensor_size(state, input, 0), THCudaTensor_size(state, input, 1),
+                          THCudaTensor_size(state, input, 2));
+    THCudaTensor_resize4d(state, gradOutput, 1, THCudaTensor_size(state, gradOutput, 0),
+                          THCudaTensor_size(state, gradOutput, 1), THCudaTensor_size(state, gradOutput, 2));
   }
 
-  long batchSize = input->size[0];
-  long nInputPlane = input->size[1];
-  long inputHeight = input->size[2];
-  long inputWidth = input->size[3];
+  long batchSize = THCudaTensor_size(state, input, 0);
+  long nInputPlane = THCudaTensor_size(state, input, 1);
+  long inputHeight = THCudaTensor_size(state, input, 2);
+  long inputWidth = THCudaTensor_size(state, input, 3);
 
-  long nOutputPlane = gradWeight->size[0];
+  long nOutputPlane = THCudaTensor_size(state, gradWeight, 0);
 
   long outputWidth =
       (inputWidth + 2 * padW - (dilationW * (kW - 1) + 1)) / dW + 1;
@@ -361,8 +363,8 @@ int depthconv_backward_parameters_cuda(
 
   // Define a buffer of ones, for bias accumulation
-  if (ones->nDimension != 2 ||
-      ones->size[0] * ones->size[1] < outputHeight * outputWidth) {
+  if (THCudaTensor_nDimension(state, ones) != 2 ||
+      THCudaTensor_size(state, ones, 0) * THCudaTensor_size(state, ones, 1) < outputHeight * outputWidth) {
     THCudaTensor_resize2d(state, ones, outputHeight, outputWidth);
     THCudaTensor_fill(state, ones, 1);
   }
 
@@ -386,7 +388,7 @@ int depthconv_backward_parameters_cuda(
 
     long m = nOutputPlane;
     long n = nInputPlane * kW * kH;
-    long k = columns->size[1];
+    long k = THCudaTensor_size(state, columns, 1);
 
     THCudaBlas_Sgemm(state, 't', 'n', n, m, k, scale,
                      THCudaTensor_data(state, columns), k,
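For reference, the m/n/k bookkeeping in the THCudaBlas_Sgemm calls matches the usual im2col formulation: a columns buffer of shape (nInputPlane*kH*kW) x (outputHeight*outputWidth) is multiplied against the weight viewed as nOutputPlane x (nInputPlane*kH*kW). A plain-C sketch of that product on toy sizes (illustrative only; the real calls go through cuBLAS, whose transpose flags and leading dimensions encode the column-major layout):

    /* Toy, row-major version of the forward GEMM's shape bookkeeping:
     * output[m x n] = weight[m x k] * columns[k x n], with
     * m = nOutputPlane, k = nInputPlane*kH*kW, n = outH*outW. */
    #include <stdio.h>

    static void gemm_nn(int m, int n, int k,
                        const float *a, const float *b, float *c) {
      for (int i = 0; i < m; i++)
        for (int j = 0; j < n; j++) {
          float acc = 0.f;
          for (int p = 0; p < k; p++)
            acc += a[i * k + p] * b[p * n + j];
          c[i * n + j] = acc;
        }
    }

    int main(void) {
      enum { nOutputPlane = 2, nInputPlane = 1, kH = 2, kW = 2,
             outH = 2, outW = 2 };
      int m = nOutputPlane, k = nInputPlane * kH * kW, n = outH * outW;
      float weight[nOutputPlane * nInputPlane * kH * kW];
      float columns[nInputPlane * kH * kW * outH * outW];
      float output[nOutputPlane * outH * outW];
      for (int i = 0; i < m * k; i++) weight[i] = 1.f;  /* dummy data */
      for (int i = 0; i < k * n; i++) columns[i] = 2.f;
      gemm_nn(m, n, k, weight, columns, output);
      printf("output[0] = %.1f\n", output[0]);  /* 4 taps * 1 * 2 = 8.0 */
      return 0;
    }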