diff --git a/Makefile b/Makefile
index e95ee067980f..2f2b14bee0a7 100644
--- a/Makefile
+++ b/Makefile
@@ -64,14 +64,14 @@ endif
 #BIN = test/test_threaded_engine test/api_registry_test
 OBJ = narray_function_cpu.o
 # add threaded engine after it is done
-OBJCXX11 = engine.o narray.o c_api.o operator.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o
+OBJCXX11 = flatten_cpu.o engine.o narray.o c_api.o operator.o symbol.o storage.o fully_connected_cpu.o static_graph.o activation_cpu.o graph_executor.o softmax_cpu.o elementwise_sum_cpu.o pooling_cpu.o convolution_cpu.o
 CUOBJ =
 SLIB = lib/libmxnet.so
 ALIB = lib/libmxnet.a
 LIB_DEP = $(DMLC_CORE)/libdmlc.a
 
 ifeq ($(USE_CUDA), 1)
-	CUOBJ += narray_function_gpu.o fully_connected_gpu.o activation_gpu.o elementwise_sum_gpu.o pooling_gpu.o softmax_gpu.o
+	CUOBJ += flatten_gpu.o narray_function_gpu.o fully_connected_gpu.o activation_gpu.o elementwise_sum_gpu.o pooling_gpu.o softmax_gpu.o convolution_gpu.o
 endif
 
 .PHONY: clean all test lint doc
@@ -101,6 +101,10 @@ pooling_cpu.o: src/operator/pooling.cc
 pooling_gpu.o: src/operator/pooling.cu
 softmax_cpu.o: src/operator/softmax.cc
 softmax_gpu.o: src/operator/softmax.cu
+convolution_cpu.o: src/operator/convolution.cc
+convolution_gpu.o: src/operator/convolution.cu
+flatten_cpu.o: src/operator/flatten.cc
+flatten_gpu.o: src/operator/flatten.cu
 
 lib/libmxnet.a: $(OBJ) $(OBJCXX11) $(CUOBJ)
 lib/libmxnet.so: $(OBJ) $(OBJCXX11) $(CUOBJ)
diff --git a/include/mxnet/base.h b/include/mxnet/base.h
index fe260e082148..0d3f81ea7605 100644
--- a/include/mxnet/base.h
+++ b/include/mxnet/base.h
@@ -6,6 +6,7 @@
 #ifndef MXNET_BASE_H_
 #define MXNET_BASE_H_
 #include
+#include
 #include
 #include
 
@@ -45,6 +46,5 @@ typedef mshadow::default_real_t real_t;
 typedef mshadow::TShape TShape;
 /*! \brief storage container type */
 typedef mshadow::TBlob TBlob;
-
 }  // namespace mxnet
 #endif  // MXNET_BASE_H_
diff --git a/python/test_mnist.py b/python/test_mnist.py
index 3a3ee85a8d3f..63153cbe7f19 100644
--- a/python/test_mnist.py
+++ b/python/test_mnist.py
@@ -3,28 +3,14 @@
 import numpy as np
 import os, cPickle, gzip
 
-def Softmax(x):
-    batch, nidden = x.shape
-    maxes = np.max(x, axis=1)
-    x -= maxes.reshape(batch, 1)
-    x = np.exp(x)
-    norm = np.sum(x, axis=1)
-    prob = x / norm.reshape((batch, 1))
-    return prob
-
 def CalAcc(out, label):
     pred = np.argmax(out, axis=1)
     return np.sum(pred == label) * 1.0 / out.shape[0]
 
-def SetGradient(out_grad, label):
-    assert(out_grad.shape[0] == label.shape[0])
-    for i in xrange(label.shape[0]):
-        k = label[i]
-        out_grad[i][k] -= 1.0
 
 # load data
 class MNISTIter(object):
-    def __init__(self, which_set, batch_size=100):
+    def __init__(self, which_set, batch_size=100, flatten=True):
         if not os.path.exists('mnist.pkl.gz'):
             os.system("wget http://deeplearning.net/data/mnist/mnist.pkl.gz")
         f = gzip.open('mnist.pkl.gz', 'rb')
@@ -39,6 +25,7 @@ def __init__(self, which_set, batch_size=100):
         else:
             self.data = test_set[0]
             self.data = np.asarray(test_set[1])
+        self.flatten = flatten
         self.batch_size = batch_size
         self.nbatch = self.data.shape[0] / batch_size
         assert(self.data.shape[0] % batch_size == 0) # I am lazy
@@ -57,25 +44,34 @@ def Get(self):
             raise Exception("Iterator is at end")
         start = self.now_idx * self.batch_size
         end = (self.now_idx + 1) * self.batch_size
-        return (self.data[start:end, :], self.label[start:end])
+        if self.flatten:
+            return (self.data[start:end, :], self.label[start:end])
+        else:
+            return (self.data[start:end, :].reshape(batch_size, 1, 28, 28),
+                    self.label[start:end])
 
 
 # symbol net
 batch_size = 100
 data = mx.symbol.Variable('data')
-fc1 = mx.symbol.FullyConnected(data=data, name='fc1', num_hidden=160)
+fc1 = mx.symbol.Convolution(data = data, name='conv1', nb_filter=32, kernel=(7,7), stride=(2,2), nstep=10, no_bias=1)
 act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
-fc2 = mx.symbol.FullyConnected(data = act1, name='fc2', num_hidden=10)
-args_list = fc2.list_arguments()
+mp = mx.symbol.Pooling(data = act1, name = 'mp', kernel=(2,2), stride=(2,2), pool_type='avg')
+fl = mx.symbol.Flatten(data = mp, name="flatten")
+fc2 = mx.symbol.FullyConnected(data = fl, name='fc2', num_hidden=10)
+softmax = mx.symbol.Softmax(data = fc2, name = 'sm')
+args_list = softmax.list_arguments()
 
 # infer shape
-data_shape = (batch_size, 784)
-arg_shapes, out_shapes = fc2.infer_shape(data=data_shape)
+#data_shape = (batch_size, 784)
+
+data_shape = (batch_size, 1, 28, 28)
+arg_shapes, out_shapes = softmax.infer_shape(data=data_shape)
 arg_narrays = [mx.narray.create(shape) for shape in arg_shapes]
 grad_narrays = [mx.narray.create(shape) for shape in arg_shapes]
 mom_narrays = [mx.narray.create(shape) for shape in arg_shapes]
 inputs = dict(zip(args_list, arg_narrays))
-
+print zip(args_list, arg_shapes)
 np.random.seed(0)
 # set random weight
 for name, narray in inputs.items():
@@ -87,7 +83,7 @@ def Get(self):
 req = ['write_to' for i in range(len(arg_narrays))]
 # bind executer
 # TODO(bing): think of a better bind interface
-executor = fc2.bind(mx.Context('cpu'), arg_narrays, grad_narrays, req)
+executor = softmax.bind(mx.Context('cpu'), arg_narrays, grad_narrays, req)
 # update
 
 out_narray = executor.heads()[0]
@@ -104,8 +100,8 @@ def Update(mom, grad, weight):
 
 block = zip(mom_narrays, grad_narrays, arg_narrays)
 
-train = MNISTIter("train", batch_size)
-valid = MNISTIter("valid", batch_size) +train = MNISTIter("train", batch_size, False) +valid = MNISTIter("valid", batch_size, False) for i in xrange(epoch): # train @@ -115,11 +111,10 @@ def Update(mom, grad, weight): while train.Next(): data, label = train.Get() inputs["data"].numpy[:] = data + inputs["sm_label"].numpy[:] = label executor.forward() - out_narray.numpy[:] = Softmax(out_narray.numpy) train_acc += CalAcc(out_narray.numpy, label) grad_narray.numpy[:] = out_narray.numpy - SetGradient(grad_narray.numpy, label) executor.backward([grad_narray]) for mom, grad, weight in block: diff --git a/src/operator/convolution-inl.h b/src/operator/convolution-inl.h new file mode 100644 index 000000000000..e96d81023ef5 --- /dev/null +++ b/src/operator/convolution-inl.h @@ -0,0 +1,317 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file convolution-inl.h + * \brief + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_CONVOLUTION_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + + +namespace mxnet { +namespace op { + +enum FullyConnectedOpInputs {kData, kWeight, kBias}; +enum FullyConnectedOpOutputs {kOut}; + +struct ConvolutionParam : public dmlc::Parameter { + TShape kernel; + TShape stride; + TShape pad; + int nb_filter; + int nb_group; + uint32_t nstep; + bool no_bias; + DMLC_DECLARE_PARAMETER(ConvolutionParam) { + int shape[] = {1, 1}; + DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (y, x)"); + DMLC_DECLARE_FIELD(stride).describe("convolution stride: (y, x)") + .set_default(TShape(shape, shape + 2)); + shape[0] = shape[1] = 0; + DMLC_DECLARE_FIELD(pad).describe("pad for convolution: (y, x)") + .set_default(TShape(shape, shape + 2)); + DMLC_DECLARE_FIELD(nb_filter).describe("convolution filter(channel) number") + .set_range(1, 100000); + DMLC_DECLARE_FIELD(nb_group).set_default(1) + .describe("number of groups partition"); + DMLC_DECLARE_FIELD(nstep) + .describe("process n images once").set_default(2).set_range(1, 10000); + DMLC_DECLARE_FIELD(no_bias).set_default(false) + .describe("Whether to disable bias parameter."); + } +}; + +template +class ConvolutionOp : public Operator { + public: + explicit ConvolutionOp(ConvolutionParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(req[kOut], kWriteTo); + size_t expected = param_.no_bias ? 
2 : 3; + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(out_data.size(), 1); + // TODO(bing): check the BLAS Handle, be careful + Stream *s = ctx.get_stream(); + Tensor data = in_data[kData].get(s); + Tensor wmat = in_data[kWeight].get(s); + Tensor out = out_data[kOut].get(s); + this->InitTemp(data.shape_, out.shape_); + const index_t nbatch = data.size(0); + for (index_t i = 0; i < nbatch; i += param_.nstep) { + const index_t step = std::min(param_.nstep, nbatch - i); + temp_col_.Resize(mshadow::Shape2(shape_colunit_[0], + shape_colunit_[1] * step)); + temp_dst_.Resize(mshadow::Shape3(shape_dstunit_[0], + shape_dstunit_[1], + shape_dstunit_[2] * step)); + if (param_.pad[0] == 0 && param_.pad[1] == 0) { + temp_col_ = unpack_patch2col(data.Slice(i, i + step), + param_.kernel[0], + param_.kernel[1], + param_.stride[0]); + // TODO(bing): make mshadow support dual stride + } else { + temp_col_ = unpack_patch2col(pad(data.Slice(i, i + step), + param_.pad[0], param_.pad[1]), + param_.kernel[0], + param_.kernel[1], + param_.stride[0]); + // TODO(bing): make mshadow support dual stride + } + const index_t gstride = temp_col_.size(0) / param_.nb_group; + for (int gid = 0; gid < param_.nb_group; ++gid) { + mshadow::Tensor tmpc = temp_col_.Slice(gstride * gid, + gstride * (gid + 1)); + temp_dst_[gid] = dot(wmat[gid], tmpc); + } + out.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst_, + mshadow::Shape4(param_.nb_filter, + step, + out.size(2), + out.size(3)))); + } + if (!param_.no_bias) { + // add bias, broadcast bias to dim 1: channel + Tensor bias = in_data[kBias].get(s); + out += broadcast<1>(bias, out.shape_); + } + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + // TODO(bing): check the BLAS Handle, be careful + CHECK_EQ(out_grad.size(), 1); + size_t expected = param_.no_bias == 0 ? 
3 : 2; + CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(req.size(), expected); + // get data + Stream *s = ctx.get_stream(); + Tensor data = in_data[kData].get(s); + Tensor wmat = in_data[kWeight].get(s); + Tensor grad = out_grad[kOut].get(s); + Tensor gdata = in_grad[kData].get(s); + Tensor gwmat = in_grad[kWeight].get(s); + this->InitTemp(data.shape_, grad.shape_); + const index_t nbatch = data.size(0); + for (index_t i = 0; i < nbatch; i += param_.nstep) { + const index_t step = std::min(param_.nstep, nbatch - i); + temp_col_.Resize(mshadow::Shape2(shape_colunit_[0], \ + shape_colunit_[1] * step)); + temp_dst_.Resize(mshadow::Shape3(shape_dstunit_[0], \ + shape_dstunit_[1], shape_dstunit_[2] * step)); + temp_dst_ = reshape(swapaxis<1, 0>(grad.Slice(i, i + step)), temp_dst_.shape_); + if (param_.pad[0] == 0 && param_.pad[1] == 0) { + // TODO(bing): dual stride + temp_col_ = unpack_patch2col(data.Slice(i, i + step), \ + param_.kernel[0], \ + param_.kernel[1], \ + param_.stride[0]); + } else { + // TODO(bing): dual stride + temp_col_ = unpack_patch2col(pad(data.Slice(i, i + step), param_.pad[0], param_.pad[1]), \ + param_.kernel[0], \ + param_.kernel[1], \ + param_.stride[0]); + } + const index_t gstride = temp_col_.size(0) / param_.nb_group; + for (int gid = 0; gid < param_.nb_group; ++gid) { + mshadow::Tensor tmpc = temp_col_.Slice(gstride * gid, gstride * (gid + 1)); + gwmat[gid] += dot(temp_dst_[gid], tmpc.T()); + } + if (req[kData] == kWriteTo) { + for (int gid = 0; gid < param_.nb_group; ++gid) { + mshadow::Tensor tmpc = temp_col_.Slice(gstride * gid, gstride * (gid + 1)); + tmpc = dot(wmat[gid].T(), temp_dst_[gid]); + } + if (param_.pad[0] == 0 && param_.pad[1] == 0) { + gdata.Slice(i, i + step) = pack_col2patch(temp_col_, \ + data.Slice(i, i + step).shape_, \ + param_.kernel[0], \ + param_.kernel[1], \ + param_.stride[0]); + } else { + mshadow::Shape<4> pshape = data.Slice(i, i + step).shape_; + pshape[2] += 2 * param_.pad[0]; + pshape[3] += 2 * param_.pad[1]; + gdata.Slice(i, i + step) = crop(pack_col2patch(temp_col_, \ + pshape, \ + param_.kernel[0], \ + param_.kernel[1], \ + param_.stride[0]), \ + gdata[i][0].shape_); + } + } + } + if (!param_.no_bias) { + Tensor gbias = in_grad[kBias].get(s); + // Assign(gbias, req[kBias], sumall_except_dim<1>(grad); + gbias += sumall_except_dim<1>(grad); + } + } + + private: + // TODO(bing): use global resource allocator + inline void InitTemp(const mshadow::Shape<4> &ishape, + const mshadow::Shape<4> &oshape) { + const int ksize_y = param_.kernel[0]; + const int ksize_x = param_.kernel[1]; + shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x, + oshape[2] * oshape[3]); + shape_dstunit_ = mshadow::Shape3(param_.nb_group, + param_.nb_filter / param_.nb_group, + oshape[2] * oshape[3]); + int nop = (ishape[0] + param_.nstep - 1) / param_.nstep; + param_.nstep = (ishape[0] + nop - 1) / nop; + temp_col_.Resize(mshadow::Shape2(shape_colunit_[0], + shape_colunit_[1] * param_.nstep)); + temp_dst_.Resize(mshadow::Shape3(shape_dstunit_[0], + shape_dstunit_[1], + shape_dstunit_[2] * param_.nstep)); + } + + ConvolutionParam param_; + // TODO(bing): use global resource allocator + mshadow::TensorContainer temp_col_; + mshadow::TensorContainer temp_dst_; + mshadow::Shape<2> shape_colunit_; + mshadow::Shape<3> shape_dstunit_; +}; // class ConvolutionOp + +template +Operator* CreateOp(ConvolutionParam param); + +#if DMLC_USE_CXX11 +class ConvolutionProp : public OperatorProperty { + public: + virtual std::vector 
ListArguments() const { + if (!param_.no_bias) { + return {"data", "weight", "bias"}; + } else { + return {"data", "weight"}; + } + } + + virtual void Init(const std::vector >& kwargs) { + param_.Init(kwargs); + } + + virtual bool InferShape(std::vector *in_shape, + std::vector *out_shape) const { + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]"; + } + const TShape &dshape = (*in_shape)[kData]; + if (dshape.ndim() == 0) return false; + CHECK_EQ(dshape.ndim(), 4) \ + << "Input data should be 4D in batch-nb_filter-y-x"; + SHAPE_ASSIGN_CHECK(*in_shape, \ + kWeight, \ + Shape3(param_.nb_group, \ + param_.nb_filter / param_.nb_group, \ + dshape[1] / param_.nb_group * param_.kernel[0] * param_.kernel[1])); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, kBias, Shape1(param_.nb_filter)); + } + out_shape->clear(); + out_shape->push_back(dshape); + const index_t ksize_y = static_cast(param_.kernel[0]); + const index_t ksize_x = static_cast(param_.kernel[1]); + const index_t kstride = static_cast(param_.stride[0]); + // TODO(bing) : support dual stride + CHECK_EQ(dshape[1] % param_.nb_group, 0) \ + << "input nb_filter must divide group size"; + CHECK_EQ(param_.nb_filter % param_.nb_group, 0) \ + << "output nb_filter must divide group size"; + CHECK_GE(param_.kernel.Size(), 0) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GE(param_.stride.Size(), 0) \ + << "incorrect stride size: " << param_.stride; + CHECK(ksize_x <= dshape[3] && ksize_y <= dshape[2]) + << "kernel size exceed input"; + (*out_shape)[kOut][1] = param_.nb_filter; + (*out_shape)[kOut][2] = (dshape[2] + 2 * param_.pad[0] - ksize_y) / kstride + 1; + (*out_shape)[kOut][3] = (dshape[3] + 2 * param_.pad[1] - ksize_x) / kstride + 1; + return true; + } + + virtual OperatorProperty* Copy() const { + auto ptr = new ConvolutionProp(); + ptr->param_ = param_; + return ptr; + } + + virtual std::string TypeString() const { + return "Convolution"; + } + + virtual std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const { + return {out_grad[kOut], in_data[kData], in_data[kWeight]}; + } + + virtual std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const { + return {{in_data[kData], in_grad[kData]}}; + } + + Operator* CreateOperator(Context ctx) const; + + private: + ConvolutionParam param_; +}; // class ConvolutionProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_CONVOLUTION_INL_H_ diff --git a/src/operator/convolution.cc b/src/operator/convolution.cc new file mode 100644 index 000000000000..e78533d0a985 --- /dev/null +++ b/src/operator/convolution.cc @@ -0,0 +1,32 @@ +/*! 
+ * Copyright (c) 2015 by Contributors + * \file convolution.cc + * \brief + * \author Bing Xu +*/ + +#include "./convolution-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(ConvolutionParam param) { + return new ConvolutionOp(param); +} + +Operator* ConvolutionProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateOp, param_); +} + +DMLC_REGISTER_PARAMETER(ConvolutionParam); + +MXNET_REGISTER_OP_PROPERTY(Convolution, ConvolutionProp) +.describe("Apply convolution to input then add a bias.") +.add_argument("data", "Symbol", "Input data to the ConvolutionOp.") +.add_argument("weight", "Symbol", "Weight matrix.") +.add_argument("bias", "Symbol", "Bias parameter.") +.add_arguments(ConvolutionParam::__FIELDS__()); + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/convolution.cu b/src/operator/convolution.cu new file mode 100644 index 000000000000..4f0a3ce78b45 --- /dev/null +++ b/src/operator/convolution.cu @@ -0,0 +1,19 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file convolution.cu + * \brief + * \author Bing Xu +*/ + +#include "./convolution-inl.h" + +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(ConvolutionParam param) { + return new ConvolutionOp(param); +} + +} // namespace op +} // namespace mxnet + diff --git a/src/operator/flatten-inl.h b/src/operator/flatten-inl.h new file mode 100644 index 000000000000..da4110296909 --- /dev/null +++ b/src/operator/flatten-inl.h @@ -0,0 +1,101 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file flatten-inl.h + * \brief + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_FLATTEN_INL_H_ +#define MXNET_OPERATOR_FLATTEN_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" + +namespace mxnet { +namespace op { + +enum FlattenOpInputs {kData}; +enum FlattenOpOutputs {kOut}; + +template +class FlattenOp : public Operator { + public: + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(req.size(), 1); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + Tensor data = in_data[kData].get(s); + Tensor out = out_data[kOut].get(s); + Assign(out, req[kOut], reshape(data, out.shape_)); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + Tensor grad_out = out_grad[kData].get(s); + Tensor grad_in = in_grad[kOut].get(s); + Assign(grad_in, req[kData], reshape(grad_out, grad_in.shape_)); + } +}; // class FlattenOp + +template +Operator* CreateOp(); + +#if DMLC_USE_CXX11 +class FlattenProp : public OperatorProperty { + public: + FlattenProp() {} + + virtual void Init(const std::vector >& kwargs) {} + + virtual std::string TypeString() const { + return "Flatten"; + } + + virtual bool InferShape(std::vector *in_shape, + std::vector *out_shape) const { + CHECK_EQ(in_shape->size(), 1) << "Input: [data]"; + const TShape &dshape = in_shape->at(kData); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + out_shape->push_back(mshadow::Shape4(dshape[0], 1, 1, dshape[1] * dshape[2] * dshape[3])); + return true; + } + + virtual OperatorProperty* Copy() const { + auto ptr = new 
FlattenProp(); + return ptr; + } + + virtual std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const { + return {out_grad[kOut]}; + } + + Operator* CreateOperator(Context ctx) const; +}; // class FlattenProp +#endif // DMLC_USE_CXX11 + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_FLATTEN_INL_H_ diff --git a/src/operator/flatten.cc b/src/operator/flatten.cc new file mode 100644 index 000000000000..db156def8ca2 --- /dev/null +++ b/src/operator/flatten.cc @@ -0,0 +1,27 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file flatten.cc + * \brief + * \author Bing Xu +*/ + +#include "./flatten-inl.h" + + +namespace mxnet { +namespace op { +template<> +Operator *CreateOp() { + return new FlattenOp(); +} + +Operator* FlattenProp::CreateOperator(Context ctx) const { + DO_BIND_DISPATCH(CreateOp); +} + +MXNET_REGISTER_OP_PROPERTY(Flatten, FlattenProp) +.add_argument("data", "Symbol", "Input data to flatten.") +.describe("Flatten 4D input to form batch-1-1-feature format"); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/flatten.cu b/src/operator/flatten.cu new file mode 100644 index 000000000000..5bf9d47c5691 --- /dev/null +++ b/src/operator/flatten.cu @@ -0,0 +1,19 @@ +/*! + * Copyright (c) 2015 by Contributors + * \file flatten.cc + * \brief + * \author Bing Xu +*/ + +#include "./flatten-inl.h" + + +namespace mxnet { +namespace op { +template<> + Operator *CreateOp() { + return new FlattenOp(); +} + +} // namespace op +} // namespace mxnet diff --git a/src/operator/fully_connected-inl.h b/src/operator/fully_connected-inl.h index 8f4efa6f6b3f..ac5fd992cd82 100644 --- a/src/operator/fully_connected-inl.h +++ b/src/operator/fully_connected-inl.h @@ -60,6 +60,7 @@ class FullyConnectedOp : public Operator { CHECK_EQ(out_data.size(), 1); // TODO(bing): check the BLAS Handle, be careful // maybe need blas handle from context + // TODO(bing): judge shape to remove flatten op Stream *s = ctx.get_stream(); Tensor data = in_data[kData].FlatTo2D(s); Tensor wmat = in_data[kWeight].get(s); @@ -135,21 +136,13 @@ class FullyConnectedProp : public OperatorProperty { } else { CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]"; } - CHECK_GT(param_.num_hidden, 0); - const TShape &dshape = (*in_shape)[0]; + const TShape &dshape = (*in_shape)[kData]; // require data to be known if (dshape.ndim() == 0) return false; - index_t num_input; - if (dshape.ndim() == 4) { - // TODO(bing) consider deprecate 4D input - CHECK(dshape[1] == 1 && dshape[2] == 1); - num_input = dshape[3]; - } else { - CHECK_EQ(dshape.ndim(), 2) - << "FullyConnecteded: Input data should be 2D in (batch, num_hidden)"; - num_input = dshape[1]; - } + index_t num_input = 0; + mshadow::Shape<2> ishape = dshape.FlatTo2D(); + num_input = ishape[1]; SHAPE_ASSIGN_CHECK(*in_shape, kWeight, Shape2(param_.num_hidden, num_input)); if (!param_.no_bias) { SHAPE_ASSIGN_CHECK(*in_shape, kBias, Shape1(param_.num_hidden)); diff --git a/src/operator/pooling-inl.h b/src/operator/pooling-inl.h index 6ccae18abf22..359d070cdf11 100644 --- a/src/operator/pooling-inl.h +++ b/src/operator/pooling-inl.h @@ -51,7 +51,6 @@ class PoolingOp : public Operator { public: explicit PoolingOp(PoolingParam p) { this->param_ = p; - std::cout << param_.kernel << std::endl; } virtual void Forward(const OpContext &ctx, diff --git a/src/operator/static_operator/convolution_op-inl.h b/src/operator/static_operator/convolution_op-inl.h deleted file mode 100644 index 
fc9b3369f2a6..000000000000 --- a/src/operator/static_operator/convolution_op-inl.h +++ /dev/null @@ -1,270 +0,0 @@ -/*! - * Copyright (c) 2015 by Contributors - * \file convolution_op-inl.h - * \brief convolution op - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_STATIC_OPERATOR_CONVOLUTION_OP_INL_H_ -#define MXNET_OPERATOR_STATIC_OPERATOR_CONVOLUTION_OP_INL_H_ - -#include -#include -#include -#include "./static_operator_common.h" -#include "./param.h" - -namespace mxnet { -namespace op { -template -class ConvolutionOp : public StaticOperator { - public: - virtual std::vector DescribeArgs() const { - ArgType ret[] = {kDataArg, kWeightArg, kBiasArg}; - if (param_.no_bias == 0) { - return std::vector(ret, ret + 3); - } else { - return std::vector(ret, ret + 2); - } - } - virtual void SetParam(const char *name, const char *val) { - param_.SetParam(name, val); - } - virtual void InferShape(std::vector *in_shape, - std::vector *out_shape) { - using namespace mshadow; - if (param_.no_bias == 0) { - CHECK_EQ(in_shape->size(), 3) << "Input:[data, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 2) << "Input:[data, weight]"; - } - CHECK_GT(param_.num_channel, 0); - const TShape &dshape = (*in_shape)[0]; - CHECK_EQ(dshape.ndim(), 4) << \ - "Input data should be 4D in batch-channel-y-x"; - ShapeAssignCheck((*in_shape)[1], Shape4(param_.num_channel, - dshape[1], - param_.kernel_y, - param_.kernel_x)); - if (param_.no_bias == 0) { - ShapeAssignCheck((*in_shape)[2], Shape1(param_.num_channel)); - } - out_shape->clear(); - out_shape->push_back(dshape); - const index_t ksize_y = static_cast(param_.kernel_y); - const index_t ksize_x = static_cast(param_.kernel_x); - const index_t kstride = static_cast(param_.stride_y); - // todo : support dual stride - mshadow::Shape<4> ishape = in_shape->at(0).get<4>(); - CHECK_EQ(ishape[1] % param_.num_group, 0) << \ - "input channels must divide group size"; - CHECK_EQ(param_.num_channel % param_.num_group, 0) << \ - "output channels must divide group size"; - CHECK(ksize_y > 0 && ksize_x > 0) << \ - "incorrect kernel size"; - CHECK(ksize_x <= ishape[3] && ksize_y <= ishape[2]) << \ - "kernel size exceed input"; - (*out_shape)[0][1] = param_.num_channel; - (*out_shape)[0][2] = (ishape[2] + 2 * param_.pad_y - ksize_y) / kstride + 1; - (*out_shape)[0][3] = (ishape[3] + 2 * param_.pad_x - ksize_x) / kstride + 1; - } - virtual void Forward(Option opt, - RunContext ctx, - const std::vector &in_data, - const std::vector &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - // TODO(bing): check the BLAS Handle, be careful - // maybe need blas handle from context - size_t expected = param_.no_bias == 0 ? 
3 : 2; - CHECK_EQ(in_data.size(), expected); - CHECK_EQ(out_data.size(), 1); - // weight shape with group - TShape ws; - ShapeAssignCheck(ws, Shape3(param_.num_group, - param_.num_channel / param_.num_group, - param_.num_input_channel / param_.num_group * - param_.kernel_y * param_.kernel_x)); - Stream *s = static_cast *>(ctx.stream); - Tensor data = in_data[0].get(s); - Tensor wmat = in_data[1].get_with_shape(ws, s); - Tensor out = out_data[0].get(s); - this->InitTemp(data.shape_, out.shape_); - const index_t nbatch = data.size(0); - for (index_t i = 0; i < nbatch; i += nstep_) { - // resize, incase last batch is smaller - const index_t step = std::min(nstep_, nbatch - i); - temp_col_.Resize(mshadow::Shape2(shape_colunit_[0], - shape_colunit_[1] * step)); - temp_dst_.Resize(mshadow::Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2] * step)); - - if (param_.pad_x == 0 && param_.pad_y == 0) { - temp_col_ = unpack_patch2col(data.Slice(i, i+step), - param_.kernel_y, - param_.kernel_x, - param_.stride_y); - // TODO(bing): make mshadow support dual stride - } else { - temp_col_ = unpack_patch2col(pad(data.Slice(i, i+step), - param_.pad_y, param_.pad_x), - param_.kernel_y, - param_.kernel_x, - param_.stride_y); - // TODO(bing): make mshadow support dual stride - } - const index_t gstride = temp_col_.size(0) / param_.num_group; - for (int gid = 0; gid < param_.num_group; ++gid) { - mshadow::Tensor tmpc = temp_col_.Slice(gstride * gid, - gstride * (gid + 1)); - temp_dst_[gid] = dot(wmat[gid], tmpc); - } - out.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst_, - mshadow::Shape4(param_.num_channel, - step, - out.size(2), - out.size(3)))); - } - if (param_.no_bias == 0) { - // add bias, broadcast bias to dim 1: channel - Tensor bias = in_data[2].get(s); - out += broadcast<1>(bias, out.shape_); - } - } - virtual void Backward(RunContext ctx, - const std::vector &grad_next, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &out_grad, - const std::vector &req) { - using namespace mshadow; - using namespace mshadow::expr; - // TODO(bing): check the BLAS Handle, be careful - // maybe need blas handle from context - CHECK_EQ(grad_next.size(), 1); - size_t expected = param_.no_bias == 0 ? 
3 : 2; - CHECK(in_data.size() == expected && out_grad.size() == expected); - CHECK_EQ(req.size(), expected); - TShape ws; - ShapeAssignCheck(ws, Shape3(param_.num_group, - param_.num_channel / param_.num_group, - param_.num_input_channel / param_.num_group * - param_.kernel_y * param_.kernel_x)); - Stream *s = static_cast *>(ctx.stream); - Tensor data = in_data[0].get(s); - Tensor wmat = in_data[1].get_with_shape(ws, s); - Tensor grad = grad_next[0].get(s); - Tensor gdata = out_grad[0].get(s); - Tensor gwmat = out_grad[0].get_with_shape(ws, s); - this->InitTemp(data.shape_, grad.shape_); - const index_t nbatch = data.size(0); - for (index_t i = 0; i < nbatch; i += nstep_) { - const index_t step = std::min(nstep_, nbatch-i); - temp_col_.Resize(mshadow::Shape2(shape_colunit_[0], - shape_colunit_[1] * step)); - temp_dst_.Resize(mshadow::Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2] * step)); - temp_dst_ = reshape(swapaxis<1, 0>(grad.Slice(i, i + step)), - temp_dst_.shape_); - if (param_.pad_x == 0 && param_.pad_y == 0) { - temp_col_ = unpack_patch2col(data.Slice(i, i + step), - param_.kernel_y, - param_.kernel_x, - param_.stride_y); - // TODO(bing): dual stride - } else { - temp_col_ = unpack_patch2col(pad(data.Slice(i, i + step), - param_.pad_y, param_.pad_x), - param_.kernel_y, - param_.kernel_x, - param_.stride_y); - // TODO(bing): dual stride - } - const index_t gstride = temp_col_.size(0) / param_.num_group; - for (int gid = 0; gid < param_.num_group; ++gid) { - mshadow::Tensor tmpc = temp_col_.Slice(gstride * gid, - gstride * (gid + 1)); - gwmat[gid] += dot(temp_dst_[gid], tmpc.T()); - } - if (req[0] != kNullOp) { - for (int gid = 0; gid < param_.num_group; ++gid) { - mshadow::Tensor tmpc = temp_col_.Slice(gstride * gid, - gstride * (gid+1)); - tmpc = dot(wmat[gid].T(), temp_dst_[gid]); - } - - if (param_.pad_x == 0 && param_.pad_y == 0) { - Tensor gdata_tmp = gdata.Slice(i, i + step); - Assign(gdata_tmp, - req[0], - pack_col2patch(temp_col_, - data.Slice(i, i + step).shape_, - param_.kernel_y, - param_.kernel_x, - param_.stride_y)); - // TODO(bing): dual stride - } else { - mshadow::Shape<4> pshape = data.Slice(i, i + step).shape_; - pshape[2] += 2 * param_.pad_y; pshape[3] += 2 * param_.pad_x; - Tensor gdata_tmp = gdata.Slice(i, i + step); - Assign(gdata_tmp, - req[0], - crop(pack_col2patch(temp_col_, - pshape, - param_.kernel_y, - param_.kernel_x, - param_.stride_y), - data[i][0].shape_)); - // TODO(bing): dual stride - } - } - } - if (param_.no_bias == 0) { - Tensor gbias = out_grad[2].get(s); - Assign(gbias, req[2], sumall_except_dim<1>(grad)); - } - } - - private: - /*! 
\brief Alloc temp space for pack/unpack */ - inline void InitTemp(mshadow::Shape<4> ishape, mshadow::Shape<4> oshape) { - const index_t ksize_y = static_cast(param_.kernel_y); - const index_t ksize_x = static_cast(param_.kernel_x); - // this is the unit size of each temp structure - shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x, - oshape[2] * oshape[3]); - shape_dstunit_ = mshadow::Shape3(param_.num_group, - param_.num_channel/param_.num_group, - oshape[2] * oshape[3]); - nstep_ = std::max(std::min((index_t)(param_.temp_col_max / - shape_colunit_.Size()), - ishape[0]), 1U); - // make nstep more balanced, - // nstep will use exactly same number of operations to finish, - index_t nop = (ishape[0]+nstep_-1) / nstep_; - nstep_ = (ishape[0] + nop - 1)/ nop; - CHECK_GT(nstep_, 0); - // helper structure - temp_col_.Resize(mshadow::Shape2(shape_colunit_[0], - shape_colunit_[1] * nstep_)); - temp_dst_.Resize(mshadow::Shape3(shape_dstunit_[0], - shape_dstunit_[1], - shape_dstunit_[2] * nstep_)); - } - /*! \brief parameters that potentially be useful */ - Param param_; - /*! \brief temporary data structure to store patches */ - mshadow::TensorContainer temp_col_; - /*! \brief temporary data structure to store results */ - mshadow::TensorContainer temp_dst_; - /*! \brief shape of column unit */ - mshadow::Shape<2> shape_colunit_; - /*! \brief shape of dst unit */ - mshadow::Shape<3> shape_dstunit_; - /*! \brief how many number of batches to be unpacked together */ - mshadow::index_t nstep_; -}; // class ConvolutionOp -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_STATIC_OPERATOR_CONVOLUTION_OP_INL_H_ diff --git a/src/operator/static_operator/dropout_op-inl.h b/src/operator/static_operator/dropout_op-inl.h deleted file mode 100644 index 23c9f6aab457..000000000000 --- a/src/operator/static_operator/dropout_op-inl.h +++ /dev/null @@ -1,94 +0,0 @@ -/*! 
- * Copyright (c) 2015 by Contributors - * \file dropout_op-inl.h - * \brief dropout operator - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_STATIC_OPERATOR_DROPOUT_OP_INL_H_ -#define MXNET_OPERATOR_STATIC_OPERATOR_DROPOUT_OP_INL_H_ - -#include -#include -#include "./mshadow_op.h" - -namespace mxnet { -namespace op { -template -class DropoutOp : public StaticOperator { - public: - explicit DropoutOp(mshadow::Random *prnd) - : prnd_(prnd), mask_used_(false) {} - virtual int DescribeProperty() const { - return kForwardRequireRnd | kContainInteralState; - } - virtual void SetParam(const char *name, const char* val) { - if (!strcmp("threshold", name)) pkeep_ = \ - static_cast(1.0f - atof(val)); - CHECK_GT(pkeep_, 0) << "invalid dropout threshold"; - } - virtual void InferShape(std::vector *in_shape, - std::vector *out_shape) { - CHECK_EQ(in_shape->size(), 1) << "Input: [data]"; - out_shape->clear(); - out_shape->push_back((*in_shape)[0]); - } - virtual void Forward(Option opt, - RunContext ctx, - const std::vector &in_data, - const std::vector &out_data) { - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); - using namespace mshadow; - using namespace mshadow::expr; - Stream *s = static_cast *>(ctx.stream); - Tensor data = in_data[0].get(s); - Tensor out = out_data[0].get(s); - if (mask_.shape_!= out.shape_) { - mask_.Resize(out.shape_); - } - if (opt.is_train && pkeep_ != 1.0f) { - mask_ = F(prnd_->uniform(mask_.shape_), pkeep_) * \ - (1.0f / pkeep_); - out = data * mask_; - mask_used_ = true; - } else { - out = data; - mask_used_ = false; - } - } - virtual void Backward(RunContext ctx, - const std::vector &grad_next, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &out_grad, - const std::vector &req) { - CHECK_EQ(grad_next.size(), 1); - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(req.size(), 1); - using namespace mshadow; - using namespace mshadow::expr; - Stream *s = static_cast *>(ctx.stream); - Tensor grad = grad_next[0].get(s); - Tensor out = out_grad[0].get(s); - // mask won't be initialized in when - if (mask_used_) { - Assign(out, req[0], grad * mask_); - } else { - // avoid directly assign tensor to tensor - Assign(out, req[0], F(grad)); - } - } - - private: - /*! \brief random number generator */ - mshadow::Random *prnd_; - /*! \brief random mask */ - mshadow::TensorContainer mask_; - /*! \brief probability to keep */ - real_t pkeep_; - /*! \brief record whether mask is used in last forward */ - bool mask_used_; -}; // class DropoutOp -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_STATIC_OPERATOR_DROPOUT_OP_INL_H_ diff --git a/src/operator/static_operator/reshape_op-inl.h b/src/operator/static_operator/reshape_op-inl.h deleted file mode 100644 index ba966a62a29f..000000000000 --- a/src/operator/static_operator/reshape_op-inl.h +++ /dev/null @@ -1,76 +0,0 @@ -/*! 
- * Copyright (c) 2015 by Contributors - * \file reshape_op-inl.h - * \brief - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_STATIC_OPERATOR_RESHAPE_OP_INL_H_ -#define MXNET_OPERATOR_STATIC_OPERATOR_RESHAPE_OP_INL_H_ - -#include -#include - -namespace mxnet { -namespace op { -template -class ReshapeOp : public StaticOperator { - public: - virtual void SetParam(const char *name, const char *val) { - if (!strcmp(name, "out_ch")) oshape_[1] = atoi(val); - if (!strcmp(name, "out_y")) oshape_[2] = atoi(val); - if (!strcmp(name, "out_x")) oshape_[3] = atoi(val); - } - virtual void InferShape(std::vector *in_shape, - std::vector *out_shape) { - CHECK_EQ(in_shape->size(), 1); - ishape_ = (*in_shape)[0].get<4>(); - oshape_[0] = ishape_[0]; - if (flatten) { - oshape_[1] = 1; - oshape_[2] = 1; - oshape_[3] = ishape_[1] * ishape_[2] * ishape_[3]; - } - CHECK_EQ(oshape_.Size(), ishape_.Size()) << "Incorrect new shape"; - TShape ts; - ts = oshape_; - out_shape->clear(); - out_shape->push_back(ts); - } - virtual void Forward(Option opt, - RunContext ctx, - const std::vector &in_data, - const std::vector &out_data) { - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); - using namespace mshadow; - using namespace mshadow::expr; - Stream *s = static_cast *>(ctx.stream); - Tensor data = in_data[0].get(s); - Tensor out = out_data[0].get(s); - out = reshape(data, oshape_); - } - virtual void Backward(RunContext ctx, - const std::vector &grad_next, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &out_grad, - const std::vector &req) { - CHECK_EQ(grad_next.size(), 1); - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(req.size(), 1); - using namespace mshadow; - using namespace mshadow::expr; - Stream *s = static_cast *>(ctx.stream); - Tensor grad = grad_next[0].get(s); - Tensor out = out_grad[0].get(s); - Assign(out, req[0], reshape(grad, ishape_)); - } - - private: - mshadow::Shape<4> oshape_; - mshadow::Shape<4> ishape_; -}; // class Operator - -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_STATIC_OPERATOR_RESHAPE_OP_INL_H_
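
Note for reviewers (not part of the patch): the output-shape rules used by ConvolutionProp::InferShape and FlattenProp::InferShape above can be checked with a small Python 2 sketch. The helper names conv_out_shape and flatten_shape are hypothetical; they only mirror the arithmetic in the C++ code and the network in python/test_mnist.py.

    def conv_out_shape(dshape, nb_filter, kernel, stride=(1, 1), pad=(0, 0)):
        # (batch, channel, y, x) -> (batch, nb_filter, out_y, out_x),
        # same arithmetic as ConvolutionProp::InferShape; only stride[0]
        # is used, matching the dual-stride TODO in convolution-inl.h
        n, c, y, x = dshape
        out_y = (y + 2 * pad[0] - kernel[0]) // stride[0] + 1
        out_x = (x + 2 * pad[1] - kernel[1]) // stride[0] + 1
        return (n, nb_filter, out_y, out_x)

    def flatten_shape(dshape):
        # FlattenProp::InferShape: batch-1-1-feature layout
        n, c, y, x = dshape
        return (n, 1, 1, c * y * x)

    # conv1 in python/test_mnist.py: 1x28x28 input, 32 filters, 7x7 kernel, stride 2
    print conv_out_shape((100, 1, 28, 28), 32, (7, 7), (2, 2))   # (100, 32, 11, 11)
    print flatten_shape((100, 32, 11, 11))                       # (100, 1, 1, 3872)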