diff --git a/doc/python/model.md b/doc/python/model.md
index fca3a39ec893..f6f27c99d082 100644
--- a/doc/python/model.md
+++ b/doc/python/model.md
@@ -23,7 +23,7 @@ data = mx.symbol.Variable('data')
 fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128)
 act1 = mx.symbol.Activation(fc1, name='relu1', act_type='relu')
 fc2 = mx.symbol.FullyConnected(act1, name='fc2', num_hidden=64)
-softmax = mx.symbol.Softmax(fc2, name='sm')
+softmax = mx.symbol.SoftmaxOutput(fc2, name='sm')
 # create a model
 model = mx.model.FeedForward.create(
      softmax,
diff --git a/doc/python/symbol.md b/doc/python/symbol.md
index b153fdb32773..2ca0ee59d6b9 100644
--- a/doc/python/symbol.md
+++ b/doc/python/symbol.md
@@ -23,7 +23,7 @@ The following code gives an example of two layer neural network configuration.
 >>> net = mx.symbol.FullyConnected(data=net, name='fc1', num_hidden=128)
 >>> net = mx.symbol.Activation(data=net, name='relu1', act_type="relu")
 >>> net = mx.symbol.FullyConnected(data=net, name='fc2', num_hidden=64)
->>> net = mx.symbol.Softmax(data=net, name='out')
+>>> net = mx.symbol.SoftmaxOutput(data=net, name='out')
 >>> type(net)
 ```
diff --git a/example/cifar10/cifar10.py b/example/cifar10/cifar10.py
index efb1122504a0..533f6714bd96 100644
--- a/example/cifar10/cifar10.py
+++ b/example/cifar10/cifar10.py
@@ -55,7 +55,7 @@ def SimpleFactory(data, ch_1x1, ch_3x3):
 pool = mx.symbol.Pooling(data=in5b, pool_type="avg", kernel=(7,7), name="global_pool")
 flatten = mx.symbol.Flatten(data=pool, name="flatten1")
 fc = mx.symbol.FullyConnected(data=flatten, num_hidden=10, name="fc1")
-softmax = mx.symbol.Softmax(data=fc, name="loss")
+softmax = mx.symbol.SoftmaxOutput(data=fc, name="loss")
 
 #########################################################
diff --git a/example/imagenet/alexnet.py b/example/imagenet/alexnet.py
index dbf5e9a28ba4..b933b090e5b2 100644
--- a/example/imagenet/alexnet.py
+++ b/example/imagenet/alexnet.py
@@ -40,7 +40,7 @@ dropout2 = mx.symbol.Dropout(data=relu7, p=0.5)
 # stage 6
 fc3 = mx.symbol.FullyConnected(data=dropout2, num_hidden=1000)
-softmax = mx.symbol.Softmax(data=fc3)
+softmax = mx.symbol.SoftmaxOutput(data=fc3)
 
 ## data
diff --git a/example/imagenet/inception-full.py b/example/imagenet/inception-full.py
index d703a6db59a2..1ac0a5c14a68 100644
--- a/example/imagenet/inception-full.py
+++ b/example/imagenet/inception-full.py
@@ -74,7 +74,7 @@ def inception(nhidden, grad_scale):
     # linear classifier
     flatten = mx.symbol.Flatten(data=avg, name='flatten')
     fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1')
-    softmax = mx.symbol.Softmax(data=fc1, name='softmax')
+    softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax')
     return softmax
 
 softmax = inception(21841, 1.0)
diff --git a/example/imagenet/inception.py b/example/imagenet/inception.py
index 263f3a22733f..a9afe9c01f89 100644
--- a/example/imagenet/inception.py
+++ b/example/imagenet/inception.py
@@ -73,7 +73,7 @@ def inception(nhidden, grad_scale):
     # linear classifier
     flatten = mx.symbol.Flatten(data=avg, name='flatten')
     fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1')
-    softmax = mx.symbol.Softmax(data=fc1, name='softmax')
+    softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax')
     return softmax
 
 softmax = inception(1000, 1.0)
diff --git a/example/memcost/inception_memcost.py b/example/memcost/inception_memcost.py
index 8183c6774724..eb9e16908035 100644
--- a/example/memcost/inception_memcost.py
+++ b/example/memcost/inception_memcost.py
@@ -69,7 +69,7 @@ def inception(nhidden, grad_scale):
     # linear classifier
     flatten = mx.symbol.Flatten(data=avg, name='flatten')
     fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1')
-    softmax = mx.symbol.Softmax(data=fc1, name='softmax')
+    softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax')
     return softmax
diff --git a/example/mnist/lenet.py b/example/mnist/lenet.py
index 40779150ccfb..d8691bbe5867 100644
--- a/example/mnist/lenet.py
+++ b/example/mnist/lenet.py
@@ -23,7 +23,7 @@
 # second fullc
 fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=10)
 # loss
-lenet = mx.symbol.Softmax(data=fc2)
+lenet = mx.symbol.SoftmaxOutput(data=fc2)
 
 ## data
 train, val = mnist_iterator(batch_size=100, input_shape=(1,28,28))
diff --git a/example/mnist/mlp.py b/example/mnist/mlp.py
index 0cfffe55cbe4..2bfa55d913ba 100644
--- a/example/mnist/mlp.py
+++ b/example/mnist/mlp.py
@@ -11,7 +11,7 @@
 fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
 act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
 fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10)
-mlp = mx.symbol.Softmax(data = fc3, name = 'mlp')
+mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'mlp')
 
 # data
diff --git a/example/mnist/mlp_numpy.py b/example/mnist/mlp_numpy.py
index 114a6bf257d5..538aa87c7c23 100644
--- a/example/mnist/mlp_numpy.py
+++ b/example/mnist/mlp_numpy.py
@@ -11,7 +11,7 @@
 fc2 = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
 act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
 fc3 = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10)
-mlp = mx.symbol.Softmax(data = fc3, name = 'mlp')
+mlp = mx.symbol.SoftmaxOutput(data = fc3, name = 'mlp')
 
 # data
diff --git a/example/notebooks/cifar-100.ipynb b/example/notebooks/cifar-100.ipynb
index 8e8c53a2d75b..bb5cb1b81624 100644
--- a/example/notebooks/cifar-100.ipynb
+++ b/example/notebooks/cifar-100.ipynb
@@ -131,7 +131,7 @@
    "    # linear classifier\n",
    "    flatten = mx.symbol.Flatten(data=avg, name='flatten')\n",
    "    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc')\n",
-   "    softmax = mx.symbol.Softmax(data=fc1, name='softmax')\n",
+   "    softmax = mx.symbol.SoftmaxOutput(data=fc1, name='softmax')\n",
    "    return softmax\n",
    "\n",
    "softmax = inception(100, 1.0)"
diff --git a/example/notebooks/cifar-recipe.ipynb b/example/notebooks/cifar-recipe.ipynb
index eae38dab736c..7c436554fa47 100644
--- a/example/notebooks/cifar-recipe.ipynb
+++ b/example/notebooks/cifar-recipe.ipynb
@@ -127,7 +127,7 @@
    "pool = mx.symbol.Pooling(data=in5b, pool_type=\"avg\", kernel=(7,7), name=\"global_avg\")\n",
    "flatten = mx.symbol.Flatten(data=pool)\n",
    "fc = mx.symbol.FullyConnected(data=flatten, num_hidden=10)\n",
-   "softmax = mx.symbol.Softmax(data=fc)"
+   "softmax = mx.symbol.SoftmaxOutput(data=fc)"
   ]
  },
 {
diff --git a/example/notebooks/composite_symbol.ipynb b/example/notebooks/composite_symbol.ipynb
index 22966f5fd3f5..1d2cdaec764d 100644
--- a/example/notebooks/composite_symbol.ipynb
+++ b/example/notebooks/composite_symbol.ipynb
@@ -3691,7 +3691,7 @@
    "\n",
    "softmax0\n",
    "\n",
-   "Softmax\n",
+   "SoftmaxOutput\n",
    "\n",
    "\n",
    "softmax0->fullyconnected0\n",
@@ -3739,7 +3739,7 @@
    "# linear classifier\n",
    "flatten = mx.symbol.Flatten(data=avg)\n",
    "fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=1000)\n",
-   "softmax = mx.symbol.Softmax(data=fc1)\n",
+   "softmax = mx.symbol.SoftmaxOutput(data=fc1)\n",
    "\n",
    "# if you like, you can visualize full network structure\n",
"mx.viz.plot_network(symbol=softmax, shape={\"data\" : (128, 3, 224, 224)})" diff --git a/example/python-howto/multiple_outputs.py b/example/python-howto/multiple_outputs.py index ab6d6d12356c..97ce469d58a2 100644 --- a/example/python-howto/multiple_outputs.py +++ b/example/python-howto/multiple_outputs.py @@ -8,7 +8,7 @@ fc1 = mx.symbol.FullyConnected(data=net, name='fc1', num_hidden=128) net = mx.symbol.Activation(data=fc1, name='relu1', act_type="relu") net = mx.symbol.FullyConnected(data=net, name='fc2', num_hidden=64) -out = mx.symbol.Softmax(data=net, name='softmax') +out = mx.symbol.SoftmaxOutput(data=net, name='softmax') # group fc1 and out together group = mx.symbol.Group([fc1, out]) print group.list_outputs() diff --git a/example/rnn/lstm.py b/example/rnn/lstm.py index 59daacc4e903..4b5706ff7208 100644 --- a/example/rnn/lstm.py +++ b/example/rnn/lstm.py @@ -83,7 +83,7 @@ def lstm_unroll(num_lstm_layer, seq_len, bias=cls_bias, num_hidden=num_label, name="t%d_cls" % seqidx) - sm = mx.sym.Softmax(data=fc, label=label, name="t%d_sm" % seqidx) + sm = mx.sym.SoftmaxOutput(data=fc, label=label, name="t%d_sm" % seqidx) out_prob.append(sm) for i in range(num_lstm_layer): diff --git a/src/operator/native_op-inl.h b/src/operator/native_op-inl.h index 3add8f14ef61..bb0589c14c1c 100644 --- a/src/operator/native_op-inl.h +++ b/src/operator/native_op-inl.h @@ -57,7 +57,7 @@ class NativeOp : public Operator { SyncVec(out_data, "out_data", s, 1); s->Wait(); param_.pinfo->forward(ptrs.size(), ptrs.data(), ndims.data(), shapes.data(), tags.data()); - for (int i = 0; i < out_data.size(); ++i) { + for (index_t i = 0; i < out_data.size(); ++i) { CHECK_NE(req[i], kAddTo) << "NativeOp doesn't support AddTo for output"; if (req[i] != kNullOp) { std::stringstream ss; @@ -90,7 +90,7 @@ class NativeOp : public Operator { } s->Wait(); param_.pinfo->backward(ptrs.size(), ptrs.data(), ndims.data(), shapes.data(), tags.data()); - for (int i = 0; i < in_grad.size(); ++i) { + for (index_t i = 0; i < in_grad.size(); ++i) { CHECK_NE(req[i], kAddTo) << "NativeOp doesn't support AddTo for output"; if (req[i] != kNullOp) { std::stringstream ss; @@ -135,7 +135,7 @@ class NativeOp : public Operator { const std::string &prefix, mshadow::Stream *stream, int tag) { - for (int i = 0; i < vec.size(); ++i) { + for (size_t i = 0; i < vec.size(); ++i) { std::stringstream name; name << prefix << i; SyncBuffer(vec[i], name.str(), stream); diff --git a/src/operator/softmax-inl.h b/src/operator/softmax-inl.h index d1e5331d9d06..5b26ef925184 100644 --- a/src/operator/softmax-inl.h +++ b/src/operator/softmax-inl.h @@ -20,15 +20,15 @@ namespace mxnet { namespace op { -namespace softmax_enum { -enum SoftmaxOpInputs {kData, kLabel}; -enum SoftmaxOpOutputs {kOut}; -} // namespace softmax_enum +namespace softmaxout_enum { +enum SoftmaxOutputOpInputs {kData, kLabel}; +enum SoftmaxOutputOpOutputs {kOut}; +} // namespace softmaxout_enum -struct SoftmaxParam : public dmlc::Parameter { +struct SoftmaxOutputParam : public dmlc::Parameter { float grad_scale; bool multi_output; - DMLC_DECLARE_PARAMETER(SoftmaxParam) { + DMLC_DECLARE_PARAMETER(SoftmaxOutputParam) { DMLC_DECLARE_FIELD(grad_scale).set_default(1.0f) .describe("Scale the gradient by a float factor"); DMLC_DECLARE_FIELD(multi_output).set_default(false) @@ -39,9 +39,9 @@ struct SoftmaxParam : public dmlc::Parameter { }; template -class SoftmaxOp : public Operator { +class SoftmaxOutputOp : public Operator { public: - explicit SoftmaxOp(SoftmaxParam param) : param_(param) {} + explicit 
SoftmaxOutputOp(SoftmaxOutputParam param) : param_(param) {} virtual void Forward(const OpContext &ctx, const std::vector &in_data, @@ -50,19 +50,19 @@ class SoftmaxOp : public Operator { const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 2) << "Softmax Input: [data, label]"; - CHECK_EQ(out_data.size(), 1) << "Softmax Output: [output]"; + CHECK_EQ(in_data.size(), 2) << "SoftmaxOutput Input: [data, label]"; + CHECK_EQ(out_data.size(), 1) << "SoftmaxOutput Output: [output]"; Stream *s = ctx.get_stream(); if (param_.multi_output) { - int n = in_data[softmax_enum::kData].size(0); - int k = in_data[softmax_enum::kData].size(1); - Shape<3> s3 = Shape3(n, k, static_cast(in_data[softmax_enum::kData].Size()/n/k)); - Tensor data = in_data[softmax_enum::kData].get_with_shape(s3, s); - Tensor out = out_data[softmax_enum::kOut].get_with_shape(s3, s); + int n = in_data[softmaxout_enum::kData].size(0); + int k = in_data[softmaxout_enum::kData].size(1); + Shape<3> s3 = Shape3(n, k, static_cast(in_data[softmaxout_enum::kData].Size()/n/k)); + Tensor data = in_data[softmaxout_enum::kData].get_with_shape(s3, s); + Tensor out = out_data[softmaxout_enum::kOut].get_with_shape(s3, s); Softmax(out, data); } else { - Tensor data = in_data[softmax_enum::kData].FlatTo2D(s); - Tensor out = out_data[softmax_enum::kOut].FlatTo2D(s); + Tensor data = in_data[softmaxout_enum::kData].FlatTo2D(s); + Tensor out = out_data[softmaxout_enum::kOut].FlatTo2D(s); Softmax(out, data); } } @@ -82,20 +82,20 @@ class SoftmaxOp : public Operator { CHECK_GE(req.size(), 1); Stream *s = ctx.get_stream(); if (param_.multi_output) { - int n = out_data[softmax_enum::kOut].size(0); - int k = out_data[softmax_enum::kOut].size(1); - Shape<3> s3 = Shape3(n, k, static_cast(out_data[softmax_enum::kOut].Size()/n/k)); - Tensor label = in_data[softmax_enum::kLabel].FlatTo2D(s); - Tensor out = out_data[softmax_enum::kOut].get_with_shape(s3, s); - Tensor grad = in_grad[softmax_enum::kData].get_with_shape(s3, s); + int n = out_data[softmaxout_enum::kOut].size(0); + int k = out_data[softmaxout_enum::kOut].size(1); + Shape<3> s3 = Shape3(n, k, static_cast(out_data[softmaxout_enum::kOut].Size()/n/k)); + Tensor label = in_data[softmaxout_enum::kLabel].FlatTo2D(s); + Tensor out = out_data[softmaxout_enum::kOut].get_with_shape(s3, s); + Tensor grad = in_grad[softmaxout_enum::kData].get_with_shape(s3, s); SoftmaxGrad(grad, out, label); if (param_.grad_scale < 1.0) { grad *= param_.grad_scale; } } else { - Tensor label = in_data[softmax_enum::kLabel].get(s); - Tensor out = out_data[softmax_enum::kOut].FlatTo2D(s); - Tensor grad = in_grad[softmax_enum::kData].FlatTo2D(s); + Tensor label = in_data[softmaxout_enum::kLabel].get(s); + Tensor out = out_data[softmaxout_enum::kOut].FlatTo2D(s); + Tensor grad = in_grad[softmaxout_enum::kData].FlatTo2D(s); SoftmaxGrad(grad, out, label); if (param_.grad_scale < 1.0) { grad *= param_.grad_scale; @@ -104,15 +104,15 @@ class SoftmaxOp : public Operator { } private: - SoftmaxParam param_; -}; // class SoftmaxOp + SoftmaxOutputParam param_; +}; // class SoftmaxOutputOp // Decalre Factory function, used for dispatch specialization template -Operator* CreateOp(SoftmaxParam param); +Operator* CreateOp(SoftmaxOutputParam param); #if DMLC_USE_CXX11 -class SoftmaxProp : public OperatorProperty { +class SoftmaxOutputProp : public OperatorProperty { public: std::vector ListArguments() const override { return {"data", "label"}; @@ -134,10 +134,10 @@ class SoftmaxProp : public 
OperatorProperty { const TShape &dshape = in_shape->at(0); if (dshape.ndim() == 0) return false; if (param_.multi_output) { - SHAPE_ASSIGN_CHECK(*in_shape, softmax_enum::kLabel, + SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1])); } else { - SHAPE_ASSIGN_CHECK(*in_shape, softmax_enum::kLabel, Shape1(dshape[0])); + SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, Shape1(dshape[0])); } out_shape->clear(); out_shape->push_back(dshape); @@ -145,20 +145,20 @@ class SoftmaxProp : public OperatorProperty { } OperatorProperty* Copy() const override { - auto ptr = new SoftmaxProp(); + auto ptr = new SoftmaxOutputProp(); ptr->param_ = param_; return ptr; } std::string TypeString() const override { - return "Softmax"; + return "SoftmaxOutput"; } std::vector DeclareBackwardDependency( const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data) const override { - return {in_data[softmax_enum::kLabel], out_data[softmax_enum::kOut]}; + return {in_data[softmaxout_enum::kLabel], out_data[softmaxout_enum::kOut]}; } std::vector > BackwardInplaceOption( @@ -166,20 +166,29 @@ class SoftmaxProp : public OperatorProperty { const std::vector &in_data, const std::vector &out_data, const std::vector &in_grad) const override { - return {{out_data[softmax_enum::kOut], in_grad[softmax_enum::kData]}}; + return {{out_data[softmaxout_enum::kOut], in_grad[softmaxout_enum::kData]}}; } std::vector > ForwardInplaceOption( const std::vector &in_data, const std::vector &out_data) const override { - return {{in_data[softmax_enum::kData], out_data[softmax_enum::kOut]}}; + return {{in_data[softmaxout_enum::kData], out_data[softmaxout_enum::kOut]}}; } Operator* CreateOperator(Context ctx) const; - private: - SoftmaxParam param_; -}; // class SoftmaxProp + protected: + SoftmaxOutputParam param_; +}; // class SoftmaxOutputProp + +class DeprecatedSoftmaxProp : public SoftmaxOutputProp { + public: + std::string TypeString() const override { + LOG(INFO) << "Softmax symbol is renamed to SoftmaxOutput. " + << "This API will be deprecated in Dec, 2015"; + return "Softmax"; + } +}; #endif // DMLC_USE_CXX11 } // namespace op diff --git a/src/operator/softmax.cc b/src/operator/softmax.cc index 2c2516ba9bc9..4b8a1649e065 100644 --- a/src/operator/softmax.cc +++ b/src/operator/softmax.cc @@ -9,20 +9,25 @@ namespace mxnet { namespace op { template<> -Operator *CreateOp(SoftmaxParam param) { - return new SoftmaxOp(param); +Operator *CreateOp(SoftmaxOutputParam param) { + return new SoftmaxOutputOp(param); } -Operator *SoftmaxProp::CreateOperator(Context ctx) const { +Operator *SoftmaxOutputProp::CreateOperator(Context ctx) const { DO_BIND_DISPATCH(CreateOp, param_); } -DMLC_REGISTER_PARAMETER(SoftmaxParam); +DMLC_REGISTER_PARAMETER(SoftmaxOutputParam); -MXNET_REGISTER_OP_PROPERTY(Softmax, SoftmaxProp) -.describe("Perform a softmax transformation on input.") +MXNET_REGISTER_OP_PROPERTY(SoftmaxOutput, SoftmaxOutputProp) +.describe("Perform a softmax transformation on input, backprop with logloss.") .add_argument("data", "Symbol", "Input data to softmax.") -.add_arguments(SoftmaxParam::__FIELDS__()); +.add_arguments(SoftmaxOutputParam::__FIELDS__()); + +MXNET_REGISTER_OP_PROPERTY(Softmax, DeprecatedSoftmaxProp) +.describe("DEPRECATED: Perform a softmax transformation on input. 
Please use SoftmaxOutput") +.add_argument("data", "Symbol", "Input data to softmax.") +.add_arguments(SoftmaxOutputParam::__FIELDS__()); } // namespace op } // namespace mxnet diff --git a/src/operator/softmax.cu b/src/operator/softmax.cu index 0ebbfc16ce68..b2940f38fd61 100644 --- a/src/operator/softmax.cu +++ b/src/operator/softmax.cu @@ -10,8 +10,8 @@ namespace mxnet { namespace op { template<> -Operator *CreateOp(SoftmaxParam param) { - return new SoftmaxOp(param); +Operator *CreateOp(SoftmaxOutputParam param) { + return new SoftmaxOutputOp(param); } } // namespace op diff --git a/tests/python/common/models.py b/tests/python/common/models.py index 71df3f07cf47..2c998afcd1db 100644 --- a/tests/python/common/models.py +++ b/tests/python/common/models.py @@ -24,6 +24,6 @@ def conv(): fl = mx.symbol.Flatten(data = mp2, name="flatten") fc2 = mx.symbol.FullyConnected(data = fl, name='fc2', num_hidden=10) - softmax = mx.symbol.Softmax(data = fc2, name = 'sm') + softmax = mx.symbol.SoftmaxOutput(data = fc2, name = 'sm') return softmax diff --git a/tests/python/multi-node/common.py b/tests/python/multi-node/common.py index 0db092462a78..d35a1a1fe3f4 100644 --- a/tests/python/multi-node/common.py +++ b/tests/python/multi-node/common.py @@ -86,7 +86,7 @@ def mlp(): fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64) act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=10) - softmax = mx.symbol.Softmax(fc3, name = 'sm') + softmax = mx.symbol.SoftmaxOutput(fc3, name = 'sm') return softmax def lenet(): @@ -109,7 +109,7 @@ def lenet(): # second fullc fc2 = mx.symbol.FullyConnected(data=tanh3, num_hidden=10) # loss - lenet = mx.symbol.Softmax(data=fc2) + lenet = mx.symbol.SoftmaxOutput(data=fc2) return lenet # Basic Conv + BN + ReLU factory @@ -155,5 +155,5 @@ def inception(): pool = mx.symbol.Pooling(data=in5b, pool_type="avg", kernel=(7,7), name="global_pool") flatten = mx.symbol.Flatten(data=pool, name="flatten1") fc = mx.symbol.FullyConnected(data=flatten, num_hidden=10, name="fc1") - softmax = mx.symbol.Softmax(data=fc, name="loss") + softmax = mx.symbol.SoftmaxOutput(data=fc, name="loss") return softmax diff --git a/tests/python/multi-node/imagenet.py b/tests/python/multi-node/imagenet.py index 7663df8d1bad..f4d7c1e35bb3 100644 --- a/tests/python/multi-node/imagenet.py +++ b/tests/python/multi-node/imagenet.py @@ -97,5 +97,5 @@ def inception(nhidden): # linear classifier flatten = mx.symbol.Flatten(data=avg, name='flatten') fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=nhidden, name='fc1') - softmax = mx.symbol.Softmax(data=fc1, name='softmax') + softmax = mx.symbol.SotfmaxOutput(data=fc1, name='softmax') return softmax diff --git a/tests/python/train/test_conv.py b/tests/python/train/test_conv.py index bc068153c24e..9d8f77fd7c65 100644 --- a/tests/python/train/test_conv.py +++ b/tests/python/train/test_conv.py @@ -24,7 +24,7 @@ fl = mx.symbol.Flatten(data = mp2, name="flatten") fc2 = mx.symbol.FullyConnected(data = fl, name='fc2', num_hidden=10) -softmax = mx.symbol.Softmax(data = fc2, name = 'sm') +softmax = mx.symbol.SoftmaxOutput(data = fc2, name = 'sm') num_epoch = 1 model = mx.model.FeedForward(softmax, mx.cpu(), diff --git a/tests/python/train/test_mlp.py b/tests/python/train/test_mlp.py index 5f1c27062066..84a6f17f47d5 100644 --- a/tests/python/train/test_mlp.py +++ b/tests/python/train/test_mlp.py @@ -14,7 +14,7 @@ fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64) act2 = 
mx.symbol.Activation(fc2, name='relu2', act_type="relu") fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=10) -softmax = mx.symbol.Softmax(fc3, name = 'sm') +softmax = mx.symbol.SoftmaxOutput(fc3, name = 'sm') def accuracy(label, pred): py = np.argmax(pred, axis=1) diff --git a/tools/caffe_converter/convert_model.py b/tools/caffe_converter/convert_model.py old mode 100755 new mode 100644 diff --git a/tools/caffe_converter/convert_symbol.py b/tools/caffe_converter/convert_symbol.py index 9b5bcde99848..88acea0b7c4e 100644 --- a/tools/caffe_converter/convert_symbol.py +++ b/tools/caffe_converter/convert_symbol.py @@ -41,7 +41,7 @@ def proto2script(proto_file): name = layer[i].name.replace('/', '_') if layer[i].type == 'Convolution' or layer[i].type == 4: type_string = 'mx.symbol.Convolution' - param = layer[i].convolution_param + param = layer[i].convolution_param pad = 0 if len(param.pad) == 0 else param.pad[0] stride = 1 if len(param.stride) == 0 else param.stride[0] param_string = "num_filter=%d, pad=(%d,%d), kernel=(%d,%d), stride=(%d,%d), no_bias=%s" %\ @@ -67,7 +67,7 @@ def proto2script(proto_file): need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] if layer[i].type == 'LRN' or layer[i].type == 15: type_string = 'mx.symbol.LRN' - param = layer[i].lrn_param + param = layer[i].lrn_param param_string = "alpha=%f, beta=%f, knorm=%f, nsize=%d" %\ (param.alpha, param.beta, param.k, param.local_size) need_flatten[name] = True @@ -82,7 +82,7 @@ def proto2script(proto_file): param_string = "p=%f" % param.dropout_ratio need_flatten[name] = need_flatten[mapping[layer[i].bottom[0]]] if layer[i].type == 'Softmax' or layer[i].type == 20: - type_string = 'mx.symbol.Softmax' + type_string = 'mx.symbol.SoftmaxOutput' # We only support single output network for now. output_name = name @@ -96,7 +96,7 @@ def proto2script(proto_file): need_flatten[name] = True if type_string == '': raise Exception('Unknown Layer %s!' % layer[i].type) - + if type_string != 'split': bottom = layer[i].bottom if param_string != "": @@ -137,4 +137,4 @@ def main(): print(symbol_string) if __name__ == '__main__': - main() \ No newline at end of file + main()
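
Note for reviewers (not part of the patch): a minimal usage sketch of the rename, assuming an MXNet build that includes this change. The toy network and variable names below are illustrative only.

```python
import mxnet as mx

# Build a small classifier with the new symbol name registered by this patch.
data = mx.symbol.Variable('data')
fc = mx.symbol.FullyConnected(data=data, name='fc', num_hidden=10)
net = mx.symbol.SoftmaxOutput(data=fc, name='softmax')

# The old name remains available through the deprecated alias
# (DeprecatedSoftmaxProp above); it builds the same operator but
# logs a rename notice when the symbol is created.
legacy = mx.symbol.Softmax(data=fc, name='softmax')

# Both expose the same argument list, including the implicit label input.
print(net.list_arguments())
print(legacy.list_arguments())
```

Because DeprecatedSoftmaxProp only overrides TypeString(), scripts and saved symbols that still reference "Softmax" should keep constructing the same operator until the alias is removed.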