From 734e87e55b00418aed0fac5a879b2704d62cf3ab Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Fri, 15 Dec 2017 20:08:55 +0800
Subject: [PATCH 1/6] Add python wrapper for lstm unit op.

---
 doc/api/v2/fluid/layers.rst                 |  11 +-
 python/paddle/v2/fluid/layers/nn.py         | 112 +++++++++++++++++++-
 python/paddle/v2/fluid/tests/test_layers.py |  17 +++
 3 files changed, 132 insertions(+), 8 deletions(-)

diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 89e5fec13bf906..0ab36402fa5acd 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -188,12 +188,6 @@ beam_search_decode
     :noindex:
 
 
-lstm
----------
-.. autofunction:: paddle.v2.fluid.layers.lstm
-    :noindex:
-
-
 lod_rank_table
 ---------
 .. autofunction:: paddle.v2.fluid.layers.lod_rank_table
@@ -300,3 +294,8 @@ conv2d_transpose
 .. autofunction:: paddle.v2.fluid.layers.conv2d_transpose
     :noindex:
+
+lstm_unit
+---------
+.. autofunction:: paddle.v2.fluid.layers.lstm_unit
+    :noindex:
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index bad7dbd84e8810..84e62d988ce9db 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -5,12 +5,13 @@
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
+from tensor import concat
 
 __all__ = [
     'fc', 'embedding', 'dynamic_lstm', 'gru_unit', 'linear_chain_crf',
     'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy',
     'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d',
-    'batch_norm', 'beam_search_decode', 'conv2d_transpose'
+    'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'lstm_unit'
 ]
 
 
@@ -392,7 +393,7 @@ def chunk_eval(input,
                excluded_chunk_types=None,
                **kwargs):
     """
-    This function computes and outputs the precision, recall and 
+    This function computes and outputs the precision, recall and
     F1-score of chunk detection.
     """
     helper = LayerHelper("chunk_eval", **kwargs)
@@ -789,3 +790,110 @@ def conv2d_transpose(input,
                          attrs=op_attr)
 
     return out
+
+
+def lstm_unit(x_t,
+              hidden_t_prev,
+              cell_t_prev,
+              forget_bias=0.0,
+              main_program=None,
+              startup_program=None):
+    """LSTM unit layer. The equations of an LSTM step are:
+
+    .. math::
+
+        i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i)
+
+        f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f)
+
+        c_t & = f_t c_{t-1} + i_t \\tanh(W_{x_c}x_t + W_{h_c}h_{t-1} + b_c)
+
+        o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o)
+
+        h_t & = o_t \\tanh(c_t)
+
+    The inputs of the lstm unit are :math:`x_t`, :math:`h_{t-1}` and
+    :math:`c_{t-1}`. The implementation separates the linear transformation
+    from the non-linear transformation. Here, we take :math:`i_t` as an
+    example. The linear transformation is applied by calling a `fc` layer and
+    the equation is:
+
+    .. math::
+
+        L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i
+
+    The non-linear transformation is applied by calling `lstm_unit_op` and the
+    equation is:
+
+    .. math::
+
+        i_t = \sigma(L_{i_t})
+
+    This layer has two outputs: :math:`c_t` and :math:`h_t`.
+
+    Args:
+        x_t (Variable): The input value of current step.
+        hidden_t_prev (Variable): The hidden value of lstm unit.
+        cell_t_prev (Variable): The cell value of lstm unit.
+        forget_bias (float): The forget bias of lstm unit.
+        main_program (Program): The main program.
+        startup_program (Program): The startup program.
+
+    Returns:
+        tuple: The cell value and hidden value of lstm unit.
+
+    Raises:
+        ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\
+            are not 2, or the 1st dimensions of **x_t**, **hidden_t_prev** \
+            and **cell_t_prev** are not the same.
+
+    Examples:
+
+        .. code-block:: python
+
+            x_t = fluid.layers.fc(input=x_t_data, size=10)
+            prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20)
+            prev_cell = fluid.layers.fc(input=prev_cell_data, size=30)
+            cell_value, hidden_value = fluid.layers.lstm_unit(x_t=x_t,
+                                                    hidden_t_prev=prev_hidden,
+                                                    cell_t_prev=prev_cell)
+    """
+    helper = LayerHelper('lstm_unit', **locals())
+
+    if len(x_t.shape) != 2:
+        raise ValueError("Rank of x_t must be 2.")
+
+    if len(hidden_t_prev.shape) != 2:
+        raise ValueError("Rank of hidden_t_prev must be 2.")
+
+    if len(cell_t_prev.shape) != 2:
+        raise ValueError("Rank of cell_t_prev must be 2.")
+
+    if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
+            0] != cell_t_prev.shape[0]:
+        raise ValueError("The 1st dimension of x_t, hidden_t_prev and "
+                         "cell_t_prev must be the same.")
+
+    size = cell_t_prev.shape[1]
+    concat_out = concat(
+        input=[x_t, hidden_t_prev],
+        axis=1,
+        main_program=main_program,
+        startup_program=startup_program)
+    fc_out = fc(input=concat_out,
+                size=4 * size,
+                main_program=main_program,
+                startup_program=startup_program)
+    dtype = x_t.dtype
+    c = helper.create_tmp_variable(dtype)
+    h = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='lstm_unit',
+        inputs={"X": fc_out,
+                "C_prev": cell_t_prev},
+        outputs={"C": c,
+                 "H": h},
+        attrs={"forget_bias": forget_bias})
+
+    return c, h
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 9b88080158139f..468bd41285526c 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -161,6 +161,23 @@ def test_sigmoid_cross_entropy(self):
                 x=dat, label=lbl))
             print(str(program))
 
+    def test_lstm_unit(self):
+        program = Program()
+        with program_guard(program):
+            x_t_data = layers.data(
+                name='x_t_data', shape=[10, 10], dtype='float32')
+            x_t = layers.fc(input=x_t_data, size=10)
+            prev_hidden_data = layers.data(
+                name='prev_hidden_data', shape=[10, 20], dtype='float32')
+            prev_hidden = layers.fc(input=prev_hidden_data, size=20)
+            prev_cell_data = layers.data(
+                name='prev_cell', shape=[10, 30], dtype='float32')
+            prev_cell = layers.fc(input=prev_cell_data, size=30)
+            self.assertIsNotNone(
+                layers.lstm_unit(
+                    x_t=x_t, hidden_t_prev=prev_hidden, cell_t_prev=prev_cell))
+            print(str(program))
+
 
 if __name__ == '__main__':
     unittest.main()
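The docstring above factors each gate into a linear part (computed by the `fc` layer on the concatenated `[x_t, hidden_t_prev]`) and a non-linear part (computed by `lstm_unit_op`). For intuition, here is a minimal NumPy sketch of the non-linear step; the `(i, f, o, c)` packing order of `X` and the placement of `forget_bias` are illustrative assumptions, not taken from the kernel:

.. code-block:: python

    import numpy as np

    def sigmoid(v):
        return 1.0 / (1.0 + np.exp(-v))

    def lstm_unit_ref(fc_out, c_prev, forget_bias=0.0):
        # fc_out plays the role of the op input `X`: the already-applied
        # linear transformation, with shape (batch, 4 * size).
        # The (i, f, o, c) packing below is an illustrative assumption.
        i, f, o, g = np.split(fc_out, 4, axis=1)
        c = sigmoid(f + forget_bias) * c_prev + sigmoid(i) * np.tanh(g)
        h = sigmoid(o) * np.tanh(c)
        return c, h  # same (cell, hidden) order as this patch

    batch, size = 2, 3
    rng = np.random.RandomState(0)
    c, h = lstm_unit_ref(rng.randn(batch, 4 * size), rng.randn(batch, size),
                         forget_bias=1.0)
    assert c.shape == h.shape == (batch, size)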
From a398e25d6ac786e14aa18be79438b8d2d1b191d0 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Mon, 18 Dec 2017 20:09:36 +0800
Subject: [PATCH 2/6] Expose param_attr and bias_attr.

---
 paddle/operators/lstm_unit_op.cc    | 5 ++++-
 python/paddle/v2/fluid/layers/nn.py | 9 +++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc
index 18b9cdf2a39e82..b6eb33bafe5054 100644
--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -51,7 +51,10 @@ class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
   LstmUnitOpMaker(framework::OpProto* proto,
                   framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "FC input before the non-linear activation.");
+    AddInput("X",
+             "Lstm unit only applies non-linear activations, please make sure "
+             "that the linear transformation has already been applied to `X`. "
+             "A linear transformation can be applied by adding a `fc` layer.");
     AddInput(
         "C_prev",
         "The cell state tensor of last time-step in the Lstm Unit operator.");
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 84e62d988ce9db..1c101c62c2dc4c 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -5,6 +5,7 @@
 from ..layer_helper import LayerHelper
 from ..initializer import Normal, Constant
 from ..framework import Variable
+from ..param_attr import ParamAttr
 from tensor import concat
 
 __all__ = [
@@ -796,6 +797,8 @@ def lstm_unit(x_t,
                hidden_t_prev,
                cell_t_prev,
                forget_bias=0.0,
+               param_attr=None,
+               bias_attr=ParamAttr(),
                main_program=None,
                startup_program=None):
     """LSTM unit layer. The equations of an LSTM step are:
@@ -836,6 +839,10 @@ def lstm_unit(x_t,
         hidden_t_prev (Variable): The hidden value of lstm unit.
         cell_t_prev (Variable): The cell value of lstm unit.
         forget_bias (float): The forget bias of lstm unit.
+        param_attr (ParamAttr): The attributes of parameter weights, used to set
+            initializer, name etc.
+        bias_attr (ParamAttr): The attributes of bias weights, used to set
+            initializer, name etc.
         main_program (Program): The main program.
         startup_program (Program): The startup program.
@@ -882,6 +889,8 @@ def lstm_unit(x_t,
         startup_program=startup_program)
     fc_out = fc(input=concat_out,
                 size=4 * size,
+                param_attr=param_attr,
+                bias_attr=bias_attr,
                 main_program=main_program,
                 startup_program=startup_program)
     dtype = x_t.dtype
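With the attributes exposed, the gate weights and biases can be configured at call time. A hypothetical sketch in the style of test_layers.py follows; the `initializer=`, `scale=`, and `value=` keyword names are assumptions based on the `ParamAttr`, `Normal`, and `Constant` classes that nn.py itself imports, and at this point in the series the return order is still `(cell, hidden)` (a later patch flips it):

.. code-block:: python

    import paddle.v2.fluid.layers as layers
    from paddle.v2.fluid.framework import Program, program_guard
    from paddle.v2.fluid.param_attr import ParamAttr
    from paddle.v2.fluid.initializer import Normal, Constant

    program = Program()
    with program_guard(program):
        x_t_data = layers.data(name='x_t_data', shape=[10, 10], dtype='float32')
        x_t = layers.fc(input=x_t_data, size=10)
        prev_hidden_data = layers.data(
            name='prev_hidden_data', shape=[10, 20], dtype='float32')
        prev_hidden = layers.fc(input=prev_hidden_data, size=20)
        prev_cell_data = layers.data(
            name='prev_cell_data', shape=[10, 20], dtype='float32')
        prev_cell = layers.fc(input=prev_cell_data, size=20)
        # Custom initialization for the gate parameters (assumed kwargs).
        cell_value, hidden_value = layers.lstm_unit(
            x_t=x_t,
            hidden_t_prev=prev_hidden,
            cell_t_prev=prev_cell,
            param_attr=ParamAttr(initializer=Normal(scale=0.01)),
            bias_attr=ParamAttr(initializer=Constant(value=0.0)))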
" + "Linear tranformation can be applied by adding a `fc` layer"); AddInput( "C_prev", "The cell state tensor of last time-step in the Lstm Unit operator."); diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 84e62d988ce9db..1c101c62c2dc4c 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -5,6 +5,7 @@ from ..layer_helper import LayerHelper from ..initializer import Normal, Constant from ..framework import Variable +from ..param_attr import ParamAttr from tensor import concat __all__ = [ @@ -796,6 +797,8 @@ def lstm_unit(x_t, hidden_t_prev, cell_t_prev, forget_bias=0.0, + param_attr=None, + bias_attr=ParamAttr(), main_program=None, startup_program=None): """Lstm unit layer. The equation of a lstm step is: @@ -836,6 +839,10 @@ def lstm_unit(x_t, hidden_t_prev (Variable): The hidden value of lstm unit. cell_t_prev (Variable): The cell value of lstm unit. forget_bias (float): The forget bias of lstm unit. + param_attr (ParamAttr): The attributes of parameter weights, used to set + initializer, name etc. + bias_attr (ParamAttr): The attributes of bias weights, used to set + initializer, name etc. main_program (Program): The main program. startup_program (Program): the startup program. @@ -882,6 +889,8 @@ def lstm_unit(x_t, startup_program=startup_program) fc_out = fc(input=concat_out, size=4 * size, + param_attr=param_attr, + bias_attr=bias_attr, main_program=main_program, startup_program=startup_program) dtype = x_t.dtype From 58d6946c874bbe539ace4fde05e7fb4693f30ca1 Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 19 Dec 2017 11:03:20 +0800 Subject: [PATCH 3/6] Set the act to 'linear'. --- python/paddle/v2/fluid/layers/nn.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 1c101c62c2dc4c..ab443826bd7b44 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -891,6 +891,7 @@ def lstm_unit(x_t, size=4 * size, param_attr=param_attr, bias_attr=bias_attr, + act='linear', main_program=main_program, startup_program=startup_program) dtype = x_t.dtype From d993a4f58b7e2be4a76fda406e964229edff2dcb Mon Sep 17 00:00:00 2001 From: yangyaming Date: Tue, 19 Dec 2017 11:19:24 +0800 Subject: [PATCH 4/6] Change default value for bias_attr. --- python/paddle/v2/fluid/layers/nn.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py index 9728adba735d99..31a0a312dbe12f 100644 --- a/python/paddle/v2/fluid/layers/nn.py +++ b/python/paddle/v2/fluid/layers/nn.py @@ -866,7 +866,7 @@ def lstm_unit(x_t, cell_t_prev, forget_bias=0.0, param_attr=None, - bias_attr=ParamAttr(), + bias_attr=None, main_program=None, startup_program=None): """Lstm unit layer. The equation of a lstm step is: @@ -909,8 +909,8 @@ def lstm_unit(x_t, forget_bias (float): The forget bias of lstm unit. param_attr (ParamAttr): The attributes of parameter weights, used to set initializer, name etc. - bias_attr (ParamAttr): The attributes of bias weights, used to set - initializer, name etc. + bias_attr (ParamAttr): The attributes of bias weights, if not False, + bias weights will be created and be set to default value. main_program (Program): The main program. startup_program (Program): the startup program. 
From d993a4f58b7e2be4a76fda406e964229edff2dcb Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 19 Dec 2017 11:19:24 +0800
Subject: [PATCH 4/6] Change default value for bias_attr.

---
 python/paddle/v2/fluid/layers/nn.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 9728adba735d99..31a0a312dbe12f 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -866,7 +866,7 @@ def lstm_unit(x_t,
                cell_t_prev,
                forget_bias=0.0,
                param_attr=None,
-               bias_attr=ParamAttr(),
+               bias_attr=None,
                main_program=None,
                startup_program=None):
     """LSTM unit layer. The equations of an LSTM step are:
@@ -909,8 +909,8 @@ def lstm_unit(x_t,
         forget_bias (float): The forget bias of lstm unit.
         param_attr (ParamAttr): The attributes of parameter weights, used to set
             initializer, name etc.
-        bias_attr (ParamAttr): The attributes of bias weights, used to set
-            initializer, name etc.
+        bias_attr (ParamAttr): The attributes of bias weights. If not False,
+            bias weights will be created and set to the default value.
         main_program (Program): The main program.
         startup_program (Program): The startup program.
@@ -949,6 +949,9 @@ def lstm_unit(x_t,
         raise ValueError("The 1st dimension of x_t, hidden_t_prev and "
                          "cell_t_prev must be the same.")
 
+    if bias_attr is None:
+        bias_attr = ParamAttr()
+
     size = cell_t_prev.shape[1]

From 9ee9fefd2de46f2383309f489033fc6d94cd8628 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 19 Dec 2017 11:27:35 +0800
Subject: [PATCH 5/6] Change the return order to h, c.

---
 python/paddle/v2/fluid/layers/nn.py         | 8 ++++----
 python/paddle/v2/fluid/tests/test_layers.py | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 31a0a312dbe12f..dd6bb54599af74 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -900,7 +900,7 @@ def lstm_unit(x_t,
 
         i_t = \sigma(L_{i_t})
 
-    This layer has two outputs: :math:`c_t` and :math:`h_t`.
+    This layer has two outputs: :math:`h_t` and :math:`c_t`.
 
     Args:
         x_t (Variable): The input value of current step.
@@ -915,7 +915,7 @@ def lstm_unit(x_t,
         startup_program (Program): The startup program.
 
     Returns:
-        tuple: The cell value and hidden value of lstm unit.
+        tuple: The hidden value and cell value of lstm unit.
 
     Raises:
         ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\
@@ -929,7 +929,7 @@ def lstm_unit(x_t,
             x_t = fluid.layers.fc(input=x_t_data, size=10)
             prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20)
             prev_cell = fluid.layers.fc(input=prev_cell_data, size=30)
-            cell_value, hidden_value = fluid.layers.lstm_unit(x_t=x_t,
+            hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t,
                                                    hidden_t_prev=prev_hidden,
                                                    cell_t_prev=prev_cell)
     """
@@ -977,4 +977,4 @@ def lstm_unit(x_t,
                  "H": h},
         attrs={"forget_bias": forget_bias})
 
-    return c, h
+    return h, c
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index 7b56ae464c633d..d4a95bf6fc98fa 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -161,7 +161,7 @@ def test_sigmoid_cross_entropy(self):
                 x=dat, label=lbl))
             print(str(program))
 
-    def test_seq_expand(self):
+    def test_sequence_expand(self):
         program = Program()
         with program_guard(program):
             x = layers.data(name='x', shape=[10], dtype='float32')
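With the hidden value returned first, the outputs thread directly into the next step's `hidden_t_prev`/`cell_t_prev` when a sequence is unrolled by hand. A hypothetical sketch of such an unrolled recurrence, again in the style of test_layers.py:

.. code-block:: python

    import paddle.v2.fluid.layers as layers
    from paddle.v2.fluid.framework import Program, program_guard

    num_steps, size = 3, 20
    program = Program()
    with program_guard(program):
        init = layers.data(name='init', shape=[10, size], dtype='float32')
        hidden = layers.fc(input=init, size=size)
        cell = layers.fc(input=init, size=size)
        for t in range(num_steps):
            x_data = layers.data(
                name='x_%d' % t, shape=[10, size], dtype='float32')
            x_t = layers.fc(input=x_data, size=size)
            # (h, c) out, (h, c) back in: state threads straight through.
            hidden, cell = layers.lstm_unit(
                x_t=x_t, hidden_t_prev=hidden, cell_t_prev=cell)

Note that each `lstm_unit` call above creates fresh `fc` parameters, so this is not a weight-shared RNN; sharing weights across steps would presumably need a `param_attr` with a fixed name, which this sketch does not attempt.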
From 9573256f9d802dfe1daf9f6887044931ff03f636 Mon Sep 17 00:00:00 2001
From: yangyaming
Date: Tue, 19 Dec 2017 13:24:12 +0800
Subject: [PATCH 6/6] Remove main_program and startup_program.

---
 python/paddle/v2/fluid/layers/nn.py | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index 1d03f357eb2394..2c38c232240fbe 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -764,7 +764,7 @@ def conv2d_transpose(input,
     return out
 
 
-def sequence_expand(x, y, main_program=None, startup_program=None):
+def sequence_expand(x, y):
     """Sequence Expand Layer. This layer will expand the input variable **x**
     according to LoD information of **y**. And the following examples will
     explain how sequence_expand works:
@@ -808,8 +808,6 @@ def sequence_expand(x, y, main_program=None, startup_program=None):
     Args:
         x (Variable): The input variable which is a Tensor or LoDTensor.
         y (Variable): The input variable which is a LoDTensor.
-        main_program (Program): The main program.
-        startup_program (Program): The startup program.
 
     Returns:
         Variable: The expanded variable which is a LoDTensor.
@@ -836,9 +834,7 @@ def lstm_unit(x_t,
               cell_t_prev,
               forget_bias=0.0,
               param_attr=None,
-              bias_attr=None,
-              main_program=None,
-              startup_program=None):
+              bias_attr=None):
     """LSTM unit layer. The equations of an LSTM step are:
@@ -881,8 +877,6 @@ def lstm_unit(x_t,
             initializer, name etc.
         bias_attr (ParamAttr): The attributes of bias weights. If not False,
             bias weights will be created and set to the default value.
-        main_program (Program): The main program.
-        startup_program (Program): The startup program.
 
     Returns:
         tuple: The hidden value and cell value of lstm unit.
@@ -923,18 +917,11 @@ def lstm_unit(x_t,
     if bias_attr is None:
         bias_attr = ParamAttr()
 
     size = cell_t_prev.shape[1]
-    concat_out = concat(
-        input=[x_t, hidden_t_prev],
-        axis=1,
-        main_program=main_program,
-        startup_program=startup_program)
+    concat_out = concat(input=[x_t, hidden_t_prev], axis=1)
     fc_out = fc(input=concat_out,
                 size=4 * size,
                 param_attr=param_attr,
-                bias_attr=bias_attr,
-                act='linear',
-                main_program=main_program,
-                startup_program=startup_program)
+                bias_attr=bias_attr)
     dtype = x_t.dtype
     c = helper.create_tmp_variable(dtype)
    h = helper.create_tmp_variable(dtype)
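After this final patch the wrapper is simply `concat` -> `fc(size=4 * size)` -> the `lstm_unit` op, with the hidden value returned first. The following NumPy emulation of that composition, unrolled over a few steps, ties the series together; the `(i, f, o, c)` gate packing and the single weight matrix `W` are illustrative assumptions, not the op's documented layout:

.. code-block:: python

    import numpy as np

    def sigmoid(v):
        return 1.0 / (1.0 + np.exp(-v))

    def lstm_unit_np(x_t, h_prev, c_prev, W, b, forget_bias=0.0):
        # concat + linear transform: what the wrapper's concat/fc calls build.
        fc_out = np.concatenate([x_t, h_prev], axis=1).dot(W) + b
        i, f, o, g = np.split(fc_out, 4, axis=1)  # assumed gate packing
        c = sigmoid(f + forget_bias) * c_prev + sigmoid(i) * np.tanh(g)
        h = sigmoid(o) * np.tanh(c)
        return h, c  # hidden first, matching the final return order

    batch, in_dim, size, steps = 4, 8, 16, 5
    rng = np.random.RandomState(0)
    W = 0.1 * rng.randn(in_dim + size, 4 * size)
    b = np.zeros(4 * size)
    h = np.zeros((batch, size))
    c = np.zeros((batch, size))
    for _ in range(steps):
        h, c = lstm_unit_np(rng.randn(batch, in_dim), h, c, W, b,
                            forget_bias=1.0)
    print(h.shape, c.shape)  # (4, 16) (4, 16)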