From 3f10e336e67f09f6f0891dce8164dcf3ceb152e4 Mon Sep 17 00:00:00 2001 From: lfchener Date: Tue, 18 Aug 2020 03:26:07 +0000 Subject: [PATCH 1/3] add functional ctc_loss and CTCLoss class. --- .../fluid/tests/unittests/test_warpctc_op.py | 148 +++++++++++++++--- python/paddle/nn/__init__.py | 1 + python/paddle/nn/functional/__init__.py | 3 + python/paddle/nn/functional/loss.py | 119 +++++++++++++- python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/loss.py | 108 ++++++++++++- 6 files changed, 355 insertions(+), 25 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_warpctc_op.py b/python/paddle/fluid/tests/unittests/test_warpctc_op.py index 449ac95918894..6bc42f0712a1a 100644 --- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py +++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py @@ -21,25 +21,25 @@ from test_softmax_op import stable_softmax import paddle.fluid as fluid from paddle.fluid import Program, program_guard +import paddle +import paddle.nn.functional as F CUDA_BLOCK_SIZE = 512 class CTCForward(object): - def __init__(self, softmax, softmax_lod, labels, labels_lod, blank, - norm_by_times): + def __init__(self, softmax, softmax_lod, labels, labels_lod, num_classes, + batch_size, blank, norm_by_times): self.softmax = softmax self.softmax_lod = softmax_lod - assert labels.shape[1] == 1 self.labels = labels self.labels_lod = labels_lod self.blank = blank self.norm_by_times = norm_by_times self.level = 0 - self.num_classes = softmax.shape[1] - self.batch_size = len(softmax_lod[self.level]) - assert self.batch_size == len(labels_lod[self.level]) + self.num_classes = num_classes + self.batch_size = batch_size self.loss = np.zeros([self.batch_size, 1], dtype="float32") self.gradient = np.zeros(self.softmax.shape, dtype="float32") @@ -163,17 +163,25 @@ def forward(self): softmax_offset = 0 labels_offset = 0 for i in range(self.batch_size): - softmax_start_i = softmax_offset - softmax_end_i = softmax_offset + self.softmax_lod[self.level][i] - labels_start_i = labels_offset - labels_end_i = labels_offset + self.labels_lod[self.level][i] - - softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :] - labels_a_sequence = self.labels[labels_start_i:labels_end_i, :] - self.loss[i] = self.forward_a_sequence(softmax_a_sequence, - labels_a_sequence) - softmax_offset += self.softmax_lod[self.level][i] - labels_offset += self.labels_lod[self.level][i] + if self.labels.shape[1] == 1: + softmax_start_i = softmax_offset + softmax_end_i = softmax_offset + self.softmax_lod[self.level][i] + labels_start_i = labels_offset + labels_end_i = labels_offset + self.labels_lod[self.level][i] + + softmax_a_sequence = self.softmax[softmax_start_i: + softmax_end_i, :] + labels_a_sequence = self.labels[labels_start_i:labels_end_i, :] + self.loss[i] = self.forward_a_sequence(softmax_a_sequence, + labels_a_sequence) + softmax_offset += self.softmax_lod[self.level][i] + labels_offset += self.labels_lod[self.level][i] + else: + softmax_a_sequence = self.softmax[:self.softmax_lod[i], i, :] + labels_a_sequence = self.labels[:self.labels_lod[i], :] + self.loss[i] = self.forward_a_sequence(softmax_a_sequence, + labels_a_sequence) + return self.loss @@ -201,7 +209,8 @@ def setUp(self): dtype="int32") ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod, - self.blank, self.norm_by_times) + self.num_classes, self.batch_size, self.blank, + self.norm_by_times) loss = ctc.forward() max_sequence_length = 0 @@ -223,7 +232,7 @@ def setUp(self): } def 
test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() def test_check_grad(self): self.outputs['WarpCTCGrad'] = self.gradient @@ -237,7 +246,7 @@ def config(self): self.num_classes = CUDA_BLOCK_SIZE + 2 self.logits_lod = [[4, 1, 3, 3]] self.labels_lod = [[3, 1, 4, 4]] - self.blank = 0 + self.blank = self.num_classes - 1 self.norm_by_times = False @@ -267,7 +276,8 @@ def setUp(self): dtype="int32") ctc = CTCForward(softmax, self.logits_lod, labels, self.labels_lod, - self.blank, self.norm_by_times) + self.num_classes, self.batch_size, self.blank, + self.norm_by_times) loss = ctc.forward() max_sequence_length = 0 @@ -317,7 +327,7 @@ def setUp(self): } def test_check_output(self): - self.check_output(check_dygraph=False) + self.check_output() def test_check_grad(self): self.outputs['WarpCTCGrad'] = self.gradient @@ -333,7 +343,7 @@ def config(self): self.labels_lod = [[3, 1, 4, 4]] self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64) self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64) - self.blank = 0 + self.blank = self.num_classes - 1 self.norm_by_times = False @@ -389,5 +399,97 @@ def test_label_len_Variable(): self.assertRaises(TypeError, test_label_len_Variable) +class TestCTCLossAPICase(unittest.TestCase): + def test_functinal_api(self): + self.batch_size = 4 + self.num_classes = CUDA_BLOCK_SIZE + 2 + self.logits_length = np.array([4, 1, 3, 3], dtype=np.int64) + self.labels_length = np.array([3, 1, 4, 4], dtype=np.int64) + self.blank = self.num_classes - 1 + self.norm_by_times = False + + logits = np.random.uniform(0.1, 1.0, [ + max(self.logits_length), self.batch_size, self.num_classes + ]).astype("float32") + softmax = np.apply_along_axis(stable_softmax, -1, logits) + # labels should not be blank + labels = np.random.randint( + 0, + self.num_classes - 1, [self.batch_size, max(self.labels_length)], + dtype="int32") + + ctc = CTCForward(softmax, self.logits_length, labels, + self.labels_length, self.num_classes, self.batch_size, + self.blank, self.norm_by_times) + loss_np = ctc.forward() + + paddle.disable_static() + softmax = paddle.to_variable(logits) + labels = paddle.to_variable(labels) + logits_length = paddle.to_variable(self.logits_length) + labels_length = paddle.to_variable(self.labels_length) + loss_pd_mean = F.ctc_loss( + softmax, + labels, + logits_length, + labels_length, + blank=self.blank, + reduction='mean') + loss_pd_mean = loss_pd_mean.numpy() + + loss_pd_sum = F.ctc_loss( + softmax, + labels, + logits_length, + labels_length, + blank=self.blank, + reduction='sum') + loss_pd_sum = loss_pd_sum.numpy() + paddle.enable_static() + loss_np = np.squeeze(loss_np, axis=-1) + loss_np_mean = (loss_np / labels_length.numpy()).mean() + loss_np_sum = loss_np.sum() + + self.assertTrue(np.allclose(loss_pd_mean, loss_np_mean, atol=1)) + self.assertTrue(np.allclose(loss_pd_sum, loss_np_sum, atol=1)) + + def test_class_api(self): + self.batch_size = 3 + self.num_classes = 15 + self.logits_length = np.array([3, 3, 3], dtype=np.int64) + self.labels_length = np.array([0, 1, 2], dtype=np.int64) + self.blank = 0 + self.norm_by_times = False + + logits = np.random.uniform(0.1, 1.0, [ + max(self.logits_length), self.batch_size, self.num_classes + ]).astype("float32") + softmax = np.apply_along_axis(stable_softmax, -1, logits) + # labels should not be blank + labels = np.random.randint( + 1, + self.num_classes, [self.batch_size, max(self.labels_length)], + dtype="int32") + + ctc = CTCForward(softmax, self.logits_length, labels, + 
self.labels_length, self.num_classes, self.batch_size, + self.blank, self.norm_by_times) + loss_np = ctc.forward() + + paddle.disable_static() + softmax = paddle.to_variable(logits) + labels = paddle.to_variable(labels) + logits_length = paddle.to_variable(self.logits_length) + labels_length = paddle.to_variable(self.labels_length) + + loss_pd = paddle.nn.CTCLoss(self.blank, 'none')( + softmax, labels, logits_length, labels_length) + loss_pd = loss_pd.numpy() + paddle.enable_static() + loss_np = np.squeeze(loss_np, axis=-1) + + self.assertTrue(np.allclose(loss_pd, loss_np, atol=1)) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index a52d45521fd1b..a73108281380d 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -88,6 +88,7 @@ from .layer.loss import BCELoss #DEFINE_ALIAS from .layer.loss import KLDivLoss #DEFINE_ALIAS from .layer.loss import MarginRankingLoss #DEFINE_ALIAS +from .layer.loss import CTCLoss #DEFINE_ALIAS from .layer.norm import BatchNorm #DEFINE_ALIAS from .layer.norm import GroupNorm #DEFINE_ALIAS from .layer.norm import LayerNorm #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index fa85b19426cd2..7d25749250d30 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -25,6 +25,8 @@ __all__ += extension.__all__ from . import common __all__ += common.__all__ +from . import loss +__all__ += loss.__all__ from .activation import brelu #DEFINE_ALIAS from .activation import elu #DEFINE_ALIAS from .activation import erf #DEFINE_ALIAS @@ -143,6 +145,7 @@ from .loss import square_error_cost #DEFINE_ALIAS from .loss import ssd_loss #DEFINE_ALIAS from .loss import teacher_student_sigmoid_loss #DEFINE_ALIAS +from .loss import ctc_loss #DEFINE_ALIAS # from .norm import batch_norm #DEFINE_ALIAS # from .norm import data_norm #DEFINE_ALIAS # from .norm import group_norm #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index e08c707b8daa6..fa5b35c164fb9 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -13,6 +13,9 @@ # limitations under the License. import paddle +from ...fluid.layer_helper import LayerHelper +from ...fluid.data_feeder import check_variable_and_dtype +import paddle.fluid as fluid # TODO: define loss functions of neural network import numpy as np @@ -68,7 +71,8 @@ 'softmax_with_cross_entropy', 'square_error_cost', 'ssd_loss', - 'teacher_student_sigmoid_loss' + 'teacher_student_sigmoid_loss', + 'ctc_loss', ] @@ -569,3 +573,116 @@ def mse_loss(input, label, reduction='mean', name=None): return paddle.sum(paddle.fluid.layers.square( paddle.fluid.layers.elementwise_sub(input, label)), name=name) + + +def ctc_loss(log_probs, + labels, + input_lengths, + label_lengths, + blank=0, + reduction='mean'): + """ + :alias_main: paddle.nn.functional.ctc_loss + + An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc) + to compute Connectionist Temporal Classification (CTC) loss. + It can be aliased as softmax with CTC, since a native softmax activation + is interated to the Warp-CTC library to normalize values for each row of the input tensor. + + Parameters: + log_probs (Variable): – The unscaled probabilities of variable-length sequences, + which is a 3-D Tensor. 
The tensor shape is [max_logit_length, batch_size, num_classes + 1], + where max_logit_length is the longest length of input logit sequence. + The data type must be float32. + labels (Variable): The ground truth of variable-length sequence, which must be a 3-D Tensor. + The tensor shape is [batch_size, max_label_length], where max_label_length is + the longest length of label sequence. The data type must be int32. + input_lengths (Variable): The length for each input sequence, + it should have shape [batch_size] and dtype int64. + label_lengths (Variable): The length for each label sequence, + it should have shape [batch_size] and dtype int64. + blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, + which is in the half-opened interval [0, num_classes + 1). + The data type must be int32. Default is 0. + reduction (str, optional): Indicate how to average the loss, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned; + If :attr:`reduction` is ``'none'``, no reduction will be applied; + Default is ``'mean'``. + + Returns: + The Connectionist Temporal Classification (CTC) loss. + + Return type: Variable. + + Examples: + + .. code-block:: python + + # declarative mode + import paddle.nn.functional as F + import numpy as np + import paddle + + # length of the longest logit sequence + max_seq_length = 4 + #length of the longest label sequence + max_label_length = 3 + # number of logit sequences + batch_size = 2 + # class num + class_num = 3 + + np.random.seed(1) + log_probs = np.array([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04], + [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]], + + [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01], + [5.38816750e-01, 4.19194520e-01, 6.85219526e-01]], + + [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02], + [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]], + + [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01], + [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]], + + [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02], + [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]]).astype("float32") + labels = np.array([[1, 2, 2], + [1, 2, 2]]).astype("int32") + input_lengths = np.array([5, 5]).astype("int64") + label_lengths = np.array([3, 3]).astype("int64") + + paddle.disable_static() + log_probs = paddle.to_variable(log_probs) + labels = paddle.to_variable(labels) + input_lengths = paddle.to_variable(input_lengths) + label_lengths = paddle.to_variable(label_lengths) + + loss = F.ctc_loss(log_probs, labels, + input_lengths, + label_lengths, + blank=0, + reduction='none') + print(loss.numpy()) #[3.9179852 2.9076521] + + loss = F.ctc_loss(log_probs, labels, + input_lengths, + label_lengths, + blank=0, + reduction='mean') + print(loss.numpy()) #[1.1376063] + + """ + + loss_out = fluid.layers.warpctc(log_probs, labels, blank, False, + input_lengths, label_lengths) + + loss_out = fluid.layers.squeeze(loss_out, [-1]) + assert reduction in ['mean', 'sum', 'none'] + if reduction == 'mean': + loss_out = paddle.mean(loss_out / label_lengths) + elif reduction == 'sum': + loss_out = paddle.sum(loss_out) + return loss_out diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 9fb8ea78a16ab..6743bd5650e55 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -64,6 +64,7 @@ from .loss import BCELoss #DEFINE_ALIAS 
from .loss import KLDivLoss #DEFINE_ALIAS from .loss import MarginRankingLoss #DEFINE_ALIAS +from .loss import CTCLoss #DEFINE_ALIAS from .norm import BatchNorm #DEFINE_ALIAS from .norm import GroupNorm #DEFINE_ALIAS from .norm import LayerNorm #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index bc4f32f9c3186..76a0ad407624f 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -27,7 +27,8 @@ 'NLLLoss', 'BCELoss', 'KLDivLoss', - 'MarginRankingLoss' + 'MarginRankingLoss', + 'CTCLoss', ] @@ -711,3 +712,108 @@ def forward(self, input, other, label): out = paddle.nn.functional.margin_ranking_loss( input, other, label, self.margin, self.reduction, self.name) return out + + +class CTCLoss(fluid.dygraph.Layer): + """ + :alias_main: paddle.nn.CTCLoss + :alias: paddle.nn.CTCLoss,paddle.nn.layer.CTCLoss,paddle.nn.layer.loss.CTCLoss + + An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc) + to compute Connectionist Temporal Classification (CTC) loss. + It can be aliased as softmax with CTC, since a native softmax activation + is interated to the Warp-CTC library to normalize values for each row of the input tensor. + + Parameters: + blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, + which is in the half-opened interval [0, num_classes + 1). + The data type must be int32. Default is 0. + reduction (str, optional): Indicate how to average the loss, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned; + If :attr:`reduction` is ``'none'``, no reduction will be applied; + Default is ``'mean'``. + + Shape: + log_probs (Variable): – The unscaled probabilities of variable-length sequences, + which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], + where max_logit_length is the longest length of input logit sequence. + The data type must be float32. + labels (Variable): The ground truth of variable-length sequence, which must be a 3-D Tensor. + The tensor shape is [batch_size, max_label_length], where max_label_length is + the longest length of label sequence. The data type must be int32. + input_lengths (Variable): The length for each input sequence, + it should have shape [batch_size] and dtype int64. + label_lengths (Variable): The length for each label sequence, + it should have shape [batch_size] and dtype int64. + + Returns: + The Connectionist Temporal Classification (CTC) loss. + + Return type: Variable. + + Examples: + + .. 
code-block:: python + + # declarative mode + import paddle.nn.functional as F + import numpy as np + import paddle + + # length of the longest logit sequence + max_seq_length = 4 + #length of the longest label sequence + max_label_length = 3 + # number of logit sequences + batch_size = 2 + # class num + class_num = 3 + + np.random.seed(1) + log_probs = np.array([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04], + [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]], + + [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01], + [5.38816750e-01, 4.19194520e-01, 6.85219526e-01]], + + [[2.04452246e-01, 8.78117442e-01, 2.73875929e-02], + [6.70467496e-01, 4.17304814e-01, 5.58689833e-01]], + + [[1.40386939e-01, 1.98101491e-01, 8.00744593e-01], + [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]], + + [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02], + [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]]).astype("float32") + labels = np.array([[1, 2, 2], + [1, 2, 2]]).astype("int32") + input_lengths = np.array([5, 5]).astype("int64") + label_lengths = np.array([3, 3]).astype("int64") + + paddle.disable_static() + log_probs = paddle.to_variable(log_probs) + labels = paddle.to_variable(labels) + input_lengths = paddle.to_variable(input_lengths) + label_lengths = paddle.to_variable(label_lengths) + + loss = paddle.nn.CTCLoss(blank=0, reduction='none')(log_probs, labels, + input_lengths, + label_lengths) + print(loss.numpy()) #[3.9179852 2.9076521] + + loss = paddle.nn.CTCLoss(blank=0, reduction='mean')(log_probs, labels, + input_lengths, + label_lengths) + print(loss.numpy()) #[1.1376063] + """ + + def __init__(self, blank=0, reduction='mean'): + super(CTCLoss, self).__init__() + self.blank = blank + self.reduction = reduction + + def forward(self, log_probs, labels, input_lengths, label_lengths): + return paddle.nn.functional.ctc_loss(log_probs, labels, input_lengths, + label_lengths, self.blank, + self.reduction) From 47f30ab1215a7c8ce9e2858c1c5d4e7283d6d6aa Mon Sep 17 00:00:00 2001 From: lfchener Date: Tue, 18 Aug 2020 07:07:13 +0000 Subject: [PATCH 2/3] modified docstring of ctc_loss and CTCLoss --- python/paddle/nn/functional/loss.py | 23 +++++++++++------------ python/paddle/nn/layer/loss.py | 24 +++++++++++------------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index fa5b35c164fb9..504a9bfc4496a 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -590,31 +590,30 @@ def ctc_loss(log_probs, is interated to the Warp-CTC library to normalize values for each row of the input tensor. Parameters: - log_probs (Variable): – The unscaled probabilities of variable-length sequences, + log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32. - labels (Variable): The ground truth of variable-length sequence, which must be a 3-D Tensor. + labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32. - input_lengths (Variable): The length for each input sequence, + input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64. 
- label_lengths (Variable): The length for each label sequence, + label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64. blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0. - reduction (str, optional): Indicate how to average the loss, + reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. - If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; - If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned; - If :attr:`reduction` is ``'none'``, no reduction will be applied; - Default is ``'mean'``. + If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, + and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; + If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``. Returns: - The Connectionist Temporal Classification (CTC) loss. - - Return type: Variable. + Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. + If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, + the shape of loss is [1]. Data type is the same as ``log_probs``. Examples: diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 76a0ad407624f..9b44d3e877f3f 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -728,37 +728,35 @@ class CTCLoss(fluid.dygraph.Layer): blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-opened interval [0, num_classes + 1). The data type must be int32. Default is 0. - reduction (str, optional): Indicate how to average the loss, + reduction (string, optional): Indicate how to average the loss, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. - If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; - If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned; - If :attr:`reduction` is ``'none'``, no reduction will be applied; - Default is ``'mean'``. + If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, + and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; + If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``. Shape: - log_probs (Variable): – The unscaled probabilities of variable-length sequences, + log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32. - labels (Variable): The ground truth of variable-length sequence, which must be a 3-D Tensor. + labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32. - input_lengths (Variable): The length for each input sequence, + input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64. 
- label_lengths (Variable): The length for each label sequence, + label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64. Returns: - The Connectionist Temporal Classification (CTC) loss. - - Return type: Variable. + Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. + If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, + the shape of loss is [1]. Data type is the same as ``log_probs``. Examples: .. code-block:: python # declarative mode - import paddle.nn.functional as F import numpy as np import paddle From de129d112a7ab3760339145fc7d804a029eeb462 Mon Sep 17 00:00:00 2001 From: lfchener Date: Thu, 20 Aug 2020 02:50:58 +0000 Subject: [PATCH 3/3] modified format of docstr. --- python/paddle/nn/functional/loss.py | 30 +++++++--------------------- python/paddle/nn/layer/loss.py | 31 ++++++++--------------------- 2 files changed, 15 insertions(+), 46 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index 504a9bfc4496a..b45b75c9a44f6 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -582,7 +582,6 @@ def ctc_loss(log_probs, blank=0, reduction='mean'): """ - :alias_main: paddle.nn.functional.ctc_loss An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc) to compute Connectionist Temporal Classification (CTC) loss. @@ -590,30 +589,15 @@ def ctc_loss(log_probs, is interated to the Warp-CTC library to normalize values for each row of the input tensor. Parameters: - log_probs (Tensor): The unscaled probability sequence with padding, - which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], - where max_logit_length is the longest length of input logit sequence. - The data type must be float32. - labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. - The tensor shape is [batch_size, max_label_length], where max_label_length is - the longest length of label sequence. The data type must be int32. - input_lengths (Tensor): The length for each input sequence, - it should have shape [batch_size] and dtype int64. - label_lengths (Tensor): The length for each label sequence, - it should have shape [batch_size] and dtype int64. - blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, - which is in the half-opened interval [0, num_classes + 1). - The data type must be int32. Default is 0. - reduction (string, optional): Indicate how to average the loss, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. - If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, - and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; - If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``. + log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32. + labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32. 
+        input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
+        label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
+        blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-open interval [0, num_classes + 1). The data type must be int32. Default is 0.
+        reduction (string, optional): Indicate how to average the loss, the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of the quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
 
     Returns:
-        Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``.
-        If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise,
-        the shape of loss is [1]. Data type is the same as ``log_probs``.
+        Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If :attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``.
 
     Examples:
 
diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py
index 9b44d3e877f3f..22a1abcdb01e8 100644
--- a/python/paddle/nn/layer/loss.py
+++ b/python/paddle/nn/layer/loss.py
@@ -717,7 +717,7 @@ def forward(self, input, other, label):
 class CTCLoss(fluid.dygraph.Layer):
     """
 	:alias_main: paddle.nn.CTCLoss
-	:alias: paddle.nn.CTCLoss,paddle.nn.layer.CTCLoss,paddle.nn.layer.loss.CTCLoss
+	:alias: paddle.nn.CTCLoss, paddle.nn.layer.CTCLoss, paddle.nn.layer.loss.CTCLoss
 
     An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc)
     to compute Connectionist Temporal Classification (CTC) loss.
@@ -725,32 +725,17 @@ class CTCLoss(fluid.dygraph.Layer):
     is interated to the Warp-CTC library to normalize values for each row of the input tensor.
 
     Parameters:
-        blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss,
-            which is in the half-opened interval [0, num_classes + 1).
-            The data type must be int32. Default is 0.
-        reduction (string, optional): Indicate how to average the loss,
-            the candicates are ``'none'`` | ``'mean'`` | ``'sum'``.
-            If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths,
-            and then return the mean of quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss;
-            If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
+        blank (int, optional): The blank label index of Connectionist Temporal Classification (CTC) loss, which is in the half-open interval [0, num_classes + 1). The data type must be int32. Default is 0.
+        reduction (string, optional): Indicate how to average the loss, the candidates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the output loss will be divided by the label_lengths, and then return the mean of the quotient; If :attr:`reduction` is ``'sum'``, return the sum of loss; If :attr:`reduction` is ``'none'``, no reduction will be applied. Default is ``'mean'``.
 
     Shape:
-        log_probs (Tensor): The unscaled probability sequence with padding,
-            which is a 3-D Tensor.
The tensor shape is [max_logit_length, batch_size, num_classes + 1],
-            where max_logit_length is the longest length of input logit sequence.
-            The data type must be float32.
-        labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor.
-            The tensor shape is [batch_size, max_label_length], where max_label_length is
-            the longest length of label sequence. The data type must be int32.
-        input_lengths (Tensor): The length for each input sequence,
-            it should have shape [batch_size] and dtype int64.
-        label_lengths (Tensor): The length for each label sequence,
-            it should have shape [batch_size] and dtype int64.
+        log_probs (Tensor): The unscaled probability sequence with padding, which is a 3-D Tensor. The tensor shape is [max_logit_length, batch_size, num_classes + 1], where max_logit_length is the longest length of input logit sequence. The data type must be float32.
+        labels (Tensor): The ground truth sequence with padding, which must be a 3-D Tensor. The tensor shape is [batch_size, max_label_length], where max_label_length is the longest length of label sequence. The data type must be int32.
+        input_lengths (Tensor): The length for each input sequence, it should have shape [batch_size] and dtype int64.
+        label_lengths (Tensor): The length for each label sequence, it should have shape [batch_size] and dtype int64.
 
     Returns:
-        Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``.
-        If attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise,
-        the shape of loss is [1]. Data type is the same as ``log_probs``.
+        Tensor, The Connectionist Temporal Classification (CTC) loss between ``log_probs`` and ``labels``. If :attr:`reduction` is ``'none'``, the shape of loss is [batch_size], otherwise, the shape of loss is [1]. Data type is the same as ``log_probs``.
 
     Examples: