From a348883c96c2cad29fb96ddf07b32275e1486d1b Mon Sep 17 00:00:00 2001 From: guosheng Date: Sat, 15 Aug 2020 19:05:50 +0800 Subject: [PATCH 01/17] Add MultiHeadAttention api. test=develop --- python/paddle/nn/layer/transformer.py | 708 ++++++++++++++++++++++++++ 1 file changed, 708 insertions(+) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 2b926b5ab36904..0fb3f5b0c5ee6c 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -14,3 +14,711 @@ # TODO: define the classes of Transformer neural network # __all__ = [ ] + +import numpy as np + +from ...fluid import layers +from ...fluid.dygraph import Layer, Linear +from ...fluid.initializer import Normal +from .. import functional as F +from ...fluid.layers import utils +from ...fluid.layers.utils import map_structure + + +class MultiHeadAttention(Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. + + Please refer to `Attention Is All You Need `_ + for more details. + + Parameters: + embed_dim (int): The expected feature size in the input and output. + num_heads (int): The number of heads in multi-head attention. + dropout (float, optional): The dropout probability used on attention + weights to drop some attention targets. 0 for no dropout. Default 0 + kdim (int, optional): The feature size in key. If None, assumed equal to + `embed_dim`. Default None. + vdim (int, optional): The feature size in key. If None, assumed equal to + `embed_dim`. Default None. + need_weights (bool, optional): Indicate whether to return the attention + weights. Default False. + param_attr(ParamAttr, optional): To specify the weight parameter property. + Default: None, which means the default weight parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + bias_attr (ParamAttr, optional): To specify the bias parameter property. + Default: None, which means the default bias parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + + Examples: + + .. 
code-block:: python + + import paddle + from paddle.incubate.hapi.text import MultiHeadAttention + + # encoder input: [batch_size, sequence_length, d_model] + query = paddle.rand((2, 4, 128)) + # self attention bias: [batch_size, n_head, src_len, src_len] + attn_bias = paddle.rand((2, 2, 4, 4)) + multi_head_attn = MultiHeadAttention(64, 64, 128, n_head=2) + output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] + """ + + def __init__(self, + embed_dim, + num_heads, + dropout=0., + kdim=None, + vdim=None, + need_weights=True, + param_attr=None, + bias_attr=None): + super(MultiHeadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.need_weights = need_weights + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.q_proj = Linear( + input_dim=embed_dim, + output_dim=embed_dim, + param_attr=param_attr, + bias_attr=bias_attr) + self.k_proj = Linear( + input_dim=self.kdim, + output_dim=embed_dim, + param_attr=param_attr, + bias_attr=bias_attr) + self.v_proj = Linear( + input_dim=self.vdim, + output_dim=embed_dim, + param_attr=param_attr, + bias_attr=bias_attr) + self.out_proj = Linear( + input_dim=embed_dim, + output_dim=embed_dim, + param_attr=param_attr, + bias_attr=bias_attr) + + def _prepare_qkv(self, query, key, value, cache=None): + """ + Prapares linear projected queries, keys and values for usage of subsequnt + multiple parallel attention. If `cache` is not None, using cached results + to reduce redundant calculations. + + Parameters: + query (Variable): The queries for multi-head attention. It is a + tensor with shape `[batch_size, sequence_length, embed_dim]`. The + data type should be float32 or float64. + key (Variable): The keys for multi-head attention. It is + a tensor with shape `[batch_size, sequence_length, kdim]`. The + data type should be float32 or float64. If None, use `query` as + `key`. + value (Variable): The values for multi-head attention. It + is a tensor with shape `[batch_size, sequence_length, vdim]`. + The data type should be float32 or float64. If None, use `query` as + `value`. + cache (dict, optional): It is a dict with `k` and `v` as keys or + `static_k` and `static_v` as keys, and values are tensors shaped + `[batch_size, num_heads, length, embed_dim]` which are results of + linear projection, reshape and transpose calculations. If keys are + `k` and `v`, the values reserve intermediate results of previous + positions, and would be updated by new tensors concatanating raw + tensors with results of current position, which mostly used for + decoder self attention. If keys are `static_k` and `static_v`, + `key` and `value` args would be ignored, and the values in dict + would be used as calculated results on `key` and `value`, which + mostly used for decoder-encoder cross attention. It is only used + for inference and should be None for training. Default None. + + Returns: + tuple: A tuple including linear projected keys and values. These two \ + tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ + and `[batch_size, n_head, sequence_length, d_value]` separately, \ + and their data types are same as inputs. 
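        The reshape-and-transpose head split that produces these shapes can be
        sketched with plain NumPy (a minimal illustration with made-up sizes,
        matching the dimensions used in the class example above):

        .. code-block:: python

            import numpy as np

            batch_size, seq_len, num_heads, head_dim = 2, 4, 2, 64
            embed_dim = num_heads * head_dim  # 128

            # output of the q/k/v linear projection: [batch_size, seq_len, embed_dim]
            q = np.random.rand(batch_size, seq_len, embed_dim).astype("float32")

            # split heads: [batch_size, seq_len, num_heads, head_dim]
            q = q.reshape(batch_size, seq_len, num_heads, head_dim)
            # bring the head dim forward: [batch_size, num_heads, seq_len, head_dim]
            q = q.transpose(0, 2, 1, 3)
            assert q.shape == (2, 2, 4, 64)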
+ """ + q = self.q_proj(query) + q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) + q = layers.transpose(x=q, perm=[0, 2, 1, 3]) + + if cache is not None and "static_k" in cache: + # for encoder-decoder attention in inference and has cached + k, v = cache["static_k"], cache["static_v"] + else: + k, v = self.cal_kv(key, value) + + if cache is not None and "static_k" not in cache: + # for decoder self-attention in inference + cache_k, cache_v = cache["k"], cache["v"] + k = layers.concat([cache_k, k], axis=2) + v = layers.concat([cache_v, v], axis=2) + cache["k"], cache["v"] = k, v + + return q, k, v + + def cal_kv(self, key, value): + """ + Applies linear projection on input keys and values, then splits heads + (reshape and transpose) to get keys and values from different representation + subspaces. The results are used as key-values pairs for subsequent multiple + parallel attention. + + It is part of calculations in multi-head attention, and is provided as + a method to prefetch these results, by which we can use them as cache. + + Parameters: + key (Variable, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, sequence_length, kdim]`. The + data type should be float32 or float64. + value (Variable, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, sequence_length, vdim]`. + The data type should be float32 or float64. + + Returns: + tuple: A tuple including linear projected keys and values. Their shapes \ + both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`. \ + and their data types are same as inputs. + """ + k = self.k_proj(key) + v = self.v_proj(value) + k = layers.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) + k = layers.transpose(x=k, perm=[0, 2, 1, 3]) + v = layers.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) + v = layers.transpose(x=v, perm=[0, 2, 1, 3]) + return k, v + + def forward(self, query, key, value, attn_mask=None, cache=None): + """ + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. + + Parameters: + query (Variable): The queries for multi-head attention. It is a + tensor with shape `[batch_size, sequence_length, embed_dim]`. The + data type should be float32 or float64. + key (Variable, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, sequence_length, kdim]`. The + data type should be float32 or float64. If None, use `query` as + `key`. Default None. + value (Variable, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, sequence_length, vdim]`. + The data type should be float32 or float64. If None, use `query` as + `value`. Default None. + attn_mask (Variable, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + cache (dict, optional): It is a dict with `k` and `v` as keys or + `static_k` and `static_v` as keys, and values are tensors shaped + `[batch_size, num_heads, length, embed_dim]` which are results of + linear projection, reshape and transpose calculations. 
If keys are + `k` and `v`, the values reserve intermediate results of previous + positions, and would be updated by new tensors concatanating raw + tensors with results of current position, which mostly used for + decoder self attention. If keys are `static_k` and `static_v`, + `key` and `value` args would be ignored, and the values in dict + would be used as calculated results on `key` and `value`, which + mostly used for decoder-encoder cross attention. It is only used + for inference and should be None for training. Default None. + + Returns: + Variable: The output of multi-head attention. It is a tensor \ + that has the same shape and data type as `queries`. + """ + key = query if key is None else key + value = query if value is None else value + # compute q ,k ,v + q, k, v = self._prepare_qkv(query, key, value, cache) + + # scale dot product attention + product = layers.matmul( + x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) + if attn_mask is not None: + # TODO(guosheng): support bool mask + product = product + attn_mask + weights = layers.softmax(product) + if self.dropout: + weights = layers.dropout( + weights, + dropout_prob=self.dropout, + dropout_implementation="upscale_in_train", + is_test=False) + + out = layers.matmul(weights, v) + + # combine heads + out = layers.transpose(out, perm=[0, 2, 1, 3]) + out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + return (out, weights) if self.need_weights else out + + +class TransformerEncoderLayer(Layer): + """ + TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) + attention and feedforward network. Before and after each sub-layer, pre-process + and post-precess would be applied on the input and output. + + Parameters: + n_head (int): The number of heads in multi-head attention(MHA). + d_key (int): The feature size to transformer queries and keys as in + multi-head attention. Mostly it equals to `d_model // n_head`. + d_value (int): The feature size to transformer values as in multi-head + attention. Mostly it equals to `d_model // n_head`. + d_model (int): The expected feature size in the input and output. + d_inner_hid (int): The hidden layer size in the feedforward network(FFN). + prepostprocess_dropout (float, optional): The dropout probability used + in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 + attention_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. Default 0.1 + relu_dropout (float, optional): The dropout probability used after FFN + activition. Default 0.1 + preprocess_cmd (str, optional): The process applied before each MHA and + FFN sub-layer, and it also would be applied on output of the last + stacked layer. It should be a string composed of `d`, `a`, `n`, + where `d` for dropout, `a` for add residual connection, `n` for + layer normalization. Default `n`. + postprocess_cmd (str, optional): The process applied after each MHA and + FFN sub-layer. Same as `preprocess_cmd`. It should be a string + composed of `d`, `a`, `n`, where `d` for dropout, `a` for add + residual connection, `n` for layer normalization. Default `da`. + ffn_fc1_act (str, optional): The activation function in the feedforward + network. Default relu. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import TransformerEncoderLayer + + # encoder input: [batch_size, src_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # self attention bias: [batch_size, n_head, src_len, src_len] + attn_bias = paddle.rand((2, 2, 4, 4)) + encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512) + enc_output = encoder_layer(enc_input, attn_bias) # [2, 4, 128] + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="relu", + attn_dropout=0.1, + act_dropout=0.1, + norm=True): + + super(TransformerEncoderLayer, self).__init__() + + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act) + self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, src, src_mask=None): + """ + Applies a Transformer encoder layer on the input. + + Parameters: + enc_input (Variable): The input of Transformer encoder layer. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float32 or float64. + attn_bias(Variable, optional): A tensor used in encoder self attention + to mask out attention on unwanted positions, usually the paddings. It + is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be masked out. Default None + + Returns: + Variable: The output of Transformer encoder layer. It is a tensor that \ + has the same shape and data type as `enc_input`. + """ + attn_output = self.self_attn( + self.preprocesser1(enc_input), None, None, attn_bias) + attn_output = self.postprocesser1(attn_output, enc_input) + + ffn_output = self.ffn(self.preprocesser2(attn_output)) + ffn_output = self.postprocesser2(ffn_output, attn_output) + return ffn_output + + +class TransformerCell(Layer): + """ + TransformerCell wraps a Transformer decoder producing logits from `inputs` + composed by ids and position. + + Parameters: + decoder(callable): A TransformerDecoder instance. Or a wrapper of it that + includes a embedding layer accepting ids and positions instead of embeddings + and includes a output layer transforming decoder output features to logits. + embedding_fn(function, optional): A callable that accepts ids and position + as arguments and return embeddings as input of `decoder`. It can be + None if `decoder` includes a embedding layer. Default None. + output_fn(callable, optional): A callable applid on `decoder` output to + transform decoder output features to get logits. Mostly it is a Linear + layer with vocabulary size. It can be None if `decoder` includes a + output layer. Default None. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.fluid.dygraph import Embedding, Linear + from paddle.incubate.hapi.text import TransformerDecoder + from paddle.incubate.hapi.text import TransformerCell + from paddle.incubate.hapi.text import TransformerBeamSearchDecoder + from paddle.incubate.hapi.text import DynamicDecode + + paddle.disable_static() + + class Embedder(fluid.dygraph.Layer): + def __init__(self): + super(Embedder, self).__init__() + self.word_embedder = Embedding(size=[1000, 128]) + self.pos_embedder = Embedding(size=[500, 128]) + + def forward(self, word, position): + return self.word_embedder(word) + self.pos_embedder(position) + + embedder = Embedder() + output_layer = Linear(128, 1000) + decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) + transformer_cell = TransformerCell(decoder, embedder, output_layer) + dynamic_decoder = DynamicDecode( + TransformerBeamSearchDecoder( + transformer_cell, + start_token=0, + end_token=1, + beam_size=4, + var_dim_in_state=2), + max_step_num=10, + is_test=True) + + enc_output = paddle.rand((2, 4, 128)) + # cross attention bias: [batch_size, n_head, trg_len, src_len] + trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) + # inputs for beam search on Transformer + caches = transformer_cell.get_initial_states(enc_output) + enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + enc_output, beam_size=4) + trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + trg_src_attn_bias, beam_size=4) + static_caches = decoder.prepare_static_cache(enc_output) + outputs = dynamic_decoder( + inits=caches, + enc_output=enc_output, + trg_src_attn_bias=trg_src_attn_bias, + static_caches=static_caches) + """ + + def __init__(self, decoder, embed_layer=None, output_layer=None): + super(TransformerCell, self).__init__() + self.decoder = decoder + self.embedding_fn = embedding_fn + self.output_fn = output_fn + + def forward(self, + inputs, + states=None, + enc_output=None, + trg_slf_attn_bias=None, + trg_src_attn_bias=None, + static_caches=[]): + """ + Produces logits from `inputs` composed by ids and positions. + + Parameters: + inputs(tuple): A tuple includes target ids and positions. The two + tensors both have int64 data type and with 2D shape + `[batch_size, sequence_length]` where `sequence_length` is 1 + for inference. + states(list): It caches the multi-head attention intermediate results + of history decoding steps. It is a list of dict where the length + of list is decoder layer number, and each dict has `k` and `v` as + keys and values are cached results. Default None + enc_output(Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, sequence_length, d_model]`. The data type + should be float32 or float64. + trg_slf_attn_bias(Variable, optional): A tensor used in decoder self + attention to mask out attention on unwanted target positions. It + is a tensor with shape `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. It can be None when nothing wanted or needed to + be masked out. It can be None for inference. The data type should + be float32 or float64. Default None + trg_src_attn_bias(Variable, optional): A tensor used in decoder-encoder + cross attention to mask out unwanted attention on source (encoder output). 
+ It is a tensor with shape `[batch_size, n_head, target_length, source_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. It can be None when nothing wanted or needed to + be masked out. The data type should be float32 or float64. Default None + static_caches(list): It stores projected results of encoder output + to be used as keys and values in decoder-encoder cross attention + It is a list of dict where the length of list is decoder layer + number, and each dict has `static_k` and `static_v` as keys and + values are stored results. Default empty list + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \ + is a float32 or float64 3D tensor representing logits shaped \ + `[batch_size, sequence_length, vocab_size]`. `new_states has \ + the same structure and data type with `states` while the length \ + is one larger since the intermediate results of current step are \ + concatenated into it. + """ + trg_word, trg_pos = inputs + if states and static_caches: + for cache, static_cache in zip(states, static_caches): + cache.update(static_cache) + if self.embedding_fn is not None: + dec_input = self.embedding_fn(trg_word, trg_pos) + outputs = self.decoder(dec_input, enc_output, None, + trg_src_attn_bias, states) + else: + outputs = self.decoder(trg_word, trg_pos, enc_output, None, + trg_src_attn_bias, states) + if self.output_fn is not None: + outputs = self.output_fn(outputs) + + new_states = [{ + "k": cache["k"], + "v": cache["v"] + } for cache in states] if states else states + return outputs, new_states + + @property + def state_shape(self): + """ + States of TransformerCell cache the multi-head attention intermediate + results of history decoding steps, and have a increasing length as + decoding continued. + + `state_shape` of TransformerCell is used to initialize states. It is a + list of dict where the length of list is decoder layer, and each dict + has `k` and `v` as keys and values are `[n_head, 0, d_key]`, `[n_head, 0, d_value]` + separately. (-1 for batch size would be automatically inserted into shape). + + Returns: + list: It is a list of dict where the length of list is decoder layer \ + number, and each dict has `k` and `v` as keys and values are cached \ + results. + """ + return [{ + "k": [self.decoder.n_head, 0, self.decoder.d_key], + "v": [self.decoder.n_head, 0, self.decoder.d_value], + } for i in range(self.decoder.n_layer)] + + +class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): + """ + Compared with a RNN step :code:`outputs, new_states = cell(inputs, states)`, + Transformer decoder's `inputs` uses 2D tensor shaped `[batch_size * beam_size, 1]` + and includes extra position data. And its `states` (caches) has increasing + length. These are not consistent with `BeamSearchDecoder`, thus subclass + `BeamSearchDecoder` to make beam search adapt to Transformer decoder. + + Parameters: + cell(TransformerCell): An instance of `TransformerCell`. + start_token(int): The start token id. + end_token(int): The end token id. + beam_size(int): The beam width used in beam search. + var_dim_in_state(int): Indicate which dimension of states is variant. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.fluid.dygraph import Embedding, Linear + from paddle.incubate.hapi.text import TransformerDecoder + from paddle.incubate.hapi.text import TransformerCell + from paddle.incubate.hapi.text import TransformerBeamSearchDecoder + from paddle.incubate.hapi.text import DynamicDecode + + paddle.disable_static() + + class Embedder(fluid.dygraph.Layer): + def __init__(self): + super(Embedder, self).__init__() + self.word_embedder = Embedding(size=[1000, 128]) + self.pos_embedder = Embedding(size=[500, 128]) + + def forward(self, word, position): + return self.word_embedder(word) + self.pos_embedder(position) + + embedder = Embedder() + output_layer = Linear(128, 1000) + decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) + transformer_cell = TransformerCell(decoder, embedder, output_layer) + dynamic_decoder = DynamicDecode( + TransformerBeamSearchDecoder( + transformer_cell, + start_token=0, + end_token=1, + beam_size=4, + var_dim_in_state=2), + max_step_num=10, + is_test=True) + + enc_output = paddle.rand((2, 4, 128)) + # cross attention bias: [batch_size, n_head, trg_len, src_len] + trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) + # inputs for beam search on Transformer + caches = transformer_cell.get_initial_states(enc_output) + enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + enc_output, beam_size=4) + trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + trg_src_attn_bias, beam_size=4) + static_caches = decoder.prepare_static_cache(enc_output) + outputs = dynamic_decoder( + inits=caches, + enc_output=enc_output, + trg_src_attn_bias=trg_src_attn_bias, + static_caches=static_caches) + """ + + def __init__(self, cell, start_token, end_token, beam_size, + var_dim_in_state): + super(TransformerBeamSearchDecoder, + self).__init__(cell, start_token, end_token, beam_size) + self.cell = cell + self.var_dim_in_state = var_dim_in_state + + def _merge_batch_beams_with_var_dim(self, x): + """ + Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new + tensor with shape `[batch_size * beam_size, ...]`. + + Parameters: + x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The + data type should be float32, float64, int32, int64 or bool. + + Returns: + Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \ + data type is same as `x`. + """ + # init length of cache is 0, and it increases with decoding carrying on, + # thus need to reshape elaborately + var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim + x = layers.transpose(x, + list(range(var_dim_in_state, len(x.shape))) + + list(range(0, var_dim_in_state))) + x = layers.reshape( + x, [0] * (len(x.shape) - var_dim_in_state + ) + [self.batch_size * self.beam_size] + + [int(size) for size in x.shape[-var_dim_in_state + 2:]]) + x = layers.transpose( + x, + list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) + + list(range(0, (len(x.shape) + 1 - var_dim_in_state)))) + return x + + def _split_batch_beams_with_var_dim(self, x): + """ + Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new + tensor with shape `[batch_size, beam_size, ...]`. + + Parameters: + x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The + data type should be float32, float64, int32, int64 or bool. + + Returns: + Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \ + data type is same as `x`. 
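        Ignoring the growing cache length, the round trip between the two layouts
        is a plain reshape; a minimal NumPy sketch with made-up sizes (the helpers
        above need the extra transpose/reshape steps because the cache length
        changes at every decoding step and is not known statically):

        .. code-block:: python

            import numpy as np

            batch_size, beam_size, num_heads, cache_len, head_dim = 2, 4, 2, 3, 64

            # split layout: [batch_size, beam_size, num_heads, cache_len, head_dim]
            cache = np.random.rand(batch_size, beam_size, num_heads, cache_len, head_dim)

            # merge batch and beam dims: [batch_size * beam_size, num_heads, cache_len, head_dim]
            # cache_len sits at index 2 here, which is what `var_dim_in_state=2` points at
            merged = cache.reshape(batch_size * beam_size, num_heads, cache_len, head_dim)
            assert merged.shape == (8, 2, 3, 64)

            # split them back: [batch_size, beam_size, num_heads, cache_len, head_dim]
            restored = merged.reshape(batch_size, beam_size, num_heads, cache_len, head_dim)
            assert restored.shape == (2, 4, 2, 3, 64)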
+ """ + var_dim_size = layers.shape(x)[self.var_dim_in_state] + x = layers.reshape( + x, [-1, self.beam_size] + + [int(size) + for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + + [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) + return x + + def step(self, time, inputs, states, **kwargs): + """ + Perform a beam search decoding step, which uses `cell` to get probabilities, + and follows a beam search step to calculate scores and select candidate + token ids. + + Note: compared with `BeamSearchDecoder.step`, it feed 2D id tensor shaped + `[batch_size * beam_size, 1]` rather than `[batch_size * beam_size]` combined + position data as inputs to `cell`. + + Parameters: + time(Variable): An `int64` tensor with shape `[1]` provided by the caller, + representing the current time step number of decoding. + inputs(Variable): A tensor variable. It is same as `initial_inputs` + returned by `initialize()` for the first decoding step and + `next_inputs` returned by `step()` for the others. It is a int64 + id tensor with shape `[batch_size * beam_size]` + states(Variable): A structure of tensor variables. + It is same as the `initial_states` returned by `initialize()` for + the first decoding step and `beam_search_state` returned by + `step()` for the others. + **kwargs: Additional keyword arguments, provided by the caller. + + Returns: + tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \ + `beam_search_state` and `next_inputs` have the same structure, \ + shape and data type as the input arguments `states` and `inputs` separately. \ + `beam_search_output` is a namedtuple(including scores, predicted_ids, \ + parent_ids as fields) of tensor variables, where \ + `scores, predicted_ids, parent_ids` all has a tensor value shaped \ + `[batch_size, beam_size]` with data type `float32, int64, int64`. \ + `finished` is a `bool` tensor with shape `[batch_size, beam_size]`. + """ + # compared to RNN, Transformer has 3D data at every decoding step + inputs = layers.reshape(inputs, [-1, 1]) # token + pos = layers.ones_like(inputs) * time # pos + cell_states = map_structure(self._merge_batch_beams_with_var_dim, + states.cell_states) + + cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, + **kwargs) + + # squeeze to adapt to BeamSearchDecoder which use 2D logits + cell_outputs = map_structure( + lambda x: layers.squeeze(x, [1]) if len(x.shape) == 3 else x, + cell_outputs) + cell_outputs = map_structure(self._split_batch_beams, cell_outputs) + next_cell_states = map_structure(self._split_batch_beams_with_var_dim, + next_cell_states) + + beam_search_output, beam_search_state = self._beam_search_step( + time=time, + logits=cell_outputs, + next_cell_states=next_cell_states, + beam_state=states) + next_inputs, finished = (beam_search_output.predicted_ids, + beam_search_state.finished) + + return (beam_search_output, beam_search_state, next_inputs, finished) From b67bd96e5d17d2887e050807ef6709a6ecf3ff3c Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 17 Aug 2020 11:46:07 +0800 Subject: [PATCH 02/17] Add MultiHeadAttention cache type and gen_cache. 
test=develop --- python/paddle/nn/layer/transformer.py | 167 ++++++++++++++++++-------- 1 file changed, 117 insertions(+), 50 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 0fb3f5b0c5ee6c..bffc04dd9c0154 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -15,11 +15,12 @@ # TODO: define the classes of Transformer neural network # __all__ = [ ] +import collections + import numpy as np from ...fluid import layers from ...fluid.dygraph import Layer, Linear -from ...fluid.initializer import Normal from .. import functional as F from ...fluid.layers import utils from ...fluid.layers.utils import map_structure @@ -57,16 +58,18 @@ class MultiHeadAttention(Layer): .. code-block:: python import paddle - from paddle.incubate.hapi.text import MultiHeadAttention # encoder input: [batch_size, sequence_length, d_model] query = paddle.rand((2, 4, 128)) - # self attention bias: [batch_size, n_head, src_len, src_len] + # self attention bias: [batch_size, num_heads, query_len, query_len] attn_bias = paddle.rand((2, 2, 4, 4)) - multi_head_attn = MultiHeadAttention(64, 64, 128, n_head=2) + multi_head_attn = paddle.MultiHeadAttention(64, 64, 128, n_head=2) output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] """ + Cache = collections.namedtuple("Cache", ["k", "v"]) + StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) + def __init__(self, embed_dim, num_heads, @@ -116,28 +119,28 @@ def _prepare_qkv(self, query, key, value, cache=None): Parameters: query (Variable): The queries for multi-head attention. It is a - tensor with shape `[batch_size, sequence_length, embed_dim]`. The + tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Variable): The keys for multi-head attention. It is - a tensor with shape `[batch_size, sequence_length, kdim]`. The + a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. value (Variable): The values for multi-head attention. It - is a tensor with shape `[batch_size, sequence_length, vdim]`. + is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. - cache (dict, optional): It is a dict with `k` and `v` as keys or - `static_k` and `static_v` as keys, and values are tensors shaped - `[batch_size, num_heads, length, embed_dim]` which are results of - linear projection, reshape and transpose calculations. If keys are - `k` and `v`, the values reserve intermediate results of previous - positions, and would be updated by new tensors concatanating raw - tensors with results of current position, which mostly used for - decoder self attention. If keys are `static_k` and `static_v`, - `key` and `value` args would be ignored, and the values in dict - would be used as calculated results on `key` and `value`, which - mostly used for decoder-encoder cross attention. It is only used - for inference and should be None for training. Default None. + cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): + It is a namedtuple with `k` and `v` as fields, and stores tensors + shaped `[batch_size, num_heads, length, embed_dim]` which are results + of linear projection, reshape and transpose calculations in + MultiHeadAttention. 
If is an instance of `Cache`, `k` and `v` + fields reserve intermediate results of previous positions, which + mostly used for decoder self attention. If it is an instance of + `StaticCache`, `key` and `value` args would be ignored, `k` and + `v` fields would be used as calculated results on `key` and + `value`, which mostly used for decoder-encoder cross attention. + It is only used for inference and should be None for training. + Default None. Returns: tuple: A tuple including linear projected keys and values. These two \ @@ -149,20 +152,19 @@ def _prepare_qkv(self, query, key, value, cache=None): q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) q = layers.transpose(x=q, perm=[0, 2, 1, 3]) - if cache is not None and "static_k" in cache: + if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached - k, v = cache["static_k"], cache["static_v"] + k, v = cache.k, cache.v else: k, v = self.cal_kv(key, value) - if cache is not None and "static_k" not in cache: + if isinstance(cache, self.Cache): # for decoder self-attention in inference - cache_k, cache_v = cache["k"], cache["v"] - k = layers.concat([cache_k, k], axis=2) - v = layers.concat([cache_v, v], axis=2) - cache["k"], cache["v"] = k, v + k = layers.concat([cache.k, k], axis=2) + v = layers.concat([cache.v, v], axis=2) + cache = self.Cache(k, v) - return q, k, v + return (q, k, v) if cache is None else (q, k, v, cache) def cal_kv(self, key, value): """ @@ -172,7 +174,8 @@ def cal_kv(self, key, value): parallel attention. It is part of calculations in multi-head attention, and is provided as - a method to prefetch these results, by which we can use them as cache. + a method to pre-compute and prefetch these results, thus we can use them + to construct cache for inference. Parameters: key (Variable, optional): The keys for multi-head attention. It is @@ -183,8 +186,8 @@ def cal_kv(self, key, value): The data type should be float32 or float64. Returns: - tuple: A tuple including linear projected keys and values. Their shapes \ - both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`. \ + tuple: A tuple including transformed keys and values. Their shapes \ + both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \ and their data types are same as inputs. """ k = self.k_proj(key) @@ -195,6 +198,53 @@ def cal_kv(self, key, value): v = layers.transpose(x=v, perm=[0, 2, 1, 3]) return k, v + def gen_cache(self, key, value=None, type=MultiHeadAttention.Cache): + """ + Generates cache for `forward` usage accroding to arguments. + + If `type` is `StaticCache`, apply `cal_kv(key, value)` and use the results + to create an instance of `StaticCache`. + + If `type` is `Cache` and `value` is None, generate empty tensors shaped + `[batch_size, num_heads, 0, head_dim]` and use the results to create an + instance of `Cache`, where `batch_size` is from the first dimension of `key`. + + If `type` is `Cache` and `value` is not None, use `key`, `value` to create + an instance of `Cache`. + + Parameters: + key (Variable): The keys for multi-head attention. It is + a tensor with shape `[batch_size, key_length, kdim]`. The + data type should be float32 or float64. + value (Variable, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, value_length, vdim]`. + The data type should be float32 or float64. If None, `key` is only + for batch size reference. Default None. 
+ type (type): It should be `MultiHeadAttention.StaticCache` or + `MultiHeadAttention.Cache` to indicate the cache type to generate. + + Returns: + namedtupe: an instance of `Cache` or `StaticCache` accordingly. + """ + if type == MultiHeadAttention.StaticCache: # static_kv + k, v = self.cal_kv(key, value) + return self.StaticCache(k, v) + elif value is None: # incremental_state + k = layers.fill_constant_batch_size_like( + input=key, + shape=[-1, self.num_heads, 0, self.head_dim], + dtype=key.dtype, + value=0) + v = layers.fill_constant_batch_size_like( + input=key, + shape=[-1, self.num_heads, 0, self.head_dim], + dtype=key.dtype, + value=0) + return self.Cache(k, v) + else: + # incremental_state with initial value, mainly for usage like UniLM + return self.Cache(key, value) + def forward(self, query, key, value, attn_mask=None, cache=None): """ Applies multi-head attention to map queries and a set of key-value pairs @@ -202,14 +252,14 @@ def forward(self, query, key, value, attn_mask=None, cache=None): Parameters: query (Variable): The queries for multi-head attention. It is a - tensor with shape `[batch_size, sequence_length, embed_dim]`. The + tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Variable, optional): The keys for multi-head attention. It is - a tensor with shape `[batch_size, sequence_length, kdim]`. The + a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. Default None. value (Variable, optional): The values for multi-head attention. It - is a tensor with shape `[batch_size, sequence_length, vdim]`. + is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. Default None. attn_mask (Variable, optional): A tensor used in multi-head attention @@ -220,27 +270,38 @@ def forward(self, query, key, value, attn_mask=None, cache=None): have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None - cache (dict, optional): It is a dict with `k` and `v` as keys or - `static_k` and `static_v` as keys, and values are tensors shaped - `[batch_size, num_heads, length, embed_dim]` which are results of - linear projection, reshape and transpose calculations. If keys are - `k` and `v`, the values reserve intermediate results of previous - positions, and would be updated by new tensors concatanating raw - tensors with results of current position, which mostly used for - decoder self attention. If keys are `static_k` and `static_v`, - `key` and `value` args would be ignored, and the values in dict - would be used as calculated results on `key` and `value`, which - mostly used for decoder-encoder cross attention. It is only used - for inference and should be None for training. Default None. + cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): + It is a namedtuple with `k` and `v` as fields, and stores tensors + shaped `[batch_size, num_heads, length, embed_dim]` which are results + of linear projection, reshape and transpose calculations in + MultiHeadAttention. If is an instance of `Cache`, `k` and `v` + fields reserve intermediate results of previous positions, which + mostly used for decoder self attention. 
If it is an instance of + `StaticCache`, `key` and `value` args would be ignored, `k` and + `v` fields would be used as calculated results on `key` and + `value`, which mostly used for decoder-encoder cross attention. + It is only used for inference and should be None for training. + Default None. Returns: - Variable: The output of multi-head attention. It is a tensor \ - that has the same shape and data type as `queries`. + Variable|tuple: It is a tensor that has the same shape and data type \ + as `query`, representing attention output. Or a tuple if \ + `need_weights` is True or `cache` is not None. If `need_weights` \ + is True, except for attention output, the tuple also includes \ + the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ + If `cache` is not None, the tuple then includes the new cache \ + having the same type as `cache`, and if it is `StaticCache`, it \ + is same as the input `cache`, if it is `Cache`, the new cache \ + reserves tensors concatanating raw tensors with intermediate \ + results of current query. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v - q, k, v = self._prepare_qkv(query, key, value, cache) + if cache is None: + q, k, v = self._prepare_qkv(query, key, value, cache) + else: + q, k, v, cache = self._prepare_qkv(query, key, value, cache) # scale dot product attention product = layers.matmul( @@ -264,7 +325,13 @@ def forward(self, query, key, value, attn_mask=None, cache=None): # project to output out = self.out_proj(out) - return (out, weights) if self.need_weights else out + + outs = [out] + if self.need_weights: + outs.append(weights) + if cache is not None: + outs.append(cache) + return out if len(outs) else outs class TransformerEncoderLayer(Layer): @@ -323,7 +390,7 @@ def __init__(self, activation="relu", attn_dropout=0.1, act_dropout=0.1, - norm=True): + normalize_before=True): super(TransformerEncoderLayer, self).__init__() From d3c1709af456091553e7dd8f9c3f35d0f7d53f0a Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 18 Aug 2020 12:34:43 +0800 Subject: [PATCH 03/17] Add TransformerEncoderLayer and TransformerEncoder. test=develop --- python/paddle/nn/layer/transformer.py | 271 +++++++++++++++++++------- 1 file changed, 196 insertions(+), 75 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index bffc04dd9c0154..7dbbecb869af76 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -15,18 +15,51 @@ # TODO: define the classes of Transformer neural network # __all__ = [ ] +import copy import collections import numpy as np from ...fluid import layers -from ...fluid.dygraph import Layer, Linear +from ...fluid.param_attr import ParamAttr +from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList from .. import functional as F from ...fluid.layers import utils from ...fluid.layers.utils import map_structure -class MultiHeadAttention(Layer): +def _convert_param_attr_to_list(param_attr, n): + """ + If `param_attr` is a list or tuple, convert every element in it to a + ParamAttr instance. Otherwise, repeat `param_attr` `n` times to + construct a list, and rename every one by appending a increasing index + suffix to avoid having same names when `param_attr` contains a name. + + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. 
+ n (int): The times to repeat to construct a list when `param_attr` + is not a list or tuple. + + Returns: + list: A list composed of each including cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len(param_attr) == n, ( + "length of param_attr should be %d when it is a list/tuple" % n) + param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] + else: + param_attrs = [] + attr = ParamAttr._to_attr(param_attr) + for i in range(n): + attr_i = copy.deepcopy(attr) + if attr.name: + attr_i.name = attr_i.name + "_" + str(i) + param_attrs.append(attr_i) + return param_attrs + + +class MultiheadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending @@ -61,10 +94,10 @@ class MultiHeadAttention(Layer): # encoder input: [batch_size, sequence_length, d_model] query = paddle.rand((2, 4, 128)) - # self attention bias: [batch_size, num_heads, query_len, query_len] - attn_bias = paddle.rand((2, 2, 4, 4)) - multi_head_attn = paddle.MultiHeadAttention(64, 64, 128, n_head=2) - output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] + # self attention mask: [batch_size, num_heads, query_len, query_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + multi_head_attn = paddle.MultiheadAttention(64, 64, 128, n_head=2) + output = multi_head_attn(query, attn_mask=attn_mask) # [2, 4, 128] """ Cache = collections.namedtuple("Cache", ["k", "v"]) @@ -76,10 +109,10 @@ def __init__(self, dropout=0., kdim=None, vdim=None, - need_weights=True, + need_weights=False, param_attr=None, bias_attr=None): - super(MultiHeadAttention, self).__init__() + super(MultiheadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim @@ -129,11 +162,11 @@ def _prepare_qkv(self, query, key, value, cache=None): is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. - cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): + cache (MultiheadAttention.Cache|MultiheadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in - MultiHeadAttention. If is an instance of `Cache`, `k` and `v` + MultiheadAttention. If is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and @@ -198,7 +231,7 @@ def cal_kv(self, key, value): v = layers.transpose(x=v, perm=[0, 2, 1, 3]) return k, v - def gen_cache(self, key, value=None, type=MultiHeadAttention.Cache): + def gen_cache(self, key, value=None, type=MultiheadAttention.Cache): """ Generates cache for `forward` usage accroding to arguments. @@ -215,18 +248,19 @@ def gen_cache(self, key, value=None, type=MultiHeadAttention.Cache): Parameters: key (Variable): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The - data type should be float32 or float64. + data type should be float32 or float64. If `value` is None, + it is only for batch size and data type reference. value (Variable, optional): The values for multi-head attention. 
It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, `key` is only for batch size reference. Default None. - type (type): It should be `MultiHeadAttention.StaticCache` or - `MultiHeadAttention.Cache` to indicate the cache type to generate. + type (type): It should be `MultiheadAttention.StaticCache` or + `MultiheadAttention.Cache` to indicate the cache type to generate. Returns: namedtupe: an instance of `Cache` or `StaticCache` accordingly. """ - if type == MultiHeadAttention.StaticCache: # static_kv + if type == MultiheadAttention.StaticCache: # static_kv k, v = self.cal_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state @@ -270,11 +304,11 @@ def forward(self, query, key, value, attn_mask=None, cache=None): have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None - cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): + cache (MultiheadAttention.Cache|MultiheadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in - MultiHeadAttention. If is an instance of `Cache`, `k` and `v` + MultiheadAttention. If is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and @@ -331,55 +365,59 @@ def forward(self, query, key, value, attn_mask=None, cache=None): outs.append(weights) if cache is not None: outs.append(cache) - return out if len(outs) else outs + return out if len(outs) else tuple(outs) class TransformerEncoderLayer(Layer): """ TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) attention and feedforward network. Before and after each sub-layer, pre-process - and post-precess would be applied on the input and output. + and post-precess would be applied on the input and output accordingly. If + `normalize_before` is True, pre-process is layer normalization and post-precess + includes dropout, residual connection. Otherwise, no pre-process and post-precess + includes dropout, residual connection, layer normalization. Parameters: - n_head (int): The number of heads in multi-head attention(MHA). - d_key (int): The feature size to transformer queries and keys as in - multi-head attention. Mostly it equals to `d_model // n_head`. - d_value (int): The feature size to transformer values as in multi-head - attention. Mostly it equals to `d_model // n_head`. d_model (int): The expected feature size in the input and output. - d_inner_hid (int): The hidden layer size in the feedforward network(FFN). - prepostprocess_dropout (float, optional): The dropout probability used - in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 - attention_dropout (float, optional): The dropout probability used - in MHA to drop some attention target. Default 0.1 - relu_dropout (float, optional): The dropout probability used after FFN - activition. Default 0.1 - preprocess_cmd (str, optional): The process applied before each MHA and - FFN sub-layer, and it also would be applied on output of the last - stacked layer. 
It should be a string composed of `d`, `a`, `n`, - where `d` for dropout, `a` for add residual connection, `n` for - layer normalization. Default `n`. - postprocess_cmd (str, optional): The process applied after each MHA and - FFN sub-layer. Same as `preprocess_cmd`. It should be a string - composed of `d`, `a`, `n`, where `d` for dropout, `a` for add - residual connection, `n` for layer normalization. Default `da`. - ffn_fc1_act (str, optional): The activation function in the feedforward + nhead (int): The number of heads in multi-head attention(MHA). + dim_feedforward (int): The hidden layer size in the feedforward network(FFN). + dropout (float, optional): The dropout probability used in pre-process + and post-precess of MHA and FFN sub-layer. Default 0.1 + activation (str, optional): The activation function in the feedforward network. Default relu. - + attn_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. If None, use the value of + `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `param_attr[0]` would be used as `param_attr` for + MHA, and `param_attr[1]` would be used as `param_attr` for linear in FFN. + Otherwise, MHA and FFN both use it as `param_attr` to create parameters. + Default: None, which means the default weight parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. + If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for + MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. + Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. + Default: None, which means the default bias parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + Examples: .. 
code-block:: python import paddle - import paddle.fluid as fluid - from paddle.incubate.hapi.text import TransformerEncoderLayer + from paddle import TransformerEncoderLayer # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) - # self attention bias: [batch_size, n_head, src_len, src_len] - attn_bias = paddle.rand((2, 2, 4, 4)) - encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512) - enc_output = encoder_layer(enc_input, attn_bias) # [2, 4, 128] + # self attention mask: [batch_size, n_head, src_len, src_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + encoder_layer = TransformerEncoderLayer(128, 2, 512) + enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, @@ -388,51 +426,134 @@ def __init__(self, dim_feedforward, dropout=0.1, activation="relu", - attn_dropout=0.1, - act_dropout=0.1, - normalize_before=True): + attn_dropout=None, + act_dropout=None, + normalize_before=False, + param_attr=None, + bias_attr=None): + self._config = locals() + self._config.pop("self") super(TransformerEncoderLayer, self).__init__() - - self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, - attention_dropout) - self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) - - self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act) - self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + param_attrs = _convert_param_attr_to_list(param_attr, 2) + bias_attrs = _convert_param_attr_to_list(bias_attr, 2) + + self.self_attn = MultiheadAttention( + d_model, + nhead, + dropout=attn_dropout, + param_attr=param_attrs[0], + bias_attr=bias_attrs[0]) + self.linear1 = Linear( + d_model, + dim_feedforward, + param_attr=param_attrs[1], + bias_attr=bias_attrs[1]) + self.dropout = Dropout( + act_dropout, dropout_implementation="upscale_in_train") + self.linear2 = Linear( + dim_feedforward, + d_model, + param_attr=param_attrs[1], + bias_attr=bias_attrs[1]) + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.dropout1 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.dropout2 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.activation = getattr(layers, activation) def forward(self, src, src_mask=None): """ Applies a Transformer encoder layer on the input. Parameters: - enc_input (Variable): The input of Transformer encoder layer. It is + src (Variable): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. - attn_bias(Variable, optional): A tensor used in encoder self attention - to mask out attention on unwanted positions, usually the paddings. It - is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, + src_mask (Variable, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. 
It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, where the unwanted positions have `-INF` values and the others have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None + be None when nothing wanted or needed to be prevented attention to. + Default None Returns: Variable: The output of Transformer encoder layer. It is a tensor that \ has the same shape and data type as `enc_input`. """ - attn_output = self.self_attn( - self.preprocesser1(enc_input), None, None, attn_bias) - attn_output = self.postprocesser1(attn_output, enc_input) + residual = src + if self.normalize_before: + src = self.norm1(src) + # TODO(guosheng): Add cache for encoder for the usage like UniLM + src = self.self_attn(src, src, src, src_mask) + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class TransformerEncoder(Layer): + """ + TransformerEncoder is a stack of N encoder layers. + + Parameters: + encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It + would be used as the first layer, and the other layers would be created + according to the configurations of it. + num_layers (int): The number of encoder layers to be stacked. + norm (LayerNorm, optional): the layer normalization component. If provided, + apply layer normalization on the output of last encoder layer. + + Examples: + + .. code-block:: python + + import paddle + from paddle import TransformerEncoderLayer, TransformerEncoder + + # encoder input: [batch_size, src_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, n_head, src_len, src_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512) + encoder = TransformerEncoder(encoder_layer, 2) + enc_output = encoder(enc_input, attn_mask) # [2, 4, 128] + """ + + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = LayerList([(encoder_layer if i == 0 else + type(encoder_layer)(encoder_layer._config)) + for i in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, src_mask=None): + output = src + + for mod in self.layers: + output = mod(output, src_mask=src_mask) + + if self.norm is not None: + output = self.norm(output) - ffn_output = self.ffn(self.preprocesser2(attn_output)) - ffn_output = self.postprocesser2(ffn_output, attn_output) - return ffn_output + return output class TransformerCell(Layer): From 87e110638d1bd4b1262688e46767ac14396b5bcd Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 18 Aug 2020 21:53:28 +0800 Subject: [PATCH 04/17] Add Transformer decoder apis. 
test=develop --- python/paddle/nn/layer/transformer.py | 454 +++++++++++++++++++++++++- 1 file changed, 444 insertions(+), 10 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 7dbbecb869af76..af50b66c7299c0 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -233,16 +233,36 @@ def cal_kv(self, key, value): def gen_cache(self, key, value=None, type=MultiheadAttention.Cache): """ - Generates cache for `forward` usage accroding to arguments. + Generates cache for `forward` usage in inference accroding to arguments. + The generated cache is an instance of `MultiheadAttention.Cache` or an + instance of `MultiheadAttention.StaticCache`. + + `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields, + and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]` + which are results of linear projection, reshape and transpose calculations + in MultiheadAttention. + + If the generated cache is an instance of `Cache`, `k` and `v` fields + reserve intermediate result tensors of previous positions, and the tensors + are incremental among decoding steps, which mostly are used for decoder + decoder self attention. + + If the generated cache is an instance of `StaticCache`, `k` and `v` fields + would be used as calculated result tensors on keys an values in `forward`, + and the tensors keep unchanged among decoding steps, which are mostly used + for decoder-encoder cross attention. + + The cache is generated as follows: - If `type` is `StaticCache`, apply `cal_kv(key, value)` and use the results - to create an instance of `StaticCache`. + 1. If `type` is `StaticCache`, apply `cal_kv(key, value)` and use the results + to create an instance of `StaticCache`. - If `type` is `Cache` and `value` is None, generate empty tensors shaped - `[batch_size, num_heads, 0, head_dim]` and use the results to create an - instance of `Cache`, where `batch_size` is from the first dimension of `key`. + 2. If `type` is `Cache` and `value` is None, generate empty tensors shaped + `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results + to create an instance of `Cache`, where `batch_size` is from the first + dimension of `key`. - If `type` is `Cache` and `value` is not None, use `key`, `value` to create + 3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create an instance of `Cache`. Parameters: @@ -258,7 +278,7 @@ def gen_cache(self, key, value=None, type=MultiheadAttention.Cache): `MultiheadAttention.Cache` to indicate the cache type to generate. Returns: - namedtupe: an instance of `Cache` or `StaticCache` accordingly. + namedtuple: an instance of `Cache` or `StaticCache` accordingly. """ if type == MultiheadAttention.StaticCache: # static_kv k, v = self.cal_kv(key, value) @@ -308,7 +328,7 @@ def forward(self, query, key, value, attn_mask=None, cache=None): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in - MultiheadAttention. If is an instance of `Cache`, `k` and `v` + MultiheadAttention. If it is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. 
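+
+        As a minimal sketch (assuming `self_attn` and `cross_attn` are
+        `MultiheadAttention` instances and `memory` is the encoder output), the
+        two cache types would typically be created before decoding as:
+
+        .. code-block:: python
+
+            # empty incremental cache for decoder self attention
+            incremental_cache = self_attn.gen_cache(memory, type=MultiheadAttention.Cache)
+            # fixed cache for decoder-encoder cross attention, computed once from memory
+            static_cache = cross_attn.gen_cache(memory, memory, type=MultiheadAttention.StaticCache)
+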
If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and @@ -545,6 +565,28 @@ def __init__(self, encoder_layer, num_layers, norm=None): self.norm = norm def forward(self, src, src_mask=None): + """ + Applies a stack of N Transformer encoder layers on inputs. If `norm` is + provided, also applies layer normalization on the output of last encoder + layer. + + Parameters: + src (Variable): The input of Transformer encoder. It is a tensor + with shape `[batch_size, sequence_length, d_model]`. The data + type should be float32 or float64. + src_mask (Variable, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + + Returns: + Variable: The output of Transformer encoder. It is a tensor that \ + has the same shape and data type as `src`. + """ output = src for mod in self.layers: @@ -556,6 +598,398 @@ def forward(self, src, src_mask=None): return output +class TransformerDecoderLayer(Layer): + """ + TransformerDecoderLayer is composed of three sub-layers which are decoder + self (multi-head) attention, decoder-encoder cross attention and feedforward + network. Before and after each sub-layer, pre-process and post-precess would + be applied on the input and output accordingly. If `normalize_before` is True, + pre-process is layer normalization and post-precess includes dropout, residual + connection. Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. + + Parameters: + d_model (int): The expected feature size in the input and output. + nhead (int): The number of heads in multi-head attention(MHA). + dim_feedforward (int): The hidden layer size in the feedforward network(FFN). + dropout (float, optional): The dropout probability used in pre-process + and post-precess of MHA and FFN sub-layer. Default 0.1 + activation (str, optional): The activation function in the feedforward + network. Default relu. + attn_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. If None, use the value of + `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `param_attr[0]` would be used as `param_attr` for + self attention, `param_attr[1]` would be used as `param_attr` for + cross attention, and `param_attr[2]` would be used as `param_attr` + for linear in FFN. Otherwise, the three sub-layers all uses it as + `param_attr` to create parameters. Default: None, which means the + default weight parameter property is used. See usage for details + in :ref:`api_fluid_ParamAttr` . + bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. 
+ If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for + self attention, `bias_attr[1]` would be used as `bias_attr` for + cross attention, and `bias_attr[2]` would be used as `bias_attr` + for linear in FFN. Otherwise, the three sub-layers all uses it as + `bias_attr` to create parameters. Default: None, which means the + default bias parameter property is used. See usage for details + in :ref:`api_fluid_ParamAttr` . + + Examples: + + .. code-block:: python + + import paddle + from paddle import TransformerDecoderLayer + + # decoder input: [batch_size, tgt_len, d_model] + dec_input = paddle.rand((2, 4, 128)) + # encoder output: [batch_size, src_len, d_model] + enc_output = paddle.rand((2, 6, 128)) + # self attention mask: [batch_size, n_head, tgt_len, tgt_len] + self_attn_mask = paddle.rand((2, 2, 4, 4)) + # cross attention mask: [batch_size, n_head, tgt_len, src_len] + cross_attn_mask = paddle.rand((2, 2, 4, 6)) + decoder_layer = TransformerDecoderLayer(128, 2, 512) + output = decoder_layer(dec_input, + enc_output, + self_attn_mask, + cross_attn_mask) # [2, 4, 128] + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + param_attr=None, + bias_attr=None): + self._config = locals() + self._config.pop("self") + + super(TransformerDecoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + param_attrs = _convert_param_attr_to_list(param_attr, 3) + bias_attrs = _convert_param_attr_to_list(bias_attr, 3) + + self.self_attn = MultiheadAttention( + d_model, + nhead, + dropout=attn_dropout, + param_attr=param_attrs[0], + bias_attr=bias_attrs[0]) + self.cross_attn = MultiheadAttention( + d_model, + nhead, + dropout=attn_dropout, + param_attr=param_attrs[1], + bias_attr=bias_attrs[1]) + self.linear1 = Linear( + d_model, + dim_feedforward, + param_attr=param_attrs[2], + bias_attr=bias_attrs[2]) + self.dropout = Dropout( + act_dropout, dropout_implementation="upscale_in_train") + self.linear2 = Linear( + dim_feedforward, + d_model, + param_attr=param_attrs[2], + bias_attr=bias_attrs[2]) + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.norm3 = LayerNorm(d_model) + self.dropout1 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.dropout2 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.dropout3 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.activation = getattr(layers, activation) + + def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): + """ + Applies a Transformer decoder layer on the input. + + Parameters: + tgt (Variable): The input of Transformer decoder layer. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + memory (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + tgt_mask (Variable, optional): A tensor used in self attention + to prevents attention to some unwanted positions, usually the + the subsequent positions. It is a tensor with shape broadcasted + to `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. 
The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + memory_mask (Variable, optional): A tensor used in decoder-encoder + cross attention to prevents attention to some unwanted positions, + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`, where the + unwanted positions have `-INF` values and the others have 0 values. + The data type should be float32 or float64. It can be None when + nothing wanted or needed to be prevented attention to. Default None + cache (tuple, optional): It is a tuple( :code:`(incremental_cache, static_cache)` ), + `incremental_cache` is an instance of `MultiheadAttention.Cache`, + `static_cache` is an instance of `MultiheadAttention.StaticCache. + See `TransformerDecoderLayer.gen_cache` for more details. It is + only used for inference and should be None for training. Default + None. + + Returns: + Variable|tuple: It is a tensor that has the same shape and data type \ + as `tgt`, representing the output of Transformer decoder layer. \ + Or a tuple if `cache` is not None, except for decoder layer output, \ + the tuple includes the new cache which is same as input `cache` \ + argument but `incremental_cache` in it has an incremental length. \ + See `MultiheadAttention.gen_cache` and `MultiheadAttention.forward` \ + for more details. + """ + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + if cache is None: + tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, None) + else: + tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, + cache[0]) + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm2(tgt) + if cache is None: + tgt = self.cross_attn(tgt, memory, memory, memory_mask, None) + else: + tgt, static_cache = self.cross_attn(tgt, memory, memory, + memory_mask, cache[1]) + tgt = residual + self.dropout2(tgt) + if not self.normalize_before: + tgt = self.norm2(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm3(tgt) + tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = residual + self.dropout3(tgt) + if not self.normalize_before: + tgt = self.norm3(tgt) + return tgt if cache is None else (tgt, (incremental_cache, + static_cache)) + + def gen_cache(self, memory): + """ + Generates cache for `forward` usage. The generated cache is a tuple + composed of an instance of `MultiheadAttention.Cache` and an instance + of `MultiheadAttention.StaticCache`. + + Parameters: + memory (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + + Returns: + tuple: It is a tuple( :code:`(incremental_cache, static_cache)` ). \ + `incremental_cache` is an instance of `MultiheadAttention.Cache` \ + produced by `self_attn.gen_cache(memory, MultiheadAttention.Cache)`, \ + it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. \ + `static_cache` is an instance of `MultiheadAttention.StaticCache` \ + produced by `cross_attn.gen_cache(memory, MultiheadAttention.StaticCache)`, \ + it reserves two tensors shaped `[batch_size, nhead, source_length, d_model // nhead]`. + See `MultiheadAttention.gen_cache` and `MultiheadAttention.forward` \ + for more details. 
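+
+        A rough decoding-loop sketch (assuming `decoder_layer` and `memory` follow
+        the example in the class docstring, `tgt` is the decoder input of the
+        current position and `max_len` is the number of decoding steps):
+
+        .. code-block:: python
+
+            cache = decoder_layer.gen_cache(memory)
+            for _ in range(max_len):
+                tgt, cache = decoder_layer(tgt, memory, None, None, cache)
+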
+ """ + incremental_cache = self.self_attn.gen_cache( + memory, type=self.self_attn.Cache) + static_cache = self.cross_attn.gen_cache( + memory, memory, type=self.cross_attn.StaticCache) + return incremental_cache, static_cache + + +class TransformerDecoder(Layer): + """ + TransformerDecoder is a stack of N decoder layers. + + Parameters: + decoder_layer (Layer): an instance of the `TransformerDecoderLayer`. It + would be used as the first layer, and the other layers would be created + according to the configurations of it. + num_layers (int): The number of decoder layers to be stacked. + norm (LayerNorm, optional): the layer normalization component. If provided, + apply layer normalization on the output of last encoder layer. + + Examples: + + .. code-block:: python + + import paddle + from paddle import TransformerDecoderLayer, TransformerDecoder + + # decoder input: [batch_size, trg_len, d_model] + dec_input = paddle.rand((2, 4, 128)) + # encoder output: [batch_size, src_len, d_model] + enc_output = paddle.rand((2, 6, 128)) + # self attention mask: [batch_size, n_head, trg_len, trg_len] + self_attn_mask = paddle.rand((2, 2, 4, 4)) + # cross attention mask: [batch_size, n_head, trg_len, src_len] + cross_attn_mask = paddle.rand((2, 2, 4, 6)) + decoder_layer = TransformerDecoderLayer(128, 2, 512) + decoder = TransformerDecoder(decoder_layer, 2) + output = decoder(dec_input, + enc_output, + self_attn_mask, + cross_attn_mask) # [2, 4, 128] + """ + + def __init__(self, decoder_layer, num_layers, norm=None): + super(TransformerDecoder, self).__init__() + self.layers = LayerList([(decoder_layer if i == 0 else + type(decoder_layer)(decoder_layer._config)) + for i in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + + def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): + """ + Applies a stack of N Transformer decoder layers on inputs. If `norm` is + provided, also applies layer normalization on the output of last decoder + layer. + + Parameters: + tgt (Variable): The input of Transformer decoder. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + memory (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + tgt_mask (Variable, optional): A tensor used in self attention + to prevents attention to some unwanted positions, usually the + the subsequent positions. It is a tensor with shape broadcasted + to `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + memory_mask (Variable, optional): A tensor used in decoder-encoder + cross attention to prevents attention to some unwanted positions, + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`, where the + unwanted positions have `-INF` values and the others have 0 values. + The data type should be float32 or float64. It can be None when + nothing wanted or needed to be prevented attention to. Default None + cache (list, optional): It is a list, and each element in the list + is a tuple( :code:`(incremental_cache, static_cache)` ). See + `TransformerDecoder.gen_cache` for more details. It is only + used for inference and should be None for training. 
Default None. + + Returns: + Variable|tuple: It is a tensor that has the same shape and data type \ + as `tgt`, representing the output of Transformer decoder. \ + Or a tuple if `cache` is not None, except for decoder output, \ + the tuple includes the new cache which is same as input `cache` \ + argument but `incremental_cache` in it has an incremental length. \ + See `MultiheadAttention.gen_cache` and `MultiheadAttention.forward` \ + for more details. + """ + output = tgt + new_caches = [] + for i, mod in enumerate(self.layers): + if cache is None: + output = mod(output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + cache=None) + else: + output, new_cache = mod(output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + cache=cache[i] + if cache is not None else None) + new_caches.append(new_cache) + + if self.norm is not None: + output = self.norm(output) + + return output if cache is None else (output, new_caches) + + def gen_cache(self, memory): + """ + Generates cache for `forward` usage. The generated cache is a list, and + each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) + produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` + for more details. + + + Parameters: + memory (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + + Returns: + list: It is a list, and each element in the list is a tuple produced \ + by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` \ + for more details. + """ + return [layer.gen_cache(memory) for layer in self.layers] + + +class Transformer(Layer): + def __init__(self, + d_model=512, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + custom_encoder=None, + custom_decoder=None): + super(Transformer, self).__init__() + + if custom_encoder is not None: + self.encoder = custom_encoder + else: + encoder_layer = TransformerEncoderLayer( + d_model, nhead, dim_feedforward, dropout, activation) + encoder_norm = LayerNorm(d_model) + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, + encoder_norm) + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer( + d_model, nhead, dim_feedforward, dropout, activation) + decoder_norm = LayerNorm(d_model) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, + decoder_norm) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + class TransformerCell(Layer): """ TransformerCell wraps a Transformer decoder producing logits from `inputs` @@ -627,7 +1061,7 @@ def forward(self, word, position): static_caches=static_caches) """ - def __init__(self, decoder, embed_layer=None, output_layer=None): + def __init__(self, decoder, embedding_fn=None, output_fn=None): super(TransformerCell, self).__init__() self.decoder = decoder self.embedding_fn = embedding_fn From c50ad434f56d355e6dc148576f8fac2c07dadf53 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 19 Aug 2020 16:32:26 +0800 Subject: [PATCH 05/17] Add Transformer api. 
test=develop --- python/paddle/nn/layer/transformer.py | 178 ++++++++++++++++++++++---- 1 file changed, 156 insertions(+), 22 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index af50b66c7299c0..20410d0bfb6069 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -75,7 +75,7 @@ class MultiheadAttention(Layer): weights to drop some attention targets. 0 for no dropout. Default 0 kdim (int, optional): The feature size in key. If None, assumed equal to `embed_dim`. Default None. - vdim (int, optional): The feature size in key. If None, assumed equal to + vdim (int, optional): The feature size in value. If None, assumed equal to `embed_dim`. Default None. need_weights (bool, optional): Indicate whether to return the attention weights. Default False. @@ -410,8 +410,11 @@ class TransformerEncoderLayer(Layer): `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN activition. If None, use the value of `dropout`. Default None - act_dropout (float, optional): The dropout probability used after FFN - activition. If None, use the value of `dropout`. Default None + normalize_before (bool, optional): Indicate whether to put layer normalization + into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer + normalization and post-precess includes dropout, residual connection. + Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. Default False param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. If it is a tuple, `param_attr[0]` would be used as `param_attr` for MHA, and `param_attr[1]` would be used as `param_attr` for linear in FFN. @@ -621,8 +624,11 @@ class TransformerDecoderLayer(Layer): `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN activition. If None, use the value of `dropout`. Default None - act_dropout (float, optional): The dropout probability used after FFN - activition. If None, use the value of `dropout`. Default None + normalize_before (bool, optional): Indicate whether to put layer normalization + into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer + normalization and post-precess includes dropout, residual connection. + Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. Default False param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. 
If it is a tuple, `param_attr[0]` would be used as `param_attr` for self attention, `param_attr[1]` would be used as `param_attr` for @@ -843,13 +849,13 @@ class TransformerDecoder(Layer): import paddle from paddle import TransformerDecoderLayer, TransformerDecoder - # decoder input: [batch_size, trg_len, d_model] + # decoder input: [batch_size, tgt_len, d_model] dec_input = paddle.rand((2, 4, 128)) # encoder output: [batch_size, src_len, d_model] enc_output = paddle.rand((2, 6, 128)) - # self attention mask: [batch_size, n_head, trg_len, trg_len] + # self attention mask: [batch_size, n_head, tgt_len, tgt_len] self_attn_mask = paddle.rand((2, 2, 4, 4)) - # cross attention mask: [batch_size, n_head, trg_len, src_len] + # cross attention mask: [batch_size, n_head, tgt_len, src_len] cross_attn_mask = paddle.rand((2, 2, 4, 6)) decoder_layer = TransformerDecoderLayer(128, 2, 512) decoder = TransformerDecoder(decoder_layer, 2) @@ -923,8 +929,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): memory, tgt_mask=tgt_mask, memory_mask=memory_mask, - cache=cache[i] - if cache is not None else None) + cache=cache[i]) new_caches.append(new_cache) if self.norm is not None: @@ -947,13 +952,96 @@ def gen_cache(self, memory): Returns: list: It is a list, and each element in the list is a tuple produced \ - by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` \ + by `TransformerDecoderLayer.gen_cache(memory)`. See `TransformerDecoderLayer.gen_cache` \ for more details. """ return [layer.gen_cache(memory) for layer in self.layers] class Transformer(Layer): + """ + A Transformer model composed of an instance of `TransformerEncoder` and an + instance of `TransformerDecoder`. While the embedding layer and output layer + are not included. + + Please refer to `Attention is all you need `_ , + and see `TransformerEncoder` and `TransformerDecoder` for more details. + + Users can configurate the model architecture with corresponding parameters. + Note the usage of `normalize_before` representing where to apply layer + normalization (in pre-process or post-precess of multi-head attention or FFN), + and some transformer like models are different on this, such as + `BERT `_ and `GPT2 `_ . + The default architecture here places layer normalization in pre-process and + applies another layer normalization on the output of last encoder/decoder layer. + + Parameters: + d_model (int): The expected feature size in the encoder/decoder input + and output. + nhead (int): The number of heads in multi-head attention(MHA). + num_encoder_layers (int): The number of layers in encoder. + num_encoder_layers (int): The number of layers in decoder. + dim_feedforward (int): The hidden layer size in the feedforward network(FFN). + dropout (float, optional): The dropout probability used in pre-process + and post-precess of MHA and FFN sub-layer. Default 0.1 + activation (str, optional): The activation function in the feedforward + network. Default relu. + attn_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. If None, use the value of + `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + normalize_before (bool, optional): Indicate whether to put layer normalization + into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer + normalization and post-precess includes dropout, residual connection. 
+ Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. Default False + param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `param_attr[0]` would be used as `param_attr` for + self attention, `param_attr[1]` would be used as `param_attr` for + cross attention, and `param_attr[2]` would be used as `param_attr` + for linear in FFN. Otherwise, the three sub-layers all uses it as + `param_attr` to create parameters. Default: None, which means the + default weight parameter property is used. See usage for details + in :ref:`api_fluid_ParamAttr` . + bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. + If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for + self attention, `bias_attr[1]` would be used as `bias_attr` for + cross attention, and `bias_attr[2]` would be used as `bias_attr` + for linear in FFN. Otherwise, the three sub-layers all uses it as + `bias_attr` to create parameters. Default: None, which means the + default bias parameter property is used. See usage for details + in :ref:`api_fluid_ParamAttr` . + custom_encoder (Layer): If custom encoder is provided, use it as the encoder. + Default None + custom_decoder (Layer): If custom decoder is provided, use it as the decoder. + Default None + + Examples: + + .. code-block:: python + + import paddle + from paddle import Transformer + + # src: [batch_size, tgt_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # tgt: [batch_size, src_len, d_model] + dec_input = paddle.rand((2, 6, 128)) + # src_mask: [batch_size, n_head, src_len, src_len] + enc_self_attn_mask = paddle.rand((2, 2, 4, 4)) + # tgt_mask: [batch_size, n_head, tgt_len, tgt_len] + dec_self_attn_mask = paddle.rand((2, 2, 6, 6)) + # memory_mask: [batch_size, n_head, tgt_len, src_len] + cross_attn_mask = paddle.rand((2, 2, 6, 4)) + transformer = Transformer(128, 2, 4, 4, 512) + output = transformer(dec_input, + enc_output, + enc_self_attn_mask, + dec_self_attn_mask, + cross_attn_mask) # [2, 6, 128] + """ + def __init__(self, d_model=512, nhead=8, @@ -962,6 +1050,11 @@ def __init__(self, dim_feedforward=2048, dropout=0.1, activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + param_attr=None, + bias_attr=None, custom_encoder=None, custom_decoder=None): super(Transformer, self).__init__() @@ -970,7 +1063,9 @@ def __init__(self, self.encoder = custom_encoder else: encoder_layer = TransformerEncoderLayer( - d_model, nhead, dim_feedforward, dropout, activation) + d_model, nhead, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before, param_attr, + bias_attr) encoder_norm = LayerNorm(d_model) self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) @@ -979,27 +1074,66 @@ def __init__(self, self.decoder = custom_decoder else: decoder_layer = TransformerDecoderLayer( - d_model, nhead, dim_feedforward, dropout, activation) + d_model, nhead, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before, param_attr, + bias_attr) decoder_norm = LayerNorm(d_model) self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) - self._reset_parameters() - self.d_model = d_model self.nhead = nhead + def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): + """ + Applies a Transformer model on the inputs. + + Parameters: + src (Variable): The input of Transformer encoder. 
It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + tgt (Variable): The input of Transformer decoder. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + memory (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + tgt_mask (Variable, optional): A tensor used in self attention + to prevents attention to some unwanted positions, usually the + the subsequent positions. It is a tensor with shape broadcasted + to `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + memory_mask (Variable, optional): A tensor used in decoder-encoder + cross attention to prevents attention to some unwanted positions, + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`, where the + unwanted positions have `-INF` values and the others have 0 values. + The data type should be float32 or float64. It can be None when + nothing wanted or needed to be prevented attention to. Default None + + Returns: + Variable: It is a tensor that has the same shape and data type \ + as `tgt`, representing the output of Transformer decoder. + """ + memory = self.encoder(src, mask=src_mask) + output = self.decoder( + tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask) + return output + -class TransformerCell(Layer): +class TransformerDecoderCell(Layer): """ - TransformerCell wraps a Transformer decoder producing logits from `inputs` - composed by ids and position. + TransformerDecoderCell wraps a Transformer decoder producing logits from + `inputs` composed by ids and position. Parameters: decoder(callable): A TransformerDecoder instance. Or a wrapper of it that includes a embedding layer accepting ids and positions instead of embeddings and includes a output layer transforming decoder output features to logits. - embedding_fn(function, optional): A callable that accepts ids and position + embedding_fn(callable, optional): A callable that accepts ids and position as arguments and return embeddings as input of `decoder`. It can be None if `decoder` includes a embedding layer. Default None. 
output_fn(callable, optional): A callable applid on `decoder` output to @@ -1045,7 +1179,7 @@ def forward(self, word, position): is_test=True) enc_output = paddle.rand((2, 4, 128)) - # cross attention bias: [batch_size, n_head, trg_len, src_len] + # cross attention bias: [batch_size, n_head, tgt_len, src_len] trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) # inputs for beam search on Transformer caches = transformer_cell.get_initial_states(enc_output) @@ -1062,7 +1196,7 @@ def forward(self, word, position): """ def __init__(self, decoder, embedding_fn=None, output_fn=None): - super(TransformerCell, self).__init__() + super(TransformerDecoderCell, self).__init__() self.decoder = decoder self.embedding_fn = embedding_fn self.output_fn = output_fn @@ -1212,7 +1346,7 @@ def forward(self, word, position): is_test=True) enc_output = paddle.rand((2, 4, 128)) - # cross attention bias: [batch_size, n_head, trg_len, src_len] + # cross attention bias: [batch_size, n_head, tgt_len, src_len] trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) # inputs for beam search on Transformer caches = transformer_cell.get_initial_states(enc_output) From ad7d2251e163506a9bfdf7d17869095704ea65d2 Mon Sep 17 00:00:00 2001 From: LiuChiaChi <709153940@qq.com> Date: Thu, 20 Aug 2020 06:08:12 +0000 Subject: [PATCH 06/17] add unittests for transformer api --- .../fluid/tests/book/test_transformer_api.py | 365 ++++++++++++++++++ python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/transformer.py | 5 +- 3 files changed, 368 insertions(+), 3 deletions(-) create mode 100644 python/paddle/fluid/tests/book/test_transformer_api.py diff --git a/python/paddle/fluid/tests/book/test_transformer_api.py b/python/paddle/fluid/tests/book/test_transformer_api.py new file mode 100644 index 00000000000000..54e3f62446cc39 --- /dev/null +++ b/python/paddle/fluid/tests/book/test_transformer_api.py @@ -0,0 +1,365 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
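+
+# The helpers below (fc, softmax, scaled_dot_product_attention, layer_norm, ffn,
+# etc.) re-implement the corresponding computations in NumPy; the tests use them
+# as references to check the outputs of the paddle layers.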
+ +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.nn.layer.transformer import MultiheadAttention, TransformerEncoderLayer, TransformerDecoderLayer + +import unittest + + +def generate_basic_params(mode="attn", self_attention=True, verbose=False): + batch_size, query_length = [np.random.randint(2, 10) for _ in range(2)] + d_head, num_heads = [np.random.randint(3, 10) for _ in range(2)] + attn_dropout = 0.0 + embed_dim = d_head * num_heads + if mode == "attn": + if self_attention: + kdim, vdim = embed_dim, embed_dim + key_length, value_length = query_length, query_length + else: + kdim, vdim = [np.random.randint(5, 20) for _ in range(2)] + key_length = np.random.randint(2, 10) + value_length = key_length + return batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout + + else: + dropout, act_dropout = 0.0, 0.0 + dim_feedforward = np.random.randint(128, 1024) + sequence_length = np.random.randint(2, 10) + if mode == "encoder_layer": + return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length + elif mode == "decoder_layer": + target_length = np.random.randint(2, 10) + return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length, target_length + + +def generate_query_key_value_cache(self_attention, + batch_size, + num_heads, + query_length, + embed_dim, + key_length=None, + value_length=None, + kdim=None, + vdim=None, + cache=None): + query = np.random.rand(batch_size, query_length, + embed_dim).astype("float32") + # attn_mask = np.zeros((batch_size, num_heads, query_length, key_length)) + # attn_mask[0][0][0][:10] = -np.inf + attn_mask = None + head_dim = embed_dim // num_heads + if self_attention: + key, value = query, query + else: + key = np.random.rand(batch_size, key_length, kdim).astype("float32") + value = np.random.rand(batch_size, value_length, vdim).astype("float32") + cache_dict = {} + if cache: + if not self_attention: + cache_dict["static_k"] = np.random.rand( + batch_size, num_heads, key_length, head_dim).astype("float32") + cache_dict["static_v"] = np.random.rand( + batch_size, num_heads, value_length, head_dim).astype("float32") + else: + cache_dict["k"] = np.random.rand(batch_size, num_heads, key_length, + head_dim).astype("float32") + cache_dict["v"] = np.random.rand( + batch_size, num_heads, value_length, head_dim).astype("float32") + else: + cache_dict = None + return query, key, value, attn_mask, cache_dict + + +def fc(x, weight): + return np.matmul(x, weight) + + +def softmax(x): + np.seterr(invalid='ignore') + output = np.zeros(x.shape, dtype=np.float64) # ? 
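+    # normalize each [batch, head, query] row along the last (key) axis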
+ for i in range(x.shape[0]): + for j in range(x.shape[1]): + for k in range(x.shape[2]): + x_curr = x[i, j, k, :] + e_x = np.exp(x_curr - np.amax(x_curr)) + output[i, j, k, :] = e_x / np.sum(e_x) + return output + + +def batch_matmul(x, y): + assert x.shape[0] == y.shape[0] + assert x.shape[1] == y.shape[1] + retval = np.zeros( + (x.shape[0], x.shape[1], x.shape[2], y.shape[3]), dtype=np.float64) + for i in range(x.shape[0]): + for j in range(x.shape[1]): + retval[i, j, :, :] = np.matmul(x[i, j, :, :], y[i, j, :, :]) + return retval + + +def scaled_dot_product_attention(q, k, v, d_key, attn_mask, multi_head_attn): + k = k.transpose([0, 1, 3, 2]) + qkt = batch_matmul(q, k / np.sqrt(d_key, dtype=np.float64)) + if attn_mask is not None: + qkt += attn_mask + weight = softmax(qkt) + + attn_heads = batch_matmul(weight, v) + attn_heads = attn_heads.transpose((0, 2, 1, 3)) + attn_heads = attn_heads.reshape((attn_heads.shape[0], attn_heads.shape[1], + attn_heads.shape[2] * attn_heads.shape[3])) + return attn_heads + + +def cal_qkv(key, value, num_heads, embed_dim, multi_head_attn): + with fluid.dygraph.guard(): + head_dim = embed_dim // num_heads + k_weight = multi_head_attn.k_proj.weight.numpy() + v_weight = multi_head_attn.v_proj.weight.numpy() + k = fc(key, k_weight) + v = fc(value, v_weight) + k = k.reshape((k.shape[0], k.shape[1], num_heads, head_dim)) + k = k.transpose((0, 2, 1, 3)) + v = v.reshape((v.shape[0], v.shape[1], num_heads, head_dim)) + v = v.transpose((0, 2, 1, 3)) + return k, v + + +def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention, + multi_head_attn, cache_dict): + q_weight = multi_head_attn.q_proj.weight.numpy() + q = fc(query, q_weight) + q = q.reshape((q.shape[0], q.shape[1], num_heads, embed_dim // num_heads)) + q = q.transpose((0, 2, 1, 3)) + + if not self_attention and cache_dict: + k, v = cache_dict["static_k"], cache_dict["static_v"] + else: + k, v = cal_qkv(key, value, num_heads, embed_dim, multi_head_attn) + if cache_dict is not None: + k = np.concatenate((cache_dict["k"], k), axis=2) + v = np.concatenate((cache_dict["v"], v), axis=2) + return (q, k, v, cache_dict) + +def add(x, y=None): + fluid.enable_dygraph() + with fluid.dygraph.guard(): + x = x.numpy() if not isinstance(x, np.ndarray) else x + if y is not None: + x += y + return x + return x + # print("print type(x) in add", type(x)) + +def relu(x): + compare = x > 0 + return x * compare + +def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None): + fluid.enable_dygraph() + with fluid.dygraph.guard(): + # scale: + weight = norm.weight.numpy() + # shift: + bias = norm.bias.numpy() + + batch_size, src_len, d_model = x.shape + x = x.reshape((batch_size * src_len, d_model)) + mu = np.mean(x, axis=1, keepdims=True) + sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model + x1_up = (x - mu) + x1_down_1 = sigma_squar + epsilon + x1_down = np.sqrt(x1_down_1) + x1_down = x1_down.reshape((x1_down.shape[0], 1)) + x1 = x1_up / x1_down + x_scaled = weight * x1 + x_scaled_bias = x_scaled + bias + x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model)) + return x_scaled_bias + +def ffn(src, encoder_layer, ffn_fc1_act="relu"): + assert ffn_fc1_act == "relu", "only relu is supported" + fluid.enable_dygraph() + with fluid.dygraph.guard(): + src = src.numpy() if not isinstance(src, np.ndarray) else src + w1 = encoder_layer.linear1.weight.numpy() + w2 = encoder_layer.linear2.weight.numpy() + # fc1 + x1 = fc(src, w1) + x1 = relu(x1) + # fc2 + x2 = fc(x1, w2) + return x2 + + +class 
TestTransformer(unittest.TestCase): + def test_multi_head_attention(self): + def multihead_attention_test_helper(self_attention, cache): + paddle.framework.manual_seed(2020) + # 分四种情况:self_attention|cross_attention, cache|No cache + with fluid.dygraph.guard(fluid.CPUPlace()): + for _ in range(100): + # generate params for multi_head_attention + batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params( + "attn", self_attention, False) + query, key, value, attn_mask, cache_dict = generate_query_key_value_cache( + self_attention, batch_size, num_heads, query_length, + embed_dim, key_length, value_length, kdim, vdim, cache) + need_weight, param_attr, bias_attr = False, None, None + # call paddle's function + multi_head_attn = MultiheadAttention( + embed_dim, num_heads, attn_dropout, kdim, vdim, + need_weight, param_attr, bias_attr) + # construct cache object + cache_obj = None + if cache_dict: + if 'k' and 'v' in cache_dict: + cache_obj = multi_head_attn.Cache( + paddle.to_variable(cache_dict['k']), + paddle.to_variable(cache_dict['v'])) + elif 'static_k' and 'static_v' in cache_dict: + cache_obj = multi_head_attn.StaticCache( + paddle.to_variable(cache_dict['static_k']), + paddle.to_variable(cache_dict['static_v'])) + + attn_output = multi_head_attn( + paddle.to_variable(query), + paddle.to_variable(key), + paddle.to_variable(value), attn_mask, cache_obj) + + # implementation by numpy + # compute q, k, v + q, k, v, _ = prepare_qkv(query, key, value, num_heads, + embed_dim, self_attention, + multi_head_attn, cache_dict) + # scale dot product attention + attn_heads = scaled_dot_product_attention( + q, k, v, embed_dim // num_heads, attn_mask, + multi_head_attn) + out_proj_weight = multi_head_attn.out_proj.weight.numpy() + reference = fc(attn_heads, out_proj_weight) + + np.testing.assert_allclose( + attn_output.numpy(), reference, atol=1e-6) + + multihead_attention_test_helper(True, True) + multihead_attention_test_helper(True, False) + multihead_attention_test_helper(False, True) + multihead_attention_test_helper(False, False) + + def test_transformer_encoder_layer(self): + + with fluid.dygraph.guard(fluid.CPUPlace()): + paddle.framework.manual_seed(2020) + for _ in range(100): + # 0.定义默认参数 + ffn_fc1_act = "relu" + # 1.获取基本参数 + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params( + mode="encoder_layer", verbose=False) + # 2.生成输入 + src = np.random.rand(batch_size, sequence_length, + d_model).astype("float32") + residual = src + # src_mask = np.zeros(batch_size, n_head, sequence_length, + # sequence_length).astype(dtype) + # src_mask [0][0][:30] = -np.inf + + # 3.框架的输出 + encoder_layer = TransformerEncoderLayer( + d_model, n_head, dim_feedforward, dropout, ffn_fc1_act, + attn_dropout, act_dropout) + + encoder_output = encoder_layer( + paddle.to_variable(src)) # paddle.to_variable(src_mask)) + # 4.numpy: + # paddle self attention + self_attn = MultiheadAttention( + d_model, n_head, dropout=attn_dropout) + attn_output = self_attn( + paddle.to_variable(src), + paddle.to_variable(src), paddle.to_variable(src)).numpy() + + src = attn_output + residual + src_norm = layer_norm(src, d_model, encoder_layer.norm1) + residual = src_norm + + ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act) + src = residual + ffn_output + src = layer_norm(src, d_model, encoder_layer.norm2) + + np.testing.assert_allclose( + encoder_output.numpy(), src, rtol=1e-5, atol=1e-6) + + def 
test_transformer_decoder_layer(self): + with fluid.dygraph.guard(fluid.CPUPlace()): + paddle.framework.manual_seed(2020) + for _ in range(100): + activation = "relu" + normalize_before = False + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params( + mode="decoder_layer", verbose=False) + tgt = np.random.rand(batch_size, target_length, + d_model).astype("float32") + memory = np.random.rand(batch_size, source_length, + d_model).astype("float32") + tgt_mask = None # TODO + memory_mask = None + # paddle: + decoder_layer = TransformerDecoderLayer( + d_model, n_head, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before) + decoder_output = decoder_layer( + paddle.to_variable(tgt), + paddle.to_variable(memory), tgt_mask, memory_mask).numpy() + # TODO: cache + residual = tgt + self_attn = MultiheadAttention( + d_model, n_head, dropout=attn_dropout) + + tgt = self_attn( + paddle.to_variable(tgt), + paddle.to_variable(tgt), + paddle.to_variable(tgt), tgt_mask, None).numpy() + + tgt = residual + tgt + # postprocess + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm1) + residual = tgt_norm + + cross_attn = MultiheadAttention( + d_model, n_head, dropout=attn_dropout) + tgt = cross_attn( + paddle.to_variable(tgt_norm), + paddle.to_variable(memory), + paddle.to_variable(memory), memory_mask, None).numpy() + # postprocess + tgt = tgt + residual + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm2) + residual = tgt_norm + + ffn_output = ffn(tgt_norm, decoder_layer, activation) + # post process + tgt = residual + ffn_output + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3) + + np.testing.assert_allclose( + decoder_output, tgt_norm, rtol=1e-5, atol=1e-6) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 9fb8ea78a16ab4..6d25e382f7961c 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -21,6 +21,7 @@ from . import activation from . import norm from . import distance +from . import transformer from .activation import * from .loss import * diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index af50b66c7299c0..bfdf53a3ea984d 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -19,7 +19,6 @@ import collections import numpy as np - from ...fluid import layers from ...fluid.param_attr import ParamAttr from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList @@ -182,7 +181,7 @@ def _prepare_qkv(self, query, key, value, cache=None): and their data types are same as inputs. """ q = self.q_proj(query) - q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) + q = layers.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = layers.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): @@ -231,7 +230,7 @@ def cal_kv(self, key, value): v = layers.transpose(x=v, perm=[0, 2, 1, 3]) return k, v - def gen_cache(self, key, value=None, type=MultiheadAttention.Cache): + def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. 
The generated cache is an instance of `MultiheadAttention.Cache` or an From 54e9e56eb3e01c69ed8719e085fbf628c6b9db03 Mon Sep 17 00:00:00 2001 From: LiuChiaChi <709153940@qq.com> Date: Thu, 20 Aug 2020 06:11:16 +0000 Subject: [PATCH 07/17] add unittests for transformer api --- python/paddle/fluid/tests/book/test_transformer_api.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_transformer_api.py b/python/paddle/fluid/tests/book/test_transformer_api.py index 54e3f62446cc39..c75f018c07760c 100644 --- a/python/paddle/fluid/tests/book/test_transformer_api.py +++ b/python/paddle/fluid/tests/book/test_transformer_api.py @@ -155,6 +155,7 @@ def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention, v = np.concatenate((cache_dict["v"], v), axis=2) return (q, k, v, cache_dict) + def add(x, y=None): fluid.enable_dygraph() with fluid.dygraph.guard(): @@ -165,10 +166,12 @@ def add(x, y=None): return x # print("print type(x) in add", type(x)) + def relu(x): compare = x > 0 return x * compare + def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None): fluid.enable_dygraph() with fluid.dygraph.guard(): @@ -191,6 +194,7 @@ def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None): x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model)) return x_scaled_bias + def ffn(src, encoder_layer, ffn_fc1_act="relu"): assert ffn_fc1_act == "relu", "only relu is supported" fluid.enable_dygraph() @@ -218,7 +222,7 @@ def multihead_attention_test_helper(self_attention, cache): "attn", self_attention, False) query, key, value, attn_mask, cache_dict = generate_query_key_value_cache( self_attention, batch_size, num_heads, query_length, - embed_dim, key_length, value_length, kdim, vdim, cache) + embed_dim, key_length, value_length, kdim, vdim, cache) need_weight, param_attr, bias_attr = False, None, None # call paddle's function multi_head_attn = MultiheadAttention( @@ -285,7 +289,7 @@ def test_transformer_encoder_layer(self): attn_dropout, act_dropout) encoder_output = encoder_layer( - paddle.to_variable(src)) # paddle.to_variable(src_mask)) + paddle.to_variable(src)) # paddle.to_variable(src_mask)) # 4.numpy: # paddle self attention self_attn = MultiheadAttention( @@ -362,4 +366,4 @@ def test_transformer_decoder_layer(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From 8637eeeb0430f6f5ede3a2668212a0b2b5733a06 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 20 Aug 2020 21:28:40 +0800 Subject: [PATCH 08/17] Fix some bugs in Transformer apis. test=develop --- python/paddle/nn/layer/transformer.py | 78 +++++++++++++++------------ 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index bbb638b62c7be4..7d050a47e3cf02 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -19,6 +19,7 @@ import collections import numpy as np + from ...fluid import layers from ...fluid.param_attr import ParamAttr from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList @@ -230,7 +231,7 @@ def cal_kv(self, key, value): v = layers.transpose(x=v, perm=[0, 2, 1, 3]) return k, v - def gen_cache(self, key, value=None, type=Cache): + def gen_cache(self, key, value=None, type=MultiheadAttention.Cache): """ Generates cache for `forward` usage in inference accroding to arguments. 
The generated cache is an instance of `MultiheadAttention.Cache` or an @@ -384,7 +385,7 @@ def forward(self, query, key, value, attn_mask=None, cache=None): outs.append(weights) if cache is not None: outs.append(cache) - return out if len(outs) else tuple(outs) + return out if len(outs) == 1 else tuple(outs) class TransformerEncoderLayer(Layer): @@ -455,6 +456,7 @@ def __init__(self, bias_attr=None): self._config = locals() self._config.pop("self") + self._config.pop("__class__", None) # py3 super(TransformerEncoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout @@ -553,7 +555,7 @@ class TransformerEncoder(Layer): enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) - encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512) + encoder_layer = TransformerEncoderLayer(128, 2, 512) encoder = TransformerEncoder(encoder_layer, 2) enc_output = encoder(enc_input, attn_mask) # [2, 4, 128] """ @@ -561,7 +563,7 @@ class TransformerEncoder(Layer): def __init__(self, encoder_layer, num_layers, norm=None): super(TransformerEncoder, self).__init__() self.layers = LayerList([(encoder_layer if i == 0 else - type(encoder_layer)(encoder_layer._config)) + type(encoder_layer)(**encoder_layer._config)) for i in range(num_layers)]) self.num_layers = num_layers self.norm = norm @@ -680,6 +682,7 @@ def __init__(self, bias_attr=None): self._config = locals() self._config.pop("self") + self._config.pop("__class__", None) # py3 super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout @@ -867,7 +870,7 @@ class TransformerDecoder(Layer): def __init__(self, decoder_layer, num_layers, norm=None): super(TransformerDecoder, self).__init__() self.layers = LayerList([(decoder_layer if i == 0 else - type(decoder_layer)(decoder_layer._config)) + type(decoder_layer)(**decoder_layer._config)) for i in range(num_layers)]) self.num_layers = num_layers self.norm = norm @@ -1034,8 +1037,8 @@ class Transformer(Layer): # memory_mask: [batch_size, n_head, tgt_len, src_len] cross_attn_mask = paddle.rand((2, 2, 6, 4)) transformer = Transformer(128, 2, 4, 4, 512) - output = transformer(dec_input, - enc_output, + output = transformer(enc_input, + dec_input, enc_self_attn_mask, dec_self_attn_mask, cross_attn_mask) # [2, 6, 128] @@ -1125,8 +1128,11 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): class TransformerDecoderCell(Layer): """ - TransformerDecoderCell wraps a Transformer decoder producing logits from - `inputs` composed by ids and position. + TransformerDecoderCell wraps a Transformer decoder combined with an embedding + layer and output layer to produce logits from symbols (ids and position here). + It is analogy to `RNNCell` and `outputs, new_states = cell(inputs, states, *kwargs)`, + where `inputs` is composed of word ids and position, `states` is `cache`, + `kwargs` includes `memory, `tgt_mask`, `memory_mask` and `static_cache`. Parameters: decoder(callable): A TransformerDecoder instance. Or a wrapper of it that @@ -1202,11 +1208,11 @@ def __init__(self, decoder, embedding_fn=None, output_fn=None): def forward(self, inputs, - states=None, - enc_output=None, - trg_slf_attn_bias=None, - trg_src_attn_bias=None, - static_caches=[]): + cache=None, + memory=None, + tgt_mask=None, + memory_mask=None, + static_cache=[]): """ Produces logits from `inputs` composed by ids and positions. 
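With the renamed arguments, a single decoding step would roughly look like the
following sketch (`trg_word`, `trg_pos`, `enc_output`, `caches`, `static_caches`
and `trg_src_attn_bias` are assumed to be prepared as in the class docstring
example; `tgt_mask` is left None at inference):

.. code-block:: python

    logits, new_caches = transformer_cell(
        (trg_word, trg_pos),
        cache=caches,
        memory=enc_output,
        memory_mask=trg_src_attn_bias,
        static_cache=static_caches)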
@@ -1215,27 +1221,29 @@ def forward(self, tensors both have int64 data type and with 2D shape `[batch_size, sequence_length]` where `sequence_length` is 1 for inference. - states(list): It caches the multi-head attention intermediate results + cache(list): It caches the multi-head attention intermediate results of history decoding steps. It is a list of dict where the length of list is decoder layer number, and each dict has `k` and `v` as keys and values are cached results. Default None - enc_output(Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, sequence_length, d_model]`. The data type + memory (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. - trg_slf_attn_bias(Variable, optional): A tensor used in decoder self - attention to mask out attention on unwanted target positions. It - is a tensor with shape `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. It can be None when nothing wanted or needed to - be masked out. It can be None for inference. The data type should - be float32 or float64. Default None - trg_src_attn_bias(Variable, optional): A tensor used in decoder-encoder - cross attention to mask out unwanted attention on source (encoder output). - It is a tensor with shape `[batch_size, n_head, target_length, source_length]`, + tgt_mask (Variable, optional): A tensor used in self attention + to prevents attention to some unwanted positions, usually the + the subsequent positions. It is a tensor with shape broadcasted + to `[batch_size, n_head, target_length, target_length]`, where the unwanted positions have `-INF` values and the others - have 0 values. It can be None when nothing wanted or needed to - be masked out. The data type should be float32 or float64. Default None - static_caches(list): It stores projected results of encoder output + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + memory_mask (Variable, optional): A tensor used in decoder-encoder + cross attention to prevents attention to some unwanted positions, + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`, where the + unwanted positions have `-INF` values and the others have 0 values. + The data type should be float32 or float64. It can be None when + nothing wanted or needed to be prevented attention to. Default None + static_cache(list): It stores transformed results of encoder output to be used as keys and values in decoder-encoder cross attention It is a list of dict where the length of list is decoder layer number, and each dict has `static_k` and `static_v` as keys and @@ -1250,16 +1258,16 @@ def forward(self, concatenated into it. 
""" trg_word, trg_pos = inputs - if states and static_caches: + if cache and static_cache: for cache, static_cache in zip(states, static_caches): cache.update(static_cache) if self.embedding_fn is not None: dec_input = self.embedding_fn(trg_word, trg_pos) - outputs = self.decoder(dec_input, enc_output, None, - trg_src_attn_bias, states) + outputs = self.decoder(dec_input, memory, tgt_mask, memory_mask, + states) else: - outputs = self.decoder(trg_word, trg_pos, enc_output, None, - trg_src_attn_bias, states) + outputs = self.decoder(trg_word, trg_pos, memory, tgt_mask, + memory_mask, states) if self.output_fn is not None: outputs = self.output_fn(outputs) From 33741e8df9da8a056f21d94a4f24623af890f17e Mon Sep 17 00:00:00 2001 From: LiuChiaChi <709153940@qq.com> Date: Thu, 20 Aug 2020 13:38:39 +0000 Subject: [PATCH 09/17] add unittests for encoder, decoder and transformer --- .../fluid/tests/book/test_transformer_api.py | 369 -------------- .../tests/unittests/test_transformer_api.py | 477 ++++++++++++++++++ python/paddle/nn/layer/transformer.py | 11 +- 3 files changed, 483 insertions(+), 374 deletions(-) delete mode 100644 python/paddle/fluid/tests/book/test_transformer_api.py create mode 100644 python/paddle/fluid/tests/unittests/test_transformer_api.py diff --git a/python/paddle/fluid/tests/book/test_transformer_api.py b/python/paddle/fluid/tests/book/test_transformer_api.py deleted file mode 100644 index c75f018c07760c..00000000000000 --- a/python/paddle/fluid/tests/book/test_transformer_api.py +++ /dev/null @@ -1,369 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np -import paddle -import paddle.fluid as fluid -from paddle.nn.layer.transformer import MultiheadAttention, TransformerEncoderLayer, TransformerDecoderLayer - -import unittest - - -def generate_basic_params(mode="attn", self_attention=True, verbose=False): - batch_size, query_length = [np.random.randint(2, 10) for _ in range(2)] - d_head, num_heads = [np.random.randint(3, 10) for _ in range(2)] - attn_dropout = 0.0 - embed_dim = d_head * num_heads - if mode == "attn": - if self_attention: - kdim, vdim = embed_dim, embed_dim - key_length, value_length = query_length, query_length - else: - kdim, vdim = [np.random.randint(5, 20) for _ in range(2)] - key_length = np.random.randint(2, 10) - value_length = key_length - return batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout - - else: - dropout, act_dropout = 0.0, 0.0 - dim_feedforward = np.random.randint(128, 1024) - sequence_length = np.random.randint(2, 10) - if mode == "encoder_layer": - return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length - elif mode == "decoder_layer": - target_length = np.random.randint(2, 10) - return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length, target_length - - -def generate_query_key_value_cache(self_attention, - batch_size, - num_heads, - query_length, - embed_dim, - key_length=None, - value_length=None, - kdim=None, - vdim=None, - cache=None): - query = np.random.rand(batch_size, query_length, - embed_dim).astype("float32") - # attn_mask = np.zeros((batch_size, num_heads, query_length, key_length)) - # attn_mask[0][0][0][:10] = -np.inf - attn_mask = None - head_dim = embed_dim // num_heads - if self_attention: - key, value = query, query - else: - key = np.random.rand(batch_size, key_length, kdim).astype("float32") - value = np.random.rand(batch_size, value_length, vdim).astype("float32") - cache_dict = {} - if cache: - if not self_attention: - cache_dict["static_k"] = np.random.rand( - batch_size, num_heads, key_length, head_dim).astype("float32") - cache_dict["static_v"] = np.random.rand( - batch_size, num_heads, value_length, head_dim).astype("float32") - else: - cache_dict["k"] = np.random.rand(batch_size, num_heads, key_length, - head_dim).astype("float32") - cache_dict["v"] = np.random.rand( - batch_size, num_heads, value_length, head_dim).astype("float32") - else: - cache_dict = None - return query, key, value, attn_mask, cache_dict - - -def fc(x, weight): - return np.matmul(x, weight) - - -def softmax(x): - np.seterr(invalid='ignore') - output = np.zeros(x.shape, dtype=np.float64) # ? 
- for i in range(x.shape[0]): - for j in range(x.shape[1]): - for k in range(x.shape[2]): - x_curr = x[i, j, k, :] - e_x = np.exp(x_curr - np.amax(x_curr)) - output[i, j, k, :] = e_x / np.sum(e_x) - return output - - -def batch_matmul(x, y): - assert x.shape[0] == y.shape[0] - assert x.shape[1] == y.shape[1] - retval = np.zeros( - (x.shape[0], x.shape[1], x.shape[2], y.shape[3]), dtype=np.float64) - for i in range(x.shape[0]): - for j in range(x.shape[1]): - retval[i, j, :, :] = np.matmul(x[i, j, :, :], y[i, j, :, :]) - return retval - - -def scaled_dot_product_attention(q, k, v, d_key, attn_mask, multi_head_attn): - k = k.transpose([0, 1, 3, 2]) - qkt = batch_matmul(q, k / np.sqrt(d_key, dtype=np.float64)) - if attn_mask is not None: - qkt += attn_mask - weight = softmax(qkt) - - attn_heads = batch_matmul(weight, v) - attn_heads = attn_heads.transpose((0, 2, 1, 3)) - attn_heads = attn_heads.reshape((attn_heads.shape[0], attn_heads.shape[1], - attn_heads.shape[2] * attn_heads.shape[3])) - return attn_heads - - -def cal_qkv(key, value, num_heads, embed_dim, multi_head_attn): - with fluid.dygraph.guard(): - head_dim = embed_dim // num_heads - k_weight = multi_head_attn.k_proj.weight.numpy() - v_weight = multi_head_attn.v_proj.weight.numpy() - k = fc(key, k_weight) - v = fc(value, v_weight) - k = k.reshape((k.shape[0], k.shape[1], num_heads, head_dim)) - k = k.transpose((0, 2, 1, 3)) - v = v.reshape((v.shape[0], v.shape[1], num_heads, head_dim)) - v = v.transpose((0, 2, 1, 3)) - return k, v - - -def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention, - multi_head_attn, cache_dict): - q_weight = multi_head_attn.q_proj.weight.numpy() - q = fc(query, q_weight) - q = q.reshape((q.shape[0], q.shape[1], num_heads, embed_dim // num_heads)) - q = q.transpose((0, 2, 1, 3)) - - if not self_attention and cache_dict: - k, v = cache_dict["static_k"], cache_dict["static_v"] - else: - k, v = cal_qkv(key, value, num_heads, embed_dim, multi_head_attn) - if cache_dict is not None: - k = np.concatenate((cache_dict["k"], k), axis=2) - v = np.concatenate((cache_dict["v"], v), axis=2) - return (q, k, v, cache_dict) - - -def add(x, y=None): - fluid.enable_dygraph() - with fluid.dygraph.guard(): - x = x.numpy() if not isinstance(x, np.ndarray) else x - if y is not None: - x += y - return x - return x - # print("print type(x) in add", type(x)) - - -def relu(x): - compare = x > 0 - return x * compare - - -def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None): - fluid.enable_dygraph() - with fluid.dygraph.guard(): - # scale: - weight = norm.weight.numpy() - # shift: - bias = norm.bias.numpy() - - batch_size, src_len, d_model = x.shape - x = x.reshape((batch_size * src_len, d_model)) - mu = np.mean(x, axis=1, keepdims=True) - sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model - x1_up = (x - mu) - x1_down_1 = sigma_squar + epsilon - x1_down = np.sqrt(x1_down_1) - x1_down = x1_down.reshape((x1_down.shape[0], 1)) - x1 = x1_up / x1_down - x_scaled = weight * x1 - x_scaled_bias = x_scaled + bias - x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model)) - return x_scaled_bias - - -def ffn(src, encoder_layer, ffn_fc1_act="relu"): - assert ffn_fc1_act == "relu", "only relu is supported" - fluid.enable_dygraph() - with fluid.dygraph.guard(): - src = src.numpy() if not isinstance(src, np.ndarray) else src - w1 = encoder_layer.linear1.weight.numpy() - w2 = encoder_layer.linear2.weight.numpy() - # fc1 - x1 = fc(src, w1) - x1 = relu(x1) - # fc2 - x2 = fc(x1, w2) - return x2 - - 
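The `softmax`, `batch_matmul` and `scaled_dot_product_attention` helpers above compute the reference result with explicit Python loops. An equivalent vectorized numpy sketch (a cross-check only, not part of the test file) is:

.. code-block:: python

    import numpy as np

    def softmax_last_axis(x):
        e_x = np.exp(x - np.amax(x, axis=-1, keepdims=True))
        return e_x / np.sum(e_x, axis=-1, keepdims=True)

    def scaled_dot_product_attention_vec(q, k, v, d_key, attn_mask=None):
        # q, k, v: [batch, heads, length, head_dim]
        scores = np.matmul(q, k.transpose((0, 1, 3, 2))) / np.sqrt(d_key)
        if attn_mask is not None:
            scores = scores + attn_mask
        weights = softmax_last_axis(scores)
        out = np.matmul(weights, v)          # [batch, heads, length, head_dim]
        out = out.transpose((0, 2, 1, 3))    # [batch, length, heads, head_dim]
        return out.reshape(out.shape[0], out.shape[1], -1)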
-class TestTransformer(unittest.TestCase): - def test_multi_head_attention(self): - def multihead_attention_test_helper(self_attention, cache): - paddle.framework.manual_seed(2020) - # 分四种情况:self_attention|cross_attention, cache|No cache - with fluid.dygraph.guard(fluid.CPUPlace()): - for _ in range(100): - # generate params for multi_head_attention - batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params( - "attn", self_attention, False) - query, key, value, attn_mask, cache_dict = generate_query_key_value_cache( - self_attention, batch_size, num_heads, query_length, - embed_dim, key_length, value_length, kdim, vdim, cache) - need_weight, param_attr, bias_attr = False, None, None - # call paddle's function - multi_head_attn = MultiheadAttention( - embed_dim, num_heads, attn_dropout, kdim, vdim, - need_weight, param_attr, bias_attr) - # construct cache object - cache_obj = None - if cache_dict: - if 'k' and 'v' in cache_dict: - cache_obj = multi_head_attn.Cache( - paddle.to_variable(cache_dict['k']), - paddle.to_variable(cache_dict['v'])) - elif 'static_k' and 'static_v' in cache_dict: - cache_obj = multi_head_attn.StaticCache( - paddle.to_variable(cache_dict['static_k']), - paddle.to_variable(cache_dict['static_v'])) - - attn_output = multi_head_attn( - paddle.to_variable(query), - paddle.to_variable(key), - paddle.to_variable(value), attn_mask, cache_obj) - - # implementation by numpy - # compute q, k, v - q, k, v, _ = prepare_qkv(query, key, value, num_heads, - embed_dim, self_attention, - multi_head_attn, cache_dict) - # scale dot product attention - attn_heads = scaled_dot_product_attention( - q, k, v, embed_dim // num_heads, attn_mask, - multi_head_attn) - out_proj_weight = multi_head_attn.out_proj.weight.numpy() - reference = fc(attn_heads, out_proj_weight) - - np.testing.assert_allclose( - attn_output.numpy(), reference, atol=1e-6) - - multihead_attention_test_helper(True, True) - multihead_attention_test_helper(True, False) - multihead_attention_test_helper(False, True) - multihead_attention_test_helper(False, False) - - def test_transformer_encoder_layer(self): - - with fluid.dygraph.guard(fluid.CPUPlace()): - paddle.framework.manual_seed(2020) - for _ in range(100): - # 0.定义默认参数 - ffn_fc1_act = "relu" - # 1.获取基本参数 - batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params( - mode="encoder_layer", verbose=False) - # 2.生成输入 - src = np.random.rand(batch_size, sequence_length, - d_model).astype("float32") - residual = src - # src_mask = np.zeros(batch_size, n_head, sequence_length, - # sequence_length).astype(dtype) - # src_mask [0][0][:30] = -np.inf - - # 3.框架的输出 - encoder_layer = TransformerEncoderLayer( - d_model, n_head, dim_feedforward, dropout, ffn_fc1_act, - attn_dropout, act_dropout) - - encoder_output = encoder_layer( - paddle.to_variable(src)) # paddle.to_variable(src_mask)) - # 4.numpy: - # paddle self attention - self_attn = MultiheadAttention( - d_model, n_head, dropout=attn_dropout) - attn_output = self_attn( - paddle.to_variable(src), - paddle.to_variable(src), paddle.to_variable(src)).numpy() - - src = attn_output + residual - src_norm = layer_norm(src, d_model, encoder_layer.norm1) - residual = src_norm - - ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act) - src = residual + ffn_output - src = layer_norm(src, d_model, encoder_layer.norm2) - - np.testing.assert_allclose( - encoder_output.numpy(), src, rtol=1e-5, atol=1e-6) - - def 
test_transformer_decoder_layer(self): - with fluid.dygraph.guard(fluid.CPUPlace()): - paddle.framework.manual_seed(2020) - for _ in range(100): - activation = "relu" - normalize_before = False - batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params( - mode="decoder_layer", verbose=False) - tgt = np.random.rand(batch_size, target_length, - d_model).astype("float32") - memory = np.random.rand(batch_size, source_length, - d_model).astype("float32") - tgt_mask = None # TODO - memory_mask = None - # paddle: - decoder_layer = TransformerDecoderLayer( - d_model, n_head, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before) - decoder_output = decoder_layer( - paddle.to_variable(tgt), - paddle.to_variable(memory), tgt_mask, memory_mask).numpy() - # TODO: cache - residual = tgt - self_attn = MultiheadAttention( - d_model, n_head, dropout=attn_dropout) - - tgt = self_attn( - paddle.to_variable(tgt), - paddle.to_variable(tgt), - paddle.to_variable(tgt), tgt_mask, None).numpy() - - tgt = residual + tgt - # postprocess - tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm1) - residual = tgt_norm - - cross_attn = MultiheadAttention( - d_model, n_head, dropout=attn_dropout) - tgt = cross_attn( - paddle.to_variable(tgt_norm), - paddle.to_variable(memory), - paddle.to_variable(memory), memory_mask, None).numpy() - # postprocess - tgt = tgt + residual - tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm2) - residual = tgt_norm - - ffn_output = ffn(tgt_norm, decoder_layer, activation) - # post process - tgt = residual + ffn_output - tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3) - - np.testing.assert_allclose( - decoder_output, tgt_norm, rtol=1e-5, atol=1e-6) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py new file mode 100644 index 00000000000000..6fb374ff2c48f4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -0,0 +1,477 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.nn.layer.transformer import MultiheadAttention, TransformerEncoderLayer, TransformerDecoderLayer, TransformerEncoder, TransformerDecoder, Transformer + +import unittest + + +def generate_basic_params(mode="attn", self_attention=True): + batch_size, query_length = [np.random.randint(2, 10) for _ in range(2)] + d_head, num_heads = [np.random.randint(3, 10) for _ in range(2)] + attn_dropout = 0.0 + embed_dim = d_head * num_heads + if mode == "attn": + if self_attention: + kdim, vdim = embed_dim, embed_dim + key_length, value_length = query_length, query_length + else: + kdim, vdim = [np.random.randint(5, 20) for _ in range(2)] + key_length = np.random.randint(2, 10) + value_length = key_length + return batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout + + else: + dropout, act_dropout = 0.0, 0.0 + dim_feedforward = np.random.randint(128, 1024) + sequence_length = np.random.randint(2, 10) + if mode == "encoder_layer": + return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length + elif mode == "decoder_layer": + target_length = np.random.randint(2, 10) + return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length, target_length + + +def generate_query_key_value_cache(self_attention, + batch_size, + num_heads, + query_length, + embed_dim, + key_length=None, + value_length=None, + kdim=None, + vdim=None, + cache=None): + query = np.random.rand(batch_size, query_length, + embed_dim).astype("float32") + attn_mask = np.zeros((batch_size, num_heads, query_length, key_length)) + attn_mask[0][0][0][0] = -1e9 + + head_dim = embed_dim // num_heads + if self_attention: + key, value = query, query + else: + key = np.random.rand(batch_size, key_length, kdim).astype("float32") + value = np.random.rand(batch_size, value_length, vdim).astype("float32") + cache_dict = {} + if cache: + if not self_attention: + cache_dict["static_k"] = np.random.rand( + batch_size, num_heads, key_length, head_dim).astype("float32") + cache_dict["static_v"] = np.random.rand( + batch_size, num_heads, value_length, head_dim).astype("float32") + else: + cache_dict["k"] = np.random.rand(batch_size, num_heads, key_length, + head_dim).astype("float32") + cache_dict["v"] = np.random.rand( + batch_size, num_heads, value_length, head_dim).astype("float32") + else: + cache_dict = None + return query, key, value, attn_mask, cache_dict + + +def fc(x, weight): + return np.matmul(x, weight) + + +def softmax(x): + np.seterr(invalid='ignore') + output = np.zeros(x.shape, dtype=np.float64) + for i in range(x.shape[0]): + for j in range(x.shape[1]): + for k in range(x.shape[2]): + x_curr = x[i, j, k, :] + e_x = np.exp(x_curr - np.amax(x_curr)) + output[i, j, k, :] = e_x / np.sum(e_x) + return output + + +def batch_matmul(x, y): + assert x.shape[0] == y.shape[0] + assert x.shape[1] == y.shape[1] + retval = np.zeros( + (x.shape[0], x.shape[1], x.shape[2], y.shape[3]), dtype=np.float64) + for i in range(x.shape[0]): + for j in range(x.shape[1]): + retval[i, j, :, :] = np.matmul(x[i, j, :, :], y[i, j, :, :]) + return retval + + +def scaled_dot_product_attention(q, k, v, d_key, attn_mask, multi_head_attn): + k = k.transpose([0, 1, 3, 2]) + qkt = batch_matmul(q, k / np.sqrt(d_key, dtype=np.float64)) + if attn_mask is not None: + qkt += attn_mask + weight = softmax(qkt) + attn_heads = batch_matmul(weight, v) + attn_heads = 
attn_heads.transpose((0, 2, 1, 3)) + attn_heads = attn_heads.reshape((attn_heads.shape[0], attn_heads.shape[1], + attn_heads.shape[2] * attn_heads.shape[3])) + return attn_heads + + +def cal_qkv(key, value, num_heads, embed_dim, multi_head_attn): + with fluid.dygraph.guard(): + head_dim = embed_dim // num_heads + k_weight = multi_head_attn.k_proj.weight.numpy() + v_weight = multi_head_attn.v_proj.weight.numpy() + k = fc(key, k_weight) + v = fc(value, v_weight) + k = k.reshape((k.shape[0], k.shape[1], num_heads, head_dim)) + k = k.transpose((0, 2, 1, 3)) + v = v.reshape((v.shape[0], v.shape[1], num_heads, head_dim)) + v = v.transpose((0, 2, 1, 3)) + return k, v + + +def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention, + multi_head_attn, cache_dict): + q_weight = multi_head_attn.q_proj.weight.numpy() + q = fc(query, q_weight) + q = q.reshape((q.shape[0], q.shape[1], num_heads, embed_dim // num_heads)) + q = q.transpose((0, 2, 1, 3)) + + if not self_attention and cache_dict: + k, v = cache_dict["static_k"], cache_dict["static_v"] + else: + k, v = cal_qkv(key, value, num_heads, embed_dim, multi_head_attn) + if cache_dict is not None: + k = np.concatenate((cache_dict["k"], k), axis=2) + v = np.concatenate((cache_dict["v"], v), axis=2) + return (q, k, v, cache_dict) + + +def add(x, y=None): + fluid.enable_dygraph() + with fluid.dygraph.guard(): + x = x.numpy() if not isinstance(x, np.ndarray) else x + if y is not None: + x += y + return x + return x + + +def relu(x): + compare = x > 0 + return x * compare + + +def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None): + fluid.enable_dygraph() + with fluid.dygraph.guard(): + # scale: + weight = norm.weight.numpy() + # shift: + bias = norm.bias.numpy() + + batch_size, src_len, d_model = x.shape + x = x.reshape((batch_size * src_len, d_model)) + mu = np.mean(x, axis=1, keepdims=True) + sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model + x1_up = (x - mu) + x1_down_1 = sigma_squar + epsilon + x1_down = np.sqrt(x1_down_1) + x1_down = x1_down.reshape((x1_down.shape[0], 1)) + x1 = x1_up / x1_down + x_scaled = weight * x1 + x_scaled_bias = x_scaled + bias + x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model)) + return x_scaled_bias + + +def ffn(src, encoder_layer, ffn_fc1_act="relu"): + assert ffn_fc1_act == "relu", "only relu is supported" + fluid.enable_dygraph() + with fluid.dygraph.guard(): + src = src.numpy() if not isinstance(src, np.ndarray) else src + w1 = encoder_layer.linear1.weight.numpy() + w2 = encoder_layer.linear2.weight.numpy() + # fc1 + x1 = fc(src, w1) + x1 = relu(x1) + # fc2 + x2 = fc(x1, w2) + return x2 + + +class TestTransformer(unittest.TestCase): + def test_multi_head_attention(self): + def multihead_attention_test_helper(self_attention, cache): + paddle.framework.manual_seed(2020) + # 分四种情况:self_attention|cross_attention, cache|No cache + with fluid.dygraph.guard(fluid.CPUPlace()): + + # generate params for multi_head_attention + batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params( + "attn", self_attention) + query, key, value, attn_mask, cache_dict = generate_query_key_value_cache( + self_attention, batch_size, num_heads, query_length, + embed_dim, key_length, value_length, kdim, vdim, cache) + if cache and self_attention: + attn_mask = np.concatenate((attn_mask, attn_mask), axis=3) + need_weight, param_attr, bias_attr = False, None, None + # call paddle's function + multi_head_attn = MultiheadAttention( + 
embed_dim, num_heads, attn_dropout, kdim, vdim, need_weight, + param_attr, bias_attr) + # construct cache object + cache_obj = None + if cache_dict: + if 'k' and 'v' in cache_dict: + cache_obj = multi_head_attn.Cache( + paddle.to_variable(cache_dict['k']), + paddle.to_variable(cache_dict['v'])) + elif 'static_k' and 'static_v' in cache_dict: + cache_obj = multi_head_attn.StaticCache( + paddle.to_variable(cache_dict['static_k']), + paddle.to_variable(cache_dict['static_v'])) + if attn_mask is not None: + attn_output = multi_head_attn( + paddle.to_variable(query), + paddle.to_variable(key), + paddle.to_variable(value), + paddle.to_variable(attn_mask), cache_obj) + else: + attn_output = multi_head_attn( + paddle.to_variable(query), + paddle.to_variable(key), + paddle.to_variable(value), attn_mask, cache_obj) + attn_output = attn_output[0] if cache_dict else attn_output + + # implementation by numpy + # compute q, k, v + q, k, v, _ = prepare_qkv(query, key, value, num_heads, + embed_dim, self_attention, + multi_head_attn, cache_dict) + # scale dot product attention + attn_heads = scaled_dot_product_attention( + q, k, v, embed_dim // num_heads, attn_mask, multi_head_attn) + out_proj_weight = multi_head_attn.out_proj.weight.numpy() + reference = fc(attn_heads, out_proj_weight) + + np.testing.assert_allclose( + attn_output.numpy(), reference, atol=1e-6) + + multihead_attention_test_helper(True, True) + multihead_attention_test_helper(True, False) + multihead_attention_test_helper(False, True) + multihead_attention_test_helper(False, False) + + def test_transformer_encoder_layer(self): + + with fluid.dygraph.guard(fluid.CPUPlace()): + paddle.framework.manual_seed(2020) + + ffn_fc1_act = "relu" + # 1.generate basic params + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params( + mode="encoder_layer") + # 2.generate input for encoder + src = np.random.rand(batch_size, sequence_length, + d_model).astype("float32") + residual = src + src_mask = np.zeros((batch_size, n_head, sequence_length, + sequence_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + + # paddle + encoder_layer = TransformerEncoderLayer( + d_model, n_head, dim_feedforward, dropout, ffn_fc1_act, + attn_dropout, act_dropout) + + encoder_output = encoder_layer( + paddle.to_variable(src), + paddle.to_variable(src_mask)) # paddle.to_variable(src_mask)) + # 4.numpy: + # paddle self attention + self_attn = MultiheadAttention( + d_model, n_head, dropout=attn_dropout) + attn_output = self_attn( + paddle.to_variable(src), + paddle.to_variable(src), + paddle.to_variable(src), paddle.to_variable(src_mask)).numpy() + + src = attn_output + residual + src_norm = layer_norm(src, d_model, encoder_layer.norm1) + residual = src_norm + + ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act) + src = residual + ffn_output + src = layer_norm(src, d_model, encoder_layer.norm2) + + np.testing.assert_allclose( + encoder_output.numpy(), src, rtol=1e-5, atol=1e-6) + + def test_transformer_decoder_layer(self): + with fluid.dygraph.guard(fluid.CPUPlace()): + paddle.framework.manual_seed(2020) + activation = "relu" + normalize_before = False + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params( + mode="decoder_layer") + tgt = np.random.rand(batch_size, target_length, + d_model).astype("float32") + memory = np.random.rand(batch_size, source_length, + d_model).astype("float32") + tgt_mask = 
np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + for cache in [True, False]: + self_attn = MultiheadAttention( + d_model, n_head, dropout=attn_dropout) + cross_attn = MultiheadAttention( + d_model, n_head, dropout=attn_dropout) + + # paddle decoderlayer: + decoder_layer = TransformerDecoderLayer( + d_model, n_head, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before) + cache_objs = None + if cache: + cache_objs = decoder_layer.gen_cache( + paddle.to_variable(memory)) + + decoder_output = decoder_layer( + paddle.to_variable(tgt), + paddle.to_variable(memory), + paddle.to_variable(tgt_mask), + paddle.to_variable(memory_mask), cache_objs) + + decoder_output = decoder_output[0].numpy( + ) if cache else decoder_output.numpy() + + # numpy: + residual = tgt + # self-attn + self_attn_cache = cache_objs[ + 0] if cache_objs is not None else None + tgt = self_attn( + paddle.to_variable(tgt), + paddle.to_variable(tgt), + paddle.to_variable(tgt), + paddle.to_variable(tgt_mask), self_attn_cache) + + tgt = tgt[0].numpy() if cache else tgt.numpy() + + tgt = residual + tgt + # postprocess + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm1) + residual = tgt_norm + # cross-attn + cross_attn_cache = cache_objs[ + 1] if cache_objs is not None else None + tgt = cross_attn( + paddle.to_variable(tgt_norm), + paddle.to_variable(memory), + paddle.to_variable(memory), + paddle.to_variable(memory_mask), cross_attn_cache) + tgt = tgt[0].numpy() if cache else tgt.numpy() + + # postprocess + tgt = tgt + residual + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm2) + residual = tgt_norm + # FFN + ffn_output = ffn(tgt_norm, decoder_layer, activation) + # post process + tgt = residual + ffn_output + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3) + + np.testing.assert_allclose( + decoder_output, tgt_norm, rtol=1e-5, atol=1e-6) + + def test_encoder(self): + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params( + mode="encoder_layer") + + src = np.random.rand(batch_size, sequence_length, + d_model).astype("float32") + + src_mask = np.zeros((batch_size, n_head, sequence_length, + sequence_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + with fluid.dygraph.guard(fluid.CPUPlace()): + encoder_layer = TransformerEncoderLayer(d_model, n_head, + dim_feedforward, dropout) + num_layers = 6 + encoder = TransformerEncoder(encoder_layer, num_layers) + # src, src_mask + enc_output = encoder( + paddle.to_variable(src), paddle.to_variable(src_mask)) + + def test_decoder(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + tgt = np.random.rand(batch_size, target_length, + d_model).astype("float32") + memory = np.random.rand(batch_size, source_length, + d_model).astype("float32") + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + with fluid.dygraph.guard(fluid.CPUPlace()): + decoder_layer = TransformerDecoderLayer(d_model, n_head, + dim_feedforward, dropout) + num_layers = 6 + decoder = TransformerDecoder(decoder_layer, 
num_layers) + + output = decoder( + paddle.to_variable(tgt), + paddle.to_variable(memory), + paddle.to_variable(tgt_mask), paddle.to_variable(memory_mask)) + + def test_transformer(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + + # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8 + with fluid.dygraph.guard(fluid.CPUPlace()): + transformer = Transformer( + d_model, + n_head, + dim_feedforward=dim_feedforward, + dropout=dropout) + src = paddle.to_variable( + np.random.rand(batch_size, source_length, d_model).astype( + "float32")) + tgt = paddle.to_variable( + np.random.rand(batch_size, target_length, d_model).astype( + "float32")) + src_mask = np.zeros((batch_size, n_head, source_length, + source_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + src_mask = paddle.to_variable(src_mask) + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + tgt_mask, memory_mask = paddle.to_variable( + tgt_mask), paddle.to_variable(memory_mask) + trans_output = transformer(src, tgt, src_mask, tgt_mask, + memory_mask) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index bbb638b62c7be4..12b46f828d2624 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -384,7 +384,7 @@ def forward(self, query, key, value, attn_mask=None, cache=None): outs.append(weights) if cache is not None: outs.append(cache) - return out if len(outs) else tuple(outs) + return out if len(outs) == 1 else tuple(outs) class TransformerEncoderLayer(Layer): @@ -455,6 +455,7 @@ def __init__(self, bias_attr=None): self._config = locals() self._config.pop("self") + self._config.pop("__class__", None) super(TransformerEncoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout @@ -561,7 +562,7 @@ class TransformerEncoder(Layer): def __init__(self, encoder_layer, num_layers, norm=None): super(TransformerEncoder, self).__init__() self.layers = LayerList([(encoder_layer if i == 0 else - type(encoder_layer)(encoder_layer._config)) + type(encoder_layer)(**encoder_layer._config)) for i in range(num_layers)]) self.num_layers = num_layers self.norm = norm @@ -680,7 +681,7 @@ def __init__(self, bias_attr=None): self._config = locals() self._config.pop("self") - + self._config.pop("__class__", None) super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout @@ -867,7 +868,7 @@ class TransformerDecoder(Layer): def __init__(self, decoder_layer, num_layers, norm=None): super(TransformerDecoder, self).__init__() self.layers = LayerList([(decoder_layer if i == 0 else - type(decoder_layer)(decoder_layer._config)) + type(decoder_layer)(**decoder_layer._config)) for i in range(num_layers)]) self.num_layers = num_layers self.norm = norm @@ -1117,7 +1118,7 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): Variable: It is a tensor that has the same shape and data type \ as `tgt`, representing the output of Transformer decoder. 
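`Transformer.forward`, fixed in the hunk that follows to pass `src_mask` by its new keyword, simply composes the encoder and decoder. A hedged usage sketch of that composition, assuming dygraph mode and mask-free calls (the decoder defaults its masks to None, and the encoder is assumed to do the same), is:

.. code-block:: python

    import paddle
    from paddle.nn.layer.transformer import Transformer

    paddle.disable_static()  # dygraph mode, as in the other examples here
    transformer = Transformer(128, 2, 4, 4, 512)
    src = paddle.rand((2, 4, 128))   # [batch_size, source_length, d_model]
    tgt = paddle.rand((2, 6, 128))   # [batch_size, target_length, d_model]

    # what forward() composes: encode once, then decode against the memory
    memory = transformer.encoder(src)          # [2, 4, 128]
    output = transformer.decoder(tgt, memory)  # [2, 6, 128]

Running the two halves separately like this is also how inference reuses the encoder output across decoding steps.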
""" - memory = self.encoder(src, mask=src_mask) + memory = self.encoder(src, src_mask=src_mask) output = self.decoder( tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask) return output From 5578a0dd99b8c3fdb8581daa4f535a6f847d1f45 Mon Sep 17 00:00:00 2001 From: LiuChiaChi <709153940@qq.com> Date: Thu, 20 Aug 2020 15:24:40 +0000 Subject: [PATCH 10/17] clean conflicts infor in code --- python/paddle/nn/layer/transformer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 9579753a385b20..72e19c010c7d73 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -456,11 +456,8 @@ def __init__(self, bias_attr=None): self._config = locals() self._config.pop("self") -<<<<<<< HEAD self._config.pop("__class__", None) -======= self._config.pop("__class__", None) # py3 ->>>>>>> 8637eeeb0430f6f5ede3a2668212a0b2b5733a06 super(TransformerEncoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout From a7a000395ec45e1767058dfaf2720c646c455e58 Mon Sep 17 00:00:00 2001 From: LiuChiaChi <709153940@qq.com> Date: Thu, 20 Aug 2020 15:29:05 +0000 Subject: [PATCH 11/17] clean Chinese comments --- python/paddle/fluid/tests/unittests/test_transformer_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py index 6fb374ff2c48f4..8384a346c6375e 100644 --- a/python/paddle/fluid/tests/unittests/test_transformer_api.py +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -212,7 +212,7 @@ class TestTransformer(unittest.TestCase): def test_multi_head_attention(self): def multihead_attention_test_helper(self_attention, cache): paddle.framework.manual_seed(2020) - # 分四种情况:self_attention|cross_attention, cache|No cache + # self_attention|cross_attention, cache|No cache with fluid.dygraph.guard(fluid.CPUPlace()): # generate params for multi_head_attention From d2c25dca45c7275bad56ac25facc572fe02e75cc Mon Sep 17 00:00:00 2001 From: guosheng Date: Fri, 21 Aug 2020 11:53:16 +0800 Subject: [PATCH 12/17] Add TransformerDecoderCell and TransformerBeamSearchDecoder. test=develop --- python/paddle/nn/layer/transformer.py | 98 +++++++++++---------------- 1 file changed, 39 insertions(+), 59 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 72e19c010c7d73..5a225b46fae0b9 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -96,7 +96,7 @@ class MultiheadAttention(Layer): query = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, num_heads, query_len, query_len] attn_mask = paddle.rand((2, 2, 4, 4)) - multi_head_attn = paddle.MultiheadAttention(64, 64, 128, n_head=2) + multi_head_attn = paddle.MultiheadAttention(128, 2) output = multi_head_attn(query, attn_mask=attn_mask) # [2, 4, 128] """ @@ -231,7 +231,7 @@ def cal_kv(self, key, value): v = layers.transpose(x=v, perm=[0, 2, 1, 3]) return k, v - def gen_cache(self, key, value=None, type=MultiheadAttention.Cache): + def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. 
The generated cache is an instance of `MultiheadAttention.Cache` or an @@ -456,7 +456,6 @@ def __init__(self, bias_attr=None): self._config = locals() self._config.pop("self") - self._config.pop("__class__", None) self._config.pop("__class__", None) # py3 super(TransformerEncoderLayer, self).__init__() @@ -940,25 +939,32 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): return output if cache is None else (output, new_caches) - def gen_cache(self, memory): + def gen_cache(self, memory, do_zip=False): """ Generates cache for `forward` usage. The generated cache is a list, and each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` - for more details. + for more details. If `do_zip` is True, apply `zip` on these tuples to get + a list with two elements. Parameters: memory (Variable): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. + do_zip (bool, optional): Indicate whether to apply `zip` on the tuples. + If True, return a list with two elements. Default False Returns: list: It is a list, and each element in the list is a tuple produced \ by `TransformerDecoderLayer.gen_cache(memory)`. See `TransformerDecoderLayer.gen_cache` \ - for more details. + for more details. If `do_zip` is True, apply `zip` on these tuples \ + and return a list with two elements. """ - return [layer.gen_cache(memory) for layer in self.layers] + cache = [layer.gen_cache(memory) for layer in self.layers] + if do_zip: + cache = list(zip(*cache)) + return cache class Transformer(Layer): @@ -1152,7 +1158,6 @@ class TransformerDecoderCell(Layer): .. code-block:: python import paddle - import paddle.fluid as fluid from paddle.fluid.dygraph import Embedding, Linear from paddle.incubate.hapi.text import TransformerDecoder from paddle.incubate.hapi.text import TransformerCell @@ -1209,11 +1214,11 @@ def __init__(self, decoder, embedding_fn=None, output_fn=None): def forward(self, inputs, - cache=None, - memory=None, + cache, + static_cache, + memory, tgt_mask=None, - memory_mask=None, - static_cache=[]): + memory_mask=None): """ Produces logits from `inputs` composed by ids and positions. @@ -1222,10 +1227,14 @@ def forward(self, tensors both have int64 data type and with 2D shape `[batch_size, sequence_length]` where `sequence_length` is 1 for inference. - cache(list): It caches the multi-head attention intermediate results - of history decoding steps. It is a list of dict where the length - of list is decoder layer number, and each dict has `k` and `v` as - keys and values are cached results. Default None + cache(list): It is a list and each element of the list is an instance + of `MultiheadAttention.Cache` for corresponding decoder layer. It + can be produced by `TransformerDecoder.gen_cache`, thus see + `TransformerDecoderLayer.gen_cache` for more details. + static_cache(list): It is a list and each element of the + list is an instance of `MultiheadAttention.StaticCache` for corresponding + decoder layer. It can be produced by `TransformerDecoder.gen_cache`, + thus see `TransformerDecoderLayer.gen_cache` for more details. memory (Variable): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. 
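What `do_zip=True` does can be shown with plain Python tuples: the per-layer `(incremental_cache, static_cache)` pairs are transposed into one sequence of incremental caches and one of static caches, which is the layout the `cache` and `static_cache` arguments described here expect.

.. code-block:: python

    # plain-Python illustration of the do_zip transpose (no Paddle objects)
    per_layer = [("inc_0", "static_0"), ("inc_1", "static_1"), ("inc_2", "static_2")]
    cache, static_cache = list(zip(*per_layer))
    assert cache == ("inc_0", "inc_1", "inc_2")
    assert static_cache == ("static_0", "static_1", "static_2")

Inside `TransformerDecoderCell.forward` the pairs are re-formed per layer with `list(zip(cache, static_cache))`, as the rewritten body further down shows.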
@@ -1236,7 +1245,8 @@ def forward(self, where the unwanted positions have `-INF` values and the others have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. - Default None + It can be None for inference since there is no subsequent in + auto-regression decoding. Default None memory_mask (Variable, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, usually the paddings. It is a tensor with shape broadcasted to @@ -1244,62 +1254,32 @@ def forward(self, unwanted positions have `-INF` values and the others have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None - static_cache(list): It stores transformed results of encoder output - to be used as keys and values in decoder-encoder cross attention - It is a list of dict where the length of list is decoder layer - number, and each dict has `static_k` and `static_v` as keys and - values are stored results. Default empty list + Returns: tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \ is a float32 or float64 3D tensor representing logits shaped \ `[batch_size, sequence_length, vocab_size]`. `new_states has \ the same structure and data type with `states` while the length \ - is one larger since the intermediate results of current step are \ - concatenated into it. + is one larger since concatanating the intermediate results of \ + current step. """ - trg_word, trg_pos = inputs + tgt_word, tgt_pos = inputs if cache and static_cache: - for cache, static_cache in zip(states, static_caches): - cache.update(static_cache) + states = list(zip(cache, static_cache)) if self.embedding_fn is not None: - dec_input = self.embedding_fn(trg_word, trg_pos) - outputs = self.decoder(dec_input, memory, tgt_mask, memory_mask, - states) + tgt = self.embedding_fn(tgt_word, tgt_pos) + outputs, new_states = self.decoder(tgt, memory, tgt_mask, + memory_mask, states) else: - outputs = self.decoder(trg_word, trg_pos, memory, tgt_mask, - memory_mask, states) + outputs, new_states = self.decoder(tgt_word, tgt_pos, memory, + tgt_mask, memory_mask, states) if self.output_fn is not None: outputs = self.output_fn(outputs) - new_states = [{ - "k": cache["k"], - "v": cache["v"] - } for cache in states] if states else states + new_states = [cache[0] for cache in new_states] return outputs, new_states - @property - def state_shape(self): - """ - States of TransformerCell cache the multi-head attention intermediate - results of history decoding steps, and have a increasing length as - decoding continued. - - `state_shape` of TransformerCell is used to initialize states. It is a - list of dict where the length of list is decoder layer, and each dict - has `k` and `v` as keys and values are `[n_head, 0, d_key]`, `[n_head, 0, d_value]` - separately. (-1 for batch size would be automatically inserted into shape). - - Returns: - list: It is a list of dict where the length of list is decoder layer \ - number, and each dict has `k` and `v` as keys and values are cached \ - results. 
- """ - return [{ - "k": [self.decoder.n_head, 0, self.decoder.d_key], - "v": [self.decoder.n_head, 0, self.decoder.d_value], - } for i in range(self.decoder.n_layer)] - class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): """ @@ -1310,7 +1290,7 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): `BeamSearchDecoder` to make beam search adapt to Transformer decoder. Parameters: - cell(TransformerCell): An instance of `TransformerCell`. + cell(TransformerDecoderCell): An instance of `TransformerDecoderCell`. start_token(int): The start token id. end_token(int): The end token id. beam_size(int): The beam width used in beam search. From c768e6e90fa549e26e30369f596eab09471417f2 Mon Sep 17 00:00:00 2001 From: guosheng Date: Fri, 21 Aug 2020 11:54:41 +0800 Subject: [PATCH 13/17] Remove TransformerDecoderCell and TransformerBeamSearchDecoder temporarily. test=develop --- python/paddle/nn/layer/transformer.py | 335 -------------------------- 1 file changed, 335 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 5a225b46fae0b9..04a96163e8eec7 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -1131,338 +1131,3 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): output = self.decoder( tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask) return output - - -class TransformerDecoderCell(Layer): - """ - TransformerDecoderCell wraps a Transformer decoder combined with an embedding - layer and output layer to produce logits from symbols (ids and position here). - It is analogy to `RNNCell` and `outputs, new_states = cell(inputs, states, *kwargs)`, - where `inputs` is composed of word ids and position, `states` is `cache`, - `kwargs` includes `memory, `tgt_mask`, `memory_mask` and `static_cache`. - - Parameters: - decoder(callable): A TransformerDecoder instance. Or a wrapper of it that - includes a embedding layer accepting ids and positions instead of embeddings - and includes a output layer transforming decoder output features to logits. - embedding_fn(callable, optional): A callable that accepts ids and position - as arguments and return embeddings as input of `decoder`. It can be - None if `decoder` includes a embedding layer. Default None. - output_fn(callable, optional): A callable applid on `decoder` output to - transform decoder output features to get logits. Mostly it is a Linear - layer with vocabulary size. It can be None if `decoder` includes a - output layer. Default None. - - Examples: - - .. 
code-block:: python - - import paddle - from paddle.fluid.dygraph import Embedding, Linear - from paddle.incubate.hapi.text import TransformerDecoder - from paddle.incubate.hapi.text import TransformerCell - from paddle.incubate.hapi.text import TransformerBeamSearchDecoder - from paddle.incubate.hapi.text import DynamicDecode - - paddle.disable_static() - - class Embedder(fluid.dygraph.Layer): - def __init__(self): - super(Embedder, self).__init__() - self.word_embedder = Embedding(size=[1000, 128]) - self.pos_embedder = Embedding(size=[500, 128]) - - def forward(self, word, position): - return self.word_embedder(word) + self.pos_embedder(position) - - embedder = Embedder() - output_layer = Linear(128, 1000) - decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) - transformer_cell = TransformerCell(decoder, embedder, output_layer) - dynamic_decoder = DynamicDecode( - TransformerBeamSearchDecoder( - transformer_cell, - start_token=0, - end_token=1, - beam_size=4, - var_dim_in_state=2), - max_step_num=10, - is_test=True) - - enc_output = paddle.rand((2, 4, 128)) - # cross attention bias: [batch_size, n_head, tgt_len, src_len] - trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) - # inputs for beam search on Transformer - caches = transformer_cell.get_initial_states(enc_output) - enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - enc_output, beam_size=4) - trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, beam_size=4) - static_caches = decoder.prepare_static_cache(enc_output) - outputs = dynamic_decoder( - inits=caches, - enc_output=enc_output, - trg_src_attn_bias=trg_src_attn_bias, - static_caches=static_caches) - """ - - def __init__(self, decoder, embedding_fn=None, output_fn=None): - super(TransformerDecoderCell, self).__init__() - self.decoder = decoder - self.embedding_fn = embedding_fn - self.output_fn = output_fn - - def forward(self, - inputs, - cache, - static_cache, - memory, - tgt_mask=None, - memory_mask=None): - """ - Produces logits from `inputs` composed by ids and positions. - - Parameters: - inputs(tuple): A tuple includes target ids and positions. The two - tensors both have int64 data type and with 2D shape - `[batch_size, sequence_length]` where `sequence_length` is 1 - for inference. - cache(list): It is a list and each element of the list is an instance - of `MultiheadAttention.Cache` for corresponding decoder layer. It - can be produced by `TransformerDecoder.gen_cache`, thus see - `TransformerDecoderLayer.gen_cache` for more details. - static_cache(list): It is a list and each element of the - list is an instance of `MultiheadAttention.StaticCache` for corresponding - decoder layer. It can be produced by `TransformerDecoder.gen_cache`, - thus see `TransformerDecoderLayer.gen_cache` for more details. - memory (Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, source_length, d_model]`. The data type - should be float32 or float64. - tgt_mask (Variable, optional): A tensor used in self attention - to prevents attention to some unwanted positions, usually the - the subsequent positions. It is a tensor with shape broadcasted - to `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be prevented attention to. - It can be None for inference since there is no subsequent in - auto-regression decoding. 
Default None - memory_mask (Variable, optional): A tensor used in decoder-encoder - cross attention to prevents attention to some unwanted positions, - usually the paddings. It is a tensor with shape broadcasted to - `[batch_size, n_head, target_length, source_length]`, where the - unwanted positions have `-INF` values and the others have 0 values. - The data type should be float32 or float64. It can be None when - nothing wanted or needed to be prevented attention to. Default None - - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \ - is a float32 or float64 3D tensor representing logits shaped \ - `[batch_size, sequence_length, vocab_size]`. `new_states has \ - the same structure and data type with `states` while the length \ - is one larger since concatanating the intermediate results of \ - current step. - """ - tgt_word, tgt_pos = inputs - if cache and static_cache: - states = list(zip(cache, static_cache)) - if self.embedding_fn is not None: - tgt = self.embedding_fn(tgt_word, tgt_pos) - outputs, new_states = self.decoder(tgt, memory, tgt_mask, - memory_mask, states) - else: - outputs, new_states = self.decoder(tgt_word, tgt_pos, memory, - tgt_mask, memory_mask, states) - if self.output_fn is not None: - outputs = self.output_fn(outputs) - - new_states = [cache[0] for cache in new_states] - return outputs, new_states - - -class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): - """ - Compared with a RNN step :code:`outputs, new_states = cell(inputs, states)`, - Transformer decoder's `inputs` uses 2D tensor shaped `[batch_size * beam_size, 1]` - and includes extra position data. And its `states` (caches) has increasing - length. These are not consistent with `BeamSearchDecoder`, thus subclass - `BeamSearchDecoder` to make beam search adapt to Transformer decoder. - - Parameters: - cell(TransformerDecoderCell): An instance of `TransformerDecoderCell`. - start_token(int): The start token id. - end_token(int): The end token id. - beam_size(int): The beam width used in beam search. - var_dim_in_state(int): Indicate which dimension of states is variant. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.fluid.dygraph import Embedding, Linear - from paddle.incubate.hapi.text import TransformerDecoder - from paddle.incubate.hapi.text import TransformerCell - from paddle.incubate.hapi.text import TransformerBeamSearchDecoder - from paddle.incubate.hapi.text import DynamicDecode - - paddle.disable_static() - - class Embedder(fluid.dygraph.Layer): - def __init__(self): - super(Embedder, self).__init__() - self.word_embedder = Embedding(size=[1000, 128]) - self.pos_embedder = Embedding(size=[500, 128]) - - def forward(self, word, position): - return self.word_embedder(word) + self.pos_embedder(position) - - embedder = Embedder() - output_layer = Linear(128, 1000) - decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) - transformer_cell = TransformerCell(decoder, embedder, output_layer) - dynamic_decoder = DynamicDecode( - TransformerBeamSearchDecoder( - transformer_cell, - start_token=0, - end_token=1, - beam_size=4, - var_dim_in_state=2), - max_step_num=10, - is_test=True) - - enc_output = paddle.rand((2, 4, 128)) - # cross attention bias: [batch_size, n_head, tgt_len, src_len] - trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) - # inputs for beam search on Transformer - caches = transformer_cell.get_initial_states(enc_output) - enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - enc_output, beam_size=4) - trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, beam_size=4) - static_caches = decoder.prepare_static_cache(enc_output) - outputs = dynamic_decoder( - inits=caches, - enc_output=enc_output, - trg_src_attn_bias=trg_src_attn_bias, - static_caches=static_caches) - """ - - def __init__(self, cell, start_token, end_token, beam_size, - var_dim_in_state): - super(TransformerBeamSearchDecoder, - self).__init__(cell, start_token, end_token, beam_size) - self.cell = cell - self.var_dim_in_state = var_dim_in_state - - def _merge_batch_beams_with_var_dim(self, x): - """ - Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new - tensor with shape `[batch_size * beam_size, ...]`. - - Parameters: - x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The - data type should be float32, float64, int32, int64 or bool. - - Returns: - Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \ - data type is same as `x`. - """ - # init length of cache is 0, and it increases with decoding carrying on, - # thus need to reshape elaborately - var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim - x = layers.transpose(x, - list(range(var_dim_in_state, len(x.shape))) + - list(range(0, var_dim_in_state))) - x = layers.reshape( - x, [0] * (len(x.shape) - var_dim_in_state - ) + [self.batch_size * self.beam_size] + - [int(size) for size in x.shape[-var_dim_in_state + 2:]]) - x = layers.transpose( - x, - list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) + - list(range(0, (len(x.shape) + 1 - var_dim_in_state)))) - return x - - def _split_batch_beams_with_var_dim(self, x): - """ - Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new - tensor with shape `[batch_size, beam_size, ...]`. - - Parameters: - x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The - data type should be float32, float64, int32, int64 or bool. - - Returns: - Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \ - data type is same as `x`. 
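The net effect of `_merge_batch_beams_with_var_dim` and `_split_batch_beams_with_var_dim` is to fold the beam axis into the batch axis and back. The numpy sketch below shows only that net reshape and ignores the transpose trick the real code needs because the cache length dimension is unknown at build time; it is illustrative only, since these classes are removed again by this patch:

.. code-block:: python

    import numpy as np

    batch_size, beam_size, n_head, seq_len, head_dim = 2, 4, 2, 5, 8
    cache_k = np.random.rand(batch_size, beam_size, n_head, seq_len, head_dim)

    # merge: [batch, beam, ...] -> [batch * beam, ...]
    merged = cache_k.reshape((batch_size * beam_size,) + cache_k.shape[2:])

    # split: [batch * beam, ...] -> [batch, beam, ...]
    split = merged.reshape((batch_size, beam_size) + merged.shape[1:])
    assert np.array_equal(split, cache_k)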
- """ - var_dim_size = layers.shape(x)[self.var_dim_in_state] - x = layers.reshape( - x, [-1, self.beam_size] + - [int(size) - for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + - [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) - return x - - def step(self, time, inputs, states, **kwargs): - """ - Perform a beam search decoding step, which uses `cell` to get probabilities, - and follows a beam search step to calculate scores and select candidate - token ids. - - Note: compared with `BeamSearchDecoder.step`, it feed 2D id tensor shaped - `[batch_size * beam_size, 1]` rather than `[batch_size * beam_size]` combined - position data as inputs to `cell`. - - Parameters: - time(Variable): An `int64` tensor with shape `[1]` provided by the caller, - representing the current time step number of decoding. - inputs(Variable): A tensor variable. It is same as `initial_inputs` - returned by `initialize()` for the first decoding step and - `next_inputs` returned by `step()` for the others. It is a int64 - id tensor with shape `[batch_size * beam_size]` - states(Variable): A structure of tensor variables. - It is same as the `initial_states` returned by `initialize()` for - the first decoding step and `beam_search_state` returned by - `step()` for the others. - **kwargs: Additional keyword arguments, provided by the caller. - - Returns: - tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \ - `beam_search_state` and `next_inputs` have the same structure, \ - shape and data type as the input arguments `states` and `inputs` separately. \ - `beam_search_output` is a namedtuple(including scores, predicted_ids, \ - parent_ids as fields) of tensor variables, where \ - `scores, predicted_ids, parent_ids` all has a tensor value shaped \ - `[batch_size, beam_size]` with data type `float32, int64, int64`. \ - `finished` is a `bool` tensor with shape `[batch_size, beam_size]`. - """ - # compared to RNN, Transformer has 3D data at every decoding step - inputs = layers.reshape(inputs, [-1, 1]) # token - pos = layers.ones_like(inputs) * time # pos - cell_states = map_structure(self._merge_batch_beams_with_var_dim, - states.cell_states) - - cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, - **kwargs) - - # squeeze to adapt to BeamSearchDecoder which use 2D logits - cell_outputs = map_structure( - lambda x: layers.squeeze(x, [1]) if len(x.shape) == 3 else x, - cell_outputs) - cell_outputs = map_structure(self._split_batch_beams, cell_outputs) - next_cell_states = map_structure(self._split_batch_beams_with_var_dim, - next_cell_states) - - beam_search_output, beam_search_state = self._beam_search_step( - time=time, - logits=cell_outputs, - next_cell_states=next_cell_states, - beam_state=states) - next_inputs, finished = (beam_search_output.predicted_ids, - beam_search_state.finished) - - return (beam_search_output, beam_search_state, next_inputs, finished) From 0d01c1601202be55ef00a77ab9ca5ea579e32bbc Mon Sep 17 00:00:00 2001 From: guosheng Date: Fri, 21 Aug 2020 13:31:04 +0800 Subject: [PATCH 14/17] Add import for Transformer apis. 
test=develop --- python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/transformer.py | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index f1069f0dfd1d3b..0cf8bb9b12651c 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -29,6 +29,7 @@ from .extension import * from .activation import * from .norm import * +from .transformer import * # from .activation import PReLU #DEFINE_ALIAS from .activation import ReLU #DEFINE_ALIAS from .activation import LeakyReLU #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 04a96163e8eec7..97409c30623ed7 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -13,7 +13,14 @@ # limitations under the License. # TODO: define the classes of Transformer neural network -# __all__ = [ ] +__all__ = [ + 'MultiheadAttention', + 'TransformerEncoderLayer', + 'TransformerEncoder', + 'TransformerDecoderLayer', + 'TransformerDecoder', + 'Transformer', +] import copy import collections @@ -211,12 +218,12 @@ def cal_kv(self, key, value): to construct cache for inference. Parameters: - key (Variable, optional): The keys for multi-head attention. It is - a tensor with shape `[batch_size, sequence_length, kdim]`. The - data type should be float32 or float64. - value (Variable, optional): The values for multi-head attention. It - is a tensor with shape `[batch_size, sequence_length, vdim]`. - The data type should be float32 or float64. + key (Variable): The keys for multi-head attention. It is a tensor + with shape `[batch_size, sequence_length, kdim]`. The data type + should be float32 or float64. + value (Variable): The values for multi-head attention. It is a tensor + with shape `[batch_size, sequence_length, vdim]`. The data type + should be float32 or float64. Returns: tuple: A tuple including transformed keys and values. Their shapes \ From 39a623c4ae130f9e65a5052f73836af79c5934e3 Mon Sep 17 00:00:00 2001 From: guosheng Date: Fri, 21 Aug 2020 15:14:40 +0800 Subject: [PATCH 15/17] Update usage of weight_attr and Tensor in Transformer api docs. test=develop --- python/paddle/nn/layer/transformer.py | 195 ++++++++++++-------------- 1 file changed, 88 insertions(+), 107 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 97409c30623ed7..c914d51f0930ff 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -86,12 +86,13 @@ class MultiheadAttention(Layer): `embed_dim`. Default None. need_weights (bool, optional): Indicate whether to return the attention weights. Default False. - param_attr(ParamAttr, optional): To specify the weight parameter property. + weight_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. - See usage for details in :ref:`api_fluid_ParamAttr` . + See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr, optional): To specify the bias parameter property. Default: None, which means the default bias parameter property is used. - See usage for details in :ref:`api_fluid_ParamAttr` . + If it is set to False, this layer will not have trainable bias parameter. + See usage for details in :code:`ParamAttr` . 
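As a hedged illustration of the `kdim`/`vdim` and attribute parameters documented above (not part of the patch; it assumes the `paddle.nn.MultiHeadAttention` export that a later commit in this series adds, and the shapes are arbitrary):

    .. code-block:: python

        import paddle
        import paddle.nn as nn

        # queries live in a 128-d space; keys/values come from a 64-d space
        query = paddle.rand((2, 4, 128))
        key = paddle.rand((2, 6, 64))
        value = paddle.rand((2, 6, 64))

        cross_attn = nn.MultiHeadAttention(
            embed_dim=128, num_heads=2, kdim=64, vdim=64)
        out = cross_attn(query, key, value)  # [2, 4, 128]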
Examples: @@ -117,7 +118,7 @@ def __init__(self, kdim=None, vdim=None, need_weights=False, - param_attr=None, + weight_attr=None, bias_attr=None): super(MultiheadAttention, self).__init__() self.embed_dim = embed_dim @@ -131,25 +132,13 @@ def __init__(self, assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.q_proj = Linear( - input_dim=embed_dim, - output_dim=embed_dim, - param_attr=param_attr, - bias_attr=bias_attr) + embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) self.k_proj = Linear( - input_dim=self.kdim, - output_dim=embed_dim, - param_attr=param_attr, - bias_attr=bias_attr) + self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) self.v_proj = Linear( - input_dim=self.vdim, - output_dim=embed_dim, - param_attr=param_attr, - bias_attr=bias_attr) + self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) self.out_proj = Linear( - input_dim=embed_dim, - output_dim=embed_dim, - param_attr=param_attr, - bias_attr=bias_attr) + embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) def _prepare_qkv(self, query, key, value, cache=None): """ @@ -158,14 +147,14 @@ def _prepare_qkv(self, query, key, value, cache=None): to reduce redundant calculations. Parameters: - query (Variable): The queries for multi-head attention. It is a + query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. - key (Variable): The keys for multi-head attention. It is + key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. - value (Variable): The values for multi-head attention. It + value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. @@ -218,10 +207,10 @@ def cal_kv(self, key, value): to construct cache for inference. Parameters: - key (Variable): The keys for multi-head attention. It is a tensor + key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, kdim]`. The data type should be float32 or float64. - value (Variable): The values for multi-head attention. It is a tensor + value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, vdim]`. The data type should be float32 or float64. @@ -273,11 +262,11 @@ def gen_cache(self, key, value=None, type=Cache): an instance of `Cache`. Parameters: - key (Variable): The keys for multi-head attention. It is + key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If `value` is None, it is only for batch size and data type reference. - value (Variable, optional): The values for multi-head attention. It + value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, `key` is only for batch size reference. Default None. @@ -312,18 +301,18 @@ def forward(self, query, key, value, attn_mask=None, cache=None): to outputs. Parameters: - query (Variable): The queries for multi-head attention. It is a + query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. 
The data type should be float32 or float64. - key (Variable, optional): The keys for multi-head attention. It is + key (Tensor, optional): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. Default None. - value (Variable, optional): The values for multi-head attention. It + value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. Default None. - attn_mask (Variable, optional): A tensor used in multi-head attention + attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, @@ -345,7 +334,7 @@ def forward(self, query, key, value, attn_mask=None, cache=None): Default None. Returns: - Variable|tuple: It is a tensor that has the same shape and data type \ + Tensor|tuple: It is a tensor that has the same shape and data type \ as `query`, representing attention output. Or a tuple if \ `need_weights` is True or `cache` is not None. If `need_weights` \ is True, except for attention output, the tuple also includes \ @@ -422,18 +411,20 @@ class TransformerEncoderLayer(Layer): normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Default False - param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. - If it is a tuple, `param_attr[0]` would be used as `param_attr` for - MHA, and `param_attr[1]` would be used as `param_attr` for linear in FFN. - Otherwise, MHA and FFN both use it as `param_attr` to create parameters. + weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for + MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. + Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. - See usage for details in :ref:`api_fluid_ParamAttr` . + See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. - Default: None, which means the default bias parameter property is used. - See usage for details in :ref:`api_fluid_ParamAttr` . + The `False` value means the corresponding layer would not have trainable + bias parameter. See usage for details in :code:`ParamAttr` . Default: None, + which means the default bias parameter property is used. 
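A hedged sketch of the tuple convention described above for `weight_attr`/`bias_attr` (element 0 configures the self-attention sub-layer, element 1 the two FFN linears); the initializer choices are arbitrary and the `paddle.ParamAttr`/`paddle.nn.initializer` paths are assumptions about the 2.0 API rather than part of the patch:

    .. code-block:: python

        import paddle
        import paddle.nn as nn

        mha_weight = paddle.ParamAttr(
            initializer=paddle.nn.initializer.XavierUniform())
        ffn_weight = paddle.ParamAttr(
            initializer=paddle.nn.initializer.Normal(std=0.02))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=128,
            nhead=2,
            dim_feedforward=512,
            weight_attr=(mha_weight, ffn_weight),
            bias_attr=False)  # False: no trainable bias in either sub-layer

        src = paddle.rand((2, 4, 128))
        out = encoder_layer(src)  # [2, 4, 128]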
+ Examples: @@ -459,7 +450,7 @@ def __init__(self, attn_dropout=None, act_dropout=None, normalize_before=False, - param_attr=None, + weight_attr=None, bias_attr=None): self._config = locals() self._config.pop("self") @@ -470,27 +461,21 @@ def __init__(self, act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before - param_attrs = _convert_param_attr_to_list(param_attr, 2) + weight_attrs = _convert_param_attr_to_list(weight_attr, 2) bias_attrs = _convert_param_attr_to_list(bias_attr, 2) self.self_attn = MultiheadAttention( d_model, nhead, dropout=attn_dropout, - param_attr=param_attrs[0], + weight_attr=weight_attrs[0], bias_attr=bias_attrs[0]) self.linear1 = Linear( - d_model, - dim_feedforward, - param_attr=param_attrs[1], - bias_attr=bias_attrs[1]) + d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1]) self.dropout = Dropout( act_dropout, dropout_implementation="upscale_in_train") self.linear2 = Linear( - dim_feedforward, - d_model, - param_attr=param_attrs[1], - bias_attr=bias_attrs[1]) + dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout( @@ -504,10 +489,10 @@ def forward(self, src, src_mask=None): Applies a Transformer encoder layer on the input. Parameters: - src (Variable): The input of Transformer encoder layer. It is + src (Tensor): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. - src_mask (Variable, optional): A tensor used in multi-head attention + src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, @@ -517,7 +502,7 @@ def forward(self, src, src_mask=None): Default None Returns: - Variable: The output of Transformer encoder layer. It is a tensor that \ + Tensor: The output of Transformer encoder layer. It is a tensor that \ has the same shape and data type as `enc_input`. """ residual = src @@ -582,10 +567,10 @@ def forward(self, src, src_mask=None): layer. Parameters: - src (Variable): The input of Transformer encoder. It is a tensor + src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. - src_mask (Variable, optional): A tensor used in multi-head attention + src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, @@ -595,7 +580,7 @@ def forward(self, src, src_mask=None): Default None Returns: - Variable: The output of Transformer encoder. It is a tensor that \ + Tensor: The output of Transformer encoder. It is a tensor that \ has the same shape and data type as `src`. """ output = src @@ -637,12 +622,12 @@ class TransformerDecoderLayer(Layer): normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Default False - param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. 
- If it is a tuple, `param_attr[0]` would be used as `param_attr` for - self attention, `param_attr[1]` would be used as `param_attr` for - cross attention, and `param_attr[2]` would be used as `param_attr` + weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for + self attention, `weight_attr[1]` would be used as `weight_attr` for + cross attention, and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. Otherwise, the three sub-layers all uses it as - `param_attr` to create parameters. Default: None, which means the + `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. @@ -650,9 +635,10 @@ class TransformerDecoderLayer(Layer): self attention, `bias_attr[1]` would be used as `bias_attr` for cross attention, and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. Otherwise, the three sub-layers all uses it as - `bias_attr` to create parameters. Default: None, which means the - default bias parameter property is used. See usage for details - in :ref:`api_fluid_ParamAttr` . + `bias_attr` to create parameters. The `False` value means the + corresponding layer would not have trainable bias parameter. See + usage for details in :code:`ParamAttr` . Default: None,which means + the default bias parameter property is used. Examples: @@ -685,7 +671,7 @@ def __init__(self, attn_dropout=None, act_dropout=None, normalize_before=False, - param_attr=None, + weight_attr=None, bias_attr=None): self._config = locals() self._config.pop("self") @@ -696,33 +682,27 @@ def __init__(self, act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before - param_attrs = _convert_param_attr_to_list(param_attr, 3) + weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) self.self_attn = MultiheadAttention( d_model, nhead, dropout=attn_dropout, - param_attr=param_attrs[0], + weight_attr=weight_attrs[0], bias_attr=bias_attrs[0]) self.cross_attn = MultiheadAttention( d_model, nhead, dropout=attn_dropout, - param_attr=param_attrs[1], + weight_attr=weight_attrs[1], bias_attr=bias_attrs[1]) self.linear1 = Linear( - d_model, - dim_feedforward, - param_attr=param_attrs[2], - bias_attr=bias_attrs[2]) + d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) self.dropout = Dropout( act_dropout, dropout_implementation="upscale_in_train") self.linear2 = Linear( - dim_feedforward, - d_model, - param_attr=param_attrs[2], - bias_attr=bias_attrs[2]) + dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.norm3 = LayerNorm(d_model) @@ -739,13 +719,13 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): Applies a Transformer decoder layer on the input. Parameters: - tgt (Variable): The input of Transformer decoder layer. It is a tensor + tgt (Tensor): The input of Transformer decoder layer. It is a tensor with shape `[batch_size, target_length, d_model]`. The data type should be float32 or float64. - memory (Variable): The output of Transformer encoder. It is a tensor + memory (Tensor): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. 
The data type should be float32 or float64. - tgt_mask (Variable, optional): A tensor used in self attention + tgt_mask (Tensor, optional): A tensor used in self attention to prevents attention to some unwanted positions, usually the the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, target_length]`, @@ -753,7 +733,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None - memory_mask (Variable, optional): A tensor used in decoder-encoder + memory_mask (Tensor, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, usually the paddings. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, source_length]`, where the @@ -768,7 +748,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): None. Returns: - Variable|tuple: It is a tensor that has the same shape and data type \ + Tensor|tuple: It is a tensor that has the same shape and data type \ as `tgt`, representing the output of Transformer decoder layer. \ Or a tuple if `cache` is not None, except for decoder layer output, \ the tuple includes the new cache which is same as input `cache` \ @@ -817,7 +797,7 @@ def gen_cache(self, memory): of `MultiheadAttention.StaticCache`. Parameters: - memory (Variable): The output of Transformer encoder. It is a tensor + memory (Tensor): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. @@ -889,13 +869,13 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): layer. Parameters: - tgt (Variable): The input of Transformer decoder. It is a tensor + tgt (Tensor): The input of Transformer decoder. It is a tensor with shape `[batch_size, target_length, d_model]`. The data type should be float32 or float64. - memory (Variable): The output of Transformer encoder. It is a tensor + memory (Tensor): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. - tgt_mask (Variable, optional): A tensor used in self attention + tgt_mask (Tensor, optional): A tensor used in self attention to prevents attention to some unwanted positions, usually the the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, target_length]`, @@ -903,7 +883,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None - memory_mask (Variable, optional): A tensor used in decoder-encoder + memory_mask (Tensor, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, usually the paddings. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, source_length]`, where the @@ -916,7 +896,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): used for inference and should be None for training. Default None. 
Returns: - Variable|tuple: It is a tensor that has the same shape and data type \ + Tensor|tuple: It is a tensor that has the same shape and data type \ as `tgt`, representing the output of Transformer decoder. \ Or a tuple if `cache` is not None, except for decoder output, \ the tuple includes the new cache which is same as input `cache` \ @@ -956,7 +936,7 @@ def gen_cache(self, memory, do_zip=False): Parameters: - memory (Variable): The output of Transformer encoder. It is a tensor + memory (Tensor): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. do_zip (bool, optional): Indicate whether to apply `zip` on the tuples. @@ -988,7 +968,7 @@ class Transformer(Layer): normalization (in pre-process or post-precess of multi-head attention or FFN), and some transformer like models are different on this, such as `BERT `_ and `GPT2 `_ . - The default architecture here places layer normalization in pre-process and + The default architecture here places layer normalization in post-process and applies another layer normalization on the output of last encoder/decoder layer. Parameters: @@ -1012,22 +992,23 @@ class Transformer(Layer): normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Default False - param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. - If it is a tuple, `param_attr[0]` would be used as `param_attr` for - self attention, `param_attr[1]` would be used as `param_attr` for - cross attention, and `param_attr[2]` would be used as `param_attr` + weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for + self attention, `weight_attr[1]` would be used as `weight_attr` for + cross attention, and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. Otherwise, the three sub-layers all uses it as - `param_attr` to create parameters. Default: None, which means the + `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details - in :ref:`api_fluid_ParamAttr` . + in :code:`ParamAttr` . bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` would be used as `bias_attr` for cross attention, and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. Otherwise, the three sub-layers all uses it as - `bias_attr` to create parameters. Default: None, which means the - default bias parameter property is used. See usage for details - in :ref:`api_fluid_ParamAttr` . + `bias_attr` to create parameters. The `False` value means the + corresponding layer would not have trainable bias parameter. See + usage for details in :code:`ParamAttr` . Default: None,which means + the default bias parameter property is used. custom_encoder (Layer): If custom encoder is provided, use it as the encoder. Default None custom_decoder (Layer): If custom decoder is provided, use it as the decoder. 
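For orientation, a minimal usage sketch of the `Transformer` class whose parameters are documented above (not part of the patch; hyper-parameters and shapes are arbitrary, and the `paddle.nn.Transformer` import path follows the alias this series adds):

    .. code-block:: python

        import paddle
        import paddle.nn as nn

        transformer = nn.Transformer(
            d_model=128,
            nhead=2,
            num_encoder_layers=2,
            num_decoder_layers=2,
            dim_feedforward=512)

        src = paddle.rand((2, 4, 128))  # [batch_size, source_length, d_model]
        tgt = paddle.rand((2, 6, 128))  # [batch_size, target_length, d_model]
        # mask for decoder self attention:
        # [batch_size, n_head, target_length, target_length]
        tgt_mask = paddle.rand((2, 2, 6, 6))

        output = transformer(src, tgt, tgt_mask=tgt_mask)  # [2, 6, 128]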
@@ -1069,7 +1050,7 @@ def __init__(self, attn_dropout=None, act_dropout=None, normalize_before=False, - param_attr=None, + weight_attr=None, bias_attr=None, custom_encoder=None, custom_decoder=None): @@ -1080,7 +1061,7 @@ def __init__(self, else: encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before, param_attr, + attn_dropout, act_dropout, normalize_before, weight_attr, bias_attr) encoder_norm = LayerNorm(d_model) self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, @@ -1091,7 +1072,7 @@ def __init__(self, else: decoder_layer = TransformerDecoderLayer( d_model, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before, param_attr, + attn_dropout, act_dropout, normalize_before, weight_attr, bias_attr) decoder_norm = LayerNorm(d_model) self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, @@ -1105,16 +1086,16 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): Applies a Transformer model on the inputs. Parameters: - src (Variable): The input of Transformer encoder. It is a tensor + src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. - tgt (Variable): The input of Transformer decoder. It is a tensor + tgt (Tensor): The input of Transformer decoder. It is a tensor with shape `[batch_size, target_length, d_model]`. The data type should be float32 or float64. - memory (Variable): The output of Transformer encoder. It is a tensor + memory (Tensor): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. - tgt_mask (Variable, optional): A tensor used in self attention + tgt_mask (Tensor, optional): A tensor used in self attention to prevents attention to some unwanted positions, usually the the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, target_length]`, @@ -1122,7 +1103,7 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None - memory_mask (Variable, optional): A tensor used in decoder-encoder + memory_mask (Tensor, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, usually the paddings. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, source_length]`, where the @@ -1131,7 +1112,7 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): nothing wanted or needed to be prevented attention to. Default None Returns: - Variable: It is a tensor that has the same shape and data type \ + Tensor: It is a tensor that has the same shape and data type \ as `tgt`, representing the output of Transformer decoder. """ memory = self.encoder(src, src_mask=src_mask) From 48f97e193d6af9abe74c02128957c0d1cd892c98 Mon Sep 17 00:00:00 2001 From: guosheng Date: Sun, 23 Aug 2020 17:03:54 +0800 Subject: [PATCH 16/17] Update Transformer apis by renaming MultiheadAttention and cal_kv according to comments. 
test=develop --- python/paddle/nn/__init__.py | 6 +++ python/paddle/nn/layer/transformer.py | 68 +++++++++++++-------------- 2 files changed, 39 insertions(+), 35 deletions(-) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 3dd1c1d94fbd70..6cd2379c61e67d 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -118,6 +118,12 @@ # from .layer.rnn import RNNCell #DEFINE_ALIAS # from .layer.rnn import GRUCell #DEFINE_ALIAS # from .layer.rnn import LSTMCell #DEFINE_ALIAS +from .layer.transformer import MultiHeadAttention +from .layer.transformer import TransformerEncoderLayer +from .layer.transformer import TransformerEncoder +from .layer.transformer import TransformerDecoderLayer +from .layer.transformer import TransformerDecoder +from .layer.transformer import Transformer from .layer.distance import PairwiseDistance #DEFINE_ALIAS from .layer import loss #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index c914d51f0930ff..50a8755ac9f7b0 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -14,7 +14,7 @@ # TODO: define the classes of Transformer neural network __all__ = [ - 'MultiheadAttention', + 'MultiHeadAttention', 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', @@ -25,8 +25,6 @@ import copy import collections -import numpy as np - from ...fluid import layers from ...fluid.param_attr import ParamAttr from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList @@ -66,7 +64,7 @@ def _convert_param_attr_to_list(param_attr, n): return param_attrs -class MultiheadAttention(Layer): +class MultiHeadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending @@ -104,7 +102,7 @@ class MultiheadAttention(Layer): query = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, num_heads, query_len, query_len] attn_mask = paddle.rand((2, 2, 4, 4)) - multi_head_attn = paddle.MultiheadAttention(128, 2) + multi_head_attn = paddle.MultiHeadAttention(128, 2) output = multi_head_attn(query, attn_mask=attn_mask) # [2, 4, 128] """ @@ -120,7 +118,7 @@ def __init__(self, need_weights=False, weight_attr=None, bias_attr=None): - super(MultiheadAttention, self).__init__() + super(MultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim @@ -158,11 +156,11 @@ def _prepare_qkv(self, query, key, value, cache=None): is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. - cache (MultiheadAttention.Cache|MultiheadAttention.StaticCache, optional): + cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in - MultiheadAttention. If is an instance of `Cache`, `k` and `v` + MultiHeadAttention. If is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. 
If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and @@ -185,7 +183,7 @@ def _prepare_qkv(self, query, key, value, cache=None): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: - k, v = self.cal_kv(key, value) + k, v = self.compute_kv(key, value) if isinstance(cache, self.Cache): # for decoder self-attention in inference @@ -195,7 +193,7 @@ def _prepare_qkv(self, query, key, value, cache=None): return (q, k, v) if cache is None else (q, k, v, cache) - def cal_kv(self, key, value): + def compute_kv(self, key, value): """ Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation @@ -230,13 +228,13 @@ def cal_kv(self, key, value): def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. - The generated cache is an instance of `MultiheadAttention.Cache` or an - instance of `MultiheadAttention.StaticCache`. + The generated cache is an instance of `MultiHeadAttention.Cache` or an + instance of `MultiHeadAttention.StaticCache`. `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields, and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations - in MultiheadAttention. + in MultiHeadAttention. If the generated cache is an instance of `Cache`, `k` and `v` fields reserve intermediate result tensors of previous positions, and the tensors @@ -250,8 +248,8 @@ def gen_cache(self, key, value=None, type=Cache): The cache is generated as follows: - 1. If `type` is `StaticCache`, apply `cal_kv(key, value)` and use the results - to create an instance of `StaticCache`. + 1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the + results to create an instance of `StaticCache`. 2. If `type` is `Cache` and `value` is None, generate empty tensors shaped `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results @@ -270,14 +268,14 @@ def gen_cache(self, key, value=None, type=Cache): is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, `key` is only for batch size reference. Default None. - type (type): It should be `MultiheadAttention.StaticCache` or - `MultiheadAttention.Cache` to indicate the cache type to generate. + type (type): It should be `MultiHeadAttention.StaticCache` or + `MultiHeadAttention.Cache` to indicate the cache type to generate. Returns: namedtuple: an instance of `Cache` or `StaticCache` accordingly. """ - if type == MultiheadAttention.StaticCache: # static_kv - k, v = self.cal_kv(key, value) + if type == MultiHeadAttention.StaticCache: # static_kv + k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( @@ -320,11 +318,11 @@ def forward(self, query, key, value, attn_mask=None, cache=None): have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. 
Default None - cache (MultiheadAttention.Cache|MultiheadAttention.StaticCache, optional): + cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in - MultiheadAttention. If it is an instance of `Cache`, `k` and `v` + MultiHeadAttention. If it is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and @@ -464,7 +462,7 @@ def __init__(self, weight_attrs = _convert_param_attr_to_list(weight_attr, 2) bias_attrs = _convert_param_attr_to_list(bias_attr, 2) - self.self_attn = MultiheadAttention( + self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, @@ -685,13 +683,13 @@ def __init__(self, weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) - self.self_attn = MultiheadAttention( + self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0]) - self.cross_attn = MultiheadAttention( + self.cross_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, @@ -741,8 +739,8 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None cache (tuple, optional): It is a tuple( :code:`(incremental_cache, static_cache)` ), - `incremental_cache` is an instance of `MultiheadAttention.Cache`, - `static_cache` is an instance of `MultiheadAttention.StaticCache. + `incremental_cache` is an instance of `MultiHeadAttention.Cache`, + `static_cache` is an instance of `MultiHeadAttention.StaticCache. See `TransformerDecoderLayer.gen_cache` for more details. It is only used for inference and should be None for training. Default None. @@ -753,7 +751,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): Or a tuple if `cache` is not None, except for decoder layer output, \ the tuple includes the new cache which is same as input `cache` \ argument but `incremental_cache` in it has an incremental length. \ - See `MultiheadAttention.gen_cache` and `MultiheadAttention.forward` \ + See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ residual = tgt @@ -793,8 +791,8 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): def gen_cache(self, memory): """ Generates cache for `forward` usage. The generated cache is a tuple - composed of an instance of `MultiheadAttention.Cache` and an instance - of `MultiheadAttention.StaticCache`. + composed of an instance of `MultiHeadAttention.Cache` and an instance + of `MultiHeadAttention.StaticCache`. Parameters: memory (Tensor): The output of Transformer encoder. It is a tensor @@ -803,13 +801,13 @@ def gen_cache(self, memory): Returns: tuple: It is a tuple( :code:`(incremental_cache, static_cache)` ). 
\ - `incremental_cache` is an instance of `MultiheadAttention.Cache` \ - produced by `self_attn.gen_cache(memory, MultiheadAttention.Cache)`, \ + `incremental_cache` is an instance of `MultiHeadAttention.Cache` \ + produced by `self_attn.gen_cache(memory, MultiHeadAttention.Cache)`, \ it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. \ - `static_cache` is an instance of `MultiheadAttention.StaticCache` \ - produced by `cross_attn.gen_cache(memory, MultiheadAttention.StaticCache)`, \ + `static_cache` is an instance of `MultiHeadAttention.StaticCache` \ + produced by `cross_attn.gen_cache(memory, MultiHeadAttention.StaticCache)`, \ it reserves two tensors shaped `[batch_size, nhead, source_length, d_model // nhead]`. - See `MultiheadAttention.gen_cache` and `MultiheadAttention.forward` \ + See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ incremental_cache = self.self_attn.gen_cache( @@ -901,7 +899,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): Or a tuple if `cache` is not None, except for decoder output, \ the tuple includes the new cache which is same as input `cache` \ argument but `incremental_cache` in it has an incremental length. \ - See `MultiheadAttention.gen_cache` and `MultiheadAttention.forward` \ + See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ output = tgt From 1beb0754f846293bd207b6cb04f43771a30130e4 Mon Sep 17 00:00:00 2001 From: guosheng Date: Sun, 23 Aug 2020 19:27:45 +0800 Subject: [PATCH 17/17] Fix MultiHeadAttention in test_transformer_api.py. test=develop --- .../fluid/tests/unittests/test_transformer_api.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py index 8384a346c6375e..c8d1e77134036b 100644 --- a/python/paddle/fluid/tests/unittests/test_transformer_api.py +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -15,7 +15,7 @@ import numpy as np import paddle import paddle.fluid as fluid -from paddle.nn.layer.transformer import MultiheadAttention, TransformerEncoderLayer, TransformerDecoderLayer, TransformerEncoder, TransformerDecoder, Transformer +from paddle.nn.layer.transformer import MultiHeadAttention, TransformerEncoderLayer, TransformerDecoderLayer, TransformerEncoder, TransformerDecoder, Transformer import unittest @@ -225,7 +225,7 @@ def multihead_attention_test_helper(self_attention, cache): attn_mask = np.concatenate((attn_mask, attn_mask), axis=3) need_weight, param_attr, bias_attr = False, None, None # call paddle's function - multi_head_attn = MultiheadAttention( + multi_head_attn = MultiHeadAttention( embed_dim, num_heads, attn_dropout, kdim, vdim, need_weight, param_attr, bias_attr) # construct cache object @@ -298,7 +298,7 @@ def test_transformer_encoder_layer(self): paddle.to_variable(src_mask)) # paddle.to_variable(src_mask)) # 4.numpy: # paddle self attention - self_attn = MultiheadAttention( + self_attn = MultiHeadAttention( d_model, n_head, dropout=attn_dropout) attn_output = self_attn( paddle.to_variable(src), @@ -334,9 +334,9 @@ def test_transformer_decoder_layer(self): source_length)).astype("float32") memory_mask[0][0][0][0] = -1e9 for cache in [True, False]: - self_attn = MultiheadAttention( + self_attn = MultiHeadAttention( d_model, n_head, dropout=attn_dropout) - cross_attn = MultiheadAttention( + cross_attn = MultiHeadAttention( 
d_model, n_head, dropout=attn_dropout) # paddle decoderlayer:
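As a hedged illustration of the cache workflow these tests cover, a sketch of how `TransformerDecoderLayer.gen_cache` and the `(incremental_cache, static_cache)` tuple documented earlier could be driven step by step at inference time (not part of the test file; shapes are arbitrary, and feeding the layer output back in is only a stand-in for real target embeddings):

    .. code-block:: python

        import paddle
        import paddle.nn as nn

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=128, nhead=2, dim_feedforward=512)

        memory = paddle.rand((2, 4, 128))        # encoder output
        cache = decoder_layer.gen_cache(memory)  # (incremental_cache, static_cache)

        tgt_step = paddle.rand((2, 1, 128))      # one target position per step
        for _ in range(3):
            # with a non-None cache the layer returns (output, updated_cache)
            out, cache = decoder_layer(tgt_step, memory, cache=cache)
            tgt_step = out  # placeholder; a real model feeds the next embedding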