From a348883c96c2cad29fb96ddf07b32275e1486d1b Mon Sep 17 00:00:00 2001 From: guosheng Date: Sat, 15 Aug 2020 19:05:50 +0800 Subject: [PATCH 01/17] Add MultiHeadAttention api. test=develop --- python/paddle/nn/layer/transformer.py | 708 ++++++++++++++++++++++++++ 1 file changed, 708 insertions(+) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 2b926b5ab36904..0fb3f5b0c5ee6c 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -14,3 +14,711 @@ # TODO: define the classes of Transformer neural network # __all__ = [ ] + +import numpy as np + +from ...fluid import layers +from ...fluid.dygraph import Layer, Linear +from ...fluid.initializer import Normal +from .. import functional as F +from ...fluid.layers import utils +from ...fluid.layers.utils import map_structure + + +class MultiHeadAttention(Layer): + """ + Attention mapps queries and a set of key-value pairs to outputs, and + Multi-Head Attention performs multiple parallel attention to jointly attending + to information from different representation subspaces. + + Please refer to `Attention Is All You Need `_ + for more details. + + Parameters: + embed_dim (int): The expected feature size in the input and output. + num_heads (int): The number of heads in multi-head attention. + dropout (float, optional): The dropout probability used on attention + weights to drop some attention targets. 0 for no dropout. Default 0 + kdim (int, optional): The feature size in key. If None, assumed equal to + `embed_dim`. Default None. + vdim (int, optional): The feature size in key. If None, assumed equal to + `embed_dim`. Default None. + need_weights (bool, optional): Indicate whether to return the attention + weights. Default False. + param_attr(ParamAttr, optional): To specify the weight parameter property. + Default: None, which means the default weight parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + bias_attr (ParamAttr, optional): To specify the bias parameter property. + Default: None, which means the default bias parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + + Examples: + + .. 
code-block:: python + + import paddle + from paddle.incubate.hapi.text import MultiHeadAttention + + # encoder input: [batch_size, sequence_length, d_model] + query = paddle.rand((2, 4, 128)) + # self attention bias: [batch_size, n_head, src_len, src_len] + attn_bias = paddle.rand((2, 2, 4, 4)) + multi_head_attn = MultiHeadAttention(64, 64, 128, n_head=2) + output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] + """ + + def __init__(self, + embed_dim, + num_heads, + dropout=0., + kdim=None, + vdim=None, + need_weights=True, + param_attr=None, + bias_attr=None): + super(MultiHeadAttention, self).__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.need_weights = need_weights + + self.head_dim = embed_dim // num_heads + assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + + self.q_proj = Linear( + input_dim=embed_dim, + output_dim=embed_dim, + param_attr=param_attr, + bias_attr=bias_attr) + self.k_proj = Linear( + input_dim=self.kdim, + output_dim=embed_dim, + param_attr=param_attr, + bias_attr=bias_attr) + self.v_proj = Linear( + input_dim=self.vdim, + output_dim=embed_dim, + param_attr=param_attr, + bias_attr=bias_attr) + self.out_proj = Linear( + input_dim=embed_dim, + output_dim=embed_dim, + param_attr=param_attr, + bias_attr=bias_attr) + + def _prepare_qkv(self, query, key, value, cache=None): + """ + Prapares linear projected queries, keys and values for usage of subsequnt + multiple parallel attention. If `cache` is not None, using cached results + to reduce redundant calculations. + + Parameters: + query (Variable): The queries for multi-head attention. It is a + tensor with shape `[batch_size, sequence_length, embed_dim]`. The + data type should be float32 or float64. + key (Variable): The keys for multi-head attention. It is + a tensor with shape `[batch_size, sequence_length, kdim]`. The + data type should be float32 or float64. If None, use `query` as + `key`. + value (Variable): The values for multi-head attention. It + is a tensor with shape `[batch_size, sequence_length, vdim]`. + The data type should be float32 or float64. If None, use `query` as + `value`. + cache (dict, optional): It is a dict with `k` and `v` as keys or + `static_k` and `static_v` as keys, and values are tensors shaped + `[batch_size, num_heads, length, embed_dim]` which are results of + linear projection, reshape and transpose calculations. If keys are + `k` and `v`, the values reserve intermediate results of previous + positions, and would be updated by new tensors concatanating raw + tensors with results of current position, which mostly used for + decoder self attention. If keys are `static_k` and `static_v`, + `key` and `value` args would be ignored, and the values in dict + would be used as calculated results on `key` and `value`, which + mostly used for decoder-encoder cross attention. It is only used + for inference and should be None for training. Default None. + + Returns: + tuple: A tuple including linear projected keys and values. These two \ + tensors have shapes `[batch_size, n_head, sequence_length, d_key]` \ + and `[batch_size, n_head, sequence_length, d_value]` separately, \ + and their data types are same as inputs. 
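        The reshape-and-transpose head split that produces these shapes can be
        sketched with plain NumPy (a minimal illustration with made-up sizes,
        matching the dimensions used in the class example above):

        .. code-block:: python

            import numpy as np

            batch_size, seq_len, num_heads, head_dim = 2, 4, 2, 64
            embed_dim = num_heads * head_dim  # 128

            # output of the q/k/v linear projection: [batch_size, seq_len, embed_dim]
            q = np.random.rand(batch_size, seq_len, embed_dim).astype("float32")

            # split heads: [batch_size, seq_len, num_heads, head_dim]
            q = q.reshape(batch_size, seq_len, num_heads, head_dim)
            # bring the head dim forward: [batch_size, num_heads, seq_len, head_dim]
            q = q.transpose(0, 2, 1, 3)
            assert q.shape == (2, 2, 4, 64)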
+ """ + q = self.q_proj(query) + q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) + q = layers.transpose(x=q, perm=[0, 2, 1, 3]) + + if cache is not None and "static_k" in cache: + # for encoder-decoder attention in inference and has cached + k, v = cache["static_k"], cache["static_v"] + else: + k, v = self.cal_kv(key, value) + + if cache is not None and "static_k" not in cache: + # for decoder self-attention in inference + cache_k, cache_v = cache["k"], cache["v"] + k = layers.concat([cache_k, k], axis=2) + v = layers.concat([cache_v, v], axis=2) + cache["k"], cache["v"] = k, v + + return q, k, v + + def cal_kv(self, key, value): + """ + Applies linear projection on input keys and values, then splits heads + (reshape and transpose) to get keys and values from different representation + subspaces. The results are used as key-values pairs for subsequent multiple + parallel attention. + + It is part of calculations in multi-head attention, and is provided as + a method to prefetch these results, by which we can use them as cache. + + Parameters: + key (Variable, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, sequence_length, kdim]`. The + data type should be float32 or float64. + value (Variable, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, sequence_length, vdim]`. + The data type should be float32 or float64. + + Returns: + tuple: A tuple including linear projected keys and values. Their shapes \ + both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`. \ + and their data types are same as inputs. + """ + k = self.k_proj(key) + v = self.v_proj(value) + k = layers.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) + k = layers.transpose(x=k, perm=[0, 2, 1, 3]) + v = layers.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) + v = layers.transpose(x=v, perm=[0, 2, 1, 3]) + return k, v + + def forward(self, query, key, value, attn_mask=None, cache=None): + """ + Applies multi-head attention to map queries and a set of key-value pairs + to outputs. + + Parameters: + query (Variable): The queries for multi-head attention. It is a + tensor with shape `[batch_size, sequence_length, embed_dim]`. The + data type should be float32 or float64. + key (Variable, optional): The keys for multi-head attention. It is + a tensor with shape `[batch_size, sequence_length, kdim]`. The + data type should be float32 or float64. If None, use `query` as + `key`. Default None. + value (Variable, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, sequence_length, vdim]`. + The data type should be float32 or float64. If None, use `query` as + `value`. Default None. + attn_mask (Variable, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + cache (dict, optional): It is a dict with `k` and `v` as keys or + `static_k` and `static_v` as keys, and values are tensors shaped + `[batch_size, num_heads, length, embed_dim]` which are results of + linear projection, reshape and transpose calculations. 
If keys are + `k` and `v`, the values reserve intermediate results of previous + positions, and would be updated by new tensors concatanating raw + tensors with results of current position, which mostly used for + decoder self attention. If keys are `static_k` and `static_v`, + `key` and `value` args would be ignored, and the values in dict + would be used as calculated results on `key` and `value`, which + mostly used for decoder-encoder cross attention. It is only used + for inference and should be None for training. Default None. + + Returns: + Variable: The output of multi-head attention. It is a tensor \ + that has the same shape and data type as `queries`. + """ + key = query if key is None else key + value = query if value is None else value + # compute q ,k ,v + q, k, v = self._prepare_qkv(query, key, value, cache) + + # scale dot product attention + product = layers.matmul( + x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5) + if attn_mask is not None: + # TODO(guosheng): support bool mask + product = product + attn_mask + weights = layers.softmax(product) + if self.dropout: + weights = layers.dropout( + weights, + dropout_prob=self.dropout, + dropout_implementation="upscale_in_train", + is_test=False) + + out = layers.matmul(weights, v) + + # combine heads + out = layers.transpose(out, perm=[0, 2, 1, 3]) + out = layers.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) + + # project to output + out = self.out_proj(out) + return (out, weights) if self.need_weights else out + + +class TransformerEncoderLayer(Layer): + """ + TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) + attention and feedforward network. Before and after each sub-layer, pre-process + and post-precess would be applied on the input and output. + + Parameters: + n_head (int): The number of heads in multi-head attention(MHA). + d_key (int): The feature size to transformer queries and keys as in + multi-head attention. Mostly it equals to `d_model // n_head`. + d_value (int): The feature size to transformer values as in multi-head + attention. Mostly it equals to `d_model // n_head`. + d_model (int): The expected feature size in the input and output. + d_inner_hid (int): The hidden layer size in the feedforward network(FFN). + prepostprocess_dropout (float, optional): The dropout probability used + in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 + attention_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. Default 0.1 + relu_dropout (float, optional): The dropout probability used after FFN + activition. Default 0.1 + preprocess_cmd (str, optional): The process applied before each MHA and + FFN sub-layer, and it also would be applied on output of the last + stacked layer. It should be a string composed of `d`, `a`, `n`, + where `d` for dropout, `a` for add residual connection, `n` for + layer normalization. Default `n`. + postprocess_cmd (str, optional): The process applied after each MHA and + FFN sub-layer. Same as `preprocess_cmd`. It should be a string + composed of `d`, `a`, `n`, where `d` for dropout, `a` for add + residual connection, `n` for layer normalization. Default `da`. + ffn_fc1_act (str, optional): The activation function in the feedforward + network. Default relu. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.incubate.hapi.text import TransformerEncoderLayer + + # encoder input: [batch_size, src_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # self attention bias: [batch_size, n_head, src_len, src_len] + attn_bias = paddle.rand((2, 2, 4, 4)) + encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512) + enc_output = encoder_layer(enc_input, attn_bias) # [2, 4, 128] + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="relu", + attn_dropout=0.1, + act_dropout=0.1, + norm=True): + + super(TransformerEncoderLayer, self).__init__() + + self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, + attention_dropout) + self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, + prepostprocess_dropout) + self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act) + self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, + prepostprocess_dropout) + + def forward(self, src, src_mask=None): + """ + Applies a Transformer encoder layer on the input. + + Parameters: + enc_input (Variable): The input of Transformer encoder layer. It is + a tensor with shape `[batch_size, sequence_length, d_model]`. + The data type should be float32 or float64. + attn_bias(Variable, optional): A tensor used in encoder self attention + to mask out attention on unwanted positions, usually the paddings. It + is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be masked out. Default None + + Returns: + Variable: The output of Transformer encoder layer. It is a tensor that \ + has the same shape and data type as `enc_input`. + """ + attn_output = self.self_attn( + self.preprocesser1(enc_input), None, None, attn_bias) + attn_output = self.postprocesser1(attn_output, enc_input) + + ffn_output = self.ffn(self.preprocesser2(attn_output)) + ffn_output = self.postprocesser2(ffn_output, attn_output) + return ffn_output + + +class TransformerCell(Layer): + """ + TransformerCell wraps a Transformer decoder producing logits from `inputs` + composed by ids and position. + + Parameters: + decoder(callable): A TransformerDecoder instance. Or a wrapper of it that + includes a embedding layer accepting ids and positions instead of embeddings + and includes a output layer transforming decoder output features to logits. + embedding_fn(function, optional): A callable that accepts ids and position + as arguments and return embeddings as input of `decoder`. It can be + None if `decoder` includes a embedding layer. Default None. + output_fn(callable, optional): A callable applid on `decoder` output to + transform decoder output features to get logits. Mostly it is a Linear + layer with vocabulary size. It can be None if `decoder` includes a + output layer. Default None. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.fluid.dygraph import Embedding, Linear + from paddle.incubate.hapi.text import TransformerDecoder + from paddle.incubate.hapi.text import TransformerCell + from paddle.incubate.hapi.text import TransformerBeamSearchDecoder + from paddle.incubate.hapi.text import DynamicDecode + + paddle.disable_static() + + class Embedder(fluid.dygraph.Layer): + def __init__(self): + super(Embedder, self).__init__() + self.word_embedder = Embedding(size=[1000, 128]) + self.pos_embedder = Embedding(size=[500, 128]) + + def forward(self, word, position): + return self.word_embedder(word) + self.pos_embedder(position) + + embedder = Embedder() + output_layer = Linear(128, 1000) + decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) + transformer_cell = TransformerCell(decoder, embedder, output_layer) + dynamic_decoder = DynamicDecode( + TransformerBeamSearchDecoder( + transformer_cell, + start_token=0, + end_token=1, + beam_size=4, + var_dim_in_state=2), + max_step_num=10, + is_test=True) + + enc_output = paddle.rand((2, 4, 128)) + # cross attention bias: [batch_size, n_head, trg_len, src_len] + trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) + # inputs for beam search on Transformer + caches = transformer_cell.get_initial_states(enc_output) + enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + enc_output, beam_size=4) + trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + trg_src_attn_bias, beam_size=4) + static_caches = decoder.prepare_static_cache(enc_output) + outputs = dynamic_decoder( + inits=caches, + enc_output=enc_output, + trg_src_attn_bias=trg_src_attn_bias, + static_caches=static_caches) + """ + + def __init__(self, decoder, embed_layer=None, output_layer=None): + super(TransformerCell, self).__init__() + self.decoder = decoder + self.embedding_fn = embedding_fn + self.output_fn = output_fn + + def forward(self, + inputs, + states=None, + enc_output=None, + trg_slf_attn_bias=None, + trg_src_attn_bias=None, + static_caches=[]): + """ + Produces logits from `inputs` composed by ids and positions. + + Parameters: + inputs(tuple): A tuple includes target ids and positions. The two + tensors both have int64 data type and with 2D shape + `[batch_size, sequence_length]` where `sequence_length` is 1 + for inference. + states(list): It caches the multi-head attention intermediate results + of history decoding steps. It is a list of dict where the length + of list is decoder layer number, and each dict has `k` and `v` as + keys and values are cached results. Default None + enc_output(Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, sequence_length, d_model]`. The data type + should be float32 or float64. + trg_slf_attn_bias(Variable, optional): A tensor used in decoder self + attention to mask out attention on unwanted target positions. It + is a tensor with shape `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. It can be None when nothing wanted or needed to + be masked out. It can be None for inference. The data type should + be float32 or float64. Default None + trg_src_attn_bias(Variable, optional): A tensor used in decoder-encoder + cross attention to mask out unwanted attention on source (encoder output). 
+ It is a tensor with shape `[batch_size, n_head, target_length, source_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. It can be None when nothing wanted or needed to + be masked out. The data type should be float32 or float64. Default None + static_caches(list): It stores projected results of encoder output + to be used as keys and values in decoder-encoder cross attention + It is a list of dict where the length of list is decoder layer + number, and each dict has `static_k` and `static_v` as keys and + values are stored results. Default empty list + + Returns: + tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \ + is a float32 or float64 3D tensor representing logits shaped \ + `[batch_size, sequence_length, vocab_size]`. `new_states has \ + the same structure and data type with `states` while the length \ + is one larger since the intermediate results of current step are \ + concatenated into it. + """ + trg_word, trg_pos = inputs + if states and static_caches: + for cache, static_cache in zip(states, static_caches): + cache.update(static_cache) + if self.embedding_fn is not None: + dec_input = self.embedding_fn(trg_word, trg_pos) + outputs = self.decoder(dec_input, enc_output, None, + trg_src_attn_bias, states) + else: + outputs = self.decoder(trg_word, trg_pos, enc_output, None, + trg_src_attn_bias, states) + if self.output_fn is not None: + outputs = self.output_fn(outputs) + + new_states = [{ + "k": cache["k"], + "v": cache["v"] + } for cache in states] if states else states + return outputs, new_states + + @property + def state_shape(self): + """ + States of TransformerCell cache the multi-head attention intermediate + results of history decoding steps, and have a increasing length as + decoding continued. + + `state_shape` of TransformerCell is used to initialize states. It is a + list of dict where the length of list is decoder layer, and each dict + has `k` and `v` as keys and values are `[n_head, 0, d_key]`, `[n_head, 0, d_value]` + separately. (-1 for batch size would be automatically inserted into shape). + + Returns: + list: It is a list of dict where the length of list is decoder layer \ + number, and each dict has `k` and `v` as keys and values are cached \ + results. + """ + return [{ + "k": [self.decoder.n_head, 0, self.decoder.d_key], + "v": [self.decoder.n_head, 0, self.decoder.d_value], + } for i in range(self.decoder.n_layer)] + + +class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): + """ + Compared with a RNN step :code:`outputs, new_states = cell(inputs, states)`, + Transformer decoder's `inputs` uses 2D tensor shaped `[batch_size * beam_size, 1]` + and includes extra position data. And its `states` (caches) has increasing + length. These are not consistent with `BeamSearchDecoder`, thus subclass + `BeamSearchDecoder` to make beam search adapt to Transformer decoder. + + Parameters: + cell(TransformerCell): An instance of `TransformerCell`. + start_token(int): The start token id. + end_token(int): The end token id. + beam_size(int): The beam width used in beam search. + var_dim_in_state(int): Indicate which dimension of states is variant. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.fluid as fluid + from paddle.fluid.dygraph import Embedding, Linear + from paddle.incubate.hapi.text import TransformerDecoder + from paddle.incubate.hapi.text import TransformerCell + from paddle.incubate.hapi.text import TransformerBeamSearchDecoder + from paddle.incubate.hapi.text import DynamicDecode + + paddle.disable_static() + + class Embedder(fluid.dygraph.Layer): + def __init__(self): + super(Embedder, self).__init__() + self.word_embedder = Embedding(size=[1000, 128]) + self.pos_embedder = Embedding(size=[500, 128]) + + def forward(self, word, position): + return self.word_embedder(word) + self.pos_embedder(position) + + embedder = Embedder() + output_layer = Linear(128, 1000) + decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) + transformer_cell = TransformerCell(decoder, embedder, output_layer) + dynamic_decoder = DynamicDecode( + TransformerBeamSearchDecoder( + transformer_cell, + start_token=0, + end_token=1, + beam_size=4, + var_dim_in_state=2), + max_step_num=10, + is_test=True) + + enc_output = paddle.rand((2, 4, 128)) + # cross attention bias: [batch_size, n_head, trg_len, src_len] + trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) + # inputs for beam search on Transformer + caches = transformer_cell.get_initial_states(enc_output) + enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + enc_output, beam_size=4) + trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( + trg_src_attn_bias, beam_size=4) + static_caches = decoder.prepare_static_cache(enc_output) + outputs = dynamic_decoder( + inits=caches, + enc_output=enc_output, + trg_src_attn_bias=trg_src_attn_bias, + static_caches=static_caches) + """ + + def __init__(self, cell, start_token, end_token, beam_size, + var_dim_in_state): + super(TransformerBeamSearchDecoder, + self).__init__(cell, start_token, end_token, beam_size) + self.cell = cell + self.var_dim_in_state = var_dim_in_state + + def _merge_batch_beams_with_var_dim(self, x): + """ + Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new + tensor with shape `[batch_size * beam_size, ...]`. + + Parameters: + x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The + data type should be float32, float64, int32, int64 or bool. + + Returns: + Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \ + data type is same as `x`. + """ + # init length of cache is 0, and it increases with decoding carrying on, + # thus need to reshape elaborately + var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim + x = layers.transpose(x, + list(range(var_dim_in_state, len(x.shape))) + + list(range(0, var_dim_in_state))) + x = layers.reshape( + x, [0] * (len(x.shape) - var_dim_in_state + ) + [self.batch_size * self.beam_size] + + [int(size) for size in x.shape[-var_dim_in_state + 2:]]) + x = layers.transpose( + x, + list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) + + list(range(0, (len(x.shape) + 1 - var_dim_in_state)))) + return x + + def _split_batch_beams_with_var_dim(self, x): + """ + Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new + tensor with shape `[batch_size, beam_size, ...]`. + + Parameters: + x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The + data type should be float32, float64, int32, int64 or bool. + + Returns: + Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \ + data type is same as `x`. 
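        Ignoring the growing cache length, the round trip between the two layouts
        is a plain reshape; a minimal NumPy sketch with made-up sizes (the helpers
        above need the extra transpose/reshape steps because the cache length
        changes at every decoding step and is not known statically):

        .. code-block:: python

            import numpy as np

            batch_size, beam_size, num_heads, cache_len, head_dim = 2, 4, 2, 3, 64

            # split layout: [batch_size, beam_size, num_heads, cache_len, head_dim]
            cache = np.random.rand(batch_size, beam_size, num_heads, cache_len, head_dim)

            # merge batch and beam dims: [batch_size * beam_size, num_heads, cache_len, head_dim]
            # cache_len sits at index 2 here, which is what `var_dim_in_state=2` points at
            merged = cache.reshape(batch_size * beam_size, num_heads, cache_len, head_dim)
            assert merged.shape == (8, 2, 3, 64)

            # split them back: [batch_size, beam_size, num_heads, cache_len, head_dim]
            restored = merged.reshape(batch_size, beam_size, num_heads, cache_len, head_dim)
            assert restored.shape == (2, 4, 2, 3, 64)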
+ """ + var_dim_size = layers.shape(x)[self.var_dim_in_state] + x = layers.reshape( + x, [-1, self.beam_size] + + [int(size) + for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + + [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) + return x + + def step(self, time, inputs, states, **kwargs): + """ + Perform a beam search decoding step, which uses `cell` to get probabilities, + and follows a beam search step to calculate scores and select candidate + token ids. + + Note: compared with `BeamSearchDecoder.step`, it feed 2D id tensor shaped + `[batch_size * beam_size, 1]` rather than `[batch_size * beam_size]` combined + position data as inputs to `cell`. + + Parameters: + time(Variable): An `int64` tensor with shape `[1]` provided by the caller, + representing the current time step number of decoding. + inputs(Variable): A tensor variable. It is same as `initial_inputs` + returned by `initialize()` for the first decoding step and + `next_inputs` returned by `step()` for the others. It is a int64 + id tensor with shape `[batch_size * beam_size]` + states(Variable): A structure of tensor variables. + It is same as the `initial_states` returned by `initialize()` for + the first decoding step and `beam_search_state` returned by + `step()` for the others. + **kwargs: Additional keyword arguments, provided by the caller. + + Returns: + tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \ + `beam_search_state` and `next_inputs` have the same structure, \ + shape and data type as the input arguments `states` and `inputs` separately. \ + `beam_search_output` is a namedtuple(including scores, predicted_ids, \ + parent_ids as fields) of tensor variables, where \ + `scores, predicted_ids, parent_ids` all has a tensor value shaped \ + `[batch_size, beam_size]` with data type `float32, int64, int64`. \ + `finished` is a `bool` tensor with shape `[batch_size, beam_size]`. + """ + # compared to RNN, Transformer has 3D data at every decoding step + inputs = layers.reshape(inputs, [-1, 1]) # token + pos = layers.ones_like(inputs) * time # pos + cell_states = map_structure(self._merge_batch_beams_with_var_dim, + states.cell_states) + + cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, + **kwargs) + + # squeeze to adapt to BeamSearchDecoder which use 2D logits + cell_outputs = map_structure( + lambda x: layers.squeeze(x, [1]) if len(x.shape) == 3 else x, + cell_outputs) + cell_outputs = map_structure(self._split_batch_beams, cell_outputs) + next_cell_states = map_structure(self._split_batch_beams_with_var_dim, + next_cell_states) + + beam_search_output, beam_search_state = self._beam_search_step( + time=time, + logits=cell_outputs, + next_cell_states=next_cell_states, + beam_state=states) + next_inputs, finished = (beam_search_output.predicted_ids, + beam_search_state.finished) + + return (beam_search_output, beam_search_state, next_inputs, finished) From b67bd96e5d17d2887e050807ef6709a6ecf3ff3c Mon Sep 17 00:00:00 2001 From: guosheng Date: Mon, 17 Aug 2020 11:46:07 +0800 Subject: [PATCH 02/17] Add MultiHeadAttention cache type and gen_cache. 
test=develop --- python/paddle/nn/layer/transformer.py | 167 ++++++++++++++++++-------- 1 file changed, 117 insertions(+), 50 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 0fb3f5b0c5ee6c..bffc04dd9c0154 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -15,11 +15,12 @@ # TODO: define the classes of Transformer neural network # __all__ = [ ] +import collections + import numpy as np from ...fluid import layers from ...fluid.dygraph import Layer, Linear -from ...fluid.initializer import Normal from .. import functional as F from ...fluid.layers import utils from ...fluid.layers.utils import map_structure @@ -57,16 +58,18 @@ class MultiHeadAttention(Layer): .. code-block:: python import paddle - from paddle.incubate.hapi.text import MultiHeadAttention # encoder input: [batch_size, sequence_length, d_model] query = paddle.rand((2, 4, 128)) - # self attention bias: [batch_size, n_head, src_len, src_len] + # self attention bias: [batch_size, num_heads, query_len, query_len] attn_bias = paddle.rand((2, 2, 4, 4)) - multi_head_attn = MultiHeadAttention(64, 64, 128, n_head=2) + multi_head_attn = paddle.MultiHeadAttention(64, 64, 128, n_head=2) output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] """ + Cache = collections.namedtuple("Cache", ["k", "v"]) + StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) + def __init__(self, embed_dim, num_heads, @@ -116,28 +119,28 @@ def _prepare_qkv(self, query, key, value, cache=None): Parameters: query (Variable): The queries for multi-head attention. It is a - tensor with shape `[batch_size, sequence_length, embed_dim]`. The + tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Variable): The keys for multi-head attention. It is - a tensor with shape `[batch_size, sequence_length, kdim]`. The + a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. value (Variable): The values for multi-head attention. It - is a tensor with shape `[batch_size, sequence_length, vdim]`. + is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. - cache (dict, optional): It is a dict with `k` and `v` as keys or - `static_k` and `static_v` as keys, and values are tensors shaped - `[batch_size, num_heads, length, embed_dim]` which are results of - linear projection, reshape and transpose calculations. If keys are - `k` and `v`, the values reserve intermediate results of previous - positions, and would be updated by new tensors concatanating raw - tensors with results of current position, which mostly used for - decoder self attention. If keys are `static_k` and `static_v`, - `key` and `value` args would be ignored, and the values in dict - would be used as calculated results on `key` and `value`, which - mostly used for decoder-encoder cross attention. It is only used - for inference and should be None for training. Default None. + cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): + It is a namedtuple with `k` and `v` as fields, and stores tensors + shaped `[batch_size, num_heads, length, embed_dim]` which are results + of linear projection, reshape and transpose calculations in + MultiHeadAttention. 
If is an instance of `Cache`, `k` and `v` + fields reserve intermediate results of previous positions, which + mostly used for decoder self attention. If it is an instance of + `StaticCache`, `key` and `value` args would be ignored, `k` and + `v` fields would be used as calculated results on `key` and + `value`, which mostly used for decoder-encoder cross attention. + It is only used for inference and should be None for training. + Default None. Returns: tuple: A tuple including linear projected keys and values. These two \ @@ -149,20 +152,19 @@ def _prepare_qkv(self, query, key, value, cache=None): q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) q = layers.transpose(x=q, perm=[0, 2, 1, 3]) - if cache is not None and "static_k" in cache: + if isinstance(cache, self.StaticCache): # for encoder-decoder attention in inference and has cached - k, v = cache["static_k"], cache["static_v"] + k, v = cache.k, cache.v else: k, v = self.cal_kv(key, value) - if cache is not None and "static_k" not in cache: + if isinstance(cache, self.Cache): # for decoder self-attention in inference - cache_k, cache_v = cache["k"], cache["v"] - k = layers.concat([cache_k, k], axis=2) - v = layers.concat([cache_v, v], axis=2) - cache["k"], cache["v"] = k, v + k = layers.concat([cache.k, k], axis=2) + v = layers.concat([cache.v, v], axis=2) + cache = self.Cache(k, v) - return q, k, v + return (q, k, v) if cache is None else (q, k, v, cache) def cal_kv(self, key, value): """ @@ -172,7 +174,8 @@ def cal_kv(self, key, value): parallel attention. It is part of calculations in multi-head attention, and is provided as - a method to prefetch these results, by which we can use them as cache. + a method to pre-compute and prefetch these results, thus we can use them + to construct cache for inference. Parameters: key (Variable, optional): The keys for multi-head attention. It is @@ -183,8 +186,8 @@ def cal_kv(self, key, value): The data type should be float32 or float64. Returns: - tuple: A tuple including linear projected keys and values. Their shapes \ - both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`. \ + tuple: A tuple including transformed keys and values. Their shapes \ + both are `[batch_size, num_heads, sequence_length, embed_dim // num_heads]`, \ and their data types are same as inputs. """ k = self.k_proj(key) @@ -195,6 +198,53 @@ def cal_kv(self, key, value): v = layers.transpose(x=v, perm=[0, 2, 1, 3]) return k, v + def gen_cache(self, key, value=None, type=MultiHeadAttention.Cache): + """ + Generates cache for `forward` usage accroding to arguments. + + If `type` is `StaticCache`, apply `cal_kv(key, value)` and use the results + to create an instance of `StaticCache`. + + If `type` is `Cache` and `value` is None, generate empty tensors shaped + `[batch_size, num_heads, 0, head_dim]` and use the results to create an + instance of `Cache`, where `batch_size` is from the first dimension of `key`. + + If `type` is `Cache` and `value` is not None, use `key`, `value` to create + an instance of `Cache`. + + Parameters: + key (Variable): The keys for multi-head attention. It is + a tensor with shape `[batch_size, key_length, kdim]`. The + data type should be float32 or float64. + value (Variable, optional): The values for multi-head attention. It + is a tensor with shape `[batch_size, value_length, vdim]`. + The data type should be float32 or float64. If None, `key` is only + for batch size reference. Default None. 
+ type (type): It should be `MultiHeadAttention.StaticCache` or + `MultiHeadAttention.Cache` to indicate the cache type to generate. + + Returns: + namedtupe: an instance of `Cache` or `StaticCache` accordingly. + """ + if type == MultiHeadAttention.StaticCache: # static_kv + k, v = self.cal_kv(key, value) + return self.StaticCache(k, v) + elif value is None: # incremental_state + k = layers.fill_constant_batch_size_like( + input=key, + shape=[-1, self.num_heads, 0, self.head_dim], + dtype=key.dtype, + value=0) + v = layers.fill_constant_batch_size_like( + input=key, + shape=[-1, self.num_heads, 0, self.head_dim], + dtype=key.dtype, + value=0) + return self.Cache(k, v) + else: + # incremental_state with initial value, mainly for usage like UniLM + return self.Cache(key, value) + def forward(self, query, key, value, attn_mask=None, cache=None): """ Applies multi-head attention to map queries and a set of key-value pairs @@ -202,14 +252,14 @@ def forward(self, query, key, value, attn_mask=None, cache=None): Parameters: query (Variable): The queries for multi-head attention. It is a - tensor with shape `[batch_size, sequence_length, embed_dim]`. The + tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. key (Variable, optional): The keys for multi-head attention. It is - a tensor with shape `[batch_size, sequence_length, kdim]`. The + a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. Default None. value (Variable, optional): The values for multi-head attention. It - is a tensor with shape `[batch_size, sequence_length, vdim]`. + is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. Default None. attn_mask (Variable, optional): A tensor used in multi-head attention @@ -220,27 +270,38 @@ def forward(self, query, key, value, attn_mask=None, cache=None): have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None - cache (dict, optional): It is a dict with `k` and `v` as keys or - `static_k` and `static_v` as keys, and values are tensors shaped - `[batch_size, num_heads, length, embed_dim]` which are results of - linear projection, reshape and transpose calculations. If keys are - `k` and `v`, the values reserve intermediate results of previous - positions, and would be updated by new tensors concatanating raw - tensors with results of current position, which mostly used for - decoder self attention. If keys are `static_k` and `static_v`, - `key` and `value` args would be ignored, and the values in dict - would be used as calculated results on `key` and `value`, which - mostly used for decoder-encoder cross attention. It is only used - for inference and should be None for training. Default None. + cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): + It is a namedtuple with `k` and `v` as fields, and stores tensors + shaped `[batch_size, num_heads, length, embed_dim]` which are results + of linear projection, reshape and transpose calculations in + MultiHeadAttention. If is an instance of `Cache`, `k` and `v` + fields reserve intermediate results of previous positions, which + mostly used for decoder self attention. 
If it is an instance of + `StaticCache`, `key` and `value` args would be ignored, `k` and + `v` fields would be used as calculated results on `key` and + `value`, which mostly used for decoder-encoder cross attention. + It is only used for inference and should be None for training. + Default None. Returns: - Variable: The output of multi-head attention. It is a tensor \ - that has the same shape and data type as `queries`. + Variable|tuple: It is a tensor that has the same shape and data type \ + as `query`, representing attention output. Or a tuple if \ + `need_weights` is True or `cache` is not None. If `need_weights` \ + is True, except for attention output, the tuple also includes \ + the attention weights tensor shaped `[batch_size, num_heads, query_length, key_length]`. \ + If `cache` is not None, the tuple then includes the new cache \ + having the same type as `cache`, and if it is `StaticCache`, it \ + is same as the input `cache`, if it is `Cache`, the new cache \ + reserves tensors concatanating raw tensors with intermediate \ + results of current query. """ key = query if key is None else key value = query if value is None else value # compute q ,k ,v - q, k, v = self._prepare_qkv(query, key, value, cache) + if cache is None: + q, k, v = self._prepare_qkv(query, key, value, cache) + else: + q, k, v, cache = self._prepare_qkv(query, key, value, cache) # scale dot product attention product = layers.matmul( @@ -264,7 +325,13 @@ def forward(self, query, key, value, attn_mask=None, cache=None): # project to output out = self.out_proj(out) - return (out, weights) if self.need_weights else out + + outs = [out] + if self.need_weights: + outs.append(weights) + if cache is not None: + outs.append(cache) + return out if len(outs) else outs class TransformerEncoderLayer(Layer): @@ -323,7 +390,7 @@ def __init__(self, activation="relu", attn_dropout=0.1, act_dropout=0.1, - norm=True): + normalize_before=True): super(TransformerEncoderLayer, self).__init__() From d3c1709af456091553e7dd8f9c3f35d0f7d53f0a Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 18 Aug 2020 12:34:43 +0800 Subject: [PATCH 03/17] Add TransformerEncoderLayer and TransformerEncoder. test=develop --- python/paddle/nn/layer/transformer.py | 271 +++++++++++++++++++------- 1 file changed, 196 insertions(+), 75 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index bffc04dd9c0154..7dbbecb869af76 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -15,18 +15,51 @@ # TODO: define the classes of Transformer neural network # __all__ = [ ] +import copy import collections import numpy as np from ...fluid import layers -from ...fluid.dygraph import Layer, Linear +from ...fluid.param_attr import ParamAttr +from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList from .. import functional as F from ...fluid.layers import utils from ...fluid.layers.utils import map_structure -class MultiHeadAttention(Layer): +def _convert_param_attr_to_list(param_attr, n): + """ + If `param_attr` is a list or tuple, convert every element in it to a + ParamAttr instance. Otherwise, repeat `param_attr` `n` times to + construct a list, and rename every one by appending a increasing index + suffix to avoid having same names when `param_attr` contains a name. + + Parameters: + param_attr (list|tuple|ParamAttr): A list, tuple or something can be + converted to a ParamAttr instance by `ParamAttr._to_attr`. 
+ n (int): The times to repeat to construct a list when `param_attr` + is not a list or tuple. + + Returns: + list: A list composed of each including cell's `param_attr`. + """ + if isinstance(param_attr, (list, tuple)): + assert len(param_attr) == n, ( + "length of param_attr should be %d when it is a list/tuple" % n) + param_attrs = [ParamAttr._to_attr(attr) for attr in param_attr] + else: + param_attrs = [] + attr = ParamAttr._to_attr(param_attr) + for i in range(n): + attr_i = copy.deepcopy(attr) + if attr.name: + attr_i.name = attr_i.name + "_" + str(i) + param_attrs.append(attr_i) + return param_attrs + + +class MultiheadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending @@ -61,10 +94,10 @@ class MultiHeadAttention(Layer): # encoder input: [batch_size, sequence_length, d_model] query = paddle.rand((2, 4, 128)) - # self attention bias: [batch_size, num_heads, query_len, query_len] - attn_bias = paddle.rand((2, 2, 4, 4)) - multi_head_attn = paddle.MultiHeadAttention(64, 64, 128, n_head=2) - output = multi_head_attn(query, attn_bias=attn_bias) # [2, 4, 128] + # self attention mask: [batch_size, num_heads, query_len, query_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + multi_head_attn = paddle.MultiheadAttention(64, 64, 128, n_head=2) + output = multi_head_attn(query, attn_mask=attn_mask) # [2, 4, 128] """ Cache = collections.namedtuple("Cache", ["k", "v"]) @@ -76,10 +109,10 @@ def __init__(self, dropout=0., kdim=None, vdim=None, - need_weights=True, + need_weights=False, param_attr=None, bias_attr=None): - super(MultiHeadAttention, self).__init__() + super(MultiheadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim @@ -129,11 +162,11 @@ def _prepare_qkv(self, query, key, value, cache=None): is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. - cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): + cache (MultiheadAttention.Cache|MultiheadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in - MultiHeadAttention. If is an instance of `Cache`, `k` and `v` + MultiheadAttention. If is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and @@ -198,7 +231,7 @@ def cal_kv(self, key, value): v = layers.transpose(x=v, perm=[0, 2, 1, 3]) return k, v - def gen_cache(self, key, value=None, type=MultiHeadAttention.Cache): + def gen_cache(self, key, value=None, type=MultiheadAttention.Cache): """ Generates cache for `forward` usage accroding to arguments. @@ -215,18 +248,19 @@ def gen_cache(self, key, value=None, type=MultiHeadAttention.Cache): Parameters: key (Variable): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The - data type should be float32 or float64. + data type should be float32 or float64. If `value` is None, + it is only for batch size and data type reference. value (Variable, optional): The values for multi-head attention. 
It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, `key` is only for batch size reference. Default None. - type (type): It should be `MultiHeadAttention.StaticCache` or - `MultiHeadAttention.Cache` to indicate the cache type to generate. + type (type): It should be `MultiheadAttention.StaticCache` or + `MultiheadAttention.Cache` to indicate the cache type to generate. Returns: namedtupe: an instance of `Cache` or `StaticCache` accordingly. """ - if type == MultiHeadAttention.StaticCache: # static_kv + if type == MultiheadAttention.StaticCache: # static_kv k, v = self.cal_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state @@ -270,11 +304,11 @@ def forward(self, query, key, value, attn_mask=None, cache=None): have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None - cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): + cache (MultiheadAttention.Cache|MultiheadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in - MultiHeadAttention. If is an instance of `Cache`, `k` and `v` + MultiheadAttention. If is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and @@ -331,55 +365,59 @@ def forward(self, query, key, value, attn_mask=None, cache=None): outs.append(weights) if cache is not None: outs.append(cache) - return out if len(outs) else outs + return out if len(outs) else tuple(outs) class TransformerEncoderLayer(Layer): """ TransformerEncoderLayer is composed of two sub-layers which are self (multi-head) attention and feedforward network. Before and after each sub-layer, pre-process - and post-precess would be applied on the input and output. + and post-precess would be applied on the input and output accordingly. If + `normalize_before` is True, pre-process is layer normalization and post-precess + includes dropout, residual connection. Otherwise, no pre-process and post-precess + includes dropout, residual connection, layer normalization. Parameters: - n_head (int): The number of heads in multi-head attention(MHA). - d_key (int): The feature size to transformer queries and keys as in - multi-head attention. Mostly it equals to `d_model // n_head`. - d_value (int): The feature size to transformer values as in multi-head - attention. Mostly it equals to `d_model // n_head`. d_model (int): The expected feature size in the input and output. - d_inner_hid (int): The hidden layer size in the feedforward network(FFN). - prepostprocess_dropout (float, optional): The dropout probability used - in pre-process and post-precess of MHA and FFN sub-layer. Default 0.1 - attention_dropout (float, optional): The dropout probability used - in MHA to drop some attention target. Default 0.1 - relu_dropout (float, optional): The dropout probability used after FFN - activition. Default 0.1 - preprocess_cmd (str, optional): The process applied before each MHA and - FFN sub-layer, and it also would be applied on output of the last - stacked layer. 
It should be a string composed of `d`, `a`, `n`, - where `d` for dropout, `a` for add residual connection, `n` for - layer normalization. Default `n`. - postprocess_cmd (str, optional): The process applied after each MHA and - FFN sub-layer. Same as `preprocess_cmd`. It should be a string - composed of `d`, `a`, `n`, where `d` for dropout, `a` for add - residual connection, `n` for layer normalization. Default `da`. - ffn_fc1_act (str, optional): The activation function in the feedforward + nhead (int): The number of heads in multi-head attention(MHA). + dim_feedforward (int): The hidden layer size in the feedforward network(FFN). + dropout (float, optional): The dropout probability used in pre-process + and post-precess of MHA and FFN sub-layer. Default 0.1 + activation (str, optional): The activation function in the feedforward network. Default relu. - + attn_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. If None, use the value of + `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `param_attr[0]` would be used as `param_attr` for + MHA, and `param_attr[1]` would be used as `param_attr` for linear in FFN. + Otherwise, MHA and FFN both use it as `param_attr` to create parameters. + Default: None, which means the default weight parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. + If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for + MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. + Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. + Default: None, which means the default bias parameter property is used. + See usage for details in :ref:`api_fluid_ParamAttr` . + Examples: .. 
code-block:: python import paddle - import paddle.fluid as fluid - from paddle.incubate.hapi.text import TransformerEncoderLayer + from paddle import TransformerEncoderLayer # encoder input: [batch_size, src_len, d_model] enc_input = paddle.rand((2, 4, 128)) - # self attention bias: [batch_size, n_head, src_len, src_len] - attn_bias = paddle.rand((2, 2, 4, 4)) - encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512) - enc_output = encoder_layer(enc_input, attn_bias) # [2, 4, 128] + # self attention mask: [batch_size, n_head, src_len, src_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + encoder_layer = TransformerEncoderLayer(128, 2, 512) + enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] """ def __init__(self, @@ -388,51 +426,134 @@ def __init__(self, dim_feedforward, dropout=0.1, activation="relu", - attn_dropout=0.1, - act_dropout=0.1, - normalize_before=True): + attn_dropout=None, + act_dropout=None, + normalize_before=False, + param_attr=None, + bias_attr=None): + self._config = locals() + self._config.pop("self") super(TransformerEncoderLayer, self).__init__() - - self.preprocesser1 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.self_attn = MultiHeadAttention(d_key, d_value, d_model, n_head, - attention_dropout) - self.postprocesser1 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) - - self.preprocesser2 = PrePostProcessLayer(preprocess_cmd, d_model, - prepostprocess_dropout) - self.ffn = FFN(d_inner_hid, d_model, relu_dropout, fc1_act=ffn_fc1_act) - self.postprocesser2 = PrePostProcessLayer(postprocess_cmd, d_model, - prepostprocess_dropout) + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + param_attrs = _convert_param_attr_to_list(param_attr, 2) + bias_attrs = _convert_param_attr_to_list(bias_attr, 2) + + self.self_attn = MultiheadAttention( + d_model, + nhead, + dropout=attn_dropout, + param_attr=param_attrs[0], + bias_attr=bias_attrs[0]) + self.linear1 = Linear( + d_model, + dim_feedforward, + param_attr=param_attrs[1], + bias_attr=bias_attrs[1]) + self.dropout = Dropout( + act_dropout, dropout_implementation="upscale_in_train") + self.linear2 = Linear( + dim_feedforward, + d_model, + param_attr=param_attrs[1], + bias_attr=bias_attrs[1]) + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.dropout1 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.dropout2 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.activation = getattr(layers, activation) def forward(self, src, src_mask=None): """ Applies a Transformer encoder layer on the input. Parameters: - enc_input (Variable): The input of Transformer encoder layer. It is + src (Variable): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. - attn_bias(Variable, optional): A tensor used in encoder self attention - to mask out attention on unwanted positions, usually the paddings. It - is a tensor with shape `[batch_size, n_head, sequence_length, sequence_length]`, + src_mask (Variable, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. 
It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, where the unwanted positions have `-INF` values and the others have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be masked out. Default None + be None when nothing wanted or needed to be prevented attention to. + Default None Returns: Variable: The output of Transformer encoder layer. It is a tensor that \ has the same shape and data type as `enc_input`. """ - attn_output = self.self_attn( - self.preprocesser1(enc_input), None, None, attn_bias) - attn_output = self.postprocesser1(attn_output, enc_input) + residual = src + if self.normalize_before: + src = self.norm1(src) + # TODO(guosheng): Add cache for encoder for the usage like UniLM + src = self.self_attn(src, src, src, src_mask) + src = residual + self.dropout1(src) + if not self.normalize_before: + src = self.norm1(src) + + residual = src + if self.normalize_before: + src = self.norm2(src) + src = self.linear2(self.dropout(self.activation(self.linear1(src)))) + src = residual + self.dropout2(src) + if not self.normalize_before: + src = self.norm2(src) + return src + + +class TransformerEncoder(Layer): + """ + TransformerEncoder is a stack of N encoder layers. + + Parameters: + encoder_layer (Layer): an instance of the `TransformerEncoderLayer`. It + would be used as the first layer, and the other layers would be created + according to the configurations of it. + num_layers (int): The number of encoder layers to be stacked. + norm (LayerNorm, optional): the layer normalization component. If provided, + apply layer normalization on the output of last encoder layer. + + Examples: + + .. code-block:: python + + import paddle + from paddle import TransformerEncoderLayer, TransformerEncoder + + # encoder input: [batch_size, src_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # self attention mask: [batch_size, n_head, src_len, src_len] + attn_mask = paddle.rand((2, 2, 4, 4)) + encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512) + encoder = TransformerEncoder(encoder_layer, 2) + enc_output = encoder(enc_input, attn_mask) # [2, 4, 128] + """ + + def __init__(self, encoder_layer, num_layers, norm=None): + super(TransformerEncoder, self).__init__() + self.layers = LayerList([(encoder_layer if i == 0 else + type(encoder_layer)(encoder_layer._config)) + for i in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + + def forward(self, src, src_mask=None): + output = src + + for mod in self.layers: + output = mod(output, src_mask=src_mask) + + if self.norm is not None: + output = self.norm(output) - ffn_output = self.ffn(self.preprocesser2(attn_output)) - ffn_output = self.postprocesser2(ffn_output, attn_output) - return ffn_output + return output class TransformerCell(Layer): From 87e110638d1bd4b1262688e46767ac14396b5bcd Mon Sep 17 00:00:00 2001 From: guosheng Date: Tue, 18 Aug 2020 21:53:28 +0800 Subject: [PATCH 04/17] Add Transformer decoder apis. 
test=develop --- python/paddle/nn/layer/transformer.py | 454 +++++++++++++++++++++++++- 1 file changed, 444 insertions(+), 10 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 7dbbecb869af76..af50b66c7299c0 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -233,16 +233,36 @@ def cal_kv(self, key, value): def gen_cache(self, key, value=None, type=MultiheadAttention.Cache): """ - Generates cache for `forward` usage accroding to arguments. + Generates cache for `forward` usage in inference accroding to arguments. + The generated cache is an instance of `MultiheadAttention.Cache` or an + instance of `MultiheadAttention.StaticCache`. + + `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields, + and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]` + which are results of linear projection, reshape and transpose calculations + in MultiheadAttention. + + If the generated cache is an instance of `Cache`, `k` and `v` fields + reserve intermediate result tensors of previous positions, and the tensors + are incremental among decoding steps, which mostly are used for decoder + decoder self attention. + + If the generated cache is an instance of `StaticCache`, `k` and `v` fields + would be used as calculated result tensors on keys an values in `forward`, + and the tensors keep unchanged among decoding steps, which are mostly used + for decoder-encoder cross attention. + + The cache is generated as follows: - If `type` is `StaticCache`, apply `cal_kv(key, value)` and use the results - to create an instance of `StaticCache`. + 1. If `type` is `StaticCache`, apply `cal_kv(key, value)` and use the results + to create an instance of `StaticCache`. - If `type` is `Cache` and `value` is None, generate empty tensors shaped - `[batch_size, num_heads, 0, head_dim]` and use the results to create an - instance of `Cache`, where `batch_size` is from the first dimension of `key`. + 2. If `type` is `Cache` and `value` is None, generate empty tensors shaped + `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results + to create an instance of `Cache`, where `batch_size` is from the first + dimension of `key`. - If `type` is `Cache` and `value` is not None, use `key`, `value` to create + 3. If `type` is `Cache` and `value` is not None, use `key`, `value` to create an instance of `Cache`. Parameters: @@ -258,7 +278,7 @@ def gen_cache(self, key, value=None, type=MultiheadAttention.Cache): `MultiheadAttention.Cache` to indicate the cache type to generate. Returns: - namedtupe: an instance of `Cache` or `StaticCache` accordingly. + namedtuple: an instance of `Cache` or `StaticCache` accordingly. """ if type == MultiheadAttention.StaticCache: # static_kv k, v = self.cal_kv(key, value) @@ -308,7 +328,7 @@ def forward(self, query, key, value, attn_mask=None, cache=None): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in - MultiheadAttention. If is an instance of `Cache`, `k` and `v` + MultiheadAttention. If it is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. 
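+
+        As a minimal sketch (assuming `self_attn` and `cross_attn` are
+        `MultiheadAttention` instances and `memory` is the encoder output), the
+        two cache types would typically be created before decoding as:
+
+        .. code-block:: python
+
+            # empty incremental cache for decoder self attention
+            incremental_cache = self_attn.gen_cache(memory, type=MultiheadAttention.Cache)
+            # fixed cache for decoder-encoder cross attention, computed once from memory
+            static_cache = cross_attn.gen_cache(memory, memory, type=MultiheadAttention.StaticCache)
+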
If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and @@ -545,6 +565,28 @@ def __init__(self, encoder_layer, num_layers, norm=None): self.norm = norm def forward(self, src, src_mask=None): + """ + Applies a stack of N Transformer encoder layers on inputs. If `norm` is + provided, also applies layer normalization on the output of last encoder + layer. + + Parameters: + src (Variable): The input of Transformer encoder. It is a tensor + with shape `[batch_size, sequence_length, d_model]`. The data + type should be float32 or float64. + src_mask (Variable, optional): A tensor used in multi-head attention + to prevents attention to some unwanted positions, usually the + paddings or the subsequent positions. It is a tensor with shape + broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + + Returns: + Variable: The output of Transformer encoder. It is a tensor that \ + has the same shape and data type as `src`. + """ output = src for mod in self.layers: @@ -556,6 +598,398 @@ def forward(self, src, src_mask=None): return output +class TransformerDecoderLayer(Layer): + """ + TransformerDecoderLayer is composed of three sub-layers which are decoder + self (multi-head) attention, decoder-encoder cross attention and feedforward + network. Before and after each sub-layer, pre-process and post-precess would + be applied on the input and output accordingly. If `normalize_before` is True, + pre-process is layer normalization and post-precess includes dropout, residual + connection. Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. + + Parameters: + d_model (int): The expected feature size in the input and output. + nhead (int): The number of heads in multi-head attention(MHA). + dim_feedforward (int): The hidden layer size in the feedforward network(FFN). + dropout (float, optional): The dropout probability used in pre-process + and post-precess of MHA and FFN sub-layer. Default 0.1 + activation (str, optional): The activation function in the feedforward + network. Default relu. + attn_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. If None, use the value of + `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `param_attr[0]` would be used as `param_attr` for + self attention, `param_attr[1]` would be used as `param_attr` for + cross attention, and `param_attr[2]` would be used as `param_attr` + for linear in FFN. Otherwise, the three sub-layers all uses it as + `param_attr` to create parameters. Default: None, which means the + default weight parameter property is used. See usage for details + in :ref:`api_fluid_ParamAttr` . + bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. 
+ If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for + self attention, `bias_attr[1]` would be used as `bias_attr` for + cross attention, and `bias_attr[2]` would be used as `bias_attr` + for linear in FFN. Otherwise, the three sub-layers all uses it as + `bias_attr` to create parameters. Default: None, which means the + default bias parameter property is used. See usage for details + in :ref:`api_fluid_ParamAttr` . + + Examples: + + .. code-block:: python + + import paddle + from paddle import TransformerDecoderLayer + + # decoder input: [batch_size, tgt_len, d_model] + dec_input = paddle.rand((2, 4, 128)) + # encoder output: [batch_size, src_len, d_model] + enc_output = paddle.rand((2, 6, 128)) + # self attention mask: [batch_size, n_head, tgt_len, tgt_len] + self_attn_mask = paddle.rand((2, 2, 4, 4)) + # cross attention mask: [batch_size, n_head, tgt_len, src_len] + cross_attn_mask = paddle.rand((2, 2, 4, 6)) + decoder_layer = TransformerDecoderLayer(128, 2, 512) + output = decoder_layer(dec_input, + enc_output, + self_attn_mask, + cross_attn_mask) # [2, 4, 128] + """ + + def __init__(self, + d_model, + nhead, + dim_feedforward, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + param_attr=None, + bias_attr=None): + self._config = locals() + self._config.pop("self") + + super(TransformerDecoderLayer, self).__init__() + attn_dropout = dropout if attn_dropout is None else attn_dropout + act_dropout = dropout if act_dropout is None else act_dropout + self.normalize_before = normalize_before + + param_attrs = _convert_param_attr_to_list(param_attr, 3) + bias_attrs = _convert_param_attr_to_list(bias_attr, 3) + + self.self_attn = MultiheadAttention( + d_model, + nhead, + dropout=attn_dropout, + param_attr=param_attrs[0], + bias_attr=bias_attrs[0]) + self.cross_attn = MultiheadAttention( + d_model, + nhead, + dropout=attn_dropout, + param_attr=param_attrs[1], + bias_attr=bias_attrs[1]) + self.linear1 = Linear( + d_model, + dim_feedforward, + param_attr=param_attrs[2], + bias_attr=bias_attrs[2]) + self.dropout = Dropout( + act_dropout, dropout_implementation="upscale_in_train") + self.linear2 = Linear( + dim_feedforward, + d_model, + param_attr=param_attrs[2], + bias_attr=bias_attrs[2]) + self.norm1 = LayerNorm(d_model) + self.norm2 = LayerNorm(d_model) + self.norm3 = LayerNorm(d_model) + self.dropout1 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.dropout2 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.dropout3 = Dropout( + dropout, dropout_implementation="upscale_in_train") + self.activation = getattr(layers, activation) + + def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): + """ + Applies a Transformer decoder layer on the input. + + Parameters: + tgt (Variable): The input of Transformer decoder layer. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + memory (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + tgt_mask (Variable, optional): A tensor used in self attention + to prevents attention to some unwanted positions, usually the + the subsequent positions. It is a tensor with shape broadcasted + to `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. 
The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + memory_mask (Variable, optional): A tensor used in decoder-encoder + cross attention to prevents attention to some unwanted positions, + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`, where the + unwanted positions have `-INF` values and the others have 0 values. + The data type should be float32 or float64. It can be None when + nothing wanted or needed to be prevented attention to. Default None + cache (tuple, optional): It is a tuple( :code:`(incremental_cache, static_cache)` ), + `incremental_cache` is an instance of `MultiheadAttention.Cache`, + `static_cache` is an instance of `MultiheadAttention.StaticCache. + See `TransformerDecoderLayer.gen_cache` for more details. It is + only used for inference and should be None for training. Default + None. + + Returns: + Variable|tuple: It is a tensor that has the same shape and data type \ + as `tgt`, representing the output of Transformer decoder layer. \ + Or a tuple if `cache` is not None, except for decoder layer output, \ + the tuple includes the new cache which is same as input `cache` \ + argument but `incremental_cache` in it has an incremental length. \ + See `MultiheadAttention.gen_cache` and `MultiheadAttention.forward` \ + for more details. + """ + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + if cache is None: + tgt = self.self_attn(tgt, tgt, tgt, tgt_mask, None) + else: + tgt, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, + cache[0]) + tgt = residual + self.dropout1(tgt) + if not self.normalize_before: + tgt = self.norm1(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm2(tgt) + if cache is None: + tgt = self.cross_attn(tgt, memory, memory, memory_mask, None) + else: + tgt, static_cache = self.cross_attn(tgt, memory, memory, + memory_mask, cache[1]) + tgt = residual + self.dropout2(tgt) + if not self.normalize_before: + tgt = self.norm2(tgt) + + residual = tgt + if self.normalize_before: + tgt = self.norm3(tgt) + tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) + tgt = residual + self.dropout3(tgt) + if not self.normalize_before: + tgt = self.norm3(tgt) + return tgt if cache is None else (tgt, (incremental_cache, + static_cache)) + + def gen_cache(self, memory): + """ + Generates cache for `forward` usage. The generated cache is a tuple + composed of an instance of `MultiheadAttention.Cache` and an instance + of `MultiheadAttention.StaticCache`. + + Parameters: + memory (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + + Returns: + tuple: It is a tuple( :code:`(incremental_cache, static_cache)` ). \ + `incremental_cache` is an instance of `MultiheadAttention.Cache` \ + produced by `self_attn.gen_cache(memory, MultiheadAttention.Cache)`, \ + it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. \ + `static_cache` is an instance of `MultiheadAttention.StaticCache` \ + produced by `cross_attn.gen_cache(memory, MultiheadAttention.StaticCache)`, \ + it reserves two tensors shaped `[batch_size, nhead, source_length, d_model // nhead]`. + See `MultiheadAttention.gen_cache` and `MultiheadAttention.forward` \ + for more details. 
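+
+        A rough decoding-loop sketch (assuming `decoder_layer` and `memory` follow
+        the example in the class docstring, `tgt` is the decoder input of the
+        current position and `max_len` is the number of decoding steps):
+
+        .. code-block:: python
+
+            cache = decoder_layer.gen_cache(memory)
+            for _ in range(max_len):
+                tgt, cache = decoder_layer(tgt, memory, None, None, cache)
+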
+ """ + incremental_cache = self.self_attn.gen_cache( + memory, type=self.self_attn.Cache) + static_cache = self.cross_attn.gen_cache( + memory, memory, type=self.cross_attn.StaticCache) + return incremental_cache, static_cache + + +class TransformerDecoder(Layer): + """ + TransformerDecoder is a stack of N decoder layers. + + Parameters: + decoder_layer (Layer): an instance of the `TransformerDecoderLayer`. It + would be used as the first layer, and the other layers would be created + according to the configurations of it. + num_layers (int): The number of decoder layers to be stacked. + norm (LayerNorm, optional): the layer normalization component. If provided, + apply layer normalization on the output of last encoder layer. + + Examples: + + .. code-block:: python + + import paddle + from paddle import TransformerDecoderLayer, TransformerDecoder + + # decoder input: [batch_size, trg_len, d_model] + dec_input = paddle.rand((2, 4, 128)) + # encoder output: [batch_size, src_len, d_model] + enc_output = paddle.rand((2, 6, 128)) + # self attention mask: [batch_size, n_head, trg_len, trg_len] + self_attn_mask = paddle.rand((2, 2, 4, 4)) + # cross attention mask: [batch_size, n_head, trg_len, src_len] + cross_attn_mask = paddle.rand((2, 2, 4, 6)) + decoder_layer = TransformerDecoderLayer(128, 2, 512) + decoder = TransformerDecoder(decoder_layer, 2) + output = decoder(dec_input, + enc_output, + self_attn_mask, + cross_attn_mask) # [2, 4, 128] + """ + + def __init__(self, decoder_layer, num_layers, norm=None): + super(TransformerDecoder, self).__init__() + self.layers = LayerList([(decoder_layer if i == 0 else + type(decoder_layer)(decoder_layer._config)) + for i in range(num_layers)]) + self.num_layers = num_layers + self.norm = norm + + def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): + """ + Applies a stack of N Transformer decoder layers on inputs. If `norm` is + provided, also applies layer normalization on the output of last decoder + layer. + + Parameters: + tgt (Variable): The input of Transformer decoder. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + memory (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + tgt_mask (Variable, optional): A tensor used in self attention + to prevents attention to some unwanted positions, usually the + the subsequent positions. It is a tensor with shape broadcasted + to `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + memory_mask (Variable, optional): A tensor used in decoder-encoder + cross attention to prevents attention to some unwanted positions, + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`, where the + unwanted positions have `-INF` values and the others have 0 values. + The data type should be float32 or float64. It can be None when + nothing wanted or needed to be prevented attention to. Default None + cache (list, optional): It is a list, and each element in the list + is a tuple( :code:`(incremental_cache, static_cache)` ). See + `TransformerDecoder.gen_cache` for more details. It is only + used for inference and should be None for training. 
Default None. + + Returns: + Variable|tuple: It is a tensor that has the same shape and data type \ + as `tgt`, representing the output of Transformer decoder. \ + Or a tuple if `cache` is not None, except for decoder output, \ + the tuple includes the new cache which is same as input `cache` \ + argument but `incremental_cache` in it has an incremental length. \ + See `MultiheadAttention.gen_cache` and `MultiheadAttention.forward` \ + for more details. + """ + output = tgt + new_caches = [] + for i, mod in enumerate(self.layers): + if cache is None: + output = mod(output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + cache=None) + else: + output, new_cache = mod(output, + memory, + tgt_mask=tgt_mask, + memory_mask=memory_mask, + cache=cache[i] + if cache is not None else None) + new_caches.append(new_cache) + + if self.norm is not None: + output = self.norm(output) + + return output if cache is None else (output, new_caches) + + def gen_cache(self, memory): + """ + Generates cache for `forward` usage. The generated cache is a list, and + each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) + produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` + for more details. + + + Parameters: + memory (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + + Returns: + list: It is a list, and each element in the list is a tuple produced \ + by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` \ + for more details. + """ + return [layer.gen_cache(memory) for layer in self.layers] + + +class Transformer(Layer): + def __init__(self, + d_model=512, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + custom_encoder=None, + custom_decoder=None): + super(Transformer, self).__init__() + + if custom_encoder is not None: + self.encoder = custom_encoder + else: + encoder_layer = TransformerEncoderLayer( + d_model, nhead, dim_feedforward, dropout, activation) + encoder_norm = LayerNorm(d_model) + self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, + encoder_norm) + + if custom_decoder is not None: + self.decoder = custom_decoder + else: + decoder_layer = TransformerDecoderLayer( + d_model, nhead, dim_feedforward, dropout, activation) + decoder_norm = LayerNorm(d_model) + self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, + decoder_norm) + + self._reset_parameters() + + self.d_model = d_model + self.nhead = nhead + + class TransformerCell(Layer): """ TransformerCell wraps a Transformer decoder producing logits from `inputs` @@ -627,7 +1061,7 @@ def forward(self, word, position): static_caches=static_caches) """ - def __init__(self, decoder, embed_layer=None, output_layer=None): + def __init__(self, decoder, embedding_fn=None, output_fn=None): super(TransformerCell, self).__init__() self.decoder = decoder self.embedding_fn = embedding_fn From c50ad434f56d355e6dc148576f8fac2c07dadf53 Mon Sep 17 00:00:00 2001 From: guosheng Date: Wed, 19 Aug 2020 16:32:26 +0800 Subject: [PATCH 05/17] Add Transformer api. 
test=develop --- python/paddle/nn/layer/transformer.py | 178 ++++++++++++++++++++++---- 1 file changed, 156 insertions(+), 22 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index af50b66c7299c0..20410d0bfb6069 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -75,7 +75,7 @@ class MultiheadAttention(Layer): weights to drop some attention targets. 0 for no dropout. Default 0 kdim (int, optional): The feature size in key. If None, assumed equal to `embed_dim`. Default None. - vdim (int, optional): The feature size in key. If None, assumed equal to + vdim (int, optional): The feature size in value. If None, assumed equal to `embed_dim`. Default None. need_weights (bool, optional): Indicate whether to return the attention weights. Default False. @@ -410,8 +410,11 @@ class TransformerEncoderLayer(Layer): `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN activition. If None, use the value of `dropout`. Default None - act_dropout (float, optional): The dropout probability used after FFN - activition. If None, use the value of `dropout`. Default None + normalize_before (bool, optional): Indicate whether to put layer normalization + into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer + normalization and post-precess includes dropout, residual connection. + Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. Default False param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. If it is a tuple, `param_attr[0]` would be used as `param_attr` for MHA, and `param_attr[1]` would be used as `param_attr` for linear in FFN. @@ -621,8 +624,11 @@ class TransformerDecoderLayer(Layer): `dropout`. Default None act_dropout (float, optional): The dropout probability used after FFN activition. If None, use the value of `dropout`. Default None - act_dropout (float, optional): The dropout probability used after FFN - activition. If None, use the value of `dropout`. Default None + normalize_before (bool, optional): Indicate whether to put layer normalization + into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer + normalization and post-precess includes dropout, residual connection. + Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. Default False param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. 
If it is a tuple, `param_attr[0]` would be used as `param_attr` for self attention, `param_attr[1]` would be used as `param_attr` for @@ -843,13 +849,13 @@ class TransformerDecoder(Layer): import paddle from paddle import TransformerDecoderLayer, TransformerDecoder - # decoder input: [batch_size, trg_len, d_model] + # decoder input: [batch_size, tgt_len, d_model] dec_input = paddle.rand((2, 4, 128)) # encoder output: [batch_size, src_len, d_model] enc_output = paddle.rand((2, 6, 128)) - # self attention mask: [batch_size, n_head, trg_len, trg_len] + # self attention mask: [batch_size, n_head, tgt_len, tgt_len] self_attn_mask = paddle.rand((2, 2, 4, 4)) - # cross attention mask: [batch_size, n_head, trg_len, src_len] + # cross attention mask: [batch_size, n_head, tgt_len, src_len] cross_attn_mask = paddle.rand((2, 2, 4, 6)) decoder_layer = TransformerDecoderLayer(128, 2, 512) decoder = TransformerDecoder(decoder_layer, 2) @@ -923,8 +929,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): memory, tgt_mask=tgt_mask, memory_mask=memory_mask, - cache=cache[i] - if cache is not None else None) + cache=cache[i]) new_caches.append(new_cache) if self.norm is not None: @@ -947,13 +952,96 @@ def gen_cache(self, memory): Returns: list: It is a list, and each element in the list is a tuple produced \ - by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` \ + by `TransformerDecoderLayer.gen_cache(memory)`. See `TransformerDecoderLayer.gen_cache` \ for more details. """ return [layer.gen_cache(memory) for layer in self.layers] class Transformer(Layer): + """ + A Transformer model composed of an instance of `TransformerEncoder` and an + instance of `TransformerDecoder`. While the embedding layer and output layer + are not included. + + Please refer to `Attention is all you need `_ , + and see `TransformerEncoder` and `TransformerDecoder` for more details. + + Users can configurate the model architecture with corresponding parameters. + Note the usage of `normalize_before` representing where to apply layer + normalization (in pre-process or post-precess of multi-head attention or FFN), + and some transformer like models are different on this, such as + `BERT `_ and `GPT2 `_ . + The default architecture here places layer normalization in pre-process and + applies another layer normalization on the output of last encoder/decoder layer. + + Parameters: + d_model (int): The expected feature size in the encoder/decoder input + and output. + nhead (int): The number of heads in multi-head attention(MHA). + num_encoder_layers (int): The number of layers in encoder. + num_encoder_layers (int): The number of layers in decoder. + dim_feedforward (int): The hidden layer size in the feedforward network(FFN). + dropout (float, optional): The dropout probability used in pre-process + and post-precess of MHA and FFN sub-layer. Default 0.1 + activation (str, optional): The activation function in the feedforward + network. Default relu. + attn_dropout (float, optional): The dropout probability used + in MHA to drop some attention target. If None, use the value of + `dropout`. Default None + act_dropout (float, optional): The dropout probability used after FFN + activition. If None, use the value of `dropout`. Default None + normalize_before (bool, optional): Indicate whether to put layer normalization + into preprocessing of MHA and FFN sub-layers. If True, pre-process is layer + normalization and post-precess includes dropout, residual connection. 
+ Otherwise, no pre-process and post-precess includes dropout, residual + connection, layer normalization. Default False + param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `param_attr[0]` would be used as `param_attr` for + self attention, `param_attr[1]` would be used as `param_attr` for + cross attention, and `param_attr[2]` would be used as `param_attr` + for linear in FFN. Otherwise, the three sub-layers all uses it as + `param_attr` to create parameters. Default: None, which means the + default weight parameter property is used. See usage for details + in :ref:`api_fluid_ParamAttr` . + bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. + If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for + self attention, `bias_attr[1]` would be used as `bias_attr` for + cross attention, and `bias_attr[2]` would be used as `bias_attr` + for linear in FFN. Otherwise, the three sub-layers all uses it as + `bias_attr` to create parameters. Default: None, which means the + default bias parameter property is used. See usage for details + in :ref:`api_fluid_ParamAttr` . + custom_encoder (Layer): If custom encoder is provided, use it as the encoder. + Default None + custom_decoder (Layer): If custom decoder is provided, use it as the decoder. + Default None + + Examples: + + .. code-block:: python + + import paddle + from paddle import Transformer + + # src: [batch_size, tgt_len, d_model] + enc_input = paddle.rand((2, 4, 128)) + # tgt: [batch_size, src_len, d_model] + dec_input = paddle.rand((2, 6, 128)) + # src_mask: [batch_size, n_head, src_len, src_len] + enc_self_attn_mask = paddle.rand((2, 2, 4, 4)) + # tgt_mask: [batch_size, n_head, tgt_len, tgt_len] + dec_self_attn_mask = paddle.rand((2, 2, 6, 6)) + # memory_mask: [batch_size, n_head, tgt_len, src_len] + cross_attn_mask = paddle.rand((2, 2, 6, 4)) + transformer = Transformer(128, 2, 4, 4, 512) + output = transformer(dec_input, + enc_output, + enc_self_attn_mask, + dec_self_attn_mask, + cross_attn_mask) # [2, 6, 128] + """ + def __init__(self, d_model=512, nhead=8, @@ -962,6 +1050,11 @@ def __init__(self, dim_feedforward=2048, dropout=0.1, activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + param_attr=None, + bias_attr=None, custom_encoder=None, custom_decoder=None): super(Transformer, self).__init__() @@ -970,7 +1063,9 @@ def __init__(self, self.encoder = custom_encoder else: encoder_layer = TransformerEncoderLayer( - d_model, nhead, dim_feedforward, dropout, activation) + d_model, nhead, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before, param_attr, + bias_attr) encoder_norm = LayerNorm(d_model) self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) @@ -979,27 +1074,66 @@ def __init__(self, self.decoder = custom_decoder else: decoder_layer = TransformerDecoderLayer( - d_model, nhead, dim_feedforward, dropout, activation) + d_model, nhead, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before, param_attr, + bias_attr) decoder_norm = LayerNorm(d_model) self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm) - self._reset_parameters() - self.d_model = d_model self.nhead = nhead + def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): + """ + Applies a Transformer model on the inputs. + + Parameters: + src (Variable): The input of Transformer encoder. 
It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + tgt (Variable): The input of Transformer decoder. It is a tensor + with shape `[batch_size, target_length, d_model]`. The data type + should be float32 or float64. + memory (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type + should be float32 or float64. + tgt_mask (Variable, optional): A tensor used in self attention + to prevents attention to some unwanted positions, usually the + the subsequent positions. It is a tensor with shape broadcasted + to `[batch_size, n_head, target_length, target_length]`, + where the unwanted positions have `-INF` values and the others + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + memory_mask (Variable, optional): A tensor used in decoder-encoder + cross attention to prevents attention to some unwanted positions, + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`, where the + unwanted positions have `-INF` values and the others have 0 values. + The data type should be float32 or float64. It can be None when + nothing wanted or needed to be prevented attention to. Default None + + Returns: + Variable: It is a tensor that has the same shape and data type \ + as `tgt`, representing the output of Transformer decoder. + """ + memory = self.encoder(src, mask=src_mask) + output = self.decoder( + tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask) + return output + -class TransformerCell(Layer): +class TransformerDecoderCell(Layer): """ - TransformerCell wraps a Transformer decoder producing logits from `inputs` - composed by ids and position. + TransformerDecoderCell wraps a Transformer decoder producing logits from + `inputs` composed by ids and position. Parameters: decoder(callable): A TransformerDecoder instance. Or a wrapper of it that includes a embedding layer accepting ids and positions instead of embeddings and includes a output layer transforming decoder output features to logits. - embedding_fn(function, optional): A callable that accepts ids and position + embedding_fn(callable, optional): A callable that accepts ids and position as arguments and return embeddings as input of `decoder`. It can be None if `decoder` includes a embedding layer. Default None. 
output_fn(callable, optional): A callable applid on `decoder` output to @@ -1045,7 +1179,7 @@ def forward(self, word, position): is_test=True) enc_output = paddle.rand((2, 4, 128)) - # cross attention bias: [batch_size, n_head, trg_len, src_len] + # cross attention bias: [batch_size, n_head, tgt_len, src_len] trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) # inputs for beam search on Transformer caches = transformer_cell.get_initial_states(enc_output) @@ -1062,7 +1196,7 @@ def forward(self, word, position): """ def __init__(self, decoder, embedding_fn=None, output_fn=None): - super(TransformerCell, self).__init__() + super(TransformerDecoderCell, self).__init__() self.decoder = decoder self.embedding_fn = embedding_fn self.output_fn = output_fn @@ -1212,7 +1346,7 @@ def forward(self, word, position): is_test=True) enc_output = paddle.rand((2, 4, 128)) - # cross attention bias: [batch_size, n_head, trg_len, src_len] + # cross attention bias: [batch_size, n_head, tgt_len, src_len] trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) # inputs for beam search on Transformer caches = transformer_cell.get_initial_states(enc_output) From ad7d2251e163506a9bfdf7d17869095704ea65d2 Mon Sep 17 00:00:00 2001 From: LiuChiaChi <709153940@qq.com> Date: Thu, 20 Aug 2020 06:08:12 +0000 Subject: [PATCH 06/17] add unittests for transformer api --- .../fluid/tests/book/test_transformer_api.py | 365 ++++++++++++++++++ python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/transformer.py | 5 +- 3 files changed, 368 insertions(+), 3 deletions(-) create mode 100644 python/paddle/fluid/tests/book/test_transformer_api.py diff --git a/python/paddle/fluid/tests/book/test_transformer_api.py b/python/paddle/fluid/tests/book/test_transformer_api.py new file mode 100644 index 00000000000000..54e3f62446cc39 --- /dev/null +++ b/python/paddle/fluid/tests/book/test_transformer_api.py @@ -0,0 +1,365 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
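+
+# The helpers below (fc, softmax, scaled_dot_product_attention, layer_norm, ffn,
+# etc.) re-implement the corresponding computations in NumPy; the tests use them
+# as references to check the outputs of the paddle layers.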
+ +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.nn.layer.transformer import MultiheadAttention, TransformerEncoderLayer, TransformerDecoderLayer + +import unittest + + +def generate_basic_params(mode="attn", self_attention=True, verbose=False): + batch_size, query_length = [np.random.randint(2, 10) for _ in range(2)] + d_head, num_heads = [np.random.randint(3, 10) for _ in range(2)] + attn_dropout = 0.0 + embed_dim = d_head * num_heads + if mode == "attn": + if self_attention: + kdim, vdim = embed_dim, embed_dim + key_length, value_length = query_length, query_length + else: + kdim, vdim = [np.random.randint(5, 20) for _ in range(2)] + key_length = np.random.randint(2, 10) + value_length = key_length + return batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout + + else: + dropout, act_dropout = 0.0, 0.0 + dim_feedforward = np.random.randint(128, 1024) + sequence_length = np.random.randint(2, 10) + if mode == "encoder_layer": + return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length + elif mode == "decoder_layer": + target_length = np.random.randint(2, 10) + return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length, target_length + + +def generate_query_key_value_cache(self_attention, + batch_size, + num_heads, + query_length, + embed_dim, + key_length=None, + value_length=None, + kdim=None, + vdim=None, + cache=None): + query = np.random.rand(batch_size, query_length, + embed_dim).astype("float32") + # attn_mask = np.zeros((batch_size, num_heads, query_length, key_length)) + # attn_mask[0][0][0][:10] = -np.inf + attn_mask = None + head_dim = embed_dim // num_heads + if self_attention: + key, value = query, query + else: + key = np.random.rand(batch_size, key_length, kdim).astype("float32") + value = np.random.rand(batch_size, value_length, vdim).astype("float32") + cache_dict = {} + if cache: + if not self_attention: + cache_dict["static_k"] = np.random.rand( + batch_size, num_heads, key_length, head_dim).astype("float32") + cache_dict["static_v"] = np.random.rand( + batch_size, num_heads, value_length, head_dim).astype("float32") + else: + cache_dict["k"] = np.random.rand(batch_size, num_heads, key_length, + head_dim).astype("float32") + cache_dict["v"] = np.random.rand( + batch_size, num_heads, value_length, head_dim).astype("float32") + else: + cache_dict = None + return query, key, value, attn_mask, cache_dict + + +def fc(x, weight): + return np.matmul(x, weight) + + +def softmax(x): + np.seterr(invalid='ignore') + output = np.zeros(x.shape, dtype=np.float64) # ? 
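+    # normalize each [batch, head, query] row along the last (key) axis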
+ for i in range(x.shape[0]): + for j in range(x.shape[1]): + for k in range(x.shape[2]): + x_curr = x[i, j, k, :] + e_x = np.exp(x_curr - np.amax(x_curr)) + output[i, j, k, :] = e_x / np.sum(e_x) + return output + + +def batch_matmul(x, y): + assert x.shape[0] == y.shape[0] + assert x.shape[1] == y.shape[1] + retval = np.zeros( + (x.shape[0], x.shape[1], x.shape[2], y.shape[3]), dtype=np.float64) + for i in range(x.shape[0]): + for j in range(x.shape[1]): + retval[i, j, :, :] = np.matmul(x[i, j, :, :], y[i, j, :, :]) + return retval + + +def scaled_dot_product_attention(q, k, v, d_key, attn_mask, multi_head_attn): + k = k.transpose([0, 1, 3, 2]) + qkt = batch_matmul(q, k / np.sqrt(d_key, dtype=np.float64)) + if attn_mask is not None: + qkt += attn_mask + weight = softmax(qkt) + + attn_heads = batch_matmul(weight, v) + attn_heads = attn_heads.transpose((0, 2, 1, 3)) + attn_heads = attn_heads.reshape((attn_heads.shape[0], attn_heads.shape[1], + attn_heads.shape[2] * attn_heads.shape[3])) + return attn_heads + + +def cal_qkv(key, value, num_heads, embed_dim, multi_head_attn): + with fluid.dygraph.guard(): + head_dim = embed_dim // num_heads + k_weight = multi_head_attn.k_proj.weight.numpy() + v_weight = multi_head_attn.v_proj.weight.numpy() + k = fc(key, k_weight) + v = fc(value, v_weight) + k = k.reshape((k.shape[0], k.shape[1], num_heads, head_dim)) + k = k.transpose((0, 2, 1, 3)) + v = v.reshape((v.shape[0], v.shape[1], num_heads, head_dim)) + v = v.transpose((0, 2, 1, 3)) + return k, v + + +def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention, + multi_head_attn, cache_dict): + q_weight = multi_head_attn.q_proj.weight.numpy() + q = fc(query, q_weight) + q = q.reshape((q.shape[0], q.shape[1], num_heads, embed_dim // num_heads)) + q = q.transpose((0, 2, 1, 3)) + + if not self_attention and cache_dict: + k, v = cache_dict["static_k"], cache_dict["static_v"] + else: + k, v = cal_qkv(key, value, num_heads, embed_dim, multi_head_attn) + if cache_dict is not None: + k = np.concatenate((cache_dict["k"], k), axis=2) + v = np.concatenate((cache_dict["v"], v), axis=2) + return (q, k, v, cache_dict) + +def add(x, y=None): + fluid.enable_dygraph() + with fluid.dygraph.guard(): + x = x.numpy() if not isinstance(x, np.ndarray) else x + if y is not None: + x += y + return x + return x + # print("print type(x) in add", type(x)) + +def relu(x): + compare = x > 0 + return x * compare + +def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None): + fluid.enable_dygraph() + with fluid.dygraph.guard(): + # scale: + weight = norm.weight.numpy() + # shift: + bias = norm.bias.numpy() + + batch_size, src_len, d_model = x.shape + x = x.reshape((batch_size * src_len, d_model)) + mu = np.mean(x, axis=1, keepdims=True) + sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model + x1_up = (x - mu) + x1_down_1 = sigma_squar + epsilon + x1_down = np.sqrt(x1_down_1) + x1_down = x1_down.reshape((x1_down.shape[0], 1)) + x1 = x1_up / x1_down + x_scaled = weight * x1 + x_scaled_bias = x_scaled + bias + x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model)) + return x_scaled_bias + +def ffn(src, encoder_layer, ffn_fc1_act="relu"): + assert ffn_fc1_act == "relu", "only relu is supported" + fluid.enable_dygraph() + with fluid.dygraph.guard(): + src = src.numpy() if not isinstance(src, np.ndarray) else src + w1 = encoder_layer.linear1.weight.numpy() + w2 = encoder_layer.linear2.weight.numpy() + # fc1 + x1 = fc(src, w1) + x1 = relu(x1) + # fc2 + x2 = fc(x1, w2) + return x2 + + +class 
TestTransformer(unittest.TestCase): + def test_multi_head_attention(self): + def multihead_attention_test_helper(self_attention, cache): + paddle.framework.manual_seed(2020) + # 分四种情况:self_attention|cross_attention, cache|No cache + with fluid.dygraph.guard(fluid.CPUPlace()): + for _ in range(100): + # generate params for multi_head_attention + batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params( + "attn", self_attention, False) + query, key, value, attn_mask, cache_dict = generate_query_key_value_cache( + self_attention, batch_size, num_heads, query_length, + embed_dim, key_length, value_length, kdim, vdim, cache) + need_weight, param_attr, bias_attr = False, None, None + # call paddle's function + multi_head_attn = MultiheadAttention( + embed_dim, num_heads, attn_dropout, kdim, vdim, + need_weight, param_attr, bias_attr) + # construct cache object + cache_obj = None + if cache_dict: + if 'k' and 'v' in cache_dict: + cache_obj = multi_head_attn.Cache( + paddle.to_variable(cache_dict['k']), + paddle.to_variable(cache_dict['v'])) + elif 'static_k' and 'static_v' in cache_dict: + cache_obj = multi_head_attn.StaticCache( + paddle.to_variable(cache_dict['static_k']), + paddle.to_variable(cache_dict['static_v'])) + + attn_output = multi_head_attn( + paddle.to_variable(query), + paddle.to_variable(key), + paddle.to_variable(value), attn_mask, cache_obj) + + # implementation by numpy + # compute q, k, v + q, k, v, _ = prepare_qkv(query, key, value, num_heads, + embed_dim, self_attention, + multi_head_attn, cache_dict) + # scale dot product attention + attn_heads = scaled_dot_product_attention( + q, k, v, embed_dim // num_heads, attn_mask, + multi_head_attn) + out_proj_weight = multi_head_attn.out_proj.weight.numpy() + reference = fc(attn_heads, out_proj_weight) + + np.testing.assert_allclose( + attn_output.numpy(), reference, atol=1e-6) + + multihead_attention_test_helper(True, True) + multihead_attention_test_helper(True, False) + multihead_attention_test_helper(False, True) + multihead_attention_test_helper(False, False) + + def test_transformer_encoder_layer(self): + + with fluid.dygraph.guard(fluid.CPUPlace()): + paddle.framework.manual_seed(2020) + for _ in range(100): + # 0.定义默认参数 + ffn_fc1_act = "relu" + # 1.获取基本参数 + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params( + mode="encoder_layer", verbose=False) + # 2.生成输入 + src = np.random.rand(batch_size, sequence_length, + d_model).astype("float32") + residual = src + # src_mask = np.zeros(batch_size, n_head, sequence_length, + # sequence_length).astype(dtype) + # src_mask [0][0][:30] = -np.inf + + # 3.框架的输出 + encoder_layer = TransformerEncoderLayer( + d_model, n_head, dim_feedforward, dropout, ffn_fc1_act, + attn_dropout, act_dropout) + + encoder_output = encoder_layer( + paddle.to_variable(src)) # paddle.to_variable(src_mask)) + # 4.numpy: + # paddle self attention + self_attn = MultiheadAttention( + d_model, n_head, dropout=attn_dropout) + attn_output = self_attn( + paddle.to_variable(src), + paddle.to_variable(src), paddle.to_variable(src)).numpy() + + src = attn_output + residual + src_norm = layer_norm(src, d_model, encoder_layer.norm1) + residual = src_norm + + ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act) + src = residual + ffn_output + src = layer_norm(src, d_model, encoder_layer.norm2) + + np.testing.assert_allclose( + encoder_output.numpy(), src, rtol=1e-5, atol=1e-6) + + def 
test_transformer_decoder_layer(self): + with fluid.dygraph.guard(fluid.CPUPlace()): + paddle.framework.manual_seed(2020) + for _ in range(100): + activation = "relu" + normalize_before = False + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params( + mode="decoder_layer", verbose=False) + tgt = np.random.rand(batch_size, target_length, + d_model).astype("float32") + memory = np.random.rand(batch_size, source_length, + d_model).astype("float32") + tgt_mask = None # TODO + memory_mask = None + # paddle: + decoder_layer = TransformerDecoderLayer( + d_model, n_head, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before) + decoder_output = decoder_layer( + paddle.to_variable(tgt), + paddle.to_variable(memory), tgt_mask, memory_mask).numpy() + # TODO: cache + residual = tgt + self_attn = MultiheadAttention( + d_model, n_head, dropout=attn_dropout) + + tgt = self_attn( + paddle.to_variable(tgt), + paddle.to_variable(tgt), + paddle.to_variable(tgt), tgt_mask, None).numpy() + + tgt = residual + tgt + # postprocess + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm1) + residual = tgt_norm + + cross_attn = MultiheadAttention( + d_model, n_head, dropout=attn_dropout) + tgt = cross_attn( + paddle.to_variable(tgt_norm), + paddle.to_variable(memory), + paddle.to_variable(memory), memory_mask, None).numpy() + # postprocess + tgt = tgt + residual + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm2) + residual = tgt_norm + + ffn_output = ffn(tgt_norm, decoder_layer, activation) + # post process + tgt = residual + ffn_output + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3) + + np.testing.assert_allclose( + decoder_output, tgt_norm, rtol=1e-5, atol=1e-6) + + +if __name__ == "__main__": + unittest.main() \ No newline at end of file diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 9fb8ea78a16ab4..6d25e382f7961c 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -21,6 +21,7 @@ from . import activation from . import norm from . import distance +from . import transformer from .activation import * from .loss import * diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index af50b66c7299c0..bfdf53a3ea984d 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -19,7 +19,6 @@ import collections import numpy as np - from ...fluid import layers from ...fluid.param_attr import ParamAttr from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList @@ -182,7 +181,7 @@ def _prepare_qkv(self, query, key, value, cache=None): and their data types are same as inputs. """ q = self.q_proj(query) - q = layers.reshape(x=q, shape=[0, 0, self.n_head, self.d_key]) + q = layers.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = layers.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): @@ -231,7 +230,7 @@ def cal_kv(self, key, value): v = layers.transpose(x=v, perm=[0, 2, 1, 3]) return k, v - def gen_cache(self, key, value=None, type=MultiheadAttention.Cache): + def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. 
The generated cache is an instance of `MultiheadAttention.Cache` or an From 54e9e56eb3e01c69ed8719e085fbf628c6b9db03 Mon Sep 17 00:00:00 2001 From: LiuChiaChi <709153940@qq.com> Date: Thu, 20 Aug 2020 06:11:16 +0000 Subject: [PATCH 07/17] add unittests for transformer api --- python/paddle/fluid/tests/book/test_transformer_api.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/tests/book/test_transformer_api.py b/python/paddle/fluid/tests/book/test_transformer_api.py index 54e3f62446cc39..c75f018c07760c 100644 --- a/python/paddle/fluid/tests/book/test_transformer_api.py +++ b/python/paddle/fluid/tests/book/test_transformer_api.py @@ -155,6 +155,7 @@ def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention, v = np.concatenate((cache_dict["v"], v), axis=2) return (q, k, v, cache_dict) + def add(x, y=None): fluid.enable_dygraph() with fluid.dygraph.guard(): @@ -165,10 +166,12 @@ def add(x, y=None): return x # print("print type(x) in add", type(x)) + def relu(x): compare = x > 0 return x * compare + def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None): fluid.enable_dygraph() with fluid.dygraph.guard(): @@ -191,6 +194,7 @@ def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None): x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model)) return x_scaled_bias + def ffn(src, encoder_layer, ffn_fc1_act="relu"): assert ffn_fc1_act == "relu", "only relu is supported" fluid.enable_dygraph() @@ -218,7 +222,7 @@ def multihead_attention_test_helper(self_attention, cache): "attn", self_attention, False) query, key, value, attn_mask, cache_dict = generate_query_key_value_cache( self_attention, batch_size, num_heads, query_length, - embed_dim, key_length, value_length, kdim, vdim, cache) + embed_dim, key_length, value_length, kdim, vdim, cache) need_weight, param_attr, bias_attr = False, None, None # call paddle's function multi_head_attn = MultiheadAttention( @@ -285,7 +289,7 @@ def test_transformer_encoder_layer(self): attn_dropout, act_dropout) encoder_output = encoder_layer( - paddle.to_variable(src)) # paddle.to_variable(src_mask)) + paddle.to_variable(src)) # paddle.to_variable(src_mask)) # 4.numpy: # paddle self attention self_attn = MultiheadAttention( @@ -362,4 +366,4 @@ def test_transformer_decoder_layer(self): if __name__ == "__main__": - unittest.main() \ No newline at end of file + unittest.main() From 8637eeeb0430f6f5ede3a2668212a0b2b5733a06 Mon Sep 17 00:00:00 2001 From: guosheng Date: Thu, 20 Aug 2020 21:28:40 +0800 Subject: [PATCH 08/17] Fix some bugs in Transformer apis. test=develop --- python/paddle/nn/layer/transformer.py | 78 +++++++++++++++------------ 1 file changed, 43 insertions(+), 35 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index bbb638b62c7be4..7d050a47e3cf02 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -19,6 +19,7 @@ import collections import numpy as np + from ...fluid import layers from ...fluid.param_attr import ParamAttr from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList @@ -230,7 +231,7 @@ def cal_kv(self, key, value): v = layers.transpose(x=v, perm=[0, 2, 1, 3]) return k, v - def gen_cache(self, key, value=None, type=Cache): + def gen_cache(self, key, value=None, type=MultiheadAttention.Cache): """ Generates cache for `forward` usage in inference accroding to arguments. 
The generated cache is an instance of `MultiheadAttention.Cache` or an @@ -384,7 +385,7 @@ def forward(self, query, key, value, attn_mask=None, cache=None): outs.append(weights) if cache is not None: outs.append(cache) - return out if len(outs) else tuple(outs) + return out if len(outs) == 1 else tuple(outs) class TransformerEncoderLayer(Layer): @@ -455,6 +456,7 @@ def __init__(self, bias_attr=None): self._config = locals() self._config.pop("self") + self._config.pop("__class__", None) # py3 super(TransformerEncoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout @@ -553,7 +555,7 @@ class TransformerEncoder(Layer): enc_input = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, n_head, src_len, src_len] attn_mask = paddle.rand((2, 2, 4, 4)) - encoder_layer = TransformerEncoderLayer(2, 64, 64, 128, 512) + encoder_layer = TransformerEncoderLayer(128, 2, 512) encoder = TransformerEncoder(encoder_layer, 2) enc_output = encoder(enc_input, attn_mask) # [2, 4, 128] """ @@ -561,7 +563,7 @@ class TransformerEncoder(Layer): def __init__(self, encoder_layer, num_layers, norm=None): super(TransformerEncoder, self).__init__() self.layers = LayerList([(encoder_layer if i == 0 else - type(encoder_layer)(encoder_layer._config)) + type(encoder_layer)(**encoder_layer._config)) for i in range(num_layers)]) self.num_layers = num_layers self.norm = norm @@ -680,6 +682,7 @@ def __init__(self, bias_attr=None): self._config = locals() self._config.pop("self") + self._config.pop("__class__", None) # py3 super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout @@ -867,7 +870,7 @@ class TransformerDecoder(Layer): def __init__(self, decoder_layer, num_layers, norm=None): super(TransformerDecoder, self).__init__() self.layers = LayerList([(decoder_layer if i == 0 else - type(decoder_layer)(decoder_layer._config)) + type(decoder_layer)(**decoder_layer._config)) for i in range(num_layers)]) self.num_layers = num_layers self.norm = norm @@ -1034,8 +1037,8 @@ class Transformer(Layer): # memory_mask: [batch_size, n_head, tgt_len, src_len] cross_attn_mask = paddle.rand((2, 2, 6, 4)) transformer = Transformer(128, 2, 4, 4, 512) - output = transformer(dec_input, - enc_output, + output = transformer(enc_input, + dec_input, enc_self_attn_mask, dec_self_attn_mask, cross_attn_mask) # [2, 6, 128] @@ -1125,8 +1128,11 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): class TransformerDecoderCell(Layer): """ - TransformerDecoderCell wraps a Transformer decoder producing logits from - `inputs` composed by ids and position. + TransformerDecoderCell wraps a Transformer decoder combined with an embedding + layer and output layer to produce logits from symbols (ids and position here). + It is analogy to `RNNCell` and `outputs, new_states = cell(inputs, states, *kwargs)`, + where `inputs` is composed of word ids and position, `states` is `cache`, + `kwargs` includes `memory, `tgt_mask`, `memory_mask` and `static_cache`. Parameters: decoder(callable): A TransformerDecoder instance. Or a wrapper of it that @@ -1202,11 +1208,11 @@ def __init__(self, decoder, embedding_fn=None, output_fn=None): def forward(self, inputs, - states=None, - enc_output=None, - trg_slf_attn_bias=None, - trg_src_attn_bias=None, - static_caches=[]): + cache=None, + memory=None, + tgt_mask=None, + memory_mask=None, + static_cache=[]): """ Produces logits from `inputs` composed by ids and positions. 
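With the renamed arguments, a single decoding step would roughly look like the
following sketch (`trg_word`, `trg_pos`, `enc_output`, `caches`, `static_caches`
and `trg_src_attn_bias` are assumed to be prepared as in the class docstring
example; `tgt_mask` is left None at inference):

.. code-block:: python

    logits, new_caches = transformer_cell(
        (trg_word, trg_pos),
        cache=caches,
        memory=enc_output,
        memory_mask=trg_src_attn_bias,
        static_cache=static_caches)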
@@ -1215,27 +1221,29 @@ def forward(self, tensors both have int64 data type and with 2D shape `[batch_size, sequence_length]` where `sequence_length` is 1 for inference. - states(list): It caches the multi-head attention intermediate results + cache(list): It caches the multi-head attention intermediate results of history decoding steps. It is a list of dict where the length of list is decoder layer number, and each dict has `k` and `v` as keys and values are cached results. Default None - enc_output(Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, sequence_length, d_model]`. The data type + memory (Variable): The output of Transformer encoder. It is a tensor + with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. - trg_slf_attn_bias(Variable, optional): A tensor used in decoder self - attention to mask out attention on unwanted target positions. It - is a tensor with shape `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. It can be None when nothing wanted or needed to - be masked out. It can be None for inference. The data type should - be float32 or float64. Default None - trg_src_attn_bias(Variable, optional): A tensor used in decoder-encoder - cross attention to mask out unwanted attention on source (encoder output). - It is a tensor with shape `[batch_size, n_head, target_length, source_length]`, + tgt_mask (Variable, optional): A tensor used in self attention + to prevents attention to some unwanted positions, usually the + the subsequent positions. It is a tensor with shape broadcasted + to `[batch_size, n_head, target_length, target_length]`, where the unwanted positions have `-INF` values and the others - have 0 values. It can be None when nothing wanted or needed to - be masked out. The data type should be float32 or float64. Default None - static_caches(list): It stores projected results of encoder output + have 0 values. The data type should be float32 or float64. It can + be None when nothing wanted or needed to be prevented attention to. + Default None + memory_mask (Variable, optional): A tensor used in decoder-encoder + cross attention to prevents attention to some unwanted positions, + usually the paddings. It is a tensor with shape broadcasted to + `[batch_size, n_head, target_length, source_length]`, where the + unwanted positions have `-INF` values and the others have 0 values. + The data type should be float32 or float64. It can be None when + nothing wanted or needed to be prevented attention to. Default None + static_cache(list): It stores transformed results of encoder output to be used as keys and values in decoder-encoder cross attention It is a list of dict where the length of list is decoder layer number, and each dict has `static_k` and `static_v` as keys and @@ -1250,16 +1258,16 @@ def forward(self, concatenated into it. 
""" trg_word, trg_pos = inputs - if states and static_caches: + if cache and static_cache: for cache, static_cache in zip(states, static_caches): cache.update(static_cache) if self.embedding_fn is not None: dec_input = self.embedding_fn(trg_word, trg_pos) - outputs = self.decoder(dec_input, enc_output, None, - trg_src_attn_bias, states) + outputs = self.decoder(dec_input, memory, tgt_mask, memory_mask, + states) else: - outputs = self.decoder(trg_word, trg_pos, enc_output, None, - trg_src_attn_bias, states) + outputs = self.decoder(trg_word, trg_pos, memory, tgt_mask, + memory_mask, states) if self.output_fn is not None: outputs = self.output_fn(outputs) From 33741e8df9da8a056f21d94a4f24623af890f17e Mon Sep 17 00:00:00 2001 From: LiuChiaChi <709153940@qq.com> Date: Thu, 20 Aug 2020 13:38:39 +0000 Subject: [PATCH 09/17] add unittests for encoder, decoder and transformer --- .../fluid/tests/book/test_transformer_api.py | 369 -------------- .../tests/unittests/test_transformer_api.py | 477 ++++++++++++++++++ python/paddle/nn/layer/transformer.py | 11 +- 3 files changed, 483 insertions(+), 374 deletions(-) delete mode 100644 python/paddle/fluid/tests/book/test_transformer_api.py create mode 100644 python/paddle/fluid/tests/unittests/test_transformer_api.py diff --git a/python/paddle/fluid/tests/book/test_transformer_api.py b/python/paddle/fluid/tests/book/test_transformer_api.py deleted file mode 100644 index c75f018c07760c..00000000000000 --- a/python/paddle/fluid/tests/book/test_transformer_api.py +++ /dev/null @@ -1,369 +0,0 @@ -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import numpy as np -import paddle -import paddle.fluid as fluid -from paddle.nn.layer.transformer import MultiheadAttention, TransformerEncoderLayer, TransformerDecoderLayer - -import unittest - - -def generate_basic_params(mode="attn", self_attention=True, verbose=False): - batch_size, query_length = [np.random.randint(2, 10) for _ in range(2)] - d_head, num_heads = [np.random.randint(3, 10) for _ in range(2)] - attn_dropout = 0.0 - embed_dim = d_head * num_heads - if mode == "attn": - if self_attention: - kdim, vdim = embed_dim, embed_dim - key_length, value_length = query_length, query_length - else: - kdim, vdim = [np.random.randint(5, 20) for _ in range(2)] - key_length = np.random.randint(2, 10) - value_length = key_length - return batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout - - else: - dropout, act_dropout = 0.0, 0.0 - dim_feedforward = np.random.randint(128, 1024) - sequence_length = np.random.randint(2, 10) - if mode == "encoder_layer": - return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length - elif mode == "decoder_layer": - target_length = np.random.randint(2, 10) - return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length, target_length - - -def generate_query_key_value_cache(self_attention, - batch_size, - num_heads, - query_length, - embed_dim, - key_length=None, - value_length=None, - kdim=None, - vdim=None, - cache=None): - query = np.random.rand(batch_size, query_length, - embed_dim).astype("float32") - # attn_mask = np.zeros((batch_size, num_heads, query_length, key_length)) - # attn_mask[0][0][0][:10] = -np.inf - attn_mask = None - head_dim = embed_dim // num_heads - if self_attention: - key, value = query, query - else: - key = np.random.rand(batch_size, key_length, kdim).astype("float32") - value = np.random.rand(batch_size, value_length, vdim).astype("float32") - cache_dict = {} - if cache: - if not self_attention: - cache_dict["static_k"] = np.random.rand( - batch_size, num_heads, key_length, head_dim).astype("float32") - cache_dict["static_v"] = np.random.rand( - batch_size, num_heads, value_length, head_dim).astype("float32") - else: - cache_dict["k"] = np.random.rand(batch_size, num_heads, key_length, - head_dim).astype("float32") - cache_dict["v"] = np.random.rand( - batch_size, num_heads, value_length, head_dim).astype("float32") - else: - cache_dict = None - return query, key, value, attn_mask, cache_dict - - -def fc(x, weight): - return np.matmul(x, weight) - - -def softmax(x): - np.seterr(invalid='ignore') - output = np.zeros(x.shape, dtype=np.float64) # ? 
- for i in range(x.shape[0]): - for j in range(x.shape[1]): - for k in range(x.shape[2]): - x_curr = x[i, j, k, :] - e_x = np.exp(x_curr - np.amax(x_curr)) - output[i, j, k, :] = e_x / np.sum(e_x) - return output - - -def batch_matmul(x, y): - assert x.shape[0] == y.shape[0] - assert x.shape[1] == y.shape[1] - retval = np.zeros( - (x.shape[0], x.shape[1], x.shape[2], y.shape[3]), dtype=np.float64) - for i in range(x.shape[0]): - for j in range(x.shape[1]): - retval[i, j, :, :] = np.matmul(x[i, j, :, :], y[i, j, :, :]) - return retval - - -def scaled_dot_product_attention(q, k, v, d_key, attn_mask, multi_head_attn): - k = k.transpose([0, 1, 3, 2]) - qkt = batch_matmul(q, k / np.sqrt(d_key, dtype=np.float64)) - if attn_mask is not None: - qkt += attn_mask - weight = softmax(qkt) - - attn_heads = batch_matmul(weight, v) - attn_heads = attn_heads.transpose((0, 2, 1, 3)) - attn_heads = attn_heads.reshape((attn_heads.shape[0], attn_heads.shape[1], - attn_heads.shape[2] * attn_heads.shape[3])) - return attn_heads - - -def cal_qkv(key, value, num_heads, embed_dim, multi_head_attn): - with fluid.dygraph.guard(): - head_dim = embed_dim // num_heads - k_weight = multi_head_attn.k_proj.weight.numpy() - v_weight = multi_head_attn.v_proj.weight.numpy() - k = fc(key, k_weight) - v = fc(value, v_weight) - k = k.reshape((k.shape[0], k.shape[1], num_heads, head_dim)) - k = k.transpose((0, 2, 1, 3)) - v = v.reshape((v.shape[0], v.shape[1], num_heads, head_dim)) - v = v.transpose((0, 2, 1, 3)) - return k, v - - -def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention, - multi_head_attn, cache_dict): - q_weight = multi_head_attn.q_proj.weight.numpy() - q = fc(query, q_weight) - q = q.reshape((q.shape[0], q.shape[1], num_heads, embed_dim // num_heads)) - q = q.transpose((0, 2, 1, 3)) - - if not self_attention and cache_dict: - k, v = cache_dict["static_k"], cache_dict["static_v"] - else: - k, v = cal_qkv(key, value, num_heads, embed_dim, multi_head_attn) - if cache_dict is not None: - k = np.concatenate((cache_dict["k"], k), axis=2) - v = np.concatenate((cache_dict["v"], v), axis=2) - return (q, k, v, cache_dict) - - -def add(x, y=None): - fluid.enable_dygraph() - with fluid.dygraph.guard(): - x = x.numpy() if not isinstance(x, np.ndarray) else x - if y is not None: - x += y - return x - return x - # print("print type(x) in add", type(x)) - - -def relu(x): - compare = x > 0 - return x * compare - - -def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None): - fluid.enable_dygraph() - with fluid.dygraph.guard(): - # scale: - weight = norm.weight.numpy() - # shift: - bias = norm.bias.numpy() - - batch_size, src_len, d_model = x.shape - x = x.reshape((batch_size * src_len, d_model)) - mu = np.mean(x, axis=1, keepdims=True) - sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model - x1_up = (x - mu) - x1_down_1 = sigma_squar + epsilon - x1_down = np.sqrt(x1_down_1) - x1_down = x1_down.reshape((x1_down.shape[0], 1)) - x1 = x1_up / x1_down - x_scaled = weight * x1 - x_scaled_bias = x_scaled + bias - x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model)) - return x_scaled_bias - - -def ffn(src, encoder_layer, ffn_fc1_act="relu"): - assert ffn_fc1_act == "relu", "only relu is supported" - fluid.enable_dygraph() - with fluid.dygraph.guard(): - src = src.numpy() if not isinstance(src, np.ndarray) else src - w1 = encoder_layer.linear1.weight.numpy() - w2 = encoder_layer.linear2.weight.numpy() - # fc1 - x1 = fc(src, w1) - x1 = relu(x1) - # fc2 - x2 = fc(x1, w2) - return x2 - - 
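The `softmax`, `batch_matmul` and `scaled_dot_product_attention` helpers above compute the reference result with explicit Python loops. An equivalent vectorized numpy sketch (a cross-check only, not part of the test file) is:

.. code-block:: python

    import numpy as np

    def softmax_last_axis(x):
        e_x = np.exp(x - np.amax(x, axis=-1, keepdims=True))
        return e_x / np.sum(e_x, axis=-1, keepdims=True)

    def scaled_dot_product_attention_vec(q, k, v, d_key, attn_mask=None):
        # q, k, v: [batch, heads, length, head_dim]
        scores = np.matmul(q, k.transpose((0, 1, 3, 2))) / np.sqrt(d_key)
        if attn_mask is not None:
            scores = scores + attn_mask
        weights = softmax_last_axis(scores)
        out = np.matmul(weights, v)          # [batch, heads, length, head_dim]
        out = out.transpose((0, 2, 1, 3))    # [batch, length, heads, head_dim]
        return out.reshape(out.shape[0], out.shape[1], -1)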
-class TestTransformer(unittest.TestCase): - def test_multi_head_attention(self): - def multihead_attention_test_helper(self_attention, cache): - paddle.framework.manual_seed(2020) - # 分四种情况:self_attention|cross_attention, cache|No cache - with fluid.dygraph.guard(fluid.CPUPlace()): - for _ in range(100): - # generate params for multi_head_attention - batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params( - "attn", self_attention, False) - query, key, value, attn_mask, cache_dict = generate_query_key_value_cache( - self_attention, batch_size, num_heads, query_length, - embed_dim, key_length, value_length, kdim, vdim, cache) - need_weight, param_attr, bias_attr = False, None, None - # call paddle's function - multi_head_attn = MultiheadAttention( - embed_dim, num_heads, attn_dropout, kdim, vdim, - need_weight, param_attr, bias_attr) - # construct cache object - cache_obj = None - if cache_dict: - if 'k' and 'v' in cache_dict: - cache_obj = multi_head_attn.Cache( - paddle.to_variable(cache_dict['k']), - paddle.to_variable(cache_dict['v'])) - elif 'static_k' and 'static_v' in cache_dict: - cache_obj = multi_head_attn.StaticCache( - paddle.to_variable(cache_dict['static_k']), - paddle.to_variable(cache_dict['static_v'])) - - attn_output = multi_head_attn( - paddle.to_variable(query), - paddle.to_variable(key), - paddle.to_variable(value), attn_mask, cache_obj) - - # implementation by numpy - # compute q, k, v - q, k, v, _ = prepare_qkv(query, key, value, num_heads, - embed_dim, self_attention, - multi_head_attn, cache_dict) - # scale dot product attention - attn_heads = scaled_dot_product_attention( - q, k, v, embed_dim // num_heads, attn_mask, - multi_head_attn) - out_proj_weight = multi_head_attn.out_proj.weight.numpy() - reference = fc(attn_heads, out_proj_weight) - - np.testing.assert_allclose( - attn_output.numpy(), reference, atol=1e-6) - - multihead_attention_test_helper(True, True) - multihead_attention_test_helper(True, False) - multihead_attention_test_helper(False, True) - multihead_attention_test_helper(False, False) - - def test_transformer_encoder_layer(self): - - with fluid.dygraph.guard(fluid.CPUPlace()): - paddle.framework.manual_seed(2020) - for _ in range(100): - # 0.定义默认参数 - ffn_fc1_act = "relu" - # 1.获取基本参数 - batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params( - mode="encoder_layer", verbose=False) - # 2.生成输入 - src = np.random.rand(batch_size, sequence_length, - d_model).astype("float32") - residual = src - # src_mask = np.zeros(batch_size, n_head, sequence_length, - # sequence_length).astype(dtype) - # src_mask [0][0][:30] = -np.inf - - # 3.框架的输出 - encoder_layer = TransformerEncoderLayer( - d_model, n_head, dim_feedforward, dropout, ffn_fc1_act, - attn_dropout, act_dropout) - - encoder_output = encoder_layer( - paddle.to_variable(src)) # paddle.to_variable(src_mask)) - # 4.numpy: - # paddle self attention - self_attn = MultiheadAttention( - d_model, n_head, dropout=attn_dropout) - attn_output = self_attn( - paddle.to_variable(src), - paddle.to_variable(src), paddle.to_variable(src)).numpy() - - src = attn_output + residual - src_norm = layer_norm(src, d_model, encoder_layer.norm1) - residual = src_norm - - ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act) - src = residual + ffn_output - src = layer_norm(src, d_model, encoder_layer.norm2) - - np.testing.assert_allclose( - encoder_output.numpy(), src, rtol=1e-5, atol=1e-6) - - def 
test_transformer_decoder_layer(self): - with fluid.dygraph.guard(fluid.CPUPlace()): - paddle.framework.manual_seed(2020) - for _ in range(100): - activation = "relu" - normalize_before = False - batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params( - mode="decoder_layer", verbose=False) - tgt = np.random.rand(batch_size, target_length, - d_model).astype("float32") - memory = np.random.rand(batch_size, source_length, - d_model).astype("float32") - tgt_mask = None # TODO - memory_mask = None - # paddle: - decoder_layer = TransformerDecoderLayer( - d_model, n_head, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before) - decoder_output = decoder_layer( - paddle.to_variable(tgt), - paddle.to_variable(memory), tgt_mask, memory_mask).numpy() - # TODO: cache - residual = tgt - self_attn = MultiheadAttention( - d_model, n_head, dropout=attn_dropout) - - tgt = self_attn( - paddle.to_variable(tgt), - paddle.to_variable(tgt), - paddle.to_variable(tgt), tgt_mask, None).numpy() - - tgt = residual + tgt - # postprocess - tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm1) - residual = tgt_norm - - cross_attn = MultiheadAttention( - d_model, n_head, dropout=attn_dropout) - tgt = cross_attn( - paddle.to_variable(tgt_norm), - paddle.to_variable(memory), - paddle.to_variable(memory), memory_mask, None).numpy() - # postprocess - tgt = tgt + residual - tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm2) - residual = tgt_norm - - ffn_output = ffn(tgt_norm, decoder_layer, activation) - # post process - tgt = residual + ffn_output - tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3) - - np.testing.assert_allclose( - decoder_output, tgt_norm, rtol=1e-5, atol=1e-6) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py new file mode 100644 index 00000000000000..6fb374ff2c48f4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -0,0 +1,477 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import paddle +import paddle.fluid as fluid +from paddle.nn.layer.transformer import MultiheadAttention, TransformerEncoderLayer, TransformerDecoderLayer, TransformerEncoder, TransformerDecoder, Transformer + +import unittest + + +def generate_basic_params(mode="attn", self_attention=True): + batch_size, query_length = [np.random.randint(2, 10) for _ in range(2)] + d_head, num_heads = [np.random.randint(3, 10) for _ in range(2)] + attn_dropout = 0.0 + embed_dim = d_head * num_heads + if mode == "attn": + if self_attention: + kdim, vdim = embed_dim, embed_dim + key_length, value_length = query_length, query_length + else: + kdim, vdim = [np.random.randint(5, 20) for _ in range(2)] + key_length = np.random.randint(2, 10) + value_length = key_length + return batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout + + else: + dropout, act_dropout = 0.0, 0.0 + dim_feedforward = np.random.randint(128, 1024) + sequence_length = np.random.randint(2, 10) + if mode == "encoder_layer": + return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length + elif mode == "decoder_layer": + target_length = np.random.randint(2, 10) + return batch_size, embed_dim, num_heads, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length, target_length + + +def generate_query_key_value_cache(self_attention, + batch_size, + num_heads, + query_length, + embed_dim, + key_length=None, + value_length=None, + kdim=None, + vdim=None, + cache=None): + query = np.random.rand(batch_size, query_length, + embed_dim).astype("float32") + attn_mask = np.zeros((batch_size, num_heads, query_length, key_length)) + attn_mask[0][0][0][0] = -1e9 + + head_dim = embed_dim // num_heads + if self_attention: + key, value = query, query + else: + key = np.random.rand(batch_size, key_length, kdim).astype("float32") + value = np.random.rand(batch_size, value_length, vdim).astype("float32") + cache_dict = {} + if cache: + if not self_attention: + cache_dict["static_k"] = np.random.rand( + batch_size, num_heads, key_length, head_dim).astype("float32") + cache_dict["static_v"] = np.random.rand( + batch_size, num_heads, value_length, head_dim).astype("float32") + else: + cache_dict["k"] = np.random.rand(batch_size, num_heads, key_length, + head_dim).astype("float32") + cache_dict["v"] = np.random.rand( + batch_size, num_heads, value_length, head_dim).astype("float32") + else: + cache_dict = None + return query, key, value, attn_mask, cache_dict + + +def fc(x, weight): + return np.matmul(x, weight) + + +def softmax(x): + np.seterr(invalid='ignore') + output = np.zeros(x.shape, dtype=np.float64) + for i in range(x.shape[0]): + for j in range(x.shape[1]): + for k in range(x.shape[2]): + x_curr = x[i, j, k, :] + e_x = np.exp(x_curr - np.amax(x_curr)) + output[i, j, k, :] = e_x / np.sum(e_x) + return output + + +def batch_matmul(x, y): + assert x.shape[0] == y.shape[0] + assert x.shape[1] == y.shape[1] + retval = np.zeros( + (x.shape[0], x.shape[1], x.shape[2], y.shape[3]), dtype=np.float64) + for i in range(x.shape[0]): + for j in range(x.shape[1]): + retval[i, j, :, :] = np.matmul(x[i, j, :, :], y[i, j, :, :]) + return retval + + +def scaled_dot_product_attention(q, k, v, d_key, attn_mask, multi_head_attn): + k = k.transpose([0, 1, 3, 2]) + qkt = batch_matmul(q, k / np.sqrt(d_key, dtype=np.float64)) + if attn_mask is not None: + qkt += attn_mask + weight = softmax(qkt) + attn_heads = batch_matmul(weight, v) + attn_heads = 
attn_heads.transpose((0, 2, 1, 3)) + attn_heads = attn_heads.reshape((attn_heads.shape[0], attn_heads.shape[1], + attn_heads.shape[2] * attn_heads.shape[3])) + return attn_heads + + +def cal_qkv(key, value, num_heads, embed_dim, multi_head_attn): + with fluid.dygraph.guard(): + head_dim = embed_dim // num_heads + k_weight = multi_head_attn.k_proj.weight.numpy() + v_weight = multi_head_attn.v_proj.weight.numpy() + k = fc(key, k_weight) + v = fc(value, v_weight) + k = k.reshape((k.shape[0], k.shape[1], num_heads, head_dim)) + k = k.transpose((0, 2, 1, 3)) + v = v.reshape((v.shape[0], v.shape[1], num_heads, head_dim)) + v = v.transpose((0, 2, 1, 3)) + return k, v + + +def prepare_qkv(query, key, value, num_heads, embed_dim, self_attention, + multi_head_attn, cache_dict): + q_weight = multi_head_attn.q_proj.weight.numpy() + q = fc(query, q_weight) + q = q.reshape((q.shape[0], q.shape[1], num_heads, embed_dim // num_heads)) + q = q.transpose((0, 2, 1, 3)) + + if not self_attention and cache_dict: + k, v = cache_dict["static_k"], cache_dict["static_v"] + else: + k, v = cal_qkv(key, value, num_heads, embed_dim, multi_head_attn) + if cache_dict is not None: + k = np.concatenate((cache_dict["k"], k), axis=2) + v = np.concatenate((cache_dict["v"], v), axis=2) + return (q, k, v, cache_dict) + + +def add(x, y=None): + fluid.enable_dygraph() + with fluid.dygraph.guard(): + x = x.numpy() if not isinstance(x, np.ndarray) else x + if y is not None: + x += y + return x + return x + + +def relu(x): + compare = x > 0 + return x * compare + + +def layer_norm(x, normalized_shape, norm, epsilon=1e-05, act=None): + fluid.enable_dygraph() + with fluid.dygraph.guard(): + # scale: + weight = norm.weight.numpy() + # shift: + bias = norm.bias.numpy() + + batch_size, src_len, d_model = x.shape + x = x.reshape((batch_size * src_len, d_model)) + mu = np.mean(x, axis=1, keepdims=True) + sigma_squar = np.sum(np.square(x - mu), axis=1) / d_model + x1_up = (x - mu) + x1_down_1 = sigma_squar + epsilon + x1_down = np.sqrt(x1_down_1) + x1_down = x1_down.reshape((x1_down.shape[0], 1)) + x1 = x1_up / x1_down + x_scaled = weight * x1 + x_scaled_bias = x_scaled + bias + x_scaled_bias = x_scaled_bias.reshape((batch_size, src_len, d_model)) + return x_scaled_bias + + +def ffn(src, encoder_layer, ffn_fc1_act="relu"): + assert ffn_fc1_act == "relu", "only relu is supported" + fluid.enable_dygraph() + with fluid.dygraph.guard(): + src = src.numpy() if not isinstance(src, np.ndarray) else src + w1 = encoder_layer.linear1.weight.numpy() + w2 = encoder_layer.linear2.weight.numpy() + # fc1 + x1 = fc(src, w1) + x1 = relu(x1) + # fc2 + x2 = fc(x1, w2) + return x2 + + +class TestTransformer(unittest.TestCase): + def test_multi_head_attention(self): + def multihead_attention_test_helper(self_attention, cache): + paddle.framework.manual_seed(2020) + # 分四种情况:self_attention|cross_attention, cache|No cache + with fluid.dygraph.guard(fluid.CPUPlace()): + + # generate params for multi_head_attention + batch_size, query_length, key_length, value_length, embed_dim, kdim, vdim, num_heads, attn_dropout = generate_basic_params( + "attn", self_attention) + query, key, value, attn_mask, cache_dict = generate_query_key_value_cache( + self_attention, batch_size, num_heads, query_length, + embed_dim, key_length, value_length, kdim, vdim, cache) + if cache and self_attention: + attn_mask = np.concatenate((attn_mask, attn_mask), axis=3) + need_weight, param_attr, bias_attr = False, None, None + # call paddle's function + multi_head_attn = MultiheadAttention( + 
embed_dim, num_heads, attn_dropout, kdim, vdim, need_weight, + param_attr, bias_attr) + # construct cache object + cache_obj = None + if cache_dict: + if 'k' and 'v' in cache_dict: + cache_obj = multi_head_attn.Cache( + paddle.to_variable(cache_dict['k']), + paddle.to_variable(cache_dict['v'])) + elif 'static_k' and 'static_v' in cache_dict: + cache_obj = multi_head_attn.StaticCache( + paddle.to_variable(cache_dict['static_k']), + paddle.to_variable(cache_dict['static_v'])) + if attn_mask is not None: + attn_output = multi_head_attn( + paddle.to_variable(query), + paddle.to_variable(key), + paddle.to_variable(value), + paddle.to_variable(attn_mask), cache_obj) + else: + attn_output = multi_head_attn( + paddle.to_variable(query), + paddle.to_variable(key), + paddle.to_variable(value), attn_mask, cache_obj) + attn_output = attn_output[0] if cache_dict else attn_output + + # implementation by numpy + # compute q, k, v + q, k, v, _ = prepare_qkv(query, key, value, num_heads, + embed_dim, self_attention, + multi_head_attn, cache_dict) + # scale dot product attention + attn_heads = scaled_dot_product_attention( + q, k, v, embed_dim // num_heads, attn_mask, multi_head_attn) + out_proj_weight = multi_head_attn.out_proj.weight.numpy() + reference = fc(attn_heads, out_proj_weight) + + np.testing.assert_allclose( + attn_output.numpy(), reference, atol=1e-6) + + multihead_attention_test_helper(True, True) + multihead_attention_test_helper(True, False) + multihead_attention_test_helper(False, True) + multihead_attention_test_helper(False, False) + + def test_transformer_encoder_layer(self): + + with fluid.dygraph.guard(fluid.CPUPlace()): + paddle.framework.manual_seed(2020) + + ffn_fc1_act = "relu" + # 1.generate basic params + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params( + mode="encoder_layer") + # 2.generate input for encoder + src = np.random.rand(batch_size, sequence_length, + d_model).astype("float32") + residual = src + src_mask = np.zeros((batch_size, n_head, sequence_length, + sequence_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + + # paddle + encoder_layer = TransformerEncoderLayer( + d_model, n_head, dim_feedforward, dropout, ffn_fc1_act, + attn_dropout, act_dropout) + + encoder_output = encoder_layer( + paddle.to_variable(src), + paddle.to_variable(src_mask)) # paddle.to_variable(src_mask)) + # 4.numpy: + # paddle self attention + self_attn = MultiheadAttention( + d_model, n_head, dropout=attn_dropout) + attn_output = self_attn( + paddle.to_variable(src), + paddle.to_variable(src), + paddle.to_variable(src), paddle.to_variable(src_mask)).numpy() + + src = attn_output + residual + src_norm = layer_norm(src, d_model, encoder_layer.norm1) + residual = src_norm + + ffn_output = ffn(src_norm, encoder_layer, ffn_fc1_act) + src = residual + ffn_output + src = layer_norm(src, d_model, encoder_layer.norm2) + + np.testing.assert_allclose( + encoder_output.numpy(), src, rtol=1e-5, atol=1e-6) + + def test_transformer_decoder_layer(self): + with fluid.dygraph.guard(fluid.CPUPlace()): + paddle.framework.manual_seed(2020) + activation = "relu" + normalize_before = False + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params( + mode="decoder_layer") + tgt = np.random.rand(batch_size, target_length, + d_model).astype("float32") + memory = np.random.rand(batch_size, source_length, + d_model).astype("float32") + tgt_mask = 
np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + for cache in [True, False]: + self_attn = MultiheadAttention( + d_model, n_head, dropout=attn_dropout) + cross_attn = MultiheadAttention( + d_model, n_head, dropout=attn_dropout) + + # paddle decoderlayer: + decoder_layer = TransformerDecoderLayer( + d_model, n_head, dim_feedforward, dropout, activation, + attn_dropout, act_dropout, normalize_before) + cache_objs = None + if cache: + cache_objs = decoder_layer.gen_cache( + paddle.to_variable(memory)) + + decoder_output = decoder_layer( + paddle.to_variable(tgt), + paddle.to_variable(memory), + paddle.to_variable(tgt_mask), + paddle.to_variable(memory_mask), cache_objs) + + decoder_output = decoder_output[0].numpy( + ) if cache else decoder_output.numpy() + + # numpy: + residual = tgt + # self-attn + self_attn_cache = cache_objs[ + 0] if cache_objs is not None else None + tgt = self_attn( + paddle.to_variable(tgt), + paddle.to_variable(tgt), + paddle.to_variable(tgt), + paddle.to_variable(tgt_mask), self_attn_cache) + + tgt = tgt[0].numpy() if cache else tgt.numpy() + + tgt = residual + tgt + # postprocess + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm1) + residual = tgt_norm + # cross-attn + cross_attn_cache = cache_objs[ + 1] if cache_objs is not None else None + tgt = cross_attn( + paddle.to_variable(tgt_norm), + paddle.to_variable(memory), + paddle.to_variable(memory), + paddle.to_variable(memory_mask), cross_attn_cache) + tgt = tgt[0].numpy() if cache else tgt.numpy() + + # postprocess + tgt = tgt + residual + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm2) + residual = tgt_norm + # FFN + ffn_output = ffn(tgt_norm, decoder_layer, activation) + # post process + tgt = residual + ffn_output + tgt_norm = layer_norm(tgt, d_model, decoder_layer.norm3) + + np.testing.assert_allclose( + decoder_output, tgt_norm, rtol=1e-5, atol=1e-6) + + def test_encoder(self): + batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, sequence_length = generate_basic_params( + mode="encoder_layer") + + src = np.random.rand(batch_size, sequence_length, + d_model).astype("float32") + + src_mask = np.zeros((batch_size, n_head, sequence_length, + sequence_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + with fluid.dygraph.guard(fluid.CPUPlace()): + encoder_layer = TransformerEncoderLayer(d_model, n_head, + dim_feedforward, dropout) + num_layers = 6 + encoder = TransformerEncoder(encoder_layer, num_layers) + # src, src_mask + enc_output = encoder( + paddle.to_variable(src), paddle.to_variable(src_mask)) + + def test_decoder(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + tgt = np.random.rand(batch_size, target_length, + d_model).astype("float32") + memory = np.random.rand(batch_size, source_length, + d_model).astype("float32") + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + with fluid.dygraph.guard(fluid.CPUPlace()): + decoder_layer = TransformerDecoderLayer(d_model, n_head, + dim_feedforward, dropout) + num_layers = 6 + decoder = TransformerDecoder(decoder_layer, 
num_layers) + + output = decoder( + paddle.to_variable(tgt), + paddle.to_variable(memory), + paddle.to_variable(tgt_mask), paddle.to_variable(memory_mask)) + + def test_transformer(self): + batch_size, d_model, n_head, dim_feedforward, dropout, _, _, source_length, target_length = generate_basic_params( + mode="decoder_layer") + + # batch_size, source_length, target_length, d_model, n_head = 4, 8, 8, 64, 8 + with fluid.dygraph.guard(fluid.CPUPlace()): + transformer = Transformer( + d_model, + n_head, + dim_feedforward=dim_feedforward, + dropout=dropout) + src = paddle.to_variable( + np.random.rand(batch_size, source_length, d_model).astype( + "float32")) + tgt = paddle.to_variable( + np.random.rand(batch_size, target_length, d_model).astype( + "float32")) + src_mask = np.zeros((batch_size, n_head, source_length, + source_length)).astype("float32") + src_mask[0][0][0][0] = -np.inf + src_mask = paddle.to_variable(src_mask) + tgt_mask = np.zeros((batch_size, n_head, target_length, + target_length)).astype("float32") + tgt_mask[0][0][0][0] = -1e9 + memory_mask = np.zeros((batch_size, n_head, target_length, + source_length)).astype("float32") + memory_mask[0][0][0][0] = -1e9 + tgt_mask, memory_mask = paddle.to_variable( + tgt_mask), paddle.to_variable(memory_mask) + trans_output = transformer(src, tgt, src_mask, tgt_mask, + memory_mask) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index bbb638b62c7be4..12b46f828d2624 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -384,7 +384,7 @@ def forward(self, query, key, value, attn_mask=None, cache=None): outs.append(weights) if cache is not None: outs.append(cache) - return out if len(outs) else tuple(outs) + return out if len(outs) == 1 else tuple(outs) class TransformerEncoderLayer(Layer): @@ -455,6 +455,7 @@ def __init__(self, bias_attr=None): self._config = locals() self._config.pop("self") + self._config.pop("__class__", None) super(TransformerEncoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout @@ -561,7 +562,7 @@ class TransformerEncoder(Layer): def __init__(self, encoder_layer, num_layers, norm=None): super(TransformerEncoder, self).__init__() self.layers = LayerList([(encoder_layer if i == 0 else - type(encoder_layer)(encoder_layer._config)) + type(encoder_layer)(**encoder_layer._config)) for i in range(num_layers)]) self.num_layers = num_layers self.norm = norm @@ -680,7 +681,7 @@ def __init__(self, bias_attr=None): self._config = locals() self._config.pop("self") - + self._config.pop("__class__", None) super(TransformerDecoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout act_dropout = dropout if act_dropout is None else act_dropout @@ -867,7 +868,7 @@ class TransformerDecoder(Layer): def __init__(self, decoder_layer, num_layers, norm=None): super(TransformerDecoder, self).__init__() self.layers = LayerList([(decoder_layer if i == 0 else - type(decoder_layer)(decoder_layer._config)) + type(decoder_layer)(**decoder_layer._config)) for i in range(num_layers)]) self.num_layers = num_layers self.norm = norm @@ -1117,7 +1118,7 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): Variable: It is a tensor that has the same shape and data type \ as `tgt`, representing the output of Transformer decoder. 
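`Transformer.forward`, fixed in the hunk that follows to pass `src_mask` by its new keyword, simply composes the encoder and decoder. A hedged usage sketch of that composition, assuming dygraph mode and mask-free calls (the decoder defaults its masks to None, and the encoder is assumed to do the same), is:

.. code-block:: python

    import paddle
    from paddle.nn.layer.transformer import Transformer

    paddle.disable_static()  # dygraph mode, as in the other examples here
    transformer = Transformer(128, 2, 4, 4, 512)
    src = paddle.rand((2, 4, 128))   # [batch_size, source_length, d_model]
    tgt = paddle.rand((2, 6, 128))   # [batch_size, target_length, d_model]

    # what forward() composes: encode once, then decode against the memory
    memory = transformer.encoder(src)          # [2, 4, 128]
    output = transformer.decoder(tgt, memory)  # [2, 6, 128]

Running the two halves separately like this is also how inference reuses the encoder output across decoding steps.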
""" - memory = self.encoder(src, mask=src_mask) + memory = self.encoder(src, src_mask=src_mask) output = self.decoder( tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask) return output From 5578a0dd99b8c3fdb8581daa4f535a6f847d1f45 Mon Sep 17 00:00:00 2001 From: LiuChiaChi <709153940@qq.com> Date: Thu, 20 Aug 2020 15:24:40 +0000 Subject: [PATCH 10/17] clean conflicts infor in code --- python/paddle/nn/layer/transformer.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 9579753a385b20..72e19c010c7d73 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -456,11 +456,8 @@ def __init__(self, bias_attr=None): self._config = locals() self._config.pop("self") -<<<<<<< HEAD self._config.pop("__class__", None) -======= self._config.pop("__class__", None) # py3 ->>>>>>> 8637eeeb0430f6f5ede3a2668212a0b2b5733a06 super(TransformerEncoderLayer, self).__init__() attn_dropout = dropout if attn_dropout is None else attn_dropout From a7a000395ec45e1767058dfaf2720c646c455e58 Mon Sep 17 00:00:00 2001 From: LiuChiaChi <709153940@qq.com> Date: Thu, 20 Aug 2020 15:29:05 +0000 Subject: [PATCH 11/17] clean Chinese comments --- python/paddle/fluid/tests/unittests/test_transformer_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py index 6fb374ff2c48f4..8384a346c6375e 100644 --- a/python/paddle/fluid/tests/unittests/test_transformer_api.py +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -212,7 +212,7 @@ class TestTransformer(unittest.TestCase): def test_multi_head_attention(self): def multihead_attention_test_helper(self_attention, cache): paddle.framework.manual_seed(2020) - # 分四种情况:self_attention|cross_attention, cache|No cache + # self_attention|cross_attention, cache|No cache with fluid.dygraph.guard(fluid.CPUPlace()): # generate params for multi_head_attention From d2c25dca45c7275bad56ac25facc572fe02e75cc Mon Sep 17 00:00:00 2001 From: guosheng Date: Fri, 21 Aug 2020 11:53:16 +0800 Subject: [PATCH 12/17] Add TransformerDecoderCell and TransformerBeamSearchDecoder. test=develop --- python/paddle/nn/layer/transformer.py | 98 +++++++++++---------------- 1 file changed, 39 insertions(+), 59 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 72e19c010c7d73..5a225b46fae0b9 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -96,7 +96,7 @@ class MultiheadAttention(Layer): query = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, num_heads, query_len, query_len] attn_mask = paddle.rand((2, 2, 4, 4)) - multi_head_attn = paddle.MultiheadAttention(64, 64, 128, n_head=2) + multi_head_attn = paddle.MultiheadAttention(128, 2) output = multi_head_attn(query, attn_mask=attn_mask) # [2, 4, 128] """ @@ -231,7 +231,7 @@ def cal_kv(self, key, value): v = layers.transpose(x=v, perm=[0, 2, 1, 3]) return k, v - def gen_cache(self, key, value=None, type=MultiheadAttention.Cache): + def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. 
The generated cache is an instance of `MultiheadAttention.Cache` or an @@ -456,7 +456,6 @@ def __init__(self, bias_attr=None): self._config = locals() self._config.pop("self") - self._config.pop("__class__", None) self._config.pop("__class__", None) # py3 super(TransformerEncoderLayer, self).__init__() @@ -940,25 +939,32 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): return output if cache is None else (output, new_caches) - def gen_cache(self, memory): + def gen_cache(self, memory, do_zip=False): """ Generates cache for `forward` usage. The generated cache is a list, and each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` - for more details. + for more details. If `do_zip` is True, apply `zip` on these tuples to get + a list with two elements. Parameters: memory (Variable): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. + do_zip (bool, optional): Indicate whether to apply `zip` on the tuples. + If True, return a list with two elements. Default False Returns: list: It is a list, and each element in the list is a tuple produced \ by `TransformerDecoderLayer.gen_cache(memory)`. See `TransformerDecoderLayer.gen_cache` \ - for more details. + for more details. If `do_zip` is True, apply `zip` on these tuples \ + and return a list with two elements. """ - return [layer.gen_cache(memory) for layer in self.layers] + cache = [layer.gen_cache(memory) for layer in self.layers] + if do_zip: + cache = list(zip(*cache)) + return cache class Transformer(Layer): @@ -1152,7 +1158,6 @@ class TransformerDecoderCell(Layer): .. code-block:: python import paddle - import paddle.fluid as fluid from paddle.fluid.dygraph import Embedding, Linear from paddle.incubate.hapi.text import TransformerDecoder from paddle.incubate.hapi.text import TransformerCell @@ -1209,11 +1214,11 @@ def __init__(self, decoder, embedding_fn=None, output_fn=None): def forward(self, inputs, - cache=None, - memory=None, + cache, + static_cache, + memory, tgt_mask=None, - memory_mask=None, - static_cache=[]): + memory_mask=None): """ Produces logits from `inputs` composed by ids and positions. @@ -1222,10 +1227,14 @@ def forward(self, tensors both have int64 data type and with 2D shape `[batch_size, sequence_length]` where `sequence_length` is 1 for inference. - cache(list): It caches the multi-head attention intermediate results - of history decoding steps. It is a list of dict where the length - of list is decoder layer number, and each dict has `k` and `v` as - keys and values are cached results. Default None + cache(list): It is a list and each element of the list is an instance + of `MultiheadAttention.Cache` for corresponding decoder layer. It + can be produced by `TransformerDecoder.gen_cache`, thus see + `TransformerDecoderLayer.gen_cache` for more details. + static_cache(list): It is a list and each element of the + list is an instance of `MultiheadAttention.StaticCache` for corresponding + decoder layer. It can be produced by `TransformerDecoder.gen_cache`, + thus see `TransformerDecoderLayer.gen_cache` for more details. memory (Variable): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. 
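What `do_zip=True` does can be shown with plain Python tuples: the per-layer `(incremental_cache, static_cache)` pairs are transposed into one sequence of incremental caches and one of static caches, which is the layout the `cache` and `static_cache` arguments described here expect.

.. code-block:: python

    # plain-Python illustration of the do_zip transpose (no Paddle objects)
    per_layer = [("inc_0", "static_0"), ("inc_1", "static_1"), ("inc_2", "static_2")]
    cache, static_cache = list(zip(*per_layer))
    assert cache == ("inc_0", "inc_1", "inc_2")
    assert static_cache == ("static_0", "static_1", "static_2")

Inside `TransformerDecoderCell.forward` the pairs are re-formed per layer with `list(zip(cache, static_cache))`, as the rewritten body further down shows.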
@@ -1236,7 +1245,8 @@ def forward(self, where the unwanted positions have `-INF` values and the others have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. - Default None + It can be None for inference since there is no subsequent in + auto-regression decoding. Default None memory_mask (Variable, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, usually the paddings. It is a tensor with shape broadcasted to @@ -1244,62 +1254,32 @@ def forward(self, unwanted positions have `-INF` values and the others have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None - static_cache(list): It stores transformed results of encoder output - to be used as keys and values in decoder-encoder cross attention - It is a list of dict where the length of list is decoder layer - number, and each dict has `static_k` and `static_v` as keys and - values are stored results. Default empty list + Returns: tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \ is a float32 or float64 3D tensor representing logits shaped \ `[batch_size, sequence_length, vocab_size]`. `new_states has \ the same structure and data type with `states` while the length \ - is one larger since the intermediate results of current step are \ - concatenated into it. + is one larger since concatanating the intermediate results of \ + current step. """ - trg_word, trg_pos = inputs + tgt_word, tgt_pos = inputs if cache and static_cache: - for cache, static_cache in zip(states, static_caches): - cache.update(static_cache) + states = list(zip(cache, static_cache)) if self.embedding_fn is not None: - dec_input = self.embedding_fn(trg_word, trg_pos) - outputs = self.decoder(dec_input, memory, tgt_mask, memory_mask, - states) + tgt = self.embedding_fn(tgt_word, tgt_pos) + outputs, new_states = self.decoder(tgt, memory, tgt_mask, + memory_mask, states) else: - outputs = self.decoder(trg_word, trg_pos, memory, tgt_mask, - memory_mask, states) + outputs, new_states = self.decoder(tgt_word, tgt_pos, memory, + tgt_mask, memory_mask, states) if self.output_fn is not None: outputs = self.output_fn(outputs) - new_states = [{ - "k": cache["k"], - "v": cache["v"] - } for cache in states] if states else states + new_states = [cache[0] for cache in new_states] return outputs, new_states - @property - def state_shape(self): - """ - States of TransformerCell cache the multi-head attention intermediate - results of history decoding steps, and have a increasing length as - decoding continued. - - `state_shape` of TransformerCell is used to initialize states. It is a - list of dict where the length of list is decoder layer, and each dict - has `k` and `v` as keys and values are `[n_head, 0, d_key]`, `[n_head, 0, d_value]` - separately. (-1 for batch size would be automatically inserted into shape). - - Returns: - list: It is a list of dict where the length of list is decoder layer \ - number, and each dict has `k` and `v` as keys and values are cached \ - results. 
- """ - return [{ - "k": [self.decoder.n_head, 0, self.decoder.d_key], - "v": [self.decoder.n_head, 0, self.decoder.d_value], - } for i in range(self.decoder.n_layer)] - class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): """ @@ -1310,7 +1290,7 @@ class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): `BeamSearchDecoder` to make beam search adapt to Transformer decoder. Parameters: - cell(TransformerCell): An instance of `TransformerCell`. + cell(TransformerDecoderCell): An instance of `TransformerDecoderCell`. start_token(int): The start token id. end_token(int): The end token id. beam_size(int): The beam width used in beam search. From c768e6e90fa549e26e30369f596eab09471417f2 Mon Sep 17 00:00:00 2001 From: guosheng Date: Fri, 21 Aug 2020 11:54:41 +0800 Subject: [PATCH 13/17] Remove TransformerDecoderCell and TransformerBeamSearchDecoder temporarily. test=develop --- python/paddle/nn/layer/transformer.py | 335 -------------------------- 1 file changed, 335 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 5a225b46fae0b9..04a96163e8eec7 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -1131,338 +1131,3 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): output = self.decoder( tgt, memory, tgt_mask=tgt_mask, memory_mask=memory_mask) return output - - -class TransformerDecoderCell(Layer): - """ - TransformerDecoderCell wraps a Transformer decoder combined with an embedding - layer and output layer to produce logits from symbols (ids and position here). - It is analogy to `RNNCell` and `outputs, new_states = cell(inputs, states, *kwargs)`, - where `inputs` is composed of word ids and position, `states` is `cache`, - `kwargs` includes `memory, `tgt_mask`, `memory_mask` and `static_cache`. - - Parameters: - decoder(callable): A TransformerDecoder instance. Or a wrapper of it that - includes a embedding layer accepting ids and positions instead of embeddings - and includes a output layer transforming decoder output features to logits. - embedding_fn(callable, optional): A callable that accepts ids and position - as arguments and return embeddings as input of `decoder`. It can be - None if `decoder` includes a embedding layer. Default None. - output_fn(callable, optional): A callable applid on `decoder` output to - transform decoder output features to get logits. Mostly it is a Linear - layer with vocabulary size. It can be None if `decoder` includes a - output layer. Default None. - - Examples: - - .. 
code-block:: python - - import paddle - from paddle.fluid.dygraph import Embedding, Linear - from paddle.incubate.hapi.text import TransformerDecoder - from paddle.incubate.hapi.text import TransformerCell - from paddle.incubate.hapi.text import TransformerBeamSearchDecoder - from paddle.incubate.hapi.text import DynamicDecode - - paddle.disable_static() - - class Embedder(fluid.dygraph.Layer): - def __init__(self): - super(Embedder, self).__init__() - self.word_embedder = Embedding(size=[1000, 128]) - self.pos_embedder = Embedding(size=[500, 128]) - - def forward(self, word, position): - return self.word_embedder(word) + self.pos_embedder(position) - - embedder = Embedder() - output_layer = Linear(128, 1000) - decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) - transformer_cell = TransformerCell(decoder, embedder, output_layer) - dynamic_decoder = DynamicDecode( - TransformerBeamSearchDecoder( - transformer_cell, - start_token=0, - end_token=1, - beam_size=4, - var_dim_in_state=2), - max_step_num=10, - is_test=True) - - enc_output = paddle.rand((2, 4, 128)) - # cross attention bias: [batch_size, n_head, tgt_len, src_len] - trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) - # inputs for beam search on Transformer - caches = transformer_cell.get_initial_states(enc_output) - enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - enc_output, beam_size=4) - trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, beam_size=4) - static_caches = decoder.prepare_static_cache(enc_output) - outputs = dynamic_decoder( - inits=caches, - enc_output=enc_output, - trg_src_attn_bias=trg_src_attn_bias, - static_caches=static_caches) - """ - - def __init__(self, decoder, embedding_fn=None, output_fn=None): - super(TransformerDecoderCell, self).__init__() - self.decoder = decoder - self.embedding_fn = embedding_fn - self.output_fn = output_fn - - def forward(self, - inputs, - cache, - static_cache, - memory, - tgt_mask=None, - memory_mask=None): - """ - Produces logits from `inputs` composed by ids and positions. - - Parameters: - inputs(tuple): A tuple includes target ids and positions. The two - tensors both have int64 data type and with 2D shape - `[batch_size, sequence_length]` where `sequence_length` is 1 - for inference. - cache(list): It is a list and each element of the list is an instance - of `MultiheadAttention.Cache` for corresponding decoder layer. It - can be produced by `TransformerDecoder.gen_cache`, thus see - `TransformerDecoderLayer.gen_cache` for more details. - static_cache(list): It is a list and each element of the - list is an instance of `MultiheadAttention.StaticCache` for corresponding - decoder layer. It can be produced by `TransformerDecoder.gen_cache`, - thus see `TransformerDecoderLayer.gen_cache` for more details. - memory (Variable): The output of Transformer encoder. It is a tensor - with shape `[batch_size, source_length, d_model]`. The data type - should be float32 or float64. - tgt_mask (Variable, optional): A tensor used in self attention - to prevents attention to some unwanted positions, usually the - the subsequent positions. It is a tensor with shape broadcasted - to `[batch_size, n_head, target_length, target_length]`, - where the unwanted positions have `-INF` values and the others - have 0 values. The data type should be float32 or float64. It can - be None when nothing wanted or needed to be prevented attention to. - It can be None for inference since there is no subsequent in - auto-regression decoding. 
Default None - memory_mask (Variable, optional): A tensor used in decoder-encoder - cross attention to prevents attention to some unwanted positions, - usually the paddings. It is a tensor with shape broadcasted to - `[batch_size, n_head, target_length, source_length]`, where the - unwanted positions have `-INF` values and the others have 0 values. - The data type should be float32 or float64. It can be None when - nothing wanted or needed to be prevented attention to. Default None - - - Returns: - tuple: A tuple( :code:`(outputs, new_states)` ), where `outputs` \ - is a float32 or float64 3D tensor representing logits shaped \ - `[batch_size, sequence_length, vocab_size]`. `new_states has \ - the same structure and data type with `states` while the length \ - is one larger since concatanating the intermediate results of \ - current step. - """ - tgt_word, tgt_pos = inputs - if cache and static_cache: - states = list(zip(cache, static_cache)) - if self.embedding_fn is not None: - tgt = self.embedding_fn(tgt_word, tgt_pos) - outputs, new_states = self.decoder(tgt, memory, tgt_mask, - memory_mask, states) - else: - outputs, new_states = self.decoder(tgt_word, tgt_pos, memory, - tgt_mask, memory_mask, states) - if self.output_fn is not None: - outputs = self.output_fn(outputs) - - new_states = [cache[0] for cache in new_states] - return outputs, new_states - - -class TransformerBeamSearchDecoder(layers.BeamSearchDecoder): - """ - Compared with a RNN step :code:`outputs, new_states = cell(inputs, states)`, - Transformer decoder's `inputs` uses 2D tensor shaped `[batch_size * beam_size, 1]` - and includes extra position data. And its `states` (caches) has increasing - length. These are not consistent with `BeamSearchDecoder`, thus subclass - `BeamSearchDecoder` to make beam search adapt to Transformer decoder. - - Parameters: - cell(TransformerDecoderCell): An instance of `TransformerDecoderCell`. - start_token(int): The start token id. - end_token(int): The end token id. - beam_size(int): The beam width used in beam search. - var_dim_in_state(int): Indicate which dimension of states is variant. - - Examples: - - .. 
code-block:: python - - import paddle - import paddle.fluid as fluid - from paddle.fluid.dygraph import Embedding, Linear - from paddle.incubate.hapi.text import TransformerDecoder - from paddle.incubate.hapi.text import TransformerCell - from paddle.incubate.hapi.text import TransformerBeamSearchDecoder - from paddle.incubate.hapi.text import DynamicDecode - - paddle.disable_static() - - class Embedder(fluid.dygraph.Layer): - def __init__(self): - super(Embedder, self).__init__() - self.word_embedder = Embedding(size=[1000, 128]) - self.pos_embedder = Embedding(size=[500, 128]) - - def forward(self, word, position): - return self.word_embedder(word) + self.pos_embedder(position) - - embedder = Embedder() - output_layer = Linear(128, 1000) - decoder = TransformerDecoder(2, 2, 64, 64, 128, 512) - transformer_cell = TransformerCell(decoder, embedder, output_layer) - dynamic_decoder = DynamicDecode( - TransformerBeamSearchDecoder( - transformer_cell, - start_token=0, - end_token=1, - beam_size=4, - var_dim_in_state=2), - max_step_num=10, - is_test=True) - - enc_output = paddle.rand((2, 4, 128)) - # cross attention bias: [batch_size, n_head, tgt_len, src_len] - trg_src_attn_bias = paddle.rand((2, 2, 1, 4)) - # inputs for beam search on Transformer - caches = transformer_cell.get_initial_states(enc_output) - enc_output = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - enc_output, beam_size=4) - trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch( - trg_src_attn_bias, beam_size=4) - static_caches = decoder.prepare_static_cache(enc_output) - outputs = dynamic_decoder( - inits=caches, - enc_output=enc_output, - trg_src_attn_bias=trg_src_attn_bias, - static_caches=static_caches) - """ - - def __init__(self, cell, start_token, end_token, beam_size, - var_dim_in_state): - super(TransformerBeamSearchDecoder, - self).__init__(cell, start_token, end_token, beam_size) - self.cell = cell - self.var_dim_in_state = var_dim_in_state - - def _merge_batch_beams_with_var_dim(self, x): - """ - Reshape a tensor with shape `[batch_size, beam_size, ...]` to a new - tensor with shape `[batch_size * beam_size, ...]`. - - Parameters: - x(Variable): A tensor with shape `[batch_size, beam_size, ...]`. The - data type should be float32, float64, int32, int64 or bool. - - Returns: - Variable: A tensor with shape `[batch_size * beam_size, ...]`, whose \ - data type is same as `x`. - """ - # init length of cache is 0, and it increases with decoding carrying on, - # thus need to reshape elaborately - var_dim_in_state = self.var_dim_in_state + 1 # count in beam dim - x = layers.transpose(x, - list(range(var_dim_in_state, len(x.shape))) + - list(range(0, var_dim_in_state))) - x = layers.reshape( - x, [0] * (len(x.shape) - var_dim_in_state - ) + [self.batch_size * self.beam_size] + - [int(size) for size in x.shape[-var_dim_in_state + 2:]]) - x = layers.transpose( - x, - list(range((len(x.shape) + 1 - var_dim_in_state), len(x.shape))) + - list(range(0, (len(x.shape) + 1 - var_dim_in_state)))) - return x - - def _split_batch_beams_with_var_dim(self, x): - """ - Reshape a tensor with shape `[batch_size * beam_size, ...]` to a new - tensor with shape `[batch_size, beam_size, ...]`. - - Parameters: - x(Variable): A tensor with shape `[batch_size * beam_size, ...]`. The - data type should be float32, float64, int32, int64 or bool. - - Returns: - Variable: A tensor with shape `[batch_size, beam_size, ...]`, whose \ - data type is same as `x`. 
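The net effect of `_merge_batch_beams_with_var_dim` and `_split_batch_beams_with_var_dim` is to fold the beam axis into the batch axis and back. The numpy sketch below shows only that net reshape and ignores the transpose trick the real code needs because the cache length dimension is unknown at build time; it is illustrative only, since these classes are removed again by this patch:

.. code-block:: python

    import numpy as np

    batch_size, beam_size, n_head, seq_len, head_dim = 2, 4, 2, 5, 8
    cache_k = np.random.rand(batch_size, beam_size, n_head, seq_len, head_dim)

    # merge: [batch, beam, ...] -> [batch * beam, ...]
    merged = cache_k.reshape((batch_size * beam_size,) + cache_k.shape[2:])

    # split: [batch * beam, ...] -> [batch, beam, ...]
    split = merged.reshape((batch_size, beam_size) + merged.shape[1:])
    assert np.array_equal(split, cache_k)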
- """ - var_dim_size = layers.shape(x)[self.var_dim_in_state] - x = layers.reshape( - x, [-1, self.beam_size] + - [int(size) - for size in x.shape[1:self.var_dim_in_state]] + [var_dim_size] + - [int(size) for size in x.shape[self.var_dim_in_state + 1:]]) - return x - - def step(self, time, inputs, states, **kwargs): - """ - Perform a beam search decoding step, which uses `cell` to get probabilities, - and follows a beam search step to calculate scores and select candidate - token ids. - - Note: compared with `BeamSearchDecoder.step`, it feed 2D id tensor shaped - `[batch_size * beam_size, 1]` rather than `[batch_size * beam_size]` combined - position data as inputs to `cell`. - - Parameters: - time(Variable): An `int64` tensor with shape `[1]` provided by the caller, - representing the current time step number of decoding. - inputs(Variable): A tensor variable. It is same as `initial_inputs` - returned by `initialize()` for the first decoding step and - `next_inputs` returned by `step()` for the others. It is a int64 - id tensor with shape `[batch_size * beam_size]` - states(Variable): A structure of tensor variables. - It is same as the `initial_states` returned by `initialize()` for - the first decoding step and `beam_search_state` returned by - `step()` for the others. - **kwargs: Additional keyword arguments, provided by the caller. - - Returns: - tuple: A tuple( :code:`(beam_search_output, beam_search_state, next_inputs, finished)` ). \ - `beam_search_state` and `next_inputs` have the same structure, \ - shape and data type as the input arguments `states` and `inputs` separately. \ - `beam_search_output` is a namedtuple(including scores, predicted_ids, \ - parent_ids as fields) of tensor variables, where \ - `scores, predicted_ids, parent_ids` all has a tensor value shaped \ - `[batch_size, beam_size]` with data type `float32, int64, int64`. \ - `finished` is a `bool` tensor with shape `[batch_size, beam_size]`. - """ - # compared to RNN, Transformer has 3D data at every decoding step - inputs = layers.reshape(inputs, [-1, 1]) # token - pos = layers.ones_like(inputs) * time # pos - cell_states = map_structure(self._merge_batch_beams_with_var_dim, - states.cell_states) - - cell_outputs, next_cell_states = self.cell((inputs, pos), cell_states, - **kwargs) - - # squeeze to adapt to BeamSearchDecoder which use 2D logits - cell_outputs = map_structure( - lambda x: layers.squeeze(x, [1]) if len(x.shape) == 3 else x, - cell_outputs) - cell_outputs = map_structure(self._split_batch_beams, cell_outputs) - next_cell_states = map_structure(self._split_batch_beams_with_var_dim, - next_cell_states) - - beam_search_output, beam_search_state = self._beam_search_step( - time=time, - logits=cell_outputs, - next_cell_states=next_cell_states, - beam_state=states) - next_inputs, finished = (beam_search_output.predicted_ids, - beam_search_state.finished) - - return (beam_search_output, beam_search_state, next_inputs, finished) From 0d01c1601202be55ef00a77ab9ca5ea579e32bbc Mon Sep 17 00:00:00 2001 From: guosheng Date: Fri, 21 Aug 2020 13:31:04 +0800 Subject: [PATCH 14/17] Add import for Transformer apis. 
test=develop --- python/paddle/nn/layer/__init__.py | 1 + python/paddle/nn/layer/transformer.py | 21 ++++++++++++++------- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index f1069f0dfd1d3b..0cf8bb9b12651c 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -29,6 +29,7 @@ from .extension import * from .activation import * from .norm import * +from .transformer import * # from .activation import PReLU #DEFINE_ALIAS from .activation import ReLU #DEFINE_ALIAS from .activation import LeakyReLU #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 04a96163e8eec7..97409c30623ed7 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -13,7 +13,14 @@ # limitations under the License. # TODO: define the classes of Transformer neural network -# __all__ = [ ] +__all__ = [ + 'MultiheadAttention', + 'TransformerEncoderLayer', + 'TransformerEncoder', + 'TransformerDecoderLayer', + 'TransformerDecoder', + 'Transformer', +] import copy import collections @@ -211,12 +218,12 @@ def cal_kv(self, key, value): to construct cache for inference. Parameters: - key (Variable, optional): The keys for multi-head attention. It is - a tensor with shape `[batch_size, sequence_length, kdim]`. The - data type should be float32 or float64. - value (Variable, optional): The values for multi-head attention. It - is a tensor with shape `[batch_size, sequence_length, vdim]`. - The data type should be float32 or float64. + key (Variable): The keys for multi-head attention. It is a tensor + with shape `[batch_size, sequence_length, kdim]`. The data type + should be float32 or float64. + value (Variable): The values for multi-head attention. It is a tensor + with shape `[batch_size, sequence_length, vdim]`. The data type + should be float32 or float64. Returns: tuple: A tuple including transformed keys and values. Their shapes \ From 39a623c4ae130f9e65a5052f73836af79c5934e3 Mon Sep 17 00:00:00 2001 From: guosheng Date: Fri, 21 Aug 2020 15:14:40 +0800 Subject: [PATCH 15/17] Update usage of weight_attr and Tensor in Transformer api docs. test=develop --- python/paddle/nn/layer/transformer.py | 195 ++++++++++++-------------- 1 file changed, 88 insertions(+), 107 deletions(-) diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index 97409c30623ed7..c914d51f0930ff 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -86,12 +86,13 @@ class MultiheadAttention(Layer): `embed_dim`. Default None. need_weights (bool, optional): Indicate whether to return the attention weights. Default False. - param_attr(ParamAttr, optional): To specify the weight parameter property. + weight_attr(ParamAttr, optional): To specify the weight parameter property. Default: None, which means the default weight parameter property is used. - See usage for details in :ref:`api_fluid_ParamAttr` . + See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr, optional): To specify the bias parameter property. Default: None, which means the default bias parameter property is used. - See usage for details in :ref:`api_fluid_ParamAttr` . + If it is set to False, this layer will not have trainable bias parameter. + See usage for details in :code:`ParamAttr` . 
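As a hedged illustration of the `kdim`/`vdim` and attribute parameters documented above (not part of the patch; it assumes the `paddle.nn.MultiHeadAttention` export that a later commit in this series adds, and the shapes are arbitrary):

    .. code-block:: python

        import paddle
        import paddle.nn as nn

        # queries live in a 128-d space; keys/values come from a 64-d space
        query = paddle.rand((2, 4, 128))
        key = paddle.rand((2, 6, 64))
        value = paddle.rand((2, 6, 64))

        cross_attn = nn.MultiHeadAttention(
            embed_dim=128, num_heads=2, kdim=64, vdim=64)
        out = cross_attn(query, key, value)  # [2, 4, 128]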
Examples: @@ -117,7 +118,7 @@ def __init__(self, kdim=None, vdim=None, need_weights=False, - param_attr=None, + weight_attr=None, bias_attr=None): super(MultiheadAttention, self).__init__() self.embed_dim = embed_dim @@ -131,25 +132,13 @@ def __init__(self, assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" self.q_proj = Linear( - input_dim=embed_dim, - output_dim=embed_dim, - param_attr=param_attr, - bias_attr=bias_attr) + embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) self.k_proj = Linear( - input_dim=self.kdim, - output_dim=embed_dim, - param_attr=param_attr, - bias_attr=bias_attr) + self.kdim, embed_dim, weight_attr, bias_attr=bias_attr) self.v_proj = Linear( - input_dim=self.vdim, - output_dim=embed_dim, - param_attr=param_attr, - bias_attr=bias_attr) + self.vdim, embed_dim, weight_attr, bias_attr=bias_attr) self.out_proj = Linear( - input_dim=embed_dim, - output_dim=embed_dim, - param_attr=param_attr, - bias_attr=bias_attr) + embed_dim, embed_dim, weight_attr, bias_attr=bias_attr) def _prepare_qkv(self, query, key, value, cache=None): """ @@ -158,14 +147,14 @@ def _prepare_qkv(self, query, key, value, cache=None): to reduce redundant calculations. Parameters: - query (Variable): The queries for multi-head attention. It is a + query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. The data type should be float32 or float64. - key (Variable): The keys for multi-head attention. It is + key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. - value (Variable): The values for multi-head attention. It + value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. @@ -218,10 +207,10 @@ def cal_kv(self, key, value): to construct cache for inference. Parameters: - key (Variable): The keys for multi-head attention. It is a tensor + key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, kdim]`. The data type should be float32 or float64. - value (Variable): The values for multi-head attention. It is a tensor + value (Tensor): The values for multi-head attention. It is a tensor with shape `[batch_size, sequence_length, vdim]`. The data type should be float32 or float64. @@ -273,11 +262,11 @@ def gen_cache(self, key, value=None, type=Cache): an instance of `Cache`. Parameters: - key (Variable): The keys for multi-head attention. It is + key (Tensor): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If `value` is None, it is only for batch size and data type reference. - value (Variable, optional): The values for multi-head attention. It + value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, `key` is only for batch size reference. Default None. @@ -312,18 +301,18 @@ def forward(self, query, key, value, attn_mask=None, cache=None): to outputs. Parameters: - query (Variable): The queries for multi-head attention. It is a + query (Tensor): The queries for multi-head attention. It is a tensor with shape `[batch_size, query_length, embed_dim]`. 
The data type should be float32 or float64. - key (Variable, optional): The keys for multi-head attention. It is + key (Tensor, optional): The keys for multi-head attention. It is a tensor with shape `[batch_size, key_length, kdim]`. The data type should be float32 or float64. If None, use `query` as `key`. Default None. - value (Variable, optional): The values for multi-head attention. It + value (Tensor, optional): The values for multi-head attention. It is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. Default None. - attn_mask (Variable, optional): A tensor used in multi-head attention + attn_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, @@ -345,7 +334,7 @@ def forward(self, query, key, value, attn_mask=None, cache=None): Default None. Returns: - Variable|tuple: It is a tensor that has the same shape and data type \ + Tensor|tuple: It is a tensor that has the same shape and data type \ as `query`, representing attention output. Or a tuple if \ `need_weights` is True or `cache` is not None. If `need_weights` \ is True, except for attention output, the tuple also includes \ @@ -422,18 +411,20 @@ class TransformerEncoderLayer(Layer): normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Default False - param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. - If it is a tuple, `param_attr[0]` would be used as `param_attr` for - MHA, and `param_attr[1]` would be used as `param_attr` for linear in FFN. - Otherwise, MHA and FFN both use it as `param_attr` to create parameters. + weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for + MHA, and `weight_attr[1]` would be used as `weight_attr` for linear in FFN. + Otherwise, MHA and FFN both use it as `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. - See usage for details in :ref:`api_fluid_ParamAttr` . + See usage for details in :code:`ParamAttr` . bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for MHA, and `bias_attr[1]` would be used as `bias_attr` for linear in FFN. Otherwise, MHA and FFN both use it as `bias_attr` to create parameters. - Default: None, which means the default bias parameter property is used. - See usage for details in :ref:`api_fluid_ParamAttr` . + The `False` value means the corresponding layer would not have trainable + bias parameter. See usage for details in :code:`ParamAttr` . Default: None, + which means the default bias parameter property is used. 
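A hedged sketch of the tuple convention described above for `weight_attr`/`bias_attr` (element 0 configures the self-attention sub-layer, element 1 the two FFN linears); the initializer choices are arbitrary and the `paddle.ParamAttr`/`paddle.nn.initializer` paths are assumptions about the 2.0 API rather than part of the patch:

    .. code-block:: python

        import paddle
        import paddle.nn as nn

        mha_weight = paddle.ParamAttr(
            initializer=paddle.nn.initializer.XavierUniform())
        ffn_weight = paddle.ParamAttr(
            initializer=paddle.nn.initializer.Normal(std=0.02))

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=128,
            nhead=2,
            dim_feedforward=512,
            weight_attr=(mha_weight, ffn_weight),
            bias_attr=False)  # False: no trainable bias in either sub-layer

        src = paddle.rand((2, 4, 128))
        out = encoder_layer(src)  # [2, 4, 128]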
+ Examples: @@ -459,7 +450,7 @@ def __init__(self, attn_dropout=None, act_dropout=None, normalize_before=False, - param_attr=None, + weight_attr=None, bias_attr=None): self._config = locals() self._config.pop("self") @@ -470,27 +461,21 @@ def __init__(self, act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before - param_attrs = _convert_param_attr_to_list(param_attr, 2) + weight_attrs = _convert_param_attr_to_list(weight_attr, 2) bias_attrs = _convert_param_attr_to_list(bias_attr, 2) self.self_attn = MultiheadAttention( d_model, nhead, dropout=attn_dropout, - param_attr=param_attrs[0], + weight_attr=weight_attrs[0], bias_attr=bias_attrs[0]) self.linear1 = Linear( - d_model, - dim_feedforward, - param_attr=param_attrs[1], - bias_attr=bias_attrs[1]) + d_model, dim_feedforward, weight_attrs[1], bias_attr=bias_attrs[1]) self.dropout = Dropout( act_dropout, dropout_implementation="upscale_in_train") self.linear2 = Linear( - dim_feedforward, - d_model, - param_attr=param_attrs[1], - bias_attr=bias_attrs[1]) + dim_feedforward, d_model, weight_attrs[1], bias_attr=bias_attrs[1]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.dropout1 = Dropout( @@ -504,10 +489,10 @@ def forward(self, src, src_mask=None): Applies a Transformer encoder layer on the input. Parameters: - src (Variable): The input of Transformer encoder layer. It is + src (Tensor): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. - src_mask (Variable, optional): A tensor used in multi-head attention + src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, @@ -517,7 +502,7 @@ def forward(self, src, src_mask=None): Default None Returns: - Variable: The output of Transformer encoder layer. It is a tensor that \ + Tensor: The output of Transformer encoder layer. It is a tensor that \ has the same shape and data type as `enc_input`. """ residual = src @@ -582,10 +567,10 @@ def forward(self, src, src_mask=None): layer. Parameters: - src (Variable): The input of Transformer encoder. It is a tensor + src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, sequence_length, d_model]`. The data type should be float32 or float64. - src_mask (Variable, optional): A tensor used in multi-head attention + src_mask (Tensor, optional): A tensor used in multi-head attention to prevents attention to some unwanted positions, usually the paddings or the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, sequence_length, sequence_length]`, @@ -595,7 +580,7 @@ def forward(self, src, src_mask=None): Default None Returns: - Variable: The output of Transformer encoder. It is a tensor that \ + Tensor: The output of Transformer encoder. It is a tensor that \ has the same shape and data type as `src`. """ output = src @@ -637,12 +622,12 @@ class TransformerDecoderLayer(Layer): normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Default False - param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. 
- If it is a tuple, `param_attr[0]` would be used as `param_attr` for - self attention, `param_attr[1]` would be used as `param_attr` for - cross attention, and `param_attr[2]` would be used as `param_attr` + weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for + self attention, `weight_attr[1]` would be used as `weight_attr` for + cross attention, and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. Otherwise, the three sub-layers all uses it as - `param_attr` to create parameters. Default: None, which means the + `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details in :ref:`api_fluid_ParamAttr` . bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. @@ -650,9 +635,10 @@ class TransformerDecoderLayer(Layer): self attention, `bias_attr[1]` would be used as `bias_attr` for cross attention, and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. Otherwise, the three sub-layers all uses it as - `bias_attr` to create parameters. Default: None, which means the - default bias parameter property is used. See usage for details - in :ref:`api_fluid_ParamAttr` . + `bias_attr` to create parameters. The `False` value means the + corresponding layer would not have trainable bias parameter. See + usage for details in :code:`ParamAttr` . Default: None,which means + the default bias parameter property is used. Examples: @@ -685,7 +671,7 @@ def __init__(self, attn_dropout=None, act_dropout=None, normalize_before=False, - param_attr=None, + weight_attr=None, bias_attr=None): self._config = locals() self._config.pop("self") @@ -696,33 +682,27 @@ def __init__(self, act_dropout = dropout if act_dropout is None else act_dropout self.normalize_before = normalize_before - param_attrs = _convert_param_attr_to_list(param_attr, 3) + weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) self.self_attn = MultiheadAttention( d_model, nhead, dropout=attn_dropout, - param_attr=param_attrs[0], + weight_attr=weight_attrs[0], bias_attr=bias_attrs[0]) self.cross_attn = MultiheadAttention( d_model, nhead, dropout=attn_dropout, - param_attr=param_attrs[1], + weight_attr=weight_attrs[1], bias_attr=bias_attrs[1]) self.linear1 = Linear( - d_model, - dim_feedforward, - param_attr=param_attrs[2], - bias_attr=bias_attrs[2]) + d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) self.dropout = Dropout( act_dropout, dropout_implementation="upscale_in_train") self.linear2 = Linear( - dim_feedforward, - d_model, - param_attr=param_attrs[2], - bias_attr=bias_attrs[2]) + dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) self.norm1 = LayerNorm(d_model) self.norm2 = LayerNorm(d_model) self.norm3 = LayerNorm(d_model) @@ -739,13 +719,13 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): Applies a Transformer decoder layer on the input. Parameters: - tgt (Variable): The input of Transformer decoder layer. It is a tensor + tgt (Tensor): The input of Transformer decoder layer. It is a tensor with shape `[batch_size, target_length, d_model]`. The data type should be float32 or float64. - memory (Variable): The output of Transformer encoder. It is a tensor + memory (Tensor): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. 
The data type should be float32 or float64. - tgt_mask (Variable, optional): A tensor used in self attention + tgt_mask (Tensor, optional): A tensor used in self attention to prevents attention to some unwanted positions, usually the the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, target_length]`, @@ -753,7 +733,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None - memory_mask (Variable, optional): A tensor used in decoder-encoder + memory_mask (Tensor, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, usually the paddings. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, source_length]`, where the @@ -768,7 +748,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): None. Returns: - Variable|tuple: It is a tensor that has the same shape and data type \ + Tensor|tuple: It is a tensor that has the same shape and data type \ as `tgt`, representing the output of Transformer decoder layer. \ Or a tuple if `cache` is not None, except for decoder layer output, \ the tuple includes the new cache which is same as input `cache` \ @@ -817,7 +797,7 @@ def gen_cache(self, memory): of `MultiheadAttention.StaticCache`. Parameters: - memory (Variable): The output of Transformer encoder. It is a tensor + memory (Tensor): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. @@ -889,13 +869,13 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): layer. Parameters: - tgt (Variable): The input of Transformer decoder. It is a tensor + tgt (Tensor): The input of Transformer decoder. It is a tensor with shape `[batch_size, target_length, d_model]`. The data type should be float32 or float64. - memory (Variable): The output of Transformer encoder. It is a tensor + memory (Tensor): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. - tgt_mask (Variable, optional): A tensor used in self attention + tgt_mask (Tensor, optional): A tensor used in self attention to prevents attention to some unwanted positions, usually the the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, target_length]`, @@ -903,7 +883,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None - memory_mask (Variable, optional): A tensor used in decoder-encoder + memory_mask (Tensor, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, usually the paddings. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, source_length]`, where the @@ -916,7 +896,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): used for inference and should be None for training. Default None. 
Returns: - Variable|tuple: It is a tensor that has the same shape and data type \ + Tensor|tuple: It is a tensor that has the same shape and data type \ as `tgt`, representing the output of Transformer decoder. \ Or a tuple if `cache` is not None, except for decoder output, \ the tuple includes the new cache which is same as input `cache` \ @@ -956,7 +936,7 @@ def gen_cache(self, memory, do_zip=False): Parameters: - memory (Variable): The output of Transformer encoder. It is a tensor + memory (Tensor): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. do_zip (bool, optional): Indicate whether to apply `zip` on the tuples. @@ -988,7 +968,7 @@ class Transformer(Layer): normalization (in pre-process or post-precess of multi-head attention or FFN), and some transformer like models are different on this, such as `BERT `_ and `GPT2 `_ . - The default architecture here places layer normalization in pre-process and + The default architecture here places layer normalization in post-process and applies another layer normalization on the output of last encoder/decoder layer. Parameters: @@ -1012,22 +992,23 @@ class Transformer(Layer): normalization and post-precess includes dropout, residual connection. Otherwise, no pre-process and post-precess includes dropout, residual connection, layer normalization. Default False - param_attr(ParamAttr|tuple, optional): To specify the weight parameter property. - If it is a tuple, `param_attr[0]` would be used as `param_attr` for - self attention, `param_attr[1]` would be used as `param_attr` for - cross attention, and `param_attr[2]` would be used as `param_attr` + weight_attr(ParamAttr|tuple, optional): To specify the weight parameter property. + If it is a tuple, `weight_attr[0]` would be used as `weight_attr` for + self attention, `weight_attr[1]` would be used as `weight_attr` for + cross attention, and `weight_attr[2]` would be used as `weight_attr` for linear in FFN. Otherwise, the three sub-layers all uses it as - `param_attr` to create parameters. Default: None, which means the + `weight_attr` to create parameters. Default: None, which means the default weight parameter property is used. See usage for details - in :ref:`api_fluid_ParamAttr` . + in :code:`ParamAttr` . bias_attr (ParamAttr|tuple, optional): To specify the bias parameter property. If it is a tuple, `bias_attr[0]` would be used as `bias_attr` for self attention, `bias_attr[1]` would be used as `bias_attr` for cross attention, and `bias_attr[2]` would be used as `bias_attr` for linear in FFN. Otherwise, the three sub-layers all uses it as - `bias_attr` to create parameters. Default: None, which means the - default bias parameter property is used. See usage for details - in :ref:`api_fluid_ParamAttr` . + `bias_attr` to create parameters. The `False` value means the + corresponding layer would not have trainable bias parameter. See + usage for details in :code:`ParamAttr` . Default: None,which means + the default bias parameter property is used. custom_encoder (Layer): If custom encoder is provided, use it as the encoder. Default None custom_decoder (Layer): If custom decoder is provided, use it as the decoder. 
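For orientation, a minimal usage sketch of the `Transformer` class whose parameters are documented above (not part of the patch; hyper-parameters and shapes are arbitrary, and the `paddle.nn.Transformer` import path follows the alias this series adds):

    .. code-block:: python

        import paddle
        import paddle.nn as nn

        transformer = nn.Transformer(
            d_model=128,
            nhead=2,
            num_encoder_layers=2,
            num_decoder_layers=2,
            dim_feedforward=512)

        src = paddle.rand((2, 4, 128))  # [batch_size, source_length, d_model]
        tgt = paddle.rand((2, 6, 128))  # [batch_size, target_length, d_model]
        # mask for decoder self attention:
        # [batch_size, n_head, target_length, target_length]
        tgt_mask = paddle.rand((2, 2, 6, 6))

        output = transformer(src, tgt, tgt_mask=tgt_mask)  # [2, 6, 128]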
@@ -1069,7 +1050,7 @@ def __init__(self, attn_dropout=None, act_dropout=None, normalize_before=False, - param_attr=None, + weight_attr=None, bias_attr=None, custom_encoder=None, custom_decoder=None): @@ -1080,7 +1061,7 @@ def __init__(self, else: encoder_layer = TransformerEncoderLayer( d_model, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before, param_attr, + attn_dropout, act_dropout, normalize_before, weight_attr, bias_attr) encoder_norm = LayerNorm(d_model) self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, @@ -1091,7 +1072,7 @@ def __init__(self, else: decoder_layer = TransformerDecoderLayer( d_model, nhead, dim_feedforward, dropout, activation, - attn_dropout, act_dropout, normalize_before, param_attr, + attn_dropout, act_dropout, normalize_before, weight_attr, bias_attr) decoder_norm = LayerNorm(d_model) self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, @@ -1105,16 +1086,16 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): Applies a Transformer model on the inputs. Parameters: - src (Variable): The input of Transformer encoder. It is a tensor + src (Tensor): The input of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. - tgt (Variable): The input of Transformer decoder. It is a tensor + tgt (Tensor): The input of Transformer decoder. It is a tensor with shape `[batch_size, target_length, d_model]`. The data type should be float32 or float64. - memory (Variable): The output of Transformer encoder. It is a tensor + memory (Tensor): The output of Transformer encoder. It is a tensor with shape `[batch_size, source_length, d_model]`. The data type should be float32 or float64. - tgt_mask (Variable, optional): A tensor used in self attention + tgt_mask (Tensor, optional): A tensor used in self attention to prevents attention to some unwanted positions, usually the the subsequent positions. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, target_length]`, @@ -1122,7 +1103,7 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None - memory_mask (Variable, optional): A tensor used in decoder-encoder + memory_mask (Tensor, optional): A tensor used in decoder-encoder cross attention to prevents attention to some unwanted positions, usually the paddings. It is a tensor with shape broadcasted to `[batch_size, n_head, target_length, source_length]`, where the @@ -1131,7 +1112,7 @@ def forward(self, src, tgt, src_mask=None, tgt_mask=None, memory_mask=None): nothing wanted or needed to be prevented attention to. Default None Returns: - Variable: It is a tensor that has the same shape and data type \ + Tensor: It is a tensor that has the same shape and data type \ as `tgt`, representing the output of Transformer decoder. """ memory = self.encoder(src, src_mask=src_mask) From 48f97e193d6af9abe74c02128957c0d1cd892c98 Mon Sep 17 00:00:00 2001 From: guosheng Date: Sun, 23 Aug 2020 17:03:54 +0800 Subject: [PATCH 16/17] Update Transformer apis by renaming MultiheadAttention and cal_kv according to comments. 
test=develop --- python/paddle/nn/__init__.py | 6 +++ python/paddle/nn/layer/transformer.py | 68 +++++++++++++-------------- 2 files changed, 39 insertions(+), 35 deletions(-) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 3dd1c1d94fbd70..6cd2379c61e67d 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -118,6 +118,12 @@ # from .layer.rnn import RNNCell #DEFINE_ALIAS # from .layer.rnn import GRUCell #DEFINE_ALIAS # from .layer.rnn import LSTMCell #DEFINE_ALIAS +from .layer.transformer import MultiHeadAttention +from .layer.transformer import TransformerEncoderLayer +from .layer.transformer import TransformerEncoder +from .layer.transformer import TransformerDecoderLayer +from .layer.transformer import TransformerDecoder +from .layer.transformer import Transformer from .layer.distance import PairwiseDistance #DEFINE_ALIAS from .layer import loss #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/transformer.py b/python/paddle/nn/layer/transformer.py index c914d51f0930ff..50a8755ac9f7b0 100644 --- a/python/paddle/nn/layer/transformer.py +++ b/python/paddle/nn/layer/transformer.py @@ -14,7 +14,7 @@ # TODO: define the classes of Transformer neural network __all__ = [ - 'MultiheadAttention', + 'MultiHeadAttention', 'TransformerEncoderLayer', 'TransformerEncoder', 'TransformerDecoderLayer', @@ -25,8 +25,6 @@ import copy import collections -import numpy as np - from ...fluid import layers from ...fluid.param_attr import ParamAttr from ...fluid.dygraph import Layer, Linear, Dropout, LayerNorm, LayerList @@ -66,7 +64,7 @@ def _convert_param_attr_to_list(param_attr, n): return param_attrs -class MultiheadAttention(Layer): +class MultiHeadAttention(Layer): """ Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending @@ -104,7 +102,7 @@ class MultiheadAttention(Layer): query = paddle.rand((2, 4, 128)) # self attention mask: [batch_size, num_heads, query_len, query_len] attn_mask = paddle.rand((2, 2, 4, 4)) - multi_head_attn = paddle.MultiheadAttention(128, 2) + multi_head_attn = paddle.MultiHeadAttention(128, 2) output = multi_head_attn(query, attn_mask=attn_mask) # [2, 4, 128] """ @@ -120,7 +118,7 @@ def __init__(self, need_weights=False, weight_attr=None, bias_attr=None): - super(MultiheadAttention, self).__init__() + super(MultiHeadAttention, self).__init__() self.embed_dim = embed_dim self.kdim = kdim if kdim is not None else embed_dim self.vdim = vdim if vdim is not None else embed_dim @@ -158,11 +156,11 @@ def _prepare_qkv(self, query, key, value, cache=None): is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, use `query` as `value`. - cache (MultiheadAttention.Cache|MultiheadAttention.StaticCache, optional): + cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in - MultiheadAttention. If is an instance of `Cache`, `k` and `v` + MultiHeadAttention. If is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. 
If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and @@ -185,7 +183,7 @@ def _prepare_qkv(self, query, key, value, cache=None): # for encoder-decoder attention in inference and has cached k, v = cache.k, cache.v else: - k, v = self.cal_kv(key, value) + k, v = self.compute_kv(key, value) if isinstance(cache, self.Cache): # for decoder self-attention in inference @@ -195,7 +193,7 @@ def _prepare_qkv(self, query, key, value, cache=None): return (q, k, v) if cache is None else (q, k, v, cache) - def cal_kv(self, key, value): + def compute_kv(self, key, value): """ Applies linear projection on input keys and values, then splits heads (reshape and transpose) to get keys and values from different representation @@ -230,13 +228,13 @@ def cal_kv(self, key, value): def gen_cache(self, key, value=None, type=Cache): """ Generates cache for `forward` usage in inference accroding to arguments. - The generated cache is an instance of `MultiheadAttention.Cache` or an - instance of `MultiheadAttention.StaticCache`. + The generated cache is an instance of `MultiHeadAttention.Cache` or an + instance of `MultiHeadAttention.StaticCache`. `Cache` or `StaticCache` is namedtuple with `k` and `v` as fields, and it stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations - in MultiheadAttention. + in MultiHeadAttention. If the generated cache is an instance of `Cache`, `k` and `v` fields reserve intermediate result tensors of previous positions, and the tensors @@ -250,8 +248,8 @@ def gen_cache(self, key, value=None, type=Cache): The cache is generated as follows: - 1. If `type` is `StaticCache`, apply `cal_kv(key, value)` and use the results - to create an instance of `StaticCache`. + 1. If `type` is `StaticCache`, apply `compute_kv(key, value)` and use the + results to create an instance of `StaticCache`. 2. If `type` is `Cache` and `value` is None, generate empty tensors shaped `[batch_size, num_heads, 0, embed_dim // num_heads]` and use the results @@ -270,14 +268,14 @@ def gen_cache(self, key, value=None, type=Cache): is a tensor with shape `[batch_size, value_length, vdim]`. The data type should be float32 or float64. If None, `key` is only for batch size reference. Default None. - type (type): It should be `MultiheadAttention.StaticCache` or - `MultiheadAttention.Cache` to indicate the cache type to generate. + type (type): It should be `MultiHeadAttention.StaticCache` or + `MultiHeadAttention.Cache` to indicate the cache type to generate. Returns: namedtuple: an instance of `Cache` or `StaticCache` accordingly. """ - if type == MultiheadAttention.StaticCache: # static_kv - k, v = self.cal_kv(key, value) + if type == MultiHeadAttention.StaticCache: # static_kv + k, v = self.compute_kv(key, value) return self.StaticCache(k, v) elif value is None: # incremental_state k = layers.fill_constant_batch_size_like( @@ -320,11 +318,11 @@ def forward(self, query, key, value, attn_mask=None, cache=None): have 0 values. The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. 
Default None - cache (MultiheadAttention.Cache|MultiheadAttention.StaticCache, optional): + cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional): It is a namedtuple with `k` and `v` as fields, and stores tensors shaped `[batch_size, num_heads, length, embed_dim]` which are results of linear projection, reshape and transpose calculations in - MultiheadAttention. If it is an instance of `Cache`, `k` and `v` + MultiHeadAttention. If it is an instance of `Cache`, `k` and `v` fields reserve intermediate results of previous positions, which mostly used for decoder self attention. If it is an instance of `StaticCache`, `key` and `value` args would be ignored, `k` and @@ -464,7 +462,7 @@ def __init__(self, weight_attrs = _convert_param_attr_to_list(weight_attr, 2) bias_attrs = _convert_param_attr_to_list(bias_attr, 2) - self.self_attn = MultiheadAttention( + self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, @@ -685,13 +683,13 @@ def __init__(self, weight_attrs = _convert_param_attr_to_list(weight_attr, 3) bias_attrs = _convert_param_attr_to_list(bias_attr, 3) - self.self_attn = MultiheadAttention( + self.self_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, weight_attr=weight_attrs[0], bias_attr=bias_attrs[0]) - self.cross_attn = MultiheadAttention( + self.cross_attn = MultiHeadAttention( d_model, nhead, dropout=attn_dropout, @@ -741,8 +739,8 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): The data type should be float32 or float64. It can be None when nothing wanted or needed to be prevented attention to. Default None cache (tuple, optional): It is a tuple( :code:`(incremental_cache, static_cache)` ), - `incremental_cache` is an instance of `MultiheadAttention.Cache`, - `static_cache` is an instance of `MultiheadAttention.StaticCache. + `incremental_cache` is an instance of `MultiHeadAttention.Cache`, + `static_cache` is an instance of `MultiHeadAttention.StaticCache. See `TransformerDecoderLayer.gen_cache` for more details. It is only used for inference and should be None for training. Default None. @@ -753,7 +751,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): Or a tuple if `cache` is not None, except for decoder layer output, \ the tuple includes the new cache which is same as input `cache` \ argument but `incremental_cache` in it has an incremental length. \ - See `MultiheadAttention.gen_cache` and `MultiheadAttention.forward` \ + See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ residual = tgt @@ -793,8 +791,8 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): def gen_cache(self, memory): """ Generates cache for `forward` usage. The generated cache is a tuple - composed of an instance of `MultiheadAttention.Cache` and an instance - of `MultiheadAttention.StaticCache`. + composed of an instance of `MultiHeadAttention.Cache` and an instance + of `MultiHeadAttention.StaticCache`. Parameters: memory (Tensor): The output of Transformer encoder. It is a tensor @@ -803,13 +801,13 @@ def gen_cache(self, memory): Returns: tuple: It is a tuple( :code:`(incremental_cache, static_cache)` ). 
\ - `incremental_cache` is an instance of `MultiheadAttention.Cache` \ - produced by `self_attn.gen_cache(memory, MultiheadAttention.Cache)`, \ + `incremental_cache` is an instance of `MultiHeadAttention.Cache` \ + produced by `self_attn.gen_cache(memory, MultiHeadAttention.Cache)`, \ it reserves two tensors shaped `[batch_size, nhead, 0, d_model // nhead]`. \ - `static_cache` is an instance of `MultiheadAttention.StaticCache` \ - produced by `cross_attn.gen_cache(memory, MultiheadAttention.StaticCache)`, \ + `static_cache` is an instance of `MultiHeadAttention.StaticCache` \ + produced by `cross_attn.gen_cache(memory, MultiHeadAttention.StaticCache)`, \ it reserves two tensors shaped `[batch_size, nhead, source_length, d_model // nhead]`. - See `MultiheadAttention.gen_cache` and `MultiheadAttention.forward` \ + See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ incremental_cache = self.self_attn.gen_cache( @@ -901,7 +899,7 @@ def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None): Or a tuple if `cache` is not None, except for decoder output, \ the tuple includes the new cache which is same as input `cache` \ argument but `incremental_cache` in it has an incremental length. \ - See `MultiheadAttention.gen_cache` and `MultiheadAttention.forward` \ + See `MultiHeadAttention.gen_cache` and `MultiHeadAttention.forward` \ for more details. """ output = tgt From 1beb0754f846293bd207b6cb04f43771a30130e4 Mon Sep 17 00:00:00 2001 From: guosheng Date: Sun, 23 Aug 2020 19:27:45 +0800 Subject: [PATCH 17/17] Fix MultiHeadAttention in test_transformer_api.py. test=develop --- .../fluid/tests/unittests/test_transformer_api.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py index 8384a346c6375e..c8d1e77134036b 100644 --- a/python/paddle/fluid/tests/unittests/test_transformer_api.py +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -15,7 +15,7 @@ import numpy as np import paddle import paddle.fluid as fluid -from paddle.nn.layer.transformer import MultiheadAttention, TransformerEncoderLayer, TransformerDecoderLayer, TransformerEncoder, TransformerDecoder, Transformer +from paddle.nn.layer.transformer import MultiHeadAttention, TransformerEncoderLayer, TransformerDecoderLayer, TransformerEncoder, TransformerDecoder, Transformer import unittest @@ -225,7 +225,7 @@ def multihead_attention_test_helper(self_attention, cache): attn_mask = np.concatenate((attn_mask, attn_mask), axis=3) need_weight, param_attr, bias_attr = False, None, None # call paddle's function - multi_head_attn = MultiheadAttention( + multi_head_attn = MultiHeadAttention( embed_dim, num_heads, attn_dropout, kdim, vdim, need_weight, param_attr, bias_attr) # construct cache object @@ -298,7 +298,7 @@ def test_transformer_encoder_layer(self): paddle.to_variable(src_mask)) # paddle.to_variable(src_mask)) # 4.numpy: # paddle self attention - self_attn = MultiheadAttention( + self_attn = MultiHeadAttention( d_model, n_head, dropout=attn_dropout) attn_output = self_attn( paddle.to_variable(src), @@ -334,9 +334,9 @@ def test_transformer_decoder_layer(self): source_length)).astype("float32") memory_mask[0][0][0][0] = -1e9 for cache in [True, False]: - self_attn = MultiheadAttention( + self_attn = MultiHeadAttention( d_model, n_head, dropout=attn_dropout) - cross_attn = MultiheadAttention( + cross_attn = MultiHeadAttention( 
d_model, n_head, dropout=attn_dropout) # paddle decoderlayer:
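As a hedged illustration of the cache workflow these tests cover, a sketch of how `TransformerDecoderLayer.gen_cache` and the `(incremental_cache, static_cache)` tuple documented earlier could be driven step by step at inference time (not part of the test file; shapes are arbitrary, and feeding the layer output back in is only a stand-in for real target embeddings):

    .. code-block:: python

        import paddle
        import paddle.nn as nn

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=128, nhead=2, dim_feedforward=512)

        memory = paddle.rand((2, 4, 128))        # encoder output
        cache = decoder_layer.gen_cache(memory)  # (incremental_cache, static_cache)

        tgt_step = paddle.rand((2, 1, 128))      # one target position per step
        for _ in range(3):
            # with a non-None cache the layer returns (output, updated_cache)
            out, cache = decoder_layer(tgt_step, memory, cache=cache)
            tgt_step = out  # placeholder; a real model feeds the next embedding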