diff --git a/.github/workflows/unittests.yml b/.github/workflows/unittests.yml index dd82bc2af3..8e3e73e1e8 100644 --- a/.github/workflows/unittests.yml +++ b/.github/workflows/unittests.yml @@ -33,9 +33,9 @@ jobs: - name: Install Other Dependencies run: | python -m pip install --user --upgrade pip - python -m pip install --user setuptools pytest pytest-cov + python -m pip install --user setuptools pytest pytest-cov contextvars python -m pip install --upgrade cython - python -m pip install --pre --user "mxnet>=2.0.0b20200604,<=2.0.0b20200619" -f https://dist.mxnet.io/python + python -m pip install --pre --user "mxnet>=2.0.0b20200716" -f https://dist.mxnet.io/python python -m pip install --user -e .[extras] - name: Test project run: | diff --git a/README.md b/README.md index 9014bfdaa4..34fc069cbc 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,10 @@ First of all, install the latest MXNet. You may use the following commands: ```bash # Install the version with CUDA 10.1 -pip install -U --pre mxnet-cu101>=2.0.0b20200604 -f https://dist.mxnet.io/python +pip install -U --pre mxnet-cu101>=2.0.0b20200716 -f https://dist.mxnet.io/python # Install the cpu-only version -pip install -U --pre mxnet>=2.0.0b20200604 -f https://dist.mxnet.io/python +pip install -U --pre mxnet>=2.0.0b20200716 -f https://dist.mxnet.io/python ``` diff --git a/scripts/conversion_toolkits/convert_electra.py b/scripts/conversion_toolkits/convert_electra.py index 58dae505f7..7c62317e5e 100644 --- a/scripts/conversion_toolkits/convert_electra.py +++ b/scripts/conversion_toolkits/convert_electra.py @@ -265,11 +265,11 @@ def convert_tf_model(model_dir, save_dir, test_conversion, model_size, gpu, elec assert_allclose(tf_params[k], backbone_params[k]) # Build gluon model and initialize - gluon_model = ElectraModel.from_cfg(cfg, prefix='electra_') + gluon_model = ElectraModel.from_cfg(cfg) gluon_model.initialize(ctx=ctx) gluon_model.hybridize() - gluon_disc_model = ElectraDiscriminator(cfg, prefix='electra_') + gluon_disc_model = ElectraDiscriminator(cfg) gluon_disc_model.initialize(ctx=ctx) gluon_disc_model.hybridize() @@ -283,8 +283,7 @@ def convert_tf_model(model_dir, save_dir, test_conversion, model_size, gpu, elec word_embed_params=word_embed_params, token_type_embed_params=token_type_embed_params, token_pos_embed_params=token_pos_embed_params, - embed_layer_norm_params=embed_layer_norm_params, - prefix='generator_') + embed_layer_norm_params=embed_layer_norm_params) gluon_gen_model.initialize(ctx=ctx) gluon_gen_model.hybridize() diff --git a/scripts/conversion_toolkits/convert_mobilebert.py b/scripts/conversion_toolkits/convert_mobilebert.py index d304b3a1d8..0fb03dcbed 100644 --- a/scripts/conversion_toolkits/convert_mobilebert.py +++ b/scripts/conversion_toolkits/convert_mobilebert.py @@ -270,7 +270,7 @@ def convert_tf_model(model_dir, save_dir, test_conversion, gpu, mobilebert_dir): gluon_model.initialize(ctx=ctx) gluon_model.hybridize() - gluon_pretrain_model = MobileBertForPretrain(cfg, prefix='') + gluon_pretrain_model = MobileBertForPretrain(cfg) gluon_pretrain_model.initialize(ctx=ctx) gluon_pretrain_model.hybridize() diff --git a/scripts/conversion_toolkits/convert_tf_hub_model.py b/scripts/conversion_toolkits/convert_tf_hub_model.py index 56af511366..fa9dd1a215 100644 --- a/scripts/conversion_toolkits/convert_tf_hub_model.py +++ b/scripts/conversion_toolkits/convert_tf_hub_model.py @@ -358,7 +358,7 @@ def convert_tf_model(hub_model_dir, save_dir, test_conversion, model_type, gpu): gluon_model = 
PretrainedModel.from_cfg(cfg, prefix='', use_pooler=True) gluon_model.initialize(ctx=ctx) gluon_model.hybridize() - gluon_mlm_model = PretrainedMLMModel(backbone_cfg=cfg, prefix='') + gluon_mlm_model = PretrainedMLMModel(backbone_cfg=cfg) gluon_mlm_model.initialize(ctx=ctx) gluon_mlm_model.hybridize() diff --git a/scripts/machine_translation/train_transformer.py b/scripts/machine_translation/train_transformer.py index d8466d5439..d08d482848 100644 --- a/scripts/machine_translation/train_transformer.py +++ b/scripts/machine_translation/train_transformer.py @@ -357,7 +357,7 @@ def train(args): for v in model.collect_params().values(): if v.grad_req != 'null': v.grad_req = 'add' - model.collect_params().zero_grad() + model.zero_grad() model_averager = AverageSGDTracker(model.collect_params()) log_start_time = time.time() num_params, num_fixed_params = None, None @@ -422,7 +422,7 @@ def train(args): trainer.step(loss_denom.asnumpy() / rescale_loss) accum_count = 0 loss_denom = 0 - model.collect_params().zero_grad() + model.zero_grad() if (args.epochs > 0 and epoch_id >= args.epochs - args.num_averages) or \ (args.max_update > 0 and n_train_iters >= args.max_update - args.num_averages * args.save_interval_update): model_averager.step() diff --git a/scripts/pretraining/run_electra.py b/scripts/pretraining/run_electra.py index f29a9055c0..60f8277268 100644 --- a/scripts/pretraining/run_electra.py +++ b/scripts/pretraining/run_electra.py @@ -155,8 +155,7 @@ def get_pretraining_model(model_name, ctx_l, tied_generator=False, tied_embeddings=True, disallow_correct=False, - weight_initializer=TruncNorm(stdev=0.02), - prefix='Pretrain_') + weight_initializer=TruncNorm(stdev=0.02)) model.initialize(ctx=ctx_l) model.hybridize() return cfg, tokenizer, model diff --git a/scripts/question_answering/models.py b/scripts/question_answering/models.py index 64971f8daf..641247e937 100644 --- a/scripts/question_answering/models.py +++ b/scripts/question_answering/models.py @@ -14,15 +14,12 @@ class ModelForQABasic(HybridBlock): another dense layer to map the contextual embeddings to the start scores and end scores. 
""" - def __init__(self, backbone, weight_initializer=None, bias_initializer=None, - prefix=None, params=None): - super().__init__(prefix=prefix, params=params) - with self.name_scope(): - self.backbone = backbone - self.qa_outputs = nn.Dense(units=2, flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='qa_outputs_') + def __init__(self, backbone, weight_initializer=None, bias_initializer=None): + super().__init__() + self.backbone = backbone + self.qa_outputs = nn.Dense(units=2, flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) def hybrid_forward(self, F, tokens, token_types, valid_length, p_mask): """ @@ -77,39 +74,30 @@ class ModelForQAConditionalV1(HybridBlock): """ def __init__(self, backbone, units=768, layer_norm_eps=1E-12, dropout_prob=0.1, - activation='tanh', weight_initializer=None, bias_initializer=None, - prefix=None, params=None): - super().__init__(prefix=prefix, params=params) - with self.name_scope(): - self.backbone = backbone - self.start_scores = nn.Dense(1, flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='start_scores_') - self.end_scores = nn.HybridSequential(prefix='end_scores_') - with self.end_scores.name_scope(): - self.end_scores.add(nn.Dense(units, flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='mid_')) - self.end_scores.add(get_activation(activation)) - self.end_scores.add(nn.LayerNorm(epsilon=layer_norm_eps)) - self.end_scores.add(nn.Dense(1, flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='out_')) - self.answerable_scores = nn.HybridSequential(prefix='answerable_scores_') - with self.answerable_scores.name_scope(): - self.answerable_scores.add(nn.Dense(units, flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='mid_')) - self.answerable_scores.add(get_activation(activation)) - self.answerable_scores.add(nn.Dropout(dropout_prob)) - self.answerable_scores.add(nn.Dense(2, flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='out_')) + activation='tanh', weight_initializer=None, bias_initializer=None): + super().__init__() + self.backbone = backbone + self.start_scores = nn.Dense(1, flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) + self.end_scores = nn.HybridSequential() + self.end_scores.add(nn.Dense(units, flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.end_scores.add(get_activation(activation)) + self.end_scores.add(nn.LayerNorm(epsilon=layer_norm_eps)) + self.end_scores.add(nn.Dense(1, flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.answerable_scores = nn.HybridSequential() + self.answerable_scores.add(nn.Dense(units, flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.answerable_scores.add(get_activation(activation)) + self.answerable_scores.add(nn.Dropout(dropout_prob)) + self.answerable_scores.add(nn.Dense(2, flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) def get_start_logits(self, F, contextual_embedding, p_mask): """ diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py index 32f1a4eb76..aa9f404b6d 100644 --- 
a/scripts/question_answering/run_squad.py +++ b/scripts/question_answering/run_squad.py @@ -324,8 +324,7 @@ def get_network(model_name, backbone_params_path, num_params, num_fixed_params)) qa_net = ModelForQAConditionalV1(backbone=backbone, dropout_prob=dropout, - weight_initializer=TruncNorm(stdev=0.02), - prefix='qa_net_') + weight_initializer=TruncNorm(stdev=0.02)) if checkpoint_path is None: # Ignore the UserWarning during initialization, # There is no need to re-initialize the parameters of backbone @@ -529,7 +528,7 @@ def train(args): log_sample_num = 0 if args.num_accumulated != 1: # set grad to zero for gradient accumulation - qa_net.collect_params().zero_grad() + qa_net.zero_grad() global_tic = time.time() while not finish_flag: epoch_tic = time.time() @@ -594,7 +593,7 @@ def train(args): step_num += 1 if args.num_accumulated != 1: # set grad to zero for gradient accumulation - qa_net.collect_params().zero_grad() + qa_net.zero_grad() # saving if step_num % save_interval == 0 or step_num >= num_train_steps: @@ -964,7 +963,6 @@ def eval_validation(ckpt_name, best_eval): if __name__ == '__main__': os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' - os.environ['MXNET_USE_FUSION'] = '0' # Manually disable pointwise fusion args = parse_args() logging_config(args.output_dir, name='finetune_squad{}'.format(args.version)) set_seed(args.seed) diff --git a/src/gluonnlp/attention_cell.py b/src/gluonnlp/attention_cell.py index 980d29806c..4181e62f11 100644 --- a/src/gluonnlp/attention_cell.py +++ b/src/gluonnlp/attention_cell.py @@ -601,9 +601,8 @@ class MultiHeadAttentionCell(HybridBlock): """ def __init__(self, query_units=None, num_heads=None, attention_dropout=0.0, scaled: bool = True, normalized: bool = False, eps: float = 1E-6, - dtype='float32', layout='NTK', use_einsum=False, - prefix=None, params=None): - super().__init__(prefix=prefix, params=params) + dtype='float32', layout='NTK', use_einsum=False): + super().__init__() self._query_units = query_units self._num_heads = num_heads self._attention_dropout = attention_dropout @@ -705,8 +704,7 @@ def __init__(self, query_units, dropout: float = 0.0, dtype='float32', layout='NTK', - use_einsum=False, - prefix=None, params=None): + use_einsum=False): """ Parameters @@ -725,10 +723,8 @@ def __init__(self, query_units, scaled dtype layout - prefix - params """ - super().__init__(prefix=prefix, params=params) + super().__init__() self._dropout = dropout self._method = method self._query_units = query_units @@ -744,49 +740,44 @@ def __init__(self, query_units, self._layout = layout if self._layout not in ['NKT', 'NTK', 'TNK']: raise ValueError('layout="{}" is not supported'.format(self._layout)) - with self.name_scope(): - if method == 'transformer_xl': - if pos_embed_units is None: - pos_embed_units = self._num_heads * self._head_query_units - self._rel_pos_embed = SinusoidalPositionalEmbedding(units=pos_embed_units, - prefix='rel_pos_embed_', - dtype=self._dtype) - self._rel_proj = nn.Dense(units=query_units, - in_units=pos_embed_units, - flatten=False, - use_bias=False, - prefix='rel_proj_', - dtype=self._dtype) - self._dropout_layer = nn.Dropout(dropout) - elif method == 'shaw': - assert self._max_distance is not None, 'Must set max_distance when method="shaw".' 
- if self._bidirectional: - vocab_size = self._max_distance * 2 + 1 - else: - vocab_size = self._max_distance + 1 - self._rel_pos_embed = LearnedPositionalEmbedding( - units=self._num_heads * self._head_query_units, - max_length=vocab_size, - weight_initializer=mx.init.Xavier(rnd_type="gaussian", - factor_type="in", - magnitude=1), - prefix='rel_pos_embed_', - mode='wrap' if self._bidirectional else 'raise', - dtype=self._dtype) - elif method == 't5': - if self._num_buckets is None: - self._num_buckets = 32 - if self._max_distance is None: - self._max_distance = 128 - self._rel_pos_embed = BucketPositionalEmbedding( - units=num_heads, - num_buckets=self._num_buckets, - max_distance=self._max_distance, - bidirectional=self._bidirectional, - prefix='rel_pos_embed_', - dtype=self._dtype) + if method == 'transformer_xl': + if pos_embed_units is None: + pos_embed_units = self._num_heads * self._head_query_units + self._rel_pos_embed = SinusoidalPositionalEmbedding(units=pos_embed_units, + dtype=self._dtype) + self._rel_proj = nn.Dense(units=query_units, + in_units=pos_embed_units, + flatten=False, + use_bias=False, + dtype=self._dtype) + self._dropout_layer = nn.Dropout(dropout) + elif method == 'shaw': + assert self._max_distance is not None, 'Must set max_distance when method="shaw".' + if self._bidirectional: + vocab_size = self._max_distance * 2 + 1 else: - raise NotImplementedError('method="{}" is currently not supported!'.format(method)) + vocab_size = self._max_distance + 1 + self._rel_pos_embed = LearnedPositionalEmbedding( + units=self._num_heads * self._head_query_units, + max_length=vocab_size, + weight_initializer=mx.init.Xavier(rnd_type="gaussian", + factor_type="in", + magnitude=1), + mode='wrap' if self._bidirectional else 'raise', + dtype=self._dtype) + elif method == 't5': + if self._num_buckets is None: + self._num_buckets = 32 + if self._max_distance is None: + self._max_distance = 128 + self._rel_pos_embed = BucketPositionalEmbedding( + units=num_heads, + num_buckets=self._num_buckets, + max_distance=self._max_distance, + bidirectional=self._bidirectional, + dtype=self._dtype) + else: + raise NotImplementedError('method="{}" is currently not supported!'.format(method)) def hybrid_forward(self, F, rel_positions, query=None): """ diff --git a/src/gluonnlp/data/loading.py b/src/gluonnlp/data/loading.py index 2932deab3d..4027f78866 100644 --- a/src/gluonnlp/data/loading.py +++ b/src/gluonnlp/data/loading.py @@ -81,7 +81,7 @@ def __init__(self, filename, **kwargs): else: raise ValueError('Unsupported extension: %s' % filename) self._keys = keys - super(NumpyDataset, self).__init__(*data) + super().__init__(*data) @property def keys(self): @@ -125,7 +125,7 @@ def __init__(self, file_pattern): files = sorted(files) if len(files) == 0: raise ValueError('Cannot find any file with path "%s"' % file_pattern) - super(_PathDataset, self).__init__(files) + super().__init__(files) def _dataset_worker_fn(urls, dataset_fn, batch_sampler_fn): diff --git a/src/gluonnlp/initializer.py b/src/gluonnlp/initializer.py index 4789c76c30..4499c69723 100644 --- a/src/gluonnlp/initializer.py +++ b/src/gluonnlp/initializer.py @@ -51,7 +51,7 @@ class TruncNorm(Initializer): """ def __init__(self, mean: float = 0, stdev: float = 0.01, scale=2, **kwargs): - super(TruncNorm, self).__init__(**kwargs) + super().__init__(**kwargs) self._mean = mean self._stdev = stdev self._scale = scale diff --git a/src/gluonnlp/layers.py b/src/gluonnlp/layers.py index 79484be9a4..f19553fd5e 100644 --- a/src/gluonnlp/layers.py 
+++ b/src/gluonnlp/layers.py @@ -24,8 +24,7 @@ from collections import OrderedDict import mxnet as mx from mxnet import use_np -from mxnet.gluon import HybridBlock -from mxnet.gluon import nn +from mxnet.gluon import nn, HybridBlock, Parameter, Constant from typing import Union, Optional, List from .op import relative_position_bucket @@ -118,15 +117,15 @@ class NoNorm(HybridBlock): def __init__(self, in_channels, center=True, scale=True, beta_initializer='zeros', gamma_initializer='ones', **kwargs): - super(NoNorm, self).__init__(**kwargs) + super().__init__(**kwargs) self._kwargs = {'center': center, 'scale': scale} self._in_channels = in_channels - self.gamma = self.params.get('gamma', grad_req='write' if scale else 'null', - shape=(in_channels,), init=gamma_initializer, - allow_deferred_init=True) - self.beta = self.params.get('beta', grad_req='write' if center else 'null', - shape=(in_channels,), init=beta_initializer, - allow_deferred_init=True) + self.gamma = Parameter('gamma', grad_req='write' if scale else 'null', + shape=(in_channels,), init=gamma_initializer, + allow_deferred_init=True) + self.beta = Parameter('beta', grad_req='write' if center else 'null', + shape=(in_channels,), init=beta_initializer, + allow_deferred_init=True) def hybrid_forward(self, F, data, gamma, beta): return data * gamma + beta @@ -234,8 +233,7 @@ def get_activation(act: Optional[Union[str, HybridBlock]]) -> HybridBlock: @use_np class MultiHeadDense(HybridBlock): def __init__(self, units, num_heads, use_bias=True, dtype='float32', - weight_initializer=None, bias_initializer=None, - prefix=None, params=None): + weight_initializer=None, bias_initializer=None): """Multiple Dense with different parameters and the same number of units The inner shapes of the weight and bias are weight: (self._parallel_num[0] * ... * self._parallel_num[k] * units, in_units) @@ -250,10 +248,8 @@ def __init__(self, units, num_heads, use_bias=True, dtype='float32', The data type weight_initializer : None or initialzer, default None bias_initializer : None or initializer, default None - prefix : str or None - params : None """ - super().__init__(prefix=prefix, params=params) + super().__init__() if not isinstance(num_heads, (list, tuple)): num_heads = (int(num_heads),) else: @@ -266,16 +262,15 @@ def __init__(self, units, num_heads, use_bias=True, dtype='float32', ' num_heads={}'.format(num_heads)) self._units = units self._mult = np.prod(num_heads) - with self.name_scope(): - self.weight = self.params.get('weight', shape=(self._mult * units, 0), - init=weight_initializer, dtype=dtype, - allow_deferred_init=True) - if use_bias: - self.bias = self.params.get('bias', shape=(self._mult * units,), - init=bias_initializer, dtype=dtype, - allow_deferred_init=True) - else: - self.bias = None + self.weight = Parameter('weight', shape=(self._mult * units, 0), + init=weight_initializer, dtype=dtype, + allow_deferred_init=True) + if use_bias: + self.bias = Parameter('bias', shape=(self._mult * units,), + init=bias_initializer, dtype=dtype, + allow_deferred_init=True) + else: + self.bias = None def hybrid_forward(self, F, data, weight, bias=None): """ @@ -346,16 +341,14 @@ class GELU(HybridBlock): Outputs: - **out**: output tensor with the same shape as `data`. 
""" - def __init__(self, mode='erf', prefix=None, params=None): + def __init__(self, mode='erf'): """ Parameters ---------- mode - prefix - params """ - super().__init__(prefix=prefix, params=params) + super().__init__() if mode not in ['erf', 'tanh', 'sigmoid']: raise ValueError('Unsupported mode, only support "erf", "tanh", or "sigmoid". ' 'Received mode={}'.format(mode)) @@ -407,24 +400,21 @@ def __repr__(self): @use_np class PositionalEmbedding(HybridBlock): def __init__(self, units, max_length=None, method='sinusoidal', - dtype='float32', prefix=None, params=None): - super().__init__(prefix=prefix, params=params) + dtype='float32'): + super().__init__() self._units = units self._max_length = max_length self._method = method self._dtype = dtype - with self.name_scope(): - if method == 'sinusoidal': - self._embed = SinusoidalPositionalEmbedding(units=units, - dtype=dtype, - prefix='embed_') - elif method == 'learned': - self._embed = LearnedPositionalEmbedding(units=units, - max_length=max_length, - dtype=dtype, - prefix='embed_') - else: - raise NotImplementedError + if method == 'sinusoidal': + self._embed = SinusoidalPositionalEmbedding(units=units, + dtype=dtype) + elif method == 'learned': + self._embed = LearnedPositionalEmbedding(units=units, + max_length=max_length, + dtype=dtype) + else: + raise NotImplementedError def hybrid_forward(self, F, positions): """ @@ -445,7 +435,7 @@ def hybrid_forward(self, F, positions): @use_np class SinusoidalPositionalEmbedding(HybridBlock): - def __init__(self, units: int, dtype: Union[str, type] = 'float32', prefix=None, params=None): + def __init__(self, units: int, dtype: Union[str, type] = 'float32'): """Use a geometric sequence of timescales. Parameters @@ -455,7 +445,7 @@ def __init__(self, units: int, dtype: Union[str, type] = 'float32', prefix=None, dtype The dtype of the inner positional embeddings """ - super().__init__(prefix=prefix, params=params) + super().__init__() def _init_sinusodial_base(units): half_units = units // 2 @@ -465,7 +455,7 @@ def _init_sinusodial_base(units): self._units = units self._dtype = dtype - self.base_mult = self.params.get_constant('base_mult', _init_sinusodial_base(units)) + self.base_mult = Constant(_init_sinusodial_base(units)) def hybrid_forward(self, F, positions, base_mult): """ @@ -501,17 +491,16 @@ def __repr__(self): @use_np class LearnedPositionalEmbedding(HybridBlock): def __init__(self, units, max_length, mode='clip', - dtype='float32', weight_initializer=None, prefix=None, params=None): - super().__init__(prefix=prefix, params=params) + dtype='float32', weight_initializer=None): + super().__init__() self._units = units self._dtype = dtype self._max_length = max_length self._mode = mode - with self.name_scope(): - self.weight = self.params.get('weight', shape=(max_length, units), - init=weight_initializer, dtype=dtype, - allow_deferred_init=True) + self.weight = Parameter('weight', shape=(max_length, units), + init=weight_initializer, dtype=dtype, + allow_deferred_init=True) def __repr__(self): s = '{name}(units={units}, max_length={max_length}, mode={mode}, dtype={dtype})' @@ -538,17 +527,16 @@ class BucketPositionalEmbedding(HybridBlock): of the buckets handles the large shifts (mapping them in logarithmically separated bins). 
""" def __init__(self, units, bidirectional=True, num_buckets=32, max_distance=128, - dtype='float32', embed_initializer=None, prefix=None, params=None): - super().__init__(prefix=prefix, params=params) + dtype='float32', embed_initializer=None): + super().__init__() self._units = units self._bidirectional = bidirectional self._num_buckets = num_buckets self._max_distance = max_distance self._dtype = dtype - with self.name_scope(): - self.weight = self.params.get('weight', shape=(num_buckets, units), - init=embed_initializer, dtype=dtype, - allow_deferred_init=True) + self.weight = Parameter('weight', shape=(num_buckets, units), + init=embed_initializer, dtype=dtype, + allow_deferred_init=True) def __repr__(self): s = '{name}(units={units}, bidirectional={bidirectional}, num_buckets={num_buckets},' \ @@ -588,8 +576,7 @@ def __init__(self, normalization: str = 'layer_norm', layer_norm_eps: float = 1E-5, pre_norm: bool = False, - dtype='float32', - prefix=None, params=None): + dtype='float32'): """ Parameters @@ -611,10 +598,8 @@ def __init__(self, This will stabilize the training of Transformers. You may also refer to "[Arxiv2020] Understanding the Difficulty of Training Transformers" - prefix - params """ - super().__init__(prefix=prefix, params=params) + super().__init__() self._dtype = dtype self._pre_norm = pre_norm self._kwargs = OrderedDict([ @@ -628,29 +613,25 @@ def __init__(self, ('pre_norm', pre_norm), ('dtype', self._dtype) ]) - with self.name_scope(): - self.dropout_layer = nn.Dropout(dropout) - self.activation_dropout_layer = nn.Dropout(activation_dropout) - self.ffn_1 = nn.Dense(units=hidden_size, - in_units=units, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='ffn1_') - self.activation = get_activation(activation) - self.ffn_2 = nn.Dense(units=units, - in_units=hidden_size, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='ffn2_') - # TODO(sxjscience) We may need to set the dtype flag in LayerNorm, need to double check - self.layer_norm = get_layer_norm(normalization=normalization, - in_channels=units, - epsilon=layer_norm_eps, - prefix='ln_') + self.dropout_layer = nn.Dropout(dropout) + self.activation_dropout_layer = nn.Dropout(activation_dropout) + self.ffn_1 = nn.Dense(units=hidden_size, + in_units=units, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + self.activation = get_activation(activation) + self.ffn_2 = nn.Dense(units=units, + in_units=hidden_size, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + # TODO(sxjscience) We may need to set the dtype flag in LayerNorm, need to double check + self.layer_norm = get_layer_norm(normalization=normalization, + in_channels=units, + epsilon=layer_norm_eps) def hybrid_forward(self, F, data): """ @@ -701,9 +682,7 @@ def __init__(self, vocab_size: int, dtype='float32', scaled=True, embedding_initializer: InitializerType = None, - weight_initializer: InitializerType = None, - prefix=None, - params=None): + weight_initializer: InitializerType = None): """ Parameters @@ -730,10 +709,8 @@ def __init__(self, vocab_size: int, Initializer of projection layers bias_initializer Initializer of the bias - prefix - params """ - super().__init__(prefix=prefix, params=params) + super().__init__() cutoffs = _fmt_and_check_cutoffs(cutoffs, vocab_size) if cutoffs is None: assert 
div_val == 1.0 @@ -755,43 +732,42 @@ def __init__(self, vocab_size: int, self._scaled = scaled if self._scaled: self._emb_scale = units**0.5 - with self.name_scope(): - if div_val == 1.0: - setattr(self, 'embed0_weight', - self.params.get('embed0_weight', - shape=(vocab_size, embed_size), - init=embedding_initializer, - allow_deferred_init=True)) - - if units != embed_size: - setattr(self, 'inter_proj0_weight', - self.params.get('inter_proj0_weight', - shape=(embed_size, units), - init=weight_initializer, - allow_deferred_init=True)) - else: - self.proj_layers = None + if div_val == 1.0: + setattr(self, 'embed0_weight', + Parameter('embed0_weight', + shape=(vocab_size, embed_size), + init=embedding_initializer, + allow_deferred_init=True)) + + if units != embed_size: + setattr(self, 'inter_proj0_weight', + Parameter('inter_proj0_weight', + shape=(embed_size, units), + init=weight_initializer, + allow_deferred_init=True)) else: - self.proj_layers = nn.HybridSequential(prefix='inter_proj') - for i, (l_idx, r_idx) in enumerate(zip([0] + cutoffs, cutoffs + [vocab_size])): - inner_embed_size = int(embed_size / div_val**i) - if inner_embed_size == 0: - raise ValueError('div_val = {} is too large for the layer. Currently, the ' - 'cutoffs are {} and the embed_size is {}. Using the ' - 'div_val = {} will cause some clusters to have ' - 'embed_size=0.'.format(div_val, cutoffs, embed_size, - div_val)) - setattr( - self, 'embed{}_weight'.format(i), - self.params.get('embed{}_weight'.format(i), - shape=(r_idx - l_idx, inner_embed_size), - init=embedding_initializer, - allow_deferred_init=True)) - setattr(self, 'inter_proj{}_weight'.format(i), - self.params.get('inter_proj{}_weight'.format(i), - shape=(inner_embed_size, units), - init=weight_initializer, - allow_deferred_init=True)) + self.proj_layers = None + else: + self.proj_layers = nn.HybridSequential() + for i, (l_idx, r_idx) in enumerate(zip([0] + cutoffs, cutoffs + [vocab_size])): + inner_embed_size = int(embed_size / div_val**i) + if inner_embed_size == 0: + raise ValueError('div_val = {} is too large for the layer. Currently, the ' + 'cutoffs are {} and the embed_size is {}. 
Using the ' + 'div_val = {} will cause some clusters to have ' + 'embed_size=0.'.format(div_val, cutoffs, embed_size, + div_val)) + setattr( + self, 'embed{}_weight'.format(i), + Parameter('embed{}_weight'.format(i), + shape=(r_idx - l_idx, inner_embed_size), + init=embedding_initializer, + allow_deferred_init=True)) + setattr(self, 'inter_proj{}_weight'.format(i), + Parameter('inter_proj{}_weight'.format(i), + shape=(inner_embed_size, units), + init=weight_initializer, + allow_deferred_init=True)) def hybrid_forward(self, F, inp, **params): # pylint: disable=arguments-differ """ @@ -886,9 +862,7 @@ def __init__(self, vocab_size: int, embed_size: int, in_units: int, dtype='float32', use_bias=True, weight_initializer: InitializerType = None, - bias_initializer: InitializerType = None, - prefix=None, - params=None): + bias_initializer: InitializerType = None): """ Parameters @@ -910,10 +884,8 @@ def __init__(self, vocab_size: int, embed_size: int, in_units: int, Whether to use bias when computing the scores for the tokens weight_initializer bias_initializer - prefix - params """ - super().__init__(prefix=prefix, params=params) + super().__init__() cutoffs = _fmt_and_check_cutoffs(cutoffs, vocab_size) if cutoffs is None: assert div_val == 1.0 @@ -934,55 +906,45 @@ def __init__(self, vocab_size: int, embed_size: int, in_units: int, ('dtype', dtype), ('use_bias', use_bias) ]) - with self.name_scope(): - if cutoffs is not None: - self.tail_cluster_score_proj = nn.Dense(units=self._num_tail_clusters, - in_units=embed_size, - flatten=False, - use_bias=use_bias, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='tail_cluster_score_proj_') - self.inter_proj_l = nn.HybridSequential(prefix='inter_proj') - self.out_proj_l = nn.HybridSequential(prefix='embed') - if div_val == 1.0: - if in_units != embed_size: - with self.inter_proj_l.name_scope(): - self.inter_proj_l.add(nn.Dense(in_units=in_units, - units=embed_size, - flatten=False, - use_bias=False, - prefix='0_', - weight_initializer=weight_initializer, - bias_initializer=bias_initializer)) - with self.out_proj_l.name_scope(): - self.out_proj_l.add(nn.Dense(in_units=embed_size, - units=vocab_size, - flatten=False, - use_bias=use_bias, - prefix='0_', - weight_initializer=weight_initializer, - bias_initializer=bias_initializer)) - else: - for i, (l_idx, r_idx) in enumerate(zip([0] + self._cutoffs, - self._cutoffs + [vocab_size])): - ele_embed_size = int(embed_size / (div_val ** i)) - with self.inter_proj_l.name_scope(): - self.inter_proj_l.add(nn.Dense(in_units=in_units, - units=ele_embed_size, - flatten=False, - use_bias=False, - prefix='{}_'.format(i), - weight_initializer=weight_initializer, - bias_initializer=bias_initializer)) - with self.out_proj_l.name_scope(): - self.out_proj_l.add(nn.Dense(in_units=ele_embed_size, - units=r_idx - l_idx, - flatten=False, - use_bias=use_bias, - prefix='{}_'.format(i), - weight_initializer=weight_initializer, - bias_initializer=bias_initializer)) + if cutoffs is not None: + self.tail_cluster_score_proj = nn.Dense(units=self._num_tail_clusters, + in_units=embed_size, + flatten=False, + use_bias=use_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) + self.inter_proj_l = nn.HybridSequential() + self.out_proj_l = nn.HybridSequential() + if div_val == 1.0: + if in_units != embed_size: + self.inter_proj_l.add(nn.Dense(in_units=in_units, + units=embed_size, + flatten=False, + use_bias=False, + weight_initializer=weight_initializer, + 
bias_initializer=bias_initializer)) + self.out_proj_l.add(nn.Dense(in_units=embed_size, + units=vocab_size, + flatten=False, + use_bias=use_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + else: + for i, (l_idx, r_idx) in enumerate(zip([0] + self._cutoffs, + self._cutoffs + [vocab_size])): + ele_embed_size = int(embed_size / (div_val ** i)) + self.inter_proj_l.add(nn.Dense(in_units=in_units, + units=ele_embed_size, + flatten=False, + use_bias=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.out_proj_l.add(nn.Dense(in_units=ele_embed_size, + units=r_idx - l_idx, + flatten=False, + use_bias=use_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) def get_logits(self, F, hidden): """Get all the logits. diff --git a/src/gluonnlp/loss.py b/src/gluonnlp/loss.py index 998a7863cb..f592399fad 100644 --- a/src/gluonnlp/loss.py +++ b/src/gluonnlp/loss.py @@ -29,7 +29,7 @@ class LabelSmoothCrossEntropyLoss(HybridBlock): Whether input is a log probability (usually from log_softmax) instead of unnormalized numbers. """ def __init__(self, num_labels: int, alpha: float = 0.1, from_logits: bool = False, **kwargs): - super(LabelSmoothCrossEntropyLoss, self).__init__(**kwargs) + super().__init__(**kwargs) self._num_labels = num_labels self._alpha = alpha self._from_logits = from_logits diff --git a/src/gluonnlp/lr_scheduler.py b/src/gluonnlp/lr_scheduler.py index b297780258..c2c5c490a0 100644 --- a/src/gluonnlp/lr_scheduler.py +++ b/src/gluonnlp/lr_scheduler.py @@ -23,7 +23,7 @@ class InverseSquareRootScheduler(lr_scheduler.LRScheduler): """ def __init__(self, warmup_steps: int, base_lr: float = 1E-3, warmup_init_lr: float = 0.0): - super(InverseSquareRootScheduler, self).__init__( + super().__init__( base_lr, warmup_steps, warmup_init_lr, 'linear') self.base_lr = base_lr self.warmup_steps = warmup_steps diff --git a/src/gluonnlp/models/albert.py b/src/gluonnlp/models/albert.py index fc8064d63f..8ad154a96c 100644 --- a/src/gluonnlp/models/albert.py +++ b/src/gluonnlp/models/albert.py @@ -93,8 +93,8 @@ def __init__(self, units=512, hidden_size=2048, layer_norm_eps=1E-12, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', - activation='gelu', prefix=None, params=None): - super().__init__(prefix=prefix, params=params) + activation='gelu'): + super().__init__() assert units % num_heads == 0,\ 'In AlbertEncoder, The units should be divided exactly ' \ 'by the number of heads. 
Received units={}, num_heads={}' \ @@ -109,22 +109,18 @@ def __init__(self, units=512, hidden_size=2048, self._output_attention = output_attention self._output_all_encodings = output_all_encodings - with self.name_scope(): - self.all_encoder_groups = nn.HybridSequential(prefix='groups_') - - with self.all_encoder_groups.name_scope(): - for group_idx in range(num_groups): - self.all_encoder_groups.add( - TransformerEncoderLayer(units=units, - hidden_size=hidden_size, - num_heads=num_heads, - attention_dropout_prob=attention_dropout_prob, - hidden_dropout_prob=hidden_dropout_prob, - layer_norm_eps=layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - activation=activation, - prefix='{}_'.format(group_idx))) + self.all_encoder_groups = nn.HybridSequential() + for group_idx in range(num_groups): + self.all_encoder_groups.add( + TransformerEncoderLayer(units=units, + hidden_size=hidden_size, + num_heads=num_heads, + attention_dropout_prob=attention_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + layer_norm_eps=layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + activation=activation)) def hybrid_forward(self, F, data, valid_length): """ @@ -195,10 +191,8 @@ def __init__(self, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', dtype='float32', - use_pooler=True, - prefix=None, - params=None): - super().__init__(prefix=prefix, params=params) + use_pooler=True): + super().__init__() self._dtype = dtype self.use_pooler = use_pooler self.pos_embed_type = pos_embed_type @@ -212,60 +206,52 @@ def __init__(self, self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps - with self.name_scope(): - # Construct AlbertEncoder - self.encoder = AlbertEncoder( - units=units, - hidden_size=hidden_size, - num_layers=num_layers, - num_heads=num_heads, - num_groups=num_groups, - attention_dropout_prob=attention_dropout_prob, - hidden_dropout_prob=hidden_dropout_prob, - output_attention=False, - output_all_encodings=False, - activation=activation, - layer_norm_eps=layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='enc_', - ) - self.encoder.hybridize() - # Construct word embedding - self.word_embed = nn.Embedding(input_dim=vocab_size, - output_dim=embed_size, - weight_initializer=embed_initializer, - dtype=dtype, - prefix='word_embed_') - if embed_size != units: - self.embed_factorized_proj = nn.Dense(units=units, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='embed_factorized_proj_') - self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps, - prefix='embed_ln_') - self.embed_dropout = nn.Dropout(hidden_dropout_prob) - # Construct token type embedding - self.token_type_embed = nn.Embedding(input_dim=num_token_types, - output_dim=embed_size, - weight_initializer=weight_initializer, - prefix='token_type_embed_') - self.token_pos_embed = PositionalEmbedding(units=embed_size, - max_length=max_length, - dtype=self._dtype, - method=pos_embed_type, - prefix='token_pos_embed_') - if self.use_pooler: - # Construct pooler - self.pooler = nn.Dense(units=units, - in_units=units, - flatten=False, - activation='tanh', - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='pooler_') + # Construct AlbertEncoder + self.encoder = AlbertEncoder( + units=units, + hidden_size=hidden_size, + 
num_layers=num_layers, + num_heads=num_heads, + num_groups=num_groups, + attention_dropout_prob=attention_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + output_attention=False, + output_all_encodings=False, + activation=activation, + layer_norm_eps=layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype, + ) + self.encoder.hybridize() + # Construct word embedding + self.word_embed = nn.Embedding(input_dim=vocab_size, + output_dim=embed_size, + weight_initializer=embed_initializer, + dtype=dtype) + if embed_size != units: + self.embed_factorized_proj = nn.Dense(units=units, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) + self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps) + self.embed_dropout = nn.Dropout(hidden_dropout_prob) + # Construct token type embedding + self.token_type_embed = nn.Embedding(input_dim=num_token_types, + output_dim=embed_size, + weight_initializer=weight_initializer) + self.token_pos_embed = PositionalEmbedding(units=embed_size, + max_length=max_length, + dtype=self._dtype, + method=pos_embed_type) + if self.use_pooler: + # Construct pooler + self.pooler = nn.Dense(units=units, + in_units=units, + flatten=False, + activation='tanh', + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) def hybrid_forward(self, F, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ @@ -387,7 +373,7 @@ def get_cfg(key=None): return cfg @classmethod - def from_cfg(cls, cfg, use_pooler=True, prefix=None, params=None) -> 'AlbertModel': + def from_cfg(cls, cfg, use_pooler=True) -> 'AlbertModel': """ Parameters @@ -395,8 +381,6 @@ def from_cfg(cls, cfg, use_pooler=True, prefix=None, params=None) -> 'AlbertMode cfg use_pooler Whether to use pooler - prefix - params Returns ------- @@ -426,18 +410,14 @@ def from_cfg(cls, cfg, use_pooler=True, prefix=None, params=None) -> 'AlbertMode embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - use_pooler=use_pooler, - prefix=prefix, - params=params) + use_pooler=use_pooler) @use_np class AlbertForMLM(HybridBlock): def __init__(self, backbone_cfg, weight_initializer=None, - bias_initializer=None, - prefix=None, - params=None): + bias_initializer=None): """ Parameters @@ -445,36 +425,29 @@ def __init__(self, backbone_cfg, backbone_cfg weight_initializer bias_initializer - prefix - params """ - super().__init__(prefix=prefix, params=params) - with self.name_scope(): - self.backbone_model = AlbertModel.from_cfg(backbone_cfg, prefix='') - if weight_initializer is None: - weight_initializer = self.backbone_model.weight_initializer - if bias_initializer is None: - bias_initializer = self.backbone_model.bias_initializer - self.mlm_decoder = nn.HybridSequential(prefix='mlm_') - with self.mlm_decoder.name_scope(): - # Extra non-linear layer - self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='proj_')) - self.mlm_decoder.add(get_activation(self.backbone_model.activation)) - self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps, - prefix='ln_')) - # only load the dense weights with a re-initialized bias - # parameters are stored in 'word_embed_bias' which is - # not used in original embedding - self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size, - flatten=False, - 
params=self.backbone_model.word_embed.collect_params('.*weight'), - bias_initializer=bias_initializer, - prefix='score_')) - self.mlm_decoder.hybridize() + super().__init__() + self.backbone_model = AlbertModel.from_cfg(backbone_cfg) + if weight_initializer is None: + weight_initializer = self.backbone_model.weight_initializer + if bias_initializer is None: + bias_initializer = self.backbone_model.bias_initializer + self.mlm_decoder = nn.HybridSequential() + # Extra non-linear layer + self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.mlm_decoder.add(get_activation(self.backbone_model.activation)) + self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps)) + # only load the dense weights with a re-initialized bias + # parameters are stored in 'word_embed_bias' which is + # not used in original embedding + self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size, + flatten=False, + bias_initializer=bias_initializer)) + self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight + self.mlm_decoder.hybridize() def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): @@ -515,8 +488,7 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, class AlbertForPretrain(HybridBlock): def __init__(self, backbone_cfg, weight_initializer=None, - bias_initializer=None, - prefix=None, params=None): + bias_initializer=None): """ Parameters @@ -525,40 +497,32 @@ def __init__(self, backbone_cfg, The cfg of the backbone model weight_initializer bias_initializer - prefix - params """ - super().__init__(prefix=prefix, params=params) - with self.name_scope(): - self.backbone_model = AlbertModel.from_cfg(backbone_cfg, prefix='') - if weight_initializer is None: - weight_initializer = self.backbone_model.weight_initializer - if bias_initializer is None: - bias_initializer = self.backbone_model.bias_initializer - # Construct sop_classifier for sentence order prediction - self.sop_classifier = nn.Dense(units=2, - weight_initializer=weight_initializer, - prefix='sop_') - self.mlm_decoder = nn.HybridSequential(prefix='mlm_') - with self.mlm_decoder.name_scope(): - # Extra non-linear layer - self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='proj_')) - self.mlm_decoder.add(get_activation(self.backbone_model.activation)) - self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps, - prefix='ln_')) - # only load the dense weights with a re-initialized bias - # parameters are stored in 'word_embed_bias' which is - # not used in original embedding - self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size, - flatten=False, - params=self.backbone_model.word_embed.collect_params('.*weight'), - bias_initializer=bias_initializer, - prefix='score_')) - self.mlm_decoder.hybridize() + super().__init__() + self.backbone_model = AlbertModel.from_cfg(backbone_cfg) + if weight_initializer is None: + weight_initializer = self.backbone_model.weight_initializer + if bias_initializer is None: + bias_initializer = self.backbone_model.bias_initializer + # Construct sop_classifier for sentence order prediction + self.sop_classifier = nn.Dense(units=2, + weight_initializer=weight_initializer) + self.mlm_decoder = nn.HybridSequential() + # Extra non-linear layer + 
self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.mlm_decoder.add(get_activation(self.backbone_model.activation)) + self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps)) + # only load the dense weights with a re-initialized bias + # parameters are stored in 'word_embed_bias' which is + # not used in original embedding + self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size, + flatten=False, + bias_initializer=bias_initializer)) + self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight + self.mlm_decoder.hybridize() def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): diff --git a/src/gluonnlp/models/bert.py b/src/gluonnlp/models/bert.py index 47c75398ef..6fe81e8da4 100644 --- a/src/gluonnlp/models/bert.py +++ b/src/gluonnlp/models/bert.py @@ -119,9 +119,8 @@ def __init__(self, units: int = 512, layer_norm_eps: float = 1E-12, weight_initializer: InitializerType = TruncNorm(stdev=0.02), bias_initializer: InitializerType = 'zeros', - activation='gelu', - prefix=None, params=None): - super().__init__(prefix=prefix, params=params) + activation='gelu'): + super().__init__() assert units % num_heads == 0,\ 'In BertTransformer, The units should be divided exactly ' \ 'by the number of heads. Received units={}, num_heads={}' \ @@ -132,21 +131,18 @@ def __init__(self, units: int = 512, self._output_attention = output_attention self._output_all_encodings = output_all_encodings - with self.name_scope(): - self.all_layers = nn.HybridSequential(prefix='layers_') - with self.all_layers.name_scope(): - for layer_idx in range(num_layers): - self.all_layers.add( - TransformerEncoderLayer(units=units, - hidden_size=hidden_size, - num_heads=num_heads, - attention_dropout_prob=attention_dropout_prob, - hidden_dropout_prob=hidden_dropout_prob, - layer_norm_eps=layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - activation=activation, - prefix='{}_'.format(layer_idx))) + self.all_layers = nn.HybridSequential() + for layer_idx in range(num_layers): + self.all_layers.add( + TransformerEncoderLayer(units=units, + hidden_size=hidden_size, + num_heads=num_heads, + attention_dropout_prob=attention_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + layer_norm_eps=layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + activation=activation)) def hybrid_forward(self, F, data, valid_length): """ @@ -214,10 +210,8 @@ def __init__(self, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', dtype='float32', - use_pooler=True, - prefix=None, - params=None): - super().__init__(prefix=prefix, params=params) + use_pooler=True): + super().__init__() self._dtype = dtype self.use_pooler = use_pooler self.pos_embed_type = pos_embed_type @@ -230,53 +224,46 @@ def __init__(self, self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps - with self.name_scope(): - # Construct BertTransformer - self.encoder = BertTransformer( - units=units, - hidden_size=hidden_size, - num_layers=num_layers, - num_heads=num_heads, - attention_dropout_prob=attention_dropout_prob, - hidden_dropout_prob=hidden_dropout_prob, - output_attention=False, - output_all_encodings=False, - activation=activation, - layer_norm_eps=layer_norm_eps, - weight_initializer=weight_initializer, - 
bias_initializer=bias_initializer, - dtype=dtype, - prefix='enc_', - ) - self.encoder.hybridize() - # Construct word embedding - self.word_embed = nn.Embedding(input_dim=vocab_size, - output_dim=units, - weight_initializer=embed_initializer, - dtype=dtype, - prefix='word_embed_') - self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps, - prefix='embed_ln_') - self.embed_dropout = nn.Dropout(hidden_dropout_prob) - # Construct token type embedding - self.token_type_embed = nn.Embedding(input_dim=num_token_types, - output_dim=units, - weight_initializer=weight_initializer, - prefix='token_type_embed_') - self.token_pos_embed = PositionalEmbedding(units=units, - max_length=max_length, - dtype=self._dtype, - method=pos_embed_type, - prefix='token_pos_embed_') - if self.use_pooler: - # Construct pooler - self.pooler = nn.Dense(units=units, - in_units=units, - flatten=False, - activation='tanh', - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='pooler_') + # Construct BertTransformer + self.encoder = BertTransformer( + units=units, + hidden_size=hidden_size, + num_layers=num_layers, + num_heads=num_heads, + attention_dropout_prob=attention_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + output_attention=False, + output_all_encodings=False, + activation=activation, + layer_norm_eps=layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype, + ) + self.encoder.hybridize() + # Construct word embedding + self.word_embed = nn.Embedding(input_dim=vocab_size, + output_dim=units, + weight_initializer=embed_initializer, + dtype=dtype) + self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps) + self.embed_dropout = nn.Dropout(hidden_dropout_prob) + # Construct token type embedding + self.token_type_embed = nn.Embedding(input_dim=num_token_types, + output_dim=units, + weight_initializer=weight_initializer) + self.token_pos_embed = PositionalEmbedding(units=units, + max_length=max_length, + dtype=self._dtype, + method=pos_embed_type) + if self.use_pooler: + # Construct pooler + self.pooler = nn.Dense(units=units, + in_units=units, + flatten=False, + activation='tanh', + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) def hybrid_forward(self, F, inputs, token_types, valid_length): # pylint: disable=arguments-differ @@ -394,7 +381,7 @@ def get_cfg(key=None): return cfg @classmethod - def from_cfg(cls, cfg, use_pooler=True, prefix=None, params=None): + def from_cfg(cls, cfg, use_pooler=True): cfg = BertModel.get_cfg().clone_merge(cfg) assert cfg.VERSION == 1, 'Wrong version!' 
embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) @@ -416,18 +403,14 @@ def from_cfg(cls, cfg, use_pooler=True, prefix=None, params=None): embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - use_pooler=use_pooler, - prefix=prefix, - params=params) + use_pooler=use_pooler) @use_np class BertForMLM(HybridBlock): def __init__(self, backbone_cfg, weight_initializer=None, - bias_initializer=None, - prefix=None, - params=None): + bias_initializer=None): """ Parameters @@ -435,36 +418,29 @@ def __init__(self, backbone_cfg, backbone_cfg weight_initializer bias_initializer - prefix - params """ - super().__init__(prefix=prefix, params=params) - with self.name_scope(): - self.backbone_model = BertModel.from_cfg(backbone_cfg, prefix='') - if weight_initializer is None: - weight_initializer = self.backbone_model.weight_initializer - if bias_initializer is None: - bias_initializer = self.backbone_model.bias_initializer - self.mlm_decoder = nn.HybridSequential(prefix='mlm_') - with self.mlm_decoder.name_scope(): - # Extra non-linear layer - self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='proj_')) - self.mlm_decoder.add(get_activation(self.backbone_model.activation)) - self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps, - prefix='ln_')) - # only load the dense weights with a re-initialized bias - # parameters are stored in 'word_embed_bias' which is - # not used in original embedding - self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size, - flatten=False, - params=self.backbone_model.word_embed.collect_params('.*weight'), - bias_initializer=bias_initializer, - prefix='score_')) - self.mlm_decoder.hybridize() + super().__init__() + self.backbone_model = BertModel.from_cfg(backbone_cfg) + if weight_initializer is None: + weight_initializer = self.backbone_model.weight_initializer + if bias_initializer is None: + bias_initializer = self.backbone_model.bias_initializer + self.mlm_decoder = nn.HybridSequential() + # Extra non-linear layer + self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.mlm_decoder.add(get_activation(self.backbone_model.activation)) + self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps)) + # only load the dense weights with a re-initialized bias + # parameters are stored in 'word_embed_bias' which is + # not used in original embedding + self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size, + flatten=False, + bias_initializer=bias_initializer)) + self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight + self.mlm_decoder.hybridize() def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): @@ -506,8 +482,7 @@ def hybrid_forward(self, F, inputs, token_types, valid_length, class BertForPretrain(HybridBlock): def __init__(self, backbone_cfg, weight_initializer=None, - bias_initializer=None, - prefix=None, params=None): + bias_initializer=None): """ Parameters @@ -516,40 +491,32 @@ def __init__(self, backbone_cfg, The cfg of the backbone model weight_initializer bias_initializer - prefix - params """ - super().__init__(prefix=prefix, params=params) - with self.name_scope(): - self.backbone_model = BertModel.from_cfg(backbone_cfg, prefix='') - if weight_initializer is None: - 
weight_initializer = self.backbone_model.weight_initializer - if bias_initializer is None: - bias_initializer = self.backbone_model.bias_initializer - # Construct nsp_classifier for next sentence prediction - self.nsp_classifier = nn.Dense(units=2, - weight_initializer=weight_initializer, - prefix='nsp_') - self.mlm_decoder = nn.HybridSequential(prefix='mlm_') - with self.mlm_decoder.name_scope(): - # Extra non-linear layer - self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='proj_')) - self.mlm_decoder.add(get_activation(self.backbone_model.activation)) - self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps, - prefix='ln_')) - # only load the dense weights with a re-initialized bias - # parameters are stored in 'word_embed_bias' which is - # not used in original embedding - self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size, - flatten=False, - params=self.backbone_model.word_embed.collect_params('.*weight'), - bias_initializer=bias_initializer, - prefix='score_')) - self.mlm_decoder.hybridize() + super().__init__() + self.backbone_model = BertModel.from_cfg(backbone_cfg) + if weight_initializer is None: + weight_initializer = self.backbone_model.weight_initializer + if bias_initializer is None: + bias_initializer = self.backbone_model.bias_initializer + # Construct nsp_classifier for next sentence prediction + self.nsp_classifier = nn.Dense(units=2, + weight_initializer=weight_initializer) + self.mlm_decoder = nn.HybridSequential() + # Extra non-linear layer + self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.mlm_decoder.add(get_activation(self.backbone_model.activation)) + self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps)) + # only load the dense weights with a re-initialized bias + # parameters are stored in 'word_embed_bias' which is + # not used in original embedding + self.mlm_decoder.add(nn.Dense(units=self.backbone_model.vocab_size, + flatten=False, + bias_initializer=bias_initializer)) + self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight + self.mlm_decoder.hybridize() def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): diff --git a/src/gluonnlp/models/electra.py b/src/gluonnlp/models/electra.py index d896b0b50e..7f9512d915 100644 --- a/src/gluonnlp/models/electra.py +++ b/src/gluonnlp/models/electra.py @@ -107,8 +107,8 @@ def __init__(self, units=512, layer_norm_eps=1E-12, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', - activation='gelu', prefix=None, params=None): - super().__init__(prefix=prefix, params=params) + activation='gelu'): + super().__init__() assert units % num_heads == 0, \ 'In ElectraEncoder, The units should be divisible ' \ 'by the number of heads. 
Received units={}, num_heads={}' \ @@ -120,21 +120,18 @@ def __init__(self, units=512, self._output_attention = output_attention self._output_all_encodings = output_all_encodings - with self.name_scope(): - self.all_encoder_layers = nn.HybridSequential(prefix='layers_') - with self.all_encoder_layers.name_scope(): - for layer_idx in range(num_layers): - self.all_encoder_layers.add( - TransformerEncoderLayer(units=units, - hidden_size=hidden_size, - num_heads=num_heads, - attention_dropout_prob=attention_dropout_prob, - hidden_dropout_prob=hidden_dropout_prob, - layer_norm_eps=layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - activation=activation, - prefix='{}_'.format(layer_idx))) + self.all_encoder_layers = nn.HybridSequential() + for layer_idx in range(num_layers): + self.all_encoder_layers.add( + TransformerEncoderLayer(units=units, + hidden_size=hidden_size, + num_heads=num_heads, + attention_dropout_prob=attention_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + layer_norm_eps=layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + activation=activation)) def hybrid_forward(self, F, data, valid_length): """ @@ -208,15 +205,8 @@ def __init__(self, weight_initializer=TruncNorm(stdev=0.02), bias_initializer='zeros', dtype='float32', - use_pooler=True, - tied_embeddings=False, - word_embed_params=None, - token_type_embed_params=None, - token_pos_embed_params=None, - embed_layer_norm_params=None, - prefix=None, - params=None): - super().__init__(prefix=prefix, params=params) + use_pooler=True): + super().__init__() self._dtype = dtype self.use_pooler = use_pooler self.pos_embed_type = pos_embed_type @@ -230,65 +220,44 @@ def __init__(self, self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps - with self.name_scope(): - # Construct ElectraEncoder - self.encoder = ElectraEncoder( - units=units, - hidden_size=hidden_size, - num_layers=num_layers, - num_heads=num_heads, - attention_dropout_prob=attention_dropout_prob, - hidden_dropout_prob=hidden_dropout_prob, - output_attention=False, - output_all_encodings=False, - activation=activation, - layer_norm_eps=layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='enc_', - ) - self.encoder.hybridize() - # Construct model embedding which consists of three parts including word embedding, - # type embedding and positional embeddings. - # The hyper-parameters "tied_embeddings" is particularly - # used for sharing the embeddings between the Electra generator and the - # Electra discriminator. 
- if tied_embeddings: - assert word_embed_params is not None - assert token_type_embed_params is not None - assert token_pos_embed_params is not None - assert embed_layer_norm_params is not None - - self.word_embed = nn.Embedding(input_dim=vocab_size, - output_dim=embed_size, - weight_initializer=embed_initializer, - dtype=dtype, - params=word_embed_params, - prefix='word_embed_') - # Construct token type embedding - self.token_type_embed = nn.Embedding(input_dim=num_token_types, - output_dim=embed_size, - weight_initializer=weight_initializer, - params=token_type_embed_params, - prefix='token_type_embed_') - self.token_pos_embed = PositionalEmbedding(units=embed_size, - max_length=max_length, - dtype=self._dtype, - method=pos_embed_type, - params=token_pos_embed_params, - prefix='token_pos_embed_') - self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps, - params=embed_layer_norm_params, - prefix='embed_ln_') - - self.embed_dropout = nn.Dropout(hidden_dropout_prob) - if embed_size != units: - self.embed_factorized_proj = nn.Dense(units=units, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='embed_factorized_proj_') + # Construct ElectraEncoder + self.encoder = ElectraEncoder( + units=units, + hidden_size=hidden_size, + num_layers=num_layers, + num_heads=num_heads, + attention_dropout_prob=attention_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + output_attention=False, + output_all_encodings=False, + activation=activation, + layer_norm_eps=layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype, + ) + self.encoder.hybridize() + + self.word_embed = nn.Embedding(input_dim=vocab_size, + output_dim=embed_size, + weight_initializer=embed_initializer, + dtype=dtype) + # Construct token type embedding + self.token_type_embed = nn.Embedding(input_dim=num_token_types, + output_dim=embed_size, + weight_initializer=weight_initializer) + self.token_pos_embed = PositionalEmbedding(units=embed_size, + max_length=max_length, + dtype=self._dtype, + method=pos_embed_type) + self.embed_layer_norm = nn.LayerNorm(epsilon=self.layer_norm_eps) + + self.embed_dropout = nn.Dropout(hidden_dropout_prob) + if embed_size != units: + self.embed_factorized_proj = nn.Dense(units=units, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) def hybrid_forward(self, F, inputs, token_types, valid_length=None): # pylint: disable=arguments-differ @@ -403,16 +372,7 @@ def get_cfg(key=None): return cfg @classmethod - def from_cfg(cls, - cfg, - use_pooler=True, - tied_embeddings=False, - word_embed_params=None, - token_type_embed_params=None, - token_pos_embed_params=None, - embed_layer_norm_params=None, - prefix=None, - params=None): + def from_cfg(cls, cfg, use_pooler=True): cfg = ElectraModel.get_cfg().clone_merge(cfg) assert cfg.VERSION == 1, 'Wrong version!' 
embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) @@ -435,14 +395,7 @@ def from_cfg(cls, embed_initializer=embed_initializer, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - use_pooler=use_pooler, - tied_embeddings=tied_embeddings, - word_embed_params=word_embed_params, - token_type_embed_params=token_type_embed_params, - token_pos_embed_params=token_pos_embed_params, - embed_layer_norm_params=embed_layer_norm_params, - prefix=prefix, - params=params) + use_pooler=use_pooler) @use_np @@ -456,9 +409,7 @@ class ElectraDiscriminator(HybridBlock): def __init__(self, backbone_cfg, weight_initializer=None, - bias_initializer=None, - prefix=None, - params=None): + bias_initializer=None): """ Parameters @@ -466,31 +417,25 @@ def __init__(self, backbone_cfg, backbone_cfg weight_initializer bias_initializer - prefix - params """ - super().__init__(prefix=prefix, params=params) - with self.name_scope(): - self.backbone_model = ElectraModel.from_cfg(backbone_cfg, prefix='') - if weight_initializer is None: - weight_initializer = self.backbone_model.weight_initializer - if bias_initializer is None: - bias_initializer = self.backbone_model.bias_initializer - self.rtd_encoder = nn.HybridSequential(prefix='disc_') - with self.rtd_encoder.name_scope(): - # Extra non-linear layer - self.rtd_encoder.add(nn.Dense(units=self.backbone_model.units, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='proj_')) - self.rtd_encoder.add(get_activation(self.backbone_model.activation)) - self.rtd_encoder.add(nn.Dense(units=1, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='predctions_')) - self.rtd_encoder.hybridize() + super().__init__() + self.backbone_model = ElectraModel.from_cfg(backbone_cfg) + if weight_initializer is None: + weight_initializer = self.backbone_model.weight_initializer + if bias_initializer is None: + bias_initializer = self.backbone_model.bias_initializer + self.rtd_encoder = nn.HybridSequential() + # Extra non-linear layer + self.rtd_encoder.add(nn.Dense(units=self.backbone_model.units, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.rtd_encoder.add(get_activation(self.backbone_model.activation)) + self.rtd_encoder.add(nn.Dense(units=1, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.rtd_encoder.hybridize() def hybrid_forward(self, F, inputs, token_types, valid_length): """Getting the scores of the replaced token detection of the whole sentence @@ -531,72 +476,49 @@ class ElectraGenerator(HybridBlock): """ def __init__(self, backbone_cfg, - tied_embeddings=True, - word_embed_params=None, - token_type_embed_params=None, - token_pos_embed_params=None, - embed_layer_norm_params=None, weight_initializer=None, - bias_initializer=None, - prefix=None, - params=None): + bias_initializer=None): """ Parameters ---------- backbone_cfg Configuration of the backbone model - tied_embeddings - Reuse the embeddings of discriminator - word_embed_params - The parameters to load into word embeddings - token_type_embed_params - The parameters to load into word token type embeddings - token_pos_embed_params - The parameters to load into token positional embeddings - embed_layer_norm_params - The parameters to load into token layer normalization layer of embeddings weight_initializer bias_initializer - prefix - params """ - super().__init__(prefix=prefix, 
params=params) - with self.name_scope(): - self.backbone_model = ElectraModel.from_cfg( - backbone_cfg, - tied_embeddings=tied_embeddings, - word_embed_params=word_embed_params, - token_type_embed_params=token_type_embed_params, - token_pos_embed_params=token_pos_embed_params, - embed_layer_norm_params=embed_layer_norm_params, - prefix='') - if weight_initializer is None: - weight_initializer = self.backbone_model.weight_initializer - if bias_initializer is None: - bias_initializer = self.backbone_model.bias_initializer - self.mlm_decoder = nn.HybridSequential(prefix='gen_') - with self.mlm_decoder.name_scope(): - # Extra non-linear layer - self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='proj_')) - self.mlm_decoder.add(get_activation(self.backbone_model.activation)) - self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps, - prefix='ln_')) - # only load the dense weights with a re-initialized bias - # parameters are stored in 'word_embed_bias' which is - # not used in original embedding - self.mlm_decoder.add( - nn.Dense( - units=self.backbone_model.vocab_size, - flatten=False, - params=self.backbone_model.word_embed.collect_params('.*weight'), - bias_initializer=bias_initializer, - prefix='score_')) - self.mlm_decoder.hybridize() + super().__init__() + self.backbone_model = ElectraModel.from_cfg(backbone_cfg) + if weight_initializer is None: + weight_initializer = self.backbone_model.weight_initializer + if bias_initializer is None: + bias_initializer = self.backbone_model.bias_initializer + self.mlm_decoder = nn.HybridSequential() + # Extra non-linear layer + self.mlm_decoder.add(nn.Dense(units=self.backbone_model.embed_size, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.mlm_decoder.add(get_activation(self.backbone_model.activation)) + self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps)) + # only load the dense weights with a re-initialized bias + # parameters are stored in 'word_embed_bias' which is + # not used in original embedding + self.mlm_decoder.add( + nn.Dense( + units=self.backbone_model.vocab_size, + flatten=False, + bias_initializer=bias_initializer)) + self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight + self.mlm_decoder.hybridize() + + def tie_embeddings(self, word_embed_params=None, token_type_embed_params=None, + token_pos_embed_params=None, embed_layer_norm_params=None): + self.backbone_model.word_embed.share_parameters(word_embed_params) + self.mlm_decoder[-1].share_parameters(word_embed_params) + self.backbone_model.token_type_embed.share_parameters(token_type_embed_params) + self.backbone_model.token_pos_embed.share_parameters(token_pos_embed_params) + self.backbone_model.embed_layer_norm.share_parameters(embed_layer_norm_params) def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): """Getting the scores of the masked positions. 
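Note on the parameter-sharing changes above: with the `prefix=`/`params=` constructor arguments removed, weight tying is now expressed either by reassigning the `Parameter` object directly (as in `self.mlm_decoder[-1].weight = self.backbone_model.word_embed.weight`) or by calling `share_parameters()` on an already-built block (as in `ElectraGenerator.tie_embeddings`). The sketch below is illustrative only and not part of the patch; the toy `vocab_size`/`units` values and block names are assumptions.

```python
# Illustrative sketch of the two Gluon 2.0 sharing idioms used in this patch.
# Toy sizes are assumptions, not values from the real models.
import mxnet as mx
from mxnet.gluon import nn

mx.npx.set_np()
vocab_size, units = 100, 16

embed = nn.Embedding(input_dim=vocab_size, output_dim=units)
decoder = nn.Dense(units=vocab_size, in_units=units, flatten=False)

# Idiom 1: reassign the Parameter object, as the MLM decoders now do.
decoder.weight = embed.weight

# Idiom 2: share a collected ParameterDict into another block,
# as ElectraGenerator.tie_embeddings() now does.
other_embed = nn.Embedding(input_dim=vocab_size, output_dim=units)
other_embed.share_parameters(embed.collect_params())

embed.initialize()
decoder.bias.initialize()
tokens = mx.np.array([[1, 2, 3]])
scores = decoder(embed(tokens))        # shape (1, 3, vocab_size)
assert decoder.weight is embed.weight  # shared object, not a copy
```

Either way the tied blocks end up holding the same `Parameter` object, so there is a single weight array and gradients from both uses accumulate into it during training.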
@@ -652,9 +574,7 @@ def __init__(self, temperature=1.0, dtype='float32', weight_initializer=None, - bias_initializer=None, - prefix=None, - params=None): + bias_initializer=None): """ Parameters @@ -677,10 +597,8 @@ def __init__(self, Temperature of gumbel distribution for sampling from generator weight_initializer bias_initializer - prefix - params """ - super().__init__(prefix=prefix, params=params) + super().__init__() self._uniform_generator = uniform_generator self._tied_generator = tied_generator self._tied_embeddings = tied_embeddings @@ -691,34 +609,21 @@ def __init__(self, self.disc_cfg = disc_cfg self.vocab_size = disc_cfg.MODEL.vocab_size self.gen_cfg = get_generator_cfg(disc_cfg) - self.discriminator = ElectraDiscriminator(disc_cfg, prefix='electra_') + self.discriminator = ElectraDiscriminator(disc_cfg) self.disc_backbone = self.discriminator.backbone_model - if tied_embeddings: - word_embed_params = self.disc_backbone.word_embed.collect_params() - token_type_embed_params = self.disc_backbone.token_pos_embed.collect_params() - token_pos_embed_params = self.disc_backbone.token_pos_embed.collect_params() - embed_layer_norm_params = self.disc_backbone.embed_layer_norm.collect_params() - else: - word_embed_params = None - token_type_embed_params = None - token_pos_embed_params = None - embed_layer_norm_params = None if not uniform_generator and not tied_generator: - self.generator = ElectraGenerator( - self.gen_cfg, - tied_embeddings=tied_embeddings, - word_embed_params=word_embed_params, - token_type_embed_params=token_type_embed_params, - token_pos_embed_params=token_pos_embed_params, - embed_layer_norm_params=embed_layer_norm_params, - prefix='generator_') + self.generator = ElectraGenerator(self.gen_cfg) + if tied_embeddings: + self.generator.tie_embeddings(self.disc_backbone.word_embed.collect_params(), + self.disc_backbone.token_type_embed.collect_params(), + self.disc_backbone.token_pos_embed.collect_params(), + self.disc_backbone.embed_layer_norm.collect_params()) self.generator.hybridize() elif tied_generator: # Reuse the weight of the discriminator backbone model - self.generator = ElectraGenerator( - self.gen_cfg, tied_embeddings=False, prefix='generator_') + self.generator = ElectraGenerator(self.gen_cfg) self.generator.backbone_model = self.disc_backbone self.generator.hybridize() elif uniform_generator: diff --git a/src/gluonnlp/models/mobilebert.py b/src/gluonnlp/models/mobilebert.py index 21e3aad697..65a862d1a3 100644 --- a/src/gluonnlp/models/mobilebert.py +++ b/src/gluonnlp/models/mobilebert.py @@ -84,8 +84,7 @@ def __init__(self, use_qkv_bias: bool = True, weight_initializer: Optional[InitializerType] = None, bias_initializer: Optional[InitializerType] = 'zeros', - dtype='float32', - prefix=None, params=None): + dtype='float32'): """ Parameters @@ -110,10 +109,8 @@ def __init__(self, weight_initializer bias_initializer dtype - prefix - params """ - super().__init__(prefix=prefix, params=params) + super().__init__() self._use_bottleneck = use_bottleneck self._units = units self._real_units = real_units @@ -122,109 +119,94 @@ def __init__(self, self._bottleneck_strategy = bottleneck_strategy self._dtype = dtype assert real_units % num_heads == 0, 'units must be divisive by the number of heads' - with self.name_scope(): - self.dropout_layer = nn.Dropout(hidden_dropout_prob) - if use_bottleneck: - self.in_bottleneck_proj = nn.Dense(units=real_units, - in_units=units, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - 
dtype=self._dtype, - prefix='in_bottleneck_proj_') - self.in_bottleneck_ln = get_layer_norm(normalization=normalization, - in_channels=real_units, - epsilon=layer_norm_eps, - prefix='in_bottleneck_ln_') - self.out_bottleneck_proj = nn.Dense(units=units, - in_units=real_units, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=self._dtype, - prefix='out_bottleneck_proj_') - self.out_bottleneck_ln = get_layer_norm(normalization=normalization, - in_channels=units, - epsilon=layer_norm_eps, - prefix='out_bottleneck_ln_') - - if bottleneck_strategy == 'qk_sharing': - self.shared_qk = nn.Dense(units=real_units, - in_units=units, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=self._dtype, - prefix='shared_qk_') - self.shared_qk_ln = get_layer_norm(normalization=normalization, - in_channels=real_units, - epsilon=layer_norm_eps, - prefix='shared_qk_ln_') - self.attention_proj = nn.Dense(units=real_units, - flatten=False, - in_units=real_units, - use_bias=True, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=self._dtype, - prefix='proj_') - # The in_units of qkv varies according to the sharing strategy - self.attn_query = nn.Dense(units=real_units, - flatten=False, - use_bias=use_qkv_bias, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=self._dtype, - prefix='attn_query_') - self.attn_key = nn.Dense(units=real_units, - flatten=False, - use_bias=use_qkv_bias, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=self._dtype, - prefix='attn_key_') - self.attn_value = nn.Dense(units=real_units, + self.dropout_layer = nn.Dropout(hidden_dropout_prob) + if use_bottleneck: + self.in_bottleneck_proj = nn.Dense(units=real_units, + in_units=units, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=self._dtype) + self.in_bottleneck_ln = get_layer_norm(normalization=normalization, + in_channels=real_units, + epsilon=layer_norm_eps) + self.out_bottleneck_proj = nn.Dense(units=units, + in_units=real_units, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=self._dtype) + self.out_bottleneck_ln = get_layer_norm(normalization=normalization, + in_channels=units, + epsilon=layer_norm_eps) + + if bottleneck_strategy == 'qk_sharing': + self.shared_qk = nn.Dense(units=real_units, + in_units=units, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=self._dtype) + self.shared_qk_ln = get_layer_norm(normalization=normalization, + in_channels=real_units, + epsilon=layer_norm_eps) + self.attention_proj = nn.Dense(units=real_units, flatten=False, - use_bias=use_qkv_bias, + in_units=real_units, + use_bias=True, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - dtype=self._dtype, - prefix='attn_value_') - self.attention_cell = \ - MultiHeadAttentionCell( - query_units=real_units, - num_heads=num_heads, - attention_dropout=attention_dropout_prob, - scaled=True, - prefix='attn_cell_', - dtype=self._dtype, - layout='NTK' - ) - self.layer_norm = get_layer_norm(normalization=normalization, - in_channels=real_units, - epsilon=layer_norm_eps, - prefix='ln_') - - self.stacked_ffn = nn.HybridSequential(prefix='stacked_ffns_') - with self.stacked_ffn.name_scope(): - for ffn_idx in range(num_stacked_ffn): - is_last_ffn = (ffn_idx == 
(num_stacked_ffn - 1)) - # only apply dropout on last ffn layer if use bottleneck - dropout = float(hidden_dropout_prob * (not use_bottleneck) * is_last_ffn) - activation_dropout = float(activation_dropout_prob * (not use_bottleneck) - * is_last_ffn) - self.stacked_ffn.add( - PositionwiseFFN(units=real_units, - hidden_size=hidden_size, - dropout=dropout, - activation_dropout=activation_dropout, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - activation=activation, - normalization=normalization, - layer_norm_eps=layer_norm_eps, - dtype=self._dtype, - prefix='{}_'.format(ffn_idx))) + dtype=self._dtype) + # The in_units of qkv varies according to the sharing strategy + self.attn_query = nn.Dense(units=real_units, + flatten=False, + use_bias=use_qkv_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=self._dtype) + self.attn_key = nn.Dense(units=real_units, + flatten=False, + use_bias=use_qkv_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=self._dtype) + self.attn_value = nn.Dense(units=real_units, + flatten=False, + use_bias=use_qkv_bias, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=self._dtype) + self.attention_cell = \ + MultiHeadAttentionCell( + query_units=real_units, + num_heads=num_heads, + attention_dropout=attention_dropout_prob, + scaled=True, + dtype=self._dtype, + layout='NTK' + ) + self.layer_norm = get_layer_norm(normalization=normalization, + in_channels=real_units, + epsilon=layer_norm_eps) + + self.stacked_ffn = nn.HybridSequential() + for ffn_idx in range(num_stacked_ffn): + is_last_ffn = (ffn_idx == (num_stacked_ffn - 1)) + # only apply dropout on last ffn layer if use bottleneck + dropout = float(hidden_dropout_prob * (not use_bottleneck) * is_last_ffn) + activation_dropout = float(activation_dropout_prob * (not use_bottleneck) + * is_last_ffn) + self.stacked_ffn.add( + PositionwiseFFN(units=real_units, + hidden_size=hidden_size, + dropout=dropout, + activation_dropout=activation_dropout, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + activation=activation, + normalization=normalization, + layer_norm_eps=layer_norm_eps, + dtype=self._dtype)) def hybrid_forward(self, F, data, attn_mask): """ @@ -315,9 +297,8 @@ def __init__(self, layer_norm_eps: float = 1E-12, weight_initializer: InitializerType = TruncNorm(stdev=0.02), bias_initializer: InitializerType = 'zeros', - dtype='float32', - prefix=None, params=None): - super().__init__(prefix=prefix, params=params) + dtype='float32'): + super().__init__() self._dtype = dtype self._num_layers = num_layers self._output_attention = output_attention @@ -331,26 +312,23 @@ def __init__(self, 'by the number of heads. 
Received real_units={}, num_heads={}' \ .format(real_units, num_heads) - with self.name_scope(): - self.all_layers = nn.HybridSequential(prefix='layers_') - with self.all_layers.name_scope(): - for layer_idx in range(num_layers): - self.all_layers.add( - MobileBertEncoderLayer(use_bottleneck=use_bottleneck, - units=units, - real_units=real_units, - hidden_size=hidden_size, - num_heads=num_heads, - attention_dropout_prob=attention_dropout_prob, - hidden_dropout_prob=hidden_dropout_prob, - num_stacked_ffn=num_stacked_ffn, - bottleneck_strategy=bottleneck_strategy, - layer_norm_eps=layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - normalization=normalization, - activation=activation, - prefix='{}_'.format(layer_idx))) + self.all_layers = nn.HybridSequential() + for layer_idx in range(num_layers): + self.all_layers.add( + MobileBertEncoderLayer(use_bottleneck=use_bottleneck, + units=units, + real_units=real_units, + hidden_size=hidden_size, + num_heads=num_heads, + attention_dropout_prob=attention_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + num_stacked_ffn=num_stacked_ffn, + bottleneck_strategy=bottleneck_strategy, + layer_norm_eps=layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + normalization=normalization, + activation=activation)) def hybrid_forward(self, F, data, valid_length): """ @@ -427,10 +405,8 @@ def __init__(self, trigram_embed=True, use_pooler=True, classifier_activation=False, - dtype='float32', - prefix=None, - params=None): - super().__init__(prefix=prefix, params=params) + dtype='float32'): + super().__init__() self._dtype = dtype self.use_bottleneck = use_bottleneck self.bottleneck_strategy = bottleneck_strategy @@ -451,66 +427,58 @@ def __init__(self, self.weight_initializer = weight_initializer self.bias_initializer = bias_initializer self.layer_norm_eps = layer_norm_eps - with self.name_scope(): - # Construct MobileBertTransformer - self.encoder = MobileBertTransformer( - units=units, - hidden_size=hidden_size, - num_layers=num_layers, - num_heads=num_heads, - inner_size=inner_size, - num_stacked_ffn=num_stacked_ffn, - bottleneck_strategy=bottleneck_strategy, - attention_dropout_prob=attention_dropout_prob, - hidden_dropout_prob=hidden_dropout_prob, - output_attention=False, - output_all_encodings=False, - activation=activation, - normalization=normalization, - layer_norm_eps=layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='enc_', - ) - self.encoder.hybridize() - # Construct word embedding - self.word_embed = nn.Embedding(input_dim=vocab_size, - output_dim=embed_size, - weight_initializer=embed_initializer, - dtype=dtype, - prefix='word_embed_') - if trigram_embed or embed_size != units: - self.embed_factorized_proj = nn.Dense(units=units, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='embed_factorized_proj_') - self.embed_layer_norm = get_layer_norm(normalization=normalization, - in_channels=units, - epsilon=self.layer_norm_eps, - prefix='embed_ln_') - - self.embed_dropout = nn.Dropout(hidden_dropout_prob) - # Construct token type embedding - self.token_type_embed = nn.Embedding(input_dim=num_token_types, - output_dim=units, - weight_initializer=weight_initializer, - prefix='token_type_embed_') - self.token_pos_embed = PositionalEmbedding(units=units, - max_length=max_length, - dtype=self._dtype, - method=pos_embed_type, - 
prefix='token_pos_embed_') - if self.use_pooler and self.classifier_activation: - # Construct pooler - self.pooler = nn.Dense(units=units, - in_units=units, - flatten=False, - activation='tanh', - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='pooler_') + # Construct MobileBertTransformer + self.encoder = MobileBertTransformer( + units=units, + hidden_size=hidden_size, + num_layers=num_layers, + num_heads=num_heads, + inner_size=inner_size, + num_stacked_ffn=num_stacked_ffn, + bottleneck_strategy=bottleneck_strategy, + attention_dropout_prob=attention_dropout_prob, + hidden_dropout_prob=hidden_dropout_prob, + output_attention=False, + output_all_encodings=False, + activation=activation, + normalization=normalization, + layer_norm_eps=layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype, + ) + self.encoder.hybridize() + # Construct word embedding + self.word_embed = nn.Embedding(input_dim=vocab_size, + output_dim=embed_size, + weight_initializer=embed_initializer, + dtype=dtype) + if trigram_embed or embed_size != units: + self.embed_factorized_proj = nn.Dense(units=units, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) + self.embed_layer_norm = get_layer_norm(normalization=normalization, + in_channels=units, + epsilon=self.layer_norm_eps) + + self.embed_dropout = nn.Dropout(hidden_dropout_prob) + # Construct token type embedding + self.token_type_embed = nn.Embedding(input_dim=num_token_types, + output_dim=units, + weight_initializer=weight_initializer) + self.token_pos_embed = PositionalEmbedding(units=units, + max_length=max_length, + dtype=self._dtype, + method=pos_embed_type) + if self.use_pooler and self.classifier_activation: + # Construct pooler + self.pooler = nn.Dense(units=units, + in_units=units, + flatten=False, + activation='tanh', + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) def hybrid_forward(self, F, inputs, token_types, valid_length): # pylint: disable=arguments-differ @@ -649,9 +617,7 @@ def from_cfg(cls, use_bottleneck=True, trigram_embed=True, use_pooler=True, - classifier_activation=False, - prefix=None, - params=None): + classifier_activation=False): cfg = MobileBertModel.get_cfg().clone_merge(cfg) assert cfg.VERSION == 1, 'Wrong version!' 
embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) @@ -681,9 +647,7 @@ def from_cfg(cls, use_bottleneck=use_bottleneck, trigram_embed=trigram_embed, use_pooler=use_pooler, - classifier_activation=classifier_activation, - prefix=prefix, - params=params) + classifier_activation=classifier_activation) @use_np @@ -692,9 +656,7 @@ def __init__(self, backbone_cfg, use_bottleneck=True, trigram_embed=True, weight_initializer=None, - bias_initializer=None, - prefix=None, - params=None): + bias_initializer=None): """ Parameters @@ -702,50 +664,41 @@ def __init__(self, backbone_cfg, backbone_cfg weight_initializer bias_initializer - prefix - params """ - super().__init__(prefix=prefix, params=params) - with self.name_scope(): - self.backbone_model = MobileBertModel.from_cfg(backbone_cfg, - use_bottleneck=use_bottleneck, - trigram_embed=trigram_embed, - prefix='') - if weight_initializer is None: - weight_initializer = self.backbone_model.weight_initializer - if bias_initializer is None: - bias_initializer = self.backbone_model.bias_initializer - self.mlm_decoder = nn.HybridSequential(prefix='mlm_') - with self.mlm_decoder.name_scope(): - # Extra non-linear layer - self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='proj_')) - self.mlm_decoder.add(get_activation(self.backbone_model.activation)) - # use basic layer normalization for pretaining - self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps, - prefix='ln_')) - self.mlm_decoder.hybridize() - # only load the dense weights with a re-initialized bias - # parameters are stored in 'word_embed_bias' which is - # not used in original embedding - self.embedding_table = nn.Dense( + super().__init__() + self.backbone_model = MobileBertModel.from_cfg(backbone_cfg, + use_bottleneck=use_bottleneck, + trigram_embed=trigram_embed) + if weight_initializer is None: + weight_initializer = self.backbone_model.weight_initializer + if bias_initializer is None: + bias_initializer = self.backbone_model.bias_initializer + self.mlm_decoder = nn.HybridSequential() + # Extra non-linear layer + self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.mlm_decoder.add(get_activation(self.backbone_model.activation)) + # use basic layer normalization for pretaining + self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps)) + self.mlm_decoder.hybridize() + # only load the dense weights with a re-initialized bias + # parameters are stored in 'word_embed_bias' which is + # not used in original embedding + self.embedding_table = nn.Dense( + units=self.backbone_model.vocab_size, + in_units=self.backbone_model.embed_size, + flatten=False, + bias_initializer=bias_initializer) + self.embedding_table.weight = self.backbone_model.word_embed.weight + if self.backbone_model.embed_size != self.backbone_model.units: + self.extra_table = nn.Dense( units=self.backbone_model.vocab_size, - in_units=self.backbone_model.embed_size, - flatten=False, - params=self.backbone_model.word_embed.collect_params('.*weight'), - bias_initializer=bias_initializer, - prefix='mlm_score_') - if self.backbone_model.embed_size != self.backbone_model.units: - self.extra_table = nn.Dense( - units=self.backbone_model.vocab_size, - use_bias=False, - in_units=self.backbone_model.units - - self.backbone_model.embed_size, - flatten=False, - 
prefix='mlm_extra_score_') + use_bias=False, + in_units=self.backbone_model.units - + self.backbone_model.embed_size, + flatten=False) def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): @@ -796,8 +749,7 @@ def __init__(self, backbone_cfg, use_bottleneck=True, trigram_embed=True, weight_initializer=None, - bias_initializer=None, - prefix=None, params=None): + bias_initializer=None): """ Parameters @@ -806,55 +758,45 @@ def __init__(self, backbone_cfg, The cfg of the backbone model weight_initializer bias_initializer - prefix - params """ - super().__init__(prefix=prefix, params=params) - with self.name_scope(): - self.backbone_model = MobileBertModel.from_cfg(backbone_cfg, - use_bottleneck=use_bottleneck, - trigram_embed=trigram_embed, - prefix='') - if weight_initializer is None: - weight_initializer = self.backbone_model.weight_initializer - if bias_initializer is None: - bias_initializer = self.backbone_model.bias_initializer - # Construct nsp_classifier for next sentence prediction - self.nsp_classifier = nn.Dense(units=2, - weight_initializer=weight_initializer, - prefix='nsp_') - self.mlm_decoder = nn.HybridSequential(prefix='mlm_') - with self.mlm_decoder.name_scope(): - # Extra non-linear layer - self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='proj_')) - self.mlm_decoder.add(get_activation(self.backbone_model.activation)) - # use basic layer normalization for pretaining - self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps, - prefix='ln_')) - self.mlm_decoder.hybridize() - # only load the dense weights with a re-initialized bias - # parameters are stored in 'word_embed_bias' which is - # not used in original embedding - self.embedding_table = nn.Dense( + super().__init__() + self.backbone_model = MobileBertModel.from_cfg(backbone_cfg, + use_bottleneck=use_bottleneck, + trigram_embed=trigram_embed) + if weight_initializer is None: + weight_initializer = self.backbone_model.weight_initializer + if bias_initializer is None: + bias_initializer = self.backbone_model.bias_initializer + # Construct nsp_classifier for next sentence prediction + self.nsp_classifier = nn.Dense(units=2, + weight_initializer=weight_initializer) + self.mlm_decoder = nn.HybridSequential() + # Extra non-linear layer + self.mlm_decoder.add(nn.Dense(units=self.backbone_model.units, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) + self.mlm_decoder.add(get_activation(self.backbone_model.activation)) + # use basic layer normalization for pretaining + self.mlm_decoder.add(nn.LayerNorm(epsilon=self.backbone_model.layer_norm_eps)) + self.mlm_decoder.hybridize() + # only load the dense weights with a re-initialized bias + # parameters are stored in 'word_embed_bias' which is + # not used in original embedding + self.embedding_table = nn.Dense( + units=self.backbone_model.vocab_size, + in_units=self.backbone_model.embed_size, + flatten=False, + bias_initializer=bias_initializer) + self.embedding_table.weight = self.backbone_model.word_embed.weight + if self.backbone_model.embed_size != self.backbone_model.units: + self.extra_table = nn.Dense( units=self.backbone_model.vocab_size, - in_units=self.backbone_model.embed_size, + in_units=self.backbone_model.units - + self.backbone_model.embed_size, flatten=False, - params=self.backbone_model.word_embed.collect_params('.*weight'), - 
bias_initializer=bias_initializer, - prefix='mlm_score_') - if self.backbone_model.embed_size != self.backbone_model.units: - self.extra_table = nn.Dense( - units=self.backbone_model.vocab_size, - in_units=self.backbone_model.units - - self.backbone_model.embed_size, - flatten=False, - use_bias=False, - bias_initializer=bias_initializer, - prefix='mlm_extra_score_') + use_bias=False, + bias_initializer=bias_initializer) def hybrid_forward(self, F, inputs, token_types, valid_length, masked_positions): diff --git a/src/gluonnlp/models/roberta.py b/src/gluonnlp/models/roberta.py index a9493f86b5..44a1a20c1e 100644 --- a/src/gluonnlp/models/roberta.py +++ b/src/gluonnlp/models/roberta.py @@ -130,9 +130,7 @@ def __init__(self, use_mlm=True, untie_weight=False, encoder_normalize_before=True, - return_all_hiddens=False, - prefix=None, - params=None): + return_all_hiddens=False): """ Parameters @@ -161,10 +159,8 @@ def __init__(self, Whether to untie weights between embeddings and classifiers encoder_normalize_before return_all_hiddens - prefix - params """ - super(RobertaModel, self).__init__(prefix=prefix, params=params) + super().__init__() self.vocab_size = vocab_size self.units = units self.hidden_size = hidden_size @@ -183,60 +179,55 @@ def __init__(self, self.untie_weight = untie_weight self.encoder_normalize_before = encoder_normalize_before self.return_all_hiddens = return_all_hiddens - with self.name_scope(): - self.tokens_embed = nn.Embedding( - input_dim=self.vocab_size, - output_dim=self.units, - weight_initializer=embed_initializer, - dtype=self.dtype, - prefix='tokens_embed_' + self.tokens_embed = nn.Embedding( + input_dim=self.vocab_size, + output_dim=self.units, + weight_initializer=embed_initializer, + dtype=self.dtype, + ) + if self.encoder_normalize_before: + self.embed_ln = nn.LayerNorm( + epsilon=self.layer_norm_eps, + in_channels=self.units, ) - if self.encoder_normalize_before: - self.embed_ln = nn.LayerNorm( - epsilon=self.layer_norm_eps, - in_channels=self.units, - prefix='embed_ln_' - ) - else: - self.embed_ln = None - self.embed_dropout = nn.Dropout(self.hidden_dropout_prob) - self.pos_embed = PositionalEmbedding( - units=self.units, - max_length=self.max_length, - dtype=self.dtype, - method=pos_embed_type, - prefix='pos_embed_' - ) - - self.encoder = RobertaEncoder( - units=self.units, - hidden_size=self.hidden_size, - num_layers=self.num_layers, - num_heads=self.num_heads, - attention_dropout_prob=self.attention_dropout_prob, - hidden_dropout_prob=self.hidden_dropout_prob, + else: + self.embed_ln = None + self.embed_dropout = nn.Dropout(self.hidden_dropout_prob) + self.pos_embed = PositionalEmbedding( + units=self.units, + max_length=self.max_length, + dtype=self.dtype, + method=pos_embed_type, + ) + + self.encoder = RobertaEncoder( + units=self.units, + hidden_size=self.hidden_size, + num_layers=self.num_layers, + num_heads=self.num_heads, + attention_dropout_prob=self.attention_dropout_prob, + hidden_dropout_prob=self.hidden_dropout_prob, + layer_norm_eps=self.layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + activation=self.activation, + dtype=self.dtype, + return_all_hiddens=self.return_all_hiddens + ) + self.encoder.hybridize() + + if self.use_mlm: + self.lm_head = RobertaLMHead( + self.units, + self.vocab_size, + self.activation, layer_norm_eps=self.layer_norm_eps, weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - activation=self.activation, - dtype=self.dtype, - 
return_all_hiddens=self.return_all_hiddens + bias_initializer=bias_initializer ) - self.encoder.hybridize() - - if self.use_mlm: - embed_weight = None if untie_weight else \ - self.tokens_embed.collect_params('.*weight') - self.lm_head = RobertaLMHead( - self.units, - self.vocab_size, - self.activation, - layer_norm_eps=self.layer_norm_eps, - embed_weight=embed_weight, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer - ) - self.lm_head.hybridize() + if not untie_weight: + self.lm_head.dense2.weight = self.tokens_embed.weight + self.lm_head.hybridize() # TODO support use_pooler def hybrid_forward(self, F, tokens, valid_length): @@ -271,9 +262,7 @@ def from_cfg(cls, use_mlm=True, untie_weight=False, encoder_normalize_before=True, - return_all_hiddens=False, - prefix=None, - params=None): + return_all_hiddens=False): cfg = RobertaModel.get_cfg().clone_merge(cfg) embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) @@ -298,9 +287,7 @@ def from_cfg(cls, use_mlm=use_mlm, untie_weight=untie_weight, encoder_normalize_before=encoder_normalize_before, - return_all_hiddens=return_all_hiddens, - prefix=prefix, - params=params) + return_all_hiddens=return_all_hiddens) @use_np class RobertaEncoder(HybridBlock): @@ -316,10 +303,8 @@ def __init__(self, bias_initializer='zeros', activation='gelu', dtype='float32', - return_all_hiddens=False, - prefix='encoder_', - params=None): - super(RobertaEncoder, self).__init__(prefix=prefix, params=params) + return_all_hiddens=False): + super().__init__() self.units = units self.hidden_size = hidden_size self.num_layers = num_layers @@ -330,25 +315,22 @@ def __init__(self, self.activation = activation self.dtype = dtype self.return_all_hiddens = return_all_hiddens - with self.name_scope(): - self.all_layers = nn.HybridSequential(prefix='layers_') - with self.all_layers.name_scope(): - for layer_idx in range(self.num_layers): - self.all_layers.add( - TransformerEncoderLayer( - units=self.units, - hidden_size=self.hidden_size, - num_heads=self.num_heads, - attention_dropout_prob=self.attention_dropout_prob, - hidden_dropout_prob=self.hidden_dropout_prob, - layer_norm_eps=self.layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - activation=self.activation, - dtype=self.dtype, - prefix='{}_'.format(layer_idx) - ) - ) + self.all_layers = nn.HybridSequential() + for layer_idx in range(self.num_layers): + self.all_layers.add( + TransformerEncoderLayer( + units=self.units, + hidden_size=self.hidden_size, + num_heads=self.num_heads, + attention_dropout_prob=self.attention_dropout_prob, + hidden_dropout_prob=self.hidden_dropout_prob, + layer_norm_eps=self.layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + activation=self.activation, + dtype=self.dtype, + ) + ) def hybrid_forward(self, F, x, valid_length): atten_mask = gen_self_attn_mask(F, x, valid_length, @@ -369,41 +351,24 @@ def __init__(self, output_dim=50265, activation_fn='gelu', layer_norm_eps=1E-5, - embed_weight=None, weight_initializer=TruncNorm(stdev=0.02), - bias_initializer='zeros', - prefix='lm_', - params=None): - super(RobertaLMHead, self).__init__(prefix=prefix, params=params) - with self.name_scope(): - self.dense1 = nn.Dense(in_units=embed_dim, - units=embed_dim, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='dense1_') - self.activation_fn = get_activation(activation_fn) 
- self.ln = nn.LayerNorm( - epsilon=layer_norm_eps, - in_channels=embed_dim, - prefix='ln_') - if embed_weight: - # notice the bias of dense2 here - # will be *tokens_embed_bias - self.dense2 = nn.Dense(in_units=embed_dim, - units=output_dim, - flatten=False, - params=embed_weight, - bias_initializer='zeros', - prefix='dense2_') - else: - self.dense2 = nn.Dense(in_units=embed_dim, - units=output_dim, - activation=None, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer='zeros', - prefix='dense2_') + bias_initializer='zeros'): + super().__init__() + self.dense1 = nn.Dense(in_units=embed_dim, + units=embed_dim, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) + self.activation_fn = get_activation(activation_fn) + self.ln = nn.LayerNorm( + epsilon=layer_norm_eps, + in_channels=embed_dim) + self.dense2 = nn.Dense(in_units=embed_dim, + units=output_dim, + activation=None, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer='zeros') def hybrid_forward(self, F, x): x = self.dense1(x) diff --git a/src/gluonnlp/models/transformer.py b/src/gluonnlp/models/transformer.py index 073ee73708..18a01930ea 100644 --- a/src/gluonnlp/models/transformer.py +++ b/src/gluonnlp/models/transformer.py @@ -140,8 +140,7 @@ def __init__(self, weight_initializer: Optional[InitializerType] = None, bias_initializer: Optional[InitializerType] = 'zeros', activation: str = 'relu', - dtype='float32', - prefix=None, params=None): + dtype='float32'): """ Parameters @@ -165,10 +164,8 @@ def __init__(self, bias_initializer activation dtype - prefix - params """ - super().__init__(prefix=prefix, params=params) + super().__init__() self._units = units self._hidden_size = hidden_size self._num_heads = num_heads @@ -178,48 +175,42 @@ def __init__(self, self._pre_norm = pre_norm self._dtype = dtype assert self._units % self._num_heads == 0, 'units must be divisive by the number of heads' - with self.name_scope(): - self.dropout_layer = nn.Dropout(hidden_dropout_prob) - self.attn_qkv = nn.Dense(3 * units, - flatten=False, - use_bias=use_qkv_bias, - in_units=units, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=self._dtype, - prefix='attn_qkv_') - self.attention_proj = nn.Dense(units=units, - flatten=False, - in_units=units, - use_bias=True, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=self._dtype, - prefix='proj_') - self.attention_cell =\ - MultiHeadAttentionCell( - query_units=self._units, - num_heads=self._num_heads, - attention_dropout=self._attention_dropout_prob, - scaled=True, - prefix='attn_cell_', - dtype=self._dtype, - layout='NTK' - ) - self.layer_norm = nn.LayerNorm(epsilon=layer_norm_eps, - in_channels=units, - prefix='ln_') - self.ffn = PositionwiseFFN(units=units, - hidden_size=hidden_size, - dropout=hidden_dropout_prob, - activation_dropout=activation_dropout_prob, + self.dropout_layer = nn.Dropout(hidden_dropout_prob) + self.attn_qkv = nn.Dense(3 * units, + flatten=False, + use_bias=use_qkv_bias, + in_units=units, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=self._dtype) + self.attention_proj = nn.Dense(units=units, + flatten=False, + in_units=units, + use_bias=True, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - layer_norm_eps=layer_norm_eps, - activation=activation, - pre_norm=pre_norm, - dtype=self._dtype, - prefix='ffn_') + dtype=self._dtype) + self.attention_cell =\ + 
MultiHeadAttentionCell( + query_units=self._units, + num_heads=self._num_heads, + attention_dropout=self._attention_dropout_prob, + scaled=True, + dtype=self._dtype, + layout='NTK' + ) + self.layer_norm = nn.LayerNorm(epsilon=layer_norm_eps, + in_channels=units) + self.ffn = PositionwiseFFN(units=units, + hidden_size=hidden_size, + dropout=hidden_dropout_prob, + activation_dropout=activation_dropout_prob, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + layer_norm_eps=layer_norm_eps, + activation=activation, + pre_norm=pre_norm, + dtype=self._dtype) def hybrid_forward(self, F, data, attn_mask): """ @@ -264,8 +255,7 @@ def __init__(self, num_layers=6, recurrent=False, activation_dropout=0.0, dropout=0.1, attention_dropout=0.1, layer_norm_eps=1E-5, data_norm=False, pre_norm=False, weight_initializer=None, bias_initializer='zeros', - activation='relu', dtype='float32', - prefix=None, params=None): + activation='relu', dtype='float32'): """ Parameters @@ -286,44 +276,37 @@ def __init__(self, num_layers=6, recurrent=False, weight_initializer bias_initializer activation - prefix - params """ - super(TransformerEncoder, self).__init__(prefix=prefix, params=params) + super().__init__() self._dtype = dtype self.num_layers = num_layers self._recurrent = recurrent self._data_norm = data_norm self._pre_norm = pre_norm - with self.name_scope(): - self.dropout_layer = nn.Dropout(dropout) - if self._pre_norm: - self.ln_final = nn.LayerNorm(epsilon=layer_norm_eps, - in_channels=units, - prefix='ln_final_') - if self._data_norm: - self.ln_data = nn.LayerNorm(epsilon=layer_norm_eps, - in_channels=units, - prefix='ln_data_') - # Construct the intermediate layers - self.layers = nn.HybridSequential(prefix='layers_') - real_num_layers = 1 if recurrent else num_layers - with self.layers.name_scope(): - for i in range(real_num_layers): - self.layers.add(TransformerEncoderLayer( - units=units, - hidden_size=hidden_size, - num_heads=num_heads, - hidden_dropout_prob=dropout, - attention_dropout_prob=attention_dropout, - activation_dropout_prob=activation_dropout, - layer_norm_eps=layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - pre_norm=pre_norm, - activation=activation, - dtype=dtype, - prefix='{}_'.format(i))) + self.dropout_layer = nn.Dropout(dropout) + if self._pre_norm: + self.ln_final = nn.LayerNorm(epsilon=layer_norm_eps, + in_channels=units) + if self._data_norm: + self.ln_data = nn.LayerNorm(epsilon=layer_norm_eps, + in_channels=units) + # Construct the intermediate layers + self.layers = nn.HybridSequential() + real_num_layers = 1 if recurrent else num_layers + for i in range(real_num_layers): + self.layers.add(TransformerEncoderLayer( + units=units, + hidden_size=hidden_size, + num_heads=num_heads, + hidden_dropout_prob=dropout, + attention_dropout_prob=attention_dropout, + activation_dropout_prob=activation_dropout, + layer_norm_eps=layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + pre_norm=pre_norm, + activation=activation, + dtype=dtype)) def hybrid_forward(self, F, data, valid_length): """ @@ -372,9 +355,7 @@ def __init__(self, units: int = 512, pre_norm: bool = False, weight_initializer=None, bias_initializer='zeros', - dtype='float32', - prefix=None, - params=None): + dtype='float32'): """ Parameters @@ -395,10 +376,8 @@ def __init__(self, units: int = 512, weight_initializer bias_initializer dtype - prefix - params """ - super(TransformerDecoderLayer, 
self).__init__(prefix=prefix, params=params) + super().__init__() self._dtype = dtype self._units = units if mem_units is None: @@ -408,78 +387,66 @@ def __init__(self, units: int = 512, self._num_heads = num_heads self._attention_dropout = attention_dropout self._dtype = dtype - with self.name_scope(): - self.dropout_layer = nn.Dropout(dropout) - if units % num_heads: - raise ValueError('In Transformer, units should be divided exactly by the number of ' - 'heads. Received units={}, num_heads={}'.format(units, num_heads)) - self.attn_in_qkv = nn.Dense(3 * units, in_units=units, - use_bias=False, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='attn_in_qkv_') - self.self_attention = MultiHeadAttentionCell(query_units=units, - num_heads=num_heads, - attention_dropout=self._attention_dropout, - dtype=dtype, - layout='NTK', - prefix='self_attn_') - self.proj_in = nn.Dense(units=units, in_units=units, flatten=False, use_bias=False, + self.dropout_layer = nn.Dropout(dropout) + if units % num_heads: + raise ValueError('In Transformer, units should be divided exactly by the number of ' + 'heads. Received units={}, num_heads={}'.format(units, num_heads)) + self.attn_in_qkv = nn.Dense(3 * units, in_units=units, + use_bias=False, + flatten=False, weight_initializer=weight_initializer, bias_initializer=bias_initializer, - dtype=dtype, - prefix='proj_in_') - self.attn_inter_q = nn.Dense(units, - in_units=units, - use_bias=False, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='attn_inter_q_') - self.attn_inter_k = nn.Dense(units, in_units=mem_units, - use_bias=False, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='attn_inter_k_') - self.attn_inter_v = nn.Dense(units, in_units=mem_units, - use_bias=False, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='attn_inter_v_') - self.inter_attention = MultiHeadAttentionCell(query_units=units, - num_heads=num_heads, - attention_dropout=self._attention_dropout, - dtype=dtype, - layout='NTK', - prefix='inter_attn_') - self.proj_inter = nn.Dense(units=units, in_units=units, - flatten=False, use_bias=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='proj_inter_') - # TODO(sxjscience) Add DType to LayerNorm - self.ln_in = nn.LayerNorm(epsilon=layer_norm_eps, - in_channels=units, - prefix='ln_in_') - self.ln_inter = nn.LayerNorm(epsilon=layer_norm_eps, - in_channels=units, - prefix='ln_inter_') - self.ffn = PositionwiseFFN(units=units, - hidden_size=hidden_size, - dropout=dropout, - activation_dropout=activation_dropout, - activation=activation, - pre_norm=pre_norm, - dtype=dtype, - prefix='ffn_') + dtype=dtype) + self.self_attention = MultiHeadAttentionCell(query_units=units, + num_heads=num_heads, + attention_dropout=self._attention_dropout, + dtype=dtype, + layout='NTK') + self.proj_in = nn.Dense(units=units, in_units=units, flatten=False, use_bias=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + self.attn_inter_q = nn.Dense(units, + in_units=units, + use_bias=False, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + self.attn_inter_k = nn.Dense(units, in_units=mem_units, + use_bias=False, + flatten=False, + 
weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + self.attn_inter_v = nn.Dense(units, in_units=mem_units, + use_bias=False, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + self.inter_attention = MultiHeadAttentionCell(query_units=units, + num_heads=num_heads, + attention_dropout=self._attention_dropout, + dtype=dtype, + layout='NTK') + self.proj_inter = nn.Dense(units=units, in_units=units, + flatten=False, use_bias=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + # TODO(sxjscience) Add DType to LayerNorm + self.ln_in = nn.LayerNorm(epsilon=layer_norm_eps, + in_channels=units) + self.ln_inter = nn.LayerNorm(epsilon=layer_norm_eps, + in_channels=units) + self.ffn = PositionwiseFFN(units=units, + hidden_size=hidden_size, + dropout=dropout, + activation_dropout=activation_dropout, + activation=activation, + pre_norm=pre_norm, + dtype=dtype) def hybrid_forward(self, F, data, mem, self_causal_mask, mem_attn_mask): """ @@ -658,8 +625,8 @@ def __init__(self, num_layers=6, recurrent=False, num_heads=8, max_shift=None, rel_pos_embed=False, activation_dropout=0.0, dropout=0.1, attention_dropout=0.1, layer_norm_eps=1E-5, data_norm=False, pre_norm=False, weight_initializer=None, bias_initializer=None, - activation='relu', dtype='float32', prefix=None, params=None): - super(TransformerDecoder, self).__init__(prefix=prefix, params=params) + activation='relu', dtype='float32'): + super().__init__() self._dtype = dtype self._units = units self._mem_units = mem_units @@ -669,35 +636,30 @@ def __init__(self, num_layers=6, recurrent=False, self.rel_pos_embed = rel_pos_embed self._data_norm = data_norm self._pre_norm = pre_norm - with self.name_scope(): - self.dropout_layer = nn.Dropout(dropout) - if self._data_norm: - self.ln_data = nn.LayerNorm(epsilon=layer_norm_eps, - in_channels=units, - prefix='ln_data_') - if self._pre_norm: - self.ln_final = nn.LayerNorm(epsilon=layer_norm_eps, - in_channels=units, - prefix='ln_final_') - # Construct the intermediate layers - self.layers = nn.HybridSequential(prefix='layers_') - real_num_layers = 1 if recurrent else num_layers - with self.layers.name_scope(): - for i in range(real_num_layers): - self.layers.add(TransformerDecoderLayer(units=units, - mem_units=mem_units, - hidden_size=hidden_size, - num_heads=num_heads, - activation_dropout=activation_dropout, - dropout=dropout, - attention_dropout=attention_dropout, - layer_norm_eps=layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - activation=activation, - pre_norm=pre_norm, - dtype=dtype, - prefix='{}_'.format(i))) + self.dropout_layer = nn.Dropout(dropout) + if self._data_norm: + self.ln_data = nn.LayerNorm(epsilon=layer_norm_eps, + in_channels=units) + if self._pre_norm: + self.ln_final = nn.LayerNorm(epsilon=layer_norm_eps, + in_channels=units) + # Construct the intermediate layers + self.layers = nn.HybridSequential() + real_num_layers = 1 if recurrent else num_layers + for i in range(real_num_layers): + self.layers.add(TransformerDecoderLayer(units=units, + mem_units=mem_units, + hidden_size=hidden_size, + num_heads=num_heads, + activation_dropout=activation_dropout, + dropout=dropout, + attention_dropout=attention_dropout, + layer_norm_eps=layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + activation=activation, + pre_norm=pre_norm, + dtype=dtype)) def 
hybrid_forward(self, F, data, valid_length, mem_data, mem_valid_length): """ @@ -852,8 +814,7 @@ def __init__(self, src_vocab_size: int, embed_initializer=mx.init.Xavier('gaussian', 'in', 1), weight_initializer=mx.init.Xavier('uniform', 'avg', 3), bias_initializer='zeros', - dtype='float32', - prefix=None, params=None): + dtype='float32'): """ Parameters @@ -922,10 +883,8 @@ def __init__(self, src_vocab_size: int, Initializer of the bias dtype Data type of the weights - prefix - params """ - super(TransformerNMTModel, self).__init__(prefix=prefix, params=params) + super().__init__() assert src_vocab_size > 0 and tgt_vocab_size > 0,\ 'Cannot set "src_vocab_size" and "tgt_vocab_size" to negative numbers. ' \ 'Are you creating ' \ @@ -948,80 +907,71 @@ def __init__(self, src_vocab_size: int, assert shared_embed is False, 'Cannot share embedding when the enc_units and dec_units ' \ 'are different! enc_units={},' \ ' dec_units={}'.format(enc_units, dec_units) - with self.name_scope(): - self.src_embed_layer = nn.Embedding(input_dim=src_vocab_size, - output_dim=enc_units, - prefix='src_embed_', - weight_initializer=embed_initializer, - dtype=self._dtype) - self.tgt_embed_layer = nn.Embedding(input_dim=tgt_vocab_size, - output_dim=dec_units, - prefix='tgt_embed_', - params=self.src_embed_layer.params - if shared_embed else None, - weight_initializer=embed_initializer, - dtype=self._dtype) - if pos_embed_type is not None: - self.src_pos_embed_layer = PositionalEmbedding(units=enc_units, - max_length=max_src_length, - dtype=self._dtype, - method=pos_embed_type, - prefix='src_pos_embed_') - self.tgt_pos_embed_layer = PositionalEmbedding(units=dec_units, - max_length=max_tgt_length, - dtype=self._dtype, - method=pos_embed_type, - prefix='tgt_pos_embed_') - self.encoder = TransformerEncoder(num_layers=enc_num_layers, - recurrent=enc_recurrent, - units=enc_units, - hidden_size=enc_hidden_size, - num_heads=enc_num_heads, - activation_dropout=activation_dropout, - dropout=dropout, - attention_dropout=attention_dropout, - layer_norm_eps=layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - activation=enc_activation, - data_norm=data_norm, - pre_norm=enc_pre_norm, - dtype=self._dtype, - prefix='enc_') - self.decoder = TransformerDecoder(num_layers=dec_num_layers, - recurrent=dec_recurrent, - units=dec_units, - mem_units=enc_units, - hidden_size=dec_hidden_size, - num_heads=dec_num_heads, - activation_dropout=activation_dropout, - dropout=dropout, - attention_dropout=attention_dropout, - layer_norm_eps=layer_norm_eps, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - activation=dec_activation, - data_norm=data_norm, - pre_norm=dec_pre_norm, - dtype=self._dtype, - prefix='dec_') - if tie_weights: - self.tgt_final_layer =\ - nn.Dense(tgt_vocab_size, flatten=False, - bias_initializer=bias_initializer, - use_bias=False, - dtype=self._dtype, - params=self.tgt_embed_layer.collect_params(), - prefix='tgt_final_') - else: - self.tgt_final_layer = \ - nn.Dense(tgt_vocab_size, - flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - use_bias=False, - dtype=self._dtype, - prefix='tgt_final_') + self.src_embed_layer = nn.Embedding(input_dim=src_vocab_size, + output_dim=enc_units, + weight_initializer=embed_initializer, + dtype=self._dtype) + self.tgt_embed_layer = nn.Embedding(input_dim=tgt_vocab_size, + output_dim=dec_units, + weight_initializer=embed_initializer, + dtype=self._dtype) + if shared_embed: + 
self.tgt_embed_layer.weight = self.src_embed_layer.weight + if pos_embed_type is not None: + self.src_pos_embed_layer = PositionalEmbedding(units=enc_units, + max_length=max_src_length, + dtype=self._dtype, + method=pos_embed_type) + self.tgt_pos_embed_layer = PositionalEmbedding(units=dec_units, + max_length=max_tgt_length, + dtype=self._dtype, + method=pos_embed_type) + self.encoder = TransformerEncoder(num_layers=enc_num_layers, + recurrent=enc_recurrent, + units=enc_units, + hidden_size=enc_hidden_size, + num_heads=enc_num_heads, + activation_dropout=activation_dropout, + dropout=dropout, + attention_dropout=attention_dropout, + layer_norm_eps=layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + activation=enc_activation, + data_norm=data_norm, + pre_norm=enc_pre_norm, + dtype=self._dtype) + self.decoder = TransformerDecoder(num_layers=dec_num_layers, + recurrent=dec_recurrent, + units=dec_units, + mem_units=enc_units, + hidden_size=dec_hidden_size, + num_heads=dec_num_heads, + activation_dropout=activation_dropout, + dropout=dropout, + attention_dropout=attention_dropout, + layer_norm_eps=layer_norm_eps, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + activation=dec_activation, + data_norm=data_norm, + pre_norm=dec_pre_norm, + dtype=self._dtype) + if tie_weights: + self.tgt_final_layer =\ + nn.Dense(tgt_vocab_size, flatten=False, + bias_initializer=bias_initializer, + use_bias=False, + dtype=self._dtype) + self.tgt_final_layer.weight = self.tgt_embed_layer.weight + else: + self.tgt_final_layer = \ + nn.Dense(tgt_vocab_size, + flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + use_bias=False, + dtype=self._dtype) self.encoder.hybridize() self.decoder.hybridize() @@ -1122,7 +1072,7 @@ def get_cfg(cls, key=None): return transformer_nmt_cfg_reg.create(key) @classmethod - def from_cfg(cls, cfg, prefix=None, params=None): + def from_cfg(cls, cfg): cfg = cls.get_cfg().clone_merge(cfg) embed_initializer = mx.init.create(*cfg.INITIALIZER.embed) weight_initializer = mx.init.create(*cfg.INITIALIZER.weight) @@ -1154,23 +1104,19 @@ def from_cfg(cls, cfg, prefix=None, params=None): dec_pre_norm=cfg.MODEL.DECODER.pre_norm, embed_initializer=embed_initializer, weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix=prefix, - params=params) + bias_initializer=bias_initializer) @use_np class TransformerNMTInference(HybridBlock, BaseStepDecoder): - def __init__(self, model, prefix=None, params=None): + def __init__(self, model): """ Parameters ---------- model - prefix - params """ - super(TransformerNMTInference, self).__init__(prefix=prefix, params=params) + super().__init__() self.model = model def initialize(self, **kwargs): diff --git a/src/gluonnlp/models/transformer_xl.py b/src/gluonnlp/models/transformer_xl.py index d94928cd1a..419687639b 100644 --- a/src/gluonnlp/models/transformer_xl.py +++ b/src/gluonnlp/models/transformer_xl.py @@ -1,7 +1,7 @@ import numpy as np import mxnet as mx from mxnet import use_np -from mxnet.gluon import nn, Block, HybridBlock +from mxnet.gluon import nn, Block, HybridBlock, Parameter from ..attention_cell import multi_head_dot_attn, gen_self_attn_mask, gen_mem_attn_mask,\ RelAttentionScoreCell, MultiHeadAttentionCell from ..layers import get_activation, PositionalEmbedding, PositionwiseFFN,\ @@ -26,9 +26,8 @@ def __init__(self, units: int = 512, bias_initializer='zeros', pre_norm=False, dtype='float32', - layout='NT', - 
prefix=None, params=None): - super().__init__(prefix=prefix, params=params) + layout='NT'): + super().__init__() self._pre_norm = pre_norm self._dtype = dtype self._num_heads = num_heads @@ -41,54 +40,46 @@ def __init__(self, units: int = 512, else: raise NotImplementedError assert units % num_heads == 0 - with self.name_scope(): - self.dropout_layer = nn.Dropout(dropout) - self.attn_query = nn.Dense(units, in_units=units, - use_bias=False, flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='attn_query_') - self.attn_kv = nn.Dense(2 * units, in_units=units, - use_bias=False, flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='attn_kv_') - self.rel_pos_score_cell = RelAttentionScoreCell(query_units=units, - num_heads=num_heads, - bidirectional=False, - method='transformer_xl', - dropout=dropout, - dtype=dtype, - layout=self._cell_layout, - prefix='rel_pos_score_cell_') - self.attn_cell = MultiHeadAttentionCell(query_units=units, - num_heads=num_heads, - attention_dropout=attention_dropout, - dtype=dtype, - layout=self._cell_layout, - prefix='attn_cell_') - self.out_proj = nn.Dense(units, in_units=units, - use_bias=False, flatten=False, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - dtype=dtype, - prefix='out_proj_') - self.layer_norm = nn.LayerNorm(epsilon=layer_norm_eps, - in_channels=units, - prefix='ln_') - self.ffn = PositionwiseFFN(units=units, - hidden_size=hidden_size, - activation=activation, - activation_dropout=activation_dropout, - dropout=dropout, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - layer_norm_eps=layer_norm_eps, - pre_norm=pre_norm, - dtype=dtype, - prefix='ffn_') + self.dropout_layer = nn.Dropout(dropout) + self.attn_query = nn.Dense(units, in_units=units, + use_bias=False, flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + self.attn_kv = nn.Dense(2 * units, in_units=units, + use_bias=False, flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + self.rel_pos_score_cell = RelAttentionScoreCell(query_units=units, + num_heads=num_heads, + bidirectional=False, + method='transformer_xl', + dropout=dropout, + dtype=dtype, + layout=self._cell_layout) + self.attn_cell = MultiHeadAttentionCell(query_units=units, + num_heads=num_heads, + attention_dropout=attention_dropout, + dtype=dtype, + layout=self._cell_layout) + self.out_proj = nn.Dense(units, in_units=units, + use_bias=False, flatten=False, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + dtype=dtype) + self.layer_norm = nn.LayerNorm(epsilon=layer_norm_eps, + in_channels=units) + self.ffn = PositionwiseFFN(units=units, + hidden_size=hidden_size, + activation=activation, + activation_dropout=activation_dropout, + dropout=dropout, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + layer_norm_eps=layer_norm_eps, + pre_norm=pre_norm, + dtype=dtype) def hybrid_forward(self, F, data, mem, rel_positions, mask, query_r_bias, query_k_bias): """ @@ -173,36 +164,32 @@ def __init__(self, num_layers=3, layout='NT', pre_norm=False, weight_initializer=None, - bias_initializer=None, - prefix=None, params=None): - super().__init__(prefix=prefix, params=params) - with self.name_scope(): - self.query_k_bias = self.params.get('query_k_bias', - shape=(num_heads, units // 
num_heads), - init=bias_initializer, - allow_deferred_init=True) - self.query_r_bias = self.params.get('query_r_bias', - shape=(num_heads, units // num_heads), - init=bias_initializer, - allow_deferred_init=True) - self.decoder_layers = nn.HybridSequential(prefix='l') - with self.decoder_layers.name_scope(): - for i in range(num_layers): - self.decoder_layers.add( - TransformerXLDecoderLayer(units=units, - hidden_size=hidden_size, - num_heads=num_heads, - activation_dropout=activation_dropout, - dropout=dropout, - attention_dropout=attention_dropout, - layer_norm_eps=layernorm_eps, - activation=activation, - dtype=dtype, - layout=layout, - pre_norm=pre_norm, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='{}_'.format(i))) + bias_initializer=None): + super().__init__() + self.query_k_bias = Parameter('query_k_bias', + shape=(num_heads, units // num_heads), + init=bias_initializer, + allow_deferred_init=True) + self.query_r_bias = Parameter('query_r_bias', + shape=(num_heads, units // num_heads), + init=bias_initializer, + allow_deferred_init=True) + self.decoder_layers = nn.HybridSequential() + for i in range(num_layers): + self.decoder_layers.add( + TransformerXLDecoderLayer(units=units, + hidden_size=hidden_size, + num_heads=num_heads, + activation_dropout=activation_dropout, + dropout=dropout, + attention_dropout=attention_dropout, + layer_norm_eps=layernorm_eps, + activation=activation, + dtype=dtype, + layout=layout, + pre_norm=pre_norm, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer)) def hybrid_forward(self, F, data, mem_l, rel_positions, mask, **params): """ @@ -249,8 +236,8 @@ def hybrid_forward(self, F, data, mem_l, rel_positions, mask, **params): @use_np class TransformerXLForLM(Block): - def __init__(self, cfg=None, prefix=None, params=None): - super().__init__(prefix=prefix, params=params) + def __init__(self, cfg=None): + super().__init__() if cfg is None: cfg = TransformerXLForLM.get_cfg() else: @@ -265,53 +252,49 @@ def __init__(self, cfg=None, prefix=None, params=None): self._units = cfg.MODEL.units self._dtype = cfg.MODEL.dtype assert cfg.MODEL.units % cfg.MODEL.num_heads == 0 - with self.name_scope(): - self.word_emb = AdaptiveEmbedding(vocab_size=cfg.MODEL.vocab_size, - embed_size=cfg.MODEL.embed_units, - units=cfg.MODEL.units, - cutoffs=cfg.MODEL.cutoffs, - div_val=cfg.MODEL.div_val, - scaled=True, - embedding_initializer=embed_initializer, - weight_initializer=weight_initializer, - dtype=cfg.MODEL.dtype, - prefix='word_emb_') - self.dropout_layer = nn.Dropout(cfg.MODEL.dropout) - self.decoder = TransformerXLDecoder(num_layers=cfg.MODEL.num_layers, - units=cfg.MODEL.units, - hidden_size=cfg.MODEL.hidden_size, - num_heads=cfg.MODEL.num_heads, - activation_dropout=cfg.MODEL.activation_dropout, - dropout=cfg.MODEL.dropout, - attention_dropout=cfg.MODEL.attention_dropout, - layernorm_eps=cfg.MODEL.layernorm_eps, - activation=cfg.MODEL.activation, - dtype=cfg.MODEL.dtype, - layout=cfg.MODEL.layout, - pre_norm=cfg.MODEL.pre_norm, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - prefix='decoder_') - if cfg.MODEL.tie_weights and cfg.MODEL.tie_projs: - crit_params = self.word_emb.collect_params('(.*_embed|.*_inter_proj)') - elif cfg.MODEL.tie_weights and not cfg.MODEL.tie_projs: - crit_params = self.word_emb.collect_params('.*_embed') - elif not cfg.MODEL.tie_weights and cfg.MODEL.tie_projs: - crit_params = self.word_emb.collect_params('.*_inter_proj') - else: - crit_params = 
None - self.crit = ProjectedAdaptiveLogSoftmaxWithLoss( - vocab_size=cfg.MODEL.vocab_size, - embed_size=cfg.MODEL.embed_units, - in_units=cfg.MODEL.units, - cutoffs=cfg.MODEL.cutoffs, - div_val=cfg.MODEL.div_val, - dtype=cfg.MODEL.dtype, - use_bias=True, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - params=crit_params, - prefix='crit_') + self.word_emb = AdaptiveEmbedding(vocab_size=cfg.MODEL.vocab_size, + embed_size=cfg.MODEL.embed_units, + units=cfg.MODEL.units, + cutoffs=cfg.MODEL.cutoffs, + div_val=cfg.MODEL.div_val, + scaled=True, + embedding_initializer=embed_initializer, + weight_initializer=weight_initializer, + dtype=cfg.MODEL.dtype) + self.dropout_layer = nn.Dropout(cfg.MODEL.dropout) + self.decoder = TransformerXLDecoder(num_layers=cfg.MODEL.num_layers, + units=cfg.MODEL.units, + hidden_size=cfg.MODEL.hidden_size, + num_heads=cfg.MODEL.num_heads, + activation_dropout=cfg.MODEL.activation_dropout, + dropout=cfg.MODEL.dropout, + attention_dropout=cfg.MODEL.attention_dropout, + layernorm_eps=cfg.MODEL.layernorm_eps, + activation=cfg.MODEL.activation, + dtype=cfg.MODEL.dtype, + layout=cfg.MODEL.layout, + pre_norm=cfg.MODEL.pre_norm, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) + if cfg.MODEL.tie_weights and cfg.MODEL.tie_projs: + crit_params = self.word_emb.collect_params('(.*_embed|.*_inter_proj)') + elif cfg.MODEL.tie_weights and not cfg.MODEL.tie_projs: + crit_params = self.word_emb.collect_params('.*_embed') + elif not cfg.MODEL.tie_weights and cfg.MODEL.tie_projs: + crit_params = self.word_emb.collect_params('.*_inter_proj') + else: + crit_params = None + self.crit = ProjectedAdaptiveLogSoftmaxWithLoss( + vocab_size=cfg.MODEL.vocab_size, + embed_size=cfg.MODEL.embed_units, + in_units=cfg.MODEL.units, + cutoffs=cfg.MODEL.cutoffs, + div_val=cfg.MODEL.div_val, + dtype=cfg.MODEL.dtype, + use_bias=True, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) + self.crit.share_parameters(crit_params) @property def cfg(self): @@ -358,8 +341,8 @@ def get_cfg(cls, key=None): return config @classmethod - def from_cfg(cls, cfg, prefix=None, params=None): - return cls(cfg=cfg, prefix=prefix, params=params) + def from_cfg(cls, cfg): + return cls(cfg=cfg) @property def state_batch_axis(self): diff --git a/src/gluonnlp/optimizer.py b/src/gluonnlp/optimizer.py index 761a88e7e8..8b86f925ec 100644 --- a/src/gluonnlp/optimizer.py +++ b/src/gluonnlp/optimizer.py @@ -79,7 +79,7 @@ class AdamW(optimizer.Optimizer): """ def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, correct_bias=True, use_fused_step=True, **kwargs): - super(AdamW, self).__init__(use_fused_step=use_fused_step, + super().__init__(use_fused_step=use_fused_step, learning_rate=learning_rate, **kwargs) self.beta1 = beta1 diff --git a/src/gluonnlp/sequence_sampler.py b/src/gluonnlp/sequence_sampler.py index 4eb793b1d6..855f4030cd 100644 --- a/src/gluonnlp/sequence_sampler.py +++ b/src/gluonnlp/sequence_sampler.py @@ -97,9 +97,8 @@ class BeamSearchScorer(HybridBlock): def __init__(self, alpha: float = 1.0, K: float = 5.0, from_logits: bool = False, - temperature: float = 1.0, - prefix=None, params=None): - super().__init__(prefix=prefix, params=params) + temperature: float = 1.0): + super().__init__() self._alpha = float(alpha) self._K = K self._temperature = temperature @@ -126,7 +125,7 @@ def __call__(self, outputs, scores, step): # pylint: disable=arguments-differ The scores of all the candidates. 
Shape (d1, d2, ..., dn, V), where V is the size of the vocabulary. """ - return super(BeamSearchScorer, self).__call__(outputs, scores, step) + return super().__call__(outputs, scores, step) def hybrid_forward(self, F, outputs, scores, step): # pylint: disable=arguments-differ if not self._from_logits: @@ -248,7 +247,7 @@ def _choose_states(F, states, indices, state_batch_axis=None): class _BeamSearchStepUpdate(HybridBlock): def __init__(self, beam_size, vocab_size, eos_id, scorer, state_batch_axis, - stochastic=False, prefix=None, params=None): + stochastic=False): """ Parameters @@ -262,7 +261,7 @@ def __init__(self, beam_size, vocab_size, eos_id, scorer, state_batch_axis, prefix : None params : None """ - super(_BeamSearchStepUpdate, self).__init__(prefix=prefix, params=params) + super().__init__() self._beam_size = beam_size self._vocab_size = vocab_size self._eos_id = eos_id @@ -630,9 +629,8 @@ def __repr__(self): class _MultinomialStepUpdate(HybridBlock): def __init__(self, beam_size, vocab_size, eos_id, state_batch_axis, - sampling_topp=-1.0, sampling_topk=-1, temperature=1.0, - prefix=None, params=None): - super(_MultinomialStepUpdate, self).__init__(prefix=prefix, params=params) + sampling_topp=-1.0, sampling_topk=-1, temperature=1.0): + super().__init__() self._beam_size = beam_size self._vocab_size = vocab_size self._eos_id = eos_id diff --git a/src/gluonnlp/utils/misc.py b/src/gluonnlp/utils/misc.py index 54e8bbd265..7a1a7880a9 100644 --- a/src/gluonnlp/utils/misc.py +++ b/src/gluonnlp/utils/misc.py @@ -104,7 +104,7 @@ def step(self): 'All shapes of the tracked parameters must be given.' \ ' The shape of {} is {}, and it has not been fully initialized.' \ ' You should call step after the first forward of the model.'.format(k, v.shape) - ctx = self._track_params.list_ctx()[0] + ctx = next(iter(self._track_params.values())).list_ctx()[0] if self._average_params is None: self._average_params = OrderedDict([(k, v.data(ctx).copy()) for k, v in self._track_params.items()]) self._n_steps += 1 diff --git a/tests/test_attention_cell.py b/tests/test_attention_cell.py index 52c0479b5d..489f566beb 100644 --- a/tests/test_attention_cell.py +++ b/tests/test_attention_cell.py @@ -173,8 +173,8 @@ def test_dot_product_attention(scaled, normalized): @pytest.mark.seed(123) def test_gen_attn_mask(): class GenSelfAttnMask(HybridBlock): - def __init__(self, dtype, attn_type, prefix=None, params=None): - super(GenSelfAttnMask, self).__init__(prefix=prefix, params=params) + def __init__(self, dtype, attn_type): + super().__init__() self._dtype = dtype self._attn_type = attn_type @@ -183,8 +183,8 @@ def hybrid_forward(self, F, data, valid_length): dtype=self._dtype, attn_type=self._attn_type) class GenMemAttnMask(HybridBlock): - def __init__(self, dtype, prefix=None, params=None): - super(GenMemAttnMask, self).__init__(prefix=prefix, params=params) + def __init__(self, dtype): + super().__init__() self._dtype = dtype def hybrid_forward(self, F, mem, mem_valid_length, data, valid_length): @@ -348,9 +348,7 @@ def test_multi_head_rel_attn_score(num_heads, method, bidirectional, hybridize): score_cell.initialize() if hybridize: score_cell.hybridize() - for k, param in score_cell.collect_params().items(): - param_k = k[len(score_cell.prefix):] - param.set_data(base_score_cell.collect_params().get(param_k).data()) + score_cell.load_dict({name: param.data() for name, param in base_score_cell.collect_params().items()}) query.attach_grad() query.grad[:] = 0 with mx.autograd.record(): diff --git 
a/tests/test_gluon_block.py b/tests/test_gluon_block.py index c495f25a6a..17a2c250b1 100644 --- a/tests/test_gluon_block.py +++ b/tests/test_gluon_block.py @@ -1,7 +1,7 @@ import mxnet as mx import numpy as np from numpy.testing import assert_allclose -from mxnet.gluon import HybridBlock +from mxnet.gluon import HybridBlock, Constant from mxnet.gluon.data import DataLoader import itertools mx.npx.set_np() @@ -9,9 +9,9 @@ def test_const(): class Foo(HybridBlock): - def __init__(self, prefix=None, params=None): - super(Foo, self).__init__(prefix=prefix, params=params) - self.weight = self.params.get_constant('const', np.ones((10, 10))) + def __init__(self): + super().__init__() + self.weight = Constant(np.ones((10, 10))) def hybrid_forward(self, F, x, weight): return x, weight.astype(np.float32) @@ -35,8 +35,8 @@ def hybrid_forward(self, F, x): def test_gluon_nonzero_hybridize(): class Foo(HybridBlock): - def __init__(self, prefix=None, params=None): - super(Foo, self).__init__(prefix=prefix, params=params) + def __init__(self): + super().__init__() def hybrid_forward(self, F, x): dat = F.np._internal.nonzero(x) diff --git a/tests/test_initializer.py b/tests/test_initializer.py index 9c3a080ee7..002ab5ca0e 100644 --- a/tests/test_initializer.py +++ b/tests/test_initializer.py @@ -7,7 +7,7 @@ def test_truncnorm_string_alias_works(): try: - layer = nn.Dense(prefix="test_layer", in_units=1, units=1, weight_initializer='truncnorm') + layer = nn.Dense(in_units=1, units=1, weight_initializer='truncnorm') layer.initialize() except RuntimeError: pytest.fail('Layer couldn\'t be initialized') @@ -16,7 +16,7 @@ def test_truncnorm_string_alias_works(): def test_truncnorm_all_values_inside_boundaries(): mean = 0 std = 0.01 - layer = nn.Dense(prefix="test_layer", in_units=1, units=1000) + layer = nn.Dense(in_units=1, units=1000) layer.initialize(init=initializer.TruncNorm(mean, std)) assert (layer.weight.data() <= 2 * std).asnumpy().all() assert (layer.weight.data() >= -2 * std).asnumpy().all() @@ -27,7 +27,7 @@ def test_truncnorm_generates_values_with_defined_mean_and_std(): mean = 10 std = 5 - layer = nn.Dense(prefix="test_layer", in_units=1, units=100000) + layer = nn.Dense(in_units=1, units=100000) layer.initialize(init=initializer.TruncNorm(mean, std)) samples = layer.weight.data().reshape((-1, )).asnumpy() diff --git a/tests/test_layers.py b/tests/test_layers.py index aadc706e49..76143c8841 100644 --- a/tests/test_layers.py +++ b/tests/test_layers.py @@ -120,6 +120,7 @@ def test_adaptive_embedding(vocab_size, cutoffs, embed_size, units, div_val): [1000, None, 1.0]]) @pytest.mark.parametrize('embed_size', [128]) @pytest.mark.parametrize('in_units', [16]) +# TODO This test even passes without sharing the parameters. It needs to be improved. 
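Editor's note on the recurring pattern in these test hunks (and the `test_layers.py` hunk that continues below): with the Gluon 2.0 API this patch targets, blocks no longer accept `prefix=`/`params=` and no longer open a `name_scope()`; parameters are named after their attribute paths, and cross-block sharing moves from the `params=` constructor argument to an explicit `Block.share_parameters()` call. A minimal sketch of the idiom, assuming an MXNet 2.0 nightly build; `TinyMLP` and its attribute names are illustrative, not part of the library or of this patch:

```python
import mxnet as mx
from mxnet.gluon import nn, HybridBlock

mx.npx.set_np()


class TinyMLP(HybridBlock):
    def __init__(self, units=8):
        super().__init__()                       # no prefix=/params= in Gluon 2.0
        # No name_scope(): children are registered under their attribute names.
        self.proj = nn.Dense(units, in_units=units, flatten=False)
        self.out = nn.Dense(1, in_units=units, flatten=False)

    def hybrid_forward(self, F, x):
        return self.out(self.proj(x))


a = TinyMLP()
b = TinyMLP()
# Sharing is explicit now: hand over the parameters selected by structured name.
b.share_parameters(a.collect_params('.*proj.*'))
a.initialize()
b.initialize()
x = mx.np.ones((2, 8))
out_a, out_b = a(x), b(x)   # b reuses a's "proj" weights; the "out" layers stay independent
```

Because `collect_params()` keys are structured names such as `proj.weight` rather than prefixed global names, no prefix stripping is needed when matching parameters between two blocks of the same structure.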
def test_projected_adaptive_softmax(vocab_size, cutoffs, embed_size, in_units, div_val): layer = ProjectedAdaptiveLogSoftmaxWithLoss(vocab_size=vocab_size, cutoffs=cutoffs, embed_size=embed_size, in_units=in_units, @@ -141,23 +142,22 @@ def test_projected_adaptive_softmax(vocab_size, cutoffs, embed_size, in_units, d cutoffs=cutoffs, embed_size=embed_size, in_units=in_units, - div_val=div_val, - params=embed_layer.collect_params('.*_inter_proj')) + div_val=div_val) + layer_with_shared_proj.share_parameters(embed_layer.collect_params('.*_inter_proj')) layer_with_shared_embed = \ ProjectedAdaptiveLogSoftmaxWithLoss(vocab_size=vocab_size, cutoffs=cutoffs, embed_size=embed_size, in_units=in_units, - div_val=div_val, - params=embed_layer.collect_params('.*_embed')) + div_val=div_val) + layer_with_shared_embed.share_parameters(embed_layer.collect_params('.*_embed')) layer_with_shared_proj_embed = \ ProjectedAdaptiveLogSoftmaxWithLoss(vocab_size=vocab_size, cutoffs=cutoffs, embed_size=embed_size, in_units=in_units, - div_val=div_val, - params=embed_layer.collect_params( - '(.*_embed|.*_inter_proj)')) + div_val=div_val) + layer_with_shared_proj_embed.share_parameters(embed_layer.collect_params('(.*_embed|.*_inter_proj)')) embed_layer.initialize() embed_layer.hybridize() layer_with_shared_proj.initialize() diff --git a/tests/test_models_electra.py b/tests/test_models_electra.py index 598ff87174..8866cd7921 100644 --- a/tests/test_models_electra.py +++ b/tests/test_models_electra.py @@ -26,17 +26,14 @@ def test_bert_get_pretrained(model_name): electra_disc_model.backbone_model.load_parameters(backbone_params_path) gen_cfg = get_generator_cfg(cfg) - word_embed_params = electra_disc_model.backbone_model.word_embed.collect_params() - token_type_embed_params = electra_disc_model.backbone_model.token_pos_embed.collect_params() - token_pos_embed_params = electra_disc_model.backbone_model.token_pos_embed.collect_params() - embed_layer_norm_params = electra_disc_model.backbone_model.embed_layer_norm.collect_params() - electra_gen_model = ElectraGenerator(gen_cfg, - tied_embeddings=True, - word_embed_params=word_embed_params, - token_type_embed_params=token_type_embed_params, - token_pos_embed_params=token_pos_embed_params, - embed_layer_norm_params=embed_layer_norm_params, - ) + electra_gen_model = ElectraGenerator(gen_cfg) electra_gen_model.load_parameters(gen_params_path) - electra_gen_model = ElectraGenerator(cfg, tied_embeddings=False) + electra_gen_model.tie_embeddings( + electra_disc_model.backbone_model.word_embed.collect_params(), + electra_disc_model.backbone_model.token_type_embed.collect_params(), + electra_disc_model.backbone_model.token_pos_embed.collect_params(), + electra_disc_model.backbone_model.embed_layer_norm.collect_params()) + + + electra_gen_model = ElectraGenerator(cfg) electra_gen_model.backbone_model.load_parameters(backbone_params_path) diff --git a/tests/test_models_transformer.py b/tests/test_models_transformer.py index 1273a56baf..b1e772ce73 100644 --- a/tests/test_models_transformer.py +++ b/tests/test_models_transformer.py @@ -19,9 +19,9 @@ def test_transformer_encoder_decoder(pre_norm, num_enc_layers, num_dec_layers): tgt_seq_length = 15 units = 32 enc = TransformerEncoder(units=units, hidden_size=64, num_layers=num_enc_layers, num_heads=4, - dropout=0.0, pre_norm=pre_norm, prefix='enc_') + dropout=0.0, pre_norm=pre_norm) dec = TransformerDecoder(units=units, hidden_size=64, num_layers=num_dec_layers, num_heads=4, - dropout=0.0, pre_norm=pre_norm, prefix='dec_') + 
dropout=0.0, pre_norm=pre_norm) enc.hybridize() dec.hybridize() enc.initialize() diff --git a/tests/test_models_transformer_xl.py b/tests/test_models_transformer_xl.py index 0f8fd4e0ec..f10a9aab66 100644 --- a/tests/test_models_transformer_xl.py +++ b/tests/test_models_transformer_xl.py @@ -27,17 +27,16 @@ def test_transformer_xl_for_lm(cutoffs, div_val, mem_length, query_length): cfg.MODEL.activation_dropout = 0.0 cfg.MODEL.attention_dropout = 0.0 cfg.freeze() - nt_model = TransformerXLForLM(cfg, prefix='transformer_xl_nt_') + nt_model = TransformerXLForLM(cfg) nt_model.initialize() tn_cfg = cfg.clone() tn_cfg.defrost() tn_cfg.MODEL.layout = 'TN' - tn_model = TransformerXLForLM(tn_cfg, prefix='transformer_xl_tn_') + tn_model = TransformerXLForLM(tn_cfg) tn_model.initialize() - for k, param in tn_model.collect_params().items(): - param_k = k[len(tn_model.prefix):] - param.set_data(nt_model.collect_params().get(param_k).data()) + for name, param in tn_model.collect_params().items(): + param.set_data(nt_model.collect_params().get(name).data()) assert_allclose(sum( mx.np.linalg.norm(param.data()).asnumpy() for param in nt_model.collect_params().values()), sum(mx.np.linalg.norm(param.data()).asnumpy() for param in @@ -66,8 +65,8 @@ def test_transformer_xl_for_lm(cutoffs, div_val, mem_length, query_length): loss = tn_logits.sum() loss.backward() assert_allclose(tn_logits.T.asnumpy(), nt_logits.asnumpy(), 1E-5, 1E-5) - for k, tn_param in tn_model.collect_params().items(): - nt_param = nt_model.collect_params().get(k[len(tn_model.prefix):]) + for name, tn_param in tn_model.collect_params().items(): + nt_param = nt_model.collect_params().get(name) if nt_param.grad_req != 'null': assert_allclose(nt_param.grad().asnumpy(), tn_param.grad().asnumpy(), 1E-4, 1E-4) diff --git a/tests/test_sequence_sampler.py b/tests/test_sequence_sampler.py index 777fa85e16..317c81568f 100644 --- a/tests/test_sequence_sampler.py +++ b/tests/test_sequence_sampler.py @@ -40,8 +40,8 @@ def test_beam_search_score(length, alpha, K, batch_size, vocab_size, from_logits @pytest.mark.parametrize('early_return', [False, True]) def test_beam_search(early_return): class SimpleStepDecoder(HybridBlock): - def __init__(self, vocab_size=5, hidden_units=4, prefix=None, params=None): - super(SimpleStepDecoder, self).__init__(prefix=prefix, params=params) + def __init__(self, vocab_size=5, hidden_units=4): + super().__init__() self.x2h_map = nn.Embedding(input_dim=vocab_size, output_dim=hidden_units) self.h2h_map = nn.Dense(units=hidden_units, flatten=False) self.vocab_map = nn.Dense(units=vocab_size, flatten=False) @@ -98,8 +98,8 @@ def hybrid_forward(self, F, data, state): @pytest.mark.parametrize('early_return', [False, True]) def test_beam_search_stochastic(early_return): class SimpleStepDecoder(HybridBlock): - def __init__(self, vocab_size=5, hidden_units=4, prefix=None, params=None): - super(SimpleStepDecoder, self).__init__(prefix=prefix, params=params) + def __init__(self, vocab_size=5, hidden_units=4): + super().__init__() self.x2h_map = nn.Embedding(input_dim=vocab_size, output_dim=hidden_units) self.h2h_map = nn.Dense(units=hidden_units, flatten=False) self.vocab_map = nn.Dense(units=vocab_size, flatten=False) @@ -164,8 +164,8 @@ def hybrid_forward(self, F, data, state): @pytest.mark.parametrize('sampling_paras', [(-1.0, -1), (0.05, -1), (-1.0, 1), (-1.0, 3)]) def test_multinomial_sampling(early_return, sampling_paras): class SimpleStepDecoder(HybridBlock): - def __init__(self, vocab_size=5, hidden_units=4, prefix=None, 
params=None): - super(SimpleStepDecoder, self).__init__(prefix=prefix, params=params) + def __init__(self, vocab_size=5, hidden_units=4): + super().__init__() self.x2h_map = nn.Embedding(input_dim=vocab_size, output_dim=hidden_units) self.h2h_map = nn.Dense(units=hidden_units, flatten=False) self.vocab_map = nn.Dense(units=vocab_size, flatten=False) diff --git a/tests/test_utils_misc.py b/tests/test_utils_misc.py index 78543b496f..5f594ea9db 100644 --- a/tests/test_utils_misc.py +++ b/tests/test_utils_misc.py @@ -18,10 +18,8 @@ def test_average_sgd_tracker(): moving_avg_param = None net_final_moving_avg_param = None for use_moving_avg in [False, True]: - net = nn.HybridSequential(prefix='net_') - with net.name_scope(): - net.add(nn.Dense(10)) - net.add(nn.Dense(3)) + net = nn.HybridSequential() + net.add(nn.Dense(10), nn.Dense(3)) net.initialize(init=mx.init.One()) net.hybridize() trainer = mx.gluon.Trainer(net.collect_params(), 'adam')
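Two further idioms used by the hunks above, shown as a standalone sketch (again assuming an MXNet 2.0 nightly; `TiedLM` and its layer names are hypothetical, chosen only for illustration): tying an output projection to an embedding by assigning the same `Parameter` object, and copying parameters between two structurally identical models via `load_dict` keyed by structured names, as the `test_attention_cell.py` hunk does.

```python
import mxnet as mx
from mxnet.gluon import nn, HybridBlock

mx.npx.set_np()


class TiedLM(HybridBlock):
    def __init__(self, vocab_size=32, units=16):
        super().__init__()
        self.embed = nn.Embedding(vocab_size, units)
        self.proj = nn.Dense(vocab_size, in_units=units,
                             flatten=False, use_bias=False)
        # Weight tying: reuse the embedding matrix as the output projection,
        # replacing the old params=embed.collect_params() constructor argument.
        self.proj.weight = self.embed.weight

    def hybrid_forward(self, F, tokens):
        return self.proj(self.embed(tokens))


src = TiedLM()
dst = TiedLM()
src.initialize()
dst.initialize()
# Copy parameters by structured name; no prefix handling is required.
dst.load_dict({name: p.data() for name, p in src.collect_params().items()})
tokens = mx.np.array([[1, 2, 3]], dtype='int32')
assert mx.np.abs(src(tokens) - dst(tokens)).max() == 0
```

Assigning the `Parameter` attribute keeps a single storage for the tied weights, so gradients accumulate in one place, which is why the patch drops the `params=` plumbing from `ElectraGenerator`, `TransformerNMTModel`, and the adaptive-softmax layers in favor of assignment or `share_parameters`.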