This repository was archived by the owner on Jan 15, 2024. It is now read-only.

Update for Block API #1261

Merged
merged 7 commits into from Jul 17, 2020
Changes from all commits
4 changes: 2 additions & 2 deletions .github/workflows/unittests.yml
@@ -33,9 +33,9 @@ jobs:
- name: Install Other Dependencies
run: |
python -m pip install --user --upgrade pip
python -m pip install --user setuptools pytest pytest-cov
python -m pip install --user setuptools pytest pytest-cov contextvars
python -m pip install --upgrade cython
python -m pip install --pre --user "mxnet>=2.0.0b20200604,<=2.0.0b20200619" -f https://dist.mxnet.io/python
python -m pip install --pre --user "mxnet>=2.0.0b20200716" -f https://dist.mxnet.io/python
python -m pip install --user -e .[extras]
- name: Test project
run: |
4 changes: 2 additions & 2 deletions README.md
@@ -21,10 +21,10 @@ First of all, install the latest MXNet. You may use the following commands:
```bash

# Install the version with CUDA 10.1
pip install -U --pre mxnet-cu101>=2.0.0b20200604 -f https://dist.mxnet.io/python
pip install -U --pre mxnet-cu101>=2.0.0b20200716 -f https://dist.mxnet.io/python

# Install the cpu-only version
pip install -U --pre mxnet>=2.0.0b20200604 -f https://dist.mxnet.io/python
pip install -U --pre mxnet>=2.0.0b20200716 -f https://dist.mxnet.io/python
```


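One note on the commands above: in most shells the version specifier needs quoting (as the CI workflow does, e.g. `"mxnet>=2.0.0b20200716"`), otherwise `>=` is treated as a redirection and pip only sees `mxnet`. A quick sanity check that the nightly required by this PR is the one actually imported (a minimal sketch; the exact version string depends on the installed wheel):

```python
import mxnet as mx

# The Block API changes in this PR assume an MXNet 2.0 nightly from 2020-07-16 or newer.
print(mx.__version__)
assert mx.__version__.startswith('2.'), 'expected an MXNet 2.0 nightly build'
```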
7 changes: 3 additions & 4 deletions scripts/conversion_toolkits/convert_electra.py
@@ -265,11 +265,11 @@ def convert_tf_model(model_dir, save_dir, test_conversion, model_size, gpu, elec
assert_allclose(tf_params[k], backbone_params[k])

# Build gluon model and initialize
gluon_model = ElectraModel.from_cfg(cfg, prefix='electra_')
gluon_model = ElectraModel.from_cfg(cfg)
gluon_model.initialize(ctx=ctx)
gluon_model.hybridize()

gluon_disc_model = ElectraDiscriminator(cfg, prefix='electra_')
gluon_disc_model = ElectraDiscriminator(cfg)
gluon_disc_model.initialize(ctx=ctx)
gluon_disc_model.hybridize()

@@ -283,8 +283,7 @@ def convert_tf_model(model_dir, save_dir, test_conversion, model_size, gpu, elec
word_embed_params=word_embed_params,
token_type_embed_params=token_type_embed_params,
token_pos_embed_params=token_pos_embed_params,
embed_layer_norm_params=embed_layer_norm_params,
prefix='generator_')
embed_layer_norm_params=embed_layer_norm_params)
gluon_gen_model.initialize(ctx=ctx)
gluon_gen_model.hybridize()

2 changes: 1 addition & 1 deletion scripts/conversion_toolkits/convert_mobilebert.py
@@ -270,7 +270,7 @@ def convert_tf_model(model_dir, save_dir, test_conversion, gpu, mobilebert_dir):
gluon_model.initialize(ctx=ctx)
gluon_model.hybridize()

gluon_pretrain_model = MobileBertForPretrain(cfg, prefix='')
gluon_pretrain_model = MobileBertForPretrain(cfg)
gluon_pretrain_model.initialize(ctx=ctx)
gluon_pretrain_model.hybridize()

2 changes: 1 addition & 1 deletion scripts/conversion_toolkits/convert_tf_hub_model.py
@@ -358,7 +358,7 @@ def convert_tf_model(hub_model_dir, save_dir, test_conversion, model_type, gpu):
gluon_model = PretrainedModel.from_cfg(cfg, prefix='', use_pooler=True)
gluon_model.initialize(ctx=ctx)
gluon_model.hybridize()
gluon_mlm_model = PretrainedMLMModel(backbone_cfg=cfg, prefix='')
gluon_mlm_model = PretrainedMLMModel(backbone_cfg=cfg)
gluon_mlm_model.initialize(ctx=ctx)
gluon_mlm_model.hybridize()

4 changes: 2 additions & 2 deletions scripts/machine_translation/train_transformer.py
@@ -357,7 +357,7 @@ def train(args):
for v in model.collect_params().values():
if v.grad_req != 'null':
v.grad_req = 'add'
model.collect_params().zero_grad()
model.zero_grad()
model_averager = AverageSGDTracker(model.collect_params())
log_start_time = time.time()
num_params, num_fixed_params = None, None
@@ -422,7 +422,7 @@ def train(args):
trainer.step(loss_denom.asnumpy() / rescale_loss)
accum_count = 0
loss_denom = 0
model.collect_params().zero_grad()
model.zero_grad()
if (args.epochs > 0 and epoch_id >= args.epochs - args.num_averages) or \
(args.max_update > 0 and n_train_iters >= args.max_update - args.num_averages * args.save_interval_update):
model_averager.step()
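For context, `zero_grad()` can now be called on the block itself in the targeted MXNet 2.0 nightlies rather than going through `collect_params()`. A minimal, self-contained sketch of the gradient-accumulation pattern used above (toy model and data; `num_accumulated` is an illustrative value, not a flag from this script):

```python
import mxnet as mx
from mxnet.gluon import nn

net = nn.Dense(1)
net.initialize()
trainer = mx.gluon.Trainer(net.collect_params(), 'adam')

# Accumulate gradients across micro-batches instead of overwriting them.
for p in net.collect_params().values():
    if p.grad_req != 'null':
        p.grad_req = 'add'
net.zero_grad()                        # clear accumulated gradients before training

num_accumulated = 4                    # illustrative accumulation length
for step in range(16):
    x = mx.nd.random.normal(shape=(8, 4))
    y = mx.nd.random.normal(shape=(8, 1))
    with mx.autograd.record():
        loss = ((net(x) - y) ** 2).mean()
    loss.backward()
    if (step + 1) % num_accumulated == 0:
        trainer.step(num_accumulated)  # one optimizer update per accumulation window
        net.zero_grad()                # reset accumulated gradients for the next window
```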
3 changes: 1 addition & 2 deletions scripts/pretraining/run_electra.py
@@ -155,8 +155,7 @@ def get_pretraining_model(model_name, ctx_l,
tied_generator=False,
tied_embeddings=True,
disallow_correct=False,
weight_initializer=TruncNorm(stdev=0.02),
prefix='Pretrain_')
weight_initializer=TruncNorm(stdev=0.02))
model.initialize(ctx=ctx_l)
model.hybridize()
return cfg, tokenizer, model
72 changes: 30 additions & 42 deletions scripts/question_answering/models.py
@@ -14,15 +14,12 @@ class ModelForQABasic(HybridBlock):
another dense layer to map the contextual embeddings to the start scores and end scores.

"""
def __init__(self, backbone, weight_initializer=None, bias_initializer=None,
prefix=None, params=None):
super().__init__(prefix=prefix, params=params)
with self.name_scope():
self.backbone = backbone
self.qa_outputs = nn.Dense(units=2, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer,
prefix='qa_outputs_')
def __init__(self, backbone, weight_initializer=None, bias_initializer=None):
super().__init__()
self.backbone = backbone
self.qa_outputs = nn.Dense(units=2, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer)

def hybrid_forward(self, F, tokens, token_types, valid_length, p_mask):
"""
@@ -77,39 +74,30 @@ class ModelForQAConditionalV1(HybridBlock):

"""
def __init__(self, backbone, units=768, layer_norm_eps=1E-12, dropout_prob=0.1,
activation='tanh', weight_initializer=None, bias_initializer=None,
prefix=None, params=None):
super().__init__(prefix=prefix, params=params)
with self.name_scope():
self.backbone = backbone
self.start_scores = nn.Dense(1, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer,
prefix='start_scores_')
self.end_scores = nn.HybridSequential(prefix='end_scores_')
with self.end_scores.name_scope():
self.end_scores.add(nn.Dense(units, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer,
prefix='mid_'))
self.end_scores.add(get_activation(activation))
self.end_scores.add(nn.LayerNorm(epsilon=layer_norm_eps))
self.end_scores.add(nn.Dense(1, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer,
prefix='out_'))
self.answerable_scores = nn.HybridSequential(prefix='answerable_scores_')
with self.answerable_scores.name_scope():
self.answerable_scores.add(nn.Dense(units, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer,
prefix='mid_'))
self.answerable_scores.add(get_activation(activation))
self.answerable_scores.add(nn.Dropout(dropout_prob))
self.answerable_scores.add(nn.Dense(2, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer,
prefix='out_'))
activation='tanh', weight_initializer=None, bias_initializer=None):
super().__init__()
self.backbone = backbone
self.start_scores = nn.Dense(1, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer)
self.end_scores = nn.HybridSequential()
self.end_scores.add(nn.Dense(units, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.end_scores.add(get_activation(activation))
self.end_scores.add(nn.LayerNorm(epsilon=layer_norm_eps))
self.end_scores.add(nn.Dense(1, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.answerable_scores = nn.HybridSequential()
self.answerable_scores.add(nn.Dense(units, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))
self.answerable_scores.add(get_activation(activation))
self.answerable_scores.add(nn.Dropout(dropout_prob))
self.answerable_scores.add(nn.Dense(2, flatten=False,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer))

def get_start_logits(self, F, contextual_embedding, p_mask):
"""
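The rewrite above is the recurring pattern in this PR: with the Gluon 2.0 Block API, constructors no longer take `prefix`/`params`, `name_scope()` is gone, and child blocks are registered simply by assigning them as attributes. A condensed before/after sketch (layer sizes invented for illustration):

```python
from mxnet.gluon import nn, HybridBlock

class ScoreHead(HybridBlock):
    """After the migration: no prefix/params arguments, no name_scope()."""
    def __init__(self, units=8, weight_initializer=None, bias_initializer=None):
        super().__init__()
        self.hidden = nn.Dense(units, flatten=False,
                               weight_initializer=weight_initializer,
                               bias_initializer=bias_initializer)
        self.scores = nn.Dense(1, flatten=False,
                               weight_initializer=weight_initializer,
                               bias_initializer=bias_initializer)

    def hybrid_forward(self, F, x):
        return self.scores(self.hidden(x))

head = ScoreHead()

# The pre-2.0 equivalent would have looked roughly like:
#   def __init__(self, ..., prefix=None, params=None):
#       super().__init__(prefix=prefix, params=params)
#       with self.name_scope():
#           self.hidden = nn.Dense(..., prefix='hidden_')
```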
8 changes: 3 additions & 5 deletions scripts/question_answering/run_squad.py
@@ -324,8 +324,7 @@ def get_network(model_name,
backbone_params_path, num_params, num_fixed_params))
qa_net = ModelForQAConditionalV1(backbone=backbone,
dropout_prob=dropout,
weight_initializer=TruncNorm(stdev=0.02),
prefix='qa_net_')
weight_initializer=TruncNorm(stdev=0.02))
if checkpoint_path is None:
# Ignore the UserWarning during initialization,
# There is no need to re-initialize the parameters of backbone
@@ -529,7 +528,7 @@ def train(args):
log_sample_num = 0
if args.num_accumulated != 1:
# set grad to zero for gradient accumulation
qa_net.collect_params().zero_grad()
qa_net.zero_grad()
global_tic = time.time()
while not finish_flag:
epoch_tic = time.time()
@@ -594,7 +593,7 @@ def train(args):
step_num += 1
if args.num_accumulated != 1:
# set grad to zero for gradient accumulation
qa_net.collect_params().zero_grad()
qa_net.zero_grad()

# saving
if step_num % save_interval == 0 or step_num >= num_train_steps:
@@ -964,7 +963,6 @@ def eval_validation(ckpt_name, best_eval):

if __name__ == '__main__':
os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round'
os.environ['MXNET_USE_FUSION'] = '0' # Manually disable pointwise fusion
args = parse_args()
logging_config(args.output_dir, name='finetune_squad{}'.format(args.version))
set_seed(args.seed)
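On the "Ignore the UserWarning during initialization" comment above: the comment suggests the backbone parameters are already populated when `initialize()` runs, so Gluon's "already initialized, ignoring" warning is expected and harmless. A self-contained sketch of silencing it locally (a toy block stands in for `qa_net`):

```python
import warnings
import mxnet as mx
from mxnet.gluon import nn

head = nn.Dense(2)        # stand-in for the freshly constructed QA head
head.initialize()         # first initialization

with warnings.catch_warnings():
    warnings.simplefilter('ignore', UserWarning)
    # A second initialize() (without force_reinit) warns that parameters are
    # already initialized and leaves them untouched; the warning is suppressed here.
    head.initialize(ctx=mx.cpu())
```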
91 changes: 41 additions & 50 deletions src/gluonnlp/attention_cell.py
@@ -601,9 +601,8 @@ class MultiHeadAttentionCell(HybridBlock):
"""
def __init__(self, query_units=None, num_heads=None, attention_dropout=0.0,
scaled: bool = True, normalized: bool = False, eps: float = 1E-6,
dtype='float32', layout='NTK', use_einsum=False,
prefix=None, params=None):
super().__init__(prefix=prefix, params=params)
dtype='float32', layout='NTK', use_einsum=False):
super().__init__()
self._query_units = query_units
self._num_heads = num_heads
self._attention_dropout = attention_dropout
@@ -705,8 +704,7 @@ def __init__(self, query_units,
dropout: float = 0.0,
dtype='float32',
layout='NTK',
use_einsum=False,
prefix=None, params=None):
use_einsum=False):
"""

Parameters
@@ -725,10 +723,8 @@ def __init__(self, query_units,
scaled
dtype
layout
prefix
params
"""
super().__init__(prefix=prefix, params=params)
super().__init__()
self._dropout = dropout
self._method = method
self._query_units = query_units
@@ -744,49 +740,44 @@ def __init__(self, query_units,
self._layout = layout
if self._layout not in ['NKT', 'NTK', 'TNK']:
raise ValueError('layout="{}" is not supported'.format(self._layout))
with self.name_scope():
if method == 'transformer_xl':
if pos_embed_units is None:
pos_embed_units = self._num_heads * self._head_query_units
self._rel_pos_embed = SinusoidalPositionalEmbedding(units=pos_embed_units,
prefix='rel_pos_embed_',
dtype=self._dtype)
self._rel_proj = nn.Dense(units=query_units,
in_units=pos_embed_units,
flatten=False,
use_bias=False,
prefix='rel_proj_',
dtype=self._dtype)
self._dropout_layer = nn.Dropout(dropout)
elif method == 'shaw':
assert self._max_distance is not None, 'Must set max_distance when method="shaw".'
if self._bidirectional:
vocab_size = self._max_distance * 2 + 1
else:
vocab_size = self._max_distance + 1
self._rel_pos_embed = LearnedPositionalEmbedding(
units=self._num_heads * self._head_query_units,
max_length=vocab_size,
weight_initializer=mx.init.Xavier(rnd_type="gaussian",
factor_type="in",
magnitude=1),
prefix='rel_pos_embed_',
mode='wrap' if self._bidirectional else 'raise',
dtype=self._dtype)
elif method == 't5':
if self._num_buckets is None:
self._num_buckets = 32
if self._max_distance is None:
self._max_distance = 128
self._rel_pos_embed = BucketPositionalEmbedding(
units=num_heads,
num_buckets=self._num_buckets,
max_distance=self._max_distance,
bidirectional=self._bidirectional,
prefix='rel_pos_embed_',
dtype=self._dtype)
if method == 'transformer_xl':
if pos_embed_units is None:
pos_embed_units = self._num_heads * self._head_query_units
self._rel_pos_embed = SinusoidalPositionalEmbedding(units=pos_embed_units,
dtype=self._dtype)
self._rel_proj = nn.Dense(units=query_units,
in_units=pos_embed_units,
flatten=False,
use_bias=False,
dtype=self._dtype)
self._dropout_layer = nn.Dropout(dropout)
elif method == 'shaw':
assert self._max_distance is not None, 'Must set max_distance when method="shaw".'
if self._bidirectional:
vocab_size = self._max_distance * 2 + 1
else:
vocab_size = self._max_distance + 1
self._rel_pos_embed = LearnedPositionalEmbedding(
units=self._num_heads * self._head_query_units,
max_length=vocab_size,
weight_initializer=mx.init.Xavier(rnd_type="gaussian",
factor_type="in",
magnitude=1),
mode='wrap' if self._bidirectional else 'raise',
dtype=self._dtype)
elif method == 't5':
if self._num_buckets is None:
self._num_buckets = 32
if self._max_distance is None:
self._max_distance = 128
self._rel_pos_embed = BucketPositionalEmbedding(
units=num_heads,
num_buckets=self._num_buckets,
max_distance=self._max_distance,
bidirectional=self._bidirectional,
dtype=self._dtype)
else:
raise NotImplementedError('method="{}" is currently not supported!'.format(method))

def hybrid_forward(self, F, rel_positions, query=None):
"""
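With `name_scope()` and explicit prefixes removed, child blocks are registered through attribute assignment and parameter names follow the attribute path rather than a prefix string. A small sketch of inspecting the resulting names (the exact key format depends on the MXNet 2.0 nightly in use, so treat the strings in the comment as illustrative):

```python
from mxnet.gluon import nn, HybridBlock

class TinyCell(HybridBlock):
    def __init__(self):
        super().__init__()
        self.rel_proj = nn.Dense(4, flatten=False, use_bias=False)
        self.dropout_layer = nn.Dropout(0.1)

    def hybrid_forward(self, F, x):
        return self.dropout_layer(self.rel_proj(x))

cell = TinyCell()
# Parameter keys are derived from the attribute hierarchy,
# e.g. something like 'rel_proj.weight' rather than a 'rel_proj0_' prefix.
print(list(cell.collect_params().keys()))
```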
4 changes: 2 additions & 2 deletions src/gluonnlp/data/loading.py
@@ -81,7 +81,7 @@ def __init__(self, filename, **kwargs):
else:
raise ValueError('Unsupported extension: %s' % filename)
self._keys = keys
super(NumpyDataset, self).__init__(*data)
super().__init__(*data)

@property
def keys(self):
@@ -125,7 +125,7 @@ def __init__(self, file_pattern):
files = sorted(files)
if len(files) == 0:
raise ValueError('Cannot find any file with path "%s"' % file_pattern)
super(_PathDataset, self).__init__(files)
super().__init__(files)


def _dataset_worker_fn(urls, dataset_fn, batch_sampler_fn):
2 changes: 1 addition & 1 deletion src/gluonnlp/initializer.py
@@ -51,7 +51,7 @@ class TruncNorm(Initializer):
"""
def __init__(self, mean: float = 0, stdev: float = 0.01,
scale=2, **kwargs):
super(TruncNorm, self).__init__(**kwargs)
super().__init__(**kwargs)
self._mean = mean
self._stdev = stdev
self._scale = scale
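For completeness, this is the initializer passed as `weight_initializer=TruncNorm(stdev=0.02)` in the scripts touched above. A minimal usage sketch (layer size chosen arbitrarily):

```python
from mxnet.gluon import nn
from gluonnlp.initializer import TruncNorm

# TruncNorm(mean, stdev, scale): normal initialization, truncated according to `scale`.
dense = nn.Dense(16, flatten=False, weight_initializer=TruncNorm(mean=0, stdev=0.02))
dense.initialize()
```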