diff --git a/README.md b/README.md index 7a5fcf1902..148ca40031 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ process the text data, and train models. # Features -- Easy-to-use Text Processing Tools and APIs +- Easy-to-use Text Processing Tools and Modular APIs - Pretrained Model Zoo - Write Models with Numpy-like API - Fast Inference via [Apache TVM (incubating)](https://tvm.apache.org/) (Experimental) @@ -28,16 +28,16 @@ First of all, install the latest MXNet. You may use the following commands: ```bash # Install the version with CUDA 10.0 -python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200926" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20201101" -f https://dist.mxnet.io/python # Install the version with CUDA 10.1 -python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200926" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20201101" -f https://dist.mxnet.io/python # Install the version with CUDA 10.2 -python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20201101" -f https://dist.mxnet.io/python # Install the cpu-only version -python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet>=2.0.0b20201101" -f https://dist.mxnet.io/python ``` diff --git a/scripts/benchmarks/benchmark_gluonnlp.py b/scripts/benchmarks/benchmark_gluonnlp.py index 350c3411f3..1e7bf2913e 100644 --- a/scripts/benchmarks/benchmark_gluonnlp.py +++ b/scripts/benchmarks/benchmark_gluonnlp.py @@ -58,13 +58,14 @@ def get_parser(): help='Whether to use TVM for inference/training') parser.add_argument('--instance_type', choices=['c4', 'c5', 'g4', 'p3'], default='g4', help='The instance type that the profiling script will be run on.') + parser.add_argument('--use_fp16', action='store_true') parser.add_argument('--mode', type=str, default='train', choices=['train', 'inference']) return parser def run_benchmark(workload, model_name, out_file_name, is_train, - use_tvm, instance_type): + use_tvm, instance_type, use_fp16): if is_train: benchmark = GluonNLPBackboneBenchmark( workloads=workload, @@ -72,6 +73,7 @@ def run_benchmark(workload, model_name, out_file_name, is_train, profile_inference=False, profile_train=True, to_csv=True, + use_fp16=use_fp16, train_out_csv_file=out_file_name) benchmark.run() else: @@ -83,6 +85,7 @@ def run_benchmark(workload, model_name, out_file_name, is_train, use_tvm=use_tvm, instance_type=instance_type, to_csv=True, + use_fp16=use_fp16, inference_out_csv_file=out_file_name) benchmark.run() return @@ -94,13 +97,15 @@ def run_benchmark(workload, model_name, out_file_name, is_train, args = parser.parse_args() if args.compute_layout is None: args.compute_layout = args.layout + dtype = 'float32' if not args.use_fp16 else 'float16' for layout, compute_layout in [(args.layout, args.compute_layout)]: if compute_layout != layout: profile_models = [ele for ele in MODELS if 'bart' not in ele] else: profile_models = [ele for ele in MODELS] if args.mode == 'inference': - out_dir = 'infer_fp32_{}_{}_tvm{}'.format(layout, compute_layout, int(args.use_tvm)) + out_dir = 'infer_{}_{}_{}_tvm{}'.format(dtype, layout, compute_layout, + int(args.use_tvm)) df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length', 'latency', 'memory']) os.makedirs(out_dir, exist_ok=True) @@ -111,16 +116,17 @@ def run_benchmark(workload, model_name, out_file_name, is_train, 
process = Process( target=run_benchmark, args=(workload, model_name, out_path, False, - args.use_tvm, args.instance_type)) + args.use_tvm, args.instance_type, args.use_fp16)) process.start() process.join() new_df = pd.read_csv(out_path) df = df.append(new_df, ignore_index=True) - df.to_csv('gluonnlp_infer_fp32_{}_{}_tvm{}.csv'.format(layout, + df.to_csv('gluonnlp_infer_{}_{}_{}_tvm{}.csv'.format(dtype, + layout, compute_layout, int(args.use_tvm))) elif args.mode == 'train': - out_dir = 'train_fp32_{}_{}'.format(layout, compute_layout) + out_dir = 'train_{}_{}_{}'.format(dtype, layout, compute_layout) df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length', 'latency', 'memory']) os.makedirs(out_dir, exist_ok=True) @@ -130,11 +136,12 @@ def run_benchmark(workload, model_name, out_file_name, is_train, workload[1])) process = Process( target=run_benchmark, - args=(workload, model_name, out_path, True)) + args=(workload, model_name, out_path, True, False, + args.instance_type, args.use_fp16)) process.start() process.join() new_df = pd.read_csv(out_path) df = df.append(new_df, ignore_index=True) - df.to_csv('gluonnlp_train_fp32_{}_{}.csv'.format(layout, compute_layout)) + df.to_csv('gluonnlp_train_{}_{}_{}.csv'.format(dtype, layout, compute_layout)) else: raise NotImplementedError diff --git a/scripts/benchmarks/benchmark_gluonnlp_fp16.sh b/scripts/benchmarks/benchmark_gluonnlp_fp16.sh new file mode 100644 index 0000000000..784e73dc7a --- /dev/null +++ b/scripts/benchmarks/benchmark_gluonnlp_fp16.sh @@ -0,0 +1,14 @@ +for mode in train inference +do + python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode --use_fp16 +done + +for mode in train inference +do + python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode --use_fp16 +done + +for mode in train inference +do + python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode --use_fp16 +done diff --git a/scripts/benchmarks/benchmark_utils.py b/scripts/benchmarks/benchmark_utils.py index b4f32ff601..65e22b189b 100644 --- a/scripts/benchmarks/benchmark_utils.py +++ b/scripts/benchmarks/benchmark_utils.py @@ -748,7 +748,6 @@ def __init__(self, workloads, model_names, use_fp16=False, self._inference_out_csv_file = inference_out_csv_file self._train_out_csv_file = train_out_csv_file self._env_info_file = env_info_file - assert use_fp16 is False, 'Currently fp16 benchmark has not been supported yet.' 
@property def model_names(self): @@ -760,22 +759,26 @@ def workloads(self): def _inference_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\ -> Tuple[float, Memory]: + if self._use_fp16: + dtype = 'float16' + else: + dtype = 'float32' if self._use_gpu: ctx = mxnet.gpu() else: ctx = mxnet.cpu() model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name) - # TODO Support fp16 profiling cfg.defrost() cfg.MODEL.layout = self._layout if model_cls.__name__ not in ['BartModel']: cfg.MODEL.compute_layout = self._compute_layout cfg.freeze() if model_cls.__name__ in ['BartModel']: - model = model_cls.from_cfg(cfg, extract_feature=True) + model = model_cls.from_cfg(cfg, extract_feature=True, dtype=dtype) else: - model = model_cls.from_cfg(cfg) - model.load_parameters(backbone_param_path, ctx=ctx) + model = model_cls.from_cfg(cfg, dtype=dtype) + model.load_parameters(backbone_param_path, ctx=ctx, cast_dtype=True) + model.cast(dtype) model.hybridize() vocab_size = cfg.MODEL.vocab_size if self._layout == 'NT': @@ -860,12 +863,15 @@ def run_tvm_forward(): def _train_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\ -> Tuple[float, Memory]: + if self._use_fp16: + from mxnet import amp + amp.init() + if self._use_gpu: ctx = mxnet.gpu() else: ctx = mxnet.cpu() model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name) - # TODO Support fp16 profiling cfg.defrost() cfg.MODEL.layout = self._layout if model_cls.__name__ not in ['BartModel']: diff --git a/scripts/machine_translation/README.md b/scripts/machine_translation/README.md index 4b729cc117..c0d37981a2 100644 --- a/scripts/machine_translation/README.md +++ b/scripts/machine_translation/README.md @@ -30,6 +30,7 @@ python3 train_transformer.py \ --save_dir transformer_base_wmt2014_en_de_${SUBWORD_ALGO} \ --cfg transformer_base \ --lr 0.002 \ + --num_accumulated 32 \ --sampler BoundedBudgetSampler \ --max_num_tokens 2700 \ --epochs 30 \ diff --git a/scripts/machine_translation/train_transformer.py b/scripts/machine_translation/train_transformer.py index b51c9858d0..4e55c10675 100644 --- a/scripts/machine_translation/train_transformer.py +++ b/scripts/machine_translation/train_transformer.py @@ -441,8 +441,10 @@ def train(args): for sample_data, ctx in zip(sample_data_l, ctx_l): if sample_data is None: continue - src_token_ids, tgt_token_ids, src_valid_length, tgt_valid_length, sample_ids = sample_data - src_wc, tgt_wc, bs = src_valid_length.sum(), tgt_valid_length.sum(), src_token_ids.shape[0] + src_token_ids, tgt_token_ids, src_valid_length,\ + tgt_valid_length, sample_ids = sample_data + src_wc, tgt_wc, bs = src_valid_length.sum(),\ + tgt_valid_length.sum(), src_token_ids.shape[0] loss_denom += tgt_wc - bs log_loss_denom += tgt_wc - bs log_wc += src_wc + tgt_wc diff --git a/scripts/question_answering/README.md b/scripts/question_answering/README.md index 9f4065dbab..1dbd5377a8 100644 --- a/scripts/question_answering/README.md +++ b/scripts/question_answering/README.md @@ -84,6 +84,10 @@ horovodrun -np 4 -H localhost:4 python3 run_squad.py \ ... ``` +### Using AMP + +Just add `--dtype float16` if you'd like to use AMP for training and half-precision for inference. + ### Finetuning Details As for ELECTRA model, we fine-tune it with layer-wise learning rate decay as @@ -145,16 +149,16 @@ Performance are shown in the table below, in which the SQuAD1.1 are evaluated wi Notice that the standard metrics of SQuAD are `EM/F1`. 
The former is an exact match score between predictions and references, while the latter is a token-level F1 score in which the common tokens are considered as True Positives. -|Reproduced ALBERT Models (F1/EM) | SQuAD 1.1 dev | SQuAD 2.0 dev | SQuAD 2.0 Results File | Log | Command | -|----------------------------------|---------------|---------------|------|-----| --------| -|ALBERT base | 90.55/83.83 | 82.09/79.40 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_base_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_base_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_base.sh) | -|ALBERT large | 92.66/86.43 | 84.98/82.19 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_large_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_large_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_large.sh) | -|ALBERT xlarge | 93.85/87.71 | 87.92/85.04 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xlarge_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xlarge_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_xlarge.sh) | -|ALBERT xxlarge | 95.00/89.01 | 89.91/86.87 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xxlarge_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xxlarge_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_xxlarge.sh) | +|Reproduced ALBERT Models (F1/EM) | SQuAD 1.1 dev | SQuAD 2.0 dev | SQuAD 2.0 Results File | Log | Command | Weight | +|----------------------------------|---------------|---------------|------|-----|---------|----------| +|ALBERT base | 90.55/83.83 | 82.57/79.75 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_base/fintune_google_albert_base_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_base/fintune_google_albert_base_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_base.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_base/fintune_google_albert_base_v2_squad_2.0/google_albert_base_v2_squad2.0_8163.params) | +|ALBERT large | 92.66/86.43 | 85.21/82.50 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_large/fintune_google_albert_large_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_large/fintune_google_albert_large_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_large.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_large/fintune_google_albert_large_v2_squad_2.0/google_albert_large_v2_squad2.0_8163.params) | +|ALBERT xlarge | 93.85/87.71 | 87.73/84.83 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_xlarge/fintune_google_albert_xlarge_v2_squad_2.0/best_results.json) | 
[log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_xlarge/fintune_google_albert_xlarge_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_xlarge.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_xlarge/fintune_google_albert_xlarge_v2_squad_2.0/google_albert_xlarge_v2_squad2.0_8163.params) | +|ALBERT xxlarge | 95.00/89.01 | 89.84/86.79 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_xxlarge/fintune_google_albert_xxlarge_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_xxlarge/fintune_google_albert_xxlarge_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_xxlarge.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_xxlarge/fintune_google_albert_xxlarge_v2_squad_2.0/google_albert_xxlarge_v2_squad2.0_8163.params) | For reference, we've included the results from Google's Original Experiments -| Model Name | SQuAD 1.1 dev | SQuAD 2.0 dev| +| Model Name (F1/EM) | SQuAD 1.1 dev | SQuAD 2.0 dev| |------------|---------------|--------------| |ALBERT base (googleresearch/albert) | 90.2/83.2 | 82.1/79.3 | |ALBERT large (googleresearch/albert) | 91.8/85.2 | 84.9/81.8 | @@ -163,19 +167,19 @@ For reference, we've included the results from Google's Original Experiments For the reset pretrained models, the results on SQuAD1.1 and SQuAD2.0 are given as follows. -| Model Name | SQuAD1.1 dev | SQuAD2.0 dev | SQuAD 2.0 Results File | Log | Command | -|--------------------------|---------------|--------------|------|-----|--------| -|BERT base | 88.40/81.24 | 76.43/73.59 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_base_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_base_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_uncased_bert_base.sh) | -|BERT large | 90.45/83.55 | 81.41/78.46 | [json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_uncased_bert_large.sh) | -|ELECTRA small | 85.42/78.95 | 73.93/71.36 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_small_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_small_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_small.sh) | -|ELECTRA base | 92.63/87.34 | 86.65/83.95 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_base_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_base_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_small.sh) | -|ELECTRA large | 94.95/89.94 | 90.67/88.32 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_base.sh) | -|MobileBERT | 
89.87/83.26 | 80.54/77.81 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_uncased_mobilebert_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_uncased_mobilebert_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_mobilebert.sh) | -|RoBERTa large | 94.58/88.86 | 89.69/86.80 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_fairseq_roberta_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_fairseq_roberta_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_large.sh) | +| Model Name (F1/EM) | SQuAD1.1 dev | SQuAD2.0 dev | SQuAD 2.0 Results File | Log | Command | Weight | +|--------------------------|---------------|--------------|------|-----|--------|---------| +|BERT base | 88.44/81.54 | 76.32/73.64 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_uncased_bert_base/fintune_google_en_uncased_bert_base_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_uncased_bert_base/fintune_google_en_uncased_bert_base_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_uncased_bert_base.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_uncased_bert_base/fintune_google_en_uncased_bert_base_squad_2.0/google_en_uncased_bert_base_squad2.0_8160.params) | +|BERT large | 90.65/84.02 | 81.22/78.22 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_uncased_bert_large/fintune_google_en_uncased_bert_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_uncased_bert_large/fintune_google_en_uncased_bert_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_uncased_bert_large.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_uncased_bert_large/fintune_google_en_uncased_bert_large_squad_2.0/google_en_uncased_bert_large_squad2.0_8159.params) | +|ELECTRA small | 85.76/79.16 | 74.07/71.56 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_small/fintune_google_electra_small_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_small/fintune_google_electra_small_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_small.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_small/fintune_google_electra_small_squad_2.0/google_electra_small_squad2.0_8160.params) | +|ELECTRA base | 92.64/86.99 | 86.33/83.67 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_base/fintune_google_electra_base_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_base/fintune_google_electra_base_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_base.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_base/fintune_google_electra_base_squad_2.0/google_electra_base_squad2.0_8160.params) | +|ELECTRA large | 94.79/89.52 | 90.55/88.24 
|[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_large/fintune_google_electra_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_large/fintune_google_electra_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_large.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_large/fintune_google_electra_large_squad_2.0/google_electra_large_squad2.0_8159.params) | +|MobileBERT | 89.69/82.88 | 80.27/77.60 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_mobilebert/fintune_google_uncased_mobilebert_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_mobilebert/fintune_google_uncased_mobilebert_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_mobilebert.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_mobilebert/fintune_google_uncased_mobilebert_squad_2.0/google_uncased_mobilebert_squad2.0_20615.params) | +|RoBERTa large | 94.57/88.88 | 89.70/86.79 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_roberta_large/fintune_fairseq_roberta_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_roberta_large/fintune_fairseq_roberta_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_large.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_roberta_large/fintune_fairseq_roberta_large_squad_2.0/fairseq_roberta_large_squad2.0_8160.params) | For reference, we have also included the results of original version from Google and Fairseq -| Model Name | SQuAD1.1 dev | SQuAD2.0 dev | +| Model Name (F1/EM) | SQuAD1.1 dev | SQuAD2.0 dev | |--------------------------|----------------|---------------| |Google BERT base | 88.5/80.8 | - / - | |Google BERT large | 90.9/84.1 | - / - | diff --git a/scripts/question_answering/commands/generate_commands.py b/scripts/question_answering/commands/generate_commands.py index e6c1fedbf2..9245d89daf 100644 --- a/scripts/question_answering/commands/generate_commands.py +++ b/scripts/question_answering/commands/generate_commands.py @@ -1,5 +1,6 @@ -from gluonnlp.utils.config import CfgNode import re +import os +from gluonnlp.utils.config import CfgNode def base_cfg(): @@ -12,9 +13,10 @@ def base_cfg(): cfg.lr = 2e-5 cfg.warmup_ratio = 0.1 cfg.wd = 0.01 - cfg.max_grad_norm = 0.1 + cfg.max_grad_norm = 1.0 cfg.max_seq_length = 512 cfg.layerwise_decay = -1 + cfg.dtype = 'float32' return cfg @@ -35,6 +37,7 @@ def albert_xlarge_cfg(): cfg.model_name = 'google_albert_xlarge_v2' cfg.batch_size = 1 cfg.num_accumulated = 12 + cfg.max_grad_norm = 0.1 return cfg @@ -118,6 +121,12 @@ def uncased_bert_large_cfg(): return cfg +def gluon_en_cased_bert_base_v1_cfg(): + cfg = uncased_bert_base_cfg() + cfg.model_name = 'gluon_en_cased_bert_base_v1' + return cfg + + def gen_command(config, template_path, out_path): print(f'Generating from "{template_path}" to "{out_path}"') @@ -134,7 +143,8 @@ def replace_fn(match): if __name__ == '__main__': for cfg_func in [albert_base_cfg, albert_large_cfg, albert_xlarge_cfg, albert_xxlarge_cfg, electra_base_cfg, electra_large_cfg, electra_small_cfg, mobilebert_cfg, - roberta_large_cfg, uncased_bert_base_cfg, uncased_bert_large_cfg]: + 
roberta_large_cfg, uncased_bert_base_cfg, uncased_bert_large_cfg, + gluon_en_cased_bert_base_v1_cfg]: prefix = cfg_func.__name__[:-len('_cfg')] gen_command(cfg_func(), 'run_squad.template', f'run_squad2_{prefix}.sh') diff --git a/scripts/question_answering/commands/run_squad.template b/scripts/question_answering/commands/run_squad.template index a67b23bce3..d24ff71723 100644 --- a/scripts/question_answering/commands/run_squad.template +++ b/scripts/question_answering/commands/run_squad.template @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-{{ dtype }}} # Default training data type MODEL_NAME={{ model_name }} BATCH_SIZE={{ batch_size }} NUM_ACCUMULATED={{ num_accumulated }} @@ -16,12 +17,14 @@ LAYERWISE_DECAY={{ layerwise_decay }} # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi ${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ @@ -39,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_base.sh b/scripts/question_answering/commands/run_squad2_albert_base.sh index 732b3abef8..ab960650f3 100644 --- a/scripts/question_answering/commands/run_squad2_albert_base.sh +++ b/scripts/question_answering/commands/run_squad2_albert_base.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_albert_base_v2 BATCH_SIZE=4 NUM_ACCUMULATED=3 @@ -10,7 +11,7 @@ LR=2e-05 WARMUP_RATIO=0.1 WD=0.01 MAX_SEQ_LENGTH=512 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=-1 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_large.sh b/scripts/question_answering/commands/run_squad2_albert_large.sh index fb92b7cda9..4007617869 100644 --- a/scripts/question_answering/commands/run_squad2_albert_large.sh +++ b/scripts/question_answering/commands/run_squad2_albert_large.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_albert_large_v2 BATCH_SIZE=3 NUM_ACCUMULATED=4 @@ -10,7 +11,7 @@ LR=2e-05 WARMUP_RATIO=0.1 WD=0.01 MAX_SEQ_LENGTH=512 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=-1 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_xlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh index 0bd28952d5..3392f2f9a2 100644 --- a/scripts/question_answering/commands/run_squad2_albert_xlarge.sh +++ b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_albert_xlarge_v2 BATCH_SIZE=1 NUM_ACCUMULATED=12 @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh index 9383cbc873..d2bc808f46 100644 --- a/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh +++ b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_albert_xxlarge_v2 BATCH_SIZE=1 NUM_ACCUMULATED=12 @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_electra_base.sh b/scripts/question_answering/commands/run_squad2_electra_base.sh index 16ee8cdb98..2aa9755069 100644 --- a/scripts/question_answering/commands/run_squad2_electra_base.sh +++ b/scripts/question_answering/commands/run_squad2_electra_base.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_electra_base BATCH_SIZE=8 NUM_ACCUMULATED=1 @@ -10,7 +11,7 @@ LR=0.0001 WARMUP_RATIO=0.1 WD=0 MAX_SEQ_LENGTH=512 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=0.8 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_electra_large.sh b/scripts/question_answering/commands/run_squad2_electra_large.sh index d8a52235e8..389375f614 100644 --- a/scripts/question_answering/commands/run_squad2_electra_large.sh +++ b/scripts/question_answering/commands/run_squad2_electra_large.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_electra_large BATCH_SIZE=2 NUM_ACCUMULATED=4 @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_electra_small.sh b/scripts/question_answering/commands/run_squad2_electra_small.sh index d6228ef0bc..c565ce4403 100644 --- a/scripts/question_answering/commands/run_squad2_electra_small.sh +++ b/scripts/question_answering/commands/run_squad2_electra_small.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_electra_small BATCH_SIZE=8 NUM_ACCUMULATED=1 @@ -10,7 +11,7 @@ LR=0.0003 WARMUP_RATIO=0.1 WD=0 MAX_SEQ_LENGTH=512 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=0.8 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_gluon_en_cased_bert_base_v1.sh b/scripts/question_answering/commands/run_squad2_gluon_en_cased_bert_base_v1.sh new file mode 100644 index 0000000000..a3801f4cdc --- /dev/null +++ b/scripts/question_answering/commands/run_squad2_gluon_en_cased_bert_base_v1.sh @@ -0,0 +1,46 @@ +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type +MODEL_NAME=gluon_en_cased_bert_base_v1 +BATCH_SIZE=6 +NUM_ACCUMULATED=2 +EPOCHS=3 +LR=3e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=1.0 +LAYERWISE_DECAY=-1 + +# Prepare the Data +nlp_data prepare_squad --version ${VERSION} + +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ + --model_name ${MODEL_NAME} \ + --data_dir squad \ + --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ + --version ${VERSION} \ + --do_eval \ + --do_train \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_mobilebert.sh b/scripts/question_answering/commands/run_squad2_mobilebert.sh index 24fece841d..f59c16cd9e 100644 --- a/scripts/question_answering/commands/run_squad2_mobilebert.sh +++ b/scripts/question_answering/commands/run_squad2_mobilebert.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_uncased_mobilebert BATCH_SIZE=8 NUM_ACCUMULATED=1 @@ -10,7 +11,7 @@ LR=4e-05 WARMUP_RATIO=0.1 WD=0.01 MAX_SEQ_LENGTH=384 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=-1 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_roberta_large.sh b/scripts/question_answering/commands/run_squad2_roberta_large.sh index 2bf51e6b6c..b95b949757 100644 --- a/scripts/question_answering/commands/run_squad2_roberta_large.sh +++ b/scripts/question_answering/commands/run_squad2_roberta_large.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=fairseq_roberta_large BATCH_SIZE=2 NUM_ACCUMULATED=6 @@ -10,7 +11,7 @@ LR=3e-05 WARMUP_RATIO=0.2 WD=0.01 MAX_SEQ_LENGTH=512 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=-1 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh index f2a0738282..ee3d8d0208 100644 --- a/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh +++ b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_en_uncased_bert_base BATCH_SIZE=6 NUM_ACCUMULATED=2 @@ -10,7 +11,7 @@ LR=3e-05 WARMUP_RATIO=0.1 WD=0.01 MAX_SEQ_LENGTH=512 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=-1 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh index 2f19c4c5e7..ee94b544c1 100644 --- a/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh +++ b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod
VERSION=${2:-2.0} # SQuAD Version
+DTYPE=${3:-float32} # Default training data type
MODEL_NAME=google_en_uncased_bert_large
BATCH_SIZE=2
NUM_ACCUMULATED=6
@@ -10,7 +11,7 @@ LR=3e-05
WARMUP_RATIO=0.1
WD=0.01
MAX_SEQ_LENGTH=512
-MAX_GRAD_NORM=0.1
+MAX_GRAD_NORM=1.0
LAYERWISE_DECAY=-1
# Prepare the Data
@@ -41,4 +42,5 @@ ${RUN_COMMAND} \
--wd ${WD} \
--max_seq_length ${MAX_SEQ_LENGTH} \
--max_grad_norm ${MAX_GRAD_NORM} \
+ --dtype ${DTYPE} \
--overwrite_cache
diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py
index 70ba408843..8874f9068d 100644
--- a/scripts/question_answering/run_squad.py
+++ b/scripts/question_answering/run_squad.py
@@ -8,8 +8,11 @@ import time
import logging
import argparse
+import ast
import functools
import collections
+import dataclasses
+from dataclasses import dataclass
from multiprocessing import Pool, cpu_count
import mxnet as mx
@@ -142,12 +145,27 @@ def parse_args():
'instead of only last one')
parser.add_argument('--max_saved_ckpt', type=int, default=5,
help='The maximum number of saved checkpoints')
- parser.add_argument('--eval_dtype', type=str, default='float32',
- help='Data type used for evaluation. Either float32 or float16')
+ parser.add_argument('--dtype', type=str, default='float32',
+ help='Data type used for training and evaluation. Either float32 or float16. When you '
+ 'use --dtype float16, AMP will be turned on in the training phase and '
+ 'fp16 will be used in evaluation.')
args = parser.parse_args()
return args
+ChunkFeature = collections.namedtuple('ChunkFeature',
+ ['qas_id',
+ 'data',
+ 'valid_length',
+ 'segment_ids',
+ 'masks',
+ 'is_impossible',
+ 'gt_start',
+ 'gt_end',
+ 'context_offset',
+ 'chunk_start',
+ 'chunk_length'])
+
class SquadDatasetProcessor:
def __init__(self, tokenizer, doc_stride, max_seq_length, max_query_length):
@@ -176,24 +194,13 @@ def __init__(self, tokenizer, doc_stride, max_seq_length, max_query_length):
self.sep_id = vocab.eos_id if 'sep_token' not in vocab.special_token_keys else vocab.sep_id
# TODO(sxjscience) Consider to combine the NamedTuple and batchify functionality.
- self.ChunkFeature = collections.namedtuple('ChunkFeature',
- ['qas_id',
- 'data',
- 'valid_length',
- 'segment_ids',
- 'masks',
- 'is_impossible',
- 'gt_start',
- 'gt_end',
- 'context_offset',
- 'chunk_start',
- 'chunk_length'])
- self.BatchifyFunction = bf.NamedTuple(self.ChunkFeature,
+ # Here, we use round_to=8 to improve the throughput.
+ self.BatchifyFunction = bf.NamedTuple(ChunkFeature, {'qas_id': bf.List(), - 'data': bf.Pad(val=self.pad_id), + 'data': bf.Pad(val=self.pad_id, round_to=8), 'valid_length': bf.Stack(), 'segment_ids': bf.Pad(), - 'masks': bf.Pad(val=1), + 'masks': bf.Pad(val=1, round_to=8), 'is_impossible': bf.Stack(), 'gt_start': bf.Stack(), 'gt_end': bf.Stack(), @@ -266,17 +273,17 @@ def process_sample(self, feature: SquadFeature): # Here, we increase the start and end because we put query before context start_pos = chunk.gt_start_pos + context_offset end_pos = chunk.gt_end_pos + context_offset - chunk_feature = self.ChunkFeature(qas_id=feature.qas_id, - data=data, - valid_length=valid_length, - segment_ids=segment_ids, - masks=masks, - is_impossible=chunk.is_impossible, - gt_start=start_pos, - gt_end=end_pos, - context_offset=context_offset, - chunk_start=chunk.start, - chunk_length=chunk.length) + chunk_feature = ChunkFeature(qas_id=feature.qas_id, + data=data, + valid_length=valid_length, + segment_ids=segment_ids, + masks=masks, + is_impossible=chunk.is_impossible, + gt_start=start_pos, + gt_end=end_pos, + context_offset=context_offset, + chunk_start=chunk.start, + chunk_length=chunk.length) ret.append(chunk_feature) return ret @@ -427,7 +434,9 @@ def setup_logging(args, local_rank): set_seed(args.seed) logging.debug('Random seed set to {}'.format(args.seed)) + def train(args): + use_amp = args.dtype == 'float16' store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm( args.comm_backend, args.gpus) setup_logging(args, local_rank) @@ -527,7 +536,7 @@ def train(args): 'wd': args.wd, 'lr_scheduler': lr_scheduler, } - adam_betas = eval(args.adam_betas) + adam_betas = ast.literal_eval(args.adam_betas) if args.optimizer == 'adamw': optimizer_params.update({'beta1': adam_betas[0], 'beta2': adam_betas[1], @@ -539,12 +548,15 @@ def train(args): 'beta2': adam_betas[1], 'epsilon': args.adam_epsilon, }) + if use_amp: + optimizer_params.update({'multi_precision': True}) if args.comm_backend == 'horovod': trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optimizer_params) else: trainer = mx.gluon.Trainer(param_dict, args.optimizer, optimizer_params, update_on_kvstore=False) - + if use_amp: + amp.init_trainer(trainer) log_span_loss = 0 log_answerable_loss = 0 log_total_loss = 0 @@ -584,9 +596,18 @@ def train(args): loss_l.append(loss) span_loss_l.append(span_loss) answerable_loss_l.append(answerable_loss) + if use_amp: + with mx.autograd.record(): + with amp.scale_loss(loss_l, trainer) as loss_l: + for loss in loss_l: + loss.backward() + norm_clip_mult = num_workers * trainer._amp_loss_scaler.loss_scale + else: + with mx.autograd.record(): + for loss in loss_l: + loss.backward() + norm_clip_mult = num_workers - for loss in loss_l: - loss.backward() # All Reduce the Step Loss log_span_loss += sum([ele.as_in_ctx(ctx_l[0]) for ele in span_loss_l]).asnumpy() log_total_loss += sum([ele.as_in_ctx(ctx_l[0]) @@ -598,7 +619,7 @@ def train(args): if args.max_grad_norm > 0: total_norm, ratio, is_finite = clip_grad_global_norm( - params, args.max_grad_norm * num_workers) + params, args.max_grad_norm * norm_clip_mult) else: total_norm = grad_global_norm(params) @@ -610,7 +631,7 @@ def train(args): # gluon.trainer._scale is default to 1 trainer.update(num_workers, ignore_stale_grad=True) - total_norm = total_norm / num_workers + total_norm = total_norm / norm_clip_mult if args.num_accumulated > 1: # set grad to zero for gradient accumulation qa_net.zero_grad() @@ -651,7 +672,6 @@ def train(args): 
log_answerable_loss = 0 log_total_loss = 0 log_sample_num = 0 - num_samples_per_update = 0 if (step_num + 1) >= num_train_steps: toc = time.time() @@ -808,8 +828,8 @@ def evaluate(args, last=True): str(ctx_l))) cfg, tokenizer, qa_net, use_segmentation = get_network( - args.model_name, ctx_l, args.classifier_dropout, dtype=args.eval_dtype) - if args.eval_dtype == 'float16': + args.model_name, ctx_l, args.classifier_dropout, dtype=args.dtype) + if args.dtype == 'float16': qa_net.cast('float16') qa_net.hybridize() @@ -978,6 +998,10 @@ def eval_validation(ckpt_name, best_eval): os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' args = parse_args() if args.do_train: + if args.dtype == 'float16': + # Initialize amp if it's fp16 training + from mxnet import amp + amp.init() train(args) if args.do_eval: evaluate(args, last=not args.all_evaluate) diff --git a/src/gluonnlp/attention_cell.py b/src/gluonnlp/attention_cell.py index 1ea4350127..a6e9df089c 100644 --- a/src/gluonnlp/attention_cell.py +++ b/src/gluonnlp/attention_cell.py @@ -160,7 +160,7 @@ def gen_self_attn_mask(data, else: raise NotImplementedError mask = mask.astype(dtype) - return mask + return mask.astype(np.bool) def gen_mem_attn_mask(mem, mem_valid_length, data, data_valid_length=None, @@ -241,7 +241,7 @@ def gen_mem_attn_mask(mem, mem_valid_length, data, data_valid_length=None, else: query_length_ones = np.ones_like(data_steps) mask = query_length_ones.reshape((1, -1, 1)) * mem_mask - return mask + return mask.astype(np.bool) # TODO(sxjscience) Directly implement a kernel for masked softmax @@ -273,7 +273,7 @@ def masked_softmax(att_score, mask, dtype=np.float32, axis: int = -1): else: try: # if AMP (automatic mixed precision) is enabled, -1e18 will cause NaN. - from mxnet.contrib import amp + from mxnet import amp if amp.amp._amp_initialized: neg = -1e4 except ImportError: diff --git a/src/gluonnlp/data/sampler.py b/src/gluonnlp/data/sampler.py index aabfe7a688..08fd9b48ca 100644 --- a/src/gluonnlp/data/sampler.py +++ b/src/gluonnlp/data/sampler.py @@ -266,20 +266,25 @@ def __len__(self): class BoundedBudgetSampler(BaseSampler): - r"""Assign each data sample to bounded budget batches. Samples will be sorted by length before batchfy - see https://github.com/pytorch/fairseq/blob/master/fairseq/data/data_utils_fast.pyx + r"""Assign each data sample to bounded budget batches. + We will ensure that within the batch, + the total number of tokens is smaller than the provided max_num_tokens, + and the total number of sentences is smaller than the provided max_num_sentences. + + Samples will be sorted by length before batchify + See Also https://github.com/pytorch/fairseq/blob/master/fairseq/data/data_utils_fast.pyx Parameters ---------- lengths The length of the sequences in the input data sample. max_num_tokens - max tokens num of each batch + Max number of tokens of each batch max_num_sentences - max sentences num of each batch + Max number of sentences of each batch required_batch_size_multiple - require batch size to be a multiple of N (default: 1). - better throughput in GPU. + Require batch size to be a multiple of N (default: 1). + This will generally have better throughput in GPU. shuffle Whether to shuffle the batches. 
seed @@ -295,7 +300,7 @@ def __init__(self, lengths: Union[Sequence[int], Sequence[Sequence[int]]], self._lengths = np.array(lengths) if self._lengths.ndim == 2: self._lengths = self._lengths.max(axis=1) - self._indices = np.array(range(len(lengths))) + self._indices = np.arange(len(lengths)) self._max_num_tokens = max_num_tokens self._max_num_sentences = max_num_sentences self._batches = [] @@ -313,11 +318,11 @@ def __init__(self, lengths: Union[Sequence[int], Sequence[Sequence[int]]], batch_num_tokens = batch_num_sentences * batch_max_sample_len if (self._max_num_sentences > 0 and batch_num_sentences > self._max_num_sentences) or \ (self._max_num_tokens > 0 and batch_num_tokens > self._max_num_tokens): - # moded_bs = len(batch) % required_batch_size_multiple when len(batch) < required_batch_size_multiple - moded_bs = max( - required_batch_size_multiple * (len(batch) // required_batch_size_multiple), - len(batch) % required_batch_size_multiple - ) + if len(batch) < required_batch_size_multiple: + moded_bs = len(batch) + else: + moded_bs = required_batch_size_multiple\ + * (len(batch) // required_batch_size_multiple) self._batches.append(np.array(batch[:moded_bs])) batch = batch[moded_bs:] batch_max_sample_len = max( diff --git a/src/gluonnlp/models/bart.py b/src/gluonnlp/models/bart.py index 8d935b14fd..2112aeb22e 100644 --- a/src/gluonnlp/models/bart.py +++ b/src/gluonnlp/models/bart.py @@ -51,7 +51,7 @@ @bart_cfg_reg.register() -def bart_base(): +def fairseq_bart_base(): cfg = CN() # Config for the bart base model cfg.MODEL = CN() @@ -104,8 +104,8 @@ def bart_base(): @bart_cfg_reg.register() -def bart_large(): - cfg = bart_base() +def fairseq_bart_large(): + cfg = fairseq_bart_base() cfg.defrost() cfg.MODEL.vocab_size = 50265 cfg.MODEL.ENCODER.units = 1024 @@ -122,14 +122,14 @@ def bart_large(): PRETRAINED_URL = { 'fairseq_bart_base': { - 'cfg': bart_base(), + 'cfg': fairseq_bart_base(), 'merges': 'fairseq_bart_base/gpt2-396d4d8e.merges', 'vocab': 'fairseq_bart_base/gpt2-f4dedacb.vocab', 'params': 'fairseq_bart_base/model-8f4929b5.params', 'lowercase': False, }, 'fairseq_bart_large': { - 'cfg': bart_large(), + 'cfg': fairseq_bart_large(), 'merges': 'fairseq_bart_large/gpt2-396d4d8e.merges', 'vocab': 'fairseq_bart_large/gpt2-f1335494.vocab', 'params': 'fairseq_bart_large/model-862277b1.params', @@ -200,7 +200,6 @@ def forward(self, src_data, src_valid_length, tgt_data, tgt_valid_length): Parameters ---------- - F src_data - layout = 'NT' Shape (batch_size, src_length) @@ -273,10 +272,10 @@ def apply_pooling(self, sequence, valid_length): Shape (batch_size, units) """ if self._layout == 'NT': - batch_indices = F.npx.arange_like(sequence, axis=0).astype(mx.np.int32) + batch_indices = mx.npx.arange_like(sequence, axis=0).astype(mx.np.int32) outputs = sequence[batch_indices, valid_length - 1] elif self._layout == 'TN': - batch_indices = F.npx.arange_like(sequence, axis=1).astype(mx.np.int32) + batch_indices = mx.npx.arange_like(sequence, axis=1).astype(mx.np.int32) outputs = sequence[valid_length - 1, batch_indices] else: raise NotImplementedError @@ -296,7 +295,7 @@ def vocab_size(self): @classmethod def get_cfg(cls, key=None): if key is None: - return bart_base() + return fairseq_bart_base() else: return bart_cfg_reg.create(key) diff --git a/src/gluonnlp/models/gpt2.py b/src/gluonnlp/models/gpt2.py index 94f0a0ca1d..88387e81b8 100644 --- a/src/gluonnlp/models/gpt2.py +++ b/src/gluonnlp/models/gpt2.py @@ -558,7 +558,7 @@ def get_initial_embedding(self, inputs, prev_len): embedding = 
self._embed_dropout(embedding) return embedding - def init_states(self, batch_size, ctx): + def init_states(self, batch_size, ctx, dtype=None): """Initialize the states required for incremental decoding Returns @@ -569,10 +569,12 @@ def init_states(self, batch_size, ctx): - layout = 'TN' Shape (num_layers, 2, 0, batch_size, C_in) """ + if dtype is None: + dtype = self._dtype return mx.np.zeros(shape=(self._num_layers, 2, batch_size, 0, - self._units), ctx=ctx, dtype=self._dtype) if self.layout == 'NT' else \ + self._units), ctx=ctx, dtype=dtype) if self.layout == 'NT' else \ mx.np.zeros(shape=(self._num_layers, 2, 0, batch_size, - self._units), ctx=ctx, dtype=self._dtype) + self._units), ctx=ctx, dtype=dtype) @staticmethod def get_cfg(key=None): diff --git a/src/gluonnlp/models/transformer.py b/src/gluonnlp/models/transformer.py index 1deedd4dd1..75fbf56688 100644 --- a/src/gluonnlp/models/transformer.py +++ b/src/gluonnlp/models/transformer.py @@ -1227,7 +1227,6 @@ def forward(self, src_data, src_valid_length, tgt_data, tgt_valid_length): Parameters ---------- - F src_data - layout = 'NT' Shape (batch_size, src_length) diff --git a/src/gluonnlp/models/transformer_xl.py b/src/gluonnlp/models/transformer_xl.py index 341aa5e812..bf938cf068 100644 --- a/src/gluonnlp/models/transformer_xl.py +++ b/src/gluonnlp/models/transformer_xl.py @@ -347,8 +347,14 @@ def get_cfg(cls, key=None): return config @classmethod - def from_cfg(cls, cfg): - return cls(cfg=cfg) + def from_cfg(cls, cfg, dtype=None): + if dtype is not None: + new_cfg = cfg.clone() + new_cfg.defrost() + new_cfg.MODEL.dtype = dtype + return cls(cfg=new_cfg) + else: + return cls(cfg=cfg) @property def state_batch_axis(self): diff --git a/src/gluonnlp/optimizer.py b/src/gluonnlp/optimizer.py index 8b86f925ec..1629ce78d3 100644 --- a/src/gluonnlp/optimizer.py +++ b/src/gluonnlp/optimizer.py @@ -80,15 +80,14 @@ class AdamW(optimizer.Optimizer): def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, correct_bias=True, use_fused_step=True, **kwargs): super().__init__(use_fused_step=use_fused_step, - learning_rate=learning_rate, - **kwargs) + learning_rate=learning_rate, + **kwargs) self.beta1 = beta1 self.beta2 = beta2 self.epsilon = epsilon self.correct_bias = correct_bias self.aggregate_num = max(1, min(50, int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', '4')))) - assert self.multi_precision is False, 'Currently we do not support multi-precision.' 
def create_state(self, index, weight):
"""state creation function."""
diff --git a/src/gluonnlp/utils/testing.py b/src/gluonnlp/utils/testing.py
index 7eb1f54457..7e45e0da2d 100644
--- a/src/gluonnlp/utils/testing.py
+++ b/src/gluonnlp/utils/testing.py
@@ -1,8 +1,10 @@
__all__ = ['is_match_states_batch_size', 'verify_nmt_model', 'verify_nmt_inference']
import numpy.testing as npt
+import numpy as np
import mxnet as mx
from mxnet.util import use_np
+from .parameter import move_to_ctx
def is_match_states_batch_size(states, states_batch_axis, batch_size) -> bool:
@@ -106,10 +108,15 @@ def verify_nmt_inference(train_model, inference_model,
Parameters
----------
train_model
+ The training model
inference_model
+ The inference model
batch_size
+ Batch size
src_seq_length
+ Length of the source sequence
tgt_seq_length
+ Length of the target sequence
atol
Absolute tolerance
rtol
@@ -161,3 +168,103 @@ def verify_nmt_inference(train_model, inference_model,
partial_out[:, :partial_batch_size].asnumpy(), atol, rtol)
else:
raise NotImplementedError
+
+
+def _match_struct_output(lhs, rhs, atol=1E-2, rtol=1E-2):
+ if isinstance(lhs, (list, tuple)):
+ for lhs_ele, rhs_ele in zip(lhs, rhs):
+ _match_struct_output(lhs_ele, rhs_ele, atol=atol, rtol=rtol)
+ else:
+ npt.assert_allclose(lhs.asnumpy().astype('float32'),
+ rhs.asnumpy().astype('float32'), atol=atol, rtol=rtol)
+
+
+def _cast_nested_to_fp16(nested_dat):
+ """Cast the nested input to fp16
+
+ Parameters
+ ----------
+ nested_dat
+ The input nested data structure
+
+ Returns
+ -------
+ output
+ The cast output data
+ """
+ if isinstance(nested_dat, (mx.np.ndarray, np.ndarray)):
+ if nested_dat.dtype == np.float32:
+ return nested_dat.astype(np.float16)
+ else:
+ return nested_dat
+ elif isinstance(nested_dat, list):
+ return [_cast_nested_to_fp16(ele) for ele in nested_dat]
+ elif isinstance(nested_dat, tuple):
+ return tuple([_cast_nested_to_fp16(ele) for ele in nested_dat])
+ else:
+ raise NotImplementedError('Type is not supported!')
+
+
+def verify_backbone_fp16(model_cls, cfg, ctx, inputs,
+ atol=1E-2, rtol=1E-2, check_amp=True):
+ """Verify that the backbone model gives comparable outputs in float16 and float32,
+ and optionally that an AMP training step runs on the float32 model.
+
+ Parameters
+ ----------
+ model_cls
+ The modeling class
+ cfg
+ The configuration
+ ctx
+ The context
+ inputs
+ The input tensors of the model. They will be fed to both the float32 and the float16 model.
+ atol
+ The absolute tolerance
+ rtol
+ The relative tolerance
+ check_amp
+ Whether to check the AMP process. You will need to ensure that there is no
+ randomness in the model when it is turned on.
+ + """ + model_fp32 = model_cls.from_cfg(cfg, dtype='float32') + model_fp32.initialize(ctx=ctx) + model_fp32.hybridize() + # Check forward + fp32_inputs = move_to_ctx(inputs, ctx=ctx) + outputs_fp32 = model_fp32(*fp32_inputs) + mx.npx.waitall() + # Check forward of fp16 + model_fp16 = model_cls.from_cfg(cfg, dtype='float16') + model_fp16.share_parameters(model_fp32.collect_params()) + model_fp16.cast('float16') + model_fp16.hybridize() + for param in model_fp16.collect_params().values(): + assert param.dtype == 'float16' + fp16_inputs = move_to_ctx(_cast_nested_to_fp16(inputs), ctx=ctx) + outputs_fp16 = model_fp16(*fp16_inputs) + mx.npx.waitall() + _match_struct_output(outputs_fp16, outputs_fp32, atol=atol, rtol=rtol) + if check_amp: + from mxnet import amp + amp.init() + # Reconstruct the fp32 model + model_fp32 = model_cls.from_cfg(cfg, dtype='float32') + model_fp32.initialize(ctx=ctx) + model_fp32.hybridize() + trainer = mx.gluon.Trainer(model_fp32.collect_params(), 'adam', + {'learning_rate': 1E-3, 'wd': 1E-4, + 'multi_precision': True}, + update_on_kvstore=False) + amp.init_trainer(trainer) + with mx.autograd.record(): + outputs_amp = model_fp32(*fp32_inputs) + if not isinstance(outputs_amp, (tuple, list)): + loss = outputs_amp.mean() + else: + loss = sum([ele.mean() for ele in outputs_amp]) + with amp.scale_loss(loss, trainer) as scaled_loss: + mx.autograd.backward(scaled_loss) + trainer.step(1) + mx.npx.waitall() diff --git a/tests/README.md b/tests/README.md index 1e3261d742..233e336e93 100644 --- a/tests/README.md +++ b/tests/README.md @@ -3,31 +3,31 @@ To run the unittests, use the following command ```bash -python3 -m pytest --device="cpu" . +python3 -m pytest --forked --device="cpu" . ``` To test for certain file, e.g., the `test_models_transformer.py`, use the following command ```bash -python3 -m pytest --device="cpu" test_models_transformer.py +python3 -m pytest --forked --device="cpu" test_models_transformer.py ``` To test only for gpu device, use the following command ```bash -python3 -m pytest --device="gpu" test_models_transformer.py +python3 -m pytest --forked --device="gpu" test_models_transformer.py ``` To test both for cpu and gpu device, use the following command ```bash -python3 -m pytest --device="cpu" --device="gpu" test_models_transformer.py +python3 -m pytest --forked --device="cpu" --device="gpu" test_models_transformer.py ``` In addition, to run all the tests, you should add the `--runslow` flag ```bash -python3 -m pytest --device="gpu" --runslow test_models.py +python3 -m pytest --forked --device="gpu" --runslow test_models.py ``` Refer to the [official guide of pytest](https://docs.pytest.org/en/latest/) for more details. diff --git a/tests/test_models.py b/tests/test_models.py index 6ad85c85e4..6c476e8b44 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -68,6 +68,7 @@ def test_get_backbone(name, ctx): @pytest.mark.parametrize('layout', ['NT', 'TN']) @pytest.mark.skipif(not tvm_enabled(), reason='TVM is not supported. 
So this test is skipped.') +@pytest.mark.skip('TVM issue https://github.com/dmlc/gluon-nlp/issues/1425.') def test_tvm_integration(model_name, batch_size, seq_length, layout, ctx): tvm = try_import_tvm() from tvm import relay diff --git a/tests/test_models_bart.py b/tests/test_models_bart.py index 4bf241b80e..62421499e4 100644 --- a/tests/test_models_bart.py +++ b/tests/test_models_bart.py @@ -1,8 +1,11 @@ import pytest import mxnet as mx import tempfile +import numpy as np +import numpy.testing as npt from gluonnlp.models.bart import BartModel, \ list_pretrained_bart, get_pretrained_bart, bart_cfg_reg +from gluonnlp.utils.testing import verify_backbone_fp16 mx.npx.set_np() @@ -35,19 +38,48 @@ def test_bart_cfg_registry(): assert len(bart_cfg_reg.list_keys()) > 0 -@pytest.mark.parametrize('cfg_key', bart_cfg_reg.list_keys()) -def test_bart_cfg(cfg_key): +@pytest.mark.parametrize('cfg_key', ['fairseq_bart_base']) +def test_bart_cfg(cfg_key, ctx): cfg = BartModel.get_cfg(cfg_key) cfg.defrost() cfg.MODEL.vocab_size = 32 cfg.freeze() - model = BartModel.from_cfg(cfg) - model.initialize() - model.hybridize() - cfg.defrost() - cfg.MODEL.layout = 'TN' - cfg.freeze() - model_tn = BartModel.from_cfg(cfg) - model_tn.share_parameters(model.collect_params()) - model_tn.hybridize() - mx.npx.waitall() + + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + + batch_size = 4 + src_length = 32 + tgt_length = 16 + + with ctx: + src_data = mx.np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, src_length), + dtype=np.int32) + src_valid_length = mx.np.random.randint(src_length // 2, src_length, (batch_size,), + dtype=np.int32) + tgt_data = mx.np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, tgt_length), + dtype=np.int32) + tgt_valid_length = mx.np.random.randint(tgt_length // 2, tgt_length, (batch_size, ), + dtype=np.int32) + model = BartModel.from_cfg(cfg, extract_feature=True) + model.initialize() + model.hybridize() + + contextual_embedding, pooled_output = model(src_data, src_valid_length, + tgt_data, tgt_valid_length) + model_tn = BartModel.from_cfg(cfg_tn, extract_feature=True) + model_tn.share_parameters(model.collect_params()) + model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn = model_tn(src_data.T, src_valid_length, + tgt_data.T, tgt_valid_length) + npt.assert_allclose(contextual_embedding.asnumpy(), + np.transpose(contextual_embedding_tn.asnumpy(), (1, 0, 2)), 5E-3, 5E-3) + npt.assert_allclose(pooled_out_tn.asnumpy(), pooled_output.asnumpy(), 5E-3, 5E-3) + mx.npx.waitall() + + # Verify Float16 + if ctx.device_type == 'gpu': + verify_backbone_fp16(model_cls=BartModel, cfg=cfg, ctx=ctx, + inputs=[src_data, src_valid_length, tgt_data, tgt_valid_length]) diff --git a/tests/test_models_bert.py b/tests/test_models_bert.py index 7abff8468d..26de787c07 100644 --- a/tests/test_models_bert.py +++ b/tests/test_models_bert.py @@ -4,6 +4,7 @@ import tempfile from gluonnlp.models.bert import BertModel, BertForMLM, BertForPretrain,\ list_pretrained_bert, get_pretrained_bert +from gluonnlp.utils.testing import verify_backbone_fp16 mx.npx.set_np() @@ -52,19 +53,6 @@ def test_bert_small_cfg(compute_layout, ctx): 1E-4, 1E-4) assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - # Test BertModel FP16 - device_type = ctx.device_type - if device_type == 'gpu': - bert_model_fp16 = BertModel.from_cfg(cfg, dtype='float16') - bert_model_fp16.share_parameters(bert_model.collect_params()) - bert_model_fp16.cast('float16') - 
bert_model_fp16.hybridize() - contextual_embedding_fp16, pooled_out_fp16 = bert_model_fp16(inputs,\ - token_types, valid_length) - assert_allclose(contextual_embedding_fp16.asnumpy(), - mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), - 1E-2, 1E-2) - # Test for BertForMLM bert_mlm_model = BertForMLM(cfg) bert_mlm_model.initialize() @@ -79,8 +67,8 @@ def test_bert_small_cfg(compute_layout, ctx): assert_allclose(contextual_embedding.asnumpy(), mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-3, 1E-3) + assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-3, 1E-3) # Test for BertForPretrain bert_pretrain_model = BertForPretrain(cfg) @@ -95,10 +83,16 @@ def test_bert_small_cfg(compute_layout, ctx): bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) assert_allclose(contextual_embedding.asnumpy(), mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) + 1E-3, 1E-3) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-3, 1E-3) + assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-3, 1E-3) + assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-3, 1E-3) + + # Test BertModel FP16 + device_type = ctx.device_type + if device_type == 'gpu': + verify_backbone_fp16(model_cls=BertModel, cfg=cfg, ctx=ctx, + inputs=[inputs, token_types, valid_length]) @pytest.mark.slow diff --git a/tests/test_models_electra.py b/tests/test_models_electra.py index 34ba059473..e3142e4739 100644 --- a/tests/test_models_electra.py +++ b/tests/test_models_electra.py @@ -6,6 +6,7 @@ from gluonnlp.models.electra import ElectraModel, ElectraDiscriminator,\ ElectraGenerator,\ list_pretrained_electra, get_pretrained_electra, get_generator_cfg +from gluonnlp.utils.testing import verify_backbone_fp16 mx.npx.set_np() @@ -52,6 +53,7 @@ def test_electra_model(compute_layout, ctx): electra_model.initialize() electra_model.hybridize() contextual_embedding, pooled_out = electra_model(inputs, token_types, valid_length) + electra_model_tn = ElectraModel.from_cfg(cfg_tn) electra_model_tn.share_parameters(electra_model.collect_params()) electra_model_tn.hybridize() @@ -62,6 +64,12 @@ def test_electra_model(compute_layout, ctx): assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + # Verify Float16 + if ctx.device_type == 'gpu': + verify_backbone_fp16(model_cls=ElectraModel, cfg=cfg, ctx=ctx, + inputs=[inputs, token_types, valid_length]) + + @pytest.mark.slow @pytest.mark.remote_required diff --git a/tests/test_models_gpt2.py b/tests/test_models_gpt2.py index 260ab74886..09536f27bc 100644 --- a/tests/test_models_gpt2.py +++ b/tests/test_models_gpt2.py @@ -6,6 +6,7 @@ from gluonnlp.models.gpt2 import GPT2Model, GPT2ForLM, \ list_pretrained_gpt2, get_pretrained_gpt2 from gluonnlp.loss import LabelSmoothCrossEntropyLoss +from gluonnlp.utils.testing import verify_backbone_fp16 mx.npx.set_np() @@ -43,6 +44,7 @@ def test_gpt2_small_config(compute_layout, ctx): inputs, gpt2_model.init_states(batch_size, ctx) ) + gpt2_model_tn = GPT2Model.from_cfg(cfg_tn) 
gpt2_model_tn.share_parameters(gpt2_model.collect_params()) gpt2_model_tn.hybridize() @@ -73,6 +75,15 @@ def test_gpt2_small_config(compute_layout, ctx): assert_allclose(np.swapaxes(states_tn.asnumpy(), 2, 3), states.asnumpy(), 1E-4, 1E-4) + # Verify Float16 + if ctx.device_type == 'gpu': + verify_backbone_fp16(model_cls=GPT2Model, cfg=cfg, ctx=ctx, + inputs=[inputs, + gpt2_model.init_states(batch_size, ctx)], + check_amp=False) + pytest.skip('GPT-2 test has been turned off. ' + 'Issue: https://github.com/apache/incubator-mxnet/issues/19463') + def test_gpt2_incremental_states(ctx): with ctx: @@ -107,7 +118,8 @@ def test_gpt2_incremental_states(ctx): @pytest.mark.slow @pytest.mark.remote_required -@pytest.mark.parametrize('model_name', ['gpt2_124M', 'gpt2_355M', 'gpt2_774M']) +# Just run forward test with the small model to reduce the time cost. +@pytest.mark.parametrize('model_name', ['gpt2_124M']) def test_gpt2(model_name, ctx): # test from pretrained assert len(list_pretrained_gpt2()) > 0 diff --git a/tests/test_models_mobilebert.py b/tests/test_models_mobilebert.py index d7f22ac533..6cc11228f5 100644 --- a/tests/test_models_mobilebert.py +++ b/tests/test_models_mobilebert.py @@ -5,6 +5,7 @@ import tempfile from gluonnlp.models.mobilebert import MobileBertModel, MobileBertForMLM, MobileBertForPretrain,\ list_pretrained_mobilebert, get_pretrained_mobilebert +from gluonnlp.utils.testing import verify_backbone_fp16 mx.npx.set_np() @@ -13,79 +14,86 @@ def test_list_pretrained_mobilebert(): @pytest.mark.parametrize('compute_layout', ['auto', 'TN', 'NT']) -def test_mobilebert_model_small_cfg(compute_layout): - cfg = MobileBertModel.get_cfg() - cfg.defrost() - cfg.MODEL.vocab_size = 100 - cfg.MODEL.num_layers = 2 - cfg.MODEL.hidden_size = 128 - cfg.MODEL.num_heads = 2 - cfg.MODEL.compute_layout = compute_layout - cfg.freeze() +def test_mobilebert_model_small_cfg(compute_layout, ctx): + with ctx: + cfg = MobileBertModel.get_cfg() + cfg.defrost() + cfg.MODEL.vocab_size = 100 + cfg.MODEL.num_layers = 2 + cfg.MODEL.hidden_size = 128 + cfg.MODEL.num_heads = 2 + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() - # Generate TN layout - cfg_tn = cfg.clone() - cfg_tn.defrost() - cfg_tn.MODEL.layout = 'TN' - cfg_tn.freeze() + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() - batch_size = 4 - sequence_length = 16 - num_mask = 3 - inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) - token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) - valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) - masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) - mobile_bert_model = MobileBertModel.from_cfg(cfg) - mobile_bert_model.initialize() - mobile_bert_model.hybridize() - mobile_bert_model_tn = MobileBertModel.from_cfg(cfg_tn) - mobile_bert_model_tn.share_parameters(mobile_bert_model.collect_params()) - mobile_bert_model_tn.hybridize() - contextual_embedding, pooled_out = mobile_bert_model(inputs, token_types, valid_length) - contextual_embedding_tn, pooled_out_tn = mobile_bert_model_tn(inputs.T, - token_types.T, valid_length) - 
assert_allclose(contextual_embedding.asnumpy(), - np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + mobile_bert_model = MobileBertModel.from_cfg(cfg) + mobile_bert_model.initialize() + mobile_bert_model.hybridize() + mobile_bert_model_tn = MobileBertModel.from_cfg(cfg_tn) + mobile_bert_model_tn.share_parameters(mobile_bert_model.collect_params()) + mobile_bert_model_tn.hybridize() + contextual_embedding, pooled_out = mobile_bert_model(inputs, token_types, valid_length) + contextual_embedding_tn, pooled_out_tn = mobile_bert_model_tn(inputs.T, + token_types.T, valid_length) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-3, 1E-3) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-3, 1E-3) - # Test for MobileBertForMLM - mobile_bert_mlm_model = MobileBertForMLM(cfg) - mobile_bert_mlm_model.initialize() - mobile_bert_mlm_model.hybridize() - mobile_bert_mlm_model_tn = MobileBertForMLM(cfg_tn) - mobile_bert_mlm_model_tn.share_parameters(mobile_bert_mlm_model.collect_params()) - mobile_bert_model_tn.hybridize() - contextual_embedding, pooled_out, mlm_scores = mobile_bert_mlm_model(inputs, token_types, - valid_length, - masked_positions) - contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\ - mobile_bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions) - assert_allclose(contextual_embedding.asnumpy(), - np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), - 1E-4, 1E-4) - assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) - assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4) + # Test for MobileBertForMLM + mobile_bert_mlm_model = MobileBertForMLM(cfg) + mobile_bert_mlm_model.initialize() + mobile_bert_mlm_model.hybridize() + mobile_bert_mlm_model_tn = MobileBertForMLM(cfg_tn) + mobile_bert_mlm_model_tn.share_parameters(mobile_bert_mlm_model.collect_params()) + mobile_bert_model_tn.hybridize() + contextual_embedding, pooled_out, mlm_scores = mobile_bert_mlm_model(inputs, token_types, + valid_length, + masked_positions) + contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\ + mobile_bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-3, 1E-3) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-3, 1E-3) + assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-3, 1E-3) - # Test for MobileBertForPretrain - mobile_bert_pretrain_model = MobileBertForPretrain(cfg) - mobile_bert_pretrain_model.initialize() - mobile_bert_pretrain_model.hybridize() - mobile_bert_pretrain_model_tn = MobileBertForPretrain(cfg_tn) - mobile_bert_pretrain_model_tn.share_parameters(mobile_bert_pretrain_model.collect_params()) - mobile_bert_pretrain_model_tn.hybridize() - contextual_embedding, pooled_out, nsp_score, mlm_scores =\ - mobile_bert_pretrain_model(inputs, token_types, valid_length, masked_positions) - contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \ - mobile_bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) - assert_allclose(contextual_embedding.asnumpy(), - np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 
1E-4, 1E-4) - assert_allclose(mlm_scores.asnumpy(), mlm_scores_tn.asnumpy(), 1E-4, 1E-4) + # Test for MobileBertForPretrain + mobile_bert_pretrain_model = MobileBertForPretrain(cfg) + mobile_bert_pretrain_model.initialize() + mobile_bert_pretrain_model.hybridize() + mobile_bert_pretrain_model_tn = MobileBertForPretrain(cfg_tn) + mobile_bert_pretrain_model_tn.share_parameters(mobile_bert_pretrain_model.collect_params()) + mobile_bert_pretrain_model_tn.hybridize() + contextual_embedding, pooled_out, nsp_score, mlm_scores =\ + mobile_bert_pretrain_model(inputs, token_types, valid_length, masked_positions) + contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \ + mobile_bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-3, 1E-3) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-3, 1E-3) + assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-3, 1E-3) + assert_allclose(mlm_scores.asnumpy(), mlm_scores_tn.asnumpy(), 1E-3, 1E-3) + + # Test for fp16 + if ctx.device_type == 'gpu': + pytest.skip('MobileBERT will have nan values in FP16 mode.') + verify_backbone_fp16(model_cls=MobileBertModel, cfg=cfg, ctx=ctx, + inputs=[inputs, token_types, valid_length]) @pytest.mark.remote_required diff --git a/tests/test_models_roberta.py b/tests/test_models_roberta.py index 2fd9e63131..8953321cc7 100644 --- a/tests/test_models_roberta.py +++ b/tests/test_models_roberta.py @@ -6,6 +6,8 @@ from gluonnlp.models.roberta import RobertaModel, RobertaForMLM, \ list_pretrained_roberta, get_pretrained_roberta from gluonnlp.loss import LabelSmoothCrossEntropyLoss +from gluonnlp.utils.testing import verify_backbone_fp16 + mx.npx.set_np() @@ -15,64 +17,70 @@ def test_list_pretrained_roberta(): @pytest.mark.parametrize('compute_layout', ['auto', 'TN', 'NT']) -def test_robert_small_config(compute_layout): - cfg = RobertaModel.get_cfg() - cfg.defrost() - cfg.MODEL.vocab_size = 1000 - cfg.MODEL.num_layers = 2 - cfg.MODEL.hidden_size = 128 - cfg.MODEL.num_heads = 2 - cfg.MODEL.compute_layout = compute_layout - cfg.freeze() +def test_robert_small_config(compute_layout, ctx): + with ctx: + cfg = RobertaModel.get_cfg() + cfg.defrost() + cfg.MODEL.vocab_size = 1000 + cfg.MODEL.num_layers = 2 + cfg.MODEL.hidden_size = 128 + cfg.MODEL.num_heads = 2 + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() + + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() - # Generate TN layout - cfg_tn = cfg.clone() - cfg_tn.defrost() - cfg_tn.MODEL.layout = 'TN' - cfg_tn.freeze() + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) - batch_size = 4 - sequence_length = 16 - num_mask = 3 - inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) - valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) - masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + roberta_model = RobertaModel.from_cfg(cfg) + roberta_model.initialize() + roberta_model.hybridize() + contextual_embeddings, pooled_out = roberta_model(inputs, valid_length) + roberta_model_tn = RobertaModel.from_cfg(cfg_tn) + roberta_model_tn.share_parameters(roberta_model.collect_params()) + 
roberta_model_tn.hybridize() + contextual_embeddings_tn, pooled_out_tn = roberta_model_tn(inputs.T, valid_length) + assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1), + contextual_embeddings.asnumpy(), 1E-3, 1E-3) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-3, 1E-3) - roberta_model = RobertaModel.from_cfg(cfg) - roberta_model.initialize() - roberta_model.hybridize() - contextual_embeddings, pooled_out = roberta_model(inputs, valid_length) - roberta_model_tn = RobertaModel.from_cfg(cfg_tn) - roberta_model_tn.share_parameters(roberta_model.collect_params()) - roberta_model_tn.hybridize() - contextual_embeddings_tn, pooled_out_tn = roberta_model_tn(inputs.T, valid_length) - assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1), - contextual_embeddings.asnumpy(), 1E-4, 1E-4) - assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) + # Test for RobertaForMLM + roberta_mlm_model = RobertaForMLM(cfg) + roberta_mlm_model.initialize() + roberta_mlm_model.hybridize() + contextual_embedding, pooled_out, mlm_scores = roberta_mlm_model(inputs, valid_length, + masked_positions) + roberta_mlm_model_tn = RobertaForMLM(cfg_tn) + roberta_mlm_model_tn.share_parameters(roberta_mlm_model.collect_params()) + roberta_mlm_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\ + roberta_mlm_model_tn(inputs.T, valid_length.T, masked_positions) + assert_allclose(np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + contextual_embedding.asnumpy(), 1E-3, 1E-3) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-3, 1E-3) + assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-3, 1E-3) - # Test for RobertaForMLM - roberta_mlm_model = RobertaForMLM(cfg) - roberta_mlm_model.initialize() - roberta_mlm_model.hybridize() - contextual_embedding, pooled_out, mlm_scores = roberta_mlm_model(inputs, valid_length, - masked_positions) - roberta_mlm_model_tn = RobertaForMLM(cfg_tn) - roberta_mlm_model_tn.share_parameters(roberta_mlm_model.collect_params()) - roberta_mlm_model_tn.hybridize() - contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\ - roberta_mlm_model_tn(inputs.T, valid_length.T, masked_positions) - assert_allclose(np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), - contextual_embedding.asnumpy(), 1E-4, 1E-4) - assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) - assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4) + # Test for fp16 + if ctx.device_type == 'gpu': + verify_backbone_fp16(model_cls=RobertaModel, cfg=cfg, ctx=ctx, + inputs=[inputs, valid_length]) @pytest.mark.slow @pytest.mark.remote_required -@pytest.mark.parametrize('model_name', list_pretrained_roberta()) +# Just test the fairseq_roberta_base to reduce the time +@pytest.mark.parametrize('model_name', ['fairseq_roberta_base']) def test_roberta(model_name): # test from pretrained - assert len(list_pretrained_roberta()) > 0 with tempfile.TemporaryDirectory() as root: cfg, tokenizer, params_path, mlm_params_path =\ get_pretrained_roberta(model_name, load_backbone=True, load_mlm=True, root=root) @@ -108,7 +116,7 @@ def test_roberta(model_name): ), dtype=np.int32 ) - contextual_embeddings, pooled_out = roberta_model(input_ids, valid_length) + roberta_model(input_ids, valid_length) mx.npx.waitall() # test backward label_smooth_loss = LabelSmoothCrossEntropyLoss(num_labels=vocab_size) diff --git a/tests/test_models_transformer.py b/tests/test_models_transformer.py index 
3588fe2663..6e9502ec72 100644 --- a/tests/test_models_transformer.py +++ b/tests/test_models_transformer.py @@ -1,3 +1,4 @@ +import numpy as np import mxnet as mx import pytest from numpy.testing import assert_allclose @@ -7,6 +8,9 @@ transformer_cfg_reg from gluonnlp.attention_cell import gen_mem_attn_mask, gen_self_attn_mask from gluonnlp.utils.testing import verify_nmt_model, verify_nmt_inference +from gluonnlp.utils.testing import verify_backbone_fp16 + + mx.npx.set_np() @@ -111,8 +115,6 @@ def test_transformer_nmt_model(train_hybridize, inference_hybridize, enc_num_layers, dec_num_layers, enc_recurrent, dec_recurrent, tie_weights, layout): - if inference_hybridize: - pytest.skip('inference model hybridization is not working') src_seq_length = 20 tgt_seq_length = 15 src_vocab_size = 32 @@ -172,3 +174,68 @@ def test_transformer_cfg(cfg_key): model_tn.share_parameters(model.collect_params()) model_tn.hybridize() mx.npx.waitall() + + +@pytest.mark.parametrize('enc_pre_norm,dec_pre_norm', + [(False, False), (True, True)]) +@pytest.mark.parametrize('enc_num_layers,dec_num_layers,enc_units,dec_units', + [(2, 2, 24, 24), + (2, 3, 16, 16)]) +@pytest.mark.parametrize('enc_recurrent', [False, True]) +@pytest.mark.parametrize('dec_recurrent', [False, True]) +@pytest.mark.parametrize('tie_weights,layout', [(False, 'NT'), (True, 'NT'), (True, 'TN')]) +def test_transformer_fp16_amp(enc_pre_norm, dec_pre_norm, + enc_units, dec_units, + enc_num_layers, dec_num_layers, + enc_recurrent, dec_recurrent, tie_weights, + layout, ctx): + if ctx.device_type != 'gpu': + pytest.skip('Only test amp when running on GPU.') + # Generate configuration for testing + cfg = TransformerModel.get_cfg() + cfg.defrost() + cfg.MODEL.src_vocab_size = 32 + cfg.MODEL.tgt_vocab_size = 32 + cfg.MODEL.max_src_length = 20 + cfg.MODEL.max_tgt_length = 15 + cfg.MODEL.tie_weights = tie_weights + cfg.MODEL.layout = layout + + # Encoder config + cfg.MODEL.ENCODER.pre_norm = enc_pre_norm + cfg.MODEL.ENCODER.units = enc_units + cfg.MODEL.ENCODER.num_layers = enc_num_layers + cfg.MODEL.ENCODER.recurrent = enc_recurrent + + # Decoder config + cfg.MODEL.DECODER.pre_norm = dec_pre_norm + cfg.MODEL.DECODER.units = dec_units + cfg.MODEL.DECODER.num_layers = dec_num_layers + cfg.MODEL.DECODER.recurrent = dec_recurrent + cfg.freeze() + + batch_size = 4 + seq_length = 16 + with ctx: + if layout == 'NT': + src_data = mx.np.random.randint(0, cfg.MODEL.src_vocab_size, + (batch_size, seq_length), dtype=np.int32) + src_valid_length = mx.np.random.randint(seq_length // 2, seq_length, + (batch_size,), dtype=np.int32) + tgt_data = mx.np.random.randint(0, cfg.MODEL.tgt_vocab_size, + (batch_size, seq_length), dtype=np.int32) + tgt_valid_length = mx.np.random.randint(seq_length // 2, seq_length, + (batch_size,), dtype=np.int32) + elif layout == 'TN': + src_data = mx.np.random.randint(0, cfg.MODEL.src_vocab_size, + (seq_length, batch_size), dtype=np.int32) + src_valid_length = mx.np.random.randint(seq_length // 2, seq_length, + (batch_size,), dtype=np.int32) + tgt_data = mx.np.random.randint(0, cfg.MODEL.tgt_vocab_size, + (seq_length, batch_size), dtype=np.int32) + tgt_valid_length = mx.np.random.randint(seq_length // 2, seq_length, + (batch_size,), dtype=np.int32) + else: + raise NotImplementedError + verify_backbone_fp16(TransformerModel, cfg, ctx, + inputs=[src_data, src_valid_length, tgt_data, tgt_valid_length]) diff --git a/tests/test_models_xlmr.py b/tests/test_models_xlmr.py index ec19af95ff..b2d3c4b8d9 100644 --- a/tests/test_models_xlmr.py +++ 
b/tests/test_models_xlmr.py @@ -13,6 +13,7 @@ def test_list_pretrained_xlmr(): assert len(list_pretrained_xlmr()) > 0 +# We choose to not test amp for XLMR because it's the same as RoBERTa. @pytest.mark.slow @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_xlmr()) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 48c2331a7a..d3dfd5ccd3 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -1,4 +1,5 @@ import itertools +import pytest import numpy as np from gluonnlp.optimizer import AdamW import mxnet as mx @@ -6,7 +7,8 @@ mx.npx.reset_np() -def test_adam(ctx): +@pytest.mark.parametrize('dtype', [np.float16, np.float32]) +def test_adam(dtype, ctx): with ctx: opt1 = AdamW opt2 = AdamW @@ -16,18 +18,17 @@ def test_adam(ctx): cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{'multi_precision': False}] # TODO(sxjscience) Test for FP16 + mp_options = [{'multi_precision': False}, {'multi_precision': True}] agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, {'aggregate_num': 4}, {'aggregate_num': np.inf}] correct_bias_options = [{'correct_bias': True}, {'correct_bias': False}] - for dtype in [np.float16, np.float32]: - for params in itertools.product(beta1_options, beta2_options, cg_options, - rg_options, wd_options, mp_options, - agg_options, correct_bias_options): - kwarg = {k: v for param in params for k, v in param.items()} - if (dtype == np.float16 and ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - compare_optimizer(opt1(use_fused_step=False, **kwarg), - opt2(use_fused_step=True, **kwarg), shapes, dtype, - rtol=1e-4, atol=2e-5) + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, + agg_options, correct_bias_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=1e-3, atol=2e-3) diff --git a/tools/batch/README.md b/tools/batch/README.md index e95d2e4c6f..1bffcdf2bd 100644 --- a/tools/batch/README.md +++ b/tools/batch/README.md @@ -24,19 +24,33 @@ several pre-trained models could be converted through the corresponding conversi bash run_batch_conversion ${MODEL_TYPE} ``` -## Fine-tuning Downstream Tasks +## SQuAD Training -### Question Answering -We can quickly run the squad finetuning via [squad fine-tuning scripts](../../scripts/question_answering#squad) and the AWS Batch job. 
- -The code is given in [run_batch_squad.sh](run_batch_squad.sh) +The code is given in [question_answering/run_batch_squad.sh](question_answering/run_batch_squad.sh) ```bash # AWS Batch training without horovod on SQuAD 2.0 -bash run_batch_squad.sh +bash question_answering/run_batch_squad.sh 0 2.0 submit_squad_v2_fp32.log float32 # AWS Batch training with horovod on SQuAD 2.0 -bash run_batch_squad.sh 1 2.0 submit_squad_v2_horovod.log +bash question_answering/run_batch_squad.sh 1 2.0 submit_squad_v2_horovod_fp32.log float32 + +# AWS Batch training with horovod on SQuAD 1.1 +bash question_answering/run_batch_squad.sh 1 1.1 submit_squad_v1_horovod_fp32.log float32 +``` + +```bash +# AWS Batch training with horovod on SQuAD 2.0 + FP16 +bash question_answering/run_batch_squad.sh 1 2.0 submit_squad_v2_horovod_fp16.log float16 + +# AWS Batch training with horovod on SQuAD 1.1 + FP16 +bash question_answering/run_batch_squad.sh 1 1.1 submit_squad_v1_horovod_fp16.log float16 +``` + +Also, after you have submitted the jobs, you may sync the results via +```bash +bash question_answering/sync_batch_result.sh submit_squad_v2.log squad_v2_no_horovod +bash question_answering/sync_batch_result.sh submit_squad_v2_horovod.log squad_v2_horovod ``` Internally, it will train the following models on SQuAD 2.0 dataset: @@ -52,4 +66,5 @@ Internally, it will train the following models on SQuAD 2.0 dataset: | electra_base | | electra_large | | roberta_large | +| gluon_en_cased_bert_base_v1 | | mobilebert | diff --git a/tools/batch/run_batch_squad.sh b/tools/batch/question_answering/run_batch_squad.sh similarity index 68% rename from tools/batch/run_batch_squad.sh rename to tools/batch/question_answering/run_batch_squad.sh index 8349716c29..0682fe8cce 100644 --- a/tools/batch/run_batch_squad.sh +++ b/tools/batch/question_answering/run_batch_squad.sh @@ -1,8 +1,13 @@ +#!/bin/bash + set -ex USE_HOROVOD=${1:-0} VERSION=${2:-2.0} LOG_PATH=${3:-submit_squad_v2.log} +DTYPE=${4:-float32} +SUBMIT_SCRIPT_PATH=$(dirname "$0")/../../../tools/batch/submit-job.py + for MODEL_NAME in albert_base \ albert_large \ @@ -14,9 +19,10 @@ for MODEL_NAME in albert_base \ roberta_large \ uncased_bert_base \ uncased_bert_large \ + gluon_en_cased_bert_base_v1 \ mobilebert do - python3 submit-job.py \ + python3 ${SUBMIT_SCRIPT_PATH} \ --region us-east-1 \ --source-ref master \ --job-type g4dn.12x \ @@ -24,5 +30,7 @@ do --name test_squad2_${MODEL_NAME} \ --work-dir scripts/question_answering \ --remote https://github.com/dmlc/gluon-nlp/ \ - --command "bash commands/run_squad2_${MODEL_NAME}.sh ${USE_HOROVOD} ${VERSION} | tee stdout.log" >> ${LOG_PATH} + --command "bash commands/run_squad2_${MODEL_NAME}.sh ${USE_HOROVOD} ${VERSION} ${DTYPE} | tee stdout.log" \ + | perl -pe 's/Submitted job \[([0-9|a-z|_].+)\] to the job queue .+/$1/' \ + | sed -e 's/ - / /g' >> ${LOG_PATH} done diff --git a/tools/batch/question_answering/sync_batch_result.sh b/tools/batch/question_answering/sync_batch_result.sh new file mode 100644 index 0000000000..fe350bd340 --- /dev/null +++ b/tools/batch/question_answering/sync_batch_result.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -ex + +LOG_PATH=$1 +SAVE_DIR_NAME=${2:-squad_2.0} + +while read -r job_name job_id; do + aws s3 sync s3://gluon-nlp-dev/batch/${job_id}/temp ${SAVE_DIR_NAME}/${job_name} +done < ${LOG_PATH} diff --git a/tools/batch/wait-job.py b/tools/batch/wait-job.py index 87d8679255..ea3319ae54 100644 --- a/tools/batch/wait-job.py +++ b/tools/batch/wait-job.py @@ -10,12 +10,14 @@ parser.add_argument('--profile', 
help='profile name of aws account.', type=str,
                    default=None)
+parser.add_argument('--region', help='AWS region to use when creating the Boto3 session.', type=str,
+                    default=None)
 parser.add_argument('--job-id', help='job id to check status and wait.', type=str,
                     default=None)
 args = parser.parse_args()
 
-session = boto3.Session(profile_name=args.profile)
+session = boto3.Session(profile_name=args.profile, region_name=args.region)
 batch, cloudwatch = [session.client(service_name=sn) for sn in ['batch', 'logs']]
 
 def printLogs(logGroupName, logStreamName, startTime):
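
For reference, the updated `tools/batch/wait-job.py` can now be pointed at a specific region. A minimal invocation might look like the sketch below; the job id is a placeholder, and `us-east-1` simply mirrors the region already used by `run_batch_squad.sh`:

```bash
# Wait for a previously submitted AWS Batch job to finish,
# connecting to the Batch queue and CloudWatch logs in us-east-1.
python3 tools/batch/wait-job.py --region us-east-1 --job-id <your-job-id>
```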