diff --git a/README.md b/README.md index 7a5fcf1902..148ca40031 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ process the text data, and train models. # Features -- Easy-to-use Text Processing Tools and APIs +- Easy-to-use Text Processing Tools and Modular APIs - Pretrained Model Zoo - Write Models with Numpy-like API - Fast Inference via [Apache TVM (incubating)](https://tvm.apache.org/) (Experimental) @@ -28,16 +28,16 @@ First of all, install the latest MXNet. You may use the following commands: ```bash # Install the version with CUDA 10.0 -python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200926" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20201101" -f https://dist.mxnet.io/python # Install the version with CUDA 10.1 -python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200926" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20201101" -f https://dist.mxnet.io/python # Install the version with CUDA 10.2 -python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20201101" -f https://dist.mxnet.io/python # Install the cpu-only version -python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python +python3 -m pip install -U --pre "mxnet>=2.0.0b20201101" -f https://dist.mxnet.io/python ``` diff --git a/scripts/benchmarks/benchmark_gluonnlp.py b/scripts/benchmarks/benchmark_gluonnlp.py index 350c3411f3..1e7bf2913e 100644 --- a/scripts/benchmarks/benchmark_gluonnlp.py +++ b/scripts/benchmarks/benchmark_gluonnlp.py @@ -58,13 +58,14 @@ def get_parser(): help='Whether to use TVM for inference/training') parser.add_argument('--instance_type', choices=['c4', 'c5', 'g4', 'p3'], default='g4', help='The instance type that the profiling script will be run on.') + parser.add_argument('--use_fp16', action='store_true') parser.add_argument('--mode', type=str, default='train', choices=['train', 'inference']) return parser def run_benchmark(workload, model_name, out_file_name, is_train, - use_tvm, instance_type): + use_tvm, instance_type, use_fp16): if is_train: benchmark = GluonNLPBackboneBenchmark( workloads=workload, @@ -72,6 +73,7 @@ def run_benchmark(workload, model_name, out_file_name, is_train, profile_inference=False, profile_train=True, to_csv=True, + use_fp16=use_fp16, train_out_csv_file=out_file_name) benchmark.run() else: @@ -83,6 +85,7 @@ def run_benchmark(workload, model_name, out_file_name, is_train, use_tvm=use_tvm, instance_type=instance_type, to_csv=True, + use_fp16=use_fp16, inference_out_csv_file=out_file_name) benchmark.run() return @@ -94,13 +97,15 @@ def run_benchmark(workload, model_name, out_file_name, is_train, args = parser.parse_args() if args.compute_layout is None: args.compute_layout = args.layout + dtype = 'float32' if not args.use_fp16 else 'float16' for layout, compute_layout in [(args.layout, args.compute_layout)]: if compute_layout != layout: profile_models = [ele for ele in MODELS if 'bart' not in ele] else: profile_models = [ele for ele in MODELS] if args.mode == 'inference': - out_dir = 'infer_fp32_{}_{}_tvm{}'.format(layout, compute_layout, int(args.use_tvm)) + out_dir = 'infer_{}_{}_{}_tvm{}'.format(dtype, layout, compute_layout, + int(args.use_tvm)) df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length', 'latency', 'memory']) os.makedirs(out_dir, exist_ok=True) @@ -111,16 +116,17 @@ def run_benchmark(workload, model_name, out_file_name, is_train, 
process = Process( target=run_benchmark, args=(workload, model_name, out_path, False, - args.use_tvm, args.instance_type)) + args.use_tvm, args.instance_type, args.use_fp16)) process.start() process.join() new_df = pd.read_csv(out_path) df = df.append(new_df, ignore_index=True) - df.to_csv('gluonnlp_infer_fp32_{}_{}_tvm{}.csv'.format(layout, + df.to_csv('gluonnlp_infer_{}_{}_{}_tvm{}.csv'.format(dtype, + layout, compute_layout, int(args.use_tvm))) elif args.mode == 'train': - out_dir = 'train_fp32_{}_{}'.format(layout, compute_layout) + out_dir = 'train_{}_{}_{}'.format(dtype, layout, compute_layout) df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length', 'latency', 'memory']) os.makedirs(out_dir, exist_ok=True) @@ -130,11 +136,12 @@ def run_benchmark(workload, model_name, out_file_name, is_train, workload[1])) process = Process( target=run_benchmark, - args=(workload, model_name, out_path, True)) + args=(workload, model_name, out_path, True, False, + args.instance_type, args.use_fp16)) process.start() process.join() new_df = pd.read_csv(out_path) df = df.append(new_df, ignore_index=True) - df.to_csv('gluonnlp_train_fp32_{}_{}.csv'.format(layout, compute_layout)) + df.to_csv('gluonnlp_train_{}_{}_{}.csv'.format(dtype, layout, compute_layout)) else: raise NotImplementedError diff --git a/scripts/benchmarks/benchmark_gluonnlp_fp16.sh b/scripts/benchmarks/benchmark_gluonnlp_fp16.sh new file mode 100644 index 0000000000..784e73dc7a --- /dev/null +++ b/scripts/benchmarks/benchmark_gluonnlp_fp16.sh @@ -0,0 +1,14 @@ +for mode in train inference +do + python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode --use_fp16 +done + +for mode in train inference +do + python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode --use_fp16 +done + +for mode in train inference +do + python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode --use_fp16 +done diff --git a/scripts/benchmarks/benchmark_utils.py b/scripts/benchmarks/benchmark_utils.py index b4f32ff601..65e22b189b 100644 --- a/scripts/benchmarks/benchmark_utils.py +++ b/scripts/benchmarks/benchmark_utils.py @@ -748,7 +748,6 @@ def __init__(self, workloads, model_names, use_fp16=False, self._inference_out_csv_file = inference_out_csv_file self._train_out_csv_file = train_out_csv_file self._env_info_file = env_info_file - assert use_fp16 is False, 'Currently fp16 benchmark has not been supported yet.' 
@property def model_names(self): @@ -760,22 +759,26 @@ def workloads(self): def _inference_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\ -> Tuple[float, Memory]: + if self._use_fp16: + dtype = 'float16' + else: + dtype = 'float32' if self._use_gpu: ctx = mxnet.gpu() else: ctx = mxnet.cpu() model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name) - # TODO Support fp16 profiling cfg.defrost() cfg.MODEL.layout = self._layout if model_cls.__name__ not in ['BartModel']: cfg.MODEL.compute_layout = self._compute_layout cfg.freeze() if model_cls.__name__ in ['BartModel']: - model = model_cls.from_cfg(cfg, extract_feature=True) + model = model_cls.from_cfg(cfg, extract_feature=True, dtype=dtype) else: - model = model_cls.from_cfg(cfg) - model.load_parameters(backbone_param_path, ctx=ctx) + model = model_cls.from_cfg(cfg, dtype=dtype) + model.load_parameters(backbone_param_path, ctx=ctx, cast_dtype=True) + model.cast(dtype) model.hybridize() vocab_size = cfg.MODEL.vocab_size if self._layout == 'NT': @@ -860,12 +863,15 @@ def run_tvm_forward(): def _train_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\ -> Tuple[float, Memory]: + if self._use_fp16: + from mxnet import amp + amp.init() + if self._use_gpu: ctx = mxnet.gpu() else: ctx = mxnet.cpu() model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name) - # TODO Support fp16 profiling cfg.defrost() cfg.MODEL.layout = self._layout if model_cls.__name__ not in ['BartModel']: diff --git a/scripts/machine_translation/README.md b/scripts/machine_translation/README.md index 4b729cc117..c0d37981a2 100644 --- a/scripts/machine_translation/README.md +++ b/scripts/machine_translation/README.md @@ -30,6 +30,7 @@ python3 train_transformer.py \ --save_dir transformer_base_wmt2014_en_de_${SUBWORD_ALGO} \ --cfg transformer_base \ --lr 0.002 \ + --num_accumulated 32 \ --sampler BoundedBudgetSampler \ --max_num_tokens 2700 \ --epochs 30 \ diff --git a/scripts/machine_translation/train_transformer.py b/scripts/machine_translation/train_transformer.py index b51c9858d0..4e55c10675 100644 --- a/scripts/machine_translation/train_transformer.py +++ b/scripts/machine_translation/train_transformer.py @@ -441,8 +441,10 @@ def train(args): for sample_data, ctx in zip(sample_data_l, ctx_l): if sample_data is None: continue - src_token_ids, tgt_token_ids, src_valid_length, tgt_valid_length, sample_ids = sample_data - src_wc, tgt_wc, bs = src_valid_length.sum(), tgt_valid_length.sum(), src_token_ids.shape[0] + src_token_ids, tgt_token_ids, src_valid_length,\ + tgt_valid_length, sample_ids = sample_data + src_wc, tgt_wc, bs = src_valid_length.sum(),\ + tgt_valid_length.sum(), src_token_ids.shape[0] loss_denom += tgt_wc - bs log_loss_denom += tgt_wc - bs log_wc += src_wc + tgt_wc diff --git a/scripts/question_answering/README.md b/scripts/question_answering/README.md index 9f4065dbab..1dbd5377a8 100644 --- a/scripts/question_answering/README.md +++ b/scripts/question_answering/README.md @@ -84,6 +84,10 @@ horovodrun -np 4 -H localhost:4 python3 run_squad.py \ ... ``` +### Using AMP + +Just add `--dtype float16` if you'd like to use AMP for training and half-precision for inference. + ### Finetuning Details As for ELECTRA model, we fine-tune it with layer-wise learning rate decay as @@ -145,16 +149,16 @@ Performance are shown in the table below, in which the SQuAD1.1 are evaluated wi Notice that the standard metrics of SQuAD are `EM/F1`. 
The former is an exact match score between predictions and references, while the latter is a token-level F1 score in which the common tokens are considered as True Positives. -|Reproduced ALBERT Models (F1/EM) | SQuAD 1.1 dev | SQuAD 2.0 dev | SQuAD 2.0 Results File | Log | Command | -|----------------------------------|---------------|---------------|------|-----| --------| -|ALBERT base | 90.55/83.83 | 82.09/79.40 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_base_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_base_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_base.sh) | -|ALBERT large | 92.66/86.43 | 84.98/82.19 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_large_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_large_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_large.sh) | -|ALBERT xlarge | 93.85/87.71 | 87.92/85.04 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xlarge_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xlarge_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_xlarge.sh) | -|ALBERT xxlarge | 95.00/89.01 | 89.91/86.87 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xxlarge_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_albert_xxlarge_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_xxlarge.sh) | +|Reproduced ALBERT Models (F1/EM) | SQuAD 1.1 dev | SQuAD 2.0 dev | SQuAD 2.0 Results File | Log | Command | Weight | +|----------------------------------|---------------|---------------|------|-----|---------|----------| +|ALBERT base | 90.55/83.83 | 82.57/79.75 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_base/fintune_google_albert_base_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_base/fintune_google_albert_base_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_base.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_base/fintune_google_albert_base_v2_squad_2.0/google_albert_base_v2_squad2.0_8163.params) | +|ALBERT large | 92.66/86.43 | 85.21/82.50 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_large/fintune_google_albert_large_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_large/fintune_google_albert_large_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_large.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_large/fintune_google_albert_large_v2_squad_2.0/google_albert_large_v2_squad2.0_8163.params) | +|ALBERT xlarge | 93.85/87.71 | 87.73/84.83 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_xlarge/fintune_google_albert_xlarge_v2_squad_2.0/best_results.json) | 
[log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_xlarge/fintune_google_albert_xlarge_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_xlarge.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_xlarge/fintune_google_albert_xlarge_v2_squad_2.0/google_albert_xlarge_v2_squad2.0_8163.params) | +|ALBERT xxlarge | 95.00/89.01 | 89.84/86.79 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_xxlarge/fintune_google_albert_xxlarge_v2_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_xxlarge/fintune_google_albert_xxlarge_v2_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_albert_xxlarge.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_albert_xxlarge/fintune_google_albert_xxlarge_v2_squad_2.0/google_albert_xxlarge_v2_squad2.0_8163.params) | For reference, we've included the results from Google's Original Experiments -| Model Name | SQuAD 1.1 dev | SQuAD 2.0 dev| +| Model Name (F1/EM) | SQuAD 1.1 dev | SQuAD 2.0 dev| |------------|---------------|--------------| |ALBERT base (googleresearch/albert) | 90.2/83.2 | 82.1/79.3 | |ALBERT large (googleresearch/albert) | 91.8/85.2 | 84.9/81.8 | @@ -163,19 +167,19 @@ For reference, we've included the results from Google's Original Experiments For the reset pretrained models, the results on SQuAD1.1 and SQuAD2.0 are given as follows. -| Model Name | SQuAD1.1 dev | SQuAD2.0 dev | SQuAD 2.0 Results File | Log | Command | -|--------------------------|---------------|--------------|------|-----|--------| -|BERT base | 88.40/81.24 | 76.43/73.59 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_base_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_base_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_uncased_bert_base.sh) | -|BERT large | 90.45/83.55 | 81.41/78.46 | [json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_en_uncased_bert_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_uncased_bert_large.sh) | -|ELECTRA small | 85.42/78.95 | 73.93/71.36 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_small_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_small_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_small.sh) | -|ELECTRA base | 92.63/87.34 | 86.65/83.95 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_base_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_base_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_small.sh) | -|ELECTRA large | 94.95/89.94 | 90.67/88.32 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_electra_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_base.sh) | -|MobileBERT | 
89.87/83.26 | 80.54/77.81 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_uncased_mobilebert_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_google_uncased_mobilebert_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_mobilebert.sh) | -|RoBERTa large | 94.58/88.86 | 89.69/86.80 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_fairseq_roberta_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/fintune_fairseq_roberta_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_large.sh) | +| Model Name (F1/EM) | SQuAD1.1 dev | SQuAD2.0 dev | SQuAD 2.0 Results File | Log | Command | Weight | +|--------------------------|---------------|--------------|------|-----|--------|---------| +|BERT base | 88.44/81.54 | 76.32/73.64 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_uncased_bert_base/fintune_google_en_uncased_bert_base_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_uncased_bert_base/fintune_google_en_uncased_bert_base_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_uncased_bert_base.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_uncased_bert_base/fintune_google_en_uncased_bert_base_squad_2.0/google_en_uncased_bert_base_squad2.0_8160.params) | +|BERT large | 90.65/84.02 | 81.22/78.22 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_uncased_bert_large/fintune_google_en_uncased_bert_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_uncased_bert_large/fintune_google_en_uncased_bert_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_uncased_bert_large.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_uncased_bert_large/fintune_google_en_uncased_bert_large_squad_2.0/google_en_uncased_bert_large_squad2.0_8159.params) | +|ELECTRA small | 85.76/79.16 | 74.07/71.56 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_small/fintune_google_electra_small_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_small/fintune_google_electra_small_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_small.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_small/fintune_google_electra_small_squad_2.0/google_electra_small_squad2.0_8160.params) | +|ELECTRA base | 92.64/86.99 | 86.33/83.67 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_base/fintune_google_electra_base_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_base/fintune_google_electra_base_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_base.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_base/fintune_google_electra_base_squad_2.0/google_electra_base_squad2.0_8160.params) | +|ELECTRA large | 94.79/89.52 | 90.55/88.24 
|[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_large/fintune_google_electra_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_large/fintune_google_electra_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_large.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_electra_large/fintune_google_electra_large_squad_2.0/google_electra_large_squad2.0_8159.params) | +|MobileBERT | 89.69/82.88 | 80.27/77.60 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_mobilebert/fintune_google_uncased_mobilebert_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_mobilebert/fintune_google_uncased_mobilebert_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_mobilebert.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_mobilebert/fintune_google_uncased_mobilebert_squad_2.0/google_uncased_mobilebert_squad2.0_20615.params) | +|RoBERTa large | 94.57/88.88 | 89.70/86.79 |[json](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_roberta_large/fintune_fairseq_roberta_large_squad_2.0/best_results.json) | [log](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_roberta_large/fintune_fairseq_roberta_large_squad_2.0/finetune_squad2.0.log) | [command](./commands/run_squad2_electra_large.sh) | [weight](https://gluon-nlp-log.s3.amazonaws.com/squad_training_log/20201025/test_squad2_roberta_large/fintune_fairseq_roberta_large_squad_2.0/fairseq_roberta_large_squad2.0_8160.params) | For reference, we have also included the results of original version from Google and Fairseq -| Model Name | SQuAD1.1 dev | SQuAD2.0 dev | +| Model Name (F1/EM) | SQuAD1.1 dev | SQuAD2.0 dev | |--------------------------|----------------|---------------| |Google BERT base | 88.5/80.8 | - / - | |Google BERT large | 90.9/84.1 | - / - | diff --git a/scripts/question_answering/commands/generate_commands.py b/scripts/question_answering/commands/generate_commands.py index e6c1fedbf2..9245d89daf 100644 --- a/scripts/question_answering/commands/generate_commands.py +++ b/scripts/question_answering/commands/generate_commands.py @@ -1,5 +1,6 @@ -from gluonnlp.utils.config import CfgNode import re +import os +from gluonnlp.utils.config import CfgNode def base_cfg(): @@ -12,9 +13,10 @@ def base_cfg(): cfg.lr = 2e-5 cfg.warmup_ratio = 0.1 cfg.wd = 0.01 - cfg.max_grad_norm = 0.1 + cfg.max_grad_norm = 1.0 cfg.max_seq_length = 512 cfg.layerwise_decay = -1 + cfg.dtype = 'float32' return cfg @@ -35,6 +37,7 @@ def albert_xlarge_cfg(): cfg.model_name = 'google_albert_xlarge_v2' cfg.batch_size = 1 cfg.num_accumulated = 12 + cfg.max_grad_norm = 0.1 return cfg @@ -118,6 +121,12 @@ def uncased_bert_large_cfg(): return cfg +def gluon_en_cased_bert_base_v1_cfg(): + cfg = uncased_bert_base_cfg() + cfg.model_name = 'gluon_en_cased_bert_base_v1' + return cfg + + def gen_command(config, template_path, out_path): print(f'Generating from "{template_path}" to "{out_path}"') @@ -134,7 +143,8 @@ def replace_fn(match): if __name__ == '__main__': for cfg_func in [albert_base_cfg, albert_large_cfg, albert_xlarge_cfg, albert_xxlarge_cfg, electra_base_cfg, electra_large_cfg, electra_small_cfg, mobilebert_cfg, - roberta_large_cfg, uncased_bert_base_cfg, uncased_bert_large_cfg]: + 
roberta_large_cfg, uncased_bert_base_cfg, uncased_bert_large_cfg, + gluon_en_cased_bert_base_v1_cfg]: prefix = cfg_func.__name__[:-len('_cfg')] gen_command(cfg_func(), 'run_squad.template', f'run_squad2_{prefix}.sh') diff --git a/scripts/question_answering/commands/run_squad.template b/scripts/question_answering/commands/run_squad.template index a67b23bce3..d24ff71723 100644 --- a/scripts/question_answering/commands/run_squad.template +++ b/scripts/question_answering/commands/run_squad.template @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-{{ dtype }}} # Default training data type MODEL_NAME={{ model_name }} BATCH_SIZE={{ batch_size }} NUM_ACCUMULATED={{ num_accumulated }} @@ -16,12 +17,14 @@ LAYERWISE_DECAY={{ layerwise_decay }} # Prepare the Data nlp_data prepare_squad --version ${VERSION} +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + # Run the script if [ ${USE_HOROVOD} -eq 0 ]; then - RUN_COMMAND="python3 run_squad.py --gpus 0,1,2,3" + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" else - RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 run_squad.py --comm_backend horovod" + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" fi ${RUN_COMMAND} \ --model_name ${MODEL_NAME} \ @@ -39,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_base.sh b/scripts/question_answering/commands/run_squad2_albert_base.sh index 732b3abef8..ab960650f3 100644 --- a/scripts/question_answering/commands/run_squad2_albert_base.sh +++ b/scripts/question_answering/commands/run_squad2_albert_base.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_albert_base_v2 BATCH_SIZE=4 NUM_ACCUMULATED=3 @@ -10,7 +11,7 @@ LR=2e-05 WARMUP_RATIO=0.1 WD=0.01 MAX_SEQ_LENGTH=512 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=-1 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_large.sh b/scripts/question_answering/commands/run_squad2_albert_large.sh index fb92b7cda9..4007617869 100644 --- a/scripts/question_answering/commands/run_squad2_albert_large.sh +++ b/scripts/question_answering/commands/run_squad2_albert_large.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_albert_large_v2 BATCH_SIZE=3 NUM_ACCUMULATED=4 @@ -10,7 +11,7 @@ LR=2e-05 WARMUP_RATIO=0.1 WD=0.01 MAX_SEQ_LENGTH=512 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=-1 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_xlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh index 0bd28952d5..3392f2f9a2 100644 --- a/scripts/question_answering/commands/run_squad2_albert_xlarge.sh +++ b/scripts/question_answering/commands/run_squad2_albert_xlarge.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_albert_xlarge_v2 BATCH_SIZE=1 NUM_ACCUMULATED=12 @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh index 9383cbc873..d2bc808f46 100644 --- a/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh +++ b/scripts/question_answering/commands/run_squad2_albert_xxlarge.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_albert_xxlarge_v2 BATCH_SIZE=1 NUM_ACCUMULATED=12 @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_electra_base.sh b/scripts/question_answering/commands/run_squad2_electra_base.sh index 16ee8cdb98..2aa9755069 100644 --- a/scripts/question_answering/commands/run_squad2_electra_base.sh +++ b/scripts/question_answering/commands/run_squad2_electra_base.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_electra_base BATCH_SIZE=8 NUM_ACCUMULATED=1 @@ -10,7 +11,7 @@ LR=0.0001 WARMUP_RATIO=0.1 WD=0 MAX_SEQ_LENGTH=512 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=0.8 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_electra_large.sh b/scripts/question_answering/commands/run_squad2_electra_large.sh index d8a52235e8..389375f614 100644 --- a/scripts/question_answering/commands/run_squad2_electra_large.sh +++ b/scripts/question_answering/commands/run_squad2_electra_large.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_electra_large BATCH_SIZE=2 NUM_ACCUMULATED=4 @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_electra_small.sh b/scripts/question_answering/commands/run_squad2_electra_small.sh index d6228ef0bc..c565ce4403 100644 --- a/scripts/question_answering/commands/run_squad2_electra_small.sh +++ b/scripts/question_answering/commands/run_squad2_electra_small.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_electra_small BATCH_SIZE=8 NUM_ACCUMULATED=1 @@ -10,7 +11,7 @@ LR=0.0003 WARMUP_RATIO=0.1 WD=0 MAX_SEQ_LENGTH=512 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=0.8 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_gluon_en_cased_bert_base_v1.sh b/scripts/question_answering/commands/run_squad2_gluon_en_cased_bert_base_v1.sh new file mode 100644 index 0000000000..a3801f4cdc --- /dev/null +++ b/scripts/question_answering/commands/run_squad2_gluon_en_cased_bert_base_v1.sh @@ -0,0 +1,46 @@ +# Generated by "generate_commands.py" + +USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod +VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type +MODEL_NAME=gluon_en_cased_bert_base_v1 +BATCH_SIZE=6 +NUM_ACCUMULATED=2 +EPOCHS=3 +LR=3e-05 +WARMUP_RATIO=0.1 +WD=0.01 +MAX_SEQ_LENGTH=512 +MAX_GRAD_NORM=1.0 +LAYERWISE_DECAY=-1 + +# Prepare the Data +nlp_data prepare_squad --version ${VERSION} + +RUN_SQUAD_PATH=$(dirname "$0")/../run_squad.py + +# Run the script +if [ ${USE_HOROVOD} -eq 0 ]; +then + RUN_COMMAND="python3 ${RUN_SQUAD_PATH} --gpus 0,1,2,3" +else + RUN_COMMAND="horovodrun -np 4 -H localhost:4 python3 ${RUN_SQUAD_PATH} --comm_backend horovod" +fi +${RUN_COMMAND} \ + --model_name ${MODEL_NAME} \ + --data_dir squad \ + --output_dir fintune_${MODEL_NAME}_squad_${VERSION} \ + --version ${VERSION} \ + --do_eval \ + --do_train \ + --batch_size ${BATCH_SIZE} \ + --num_accumulated ${NUM_ACCUMULATED} \ + --layerwise_decay ${LAYERWISE_DECAY} \ + --epochs ${EPOCHS} \ + --lr ${LR} \ + --warmup_ratio ${WARMUP_RATIO} \ + --wd ${WD} \ + --max_seq_length ${MAX_SEQ_LENGTH} \ + --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ + --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_mobilebert.sh b/scripts/question_answering/commands/run_squad2_mobilebert.sh index 24fece841d..f59c16cd9e 100644 --- a/scripts/question_answering/commands/run_squad2_mobilebert.sh +++ b/scripts/question_answering/commands/run_squad2_mobilebert.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_uncased_mobilebert BATCH_SIZE=8 NUM_ACCUMULATED=1 @@ -10,7 +11,7 @@ LR=4e-05 WARMUP_RATIO=0.1 WD=0.01 MAX_SEQ_LENGTH=384 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=-1 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_roberta_large.sh b/scripts/question_answering/commands/run_squad2_roberta_large.sh index 2bf51e6b6c..b95b949757 100644 --- a/scripts/question_answering/commands/run_squad2_roberta_large.sh +++ b/scripts/question_answering/commands/run_squad2_roberta_large.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=fairseq_roberta_large BATCH_SIZE=2 NUM_ACCUMULATED=6 @@ -10,7 +11,7 @@ LR=3e-05 WARMUP_RATIO=0.2 WD=0.01 MAX_SEQ_LENGTH=512 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=-1 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh index f2a0738282..ee3d8d0208 100644 --- a/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh +++ b/scripts/question_answering/commands/run_squad2_uncased_bert_base.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 0 --> not use horovod, 1 --> use horovod VERSION=${2:-2.0} # SQuAD Version +DTYPE=${3:-float32} # Default training data type MODEL_NAME=google_en_uncased_bert_base BATCH_SIZE=6 NUM_ACCUMULATED=2 @@ -10,7 +11,7 @@ LR=3e-05 WARMUP_RATIO=0.1 WD=0.01 MAX_SEQ_LENGTH=512 -MAX_GRAD_NORM=0.1 +MAX_GRAD_NORM=1.0 LAYERWISE_DECAY=-1 # Prepare the Data @@ -41,4 +42,5 @@ ${RUN_COMMAND} \ --wd ${WD} \ --max_seq_length ${MAX_SEQ_LENGTH} \ --max_grad_norm ${MAX_GRAD_NORM} \ + --dtype ${DTYPE} \ --overwrite_cache diff --git a/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh index 2f19c4c5e7..ee94b544c1 100644 --- a/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh +++ b/scripts/question_answering/commands/run_squad2_uncased_bert_large.sh @@ -2,6 +2,7 @@ USE_HOROVOD=${1:-0} # Horovod flag. 
0 --> not use horovod, 1 --> use horovod
VERSION=${2:-2.0} # SQuAD Version
+DTYPE=${3:-float32} # Default training data type
MODEL_NAME=google_en_uncased_bert_large
BATCH_SIZE=2
NUM_ACCUMULATED=6
@@ -10,7 +11,7 @@ LR=3e-05
WARMUP_RATIO=0.1
WD=0.01
MAX_SEQ_LENGTH=512
-MAX_GRAD_NORM=0.1
+MAX_GRAD_NORM=1.0
LAYERWISE_DECAY=-1
# Prepare the Data
@@ -41,4 +42,5 @@ ${RUN_COMMAND} \
--wd ${WD} \
--max_seq_length ${MAX_SEQ_LENGTH} \
--max_grad_norm ${MAX_GRAD_NORM} \
+ --dtype ${DTYPE} \
--overwrite_cache
diff --git a/scripts/question_answering/run_squad.py b/scripts/question_answering/run_squad.py
index 70ba408843..8874f9068d 100644
--- a/scripts/question_answering/run_squad.py
+++ b/scripts/question_answering/run_squad.py
@@ -8,8 +8,11 @@ import time
import logging
import argparse
+import ast
import functools
import collections
+import dataclasses
+from dataclasses import dataclass
from multiprocessing import Pool, cpu_count
import mxnet as mx
@@ -142,12 +145,27 @@ def parse_args():
'instead of only last one')
parser.add_argument('--max_saved_ckpt', type=int, default=5,
help='The maximum number of saved checkpoints')
- parser.add_argument('--eval_dtype', type=str, default='float32',
- help='Data type used for evaluation. Either float32 or float16')
+ parser.add_argument('--dtype', type=str, default='float32',
+ help='Data type used for training and evaluation. Either float32 or float16. When you '
+ 'use --dtype float16, AMP will be turned on in the training phase and '
+ 'fp16 will be used in evaluation.')
args = parser.parse_args()
return args
+ChunkFeature = collections.namedtuple('ChunkFeature',
+ ['qas_id',
+ 'data',
+ 'valid_length',
+ 'segment_ids',
+ 'masks',
+ 'is_impossible',
+ 'gt_start',
+ 'gt_end',
+ 'context_offset',
+ 'chunk_start',
+ 'chunk_length'])
+
class SquadDatasetProcessor:
def __init__(self, tokenizer, doc_stride, max_seq_length, max_query_length):
@@ -176,24 +194,13 @@ def __init__(self, tokenizer, doc_stride, max_seq_length, max_query_length):
self.sep_id = vocab.eos_id if 'sep_token' not in vocab.special_token_keys else vocab.sep_id
# TODO(sxjscience) Consider to combine the NamedTuple and batchify functionality.
- self.ChunkFeature = collections.namedtuple('ChunkFeature',
- ['qas_id',
- 'data',
- 'valid_length',
- 'segment_ids',
- 'masks',
- 'is_impossible',
- 'gt_start',
- 'gt_end',
- 'context_offset',
- 'chunk_start',
- 'chunk_length'])
- self.BatchifyFunction = bf.NamedTuple(self.ChunkFeature,
+ # Here, we use round_to=8 to improve the throughput.
+ self.BatchifyFunction = bf.NamedTuple(ChunkFeature, {'qas_id': bf.List(), - 'data': bf.Pad(val=self.pad_id), + 'data': bf.Pad(val=self.pad_id, round_to=8), 'valid_length': bf.Stack(), 'segment_ids': bf.Pad(), - 'masks': bf.Pad(val=1), + 'masks': bf.Pad(val=1, round_to=8), 'is_impossible': bf.Stack(), 'gt_start': bf.Stack(), 'gt_end': bf.Stack(), @@ -266,17 +273,17 @@ def process_sample(self, feature: SquadFeature): # Here, we increase the start and end because we put query before context start_pos = chunk.gt_start_pos + context_offset end_pos = chunk.gt_end_pos + context_offset - chunk_feature = self.ChunkFeature(qas_id=feature.qas_id, - data=data, - valid_length=valid_length, - segment_ids=segment_ids, - masks=masks, - is_impossible=chunk.is_impossible, - gt_start=start_pos, - gt_end=end_pos, - context_offset=context_offset, - chunk_start=chunk.start, - chunk_length=chunk.length) + chunk_feature = ChunkFeature(qas_id=feature.qas_id, + data=data, + valid_length=valid_length, + segment_ids=segment_ids, + masks=masks, + is_impossible=chunk.is_impossible, + gt_start=start_pos, + gt_end=end_pos, + context_offset=context_offset, + chunk_start=chunk.start, + chunk_length=chunk.length) ret.append(chunk_feature) return ret @@ -427,7 +434,9 @@ def setup_logging(args, local_rank): set_seed(args.seed) logging.debug('Random seed set to {}'.format(args.seed)) + def train(args): + use_amp = args.dtype == 'float16' store, num_workers, rank, local_rank, is_master_node, ctx_l = init_comm( args.comm_backend, args.gpus) setup_logging(args, local_rank) @@ -527,7 +536,7 @@ def train(args): 'wd': args.wd, 'lr_scheduler': lr_scheduler, } - adam_betas = eval(args.adam_betas) + adam_betas = ast.literal_eval(args.adam_betas) if args.optimizer == 'adamw': optimizer_params.update({'beta1': adam_betas[0], 'beta2': adam_betas[1], @@ -539,12 +548,15 @@ def train(args): 'beta2': adam_betas[1], 'epsilon': args.adam_epsilon, }) + if use_amp: + optimizer_params.update({'multi_precision': True}) if args.comm_backend == 'horovod': trainer = hvd.DistributedTrainer(param_dict, args.optimizer, optimizer_params) else: trainer = mx.gluon.Trainer(param_dict, args.optimizer, optimizer_params, update_on_kvstore=False) - + if use_amp: + amp.init_trainer(trainer) log_span_loss = 0 log_answerable_loss = 0 log_total_loss = 0 @@ -584,9 +596,18 @@ def train(args): loss_l.append(loss) span_loss_l.append(span_loss) answerable_loss_l.append(answerable_loss) + if use_amp: + with mx.autograd.record(): + with amp.scale_loss(loss_l, trainer) as loss_l: + for loss in loss_l: + loss.backward() + norm_clip_mult = num_workers * trainer._amp_loss_scaler.loss_scale + else: + with mx.autograd.record(): + for loss in loss_l: + loss.backward() + norm_clip_mult = num_workers - for loss in loss_l: - loss.backward() # All Reduce the Step Loss log_span_loss += sum([ele.as_in_ctx(ctx_l[0]) for ele in span_loss_l]).asnumpy() log_total_loss += sum([ele.as_in_ctx(ctx_l[0]) @@ -598,7 +619,7 @@ def train(args): if args.max_grad_norm > 0: total_norm, ratio, is_finite = clip_grad_global_norm( - params, args.max_grad_norm * num_workers) + params, args.max_grad_norm * norm_clip_mult) else: total_norm = grad_global_norm(params) @@ -610,7 +631,7 @@ def train(args): # gluon.trainer._scale is default to 1 trainer.update(num_workers, ignore_stale_grad=True) - total_norm = total_norm / num_workers + total_norm = total_norm / norm_clip_mult if args.num_accumulated > 1: # set grad to zero for gradient accumulation qa_net.zero_grad() @@ -651,7 +672,6 @@ def train(args): 
log_answerable_loss = 0 log_total_loss = 0 log_sample_num = 0 - num_samples_per_update = 0 if (step_num + 1) >= num_train_steps: toc = time.time() @@ -808,8 +828,8 @@ def evaluate(args, last=True): str(ctx_l))) cfg, tokenizer, qa_net, use_segmentation = get_network( - args.model_name, ctx_l, args.classifier_dropout, dtype=args.eval_dtype) - if args.eval_dtype == 'float16': + args.model_name, ctx_l, args.classifier_dropout, dtype=args.dtype) + if args.dtype == 'float16': qa_net.cast('float16') qa_net.hybridize() @@ -978,6 +998,10 @@ def eval_validation(ckpt_name, best_eval): os.environ['MXNET_GPU_MEM_POOL_TYPE'] = 'Round' args = parse_args() if args.do_train: + if args.dtype == 'float16': + # Initialize amp if it's fp16 training + from mxnet import amp + amp.init() train(args) if args.do_eval: evaluate(args, last=not args.all_evaluate) diff --git a/src/gluonnlp/attention_cell.py b/src/gluonnlp/attention_cell.py index 1ea4350127..a6e9df089c 100644 --- a/src/gluonnlp/attention_cell.py +++ b/src/gluonnlp/attention_cell.py @@ -160,7 +160,7 @@ def gen_self_attn_mask(data, else: raise NotImplementedError mask = mask.astype(dtype) - return mask + return mask.astype(np.bool) def gen_mem_attn_mask(mem, mem_valid_length, data, data_valid_length=None, @@ -241,7 +241,7 @@ def gen_mem_attn_mask(mem, mem_valid_length, data, data_valid_length=None, else: query_length_ones = np.ones_like(data_steps) mask = query_length_ones.reshape((1, -1, 1)) * mem_mask - return mask + return mask.astype(np.bool) # TODO(sxjscience) Directly implement a kernel for masked softmax @@ -273,7 +273,7 @@ def masked_softmax(att_score, mask, dtype=np.float32, axis: int = -1): else: try: # if AMP (automatic mixed precision) is enabled, -1e18 will cause NaN. - from mxnet.contrib import amp + from mxnet import amp if amp.amp._amp_initialized: neg = -1e4 except ImportError: diff --git a/src/gluonnlp/data/sampler.py b/src/gluonnlp/data/sampler.py index aabfe7a688..08fd9b48ca 100644 --- a/src/gluonnlp/data/sampler.py +++ b/src/gluonnlp/data/sampler.py @@ -266,20 +266,25 @@ def __len__(self): class BoundedBudgetSampler(BaseSampler): - r"""Assign each data sample to bounded budget batches. Samples will be sorted by length before batchfy - see https://github.com/pytorch/fairseq/blob/master/fairseq/data/data_utils_fast.pyx + r"""Assign each data sample to bounded budget batches. + We will ensure that within the batch, + the total number of tokens is smaller than the provided max_num_tokens, + and the total number of sentences is smaller than the provided max_num_sentences. + + Samples will be sorted by length before batchify + See Also https://github.com/pytorch/fairseq/blob/master/fairseq/data/data_utils_fast.pyx Parameters ---------- lengths The length of the sequences in the input data sample. max_num_tokens - max tokens num of each batch + Max number of tokens of each batch max_num_sentences - max sentences num of each batch + Max number of sentences of each batch required_batch_size_multiple - require batch size to be a multiple of N (default: 1). - better throughput in GPU. + Require batch size to be a multiple of N (default: 1). + This will generally have better throughput in GPU. shuffle Whether to shuffle the batches. 
seed @@ -295,7 +300,7 @@ def __init__(self, lengths: Union[Sequence[int], Sequence[Sequence[int]]], self._lengths = np.array(lengths) if self._lengths.ndim == 2: self._lengths = self._lengths.max(axis=1) - self._indices = np.array(range(len(lengths))) + self._indices = np.arange(len(lengths)) self._max_num_tokens = max_num_tokens self._max_num_sentences = max_num_sentences self._batches = [] @@ -313,11 +318,11 @@ def __init__(self, lengths: Union[Sequence[int], Sequence[Sequence[int]]], batch_num_tokens = batch_num_sentences * batch_max_sample_len if (self._max_num_sentences > 0 and batch_num_sentences > self._max_num_sentences) or \ (self._max_num_tokens > 0 and batch_num_tokens > self._max_num_tokens): - # moded_bs = len(batch) % required_batch_size_multiple when len(batch) < required_batch_size_multiple - moded_bs = max( - required_batch_size_multiple * (len(batch) // required_batch_size_multiple), - len(batch) % required_batch_size_multiple - ) + if len(batch) < required_batch_size_multiple: + moded_bs = len(batch) + else: + moded_bs = required_batch_size_multiple\ + * (len(batch) // required_batch_size_multiple) self._batches.append(np.array(batch[:moded_bs])) batch = batch[moded_bs:] batch_max_sample_len = max( diff --git a/src/gluonnlp/models/bart.py b/src/gluonnlp/models/bart.py index 8d935b14fd..2112aeb22e 100644 --- a/src/gluonnlp/models/bart.py +++ b/src/gluonnlp/models/bart.py @@ -51,7 +51,7 @@ @bart_cfg_reg.register() -def bart_base(): +def fairseq_bart_base(): cfg = CN() # Config for the bart base model cfg.MODEL = CN() @@ -104,8 +104,8 @@ def bart_base(): @bart_cfg_reg.register() -def bart_large(): - cfg = bart_base() +def fairseq_bart_large(): + cfg = fairseq_bart_base() cfg.defrost() cfg.MODEL.vocab_size = 50265 cfg.MODEL.ENCODER.units = 1024 @@ -122,14 +122,14 @@ def bart_large(): PRETRAINED_URL = { 'fairseq_bart_base': { - 'cfg': bart_base(), + 'cfg': fairseq_bart_base(), 'merges': 'fairseq_bart_base/gpt2-396d4d8e.merges', 'vocab': 'fairseq_bart_base/gpt2-f4dedacb.vocab', 'params': 'fairseq_bart_base/model-8f4929b5.params', 'lowercase': False, }, 'fairseq_bart_large': { - 'cfg': bart_large(), + 'cfg': fairseq_bart_large(), 'merges': 'fairseq_bart_large/gpt2-396d4d8e.merges', 'vocab': 'fairseq_bart_large/gpt2-f1335494.vocab', 'params': 'fairseq_bart_large/model-862277b1.params', @@ -200,7 +200,6 @@ def forward(self, src_data, src_valid_length, tgt_data, tgt_valid_length): Parameters ---------- - F src_data - layout = 'NT' Shape (batch_size, src_length) @@ -273,10 +272,10 @@ def apply_pooling(self, sequence, valid_length): Shape (batch_size, units) """ if self._layout == 'NT': - batch_indices = F.npx.arange_like(sequence, axis=0).astype(mx.np.int32) + batch_indices = mx.npx.arange_like(sequence, axis=0).astype(mx.np.int32) outputs = sequence[batch_indices, valid_length - 1] elif self._layout == 'TN': - batch_indices = F.npx.arange_like(sequence, axis=1).astype(mx.np.int32) + batch_indices = mx.npx.arange_like(sequence, axis=1).astype(mx.np.int32) outputs = sequence[valid_length - 1, batch_indices] else: raise NotImplementedError @@ -296,7 +295,7 @@ def vocab_size(self): @classmethod def get_cfg(cls, key=None): if key is None: - return bart_base() + return fairseq_bart_base() else: return bart_cfg_reg.create(key) diff --git a/src/gluonnlp/models/gpt2.py b/src/gluonnlp/models/gpt2.py index 94f0a0ca1d..88387e81b8 100644 --- a/src/gluonnlp/models/gpt2.py +++ b/src/gluonnlp/models/gpt2.py @@ -558,7 +558,7 @@ def get_initial_embedding(self, inputs, prev_len): embedding = 
self._embed_dropout(embedding) return embedding - def init_states(self, batch_size, ctx): + def init_states(self, batch_size, ctx, dtype=None): """Initialize the states required for incremental decoding Returns @@ -569,10 +569,12 @@ def init_states(self, batch_size, ctx): - layout = 'TN' Shape (num_layers, 2, 0, batch_size, C_in) """ + if dtype is None: + dtype = self._dtype return mx.np.zeros(shape=(self._num_layers, 2, batch_size, 0, - self._units), ctx=ctx, dtype=self._dtype) if self.layout == 'NT' else \ + self._units), ctx=ctx, dtype=dtype) if self.layout == 'NT' else \ mx.np.zeros(shape=(self._num_layers, 2, 0, batch_size, - self._units), ctx=ctx, dtype=self._dtype) + self._units), ctx=ctx, dtype=dtype) @staticmethod def get_cfg(key=None): diff --git a/src/gluonnlp/models/transformer.py b/src/gluonnlp/models/transformer.py index 1deedd4dd1..75fbf56688 100644 --- a/src/gluonnlp/models/transformer.py +++ b/src/gluonnlp/models/transformer.py @@ -1227,7 +1227,6 @@ def forward(self, src_data, src_valid_length, tgt_data, tgt_valid_length): Parameters ---------- - F src_data - layout = 'NT' Shape (batch_size, src_length) diff --git a/src/gluonnlp/models/transformer_xl.py b/src/gluonnlp/models/transformer_xl.py index 341aa5e812..bf938cf068 100644 --- a/src/gluonnlp/models/transformer_xl.py +++ b/src/gluonnlp/models/transformer_xl.py @@ -347,8 +347,14 @@ def get_cfg(cls, key=None): return config @classmethod - def from_cfg(cls, cfg): - return cls(cfg=cfg) + def from_cfg(cls, cfg, dtype=None): + if dtype is not None: + new_cfg = cfg.clone() + new_cfg.defrost() + new_cfg.MODEL.dtype = dtype + return cls(cfg=new_cfg) + else: + return cls(cfg=cfg) @property def state_batch_axis(self): diff --git a/src/gluonnlp/optimizer.py b/src/gluonnlp/optimizer.py index 8b86f925ec..1629ce78d3 100644 --- a/src/gluonnlp/optimizer.py +++ b/src/gluonnlp/optimizer.py @@ -80,15 +80,14 @@ class AdamW(optimizer.Optimizer): def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, correct_bias=True, use_fused_step=True, **kwargs): super().__init__(use_fused_step=use_fused_step, - learning_rate=learning_rate, - **kwargs) + learning_rate=learning_rate, + **kwargs) self.beta1 = beta1 self.beta2 = beta2 self.epsilon = epsilon self.correct_bias = correct_bias self.aggregate_num = max(1, min(50, int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', '4')))) - assert self.multi_precision is False, 'Currently we do not support multi-precision.' 
def create_state(self, index, weight):
"""state creation function."""
diff --git a/src/gluonnlp/utils/testing.py b/src/gluonnlp/utils/testing.py
index 7eb1f54457..7e45e0da2d 100644
--- a/src/gluonnlp/utils/testing.py
+++ b/src/gluonnlp/utils/testing.py
@@ -1,8 +1,10 @@
__all__ = ['is_match_states_batch_size', 'verify_nmt_model', 'verify_nmt_inference']
import numpy.testing as npt
+import numpy as np
import mxnet as mx
from mxnet.util import use_np
+from .parameter import move_to_ctx
def is_match_states_batch_size(states, states_batch_axis, batch_size) -> bool:
@@ -106,10 +108,15 @@ def verify_nmt_inference(train_model, inference_model,
Parameters
----------
train_model
+ The training model
inference_model
+ The inference model
batch_size
+ Batch size
src_seq_length
+ Length of the source sequence
tgt_seq_length
+ Length of the target sequence
atol
Absolute tolerance
rtol
@@ -161,3 +168,103 @@ def verify_nmt_inference(train_model, inference_model,
partial_out[:, :partial_batch_size].asnumpy(), atol, rtol)
else:
raise NotImplementedError
+
+
+def _match_struct_output(lhs, rhs, atol=1E-2, rtol=1E-2):
+ if isinstance(lhs, (list, tuple)):
+ for lhs_ele, rhs_ele in zip(lhs, rhs):
+ _match_struct_output(lhs_ele, rhs_ele, atol=atol, rtol=rtol)
+ else:
+ npt.assert_allclose(lhs.asnumpy().astype('float32'),
+ rhs.asnumpy().astype('float32'), atol=atol, rtol=rtol)
+
+
+def _cast_nested_to_fp16(nested_dat):
+ """Cast the nested input to fp16
+
+ Parameters
+ ----------
+ nested_dat
+ The input nested data structure
+
+ Returns
+ -------
+ output
+ The cast output data
+ """
+ if isinstance(nested_dat, (mx.np.ndarray, np.ndarray)):
+ if nested_dat.dtype == np.float32:
+ return nested_dat.astype(np.float16)
+ else:
+ return nested_dat
+ elif isinstance(nested_dat, list):
+ return [_cast_nested_to_fp16(ele) for ele in nested_dat]
+ elif isinstance(nested_dat, tuple):
+ return tuple([_cast_nested_to_fp16(ele) for ele in nested_dat])
+ else:
+ raise NotImplementedError('Type is not supported!')
+
+
+def verify_backbone_fp16(model_cls, cfg, ctx, inputs,
+ atol=1E-2, rtol=1E-2, check_amp=True):
+ """Verify that the backbone model gives comparable outputs in float16 and float32,
+ and optionally that an AMP training step runs on the float32 model.
+
+ Parameters
+ ----------
+ model_cls
+ The modeling class
+ cfg
+ The configuration
+ ctx
+ The context
+ inputs
+ The input tensors of the model. They will be fed to both the float32 and the float16 model.
+ atol
+ The absolute tolerance
+ rtol
+ The relative tolerance
+ check_amp
+ Whether to check the AMP process. You will need to ensure that there is no
+ randomness in the model when it is turned on.
+ + """ + model_fp32 = model_cls.from_cfg(cfg, dtype='float32') + model_fp32.initialize(ctx=ctx) + model_fp32.hybridize() + # Check forward + fp32_inputs = move_to_ctx(inputs, ctx=ctx) + outputs_fp32 = model_fp32(*fp32_inputs) + mx.npx.waitall() + # Check forward of fp16 + model_fp16 = model_cls.from_cfg(cfg, dtype='float16') + model_fp16.share_parameters(model_fp32.collect_params()) + model_fp16.cast('float16') + model_fp16.hybridize() + for param in model_fp16.collect_params().values(): + assert param.dtype == 'float16' + fp16_inputs = move_to_ctx(_cast_nested_to_fp16(inputs), ctx=ctx) + outputs_fp16 = model_fp16(*fp16_inputs) + mx.npx.waitall() + _match_struct_output(outputs_fp16, outputs_fp32, atol=atol, rtol=rtol) + if check_amp: + from mxnet import amp + amp.init() + # Reconstruct the fp32 model + model_fp32 = model_cls.from_cfg(cfg, dtype='float32') + model_fp32.initialize(ctx=ctx) + model_fp32.hybridize() + trainer = mx.gluon.Trainer(model_fp32.collect_params(), 'adam', + {'learning_rate': 1E-3, 'wd': 1E-4, + 'multi_precision': True}, + update_on_kvstore=False) + amp.init_trainer(trainer) + with mx.autograd.record(): + outputs_amp = model_fp32(*fp32_inputs) + if not isinstance(outputs_amp, (tuple, list)): + loss = outputs_amp.mean() + else: + loss = sum([ele.mean() for ele in outputs_amp]) + with amp.scale_loss(loss, trainer) as scaled_loss: + mx.autograd.backward(scaled_loss) + trainer.step(1) + mx.npx.waitall() diff --git a/tests/README.md b/tests/README.md index 1e3261d742..233e336e93 100644 --- a/tests/README.md +++ b/tests/README.md @@ -3,31 +3,31 @@ To run the unittests, use the following command ```bash -python3 -m pytest --device="cpu" . +python3 -m pytest --forked --device="cpu" . ``` To test for certain file, e.g., the `test_models_transformer.py`, use the following command ```bash -python3 -m pytest --device="cpu" test_models_transformer.py +python3 -m pytest --forked --device="cpu" test_models_transformer.py ``` To test only for gpu device, use the following command ```bash -python3 -m pytest --device="gpu" test_models_transformer.py +python3 -m pytest --forked --device="gpu" test_models_transformer.py ``` To test both for cpu and gpu device, use the following command ```bash -python3 -m pytest --device="cpu" --device="gpu" test_models_transformer.py +python3 -m pytest --forked --device="cpu" --device="gpu" test_models_transformer.py ``` In addition, to run all the tests, you should add the `--runslow` flag ```bash -python3 -m pytest --device="gpu" --runslow test_models.py +python3 -m pytest --forked --device="gpu" --runslow test_models.py ``` Refer to the [official guide of pytest](https://docs.pytest.org/en/latest/) for more details. diff --git a/tests/test_models.py b/tests/test_models.py index 6ad85c85e4..6c476e8b44 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -68,6 +68,7 @@ def test_get_backbone(name, ctx): @pytest.mark.parametrize('layout', ['NT', 'TN']) @pytest.mark.skipif(not tvm_enabled(), reason='TVM is not supported. 
So this test is skipped.') +@pytest.mark.skip('TVM issue https://github.com/dmlc/gluon-nlp/issues/1425.') def test_tvm_integration(model_name, batch_size, seq_length, layout, ctx): tvm = try_import_tvm() from tvm import relay diff --git a/tests/test_models_bart.py b/tests/test_models_bart.py index 4bf241b80e..62421499e4 100644 --- a/tests/test_models_bart.py +++ b/tests/test_models_bart.py @@ -1,8 +1,11 @@ import pytest import mxnet as mx import tempfile +import numpy as np +import numpy.testing as npt from gluonnlp.models.bart import BartModel, \ list_pretrained_bart, get_pretrained_bart, bart_cfg_reg +from gluonnlp.utils.testing import verify_backbone_fp16 mx.npx.set_np() @@ -35,19 +38,48 @@ def test_bart_cfg_registry(): assert len(bart_cfg_reg.list_keys()) > 0 -@pytest.mark.parametrize('cfg_key', bart_cfg_reg.list_keys()) -def test_bart_cfg(cfg_key): +@pytest.mark.parametrize('cfg_key', ['fairseq_bart_base']) +def test_bart_cfg(cfg_key, ctx): cfg = BartModel.get_cfg(cfg_key) cfg.defrost() cfg.MODEL.vocab_size = 32 cfg.freeze() - model = BartModel.from_cfg(cfg) - model.initialize() - model.hybridize() - cfg.defrost() - cfg.MODEL.layout = 'TN' - cfg.freeze() - model_tn = BartModel.from_cfg(cfg) - model_tn.share_parameters(model.collect_params()) - model_tn.hybridize() - mx.npx.waitall() + + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() + + batch_size = 4 + src_length = 32 + tgt_length = 16 + + with ctx: + src_data = mx.np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, src_length), + dtype=np.int32) + src_valid_length = mx.np.random.randint(src_length // 2, src_length, (batch_size,), + dtype=np.int32) + tgt_data = mx.np.random.randint(0, cfg.MODEL.vocab_size, (batch_size, tgt_length), + dtype=np.int32) + tgt_valid_length = mx.np.random.randint(tgt_length // 2, tgt_length, (batch_size, ), + dtype=np.int32) + model = BartModel.from_cfg(cfg, extract_feature=True) + model.initialize() + model.hybridize() + + contextual_embedding, pooled_output = model(src_data, src_valid_length, + tgt_data, tgt_valid_length) + model_tn = BartModel.from_cfg(cfg_tn, extract_feature=True) + model_tn.share_parameters(model.collect_params()) + model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn = model_tn(src_data.T, src_valid_length, + tgt_data.T, tgt_valid_length) + npt.assert_allclose(contextual_embedding.asnumpy(), + np.transpose(contextual_embedding_tn.asnumpy(), (1, 0, 2)), 5E-3, 5E-3) + npt.assert_allclose(pooled_out_tn.asnumpy(), pooled_output.asnumpy(), 5E-3, 5E-3) + mx.npx.waitall() + + # Verify Float16 + if ctx.device_type == 'gpu': + verify_backbone_fp16(model_cls=BartModel, cfg=cfg, ctx=ctx, + inputs=[src_data, src_valid_length, tgt_data, tgt_valid_length]) diff --git a/tests/test_models_bert.py b/tests/test_models_bert.py index 7abff8468d..26de787c07 100644 --- a/tests/test_models_bert.py +++ b/tests/test_models_bert.py @@ -4,6 +4,7 @@ import tempfile from gluonnlp.models.bert import BertModel, BertForMLM, BertForPretrain,\ list_pretrained_bert, get_pretrained_bert +from gluonnlp.utils.testing import verify_backbone_fp16 mx.npx.set_np() @@ -52,19 +53,6 @@ def test_bert_small_cfg(compute_layout, ctx): 1E-4, 1E-4) assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - # Test BertModel FP16 - device_type = ctx.device_type - if device_type == 'gpu': - bert_model_fp16 = BertModel.from_cfg(cfg, dtype='float16') - bert_model_fp16.share_parameters(bert_model.collect_params()) - bert_model_fp16.cast('float16') - 
bert_model_fp16.hybridize() - contextual_embedding_fp16, pooled_out_fp16 = bert_model_fp16(inputs,\ - token_types, valid_length) - assert_allclose(contextual_embedding_fp16.asnumpy(), - mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), - 1E-2, 1E-2) - # Test for BertForMLM bert_mlm_model = BertForMLM(cfg) bert_mlm_model.initialize() @@ -79,8 +67,8 @@ def test_bert_small_cfg(compute_layout, ctx): assert_allclose(contextual_embedding.asnumpy(), mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-3, 1E-3) + assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-3, 1E-3) # Test for BertForPretrain bert_pretrain_model = BertForPretrain(cfg) @@ -95,10 +83,16 @@ def test_bert_small_cfg(compute_layout, ctx): bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) assert_allclose(contextual_embedding.asnumpy(), mx.np.swapaxes(contextual_embedding_tn, 0, 1).asnumpy(), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-4, 1E-4) + 1E-3, 1E-3) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-3, 1E-3) + assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-3, 1E-3) + assert_allclose(mlm_score.asnumpy(), mlm_score_tn.asnumpy(), 1E-3, 1E-3) + + # Test BertModel FP16 + device_type = ctx.device_type + if device_type == 'gpu': + verify_backbone_fp16(model_cls=BertModel, cfg=cfg, ctx=ctx, + inputs=[inputs, token_types, valid_length]) @pytest.mark.slow diff --git a/tests/test_models_electra.py b/tests/test_models_electra.py index 34ba059473..e3142e4739 100644 --- a/tests/test_models_electra.py +++ b/tests/test_models_electra.py @@ -6,6 +6,7 @@ from gluonnlp.models.electra import ElectraModel, ElectraDiscriminator,\ ElectraGenerator,\ list_pretrained_electra, get_pretrained_electra, get_generator_cfg +from gluonnlp.utils.testing import verify_backbone_fp16 mx.npx.set_np() @@ -52,6 +53,7 @@ def test_electra_model(compute_layout, ctx): electra_model.initialize() electra_model.hybridize() contextual_embedding, pooled_out = electra_model(inputs, token_types, valid_length) + electra_model_tn = ElectraModel.from_cfg(cfg_tn) electra_model_tn.share_parameters(electra_model.collect_params()) electra_model_tn.hybridize() @@ -62,6 +64,12 @@ def test_electra_model(compute_layout, ctx): assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + # Verify Float16 + if ctx.device_type == 'gpu': + verify_backbone_fp16(model_cls=ElectraModel, cfg=cfg, ctx=ctx, + inputs=[inputs, token_types, valid_length]) + + @pytest.mark.slow @pytest.mark.remote_required diff --git a/tests/test_models_gpt2.py b/tests/test_models_gpt2.py index 260ab74886..09536f27bc 100644 --- a/tests/test_models_gpt2.py +++ b/tests/test_models_gpt2.py @@ -6,6 +6,7 @@ from gluonnlp.models.gpt2 import GPT2Model, GPT2ForLM, \ list_pretrained_gpt2, get_pretrained_gpt2 from gluonnlp.loss import LabelSmoothCrossEntropyLoss +from gluonnlp.utils.testing import verify_backbone_fp16 mx.npx.set_np() @@ -43,6 +44,7 @@ def test_gpt2_small_config(compute_layout, ctx): inputs, gpt2_model.init_states(batch_size, ctx) ) + gpt2_model_tn = GPT2Model.from_cfg(cfg_tn) 
gpt2_model_tn.share_parameters(gpt2_model.collect_params()) gpt2_model_tn.hybridize() @@ -73,6 +75,15 @@ def test_gpt2_small_config(compute_layout, ctx): assert_allclose(np.swapaxes(states_tn.asnumpy(), 2, 3), states.asnumpy(), 1E-4, 1E-4) + # Verify Float16 + if ctx.device_type == 'gpu': + verify_backbone_fp16(model_cls=GPT2Model, cfg=cfg, ctx=ctx, + inputs=[inputs, + gpt2_model.init_states(batch_size, ctx)], + check_amp=False) + pytest.skip('GPT-2 test has been turned off. ' + 'Issue: https://github.com/apache/incubator-mxnet/issues/19463') + def test_gpt2_incremental_states(ctx): with ctx: @@ -107,7 +118,8 @@ def test_gpt2_incremental_states(ctx): @pytest.mark.slow @pytest.mark.remote_required -@pytest.mark.parametrize('model_name', ['gpt2_124M', 'gpt2_355M', 'gpt2_774M']) +# Just run forward test with the small model to reduce the time cost. +@pytest.mark.parametrize('model_name', ['gpt2_124M']) def test_gpt2(model_name, ctx): # test from pretrained assert len(list_pretrained_gpt2()) > 0 diff --git a/tests/test_models_mobilebert.py b/tests/test_models_mobilebert.py index d7f22ac533..6cc11228f5 100644 --- a/tests/test_models_mobilebert.py +++ b/tests/test_models_mobilebert.py @@ -5,6 +5,7 @@ import tempfile from gluonnlp.models.mobilebert import MobileBertModel, MobileBertForMLM, MobileBertForPretrain,\ list_pretrained_mobilebert, get_pretrained_mobilebert +from gluonnlp.utils.testing import verify_backbone_fp16 mx.npx.set_np() @@ -13,79 +14,86 @@ def test_list_pretrained_mobilebert(): @pytest.mark.parametrize('compute_layout', ['auto', 'TN', 'NT']) -def test_mobilebert_model_small_cfg(compute_layout): - cfg = MobileBertModel.get_cfg() - cfg.defrost() - cfg.MODEL.vocab_size = 100 - cfg.MODEL.num_layers = 2 - cfg.MODEL.hidden_size = 128 - cfg.MODEL.num_heads = 2 - cfg.MODEL.compute_layout = compute_layout - cfg.freeze() +def test_mobilebert_model_small_cfg(compute_layout, ctx): + with ctx: + cfg = MobileBertModel.get_cfg() + cfg.defrost() + cfg.MODEL.vocab_size = 100 + cfg.MODEL.num_layers = 2 + cfg.MODEL.hidden_size = 128 + cfg.MODEL.num_heads = 2 + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() - # Generate TN layout - cfg_tn = cfg.clone() - cfg_tn.defrost() - cfg_tn.MODEL.layout = 'TN' - cfg_tn.freeze() + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() - batch_size = 4 - sequence_length = 16 - num_mask = 3 - inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) - token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) - valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) - masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + token_types = mx.np.random.randint(0, 2, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) - mobile_bert_model = MobileBertModel.from_cfg(cfg) - mobile_bert_model.initialize() - mobile_bert_model.hybridize() - mobile_bert_model_tn = MobileBertModel.from_cfg(cfg_tn) - mobile_bert_model_tn.share_parameters(mobile_bert_model.collect_params()) - mobile_bert_model_tn.hybridize() - contextual_embedding, pooled_out = mobile_bert_model(inputs, token_types, valid_length) - contextual_embedding_tn, pooled_out_tn = mobile_bert_model_tn(inputs.T, - token_types.T, valid_length) - 
assert_allclose(contextual_embedding.asnumpy(), - np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) + mobile_bert_model = MobileBertModel.from_cfg(cfg) + mobile_bert_model.initialize() + mobile_bert_model.hybridize() + mobile_bert_model_tn = MobileBertModel.from_cfg(cfg_tn) + mobile_bert_model_tn.share_parameters(mobile_bert_model.collect_params()) + mobile_bert_model_tn.hybridize() + contextual_embedding, pooled_out = mobile_bert_model(inputs, token_types, valid_length) + contextual_embedding_tn, pooled_out_tn = mobile_bert_model_tn(inputs.T, + token_types.T, valid_length) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-3, 1E-3) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-3, 1E-3) - # Test for MobileBertForMLM - mobile_bert_mlm_model = MobileBertForMLM(cfg) - mobile_bert_mlm_model.initialize() - mobile_bert_mlm_model.hybridize() - mobile_bert_mlm_model_tn = MobileBertForMLM(cfg_tn) - mobile_bert_mlm_model_tn.share_parameters(mobile_bert_mlm_model.collect_params()) - mobile_bert_model_tn.hybridize() - contextual_embedding, pooled_out, mlm_scores = mobile_bert_mlm_model(inputs, token_types, - valid_length, - masked_positions) - contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\ - mobile_bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions) - assert_allclose(contextual_embedding.asnumpy(), - np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), - 1E-4, 1E-4) - assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) - assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4) + # Test for MobileBertForMLM + mobile_bert_mlm_model = MobileBertForMLM(cfg) + mobile_bert_mlm_model.initialize() + mobile_bert_mlm_model.hybridize() + mobile_bert_mlm_model_tn = MobileBertForMLM(cfg_tn) + mobile_bert_mlm_model_tn.share_parameters(mobile_bert_mlm_model.collect_params()) + mobile_bert_model_tn.hybridize() + contextual_embedding, pooled_out, mlm_scores = mobile_bert_mlm_model(inputs, token_types, + valid_length, + masked_positions) + contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\ + mobile_bert_mlm_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-3, 1E-3) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-3, 1E-3) + assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-3, 1E-3) - # Test for MobileBertForPretrain - mobile_bert_pretrain_model = MobileBertForPretrain(cfg) - mobile_bert_pretrain_model.initialize() - mobile_bert_pretrain_model.hybridize() - mobile_bert_pretrain_model_tn = MobileBertForPretrain(cfg_tn) - mobile_bert_pretrain_model_tn.share_parameters(mobile_bert_pretrain_model.collect_params()) - mobile_bert_pretrain_model_tn.hybridize() - contextual_embedding, pooled_out, nsp_score, mlm_scores =\ - mobile_bert_pretrain_model(inputs, token_types, valid_length, masked_positions) - contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \ - mobile_bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) - assert_allclose(contextual_embedding.asnumpy(), - np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), - 1E-4, 1E-4) - assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-4, 1E-4) - assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 
1E-4, 1E-4) - assert_allclose(mlm_scores.asnumpy(), mlm_scores_tn.asnumpy(), 1E-4, 1E-4) + # Test for MobileBertForPretrain + mobile_bert_pretrain_model = MobileBertForPretrain(cfg) + mobile_bert_pretrain_model.initialize() + mobile_bert_pretrain_model.hybridize() + mobile_bert_pretrain_model_tn = MobileBertForPretrain(cfg_tn) + mobile_bert_pretrain_model_tn.share_parameters(mobile_bert_pretrain_model.collect_params()) + mobile_bert_pretrain_model_tn.hybridize() + contextual_embedding, pooled_out, nsp_score, mlm_scores =\ + mobile_bert_pretrain_model(inputs, token_types, valid_length, masked_positions) + contextual_embedding_tn, pooled_out_tn, nsp_score_tn, mlm_scores_tn = \ + mobile_bert_pretrain_model_tn(inputs.T, token_types.T, valid_length, masked_positions) + assert_allclose(contextual_embedding.asnumpy(), + np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + 1E-3, 1E-3) + assert_allclose(pooled_out.asnumpy(), pooled_out_tn.asnumpy(), 1E-3, 1E-3) + assert_allclose(nsp_score.asnumpy(), nsp_score_tn.asnumpy(), 1E-3, 1E-3) + assert_allclose(mlm_scores.asnumpy(), mlm_scores_tn.asnumpy(), 1E-3, 1E-3) + + # Test for fp16 + if ctx.device_type == 'gpu': + pytest.skip('MobileBERT will have nan values in FP16 mode.') + verify_backbone_fp16(model_cls=MobileBertModel, cfg=cfg, ctx=ctx, + inputs=[inputs, token_types, valid_length]) @pytest.mark.remote_required diff --git a/tests/test_models_roberta.py b/tests/test_models_roberta.py index 2fd9e63131..8953321cc7 100644 --- a/tests/test_models_roberta.py +++ b/tests/test_models_roberta.py @@ -6,6 +6,8 @@ from gluonnlp.models.roberta import RobertaModel, RobertaForMLM, \ list_pretrained_roberta, get_pretrained_roberta from gluonnlp.loss import LabelSmoothCrossEntropyLoss +from gluonnlp.utils.testing import verify_backbone_fp16 + mx.npx.set_np() @@ -15,64 +17,70 @@ def test_list_pretrained_roberta(): @pytest.mark.parametrize('compute_layout', ['auto', 'TN', 'NT']) -def test_robert_small_config(compute_layout): - cfg = RobertaModel.get_cfg() - cfg.defrost() - cfg.MODEL.vocab_size = 1000 - cfg.MODEL.num_layers = 2 - cfg.MODEL.hidden_size = 128 - cfg.MODEL.num_heads = 2 - cfg.MODEL.compute_layout = compute_layout - cfg.freeze() +def test_robert_small_config(compute_layout, ctx): + with ctx: + cfg = RobertaModel.get_cfg() + cfg.defrost() + cfg.MODEL.vocab_size = 1000 + cfg.MODEL.num_layers = 2 + cfg.MODEL.hidden_size = 128 + cfg.MODEL.num_heads = 2 + cfg.MODEL.compute_layout = compute_layout + cfg.freeze() + + # Generate TN layout + cfg_tn = cfg.clone() + cfg_tn.defrost() + cfg_tn.MODEL.layout = 'TN' + cfg_tn.freeze() - # Generate TN layout - cfg_tn = cfg.clone() - cfg_tn.defrost() - cfg_tn.MODEL.layout = 'TN' - cfg_tn.freeze() + batch_size = 4 + sequence_length = 16 + num_mask = 3 + inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) + valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) + masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) - batch_size = 4 - sequence_length = 16 - num_mask = 3 - inputs = mx.np.random.randint(0, 10, (batch_size, sequence_length)) - valid_length = mx.np.random.randint(3, sequence_length, (batch_size,)) - masked_positions = mx.np.random.randint(0, 3, (batch_size, num_mask)) + roberta_model = RobertaModel.from_cfg(cfg) + roberta_model.initialize() + roberta_model.hybridize() + contextual_embeddings, pooled_out = roberta_model(inputs, valid_length) + roberta_model_tn = RobertaModel.from_cfg(cfg_tn) + roberta_model_tn.share_parameters(roberta_model.collect_params()) + 
roberta_model_tn.hybridize() + contextual_embeddings_tn, pooled_out_tn = roberta_model_tn(inputs.T, valid_length) + assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1), + contextual_embeddings.asnumpy(), 1E-3, 1E-3) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-3, 1E-3) - roberta_model = RobertaModel.from_cfg(cfg) - roberta_model.initialize() - roberta_model.hybridize() - contextual_embeddings, pooled_out = roberta_model(inputs, valid_length) - roberta_model_tn = RobertaModel.from_cfg(cfg_tn) - roberta_model_tn.share_parameters(roberta_model.collect_params()) - roberta_model_tn.hybridize() - contextual_embeddings_tn, pooled_out_tn = roberta_model_tn(inputs.T, valid_length) - assert_allclose(np.swapaxes(contextual_embeddings_tn.asnumpy(), 0, 1), - contextual_embeddings.asnumpy(), 1E-4, 1E-4) - assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) + # Test for RobertaForMLM + roberta_mlm_model = RobertaForMLM(cfg) + roberta_mlm_model.initialize() + roberta_mlm_model.hybridize() + contextual_embedding, pooled_out, mlm_scores = roberta_mlm_model(inputs, valid_length, + masked_positions) + roberta_mlm_model_tn = RobertaForMLM(cfg_tn) + roberta_mlm_model_tn.share_parameters(roberta_mlm_model.collect_params()) + roberta_mlm_model_tn.hybridize() + contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\ + roberta_mlm_model_tn(inputs.T, valid_length.T, masked_positions) + assert_allclose(np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), + contextual_embedding.asnumpy(), 1E-3, 1E-3) + assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-3, 1E-3) + assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-3, 1E-3) - # Test for RobertaForMLM - roberta_mlm_model = RobertaForMLM(cfg) - roberta_mlm_model.initialize() - roberta_mlm_model.hybridize() - contextual_embedding, pooled_out, mlm_scores = roberta_mlm_model(inputs, valid_length, - masked_positions) - roberta_mlm_model_tn = RobertaForMLM(cfg_tn) - roberta_mlm_model_tn.share_parameters(roberta_mlm_model.collect_params()) - roberta_mlm_model_tn.hybridize() - contextual_embedding_tn, pooled_out_tn, mlm_scores_tn =\ - roberta_mlm_model_tn(inputs.T, valid_length.T, masked_positions) - assert_allclose(np.swapaxes(contextual_embedding_tn.asnumpy(), 0, 1), - contextual_embedding.asnumpy(), 1E-4, 1E-4) - assert_allclose(pooled_out_tn.asnumpy(), pooled_out.asnumpy(), 1E-4, 1E-4) - assert_allclose(mlm_scores_tn.asnumpy(), mlm_scores.asnumpy(), 1E-4, 1E-4) + # Test for fp16 + if ctx.device_type == 'gpu': + verify_backbone_fp16(model_cls=RobertaModel, cfg=cfg, ctx=ctx, + inputs=[inputs, valid_length]) @pytest.mark.slow @pytest.mark.remote_required -@pytest.mark.parametrize('model_name', list_pretrained_roberta()) +# Just test the fairseq_roberta_base to reduce the time +@pytest.mark.parametrize('model_name', ['fairseq_roberta_base']) def test_roberta(model_name): # test from pretrained - assert len(list_pretrained_roberta()) > 0 with tempfile.TemporaryDirectory() as root: cfg, tokenizer, params_path, mlm_params_path =\ get_pretrained_roberta(model_name, load_backbone=True, load_mlm=True, root=root) @@ -108,7 +116,7 @@ def test_roberta(model_name): ), dtype=np.int32 ) - contextual_embeddings, pooled_out = roberta_model(input_ids, valid_length) + roberta_model(input_ids, valid_length) mx.npx.waitall() # test backward label_smooth_loss = LabelSmoothCrossEntropyLoss(num_labels=vocab_size) diff --git a/tests/test_models_transformer.py b/tests/test_models_transformer.py index 
3588fe2663..6e9502ec72 100644 --- a/tests/test_models_transformer.py +++ b/tests/test_models_transformer.py @@ -1,3 +1,4 @@ +import numpy as np import mxnet as mx import pytest from numpy.testing import assert_allclose @@ -7,6 +8,9 @@ transformer_cfg_reg from gluonnlp.attention_cell import gen_mem_attn_mask, gen_self_attn_mask from gluonnlp.utils.testing import verify_nmt_model, verify_nmt_inference +from gluonnlp.utils.testing import verify_backbone_fp16 + + mx.npx.set_np() @@ -111,8 +115,6 @@ def test_transformer_nmt_model(train_hybridize, inference_hybridize, enc_num_layers, dec_num_layers, enc_recurrent, dec_recurrent, tie_weights, layout): - if inference_hybridize: - pytest.skip('inference model hybridization is not working') src_seq_length = 20 tgt_seq_length = 15 src_vocab_size = 32 @@ -172,3 +174,68 @@ def test_transformer_cfg(cfg_key): model_tn.share_parameters(model.collect_params()) model_tn.hybridize() mx.npx.waitall() + + +@pytest.mark.parametrize('enc_pre_norm,dec_pre_norm', + [(False, False), (True, True)]) +@pytest.mark.parametrize('enc_num_layers,dec_num_layers,enc_units,dec_units', + [(2, 2, 24, 24), + (2, 3, 16, 16)]) +@pytest.mark.parametrize('enc_recurrent', [False, True]) +@pytest.mark.parametrize('dec_recurrent', [False, True]) +@pytest.mark.parametrize('tie_weights,layout', [(False, 'NT'), (True, 'NT'), (True, 'TN')]) +def test_transformer_fp16_amp(enc_pre_norm, dec_pre_norm, + enc_units, dec_units, + enc_num_layers, dec_num_layers, + enc_recurrent, dec_recurrent, tie_weights, + layout, ctx): + if ctx.device_type != 'gpu': + pytest.skip('Only test amp when running on GPU.') + # Generate configuration for testing + cfg = TransformerModel.get_cfg() + cfg.defrost() + cfg.MODEL.src_vocab_size = 32 + cfg.MODEL.tgt_vocab_size = 32 + cfg.MODEL.max_src_length = 20 + cfg.MODEL.max_tgt_length = 15 + cfg.MODEL.tie_weights = tie_weights + cfg.MODEL.layout = layout + + # Encoder config + cfg.MODEL.ENCODER.pre_norm = enc_pre_norm + cfg.MODEL.ENCODER.units = enc_units + cfg.MODEL.ENCODER.num_layers = enc_num_layers + cfg.MODEL.ENCODER.recurrent = enc_recurrent + + # Decoder config + cfg.MODEL.DECODER.pre_norm = dec_pre_norm + cfg.MODEL.DECODER.units = dec_units + cfg.MODEL.DECODER.num_layers = dec_num_layers + cfg.MODEL.DECODER.recurrent = dec_recurrent + cfg.freeze() + + batch_size = 4 + seq_length = 16 + with ctx: + if layout == 'NT': + src_data = mx.np.random.randint(0, cfg.MODEL.src_vocab_size, + (batch_size, seq_length), dtype=np.int32) + src_valid_length = mx.np.random.randint(seq_length // 2, seq_length, + (batch_size,), dtype=np.int32) + tgt_data = mx.np.random.randint(0, cfg.MODEL.tgt_vocab_size, + (batch_size, seq_length), dtype=np.int32) + tgt_valid_length = mx.np.random.randint(seq_length // 2, seq_length, + (batch_size,), dtype=np.int32) + elif layout == 'TN': + src_data = mx.np.random.randint(0, cfg.MODEL.src_vocab_size, + (seq_length, batch_size), dtype=np.int32) + src_valid_length = mx.np.random.randint(seq_length // 2, seq_length, + (batch_size,), dtype=np.int32) + tgt_data = mx.np.random.randint(0, cfg.MODEL.tgt_vocab_size, + (seq_length, batch_size), dtype=np.int32) + tgt_valid_length = mx.np.random.randint(seq_length // 2, seq_length, + (batch_size,), dtype=np.int32) + else: + raise NotImplementedError + verify_backbone_fp16(TransformerModel, cfg, ctx, + inputs=[src_data, src_valid_length, tgt_data, tgt_valid_length]) diff --git a/tests/test_models_xlmr.py b/tests/test_models_xlmr.py index ec19af95ff..b2d3c4b8d9 100644 --- a/tests/test_models_xlmr.py +++ 
b/tests/test_models_xlmr.py @@ -13,6 +13,7 @@ def test_list_pretrained_xlmr(): assert len(list_pretrained_xlmr()) > 0 +# We choose to not test amp for XLMR because it's the same as RoBERTa. @pytest.mark.slow @pytest.mark.remote_required @pytest.mark.parametrize('model_name', list_pretrained_xlmr()) diff --git a/tests/test_optimizer.py b/tests/test_optimizer.py index 48c2331a7a..d3dfd5ccd3 100644 --- a/tests/test_optimizer.py +++ b/tests/test_optimizer.py @@ -1,4 +1,5 @@ import itertools +import pytest import numpy as np from gluonnlp.optimizer import AdamW import mxnet as mx @@ -6,7 +7,8 @@ mx.npx.reset_np() -def test_adam(ctx): +@pytest.mark.parametrize('dtype', [np.float16, np.float32]) +def test_adam(dtype, ctx): with ctx: opt1 = AdamW opt2 = AdamW @@ -16,18 +18,17 @@ def test_adam(ctx): cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{'multi_precision': False}] # TODO(sxjscience) Test for FP16 + mp_options = [{'multi_precision': False}, {'multi_precision': True}] agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, {'aggregate_num': 4}, {'aggregate_num': np.inf}] correct_bias_options = [{'correct_bias': True}, {'correct_bias': False}] - for dtype in [np.float16, np.float32]: - for params in itertools.product(beta1_options, beta2_options, cg_options, - rg_options, wd_options, mp_options, - agg_options, correct_bias_options): - kwarg = {k: v for param in params for k, v in param.items()} - if (dtype == np.float16 and ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - compare_optimizer(opt1(use_fused_step=False, **kwarg), - opt2(use_fused_step=True, **kwarg), shapes, dtype, - rtol=1e-4, atol=2e-5) + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, + agg_options, correct_bias_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=1e-3, atol=2e-3) diff --git a/tools/batch/README.md b/tools/batch/README.md index e95d2e4c6f..1bffcdf2bd 100644 --- a/tools/batch/README.md +++ b/tools/batch/README.md @@ -24,19 +24,33 @@ several pre-trained models could be converted through the corresponding conversi bash run_batch_conversion ${MODEL_TYPE} ``` -## Fine-tuning Downstream Tasks +## SQuAD Training -### Question Answering -We can quickly run the squad finetuning via [squad fine-tuning scripts](../../scripts/question_answering#squad) and the AWS Batch job. 
- -The code is given in [run_batch_squad.sh](run_batch_squad.sh) +The code is given in [question_answering/run_batch_squad.sh](question_answering/run_batch_squad.sh) ```bash # AWS Batch training without horovod on SQuAD 2.0 -bash run_batch_squad.sh +bash question_answering/run_batch_squad.sh 0 2.0 submit_squad_v2_fp32.log float32 # AWS Batch training with horovod on SQuAD 2.0 -bash run_batch_squad.sh 1 2.0 submit_squad_v2_horovod.log +bash question_answering/run_batch_squad.sh 1 2.0 submit_squad_v2_horovod_fp32.log float32 + +# AWS Batch training with horovod on SQuAD 1.1 +bash question_answering/run_batch_squad.sh 1 1.1 submit_squad_v1_horovod_fp32.log float32 +``` + +```bash +# AWS Batch training with horovod on SQuAD 2.0 + FP16 +bash question_answering/run_batch_squad.sh 1 2.0 submit_squad_v2_horovod_fp16.log float16 + +# AWS Batch training with horovod on SQuAD 1.1 + FP16 +bash question_answering/run_batch_squad.sh 1 1.1 submit_squad_v1_horovod_fp16.log float16 +``` + +Also, after you have submitted the jobs, you may sync the results via +```bash +bash question_answering/sync_batch_result.sh submit_squad_v2.log squad_v2_no_horovod +bash question_answering/sync_batch_result.sh submit_squad_v2_horovod.log squad_v2_horovod ``` Internally, it will train the following models on SQuAD 2.0 dataset: @@ -52,4 +66,5 @@ Internally, it will train the following models on SQuAD 2.0 dataset: | electra_base | | electra_large | | roberta_large | +| gluon_en_cased_bert_base_v1 | | mobilebert | diff --git a/tools/batch/run_batch_squad.sh b/tools/batch/question_answering/run_batch_squad.sh similarity index 68% rename from tools/batch/run_batch_squad.sh rename to tools/batch/question_answering/run_batch_squad.sh index 8349716c29..0682fe8cce 100644 --- a/tools/batch/run_batch_squad.sh +++ b/tools/batch/question_answering/run_batch_squad.sh @@ -1,8 +1,13 @@ +#!/bin/bash + set -ex USE_HOROVOD=${1:-0} VERSION=${2:-2.0} LOG_PATH=${3:-submit_squad_v2.log} +DTYPE=${4:-float32} +SUBMIT_SCRIPT_PATH=$(dirname "$0")/../../../tools/batch/submit-job.py + for MODEL_NAME in albert_base \ albert_large \ @@ -14,9 +19,10 @@ for MODEL_NAME in albert_base \ roberta_large \ uncased_bert_base \ uncased_bert_large \ + gluon_en_cased_bert_base_v1 \ mobilebert do - python3 submit-job.py \ + python3 ${SUBMIT_SCRIPT_PATH} \ --region us-east-1 \ --source-ref master \ --job-type g4dn.12x \ @@ -24,5 +30,7 @@ do --name test_squad2_${MODEL_NAME} \ --work-dir scripts/question_answering \ --remote https://github.com/dmlc/gluon-nlp/ \ - --command "bash commands/run_squad2_${MODEL_NAME}.sh ${USE_HOROVOD} ${VERSION} | tee stdout.log" >> ${LOG_PATH} + --command "bash commands/run_squad2_${MODEL_NAME}.sh ${USE_HOROVOD} ${VERSION} ${DTYPE} | tee stdout.log" \ + | perl -pe 's/Submitted job \[([0-9|a-z|_].+)\] to the job queue .+/$1/' \ + | sed -e 's/ - / /g' >> ${LOG_PATH} done diff --git a/tools/batch/question_answering/sync_batch_result.sh b/tools/batch/question_answering/sync_batch_result.sh new file mode 100644 index 0000000000..fe350bd340 --- /dev/null +++ b/tools/batch/question_answering/sync_batch_result.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +set -ex + +LOG_PATH=$1 +SAVE_DIR_NAME=${2:-squad_2.0} + +while read -r job_name job_id; do + aws s3 sync s3://gluon-nlp-dev/batch/${job_id}/temp ${SAVE_DIR_NAME}/${job_name} +done < ${LOG_PATH} diff --git a/tools/batch/wait-job.py b/tools/batch/wait-job.py index 87d8679255..ea3319ae54 100644 --- a/tools/batch/wait-job.py +++ b/tools/batch/wait-job.py @@ -10,12 +10,14 @@ parser.add_argument('--profile', 
help='profile name of aws account.', type=str,
                    default=None)
+parser.add_argument('--region', help='AWS region to use when creating the Boto3 session.', type=str,
+                    default=None)
 parser.add_argument('--job-id', help='job id to check status and wait.', type=str,
                     default=None)
 args = parser.parse_args()
 
-session = boto3.Session(profile_name=args.profile)
+session = boto3.Session(profile_name=args.profile, region_name=args.region)
 batch, cloudwatch = [session.client(service_name=sn) for sn in ['batch', 'logs']]
 
 def printLogs(logGroupName, logStreamName, startTime):
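
For reference, the updated `tools/batch/wait-job.py` can now be pointed at a specific region. A minimal invocation might look like the sketch below; the job id is a placeholder, and `us-east-1` simply mirrors the region already used by `run_batch_squad.sh`:

```bash
# Wait for a previously submitted AWS Batch job to finish,
# connecting to the Batch queue and CloudWatch logs in us-east-1.
python3 tools/batch/wait-job.py --region us-east-1 --job-id <your-job-id>
```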