Add AMP + Update Benchmarking Script (#1405)

* Update transformer_xl.py Update testing.py Update testing.py Update testing.py Update testing.py Update testing.py Update testing.py Update testing.py Update testing.py Update testing.py Update testing.py Update testing.py Update testing.py Update testing.py Update testing.py Update testing.py Update attention_cell.py Update testing.py Update testing.py Update testing.py Update test_models_bert.py Update run_batch_squad.sh Update generate_commands.py Update run_batch_squad.sh Update run_batch_squad.sh Update run_batch_squad.sh Add region Update generate_commands.py Update run_squad.template Try to use clip 1.0 update Update README.md Update attention_cell.py Update benchmark_gluonnlp.py Update attention_cell.py Update testing.py Update run_squad.py Update attention_cell.py Update attention_cell.py Update attention_cell.py update Update attention_cell.py update Update numbers + log + weight update update Update testing.py * Update run_squad.py * Update test_models_mobilebert.py * Update README.md * Update test_models_bert.py * Update testing.py * Update test_models_mobilebert.py * Update test_models_roberta.py * Update gpt2.py * Update testing.py * Update bart.py * Update testing.py * Update testing.py * Update README.md * Update README.md * Update testing.py * fix * update * Update test_models_roberta.py * Update test_models_bart.py * Update test_models_bart.py * Update test_models_bart.py * Update testing.py * only include bart-base * Update bart.py * Update bart.py * update * Update test_models_transformer.py * Update test_models_transformer.py * Update test_models_transformer.py * Update test_models_transformer.py * Update run_squad.py * Update attention_cell.py * Update README.md * Update test_models.py * Update run_squad.py * Update run_squad.py * update * Update run_squad.template * update * Update generate_commands.py * Update optimizer.py * update * Update run_squad.py * update * Update run_batch_squad.sh * update * Update testing.py * Update 
test_optimizer.py * Update benchmark_utils.py * update * fix bug in inference * Update benchmark_gluonnlp.py * Update run_batch_squad.sh * Update benchmark_utils.py * Update run_squad.py * Update run_squad.py * Update run_squad.py * Update run_squad.py * update
dmlc · Nov 6, 2020 · dd45270
1 parent 1726dd2
commit dd45270
Show file tree

Hide file tree

Showing 45 changed files with 723 additions and 299 deletions.
diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@ process the text data, and train models.
 
 # Features
 
-- Easy-to-use Text Processing Tools and APIs
+- Easy-to-use Text Processing Tools and Modular APIs
 - Pretrained Model Zoo
 - Write Models with Numpy-like API
 - Fast Inference via [Apache TVM (incubating)](https://tvm.apache.org/) (Experimental)
@@ -28,16 +28,16 @@ First of all, install the latest MXNet. You may use the following commands:
 
 ```bash
 # Install the version with CUDA 10.0
-python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200926" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20201101" -f https://dist.mxnet.io/python
 
 # Install the version with CUDA 10.1
-python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200926" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20201101" -f https://dist.mxnet.io/python
 
 # Install the version with CUDA 10.2
-python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20201101" -f https://dist.mxnet.io/python
 
 # Install the cpu-only version
-python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet>=2.0.0b20201101" -f https://dist.mxnet.io/python
 ```
 
 

diff --git a/scripts/benchmarks/benchmark_gluonnlp.py b/scripts/benchmarks/benchmark_gluonnlp.py
@@ -58,20 +58,22 @@ def get_parser():
                         help='Whether to use TVM for inference/training')
     parser.add_argument('--instance_type', choices=['c4', 'c5', 'g4', 'p3'], default='g4',
                         help='The instance type that the profiling script will be run on.')
+    parser.add_argument('--use_fp16', action='store_true')
     parser.add_argument('--mode', type=str, default='train',
                         choices=['train', 'inference'])
     return parser
 
 
 def run_benchmark(workload, model_name, out_file_name, is_train,
-                  use_tvm, instance_type):
+                  use_tvm, instance_type, use_fp16):
     if is_train:
         benchmark = GluonNLPBackboneBenchmark(
             workloads=workload,
             model_names=model_name,
             profile_inference=False,
             profile_train=True,
             to_csv=True,
+            use_fp16=use_fp16,
             train_out_csv_file=out_file_name)
         benchmark.run()
     else:
@@ -83,6 +85,7 @@ def run_benchmark(workload, model_name, out_file_name, is_train,
             use_tvm=use_tvm,
             instance_type=instance_type,
             to_csv=True,
+            use_fp16=use_fp16,
             inference_out_csv_file=out_file_name)
         benchmark.run()
     return
@@ -94,13 +97,15 @@ def run_benchmark(workload, model_name, out_file_name, is_train,
     args = parser.parse_args()
     if args.compute_layout is None:
         args.compute_layout = args.layout
+    dtype = 'float32' if not args.use_fp16 else 'float16'
     for layout, compute_layout in [(args.layout, args.compute_layout)]:
         if compute_layout != layout:
             profile_models = [ele for ele in MODELS if 'bart' not in ele]
         else:
             profile_models = [ele for ele in MODELS]
         if args.mode == 'inference':
-            out_dir = 'infer_fp32_{}_{}_tvm{}'.format(layout, compute_layout, int(args.use_tvm))
+            out_dir = 'infer_{}_{}_{}_tvm{}'.format(dtype, layout, compute_layout,
+                                                    int(args.use_tvm))
             df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
                                        'latency', 'memory'])
             os.makedirs(out_dir, exist_ok=True)
@@ -111,16 +116,17 @@ def run_benchmark(workload, model_name, out_file_name, is_train,
                     process = Process(
                         target=run_benchmark,
                         args=(workload, model_name, out_path, False,
-                              args.use_tvm, args.instance_type))
+                              args.use_tvm, args.instance_type, args.use_fp16))
                     process.start()
                     process.join()
                     new_df = pd.read_csv(out_path)
                     df = df.append(new_df, ignore_index=True)
-                    df.to_csv('gluonnlp_infer_fp32_{}_{}_tvm{}.csv'.format(layout,
+                    df.to_csv('gluonnlp_infer_{}_{}_{}_tvm{}.csv'.format(dtype,
+                                                                         layout,
                                                                            compute_layout,
                                                                            int(args.use_tvm)))
         elif args.mode == 'train':
-            out_dir = 'train_fp32_{}_{}'.format(layout, compute_layout)
+            out_dir = 'train_{}_{}_{}'.format(dtype, layout, compute_layout)
             df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
                                        'latency', 'memory'])
             os.makedirs(out_dir, exist_ok=True)
@@ -130,11 +136,12 @@ def run_benchmark(workload, model_name, out_file_name, is_train,
                                                                            workload[1]))
                     process = Process(
                         target=run_benchmark,
-                        args=(workload, model_name, out_path, True))
+                        args=(workload, model_name, out_path, True, False,
+                              args.instance_type, args.use_fp16))
                     process.start()
                     process.join()
                     new_df = pd.read_csv(out_path)
                     df = df.append(new_df, ignore_index=True)
-                    df.to_csv('gluonnlp_train_fp32_{}_{}.csv'.format(layout, compute_layout))
+                    df.to_csv('gluonnlp_train_{}_{}_{}.csv'.format(dtype, layout, compute_layout))
         else:
             raise NotImplementedError
diff --git a/scripts/benchmarks/benchmark_gluonnlp_fp16.sh b/scripts/benchmarks/benchmark_gluonnlp_fp16.sh
@@ -0,0 +1,14 @@
+for mode in train inference
+do
+  python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode --use_fp16
+done
+
+for mode in train inference
+do
+  python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode --use_fp16
+done
+
+for mode in train inference
+do
+  python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode --use_fp16
+done
diff --git a/scripts/benchmarks/benchmark_utils.py b/scripts/benchmarks/benchmark_utils.py
@@ -748,7 +748,6 @@ def __init__(self, workloads, model_names, use_fp16=False,
         self._inference_out_csv_file = inference_out_csv_file
         self._train_out_csv_file = train_out_csv_file
         self._env_info_file = env_info_file
-        assert use_fp16 is False, 'Currently fp16 benchmark has not been supported yet.'
 
     @property
     def model_names(self):
@@ -760,22 +759,26 @@ def workloads(self):
 
     def _inference_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\
             -> Tuple[float, Memory]:
+        if self._use_fp16:
+            dtype = 'float16'
+        else:
+            dtype = 'float32'
         if self._use_gpu:
             ctx = mxnet.gpu()
         else:
             ctx = mxnet.cpu()
         model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name)
-        # TODO Support fp16 profiling
         cfg.defrost()
         cfg.MODEL.layout = self._layout
         if model_cls.__name__ not in ['BartModel']:
             cfg.MODEL.compute_layout = self._compute_layout
         cfg.freeze()
         if model_cls.__name__ in ['BartModel']:
-            model = model_cls.from_cfg(cfg, extract_feature=True)
+            model = model_cls.from_cfg(cfg, extract_feature=True, dtype=dtype)
         else:
-            model = model_cls.from_cfg(cfg)
-        model.load_parameters(backbone_param_path, ctx=ctx)
+            model = model_cls.from_cfg(cfg, dtype=dtype)
+        model.load_parameters(backbone_param_path, ctx=ctx, cast_dtype=True)
+        model.cast(dtype)
         model.hybridize()
         vocab_size = cfg.MODEL.vocab_size
         if self._layout == 'NT':
@@ -860,12 +863,15 @@ def run_tvm_forward():
 
     def _train_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\
             -> Tuple[float, Memory]:
+        if self._use_fp16:
+            from mxnet import amp
+            amp.init()
+
         if self._use_gpu:
             ctx = mxnet.gpu()
         else:
             ctx = mxnet.cpu()
         model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name)
-        # TODO Support fp16 profiling
         cfg.defrost()
         cfg.MODEL.layout = self._layout
         if model_cls.__name__ not in ['BartModel']:

diff --git a/scripts/machine_translation/README.md b/scripts/machine_translation/README.md
@@ -30,6 +30,7 @@ python3 train_transformer.py \
     --save_dir transformer_base_wmt2014_en_de_${SUBWORD_ALGO} \
     --cfg transformer_base \
     --lr 0.002 \
+    --num_accumulated 32 \
     --sampler BoundedBudgetSampler \
     --max_num_tokens 2700 \
     --epochs 30 \

diff --git a/scripts/machine_translation/train_transformer.py b/scripts/machine_translation/train_transformer.py
@@ -441,8 +441,10 @@ def train(args):
             for sample_data, ctx in zip(sample_data_l, ctx_l):
                 if sample_data is None:
                     continue
-                src_token_ids, tgt_token_ids, src_valid_length, tgt_valid_length, sample_ids = sample_data
-                src_wc, tgt_wc, bs = src_valid_length.sum(), tgt_valid_length.sum(), src_token_ids.shape[0]
+                src_token_ids, tgt_token_ids, src_valid_length,\
+                tgt_valid_length, sample_ids = sample_data
+                src_wc, tgt_wc, bs = src_valid_length.sum(),\
+                                     tgt_valid_length.sum(), src_token_ids.shape[0]
                 loss_denom += tgt_wc - bs
                 log_loss_denom += tgt_wc - bs
                 log_wc += src_wc + tgt_wc