This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Add AMP + Update Benchmarking Script #1405

Merged Nov 6, 2020 · 64 commits
Commits
22992c3
Update transformer_xl.py
sxjscience Oct 21, 2020
790a6c8
Update run_squad.py
sxjscience Oct 29, 2020
4e8c2bb
Merge remote-tracking branch 'upstream/master' into amp
sxjscience Oct 29, 2020
051e264
Update test_models_mobilebert.py
sxjscience Oct 29, 2020
8713da5
Update README.md
sxjscience Oct 29, 2020
20993da
Update test_models_bert.py
sxjscience Oct 29, 2020
5538c4f
Update testing.py
sxjscience Oct 29, 2020
30261e0
Merge remote-tracking branch 'upstream/master' into amp
sxjscience Nov 1, 2020
3f7aec1
Update test_models_mobilebert.py
sxjscience Nov 1, 2020
6e219fa
Update test_models_roberta.py
sxjscience Nov 1, 2020
d3fa48e
Update gpt2.py
sxjscience Nov 1, 2020
61a636c
Update testing.py
sxjscience Nov 1, 2020
71b0d07
Update bart.py
sxjscience Nov 1, 2020
871a7dc
Update testing.py
sxjscience Nov 1, 2020
1d56a9f
Update testing.py
sxjscience Nov 2, 2020
e55080c
Update README.md
sxjscience Nov 2, 2020
d3fd1f5
Update README.md
sxjscience Nov 2, 2020
b0bcbe1
Update testing.py
sxjscience Nov 2, 2020
a157c1e
fix
sxjscience Nov 2, 2020
c6e79b6
update
sxjscience Nov 2, 2020
767b12c
Update test_models_roberta.py
sxjscience Nov 2, 2020
d0a095d
Update test_models_bart.py
sxjscience Nov 2, 2020
1c08c35
Update test_models_bart.py
sxjscience Nov 2, 2020
889af13
Update test_models_bart.py
sxjscience Nov 2, 2020
b06b445
Update testing.py
sxjscience Nov 2, 2020
a1924a9
only include bart-base
sxjscience Nov 2, 2020
13cc93b
Update bart.py
sxjscience Nov 2, 2020
6cc2db8
Update bart.py
sxjscience Nov 2, 2020
d249a2e
update
sxjscience Nov 2, 2020
d3d5e30
Update test_models_transformer.py
sxjscience Nov 2, 2020
8777377
Update test_models_transformer.py
sxjscience Nov 2, 2020
77ed30a
Update test_models_transformer.py
sxjscience Nov 2, 2020
b3c4f4d
Update test_models_transformer.py
sxjscience Nov 2, 2020
ccd92f2
Update run_squad.py
sxjscience Nov 4, 2020
e9e9f7b
Update attention_cell.py
sxjscience Nov 4, 2020
ff55364
Update README.md
sxjscience Nov 4, 2020
def32d5
Update test_models.py
sxjscience Nov 4, 2020
adf2aa7
Update run_squad.py
sxjscience Nov 4, 2020
5dc8ff7
Update run_squad.py
sxjscience Nov 4, 2020
450e425
update
sxjscience Nov 4, 2020
60440f1
Update run_squad.template
sxjscience Nov 4, 2020
45e56c3
update
sxjscience Nov 5, 2020
9cd11b0
Update generate_commands.py
sxjscience Nov 5, 2020
e4fa5a8
Update optimizer.py
sxjscience Nov 5, 2020
6a81452
update
sxjscience Nov 5, 2020
027b5dd
Update run_squad.py
sxjscience Nov 5, 2020
ffee5ac
update
sxjscience Nov 5, 2020
c4856bc
Update run_batch_squad.sh
sxjscience Nov 5, 2020
847f4c7
update
sxjscience Nov 5, 2020
8ccc487
Update testing.py
sxjscience Nov 5, 2020
4077929
Update test_optimizer.py
sxjscience Nov 5, 2020
8f5d5b7
Update benchmark_utils.py
sxjscience Nov 5, 2020
d95b8c8
update
sxjscience Nov 5, 2020
9bba04d
fix bug in inference
sxjscience Nov 5, 2020
474bc57
Update benchmark_gluonnlp.py
sxjscience Nov 5, 2020
c14a340
Update run_batch_squad.sh
sxjscience Nov 5, 2020
184ae0f
Merge remote-tracking branch 'upstream/master' into amp
sxjscience Nov 5, 2020
4e47f42
Update benchmark_utils.py
sxjscience Nov 5, 2020
4d6151f
Update run_squad.py
sxjscience Nov 5, 2020
f5bcb56
Update run_squad.py
sxjscience Nov 5, 2020
addff4a
Update run_squad.py
sxjscience Nov 5, 2020
e797a62
Merge remote-tracking branch 'upstream/master' into amp
sxjscience Nov 5, 2020
236f35e
Update run_squad.py
sxjscience Nov 5, 2020
6b2e1ea
update
sxjscience Nov 6, 2020
10 changes: 5 additions & 5 deletions README.md
@@ -16,7 +16,7 @@ process the text data, and train models.
 
 # Features
 
-- Easy-to-use Text Processing Tools and APIs
+- Easy-to-use Text Processing Tools and Modular APIs
 - Pretrained Model Zoo
 - Write Models with Numpy-like API
 - Fast Inference via [Apache TVM (incubating)](https://tvm.apache.org/) (Experimental)
@@ -28,16 +28,16 @@ First of all, install the latest MXNet. You may use the following commands:
 
 ```bash
 # Install the version with CUDA 10.0
-python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200926" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20201101" -f https://dist.mxnet.io/python
 
 # Install the version with CUDA 10.1
-python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200926" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20201101" -f https://dist.mxnet.io/python
 
 # Install the version with CUDA 10.2
-python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20201101" -f https://dist.mxnet.io/python
 
 # Install the cpu-only version
-python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet>=2.0.0b20201101" -f https://dist.mxnet.io/python
 ```
 
 
21 changes: 14 additions & 7 deletions scripts/benchmarks/benchmark_gluonnlp.py
@@ -58,20 +58,22 @@ def get_parser():
                         help='Whether to use TVM for inference/training')
     parser.add_argument('--instance_type', choices=['c4', 'c5', 'g4', 'p3'], default='g4',
                         help='The instance type that the profiling script will be run on.')
+    parser.add_argument('--use_fp16', action='store_true')
     parser.add_argument('--mode', type=str, default='train',
                         choices=['train', 'inference'])
     return parser
 
 
 def run_benchmark(workload, model_name, out_file_name, is_train,
-                  use_tvm, instance_type):
+                  use_tvm, instance_type, use_fp16):
     if is_train:
         benchmark = GluonNLPBackboneBenchmark(
             workloads=workload,
             model_names=model_name,
             profile_inference=False,
             profile_train=True,
             to_csv=True,
+            use_fp16=use_fp16,
             train_out_csv_file=out_file_name)
         benchmark.run()
     else:
@@ -83,6 +85,7 @@ def run_benchmark(workload, model_name, out_file_name, is_train,
             use_tvm=use_tvm,
             instance_type=instance_type,
             to_csv=True,
+            use_fp16=use_fp16,
             inference_out_csv_file=out_file_name)
         benchmark.run()
     return
@@ -94,13 +97,15 @@ def run_benchmark(workload, model_name, out_file_name, is_train,
     args = parser.parse_args()
     if args.compute_layout is None:
         args.compute_layout = args.layout
+    dtype = 'float32' if not args.use_fp16 else 'float16'
     for layout, compute_layout in [(args.layout, args.compute_layout)]:
         if compute_layout != layout:
             profile_models = [ele for ele in MODELS if 'bart' not in ele]
         else:
             profile_models = [ele for ele in MODELS]
         if args.mode == 'inference':
-            out_dir = 'infer_fp32_{}_{}_tvm{}'.format(layout, compute_layout, int(args.use_tvm))
+            out_dir = 'infer_{}_{}_{}_tvm{}'.format(dtype, layout, compute_layout,
+                                                    int(args.use_tvm))
             df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
                                        'latency', 'memory'])
             os.makedirs(out_dir, exist_ok=True)
@@ -111,16 +116,17 @@ def run_benchmark(workload, model_name, out_file_name, is_train,
                     process = Process(
                         target=run_benchmark,
                         args=(workload, model_name, out_path, False,
-                              args.use_tvm, args.instance_type))
+                              args.use_tvm, args.instance_type, args.use_fp16))
                     process.start()
                     process.join()
                     new_df = pd.read_csv(out_path)
                     df = df.append(new_df, ignore_index=True)
-            df.to_csv('gluonnlp_infer_fp32_{}_{}_tvm{}.csv'.format(layout,
+            df.to_csv('gluonnlp_infer_{}_{}_{}_tvm{}.csv'.format(dtype,
+                                                                 layout,
                                                                  compute_layout,
                                                                  int(args.use_tvm)))
         elif args.mode == 'train':
-            out_dir = 'train_fp32_{}_{}'.format(layout, compute_layout)
+            out_dir = 'train_{}_{}_{}'.format(dtype, layout, compute_layout)
             df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
                                        'latency', 'memory'])
             os.makedirs(out_dir, exist_ok=True)
@@ -130,11 +136,12 @@ def run_benchmark(workload, model_name, out_file_name, is_train,
                                                              workload[1]))
                     process = Process(
                         target=run_benchmark,
-                        args=(workload, model_name, out_path, True))
+                        args=(workload, model_name, out_path, True, False,
+                              args.instance_type, args.use_fp16))
                     process.start()
                     process.join()
                     new_df = pd.read_csv(out_path)
                     df = df.append(new_df, ignore_index=True)
-            df.to_csv('gluonnlp_train_fp32_{}_{}.csv'.format(layout, compute_layout))
+            df.to_csv('gluonnlp_train_{}_{}_{}.csv'.format(dtype, layout, compute_layout))
         else:
             raise NotImplementedError
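
Note that the driver runs every workload in a fresh `multiprocessing.Process` and joins it before starting the next one, presumably so that device memory is fully released between measurements and a failure in one configuration cannot take down the rest of the sweep. A minimal sketch of that isolation pattern, with a hypothetical `measure` function standing in for `run_benchmark` (file names and workloads are illustrative only):

```python
import pandas as pd
from multiprocessing import Process


def measure(workload, model_name, out_path):
    # Hypothetical stand-in for run_benchmark: profile one
    # (batch_size, sequence_length) workload and write a one-row CSV.
    batch_size, sequence_length = workload
    pd.DataFrame([{'model': model_name, 'batch_size': batch_size,
                   'sequence_length': sequence_length,
                   'latency': 0.0, 'memory': 0}]).to_csv(out_path, index=False)


if __name__ == '__main__':
    df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
                               'latency', 'memory'])
    for workload in [(4, 128), (8, 512)]:
        out_path = 'bert_{}_{}.csv'.format(*workload)
        # One process per run: it exits (releasing GPU memory) before the
        # next measurement starts, and a crash stays contained.
        process = Process(target=measure, args=(workload, 'bert', out_path))
        process.start()
        process.join()
        # Same pandas idiom as the script above (df.append was current in 2020).
        df = df.append(pd.read_csv(out_path), ignore_index=True)
    df.to_csv('results.csv')
```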
14 changes: 14 additions & 0 deletions scripts/benchmarks/benchmark_gluonnlp_fp16.sh
@@ -0,0 +1,14 @@
+for mode in train inference
+do
+    python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode --use_fp16
+done
+
+for mode in train inference
+do
+    python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode --use_fp16
+done
+
+for mode in train inference
+do
+    python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode --use_fp16
+done
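
Given the naming scheme in `benchmark_gluonnlp.py` above, each of these invocations writes its per-workload CSVs into a directory such as `infer_float16_NT_TN_tvm0/` or `train_float16_TN_TN/`, and aggregates them into a file like `gluonnlp_infer_float16_NT_TN_tvm0.csv`.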
18 changes: 12 additions & 6 deletions scripts/benchmarks/benchmark_utils.py
@@ -748,7 +748,6 @@ def __init__(self, workloads, model_names, use_fp16=False,
         self._inference_out_csv_file = inference_out_csv_file
         self._train_out_csv_file = train_out_csv_file
         self._env_info_file = env_info_file
-        assert use_fp16 is False, 'Currently fp16 benchmark has not been supported yet.'
 
     @property
     def model_names(self):
@@ -760,22 +759,26 @@ def workloads(self):
 
     def _inference_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\
             -> Tuple[float, Memory]:
+        if self._use_fp16:
+            dtype = 'float16'
+        else:
+            dtype = 'float32'
         if self._use_gpu:
             ctx = mxnet.gpu()
         else:
             ctx = mxnet.cpu()
         model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name)
-        # TODO Support fp16 profiling
         cfg.defrost()
         cfg.MODEL.layout = self._layout
         if model_cls.__name__ not in ['BartModel']:
             cfg.MODEL.compute_layout = self._compute_layout
         cfg.freeze()
         if model_cls.__name__ in ['BartModel']:
-            model = model_cls.from_cfg(cfg, extract_feature=True)
+            model = model_cls.from_cfg(cfg, extract_feature=True, dtype=dtype)
         else:
-            model = model_cls.from_cfg(cfg)
-        model.load_parameters(backbone_param_path, ctx=ctx)
+            model = model_cls.from_cfg(cfg, dtype=dtype)
+        model.load_parameters(backbone_param_path, ctx=ctx, cast_dtype=True)
+        model.cast(dtype)
         model.hybridize()
         vocab_size = cfg.MODEL.vocab_size
         if self._layout == 'NT':
@@ -860,12 +863,15 @@ def run_tvm_forward():
 
     def _train_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\
             -> Tuple[float, Memory]:
+        if self._use_fp16:
+            from mxnet import amp
+            amp.init()
+
         if self._use_gpu:
             ctx = mxnet.gpu()
         else:
             ctx = mxnet.cpu()
         model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name)
-        # TODO Support fp16 profiling
         cfg.defrost()
         cfg.MODEL.layout = self._layout
         if model_cls.__name__ not in ['BartModel']:
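
The two paths above enable fp16 differently: inference builds the backbone directly in float16 (`dtype=dtype` at construction, `cast_dtype=True` when loading the float32 checkpoint, then `model.cast(dtype)`), while training keeps the model definition in float32 and calls `amp.init()` so MXNet's automatic mixed precision rewrites operators to run in float16 where safe. A condensed sketch of both paths, assuming a GPU machine, the usual `from gluonnlp.models import get_backbone` import, and an illustrative model name (a full AMP training setup would also wrap the trainer via `amp.init_trainer`, which is outside this profiling code):

```python
import mxnet
from mxnet import amp
from gluonnlp.models import get_backbone

# Assumed model name; any backbone known to get_backbone would do.
model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(
    'google_en_uncased_bert_base')
ctx = mxnet.gpu()

# Inference path: construct the network in float16 and cast the fp32
# checkpoint on load, mirroring _inference_speed_memory above.
model = model_cls.from_cfg(cfg, dtype='float16')
model.load_parameters(backbone_param_path, ctx=ctx, cast_dtype=True)
model.cast('float16')
model.hybridize()

# Training path: keep fp32 parameters and let AMP patch operators,
# mirroring _train_speed_memory above; amp.init() must run before the
# training model is created.
amp.init()
train_model = model_cls.from_cfg(cfg)
train_model.load_parameters(backbone_param_path, ctx=ctx)
train_model.hybridize()
```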
1 change: 1 addition & 0 deletions scripts/machine_translation/README.md
@@ -30,6 +30,7 @@ python3 train_transformer.py \
     --save_dir transformer_base_wmt2014_en_de_${SUBWORD_ALGO} \
     --cfg transformer_base \
     --lr 0.002 \
+    --num_accumulated 32 \
     --sampler BoundedBudgetSampler \
     --max_num_tokens 2700 \
     --epochs 30 \
6 changes: 4 additions & 2 deletions scripts/machine_translation/train_transformer.py
@@ -441,8 +441,10 @@ def train(args):
             for sample_data, ctx in zip(sample_data_l, ctx_l):
                 if sample_data is None:
                     continue
-                src_token_ids, tgt_token_ids, src_valid_length, tgt_valid_length, sample_ids = sample_data
-                src_wc, tgt_wc, bs = src_valid_length.sum(), tgt_valid_length.sum(), src_token_ids.shape[0]
+                src_token_ids, tgt_token_ids, src_valid_length,\
+                    tgt_valid_length, sample_ids = sample_data
+                src_wc, tgt_wc, bs = src_valid_length.sum(),\
+                    tgt_valid_length.sum(), src_token_ids.shape[0]
                 loss_denom += tgt_wc - bs
                 log_loss_denom += tgt_wc - bs
                 log_wc += src_wc + tgt_wc
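
For context on the `tgt_wc - bs` denominators: the valid lengths unpacked here drive token-level loss normalization, and each target sentence of length T contributes T - 1 label positions (the loss targets are the sentence shifted by one token). A toy illustration with hypothetical lengths:

```python
import numpy as np

# Hypothetical valid lengths for a batch of three target sentences.
tgt_valid_length = np.array([7, 5, 9])
bs = tgt_valid_length.shape[0]   # batch size: 3
tgt_wc = tgt_valid_length.sum()  # total target tokens: 21
# One label per token after the first in each sentence:
loss_denom = tgt_wc - bs         # 18
```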