This repository has been archived by the owner on Jan 15, 2024. It is now read-only.

Add AMP + Update Benchmarking Script #1405

Merged Nov 6, 2020 · 64 commits
Commits
22992c3
Update transformer_xl.py
sxjscience Oct 21, 2020
790a6c8
Update run_squad.py
sxjscience Oct 29, 2020
4e8c2bb
Merge remote-tracking branch 'upstream/master' into amp
sxjscience Oct 29, 2020
051e264
Update test_models_mobilebert.py
sxjscience Oct 29, 2020
8713da5
Update README.md
sxjscience Oct 29, 2020
20993da
Update test_models_bert.py
sxjscience Oct 29, 2020
5538c4f
Update testing.py
sxjscience Oct 29, 2020
30261e0
Merge remote-tracking branch 'upstream/master' into amp
sxjscience Nov 1, 2020
3f7aec1
Update test_models_mobilebert.py
sxjscience Nov 1, 2020
6e219fa
Update test_models_roberta.py
sxjscience Nov 1, 2020
d3fa48e
Update gpt2.py
sxjscience Nov 1, 2020
61a636c
Update testing.py
sxjscience Nov 1, 2020
71b0d07
Update bart.py
sxjscience Nov 1, 2020
871a7dc
Update testing.py
sxjscience Nov 1, 2020
1d56a9f
Update testing.py
sxjscience Nov 2, 2020
e55080c
Update README.md
sxjscience Nov 2, 2020
d3fd1f5
Update README.md
sxjscience Nov 2, 2020
b0bcbe1
Update testing.py
sxjscience Nov 2, 2020
a157c1e
fix
sxjscience Nov 2, 2020
c6e79b6
update
sxjscience Nov 2, 2020
767b12c
Update test_models_roberta.py
sxjscience Nov 2, 2020
d0a095d
Update test_models_bart.py
sxjscience Nov 2, 2020
1c08c35
Update test_models_bart.py
sxjscience Nov 2, 2020
889af13
Update test_models_bart.py
sxjscience Nov 2, 2020
b06b445
Update testing.py
sxjscience Nov 2, 2020
a1924a9
only include bart-base
sxjscience Nov 2, 2020
13cc93b
Update bart.py
sxjscience Nov 2, 2020
6cc2db8
Update bart.py
sxjscience Nov 2, 2020
d249a2e
update
sxjscience Nov 2, 2020
d3d5e30
Update test_models_transformer.py
sxjscience Nov 2, 2020
8777377
Update test_models_transformer.py
sxjscience Nov 2, 2020
77ed30a
Update test_models_transformer.py
sxjscience Nov 2, 2020
b3c4f4d
Update test_models_transformer.py
sxjscience Nov 2, 2020
ccd92f2
Update run_squad.py
sxjscience Nov 4, 2020
e9e9f7b
Update attention_cell.py
sxjscience Nov 4, 2020
ff55364
Update README.md
sxjscience Nov 4, 2020
def32d5
Update test_models.py
sxjscience Nov 4, 2020
adf2aa7
Update run_squad.py
sxjscience Nov 4, 2020
5dc8ff7
Update run_squad.py
sxjscience Nov 4, 2020
450e425
update
sxjscience Nov 4, 2020
60440f1
Update run_squad.template
sxjscience Nov 4, 2020
45e56c3
update
sxjscience Nov 5, 2020
9cd11b0
Update generate_commands.py
sxjscience Nov 5, 2020
e4fa5a8
Update optimizer.py
sxjscience Nov 5, 2020
6a81452
update
sxjscience Nov 5, 2020
027b5dd
Update run_squad.py
sxjscience Nov 5, 2020
ffee5ac
update
sxjscience Nov 5, 2020
c4856bc
Update run_batch_squad.sh
sxjscience Nov 5, 2020
847f4c7
update
sxjscience Nov 5, 2020
8ccc487
Update testing.py
sxjscience Nov 5, 2020
4077929
Update test_optimizer.py
sxjscience Nov 5, 2020
8f5d5b7
Update benchmark_utils.py
sxjscience Nov 5, 2020
d95b8c8
update
sxjscience Nov 5, 2020
9bba04d
fix bug in inference
sxjscience Nov 5, 2020
474bc57
Update benchmark_gluonnlp.py
sxjscience Nov 5, 2020
c14a340
Update run_batch_squad.sh
sxjscience Nov 5, 2020
184ae0f
Merge remote-tracking branch 'upstream/master' into amp
sxjscience Nov 5, 2020
4e47f42
Update benchmark_utils.py
sxjscience Nov 5, 2020
4d6151f
Update run_squad.py
sxjscience Nov 5, 2020
f5bcb56
Update run_squad.py
sxjscience Nov 5, 2020
addff4a
Update run_squad.py
sxjscience Nov 5, 2020
e797a62
Merge remote-tracking branch 'upstream/master' into amp
sxjscience Nov 5, 2020
236f35e
Update run_squad.py
sxjscience Nov 5, 2020
6b2e1ea
update
sxjscience Nov 6, 2020
10 changes: 5 additions & 5 deletions README.md
@@ -16,7 +16,7 @@ process the text data, and train models.
 
 # Features
 
-- Easy-to-use Text Processing Tools and APIs
+- Easy-to-use Text Processing Tools and Modular APIs
 - Pretrained Model Zoo
 - Write Models with Numpy-like API
 - Fast Inference via [Apache TVM (incubating)](https://tvm.apache.org/) (Experimental)
@@ -28,16 +28,16 @@ First of all, install the latest MXNet. You may use the following commands:
 
 ```bash
 # Install the version with CUDA 10.0
-python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20200926" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu100>=2.0.0b20201101" -f https://dist.mxnet.io/python
 
 # Install the version with CUDA 10.1
-python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20200926" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu101>=2.0.0b20201101" -f https://dist.mxnet.io/python
 
 # Install the version with CUDA 10.2
-python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20200926" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet-cu102>=2.0.0b20201101" -f https://dist.mxnet.io/python
 
 # Install the cpu-only version
-python3 -m pip install -U --pre "mxnet>=2.0.0b20200926" -f https://dist.mxnet.io/python
+python3 -m pip install -U --pre "mxnet>=2.0.0b20201101" -f https://dist.mxnet.io/python
 ```
 
 
21 changes: 14 additions & 7 deletions scripts/benchmarks/benchmark_gluonnlp.py
@@ -58,20 +58,22 @@ def get_parser():
                         help='Whether to use TVM for inference/training')
     parser.add_argument('--instance_type', choices=['c4', 'c5', 'g4', 'p3'], default='g4',
                         help='The instance type that the profiling script will be run on.')
+    parser.add_argument('--use_fp16', action='store_true')
     parser.add_argument('--mode', type=str, default='train',
                         choices=['train', 'inference'])
     return parser
 
 
 def run_benchmark(workload, model_name, out_file_name, is_train,
-                  use_tvm, instance_type):
+                  use_tvm, instance_type, use_fp16):
     if is_train:
         benchmark = GluonNLPBackboneBenchmark(
             workloads=workload,
             model_names=model_name,
             profile_inference=False,
             profile_train=True,
             to_csv=True,
+            use_fp16=use_fp16,
             train_out_csv_file=out_file_name)
         benchmark.run()
     else:
@@ -83,6 +85,7 @@ def run_benchmark(workload, model_name, out_file_name, is_train,
             use_tvm=use_tvm,
             instance_type=instance_type,
             to_csv=True,
+            use_fp16=use_fp16,
             inference_out_csv_file=out_file_name)
         benchmark.run()
     return
@@ -94,13 +97,15 @@ def run_benchmark(workload, model_name, out_file_name, is_train,
     args = parser.parse_args()
     if args.compute_layout is None:
         args.compute_layout = args.layout
+    dtype = 'float32' if not args.use_fp16 else 'float16'
     for layout, compute_layout in [(args.layout, args.compute_layout)]:
         if compute_layout != layout:
             profile_models = [ele for ele in MODELS if 'bart' not in ele]
         else:
             profile_models = [ele for ele in MODELS]
         if args.mode == 'inference':
-            out_dir = 'infer_fp32_{}_{}_tvm{}'.format(layout, compute_layout, int(args.use_tvm))
+            out_dir = 'infer_{}_{}_{}_tvm{}'.format(dtype, layout, compute_layout,
+                                                    int(args.use_tvm))
             df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
                                        'latency', 'memory'])
             os.makedirs(out_dir, exist_ok=True)
@@ -111,16 +116,17 @@ def run_benchmark(workload, model_name, out_file_name, is_train,
                     process = Process(
                         target=run_benchmark,
                         args=(workload, model_name, out_path, False,
-                              args.use_tvm, args.instance_type))
+                              args.use_tvm, args.instance_type, args.use_fp16))
                     process.start()
                     process.join()
                     new_df = pd.read_csv(out_path)
                     df = df.append(new_df, ignore_index=True)
-            df.to_csv('gluonnlp_infer_fp32_{}_{}_tvm{}.csv'.format(layout,
+            df.to_csv('gluonnlp_infer_{}_{}_{}_tvm{}.csv'.format(dtype,
+                                                                 layout,
                                                                  compute_layout,
                                                                  int(args.use_tvm)))
         elif args.mode == 'train':
-            out_dir = 'train_fp32_{}_{}'.format(layout, compute_layout)
+            out_dir = 'train_{}_{}_{}'.format(dtype, layout, compute_layout)
             df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
                                        'latency', 'memory'])
             os.makedirs(out_dir, exist_ok=True)
@@ -130,11 +136,12 @@ def run_benchmark(workload, model_name, out_file_name, is_train,
                                                              workload[1]))
                     process = Process(
                         target=run_benchmark,
-                        args=(workload, model_name, out_path, True))
+                        args=(workload, model_name, out_path, True, False,
+                              args.instance_type, args.use_fp16))
                     process.start()
                     process.join()
                     new_df = pd.read_csv(out_path)
                     df = df.append(new_df, ignore_index=True)
-            df.to_csv('gluonnlp_train_fp32_{}_{}.csv'.format(layout, compute_layout))
+            df.to_csv('gluonnlp_train_{}_{}_{}.csv'.format(dtype, layout, compute_layout))
         else:
             raise NotImplementedError
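
Note that the driver runs every workload in a fresh `multiprocessing.Process` and joins it before starting the next one, presumably so that device memory is fully released between measurements and a failure in one configuration cannot take down the rest of the sweep. A minimal sketch of that isolation pattern, with a hypothetical `measure` function standing in for `run_benchmark` (file names and workloads are illustrative only):

```python
import pandas as pd
from multiprocessing import Process


def measure(workload, model_name, out_path):
    # Hypothetical stand-in for run_benchmark: profile one
    # (batch_size, sequence_length) workload and write a one-row CSV.
    batch_size, sequence_length = workload
    pd.DataFrame([{'model': model_name, 'batch_size': batch_size,
                   'sequence_length': sequence_length,
                   'latency': 0.0, 'memory': 0}]).to_csv(out_path, index=False)


if __name__ == '__main__':
    df = pd.DataFrame(columns=['model', 'batch_size', 'sequence_length',
                               'latency', 'memory'])
    for workload in [(4, 128), (8, 512)]:
        out_path = 'bert_{}_{}.csv'.format(*workload)
        # One process per run: it exits (releasing GPU memory) before the
        # next measurement starts, and a crash stays contained.
        process = Process(target=measure, args=(workload, 'bert', out_path))
        process.start()
        process.join()
        # Same pandas idiom as the script above (df.append was current in 2020).
        df = df.append(pd.read_csv(out_path), ignore_index=True)
    df.to_csv('results.csv')
```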
14 changes: 14 additions & 0 deletions scripts/benchmarks/benchmark_gluonnlp_fp16.sh
@@ -0,0 +1,14 @@
+for mode in train inference
+do
+    python3 benchmark_gluonnlp.py --layout NT --compute_layout NT --mode $mode --use_fp16
+done
+
+for mode in train inference
+do
+    python3 benchmark_gluonnlp.py --layout NT --compute_layout TN --mode $mode --use_fp16
+done
+
+for mode in train inference
+do
+    python3 benchmark_gluonnlp.py --layout TN --compute_layout TN --mode $mode --use_fp16
+done
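
Given the naming scheme in `benchmark_gluonnlp.py` above, each of these invocations writes its per-workload CSVs into a directory such as `infer_float16_NT_TN_tvm0/` or `train_float16_TN_TN/`, and aggregates them into a file like `gluonnlp_infer_float16_NT_TN_tvm0.csv`.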
18 changes: 12 additions & 6 deletions scripts/benchmarks/benchmark_utils.py
@@ -748,7 +748,6 @@ def __init__(self, workloads, model_names, use_fp16=False,
         self._inference_out_csv_file = inference_out_csv_file
         self._train_out_csv_file = train_out_csv_file
         self._env_info_file = env_info_file
-        assert use_fp16 is False, 'Currently fp16 benchmark has not been supported yet.'
 
     @property
     def model_names(self):
@@ -760,22 +759,26 @@ def workloads(self):
 
     def _inference_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\
             -> Tuple[float, Memory]:
+        if self._use_fp16:
+            dtype = 'float16'
+        else:
+            dtype = 'float32'
         if self._use_gpu:
             ctx = mxnet.gpu()
         else:
             ctx = mxnet.cpu()
         model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name)
-        # TODO Support fp16 profiling
         cfg.defrost()
         cfg.MODEL.layout = self._layout
         if model_cls.__name__ not in ['BartModel']:
             cfg.MODEL.compute_layout = self._compute_layout
         cfg.freeze()
         if model_cls.__name__ in ['BartModel']:
-            model = model_cls.from_cfg(cfg, extract_feature=True)
+            model = model_cls.from_cfg(cfg, extract_feature=True, dtype=dtype)
         else:
-            model = model_cls.from_cfg(cfg)
-        model.load_parameters(backbone_param_path, ctx=ctx)
+            model = model_cls.from_cfg(cfg, dtype=dtype)
+        model.load_parameters(backbone_param_path, ctx=ctx, cast_dtype=True)
+        model.cast(dtype)
         model.hybridize()
         vocab_size = cfg.MODEL.vocab_size
         if self._layout == 'NT':
@@ -860,12 +863,15 @@ def run_tvm_forward():
 
     def _train_speed_memory(self, model_name: str, batch_size: int, sequence_length: int)\
             -> Tuple[float, Memory]:
+        if self._use_fp16:
+            from mxnet import amp
+            amp.init()
+
         if self._use_gpu:
             ctx = mxnet.gpu()
         else:
             ctx = mxnet.cpu()
         model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(model_name)
-        # TODO Support fp16 profiling
         cfg.defrost()
         cfg.MODEL.layout = self._layout
         if model_cls.__name__ not in ['BartModel']:
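
The two paths above enable fp16 differently: inference builds the backbone directly in float16 (`dtype=dtype` at construction, `cast_dtype=True` when loading the float32 checkpoint, then `model.cast(dtype)`), while training keeps the model definition in float32 and calls `amp.init()` so MXNet's automatic mixed precision rewrites operators to run in float16 where safe. A condensed sketch of both paths, assuming a GPU machine, the usual `from gluonnlp.models import get_backbone` import, and an illustrative model name (a full AMP training setup would also wrap the trainer via `amp.init_trainer`, which is outside this profiling code):

```python
import mxnet
from mxnet import amp
from gluonnlp.models import get_backbone

# Assumed model name; any backbone known to get_backbone would do.
model_cls, cfg, tokenizer, backbone_param_path, _ = get_backbone(
    'google_en_uncased_bert_base')
ctx = mxnet.gpu()

# Inference path: construct the network in float16 and cast the fp32
# checkpoint on load, mirroring _inference_speed_memory above.
model = model_cls.from_cfg(cfg, dtype='float16')
model.load_parameters(backbone_param_path, ctx=ctx, cast_dtype=True)
model.cast('float16')
model.hybridize()

# Training path: keep fp32 parameters and let AMP patch operators,
# mirroring _train_speed_memory above; amp.init() must run before the
# training model is created.
amp.init()
train_model = model_cls.from_cfg(cfg)
train_model.load_parameters(backbone_param_path, ctx=ctx)
train_model.hybridize()
```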
1 change: 1 addition & 0 deletions scripts/machine_translation/README.md
@@ -30,6 +30,7 @@ python3 train_transformer.py \
     --save_dir transformer_base_wmt2014_en_de_${SUBWORD_ALGO} \
     --cfg transformer_base \
     --lr 0.002 \
+    --num_accumulated 32 \
     --sampler BoundedBudgetSampler \
     --max_num_tokens 2700 \
     --epochs 30 \
6 changes: 4 additions & 2 deletions scripts/machine_translation/train_transformer.py
@@ -441,8 +441,10 @@ def train(args):
             for sample_data, ctx in zip(sample_data_l, ctx_l):
                 if sample_data is None:
                     continue
-                src_token_ids, tgt_token_ids, src_valid_length, tgt_valid_length, sample_ids = sample_data
-                src_wc, tgt_wc, bs = src_valid_length.sum(), tgt_valid_length.sum(), src_token_ids.shape[0]
+                src_token_ids, tgt_token_ids, src_valid_length,\
+                    tgt_valid_length, sample_ids = sample_data
+                src_wc, tgt_wc, bs = src_valid_length.sum(),\
+                    tgt_valid_length.sum(), src_token_ids.shape[0]
                 loss_denom += tgt_wc - bs
                 log_loss_denom += tgt_wc - bs
                 log_wc += src_wc + tgt_wc
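
For context on the `tgt_wc - bs` denominators: the valid lengths unpacked here drive token-level loss normalization, and each target sentence of length T contributes T - 1 label positions (the loss targets are the sentence shifted by one token). A toy illustration with hypothetical lengths:

```python
import numpy as np

# Hypothetical valid lengths for a batch of three target sentences.
tgt_valid_length = np.array([7, 5, 9])
bs = tgt_valid_length.shape[0]   # batch size: 3
tgt_wc = tgt_valid_length.sum()  # total target tokens: 21
# One label per token after the first in each sentence:
loss_denom = tgt_wc - bs         # 18
```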