if [ "$#" -ne 12 ]; then
echo "Usage: $0 ngpu batch_size seq_len num_train_steps optimizer model_size training_mode[fp32|fp16] display_loss_steps gradient_accumulation_steps loss_scale gpu_name profile"
exit 1
fi
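# All 12 positional arguments are required (checked above); the ${N:-default}
# fallbacks below only take effect if that check is relaxed.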
ngpu=${1:-1}
batch_size=${2:-64}
seq_len=${3:-128}
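# Masked-LM predictions per sequence follow the usual BERT pretraining setup:
# 20 for seq_len 128, 80 for seq_len 512.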
if [ ${seq_len} == 128 ]; then
  max_predictions_per_seq=20
elif [ ${seq_len} == 512 ]; then
  max_predictions_per_seq=80
else
  echo "Error: seq_len must be 128 or 512"
  exit 1
fi
num_train_steps=${4:-400}
optimizer=${5:-"adam"}
model_size=${6:-"large"}
training_mode=${7:-"fp32"}
display_loss_steps=${8:-1}
grad_acc=${9:-1}
loss_scale=${10:-1024}
gpu_name=${11:-"mi100"}
profile=${12:-0}
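# Fixed training hyperparameters (learning rate and warmup schedule).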
lr=5e-5
warmup_ratio=0.2843
warmup_mode=Poly
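# Global batch size = ngpu * per-GPU batch_size * gradient accumulation steps.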
effective_batch_size=$((ngpu * batch_size * grad_acc))
time_now=$(date +%m%d%H%M)
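# Paths to the ONNX Runtime checkout (its commit hash tags the run name)
# and to the pre-converted BERT models; these are container-specific.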
HOME_DIR=/workspace
ORT_DIR=${HOME_DIR}/github/onnxruntime
commit=$(git -C ${ORT_DIR} rev-parse HEAD | cut -c1-8)
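# Select the pre-optimized ONNX training graph for the requested model size.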
if [ ${model_size} == "large" ]; then
  model_dir=${HOME_DIR}/model/bert-large-uncased_L_24_H_1024_A_16_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12
elif [ ${model_size} == "base" ]; then
  model_dir=${HOME_DIR}/model/bert-base-uncased_L_12_H_768_A_12_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12
elif [ ${model_size} == "tiny" ]; then
  model_dir=${HOME_DIR}/model/bert-tiny-uncased_L_3_H_128_A_2_V_30528_S_512_Dp_0.1_optimized_layer_norm_opset12
else
  echo "Error: model_size must be large, base, or tiny"
  exit 1
fi
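# Training data location and a per-day log directory for this model size.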
data_dir=/data/wezhan/bert/${seq_len}/train
training_bert_dir=${ORT_DIR}/build/RelWithDebInfo
log_dir=${HOME_DIR}/logs/bert_${model_size}/$(date +%m%d)
mkdir -p ${log_dir}
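# The run name encodes the configuration, ORT commit, timestamp, and GPU so
# each log directory is self-describing.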
run_name=bert_${model_size}_${commit}_g${ngpu}_bs${batch_size}_sl${seq_len}_steps${num_train_steps}_${optimizer}_${training_mode}_acc${grad_acc}_efbs${effective_batch_size}_${time_now}_${gpu_name}
mkdir -p ${log_dir}/${run_name}
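# Multi-GPU runs go through mpirun with NCCL debug/tuning environment variables;
# single-GPU runs launch the trainer directly (mpi_cmd stays empty).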
if [ ${ngpu} != 1 ]; then
  mpi_cmd="${OPENMPI_DIR}/bin/mpirun --allow-run-as-root -n ${ngpu} -x NCCL_DEBUG=INFO -x NCCL_DEBUG_SUBSYS=INIT,COLL -x NCCL_MIN_NCHANNELS=4"
fi
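# Mixed-precision training: fp16 compute, fp16 allreduce, and the given loss scale.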
if [ ${training_mode} == "fp16" ]; then
  fp16_commands="--use_mixed_precision --allreduce_in_fp16 --loss_scale ${loss_scale}"
fi
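# Optional profiler wrapper: rocprof for AMD (mi100), nvprof for NVIDIA (v100).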
if [ ${profile} == 1 ]; then
  if [ ${gpu_name} == "mi100" ]; then
    profile_commands="/opt/rocm/bin/rocprof --obj-tracking on --stats"
  elif [ ${gpu_name} == "v100" ]; then
    profile_commands="nvprof --print-gpu-summary --log-file ${log_dir}/${run_name}-trace.log"
  fi
fi
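# Launch onnxruntime_training_bert in the background (optionally under the
# profiler and/or mpirun) and redirect all output to the run log.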
nohup ${profile_commands} ${mpi_cmd} ${training_bert_dir}/onnxruntime_training_bert \
  --model_name ${model_dir} \
  --train_data_dir ${data_dir} \
  --test_data_dir ${data_dir} \
  --train_batch_size ${batch_size} \
  --mode train \
  --num_train_steps ${num_train_steps} \
  --optimizer ${optimizer} \
  --learning_rate ${lr} \
  --warmup_ratio ${warmup_ratio} \
  --warmup_mode ${warmup_mode} \
  --gradient_accumulation_steps ${grad_acc} \
  --max_seq_length ${seq_len} \
  --max_predictions_per_seq=${max_predictions_per_seq} \
  --use_nccl \
  --lambda 0 \
  ${fp16_commands} \
  --display_loss_steps ${display_loss_steps} \
  --log_dir ${log_dir}/${run_name} > ${log_dir}/${run_name}.log 2>&1 &
tail -f ${log_dir}/${run_name}.log
exit 0