#!/bin/bash
#
# Copyright (C) 2020 Phonexia
# Author: Jan Profant <jan.profant@phonexia.com>
# All Rights Reserved
nnet_dir=exp/xvector_nnet
stage=0
train_stage=-1
. ./cmd.sh || exit 1
. ./path.sh || exit 1
set -e
. ./utils/parse_options.sh
vaddir=mfcc
mfccdir=mfcc
fbankdir=fbank
plda_train_dir=data/plda_train
min_len=400
rate=16k
all_data_dir=all_combined
# set the directories for the corresponding datasets
voxceleb1_path=
voxceleb2_dev_path=
voxceleb_cn_path=
export musan_dir=
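# Example values (placeholder paths; edit these in place. Note that variables
# defined after parse_options.sh is sourced above cannot be set as command-line options):
#   voxceleb1_path=/path/to/VoxCeleb1
#   voxceleb2_dev_path=/path/to/VoxCeleb2/dev
#   voxceleb_cn_path=/path/to/VoxCeleb-CN
#   export musan_dir=/path/to/musan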
if [ ${stage} -le 1 ]; then
# prepare voxceleb1, voxceleb2 dev and voxceleb-cn data
# the --remove-speakers parameter removes the test speakers from voxceleb1
# see utils/make_data_dir_from_voxceleb.py to adapt this to your needs
python utils/make_data_dir_from_voxceleb.py --out-data-dir data/voxceleb1 \
--dataset-name voxceleb1 --remove-speakers local/voxceleb1-test_speakers.txt \
--dataset-path ${voxceleb1_path} --rate ${rate}
utils/fix_data_dir.sh data/voxceleb1
python utils/make_data_dir_from_voxceleb.py --out-data-dir data/voxceleb2 \
--dataset-name voxceleb2 --format raw --rate ${rate} \
--dataset-path ${voxceleb2_dev_path}
utils/fix_data_dir.sh data/voxceleb2
python utils/make_data_dir_from_voxceleb.py --out-data-dir data/voxcelebcn \
--dataset-name voxcelebcn --no-links --rate ${rate} \
--dataset-path ${voxceleb_cn_path}
utils/fix_data_dir.sh data/voxcelebcn
# combine all data into one data directory
utils/combine_data.sh data/${all_data_dir} data/voxceleb1 data/voxceleb2 data/voxcelebcn
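# After this stage, data/${all_data_dir} should contain the standard Kaldi data
# files, one entry per line, e.g. (illustrative utterance/speaker IDs):
#   wav.scp:  id10001-abc-00001 /path/to/id10001/abc/00001.wav
#   utt2spk:  id10001-abc-00001 id10001
#   spk2utt:  id10001 id10001-abc-00001 id10001-abc-00002 ...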
fi
if [ ${stage} -le 2 ]; then
# in this stage, we compute VAD and prepare features for both clean and augmented audio
# make MFCCs from clean audio (used only to compute VAD afterwards)
steps/make_mfcc.sh --write-utt2num-frames true --mfcc-config conf/mfcc_${rate}.conf --nj 500 --cmd \
"${feats_cmd}" data/${all_data_dir} exp/make_mfcc ${mfccdir}
utils/fix_data_dir.sh data/${all_data_dir}
# compute VAD for clean audio
local/compute_vad_decision.sh --nj 500 --cmd \
"${vad_cmd}" data/${all_data_dir} exp/make_vad ${vaddir}
utils/fix_data_dir.sh data/${all_data_dir}
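# vad.scp produced above maps each utterance to a per-frame 0/1 speech mask, e.g.:
#   id10001-abc-00001  [ 0 0 1 1 1 ... 1 0 ]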
# make filterbanks from clean audio
steps/make_fbank.sh --write-utt2num-frames true --fbank-config conf/fbank_${rate}.conf --nj 500 --cmd \
"${feats_cmd}" data/${all_data_dir} exp/make_fbank ${fbankdir}
utils/fix_data_dir.sh data/${all_data_dir}
# create an augmented copy of the data directory
utils/augment_data_dir.sh ${all_data_dir}
# extract features from augmented data
steps/make_fbank.sh --write-utt2num-frames true --fbank-config conf/fbank_${rate}.conf --nj 500 --cmd \
"${feats_cmd}" data/${all_data_dir}_aug exp/make_fbank ${fbankdir}
utils/fix_data_dir.sh data/${all_data_dir}_aug
utils/combine_data.sh data/${all_data_dir}_aug_and_clean data/${all_data_dir}_aug data/${all_data_dir}
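# Optional sanity check: the combined dir should contain the clean utterances
# plus their augmented copies, e.g.:
#   wc -l data/${all_data_dir}/utt2spk data/${all_data_dir}_aug_and_clean/utt2spk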
fi
name=${all_data_dir}_aug_and_clean
if [ ${stage} -le 3 ]; then
# Now we prepare the features to generate examples for xvector training.
# This script applies CMVN and removes nonspeech frames. Note that this is somewhat
# wasteful, as it roughly doubles the amount of training data on disk. After
# creating training examples, this can be removed.
local/nnet3/xvector/prepare_feats_for_egs.sh --nj 100 --cmd "${train_cmd}" \
data/${name} data/${name}_with_aug_no_sil exp/${name}_with_aug_no_sil
utils/fix_data_dir.sh data/${name}_with_aug_no_sil
# Now, we need to remove features that are too short after removing silence
# frames. We want at least 4s (400 frames) per utterance.
mv data/${name}_with_aug_no_sil/utt2num_frames data/${name}_with_aug_no_sil/utt2num_frames.bak
awk -v min_len=${min_len} '$2 > min_len {print $1, $2}' data/${name}_with_aug_no_sil/utt2num_frames.bak > data/${name}_with_aug_no_sil/utt2num_frames
utils/filter_scp.pl data/${name}_with_aug_no_sil/utt2num_frames data/${name}_with_aug_no_sil/utt2spk > data/${name}_with_aug_no_sil/utt2spk.new
mv data/${name}_with_aug_no_sil/utt2spk.new data/${name}_with_aug_no_sil/utt2spk
utils/fix_data_dir.sh data/${name}_with_aug_no_sil
# We also want several utterances per speaker. Now we'll throw out speakers
# with fewer than 8 utterances.
min_num_utts=8
awk '{print $1, NF-1}' data/${name}_with_aug_no_sil/spk2utt > data/${name}_with_aug_no_sil/spk2num
awk -v min_num_utts=${min_num_utts} '$2 >= min_num_utts {print $1, $2}' data/${name}_with_aug_no_sil/spk2num | utils/filter_scp.pl - data/${name}_with_aug_no_sil/spk2utt > data/${name}_with_aug_no_sil/spk2utt.new
mv data/${name}_with_aug_no_sil/spk2utt.new data/${name}_with_aug_no_sil/spk2utt
utils/spk2utt_to_utt2spk.pl data/${name}_with_aug_no_sil/spk2utt > data/${name}_with_aug_no_sil/utt2spk
utils/filter_scp.pl data/${name}_with_aug_no_sil/utt2spk data/${name}_with_aug_no_sil/utt2num_frames > data/${name}_with_aug_no_sil/utt2num_frames.new
mv data/${name}_with_aug_no_sil/utt2num_frames.new data/${name}_with_aug_no_sil/utt2num_frames
# Now we're ready to create training examples.
utils/fix_data_dir.sh data/${name}_with_aug_no_sil
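# Optional sanity check: nothing shorter than ${min_len} frames should remain,
# i.e. the following should print nothing:
#   awk -v m=${min_len} '$2 <= m' data/${name}_with_aug_no_sil/utt2num_frames | head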
fi
if [ ${stage} -le 4 ]; then
echo "$0: Getting neural network training egs";
local/nnet3/xvector/get_egs_but.sh --cmd "$train_cmd" \
--nj 16 \
--stage 0 \
--frames-per-chunk 400 \
--not-used-frames-percentage 40 \
--num-archives 1000 \
--num-diagnostic-archives 1 \
--num-repeats 10 \
data/${name}_with_aug_no_sil exp/egs
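# Note: --frames-per-chunk 400 corresponds to 4 s of speech at the standard 10 ms
# frame shift, matching the min_len filter applied in stage 3.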
fi
num_gpus=2
if [ ${stage} -le 5 ]; then
# set all needed parameters in the train.sh script before running
# this will start the NN training; the first argument is the egs directory from stage 4
./train.sh exp/egs exp/nnet
# convert the PyTorch model to ONNX (much faster at inference time)
# if this ends with an error, it should be fine; just check that the ONNX file is present
python local/convert_resnet2onnx.py -i exp/nnet/ResNet101_add_margin_embed256_${num_gpus}gpu/models/model_final -o exp/nnet/ResNet101_add_margin_embed256_${num_gpus}gpu/models/model_final.onnx
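# Optional check that the exported model loads cleanly (requires the onnx package):
#   python -c "import onnx; onnx.checker.check_model(onnx.load('exp/nnet/ResNet101_add_margin_embed256_${num_gpus}gpu/models/model_final.onnx'))"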
fi
if [ ${stage} -le 6 ]; then
# create the data directory for PLDA training
# for each utterance, randomly keep only one version, either clean or augmented
mkdir -p data/plda_train
cp data/${name}_with_aug_no_sil/* data/plda_train
python local/create_plda_train_dir.py --input-data-dir data/${name}_with_aug_no_sil --output-data-dir data/plda_train
utils/fix_data_dir.sh data/plda_train
# split the plda_train data dir into as many parts as the number of GPUs used for extraction
utils/split_data.sh data/plda_train/ ${num_gpus}
# extract embeddings for PLDA training
# the model path is hardcoded, modify it if needed
for i in $(seq 0 $((num_gpus-1)))
do
python local/predict.py \
--model exp/nnet/ResNet101_add_margin_embed256_${num_gpus}gpu/models/model_final.onnx \
--kaldi-data-dir data/plda_train/split${num_gpus}/$((i+1)) \
--emb-out-dir exp/xvectors_plda_train_$((i+1)) \
--gpus ${i} &
done
wait
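# Each exp/xvectors_plda_train_<n> dir should now contain text files with one
# embedding per utterance (format assumed: <utt-id> followed by 256 floats), e.g.:
#   head -n 1 exp/xvectors_plda_train_1/*.txt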
fi
if [ ${stage} -le 7 ]; then
# train PLDA
for i in $(seq 0 $((num_gpus-1)))
do
cat exp/xvectors_plda_train_$((i+1))/*.txt
done | python local/train_transform.py --utt2spk data/plda_train/utt2spk --output-h5 exp/transform.h5 | \
ivector-compute-plda ark:data/plda_train/spk2utt ark,cs:- exp/plda
fi
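# Example of scoring trials with the trained PLDA (a sketch; the enrollment/test
# embedding arks and the trials file are placeholders for your own evaluation data,
# and test embeddings are assumed to have gone through the same exp/transform.h5
# transform as the training embeddings):
#   ivector-plda-scoring --normalize-length=true exp/plda \
#     ark:exp/xvectors_enroll/xvector.ark ark:exp/xvectors_test/xvector.ark \
#     trials scores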
exit 0