From c68b4590ab20eaf55e0b96b82325a90177fffd5c Mon Sep 17 00:00:00 2001
From: Tomoki Hayashi
Date: Tue, 15 Aug 2023 14:56:15 +0900
Subject: [PATCH] fix the case when segments has different #lines from wav.scp (#413)

---
 .github/workflows/ci.yaml         | 35 +++++++++++++++++++++++++++++++
 egs/yesno/voc1/local/data_prep.sh | 12 +++++++++++
 egs/yesno/voc1/run.sh             |  4 ++++
 utils/make_subset_data.sh         | 20 ++++++------------
 utils/split_data.sh               |  3 +--
 5 files changed, 58 insertions(+), 16 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 2d111eef..ffd96792 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -102,6 +102,41 @@ jobs:
           name: artifacts-${{ matrix.config }}
           path: egs/yesno/voc1
 
+  integration_segments:
+    runs-on: ubuntu-20.04
+    strategy:
+      max-parallel: 10
+      matrix:
+        python-version: [3.9]
+        pytorch-version: [1.13.1]
+    steps:
+      - uses: actions/checkout@master
+      - uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+          architecture: 'x64'
+      - uses: actions/cache@v2
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-${{ hashFiles('**/setup.py') }}
+          restore-keys: |
+            ${{ runner.os }}-${{ matrix.python-version }}-${{ matrix.pytorch-version }}-pip-
+      - name: Install dependencies
+        run: |
+          sudo apt-get install libsndfile-dev jq
+          # make python env
+          cd tools; make CUDA_VERSION="" PYTHON=python${{ matrix.python-version }} PYTORCH_VERSION=${{ matrix.pytorch-version }}
+          source venv/bin/activate
+          pip install torch-yin
+      - name: Integration
+        run: |
+          cd egs/yesno/voc1 && ./run.sh --use_fake_segments true
+      - uses: actions/upload-artifact@v1
+        if: failure()
+        with:
+          name: artifacts-${{ matrix.config }}
+          path: egs/yesno/voc1
+
   integration_vq:
     runs-on: ubuntu-20.04
     strategy:
diff --git a/egs/yesno/voc1/local/data_prep.sh b/egs/yesno/voc1/local/data_prep.sh
index 5f2fbc8d..1b6fab6c 100755
--- a/egs/yesno/voc1/local/data_prep.sh
+++ b/egs/yesno/voc1/local/data_prep.sh
@@ -12,6 +12,7 @@ train_set="train_nodev"
 dev_set="dev"
 eval_set="eval"
 shuffle=false
+use_fake_segments=false
 
 # shellcheck disable=SC1091
 . utils/parse_options.sh || exit 1;
@@ -31,6 +32,7 @@ if [ $# != 2 ]; then
     echo " --dev_set: name of dev set (default=dev)."
     echo " --eval_set: name of eval set (default=eval)."
     echo " --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
+    echo " --use_fake_segments: whether to use fake segments (default=false)."
     exit 1
 fi
 
@@ -40,18 +42,28 @@ set -euo pipefail
 
 # set filenames
 scp="${data_dir}/all/wav.scp"
+segments="${data_dir}/all/segments"
 
 # check file existence
 [ -e "${scp}" ] && rm "${scp}"
+[ -e "${segments}" ] && rm "${segments}"
 
 # make all scp
 find "${db_root}" -follow -name "*.wav" | sort | while read -r filename; do
     id=$(basename "${filename}" | sed -e "s/\.[^\.]*$//g")
     echo "${id} ${filename}" >> "${scp}"
+    # NOTE(kan-bayashi): for integration test
+    if "${use_fake_segments}"; then
+        echo "${id}_1 ${id} 0.0 $(soxi -D "${filename}")" >> "${data_dir}/all/segments"
+        echo "${id}_2 ${id} 0.0 $(soxi -D "${filename}")" >> "${data_dir}/all/segments"
+    fi
 done
 
 # split
 num_all=$(wc -l < "${scp}")
+if "${use_fake_segments}"; then
+    num_all=$(wc -l < "${segments}")
+fi
 num_deveval=$((num_dev + num_eval))
 num_train=$((num_all - num_deveval))
 utils/split_data.sh \
diff --git a/egs/yesno/voc1/run.sh b/egs/yesno/voc1/run.sh
index 55e7282b..bbff905a 100755
--- a/egs/yesno/voc1/run.sh
+++ b/egs/yesno/voc1/run.sh
@@ -20,6 +20,9 @@ conf=conf/parallel_wavegan.v1.debug.yaml
 download_dir=downloads # direcotry to save downloaded files
 dumpdir=dump # directory to dump features
 
+# data setting
+use_fake_segments=false # for testing
+
 # training related setting
 tag="" # tag for directory to save model
 resume="" # checkpoint path to resume training
@@ -47,6 +50,7 @@ fi
 if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
     echo "Stage 0: Data preparation"
     local/data_prep.sh \
+        --use_fake_segments "${use_fake_segments}" \
         --train_set "${train_set}" \
         --dev_set "${dev_set}" \
         --eval_set "${eval_set}" \
diff --git a/utils/make_subset_data.sh b/utils/make_subset_data.sh
index b5aeffd1..b510e3a5 100755
--- a/utils/make_subset_data.sh
+++ b/utils/make_subset_data.sh
@@ -22,29 +22,21 @@ num_split=$2
 dst_dir=$3
 
 src_scp=${src_dir}/wav.scp
+num_src_utts=$(wc -l < "${src_scp}")
+has_utt2spk=false
+has_segments=false
+
 if [ -e "${src_dir}/segments" ]; then
     has_segments=true
     src_segments=${src_dir}/segments
-else
-    has_segments=false
+    num_src_utts=$(wc -l < "${src_segments}")
 fi
+
 if [ -e "${src_dir}/utt2spk" ]; then
     has_utt2spk=true
     src_utt2spk=${src_dir}/utt2spk
-else
-    has_utt2spk=false
 fi
-src_scp=${src_dir}/wav.scp
-num_src_utts=$(wc -l < "${src_scp}")
 
-# NOTE: We assume that wav.scp and segments has the same number of lines
-if ${has_segments}; then
-    num_src_segments=$(wc -l < "${src_segments}")
-    if [ "${num_src_segments}" -ne "${num_src_utts}" ]; then
-        echo "ERROR: wav.scp and segments has different #lines (${num_src_utts} vs ${num_src_segments})." >&2
-        exit 1;
-    fi
-fi
 if ${has_utt2spk}; then
     num_src_utt2spk=$(wc -l < "${src_utt2spk}")
     if [ "${num_src_utt2spk}" -ne "${num_src_utts}" ]; then
diff --git a/utils/split_data.sh b/utils/split_data.sh
index 82a0735a..1ba7ae2d 100755
--- a/utils/split_data.sh
+++ b/utils/split_data.sh
@@ -28,7 +28,7 @@ if [ $# -ne 3 ]; then
     exit 1
 fi
 
-set -eu
+set -eux
 
 src_dir=$1
 first_dist_dir=$2
@@ -49,7 +49,6 @@ if [ -e "${src_dir}/utt2spk" ]; then
 else
     has_utt2spk=false
 fi
-src_scp=${src_dir}/wav.scp
 
 if ${has_utt2spk}; then
     num_src_utt2spk=$(wc -l < "${src_utt2spk}")