Skip to content

Commit

Permalink
Merged in mdelcroix (pull request #2)
Browse files Browse the repository at this point in the history
Mdelcroix
  • Loading branch information
nttcslab-sp-admin committed Jul 17, 2015
2 parents 8fc29e6 + 0026e37 commit 3d970b2
Show file tree
Hide file tree
Showing 27 changed files with 1,004 additions and 0 deletions.
25 changes: 25 additions & 0 deletions egs/jsalt15-ffs/s5/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Job-dispatch configuration for this recipe (sourced by the run scripts).
#
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of cpus on your machine).

# On Eddie use:
#export train_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=08:00:00"
#export decode_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4"
#export highmem_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4"
#export scoring_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=00:20:00"

# On the WS15 cluster (AWS-EC2):
export train_cmd="queue.pl -l arch=*64*"
export decode_cmd="queue.pl -l arch=*64* --mem 4G"  # decoding needs extra memory
export highmem_cmd="queue.pl -l arch=*64* --mem 4G"
export scoring_cmd="queue.pl -l arch=*64*"
export cuda_cmd="queue.pl -q gpu.q"  # GPU queue for neural-net training

# To run everything locally instead, uncomment these:
#export train_cmd=run.pl
#export decode_cmd=run.pl
#export highmem_cmd=run.pl
#export cuda_cmd=run.pl
170 changes: 170 additions & 0 deletions egs/jsalt15-ffs/s5/local/REVERB_mcwsjav_data_prep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
#!/bin/bash

# Copyright 2013 MERL (author: Felix Weninger)
# Contains some code by Microsoft Corporation, Johns Hopkins University (author: Daniel Povey)
#
# Modified: Marc Delcroix NTT Corporation, July 17 2015
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Data preparation for the REVERB challenge real data (MC-WSJ-AV).
#
# Usage:
#   local/REVERB_mcwsjav_data_prep.sh <orig-corpus-dir> <enh-corpus-dir> \
#       [<dataset-name>] [<dt-or-et>] [<enhancement-tag>]
#
# Produces, per task file, the Kaldi data files (wav.scp, text, utt2spk,
# spk2utt) under data/<dataset><enhan>/<set>.

dir=$(pwd)/data/local/data
lmdir=$(pwd)/data/local/nist_lm
mkdir -p "$dir" "$lmdir"
local=$(pwd)/local
utils=$(pwd)/utils
root=$(pwd)

. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x "$sph2pipe" ]; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe"
  exit 1;
fi

cd "$dir" || exit 1

MIC=primary

# original input corpus (original or processed, tr or dt, etc.)
RWSJ_ORIG=$1
if [ ! -d "$RWSJ_ORIG" ]; then
  echo "Could not find directory $RWSJ_ORIG! Check pathnames in corpus.sh!"
  exit 1
fi

# enhanced input corpus (original or processed, tr or dt, etc.)
RWSJ_ENH=$2
if [ ! -d "$RWSJ_ENH" ]; then
  echo "Could not find directory $RWSJ_ENH! Check pathnames in corpus.sh!"
  exit 1
fi

# the name of the dataset to be created
dataset=REVERB_Real_dt
if [ ! -z "$3" ]; then
  dataset=$3
fi

# the WSJCAM0 set that the set is based on (tr, dt, ...)
# this will be used to find the correct transcriptions etc.

# dt or et
dt_or_x=dt
if [ ! -z "$4" ]; then
  dt_or_x=$4
fi


mcwsjav_mlf=$RWSJ_ORIG/mlf/WSJ.mlf

# optional tag naming the enhancement method; becomes a "_<tag>" suffix
enhan=
if [ ! -z "$5" ]; then
  enhan=_$5
fi

# unfortunately, we need a pointer to HTK baseline
# since the corpus does NOT contain the data set descriptions
# for the REVERB Challenge

taskFileDir=$dir/../reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/1ch
#taskFiles=$(ls "$taskFileDir"/*Data_dt_for_*)
# NOTE: word splitting of $taskFiles below is intentional; task-file names
# contain no whitespace.
taskFiles=$(ls "$taskFileDir"/RealData_${dt_or_x}_for_1ch_{far,near}*)

dir2=$dir/${dataset}${enhan}
mkdir -p "$dir2"

for taskFile in $taskFiles; do

  set=$(basename "$taskFile")

  echo "$mcwsjav_mlf"

  # MLF transcription correction
  # taken from HTK baseline script
  sed -e '
  # dos to unix line feed conversion
  s/\x0D$//' \
      -e "
  s/\x60//g # remove unicode character grave accent.
  " \
      -e "
  # fix the single quote for the word yield
  # and the quoted ROOTS
  # e.g. yield' --> yield
  # reason: YIELD' is not in dict, while YIELD is
  s/YIELD'/YIELD/g
  s/'ROOTS'/ROOTS/g
  s/'WHERE/WHERE/g
  s/PEOPLE'/PEOPLE/g
  s/SIT'/SIT/g
  s/'DOMINEE/DOMINEE/g
  s/CHURCH'/CHURCH/g" \
      -e '
  # fix the single missing double full stop issue at the end of an utterance
  # e.g. I. C. N should be I. C. N.
  # reason: N is not in dict, while N. is
  /^[A-Z]$/ {
  # append a line
  N
  # search for single dot on the second line
  /\n\./ {
  # found it - now replace the
  s/\([A-Z]\)\n\./\1\.\n\./
  }
  }' \
      "$mcwsjav_mlf" |\
  perl "$local/mlf2text.pl" > "$dir2/$set.txt1"


  # contains pointer to wav files with relative path --> add absolute path.
  # Pass the prefix via -v instead of splicing it into the awk program
  # (avoids breakage/injection if the path contains quotes; assumes the
  # path contains no backslash escape sequences).
  echo "taskFile = $taskFile"
  awk -v prefix="$RWSJ_ENH" '{print prefix $1}' < "$taskFile" > "$dir2/${set}.flist" || exit 1;

  # this is like flist2scp.pl but it can take wav file list as input
  (perl -e 'while(<>){
  m:^\S+/[\w\-]*_(T\w{6,7})\.wav$: || die "Bad line $_";
  $id = lc $1;
  print "$id $_";
  }' < "$dir2/$set.flist" || exit 1) | sort > "$dir2/${set}_wav.scp"


  # Make the utt2spk and spk2utt files (each utterance is its own speaker).
  awk '{print $1, $1}' < "$dir2/${set}_wav.scp" > "$dir2/$set.utt2spk" || exit 1;
  "$utils/utt2spk_to_spk2utt.pl" < "$dir2/$set.utt2spk" > "$dir2/$set.spk2utt" || exit 1;

  awk '{print $1}' < "$dir2/$set.utt2spk" |\
    "$local/find_transcripts_txt.pl" "$dir2/$set.txt1" | sort | uniq > "$dir2/$set.txt"
  #rm $dir2/$set.txt1

  # Create directory structure required by decoding scripts

  cd "$root" || exit 1
  data_dir=data/$dataset${enhan}/$set
  mkdir -p "$data_dir"
  cp "$dir2/${set}_wav.scp" "${data_dir}/wav.scp" || exit 1;
  cp "$dir2/$set.txt" "${data_dir}/text" || exit 1;
  cp "$dir2/$set.spk2utt" "${data_dir}/spk2utt" || exit 1;
  cp "$dir2/$set.utt2spk" "${data_dir}/utt2spk" || exit 1;

  echo "Data preparation for $set succeeded"
  #echo "Put files into $dir2/$set.*"


done
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/ami_mdm_data_prep.sh
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/ami_mdm_scoring_data_prep.sh
70 changes: 70 additions & 0 deletions egs/jsalt15-ffs/s5/local/chime3_data_prep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/bin/bash -u

# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# Copyright 2015 Mitsubishi Electric Research Laboratories (MERL) (Author: Shinji Watanabe)
# Apache 2.0.

# Copyright 2015 NTT Corporation (Author: Marc Delcroix)
# Apache 2.0.

# CHiME-3 data preparation: prepares the clean WSJ0 data, dictionary and
# language directory, then creates scp files for the (optionally enhanced)
# real and simulated CHiME-3 data.

. ./cmd.sh
. ./path.sh

# Begin configuration section.
chime3_enh_corpus=   # directory with enhanced speech; defaults to the corpus itself
enhan=noisy          # keyword describing the enhancement process used
channel=.CH5         # reference channel suffix when multi-channel enhancement output exists
# End configuration section.

echo "$0 $@"  # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 1 ]; then
  echo "Usage: chime3_data_prep.sh [options] <chime3-corpus>"
  echo "... where <chime3-corpus> is assumed to be the directory where the"
  echo " original reverb corpus is located."
  echo "e.g.: steps/reverb_data_prep.sh /export/REVERB /export/LDC/LDC93S6A/11-13.1"
  echo ""
  echo ""
  echo "main options (for others, see top of script file)"
  echo " --chime3-enh-corpus <reverb-enh-corpus> # directory where the enhanced speech is located."
  echo " --enhan # keyword describing the enhancement process used"
  echo " --channel # reference channel used when multi-channel enhancement output exist"
  exit 1;
fi


chime3_corpus=$1
# Fall back to the original corpus when no enhanced corpus was supplied.
if [ -z "$chime3_enh_corpus" ]; then
  chime3_enh_corpus=$chime3_corpus
fi


# CHiME3 data preparation
wsj0_data=$chime3_corpus/data/WSJ0 # directory of WSJ0 in CHiME3. You can also specify your WSJ0 corpus directory

# process for clean speech and making LMs etc. from original WSJ0
# note that training on clean data means original WSJ0 data only (no booth data)
local/clean_wsj0_data_prep.sh "$wsj0_data" || exit 1;
local/wsj_prepare_dict.sh || exit 1;

utils/prepare_lang.sh data/local/dict "<SPOKEN_NOISE>" data/local/lang_tmp data/lang || exit 1;
local/clean_chime3_format_data.sh || exit 1;


# Create scp files for chime3 task for enhanced speech.
# NOTE: the space before each continuation backslash is required; without it
# the backslash-newline glues the adjacent arguments into a single word.
local/real_mc_enhan_chime3_data_prep.sh "$chime3_corpus" \
  "$chime3_enh_corpus" \
  "$enhan" \
  "$channel" || exit 1;

local/simu_mc_enhan_chime3_data_prep.sh "$chime3_corpus" \
  "$chime3_enh_corpus" \
  "$enhan" \
  "$channel" || exit 1;
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/clean_chime3_format_data.sh
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/clean_wsj0_data_prep.sh
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/convert2stm.pl
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/cstr_ndx2flist.pl
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/english.glm
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/find_noisy_transcripts.pl
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/find_transcripts.pl
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/flist2scp.pl
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/normalize_transcript.pl
Loading

0 comments on commit 3d970b2

Please sign in to comment.