Skip to content

Commit

Permalink
Merged in mdelcroix (pull request #2)
Browse files Browse the repository at this point in the history
Mdelcroix
  • Loading branch information
nttcslab-sp-admin committed Jul 17, 2015
2 parents 8fc29e6 + 0026e37 commit 3d970b2
Show file tree
Hide file tree
Showing 27 changed files with 1,004 additions and 0 deletions.
25 changes: 25 additions & 0 deletions egs/jsalt15-ffs/s5/cmd.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Job-dispatch configuration for this recipe (sourced by the run scripts).
#
# "queue.pl" uses qsub. The options to it are
# options to qsub. If you have GridEngine installed,
# change this to a queue you have access to.
# Otherwise, use "run.pl", which will run jobs locally
# (make sure your --num-jobs options are no more than
# the number of cpus on your machine).

# On Eddie use:
#export train_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=08:00:00"
#export decode_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4"
#export highmem_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=05:00:00 -pe memory-2G 4"
#export scoring_cmd="queue.pl -P inf_hcrc_cstr_nst -l h_rt=00:20:00"

# On the WS15 cluster (AWS-EC2):
export train_cmd="queue.pl -l arch=*64*"
export decode_cmd="queue.pl -l arch=*64* --mem 4G"  # decoding needs extra memory
export highmem_cmd="queue.pl -l arch=*64* --mem 4G"
export scoring_cmd="queue.pl -l arch=*64*"
export cuda_cmd="queue.pl -q gpu.q"  # GPU queue for neural-net training

# To run everything locally instead, uncomment these:
#export train_cmd=run.pl
#export decode_cmd=run.pl
#export highmem_cmd=run.pl
#export cuda_cmd=run.pl
170 changes: 170 additions & 0 deletions egs/jsalt15-ffs/s5/local/REVERB_mcwsjav_data_prep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
#!/bin/bash

# Copyright 2013 MERL (author: Felix Weninger)
# Contains some code by Microsoft Corporation, Johns Hopkins University (author: Daniel Povey)
#
# Modified: Marc Delcroix NTT Corporation, July 17 2015
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABILITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# Data preparation for the REVERB challenge real data (MC-WSJ-AV).
#
# Usage:
#   local/REVERB_mcwsjav_data_prep.sh <orig-corpus-dir> <enh-corpus-dir> \
#       [<dataset-name>] [<dt-or-et>] [<enhancement-tag>]
#
# Produces, per task file, the Kaldi data files (wav.scp, text, utt2spk,
# spk2utt) under data/<dataset><enhan>/<set>.

dir=$(pwd)/data/local/data
lmdir=$(pwd)/data/local/nist_lm
mkdir -p "$dir" "$lmdir"
local=$(pwd)/local
utils=$(pwd)/utils
root=$(pwd)

. ./path.sh # Needed for KALDI_ROOT
export PATH=$PATH:$KALDI_ROOT/tools/irstlm/bin
sph2pipe=$KALDI_ROOT/tools/sph2pipe_v2.5/sph2pipe
if [ ! -x "$sph2pipe" ]; then
  echo "Could not find (or execute) the sph2pipe program at $sph2pipe"
  exit 1;
fi

cd "$dir" || exit 1

MIC=primary

# original input corpus (original or processed, tr or dt, etc.)
RWSJ_ORIG=$1
if [ ! -d "$RWSJ_ORIG" ]; then
  echo "Could not find directory $RWSJ_ORIG! Check pathnames in corpus.sh!"
  exit 1
fi

# enhanced input corpus (original or processed, tr or dt, etc.)
RWSJ_ENH=$2
if [ ! -d "$RWSJ_ENH" ]; then
  echo "Could not find directory $RWSJ_ENH! Check pathnames in corpus.sh!"
  exit 1
fi

# the name of the dataset to be created
dataset=REVERB_Real_dt
if [ ! -z "$3" ]; then
  dataset=$3
fi

# the WSJCAM0 set that the set is based on (tr, dt, ...)
# this will be used to find the correct transcriptions etc.

# dt or et
dt_or_x=dt
if [ ! -z "$4" ]; then
  dt_or_x=$4
fi


mcwsjav_mlf=$RWSJ_ORIG/mlf/WSJ.mlf

# optional tag naming the enhancement method; becomes a "_<tag>" suffix
enhan=
if [ ! -z "$5" ]; then
  enhan=_$5
fi

# unfortunately, we need a pointer to HTK baseline
# since the corpus does NOT contain the data set descriptions
# for the REVERB Challenge

taskFileDir=$dir/../reverb_tools/ReleasePackage/reverb_tools_for_asr_ver2.0/taskFiles/1ch
#taskFiles=$(ls "$taskFileDir"/*Data_dt_for_*)
# NOTE: word splitting of $taskFiles below is intentional; task-file names
# contain no whitespace.
taskFiles=$(ls "$taskFileDir"/RealData_${dt_or_x}_for_1ch_{far,near}*)

dir2=$dir/${dataset}${enhan}
mkdir -p "$dir2"

for taskFile in $taskFiles; do

  set=$(basename "$taskFile")

  echo "$mcwsjav_mlf"

  # MLF transcription correction
  # taken from HTK baseline script
  sed -e '
  # dos to unix line feed conversion
  s/\x0D$//' \
      -e "
  s/\x60//g # remove unicode character grave accent.
  " \
      -e "
  # fix the single quote for the word yield
  # and the quoted ROOTS
  # e.g. yield' --> yield
  # reason: YIELD' is not in dict, while YIELD is
  s/YIELD'/YIELD/g
  s/'ROOTS'/ROOTS/g
  s/'WHERE/WHERE/g
  s/PEOPLE'/PEOPLE/g
  s/SIT'/SIT/g
  s/'DOMINEE/DOMINEE/g
  s/CHURCH'/CHURCH/g" \
      -e '
  # fix the single missing double full stop issue at the end of an utterance
  # e.g. I. C. N should be I. C. N.
  # reason: N is not in dict, while N. is
  /^[A-Z]$/ {
  # append a line
  N
  # search for single dot on the second line
  /\n\./ {
  # found it - now replace the
  s/\([A-Z]\)\n\./\1\.\n\./
  }
  }' \
      "$mcwsjav_mlf" |\
  perl "$local/mlf2text.pl" > "$dir2/$set.txt1"


  # contains pointer to wav files with relative path --> add absolute path.
  # Pass the prefix via -v instead of splicing it into the awk program
  # (avoids breakage/injection if the path contains quotes; assumes the
  # path contains no backslash escape sequences).
  echo "taskFile = $taskFile"
  awk -v prefix="$RWSJ_ENH" '{print prefix $1}' < "$taskFile" > "$dir2/${set}.flist" || exit 1;

  # this is like flist2scp.pl but it can take wav file list as input
  (perl -e 'while(<>){
  m:^\S+/[\w\-]*_(T\w{6,7})\.wav$: || die "Bad line $_";
  $id = lc $1;
  print "$id $_";
  }' < "$dir2/$set.flist" || exit 1) | sort > "$dir2/${set}_wav.scp"


  # Make the utt2spk and spk2utt files (each utterance is its own speaker).
  awk '{print $1, $1}' < "$dir2/${set}_wav.scp" > "$dir2/$set.utt2spk" || exit 1;
  "$utils/utt2spk_to_spk2utt.pl" < "$dir2/$set.utt2spk" > "$dir2/$set.spk2utt" || exit 1;

  awk '{print $1}' < "$dir2/$set.utt2spk" |\
    "$local/find_transcripts_txt.pl" "$dir2/$set.txt1" | sort | uniq > "$dir2/$set.txt"
  #rm $dir2/$set.txt1

  # Create directory structure required by decoding scripts

  cd "$root" || exit 1
  data_dir=data/$dataset${enhan}/$set
  mkdir -p "$data_dir"
  cp "$dir2/${set}_wav.scp" "${data_dir}/wav.scp" || exit 1;
  cp "$dir2/$set.txt" "${data_dir}/text" || exit 1;
  cp "$dir2/$set.spk2utt" "${data_dir}/spk2utt" || exit 1;
  cp "$dir2/$set.utt2spk" "${data_dir}/utt2spk" || exit 1;

  echo "Data preparation for $set succeeded"
  #echo "Put files into $dir2/$set.*"


done
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/ami_mdm_data_prep.sh
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/ami_mdm_scoring_data_prep.sh
70 changes: 70 additions & 0 deletions egs/jsalt15-ffs/s5/local/chime3_data_prep.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
#!/bin/bash -u

# Copyright 2009-2012 Microsoft Corporation Johns Hopkins University (Author: Daniel Povey)
# Apache 2.0.

# Copyright 2015 Mitsubishi Electric Research Laboratories (MERL) (Author: Shinji Watanabe)
# Apache 2.0.

# Copyright 2015 NTT Corporation (Author: Marc Delcroix)
# Apache 2.0.

# CHiME-3 data preparation: prepares the clean WSJ0 data, dictionary and
# language directory, then creates scp files for the (optionally enhanced)
# real and simulated CHiME-3 data.

. ./cmd.sh
. ./path.sh

# Begin configuration section.
chime3_enh_corpus=   # directory with enhanced speech; defaults to the corpus itself
enhan=noisy          # keyword describing the enhancement process used
channel=.CH5         # reference channel suffix when multi-channel enhancement output exists
# End configuration section.

echo "$0 $@"  # Print the command line for logging
[ -f ./path.sh ] && . ./path.sh; # source the path.
. parse_options.sh || exit 1;

if [ $# != 1 ]; then
  echo "Usage: chime3_data_prep.sh [options] <chime3-corpus>"
  echo "... where <chime3-corpus> is assumed to be the directory where the"
  echo " original reverb corpus is located."
  echo "e.g.: steps/reverb_data_prep.sh /export/REVERB /export/LDC/LDC93S6A/11-13.1"
  echo ""
  echo ""
  echo "main options (for others, see top of script file)"
  echo " --chime3-enh-corpus <reverb-enh-corpus> # directory where the enhanced speech is located."
  echo " --enhan # keyword describing the enhancement process used"
  echo " --channel # reference channel used when multi-channel enhancement output exist"
  exit 1;
fi


chime3_corpus=$1
# Fall back to the original corpus when no enhanced corpus was supplied.
if [ -z "$chime3_enh_corpus" ]; then
  chime3_enh_corpus=$chime3_corpus
fi


# CHiME3 data preparation
wsj0_data=$chime3_corpus/data/WSJ0 # directory of WSJ0 in CHiME3. You can also specify your WSJ0 corpus directory

# process for clean speech and making LMs etc. from original WSJ0
# note that training on clean data means original WSJ0 data only (no booth data)
local/clean_wsj0_data_prep.sh "$wsj0_data" || exit 1;
local/wsj_prepare_dict.sh || exit 1;

utils/prepare_lang.sh data/local/dict "<SPOKEN_NOISE>" data/local/lang_tmp data/lang || exit 1;
local/clean_chime3_format_data.sh || exit 1;


# Create scp files for chime3 task for enhanced speech.
# NOTE: the space before each continuation backslash is required; without it
# the backslash-newline glues the adjacent arguments into a single word.
local/real_mc_enhan_chime3_data_prep.sh "$chime3_corpus" \
  "$chime3_enh_corpus" \
  "$enhan" \
  "$channel" || exit 1;

local/simu_mc_enhan_chime3_data_prep.sh "$chime3_corpus" \
  "$chime3_enh_corpus" \
  "$enhan" \
  "$channel" || exit 1;
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/clean_chime3_format_data.sh
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/clean_wsj0_data_prep.sh
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/convert2stm.pl
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/cstr_ndx2flist.pl
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/english.glm
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/find_noisy_transcripts.pl
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/find_transcripts.pl
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/flist2scp.pl
1 change: 1 addition & 0 deletions egs/jsalt15-ffs/s5/local/normalize_transcript.pl
Loading

0 comments on commit 3d970b2

Please sign in to comment.