diff --git a/egs/voxceleb/v1.1/run_001_prepare_data.sh b/egs/voxceleb/v1.1/run_001_prepare_data.sh index a87531d0..dd9937b7 100755 --- a/egs/voxceleb/v1.1/run_001_prepare_data.sh +++ b/egs/voxceleb/v1.1/run_001_prepare_data.sh @@ -15,14 +15,16 @@ config_file=default_config.sh if [ $stage -le 1 ];then - - # Prepare the VoxCeleb2 dataset for training. - local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train - # local/make_voxceleb2cat.pl $voxceleb2_root test 16 data/voxceleb2cat_test - # utils/combine_data.sh data/voxceleb2cat data/voxceleb2cat_train data/voxceleb2cat_test + # Prepare the VoxCeleb2 dataset for training. + local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train + # local/make_voxceleb2cat.pl $voxceleb2_root test 16 data/voxceleb2cat_test + # utils/combine_data.sh data/voxceleb2cat data/voxceleb2cat_train data/voxceleb2cat_test fi if [ $stage -le 2 ];then - # prepare voxceleb1 for test - local/make_voxceleb1_oeh.pl $voxceleb1_root data + # prepare voxceleb1 for test + # This script is for the old version of the dataset + local/make_voxceleb1_oeh.pl $voxceleb1_root data + # Use this for the newer version of voxceleb1: + # local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data fi diff --git a/egs/voxceleb/v1/local/make_voxceleb1_o.pl b/egs/voxceleb/v1/local/make_voxceleb1_o.pl index 93cbc83f..dce92245 100755 --- a/egs/voxceleb/v1/local/make_voxceleb1_o.pl +++ b/egs/voxceleb/v1/local/make_voxceleb1_o.pl @@ -5,8 +5,13 @@ # 2020 Jesus Villalba # # Usage: make_voxceleb1.pl /export/voxceleb1 data/ -# Create trial lists for Voxceleb1 original, Entire (E) and hard (H), +# Create trial lists for Voxceleb1 original, # with cleaned and non-cleaned versions +# Attention: +# - This script is for the old version of the dataset without anonymized speaker-ids +# - This script assumes that the voxceleb1 dataset has all speaker directories +# dumped in the same wav directory, NOT separated dev and test directories + if (@ARGV != 2) { print STDERR "Usage: $0 \n"; @@ -26,18 +31,47 @@ my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt"); my @trials = ("trials_o", "trials_o_clean"); -open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; +my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv"; +my $meta_path = "$data_base/vox1_meta.csv"; +if (! -e "$meta_path") { + $meta_path = "$out_dir/vox1_meta.csv"; + system("wget -O $meta_path $meta_url"); +} + +open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; my %id2spkr = (); +my %spkr2gender = (); +my %spkr2nation = (); while () { - chomp; - my ($vox_id, $spkr_id, $gender, $nation, $set) = split; - $id2spkr{$vox_id} = $spkr_id; - + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t"; + $id2spkr{$vox_id} = $spkr_id; + $spkr2gender{$spkr_id} = $gender; + $nation =~ s@ @-@g; + $spkr2nation{$spkr_id} = $nation; } close(META_IN) or die; +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv"; +my $lid_path = "$data_base/lang_vox1_final.csv"; +if (! -e "$lid_path") { + $lid_path = "$out_dir/lang_vox1_final.csv"; + system("wget -O $lid_path $lid_url"); +} +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; +my %utt2lang = (); +while () { + chomp; + my ($utt_id, $lang, $score) = split ','; + my ($vox_id, $vid_id, $file_id) = split '/', $utt_id; + my $spkr_id = $id2spkr{$vox_id}; + my $utt_id = "$spkr_id-$vid_id-00$file_id"; + $utt_id =~ s@\.wav$@@; + $utt2lang{$utt_id} = $lang; +} +close(LID_IN) or die; + #download trials from voxceleb web page -my %valid_utts = (); for($i = 0; $i <= $#trials; $i++) { my $file_i = "$out_dir/$trials_basename[$i]"; @@ -70,8 +104,6 @@ $target = "target"; } print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; - $valid_utts{$utt_id1} = 1; - $valid_utts{$utt_id2} = 1; } close(TRIAL_IN) or die; @@ -84,8 +116,11 @@ my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); closedir $dh; -open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; -open(WAV_TEST, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; +open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation"; +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; foreach (@spkr_dirs) { my $spkr_id = $_; @@ -95,6 +130,9 @@ if (exists $id2spkr{$spkr_id}) { $new_spkr_id = $id2spkr{$spkr_id}; } + print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n"; + print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n"; + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); closedir $dh; @@ -104,15 +142,22 @@ my $segment = substr($filename, 12, 7); my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; my $utt_id = "$new_spkr_id-$rec_id-$segment"; - if (exists $valid_utts{$utt_id}) { - print WAV_TEST "$utt_id", " $wav", "\n"; - print SPKR_TEST "$utt_id", " $new_spkr_id", "\n"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $new_spkr_id", "\n"; + if (exists $utt2lang{$utt_id}) { + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; + } + else { + print LANG "$utt_id N/A\n"; } } } -close(SPKR_TEST) or die; -close(WAV_TEST) or die; +close(SPKR) or die; +close(WAV) or die; +close(LANG) or die; +close(GENDER) or die; +close(NAT) or die; if (system( "cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) { diff --git a/egs/voxceleb/v1/local/make_voxceleb1_oeh.pl b/egs/voxceleb/v1/local/make_voxceleb1_oeh.pl index 25ffa642..760ab397 100755 --- a/egs/voxceleb/v1/local/make_voxceleb1_oeh.pl +++ b/egs/voxceleb/v1/local/make_voxceleb1_oeh.pl @@ -7,6 +7,10 @@ # Usage: make_voxceleb1.pl /export/voxceleb1 data/ # Create trial lists for Voxceleb1 original, Entire (E) and hard (H), # with cleaned and non-cleaned versions +# Attention: +# - This script is for the old version of the dataset without anonymized speaker-ids +# - This script assumes that the voxceleb1 dataset has all speaker directories +# dumped in the same wav directory, NOT separated dev and test directories if (@ARGV != 2) { print STDERR "Usage: $0 \n"; @@ -26,16 +30,46 @@ my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt", "$url_base/list_test_hard.txt", "$url_base/list_test_hard2.txt", "$url_base/list_test_all.txt", "$url_base/list_test_all2.txt"); my @trials = ("trials_o", "trials_o_clean", "trials_h", "trials_h_clean", "trials_e", "trials_e_clean"); -open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; +my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv"; +my $meta_path = "$data_base/vox1_meta.csv"; +if (! -e "$meta_path") { + $meta_path = "$out_dir/vox1_meta.csv"; + system("wget -O $meta_path $meta_url"); +} + +open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; my %id2spkr = (); +my %spkr2gender = (); +my %spkr2nation = (); while () { - chomp; - my ($vox_id, $spkr_id, $gender, $nation, $set) = split; - $id2spkr{$vox_id} = $spkr_id; - + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t"; + $id2spkr{$vox_id} = $spkr_id; + $spkr2gender{$spkr_id} = $gender; + $nation =~ s@ @-@g; + $spkr2nation{$spkr_id} = $nation; } close(META_IN) or die; +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv"; +my $lid_path = "$data_base/lang_vox1_final.csv"; +if (! -e "$lid_path") { + $lid_path = "$out_dir/lang_vox1_final.csv"; + system("wget -O $lid_path $lid_url"); +} +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; +my %utt2lang = (); +while () { + chomp; + my ($utt_id, $lang, $score) = split ','; + my ($vox_id, $vid_id, $file_id) = split '/', $utt_id; + my $spkr_id = $id2spkr{$vox_id}; + my $utt_id = "$spkr_id-$vid_id-00$file_id"; + $utt_id =~ s@\.wav$@@; + $utt2lang{$utt_id} = $lang; +} +close(LID_IN) or die; + #download trials from voxceleb web page for($i = 0; $i <= $#trials; $i++) { @@ -81,8 +115,11 @@ my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); closedir $dh; -open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; -open(WAV_TEST, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; +open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation"; +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; foreach (@spkr_dirs) { my $spkr_id = $_; @@ -92,6 +129,9 @@ if (exists $id2spkr{$spkr_id}) { $new_spkr_id = $id2spkr{$spkr_id}; } + print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n"; + print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n"; + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); closedir $dh; @@ -101,13 +141,22 @@ my $segment = substr($filename, 12, 7); my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; my $utt_id = "$new_spkr_id-$rec_id-$segment"; - print WAV_TEST "$utt_id", " $wav", "\n"; - print SPKR_TEST "$utt_id", " $new_spkr_id", "\n"; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $new_spkr_id", "\n"; + if (exists $utt2lang{$utt_id}) { + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; + } + else { + print LANG "$utt_id N/A\n"; + } } } -close(SPKR_TEST) or die; -close(WAV_TEST) or die; +close(SPKR) or die; +close(WAV) or die; +close(LANG) or die; +close(GENDER) or die; +close(NAT) or die; if (system( "cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) { diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2.pl b/egs/voxceleb/v1/local/make_voxceleb1_v2.pl new file mode 100755 index 00000000..74ee23c1 --- /dev/null +++ b/egs/voxceleb/v1/local/make_voxceleb1_v2.pl @@ -0,0 +1,142 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# 2018 Jesus Villalba +# +# Apache 2.0 +# Usage: make_voxceleb1_v2.pl /export/voxceleb1 data/ +# Attention: +# - This script is for the recent version of the dataset +# - This version of the script does NOT remove SITW overlap speakers +# - Files from the same video are NOT concatenated into 1 segment +# - This script assumes that the voxceleb1 dataset has all speaker directories dumped in the same wav directory, NOT separated dev and test directories + +if (@ARGV != 3) { + print STDERR "Usage: $0 fs \n"; + print STDERR "e.g. $0 /export/voxceleb1 16 data/\n"; + exit(1); +} + +($data_base, $fs, $out_dir) = @ARGV; +my $out_dir = "$out_dir/voxceleb1_train"; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_train_dir"; +} + +my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv"; +my $meta_path = "$data_base/vox1_meta.csv"; +if (! -e "$meta_path") { + $meta_path = "$out_dir/vox1_meta.csv"; + system("wget -O $meta_path $meta_url"); +} + +open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; + +my %id2spkr = (); +my $test_spkrs = (); +my %spkr2gender = (); +my %spkr2nation = (); +while () { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t"; + $spkr2gender{$vox_id} = $gender; + $nation =~ s@ @-@g; + $spkr2nation{$vox_id} = $nation; + if ( $set eq "test"){ + $test_spkrs{$vox_id} = (); + } +} +close(META_IN) or die; + +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv"; +my $lid_path = "$data_base/lang_vox1_final.csv"; +if (! -e "$lid_path") { + $lid_path = "$out_dir/lang_vox1_final.csv"; + system("wget -O $lid_path $lid_url"); +} +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; +my %utt2lang = (); +while () { + chomp; + my ($utt_id, $lang, $score) = split ','; + my ($spkr_id, $vid_id, $file_id) = split '/', $utt_id; + $file_id =~ s@\.wav$@@; + my $utt_id = "$spkr_id-$vid_id-$file_id"; + $utt2lang{$utt_id} = $lang; +} +close(LID_IN) or die; + +my $wav_dir = "$data_base/wav"; +opendir my $dh, "$wav_dir" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$wav_dir/$_" && ! /^\.{1,2}$/ || -l "$wav_dir/$_" } readdir($dh); +closedir $dh; + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; +open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + + next if (exists $test_spkrs{$spkr_id}); + + print GENDER "$spkr_id $spkr2gender{$spkr_id}\n"; + print NAT "$spkr_id $spkr2nation{$spkr_id}\n"; + + my $spkr_dir = "$wav_dir/$spkr_id"; + opendir my $dh, "$spkr_dir" or die "Cannot open directory: $!"; + my @vid_dirs = grep {-d "$spkr_dir/$_" && ! /^\.{1,2}$/ } readdir($dh); + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@vid_dirs) { + my $vid_id = $_; + my $vid_dir = "$spkr_dir/$vid_id"; + opendir my $dh, "$vid_dir" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $segment = $_; + my $wav = "$vid_dir/$segment.wav"; + my $utt_id = "$spkr_id-$vid_id-$segment"; + if($fs == 8){ + $wav = "sox " . $wav . " -t wav -r 8k - |"; + } + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + if (exists $utt2lang{$utt_id}) { + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; + } + else { + print LANG "$utt_id N/A\n"; + } + } + } +} +close(GENDER) or die; +close(NAT) or die; +close(SPKR) or die; +close(WAV) or die; +close(LANG) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} + +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl b/egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl new file mode 100755 index 00000000..9ab37221 --- /dev/null +++ b/egs/voxceleb/v1/local/make_voxceleb1_v2_o.pl @@ -0,0 +1,211 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# 2020 Jesus Villalba +# +# Usage: make_voxceleb1.pl /export/voxceleb1 data/ +# Create trial lists for Voxceleb1 original, +# with cleaned and non-cleaned versions +# Attention: +# - This script is for the recent version of the dataset +# - This script assumes that the voxceleb1 dataset has all speaker directories +# dumped in the same wav directory, NOT separated dev and test directories + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_dir = "$out_dir/voxceleb1_test"; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +my $url_base="http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta"; +my @trials_basename = ("very_test.txt", "very_test2.txt", "list_test_hard.txt", "list_test_hard2.txt", "list_test_all.txt", "list_test_all2.txt"); +my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt"); +my @trials = ("trials_o", "trials_o_clean"); + +my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv"; +my $meta_path = "$data_base/vox1_meta.csv"; +if (! -e "$meta_path") { + $meta_path = "$out_dir/vox1_meta.csv"; + system("wget -O $meta_path $meta_url"); +} + +open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; +my %id2spkr = (); +my %spkr2gender = (); +my %spkr2nation = (); +while () { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t"; + $spkr2gender{$vox_id} = $gender; + $nation =~ s@ @-@g; + $spkr2nation{$vox_id} = $nation; +} +close(META_IN) or die; + +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv"; +my $lid_path = "$data_base/lang_vox1_final.csv"; +if (! -e "$lid_path") { + $lid_path = "$out_dir/lang_vox1_final.csv"; + system("wget -O $lid_path $lid_url"); +} +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; +my %utt2lang = (); +while () { + chomp; + my ($utt_id, $lang, $score) = split ','; + my ($vox_id, $vid_id, $file_id) = split '/', $utt_id; + my $utt_id = "$vox_id-$vid_id-$file_id"; + $utt_id =~ s@\.wav$@@; + $utt2lang{$utt_id} = $lang; +} +close(LID_IN) or die; + +#download trials from voxceleb web page +for($i = 0; $i <= $#trials; $i++) { + + my $file_i = "$out_dir/$trials_basename[$i]"; + my $url_i = $trials_url[$i]; + my $trial_i = "$out_dir/$trials[$i]"; + if (! -e $file_i) { + system("wget -O $file_i $url_i"); + } + #mapping from new speaker ids and file-names to old ones + open(TRIAL_IN, "<", "$file_i") or die "Could not open the verification trials file $file_i"; + open(TRIAL_OUT, ">", "$trial_i") or die "Could not open the output file $trial_i"; + while () { + chomp; + my ($tar_or_non, $path1, $path2) = split; + + # Create entry for left-hand side of trial + my ($spkr_id, $rec_id, $segment) = split('/', $path1); + $segment =~ s/\.wav$//; + my $utt_id1 = "$spkr_id-$rec_id-$segment"; + + # Create entry for right-hand side of trial + my ($spkr_id, $rec_id, $segment) = split('/', $path2); + $segment =~ s/\.wav$//; + my $utt_id2 = "$spkr_id-$rec_id-$segment"; + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; + } + + close(TRIAL_IN) or die; + close(TRIAL_OUT) or die; + +} + +my $wav_dir = "$data_base/wav"; +opendir my $dh, "$wav_dir" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$wav_dir/$_" && ! /^\.{1,2}$/ || -l "$wav_dir/$_" } readdir($dh); +closedir $dh; + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; +open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation"; +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + + print GENDER "$spkr_id $spkr2gender{$spkr_id}\n"; + print NAT "$spkr_id $spkr2nation{$spkr_id}\n"; + + my $spkr_dir = "$wav_dir/$spkr_id"; + opendir my $dh, "$spkr_dir" or die "Cannot open directory: $!"; + my @vid_dirs = grep {-d "$spkr_dir/$_" && ! /^\.{1,2}$/ } readdir($dh); + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@vid_dirs) { + my $vid_id = $_; + my $vid_dir = "$spkr_dir/$vid_id"; + opendir my $dh, "$vid_dir" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $segment = $_; + my $wav = "$vid_dir/$segment.wav"; + my $utt_id = "$spkr_id-$vid_id-$segment"; + if($fs == 8){ + $wav = "sox " . $wav . " -t wav -r 8k - |"; + } + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + if (exists $utt2lang{$utt_id}) { + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; + } + else { + print LANG "$utt_id N/A\n"; + } + } + } +} + +# foreach (@spkr_dirs) { +# my $spkr_id = $_; +# my $new_spkr_id = $spkr_id; +# # If we're using a newer version of VoxCeleb1, we need to "deanonymize" +# # the speaker labels. +# if (exists $id2spkr{$spkr_id}) { +# $new_spkr_id = $id2spkr{$spkr_id}; +# } +# print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n"; +# print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n"; + +# opendir my $dh, "$wav_dir/$spkr_id/" or die "Cannot open directory: $!"; +# my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); +# closedir $dh; +# foreach (@files) { +# my $filename = $_; +# my $rec_id = substr($filename, 0, 11); +# my $segment = substr($filename, 12, 7); +# my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; +# my $utt_id = "$new_spkr_id-$rec_id-$segment"; +# print WAV "$utt_id", " $wav", "\n"; +# print SPKR "$utt_id", " $new_spkr_id", "\n"; +# if (exists $utt2lang{$utt_id}) { +# print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; +# } +# else { +# print LANG "$utt_id N/A\n"; +# } +# } +# } + +close(SPKR) or die; +close(WAV) or die; +close(LANG) or die; +close(GENDER) or die; +close(NAT) or die; + +if (system( + "cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) { + die "Error creating trials file in directory $out_dir"; +} + +if (system( + "awk '{ print \$1,\$1 }' $out_dir/trials | sort -u > $out_dir/utt2model") != 0) { + die "Error creating utt2model file in directory $out_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} + diff --git a/egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl b/egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl new file mode 100755 index 00000000..247ad30a --- /dev/null +++ b/egs/voxceleb/v1/local/make_voxceleb1_v2_oeh.pl @@ -0,0 +1,211 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# 2020 Jesus Villalba +# +# Usage: make_voxceleb1.pl /export/voxceleb1 data/ +# Create trial lists for Voxceleb1 original, Entire (E) and hard (H), +# with cleaned and non-cleaned versions +# Attention: +# - This script is for the recent version of the dataset +# - This script assumes that the voxceleb1 dataset has all speaker directories +# dumped in the same wav directory, NOT separated dev and test directories + +if (@ARGV != 2) { + print STDERR "Usage: $0 \n"; + print STDERR "e.g. $0 /export/voxceleb1 data/\n"; + exit(1); +} + +($data_base, $out_dir) = @ARGV; +my $out_dir = "$out_dir/voxceleb1_test"; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_dir"; +} + +my $url_base="http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta"; +my @trials_basename = ("very_test.txt", "very_test2.txt", "list_test_hard.txt", "list_test_hard2.txt", "list_test_all.txt", "list_test_all2.txt"); +my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt", "$url_base/list_test_hard.txt", "$url_base/list_test_hard2.txt", "$url_base/list_test_all.txt", "$url_base/list_test_all2.txt"); +my @trials = ("trials_o", "trials_o_clean", "trials_h", "trials_h_clean", "trials_e", "trials_e_clean"); + +my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv"; +my $meta_path = "$data_base/vox1_meta.csv"; +if (! -e "$meta_path") { + $meta_path = "$out_dir/vox1_meta.csv"; + system("wget -O $meta_path $meta_url"); +} + +open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; +my %id2spkr = (); +my %spkr2gender = (); +my %spkr2nation = (); +while () { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t"; + $spkr2gender{$vox_id} = $gender; + $nation =~ s@ @-@g; + $spkr2nation{$vox_id} = $nation; +} +close(META_IN) or die; + +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv"; +my $lid_path = "$data_base/lang_vox1_final.csv"; +if (! -e "$lid_path") { + $lid_path = "$out_dir/lang_vox1_final.csv"; + system("wget -O $lid_path $lid_url"); +} +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; +my %utt2lang = (); +while () { + chomp; + my ($utt_id, $lang, $score) = split ','; + my ($vox_id, $vid_id, $file_id) = split '/', $utt_id; + my $utt_id = "$vox_id-$vid_id-$file_id"; + $utt_id =~ s@\.wav$@@; + $utt2lang{$utt_id} = $lang; +} +close(LID_IN) or die; + +#download trials from voxceleb web page +for($i = 0; $i <= $#trials; $i++) { + + my $file_i = "$out_dir/$trials_basename[$i]"; + my $url_i = $trials_url[$i]; + my $trial_i = "$out_dir/$trials[$i]"; + if (! -e $file_i) { + system("wget -O $file_i $url_i"); + } + #mapping from new speaker ids and file-names to old ones + open(TRIAL_IN, "<", "$file_i") or die "Could not open the verification trials file $file_i"; + open(TRIAL_OUT, ">", "$trial_i") or die "Could not open the output file $trial_i"; + while () { + chomp; + my ($tar_or_non, $path1, $path2) = split; + + # Create entry for left-hand side of trial + my ($spkr_id, $rec_id, $segment) = split('/', $path1); + $segment =~ s/\.wav$//; + my $utt_id1 = "$spkr_id-$rec_id-$segment"; + + # Create entry for right-hand side of trial + my ($spkr_id, $rec_id, $segment) = split('/', $path2); + $segment =~ s/\.wav$//; + my $utt_id2 = "$spkr_id-$rec_id-$segment"; + + my $target = "nontarget"; + if ($tar_or_non eq "1") { + $target = "target"; + } + print TRIAL_OUT "$utt_id1 $utt_id2 $target\n"; + } + + close(TRIAL_IN) or die; + close(TRIAL_OUT) or die; + +} + +my $wav_dir = "$data_base/wav"; +opendir my $dh, "$wav_dir" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$wav_dir/$_" && ! /^\.{1,2}$/ || -l "$wav_dir/$_" } readdir($dh); +closedir $dh; + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; +open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation"; +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; + +foreach (@spkr_dirs) { + my $spkr_id = $_; + + print GENDER "$spkr_id $spkr2gender{$spkr_id}\n"; + print NAT "$spkr_id $spkr2nation{$spkr_id}\n"; + + my $spkr_dir = "$wav_dir/$spkr_id"; + opendir my $dh, "$spkr_dir" or die "Cannot open directory: $!"; + my @vid_dirs = grep {-d "$spkr_dir/$_" && ! /^\.{1,2}$/ } readdir($dh); + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@vid_dirs) { + my $vid_id = $_; + my $vid_dir = "$spkr_dir/$vid_id"; + opendir my $dh, "$vid_dir" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $segment = $_; + my $wav = "$vid_dir/$segment.wav"; + my $utt_id = "$spkr_id-$vid_id-$segment"; + if($fs == 8){ + $wav = "sox " . $wav . " -t wav -r 8k - |"; + } + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + if (exists $utt2lang{$utt_id}) { + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; + } + else { + print LANG "$utt_id N/A\n"; + } + } + } +} + +# foreach (@spkr_dirs) { +# my $spkr_id = $_; +# my $new_spkr_id = $spkr_id; +# # If we're using a newer version of VoxCeleb1, we need to "deanonymize" +# # the speaker labels. +# if (exists $id2spkr{$spkr_id}) { +# $new_spkr_id = $id2spkr{$spkr_id}; +# } +# print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n"; +# print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n"; + +# opendir my $dh, "$wav_dir/$spkr_id/" or die "Cannot open directory: $!"; +# my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); +# closedir $dh; +# foreach (@files) { +# my $filename = $_; +# my $rec_id = substr($filename, 0, 11); +# my $segment = substr($filename, 12, 7); +# my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav"; +# my $utt_id = "$new_spkr_id-$rec_id-$segment"; +# print WAV "$utt_id", " $wav", "\n"; +# print SPKR "$utt_id", " $new_spkr_id", "\n"; +# if (exists $utt2lang{$utt_id}) { +# print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; +# } +# else { +# print LANG "$utt_id N/A\n"; +# } +# } +# } + +close(SPKR) or die; +close(WAV) or die; +close(LANG) or die; +close(GENDER) or die; +close(NAT) or die; + +if (system( + "cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) { + die "Error creating trials file in directory $out_dir"; +} + +if (system( + "awk '{ print \$1,\$1 }' $out_dir/trials | sort -u > $out_dir/utt2model") != 0) { + die "Error creating utt2model file in directory $out_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} + diff --git a/egs/voxceleb/v1/local/make_voxceleb1cat.pl b/egs/voxceleb/v1/local/make_voxceleb1cat.pl index d8072942..59cbf0db 100755 --- a/egs/voxceleb/v1/local/make_voxceleb1cat.pl +++ b/egs/voxceleb/v1/local/make_voxceleb1cat.pl @@ -6,8 +6,12 @@ # # Apache 2.0 # Usage: make_voxceleb1cat.pl /export/voxceleb1 data/ -# This version of the script does NOT remove SITW overlap speakers -# Files from the same video are concatenated into 1 segment +# Attention: +# - This script is for the old version of the dataset without anonymized speaker-ids +# - This version of the script does NOT remove SITW overlap speakers +# - Files from the same video are concatenated into 1 segment +# - This script assumes that the voxceleb1 dataset has all speaker directories +# dumped in the same wav directory, NOT separated dev and test directories if (@ARGV != 3) { print STDERR "Usage: $0 fs \n"; @@ -22,31 +26,57 @@ die "Error making directory $out_train_dir"; } -opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; -my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); -closedir $dh; - -if (! -e "$data_base/voxceleb1_test.txt") { - system("wget -O $data_base/voxceleb1_test.txt http://www.openslr.org/resources/49/voxceleb1_test.txt"); +my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv"; +my $meta_path = "$data_base/vox1_meta.csv"; +if (! -e "$meta_path") { + $meta_path = "$out_dir/vox1_meta.csv"; + system("wget -O $meta_path $meta_url"); } -if (! -e "$data_base/vox1_meta.csv") { - system("wget -O $data_base/vox1_meta.csv http://www.openslr.org/resources/49/vox1_meta.csv"); -} - -open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv"; +open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; my %id2spkr = (); my $test_spkrs = (); +my %spkr2gender = (); +my %spkr2nation = (); while () { chomp; - my ($vox_id, $spkr_id, $gender, $nation, $set) = split; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t"; $id2spkr{$vox_id} = $spkr_id; + $spkr2gender{$spkr_id} = $gender; + $nation =~ s@ @-@g; + $spkr2nation{$spkr_id} = $nation; if ( $set eq "test"){ $test_spkrs{$spkr_id} = (); } } +close(META_IN) or die; + +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv"; +my $lid_path = "$data_base/lang_vox1_final.csv"; +if (! -e "$lid_path") { + $lid_path = "$out_dir/lang_vox1_final.csv"; + system("wget -O $lid_path $lid_url"); +} +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; +my %utt2lang = (); +while () { + chomp; + my ($utt_id, $lang, $score) = split ','; + my ($vox_id, $vid_id, $file_id) = split '/', $utt_id; + my $spkr_id = $id2spkr{$vox_id}; + my $utt_id = "$spkr_id-$vid_id"; + $utt2lang{$utt_id} = $lang; +} +close(LID_IN) or die; + +opendir my $dh, "$data_base/voxceleb1_wav" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; + +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; +open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation"; my %rec2utt = (); my %rec2spk = (); @@ -56,6 +86,9 @@ if (exists $id2spkr{$spkr_id}) { $new_spkr_id = $id2spkr{$spkr_id}; } + print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n"; + print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n"; + opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!"; my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); closedir $dh; @@ -76,9 +109,12 @@ } } } +close(GENDER) or die; +close(NAT) or die; open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; foreach my $utt_id (keys %rec2spk) { my $wav = ""; @@ -89,12 +125,20 @@ $wav = "sox " . $rec2utt{$utt_id} . " -t wav - |"; } my $spkr_id = $rec2spk{$utt_id}; + my $land_id = $utt2lang{$utt_id}; print WAV "$utt_id", " $wav", "\n"; print SPKR "$utt_id", " $spkr_id", "\n"; + if (exists $utt2lang{$utt_id}) { + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; + } + else { + print LANG "$utt_id N/A\n"; + } } close(SPKR) or die; close(WAV) or die; +close(LANG) or die; if (system( "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { diff --git a/egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl b/egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl new file mode 100755 index 00000000..e5baa746 --- /dev/null +++ b/egs/voxceleb/v1/local/make_voxceleb1cat_v2.pl @@ -0,0 +1,162 @@ +#!/usr/bin/perl +# +# Copyright 2018 Ewald Enzinger +# 2018 David Snyder +# 2018 Jesus Villalba +# +# Apache 2.0 +# Usage: make_voxceleb1cat_v2.pl /export/voxceleb1 data/ +# Attention: +# - This script is for the recent version of the dataset +# - This version of the script does NOT remove SITW overlap speakers +# - Files from the same video are concatenated into 1 segment +# - This script assumes that the voxceleb1 dataset has all speaker directories +# dumped in the same wav directory, NOT separated dev and test directories + +if (@ARGV != 3) { + print STDERR "Usage: $0 fs \n"; + print STDERR "e.g. $0 /export/voxceleb1 16 data/\n"; + exit(1); +} + +($data_base, $fs, $out_dir) = @ARGV; +my $out_dir = "$out_dir/voxceleb1cat_train"; + +if (system("mkdir -p $out_dir") != 0) { + die "Error making directory $out_train_dir"; +} + +my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv"; +my $meta_path = "$data_base/vox1_meta.csv"; +if (! -e "$meta_path") { + $meta_path = "$out_dir/vox1_meta.csv"; + system("wget -O $meta_path $meta_url"); +} + +open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path"; + +my %id2spkr = (); +my $test_spkrs = (); +my %spkr2gender = (); +my %spkr2nation = (); +while () { + chomp; + my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t"; + $spkr2gender{$vox_id} = $gender; + $nation =~ s@ @-@g; + $spkr2nation{$vox_id} = $nation; + if ( $set eq "test"){ + $test_spkrs{$vox_id} = (); + } +} +close(META_IN) or die; + +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv"; +my $lid_path = "$data_base/lang_vox1_final.csv"; +if (! -e "$lid_path") { + $lid_path = "$out_dir/lang_vox1_final.csv"; + system("wget -O $lid_path $lid_url"); +} +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; +my %utt2lang = (); +while () { + chomp; + my ($utt_id, $lang, $score) = split ','; + my ($spkr_id, $vid_id, $file_id) = split '/', $utt_id; + my $utt_id = "$spkr_id-$vid_id"; + $utt2lang{$utt_id} = $lang; +} +close(LID_IN) or die; + +my $wav_dir = "$data_base/wav"; +opendir my $dh, "$wav_dir" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$wav_dir/$_" && ! /^\.{1,2}$/ || -l "$wav_dir/$_" } readdir($dh); +closedir $dh; + +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; +open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation"; + +my %utt2wav = (); +my %utt2spk = (); +foreach (@spkr_dirs) { + my $spkr_id = $_; + + print GENDER "$spkr_id $spkr2gender{$spkr_id}\n"; + print NAT "$spkr_id $spkr2nation{$spkr_id}\n"; + + my $spkr_dir = "$wav_dir/$spkr_id"; + opendir my $dh, "$spkr_dir" or die "Cannot open directory: $!"; + my @vid_dirs = grep {-d "$spkr_dir/$_" && ! /^\.{1,2}$/ } readdir($dh); + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@vid_dirs) { + my $vid_id = $_; + my $vid_dir = "$spkr_dir/$vid_id"; + opendir my $dh, "$vid_dir" or die "Cannot open directory: $!"; + my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh); + closedir $dh; + foreach (@files) { + my $segment = $_; + my $wav = "$vid_dir/$segment.wav"; + my $utt_id = "$spkr_id-$vid_id"; + if (not exists $test_spkrs{$spkr_id}) { + if (not exists $utt2wav{$utt_id}) { + $utt2spk{$utt_id} = $spkr_id; + $utt2wav{$utt_id} = $wav + } + else { + $utt2wav{$utt_id} = $utt2wav{$utt_id} . " " . $wav + } + } + } + } +} +close(GENDER) or die; +close(NAT) or die; + +open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; +open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; + +foreach my $utt_id (keys %utt2spk) { + my $wav = ""; + if($fs == 8){ + $wav = "sox " . $utt2wav{$utt_id} . " -t wav -r 8k - |"; + } + else{ + $wav = "sox " . $utt2wav{$utt_id} . " -t wav - |"; + } + my $spkr_id = $utt2spk{$utt_id}; + my $land_id = $utt2lang{$utt_id}; + print WAV "$utt_id", " $wav", "\n"; + print SPKR "$utt_id", " $spkr_id", "\n"; + if (exists $utt2lang{$utt_id}) { + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; + } + else { + print LANG "$utt_id N/A\n"; + } +} + +close(SPKR) or die; +close(WAV) or die; +close(LANG) or die; + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} + +if (system( + "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { + die "Error creating spk2utt file in directory $out_dir"; +} + +system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir"); +if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) { + die "Error validating directory $out_dir"; +} diff --git a/egs/voxceleb/v1/local/make_voxceleb2.pl b/egs/voxceleb/v1/local/make_voxceleb2.pl index d88a78ce..e0ebeb0f 100755 --- a/egs/voxceleb/v1/local/make_voxceleb2.pl +++ b/egs/voxceleb/v1/local/make_voxceleb2.pl @@ -32,20 +32,59 @@ $dataset_path = "$data_base/$dataset" } -opendir my $dh, "$dataset_path" or die "Cannot open directory: $!"; -my @spkr_dirs = grep {-d "$dataset_path/$_" && ! /^\.{1,2}$/} readdir($dh); -closedir $dh; if (system("mkdir -p $out_dir") != 0) { die "Error making directory $out_dir"; } + +my $meta_url = "https://www.openslr.org/resources/49/vox2_meta.csv"; +my $meta_path = "$data_base/vox2_meta.csv"; +if (! -e "$meta_path") { + $meta_path = "$out_dir/vox2_meta.csv"; + system("wget -O $meta_path $meta_url"); +} +open(META_IN, "<", "$meta_path") or die "Could not open the output file $meta_path"; +my %spkr2gender = (); +while () { + chomp; + my ($spkr, $vox_id, $vgg_id, $gender, $set) = split; + $spkr2gender{$vox_id} = $gender; +} +close(META_IN) or die; + +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox2_final.csv"; +my $lid_path = "$data_base/lang_vox2_final.csv"; +if (! -e "$lid_path") { + $lid_path = "$out_dir/lang_vox2_final.csv"; + system("wget -O $lid_path $lid_url"); +} +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; +my %utt2lang = (); +while () { + chomp; + my ($utt_id, $lang, $score) = split ','; + $utt_id =~ s@/@-@g; + $utt_id =~ s@\.wav$@@; + $utt2lang{$utt_id} = $lang; +} +close(LID_IN) or die; + + open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; + +opendir my $dh, "$dataset_path" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$dataset_path/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; foreach (@spkr_dirs) { my $spkr_id = $_; + print GENDER "$spkr_id $spkr2gender{$spkr_id}\n"; + opendir my $dh, "$dataset_path/$spkr_id/" or die "Cannot open directory: $!"; my @rec_dirs = grep {-d "$dataset_path/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh); closedir $dh; @@ -66,11 +105,19 @@ my $utt_id = "$spkr_id-$rec_id-$name"; print WAV "$utt_id", " $wav", "\n"; print SPKR "$utt_id", " $spkr_id", "\n"; + if (exists $utt2lang{$utt_id}) { + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; + } + else { + print LANG "$utt_id N/A\n"; + } } } } close(SPKR) or die; close(WAV) or die; +close(LANG) or die; +close(GENDER) or die; if (system( "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { diff --git a/egs/voxceleb/v1/local/make_voxceleb2cat.pl b/egs/voxceleb/v1/local/make_voxceleb2cat.pl index 6bea3737..fa4f64ab 100755 --- a/egs/voxceleb/v1/local/make_voxceleb2cat.pl +++ b/egs/voxceleb/v1/local/make_voxceleb2cat.pl @@ -33,9 +33,6 @@ $dataset_path = "$data_base/$dataset" } -opendir my $dh, "$dataset_path" or die "Cannot open directory: $!"; -my @spkr_dirs = grep {-d "$dataset_path/$_" && ! /^\.{1,2}$/} readdir($dh); -closedir $dh; if (system("mkdir -p $out_dir") != 0) { die "Error making directory $out_dir"; @@ -46,12 +43,52 @@ } +my $meta_url = "https://www.openslr.org/resources/49/vox2_meta.csv"; +my $meta_path = "$data_base/vox2_meta.csv"; +if (! -e "$meta_path") { + $meta_path = "$out_dir/vox2_meta.csv"; + system("wget -O $meta_path $meta_url"); +} +open(META_IN, "<", "$meta_path") or die "Could not open the output file $meta_path"; +my %spkr2gender = (); +while () { + chomp; + my ($spkr, $vox_id, $vgg_id, $gender, $set) = split; + $spkr2gender{$vox_id} = $gender; +} +close(META_IN) or die; + +my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox2_final.csv"; +my $lid_path = "$data_base/lang_vox2_final.csv"; +if (! -e "$lid_path") { + $lid_path = "$out_dir/lang_vox2_final.csv"; + system("wget -O $lid_path $lid_url"); +} +open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path"; +my %utt2lang = (); +while () { + chomp; + my ($utt_id, $lang, $score) = split ','; + $utt_id =~ s@/@-@g; + $utt_id =~ s@-[^-]*\.wav$@@; + $utt2lang{$utt_id} = $lang; +} +close(LID_IN) or die; + open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk"; open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp"; +open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang"; +open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender"; + +opendir my $dh, "$dataset_path" or die "Cannot open directory: $!"; +my @spkr_dirs = grep {-d "$dataset_path/$_" && ! /^\.{1,2}$/} readdir($dh); +closedir $dh; foreach (@spkr_dirs) { my $spkr_id = $_; + print GENDER "$spkr_id $spkr2gender{$spkr_id}\n"; + opendir my $dh, "$dataset_path/$spkr_id/" or die "Cannot open directory: $!"; my @rec_dirs = grep {-d "$dataset_path/$spkr_id/$_" && ! /^\.{1,2}$/} readdir($dh); closedir $dh; @@ -69,10 +106,18 @@ } print WAV "$utt_id", " $wav", "\n"; print SPKR "$utt_id", " $spkr_id", "\n"; + if (exists $utt2lang{$utt_id}) { + print LANG "$utt_id", " $utt2lang{$utt_id}", "\n"; + } + else { + print LANG "$utt_id N/A\n"; + } } } close(SPKR) or die; close(WAV) or die; +close(LANG) or die; +close(GENDER) or die; if (system( "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) { diff --git a/hyp_utils/kaldi/utils/fix_data_dir.sh b/hyp_utils/kaldi/utils/fix_data_dir.sh index ed080eee..bb18e07b 100755 --- a/hyp_utils/kaldi/utils/fix_data_dir.sh +++ b/hyp_utils/kaldi/utils/fix_data_dir.sh @@ -117,7 +117,7 @@ function filter_speakers { ${kaldi_utils}/utt2spk_to_spk2utt.pl $data/utt2spk > $data/spk2utt cat $data/spk2utt | awk '{print $1}' > $tmpdir/speakers - for s in cmvn.scp spk2gender; do + for s in cmvn.scp spk2gender spk2nation; do f=$data/$s if [ -f $f ]; then filter_file $f $tmpdir/speakers @@ -127,7 +127,7 @@ function filter_speakers { filter_file $tmpdir/speakers $data/spk2utt ${kaldi_utils}/spk2utt_to_utt2spk.pl $data/spk2utt > $data/utt2spk - for s in cmvn.scp spk2gender $spk_extra_files; do + for s in cmvn.scp spk2gender spk2nation $spk_extra_files; do f=$data/$s if [ -f $f ]; then filter_file $tmpdir/speakers $f