Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 9 additions & 7 deletions egs/voxceleb/v1.1/run_001_prepare_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,16 @@ config_file=default_config.sh


if [ $stage -le 1 ];then

# Prepare the VoxCeleb2 dataset for training.
local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train
# local/make_voxceleb2cat.pl $voxceleb2_root test 16 data/voxceleb2cat_test
# utils/combine_data.sh data/voxceleb2cat data/voxceleb2cat_train data/voxceleb2cat_test
# Prepare the VoxCeleb2 dataset for training.
local/make_voxceleb2cat.pl $voxceleb2_root dev 16 data/voxceleb2cat_train
# local/make_voxceleb2cat.pl $voxceleb2_root test 16 data/voxceleb2cat_test
# utils/combine_data.sh data/voxceleb2cat data/voxceleb2cat_train data/voxceleb2cat_test
fi

if [ $stage -le 2 ];then
# prepare voxceleb1 for test
local/make_voxceleb1_oeh.pl $voxceleb1_root data
# prepare voxceleb1 for test
# This script is for the old version of the dataset
local/make_voxceleb1_oeh.pl $voxceleb1_root data
# Use this for the newer version of voxceleb1:
# local/make_voxceleb1_v2_oeh.pl $voxceleb1_root data
fi
77 changes: 61 additions & 16 deletions egs/voxceleb/v1/local/make_voxceleb1_o.pl
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,13 @@
# 2020 Jesus Villalba
#
# Usage: make_voxceleb1.pl /export/voxceleb1 data/
# Create trial lists for Voxceleb1 original, Entire (E) and hard (H),
# Create trial lists for Voxceleb1 original,
# with cleaned and non-cleaned versions
# Attention:
# - This script is for the old version of the dataset without anonymized speaker-ids
# - This script assumes that the voxceleb1 dataset has all speaker directories
# dumped in the same wav directory, NOT separated dev and test directories


if (@ARGV != 2) {
print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
Expand All @@ -26,18 +31,47 @@
my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt");
my @trials = ("trials_o", "trials_o_clean");

open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv";
my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv";
my $meta_path = "$data_base/vox1_meta.csv";
if (! -e "$meta_path") {
$meta_path = "$out_dir/vox1_meta.csv";
system("wget -O $meta_path $meta_url");
}

open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path";
my %id2spkr = ();
my %spkr2gender = ();
my %spkr2nation = ();
while (<META_IN>) {
chomp;
my ($vox_id, $spkr_id, $gender, $nation, $set) = split;
$id2spkr{$vox_id} = $spkr_id;

chomp;
my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t";
$id2spkr{$vox_id} = $spkr_id;
$spkr2gender{$spkr_id} = $gender;
$nation =~ s@ @-@g;
$spkr2nation{$spkr_id} = $nation;
}
close(META_IN) or die;

my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv";
my $lid_path = "$data_base/lang_vox1_final.csv";
if (! -e "$lid_path") {
$lid_path = "$out_dir/lang_vox1_final.csv";
system("wget -O $lid_path $lid_url");
}
open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path";
my %utt2lang = ();
while (<LID_IN>) {
chomp;
my ($utt_id, $lang, $score) = split ',';
my ($vox_id, $vid_id, $file_id) = split '/', $utt_id;
my $spkr_id = $id2spkr{$vox_id};
my $utt_id = "$spkr_id-$vid_id-00$file_id";
$utt_id =~ s@\.wav$@@;
$utt2lang{$utt_id} = $lang;
}
close(LID_IN) or die;

#download trials from voxceleb web page
my %valid_utts = ();
for($i = 0; $i <= $#trials; $i++) {

my $file_i = "$out_dir/$trials_basename[$i]";
Expand Down Expand Up @@ -70,8 +104,6 @@
$target = "target";
}
print TRIAL_OUT "$utt_id1 $utt_id2 $target\n";
$valid_utts{$utt_id1} = 1;
$valid_utts{$utt_id2} = 1;
}

close(TRIAL_IN) or die;
Expand All @@ -84,8 +116,11 @@
my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh);
closedir $dh;

open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(WAV_TEST, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender";
open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation";
open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang";

foreach (@spkr_dirs) {
my $spkr_id = $_;
Expand All @@ -95,6 +130,9 @@
if (exists $id2spkr{$spkr_id}) {
$new_spkr_id = $id2spkr{$spkr_id};
}
print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n";
print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n";

opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!";
my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
closedir $dh;
Expand All @@ -104,15 +142,22 @@
my $segment = substr($filename, 12, 7);
my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav";
my $utt_id = "$new_spkr_id-$rec_id-$segment";
if (exists $valid_utts{$utt_id}) {
print WAV_TEST "$utt_id", " $wav", "\n";
print SPKR_TEST "$utt_id", " $new_spkr_id", "\n";
print WAV "$utt_id", " $wav", "\n";
print SPKR "$utt_id", " $new_spkr_id", "\n";
if (exists $utt2lang{$utt_id}) {
print LANG "$utt_id", " $utt2lang{$utt_id}", "\n";
}
else {
print LANG "$utt_id N/A\n";
}
}
}

close(SPKR_TEST) or die;
close(WAV_TEST) or die;
close(SPKR) or die;
close(WAV) or die;
close(LANG) or die;
close(GENDER) or die;
close(NAT) or die;

if (system(
"cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) {
Expand Down
71 changes: 60 additions & 11 deletions egs/voxceleb/v1/local/make_voxceleb1_oeh.pl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@
# Usage: make_voxceleb1.pl /export/voxceleb1 data/
# Create trial lists for Voxceleb1 original, Entire (E) and hard (H),
# with cleaned and non-cleaned versions
# Attention:
# - This script is for the old version of the dataset without anonymized speaker-ids
# - This script assumes that the voxceleb1 dataset has all speaker directories
# dumped in the same wav directory, NOT separated dev and test directories

if (@ARGV != 2) {
print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
Expand All @@ -26,16 +30,46 @@
my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt", "$url_base/list_test_hard.txt", "$url_base/list_test_hard2.txt", "$url_base/list_test_all.txt", "$url_base/list_test_all2.txt");
my @trials = ("trials_o", "trials_o_clean", "trials_h", "trials_h_clean", "trials_e", "trials_e_clean");

open(META_IN, "<", "$data_base/vox1_meta.csv") or die "Could not open the meta data file $data_base/vox1_meta.csv";
my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv";
my $meta_path = "$data_base/vox1_meta.csv";
if (! -e "$meta_path") {
$meta_path = "$out_dir/vox1_meta.csv";
system("wget -O $meta_path $meta_url");
}

open(META_IN, "<", "$meta_path") or die "Could not open the meta data file $meta_path";
my %id2spkr = ();
my %spkr2gender = ();
my %spkr2nation = ();
while (<META_IN>) {
chomp;
my ($vox_id, $spkr_id, $gender, $nation, $set) = split;
$id2spkr{$vox_id} = $spkr_id;

chomp;
my ($vox_id, $spkr_id, $gender, $nation, $set) = split "\t";
$id2spkr{$vox_id} = $spkr_id;
$spkr2gender{$spkr_id} = $gender;
$nation =~ s@ @-@g;
$spkr2nation{$spkr_id} = $nation;
}
close(META_IN) or die;

my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv";
my $lid_path = "$data_base/lang_vox1_final.csv";
if (! -e "$lid_path") {
$lid_path = "$out_dir/lang_vox1_final.csv";
system("wget -O $lid_path $lid_url");
}
open(LID_IN, "<", "$lid_path") or die "Could not open the output file $lid_path";
my %utt2lang = ();
while (<LID_IN>) {
chomp;
my ($utt_id, $lang, $score) = split ',';
my ($vox_id, $vid_id, $file_id) = split '/', $utt_id;
my $spkr_id = $id2spkr{$vox_id};
my $utt_id = "$spkr_id-$vid_id-00$file_id";
$utt_id =~ s@\.wav$@@;
$utt2lang{$utt_id} = $lang;
}
close(LID_IN) or die;

#download trials from voxceleb web page
for($i = 0; $i <= $#trials; $i++) {

Expand Down Expand Up @@ -81,8 +115,11 @@
my @spkr_dirs = grep {-d "$data_base/voxceleb1_wav/$_" && ! /^\.{1,2}$/} readdir($dh);
closedir $dh;

open(SPKR_TEST, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(WAV_TEST, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
open(SPKR, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(WAV, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
open(GENDER, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender";
open(NAT, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation";
open(LANG, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang";

foreach (@spkr_dirs) {
my $spkr_id = $_;
Expand All @@ -92,6 +129,9 @@
if (exists $id2spkr{$spkr_id}) {
$new_spkr_id = $id2spkr{$spkr_id};
}
print GENDER "$new_spkr_id $spkr2gender{$new_spkr_id}\n";
print NAT "$new_spkr_id $spkr2nation{$new_spkr_id}\n";

opendir my $dh, "$data_base/voxceleb1_wav/$spkr_id/" or die "Cannot open directory: $!";
my @files = map{s/\.[^.]+$//;$_}grep {/\.wav$/} readdir($dh);
closedir $dh;
Expand All @@ -101,13 +141,22 @@
my $segment = substr($filename, 12, 7);
my $wav = "$data_base/voxceleb1_wav/$spkr_id/$filename.wav";
my $utt_id = "$new_spkr_id-$rec_id-$segment";
print WAV_TEST "$utt_id", " $wav", "\n";
print SPKR_TEST "$utt_id", " $new_spkr_id", "\n";
print WAV "$utt_id", " $wav", "\n";
print SPKR "$utt_id", " $new_spkr_id", "\n";
if (exists $utt2lang{$utt_id}) {
print LANG "$utt_id", " $utt2lang{$utt_id}", "\n";
}
else {
print LANG "$utt_id N/A\n";
}
}
}

close(SPKR_TEST) or die;
close(WAV_TEST) or die;
close(SPKR) or die;
close(WAV) or die;
close(LANG) or die;
close(GENDER) or die;
close(NAT) or die;

if (system(
"cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) {
Expand Down
Loading