Skip to content

Commit cc972eb

Browse files
added voxceleb1 v2 preparation scripts for training
1 parent 882c421 commit cc972eb

File tree

4 files changed

+474
-2
lines changed

4 files changed

+474
-2
lines changed
Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
#!/usr/bin/perl
#
# Copyright 2018  Ewald Enzinger
#           2018  David Snyder
#           2018  Jesus Villalba
#
# Apache 2.0
# Usage: make_voxceleb1_v2.pl /export/voxceleb1 16 data/
# This version of the script does NOT remove SITW overlap speakers
# Files from the same video are NOT concatenated into 1 segment
#
# Arguments:
#   <path-to-voxceleb1>  corpus root; audio is read from <root>/wav/<spk>/<video>/*.wav
#   fs                   when equal to 8, every wav.scp entry becomes a sox pipe
#                        that resamples to 8k; any other value keeps the file path
#   <path-to-data-dir>   output root; files are written to <root>/voxceleb1_train
#
# Output files: wav.scp, utt2spk, spk2utt, utt2lang, spk2gender, spk2nation.
# Speakers whose metadata row has set "test" are left out.

use strict;
use warnings;

if (@ARGV != 3) {
  print STDERR "Usage: $0 <path-to-voxceleb1> fs <path-to-data-dir>\n";
  print STDERR "e.g. $0 /export/voxceleb1 16 data/\n";
  exit(1);
}

my ($data_base, $fs, $out_base) = @ARGV;
my $out_dir = "$out_base/voxceleb1_train";

# List-form system() bypasses the shell, so the path is passed verbatim.
if (system("mkdir", "-p", $out_dir) != 0) {
  die "Error making directory $out_dir";
}

# Speaker metadata: use the copy inside the corpus directory when present,
# otherwise download it into the output directory.
my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv";
my $meta_path = "$data_base/vox1_meta.csv";
if (! -e $meta_path) {
  $meta_path = "$out_dir/vox1_meta.csv";
  if (system("wget", "-O", $meta_path, $meta_url) != 0) {
    # Without the metadata the test speakers could not be excluded below,
    # so a failed download is fatal. Remove whatever wget left behind.
    unlink $meta_path;
    die "Error downloading $meta_url";
  }
}

open(my $meta_fh, "<", $meta_path) or die "Could not open the meta data file $meta_path";

my %test_spkrs = ();
my %spkr2gender = ();
my %spkr2nation = ();
while (my $line = <$meta_fh>) {
  chomp $line;
  # Tab-separated columns; the second one is not needed here because the
  # speaker directories are looked up by the first column.
  my ($vox_id, undef, $gender, $nation, $set) = split "\t", $line;
  next unless defined $vox_id;
  $gender = "" unless defined $gender;
  $nation = "" unless defined $nation;
  # Spaces inside a nationality become dashes so the value stays one token.
  $nation =~ s@ @-@g;
  $spkr2gender{$vox_id} = $gender;
  $spkr2nation{$vox_id} = $nation;
  if (defined $set && $set eq "test") {
    $test_spkrs{$vox_id} = 1;
  }
}
close($meta_fh) or die;

# Per-utterance language labels, located the same way as the metadata.
my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv";
my $lid_path = "$data_base/lang_vox1_final.csv";
if (! -e $lid_path) {
  $lid_path = "$out_dir/lang_vox1_final.csv";
  if (system("wget", "-O", $lid_path, $lid_url) != 0) {
    # Utterances without a label are written as N/A further down, so a
    # failed download only degrades utt2lang; report it and carry on.
    warn "Warning: could not download $lid_url; utt2lang may contain only N/A\n";
  }
}
open(my $lid_fh, "<", $lid_path) or die "Could not open the language id file $lid_path";
my %utt2lang = ();
while (my $line = <$lid_fh>) {
  chomp $line;
  # Comma-separated: <spk>/<video>/<file>.wav, language, (further fields ignored).
  my ($path, $lang) = split ',', $line;
  next unless defined $lang;
  my ($spkr_id, $vid_id, $file_id) = split '/', $path;
  next unless defined $file_id;
  $file_id =~ s@\.wav$@@;
  $utt2lang{"$spkr_id-$vid_id-$file_id"} = $lang;
}
close($lid_fh) or die;

my $wav_dir = "$data_base/wav";
opendir(my $wav_dh, $wav_dir) or die "Cannot open directory: $!";
# Keep real sub-directories (minus . and ..) and any symlink.
my @spkr_dirs = grep { (-d "$wav_dir/$_" && ! /^\.{1,2}$/) || -l "$wav_dir/$_" } readdir($wav_dh);
closedir $wav_dh;

open(my $spkr_fh, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(my $wav_fh, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
open(my $lang_fh, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang";
open(my $gender_fh, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender";
open(my $nat_fh, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation";

foreach my $spkr_id (@spkr_dirs) {
  next if (exists $test_spkrs{$spkr_id});

  # A directory with no metadata row still gets an entry, with empty values.
  warn "Warning: no metadata found for speaker $spkr_id\n"
    unless exists $spkr2gender{$spkr_id};
  my $gender = exists $spkr2gender{$spkr_id} ? $spkr2gender{$spkr_id} : "";
  my $nation = exists $spkr2nation{$spkr_id} ? $spkr2nation{$spkr_id} : "";
  print $gender_fh "$spkr_id $gender\n";
  print $nat_fh "$spkr_id $nation\n";

  my $spkr_dir = "$wav_dir/$spkr_id";
  opendir(my $spkr_dh, $spkr_dir) or die "Cannot open directory: $!";
  my @vid_dirs = grep { -d "$spkr_dir/$_" && ! /^\.{1,2}$/ } readdir($spkr_dh);
  closedir $spkr_dh;

  foreach my $vid_id (@vid_dirs) {
    my $vid_dir = "$spkr_dir/$vid_id";
    opendir(my $vid_dh, $vid_dir) or die "Cannot open directory: $!";
    my @wav_names = grep { /\.wav$/ } readdir($vid_dh);
    closedir $vid_dh;

    foreach my $wav_name (@wav_names) {
      (my $segment = $wav_name) =~ s/\.wav$//;
      my $wav = "$vid_dir/$segment.wav";
      my $utt_id = "$spkr_id-$vid_id-$segment";
      if ($fs == 8) {
        $wav = "sox " . $wav . " -t wav -r 8k - |";
      }
      print $wav_fh "$utt_id", " $wav", "\n";
      print $spkr_fh "$utt_id", " $spkr_id", "\n";
      if (exists $utt2lang{$utt_id}) {
        print $lang_fh "$utt_id", " $utt2lang{$utt_id}", "\n";
      }
      else {
        print $lang_fh "$utt_id N/A\n";
      }
    }
  }
}
# Buffered write errors only surface at close, hence the checks.
close($gender_fh) or die;
close($nat_fh) or die;
close($spkr_fh) or die;
close($wav_fh) or die;
close($lang_fh) or die;

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}
system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
  die "Error validating directory $out_dir";
}
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
#!/usr/bin/perl
#
# Copyright 2018  Ewald Enzinger
#           2018  David Snyder
#           2020  Jesus Villalba
#
# Usage: make_voxceleb1.pl /export/voxceleb1 data/
# Create trial lists for Voxceleb1 original, Entire (E) and hard (H),
# with cleaned and non-cleaned versions
#
# Arguments:
#   <path-to-voxceleb1>  corpus root; audio is read from <root>/voxceleb1_wav/<spk>/*.wav
#   <path-to-data-dir>   output root; files are written to <root>/voxceleb1_test
#
# Output files: wav.scp, utt2spk, spk2utt, utt2lang, spk2gender, spk2nation,
# one trial list per condition (trials_o, trials_o_clean, trials_h,
# trials_h_clean, trials_e, trials_e_clean), their sorted union "trials",
# and utt2model.

use strict;
use warnings;

if (@ARGV != 2) {
  print STDERR "Usage: $0 <path-to-voxceleb1> <path-to-data-dir>\n";
  print STDERR "e.g. $0 /export/voxceleb1 data/\n";
  exit(1);
}

my ($data_base, $out_base) = @ARGV;
my $out_dir = "$out_base/voxceleb1_test";

# List-form system() bypasses the shell, so the path is passed verbatim.
if (system("mkdir", "-p", $out_dir) != 0) {
  die "Error making directory $out_dir";
}

# Converts a list path "<vox_id>/<rec_id>/<segment>.wav" into the utterance
# id "<speaker>-<rec_id>-00<segment>" used by wav.scp and utt2spk below.
#   $path     - slash-separated path taken from a trial or language list
#   $id2spkr  - hash ref mapping VoxCeleb1 ids to speaker ids
# Returns the utterance id, or nothing when the path lacks three components.
# An id with no mapping is used as-is, the same fallback the speaker
# directory loop applies.
sub trial_path_to_utt_id {
  my ($path, $id2spkr) = @_;
  my ($vox_id, $rec_id, $segment) = split('/', $path);
  return unless defined $segment;
  $segment =~ s/\.wav$//;
  my $spkr_id = exists $id2spkr->{$vox_id} ? $id2spkr->{$vox_id} : $vox_id;
  return "$spkr_id-$rec_id-00$segment";
}

my $url_base="http://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta";
# Local cache names, remote URLs and output names are parallel arrays.
# NOTE(review): the first two cache names read "very_" while the URLs read
# "veri_"; they are left untouched so previously downloaded files still match.
my @trials_basename = ("very_test.txt", "very_test2.txt", "list_test_hard.txt", "list_test_hard2.txt", "list_test_all.txt", "list_test_all2.txt");
my @trials_url = ("$url_base/veri_test.txt", "$url_base/veri_test2.txt", "$url_base/list_test_hard.txt", "$url_base/list_test_hard2.txt", "$url_base/list_test_all.txt", "$url_base/list_test_all2.txt");
my @trials = ("trials_o", "trials_o_clean", "trials_h", "trials_h_clean", "trials_e", "trials_e_clean");

# Speaker metadata: use the copy inside the corpus directory when present,
# otherwise download it into the output directory.
my $meta_url = "https://www.openslr.org/resources/49/vox1_meta.csv";
my $meta_path = "$data_base/vox1_meta.csv";
if (! -e $meta_path) {
  $meta_path = "$out_dir/vox1_meta.csv";
  if (system("wget", "-O", $meta_path, $meta_url) != 0) {
    # The id-to-speaker mapping comes from this file; stop rather than
    # build lists from an empty or partial download.
    unlink $meta_path;
    die "Error downloading $meta_url";
  }
}

open(my $meta_fh, "<", $meta_path) or die "Could not open the meta data file $meta_path";
my %id2spkr = ();
my %spkr2gender = ();
my %spkr2nation = ();
while (my $line = <$meta_fh>) {
  chomp $line;
  # Tab-separated columns; the fifth one is not used by this script.
  my ($vox_id, $spkr_id, $gender, $nation) = split "\t", $line;
  next unless defined $spkr_id;
  $gender = "" unless defined $gender;
  $nation = "" unless defined $nation;
  # Spaces inside a nationality become dashes so the value stays one token.
  $nation =~ s@ @-@g;
  $id2spkr{$vox_id} = $spkr_id;
  $spkr2gender{$spkr_id} = $gender;
  $spkr2nation{$spkr_id} = $nation;
}
close($meta_fh) or die;

# Per-utterance language labels, located the same way as the metadata.
my $lid_url = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data_workshop_2021/lang_vox1_final.csv";
my $lid_path = "$data_base/lang_vox1_final.csv";
if (! -e $lid_path) {
  $lid_path = "$out_dir/lang_vox1_final.csv";
  if (system("wget", "-O", $lid_path, $lid_url) != 0) {
    # Utterances without a label are written as N/A further down, so a
    # failed download only degrades utt2lang; report it and carry on.
    warn "Warning: could not download $lid_url; utt2lang may contain only N/A\n";
  }
}
open(my $lid_fh, "<", $lid_path) or die "Could not open the language id file $lid_path";
my %utt2lang = ();
while (my $line = <$lid_fh>) {
  chomp $line;
  # Comma-separated: <vox_id>/<video>/<file>.wav, language, (further fields ignored).
  my ($path, $lang) = split ',', $line;
  next unless defined $lang;
  my $utt_id = trial_path_to_utt_id($path, \%id2spkr);
  next unless defined $utt_id;
  $utt2lang{$utt_id} = $lang;
}
close($lid_fh) or die;

#download trials from voxceleb web page
for my $i (0 .. $#trials) {

  my $file_i = "$out_dir/$trials_basename[$i]";
  my $url_i = $trials_url[$i];
  my $trial_i = "$out_dir/$trials[$i]";
  if (! -e $file_i) {
    if (system("wget", "-O", $file_i, $url_i) != 0) {
      # Remove the leftover file; otherwise the -e test above would accept
      # it on the next run and the download would never be retried.
      unlink $file_i;
      die "Error downloading $url_i";
    }
  }
  #mapping from new speaker ids and file-names to old ones
  open(my $trial_in, "<", $file_i) or die "Could not open the verification trials file $file_i";
  open(my $trial_out, ">", $trial_i) or die "Could not open the output file $trial_i";
  while (my $line = <$trial_in>) {
    chomp $line;
    next if $line =~ /^\s*$/;
    # Whitespace-separated: label ("1" marks a target trial), left path, right path.
    my ($tar_or_non, $path1, $path2) = split ' ', $line;

    my $utt_id1 = defined $path1 ? trial_path_to_utt_id($path1, \%id2spkr) : undef;
    my $utt_id2 = defined $path2 ? trial_path_to_utt_id($path2, \%id2spkr) : undef;
    if (!defined $utt_id1 || !defined $utt_id2) {
      warn "Warning: skipping malformed trial line in $file_i: $line\n";
      next;
    }

    my $target = ($tar_or_non eq "1") ? "target" : "nontarget";
    print $trial_out "$utt_id1 $utt_id2 $target\n";
  }

  close($trial_in) or die;
  close($trial_out) or die;

}

my $wav_dir = "$data_base/voxceleb1_wav";
opendir(my $wav_dh, $wav_dir) or die "Cannot open directory: $!";
my @spkr_dirs = grep { -d "$wav_dir/$_" && ! /^\.{1,2}$/ } readdir($wav_dh);
closedir $wav_dh;

open(my $spkr_fh, ">", "$out_dir/utt2spk") or die "Could not open the output file $out_dir/utt2spk";
open(my $wav_fh, ">", "$out_dir/wav.scp") or die "Could not open the output file $out_dir/wav.scp";
open(my $gender_fh, ">", "$out_dir/spk2gender") or die "Could not open the output file $out_dir/spk2gender";
open(my $nat_fh, ">", "$out_dir/spk2nation") or die "Could not open the output file $out_dir/spk2nation";
open(my $lang_fh, ">", "$out_dir/utt2lang") or die "Could not open the output file $out_dir/utt2lang";

foreach my $spkr_id (@spkr_dirs) {
  my $new_spkr_id = $spkr_id;
  # If we're using a newer version of VoxCeleb1, we need to "deanonymize"
  # the speaker labels.
  if (exists $id2spkr{$spkr_id}) {
    $new_spkr_id = $id2spkr{$spkr_id};
  }

  # A directory with no metadata row still gets an entry, with empty values.
  warn "Warning: no metadata found for speaker $new_spkr_id\n"
    unless exists $spkr2gender{$new_spkr_id};
  my $gender = exists $spkr2gender{$new_spkr_id} ? $spkr2gender{$new_spkr_id} : "";
  my $nation = exists $spkr2nation{$new_spkr_id} ? $spkr2nation{$new_spkr_id} : "";
  print $gender_fh "$new_spkr_id $gender\n";
  print $nat_fh "$new_spkr_id $nation\n";

  opendir(my $spkr_dh, "$wav_dir/$spkr_id/") or die "Cannot open directory: $!";
  my @wav_names = grep { /\.wav$/ } readdir($spkr_dh);
  closedir $spkr_dh;

  foreach my $wav_name (@wav_names) {
    (my $filename = $wav_name) =~ s/\.wav$//;
    # Fixed-width slices of the file name: characters 0-10 are the
    # recording id, character 11 is skipped, characters 12-18 the segment.
    my $rec_id = substr($filename, 0, 11);
    my $segment = substr($filename, 12, 7);
    my $wav = "$wav_dir/$spkr_id/$filename.wav";
    my $utt_id = "$new_spkr_id-$rec_id-$segment";
    print $wav_fh "$utt_id", " $wav", "\n";
    print $spkr_fh "$utt_id", " $new_spkr_id", "\n";
    if (exists $utt2lang{$utt_id}) {
      print $lang_fh "$utt_id", " $utt2lang{$utt_id}", "\n";
    }
    else {
      print $lang_fh "$utt_id N/A\n";
    }
  }
}

# Buffered write errors only surface at close, hence the checks.
close($spkr_fh) or die;
close($wav_fh) or die;
close($lang_fh) or die;
close($gender_fh) or die;
close($nat_fh) or die;

if (system(
  "cat $out_dir/trials_* | sort -u > $out_dir/trials") != 0) {
  die "Error creating trials file in directory $out_dir";
}

if (system(
  "awk '{ print \$1,\$1 }' $out_dir/trials | sort -u > $out_dir/utt2model") != 0) {
  die "Error creating utt2model file in directory $out_dir";
}

if (system(
  "utils/utt2spk_to_spk2utt.pl $out_dir/utt2spk >$out_dir/spk2utt") != 0) {
  die "Error creating spk2utt file in directory $out_dir";
}
system("env LC_COLLATE=C utils/fix_data_dir.sh $out_dir");
if (system("env LC_COLLATE=C utils/validate_data_dir.sh --no-text --no-feats $out_dir") != 0) {
  die "Error validating directory $out_dir";
}

0 commit comments

Comments
 (0)