# mfcc_baseline.py
# MFCC + DTW baseline: computes pairwise DTW distances between speakers'
# word-level MFCC features and writes them to a CSV file.
import argparse
import os
from librosa.sequence import dtw as lib_dtw
import librosa
import pandas as pd
import numpy as np
# Sampling rate (Hz) passed to librosa when loading audio and computing MFCCs.
SR = 16000
# Column names of the distance table that dist_cmn / dist_kr write to CSV.
columns = ["speaker1_word_wid", "speaker2_word_wid", "group", "DTW"]
# Module-level accumulator of result rows; dist_cmn and dist_kr add one row
# per compared pair and it is never cleared between calls.
rows = []
def kr_df(kr_mfa_path):
    '''
    Index the word-level clips found under the Korean MFA output folder.

    Every entry of kr_mfa_path whose name does not start with a dot is
    treated as a speaker folder and its 'word_split' sub-folder is listed;
    entries without such a sub-folder are reported and skipped. All metadata
    columns are parsed from each file's base name.

    kr_mfa_path: path to the Korean MFA output
    returns: dataframe with one row per file and the columns
             file_path, l1, sent_id, word_id, word_txt, speaker
    '''
    clip_paths = []
    for entry in os.listdir(kr_mfa_path):
        if entry.startswith('.'):
            continue
        word_dir = os.path.join(kr_mfa_path, entry, 'word_split')
        if os.path.exists(word_dir):
            for fname in os.listdir(word_dir):
                clip_paths.append(os.path.join(word_dir, fname))
        else:
            print('path doesnt exist ', word_dir, "\t...skipping")

    df = pd.DataFrame(clip_paths, columns=['file_path'])
    names = df['file_path'].apply(os.path.basename)
    # (column, parser) pairs, applied in this order to the file's base name.
    field_parsers = [
        # first character of the speaker field
        ('l1', lambda name: name.split("_")[1][0]),
        ('sent_id', lambda name: name.split("sent")[1].split("_")[-2]),
        ('word_id', lambda name: name.split("word")[1].split("_")[0]),
        ('word_txt', lambda name: name.split("_")[-1].split('.wav')[0]),
        ('speaker', lambda name: name.split("_")[1]),
    ]
    for column, parse in field_parsers:
        df[column] = names.apply(parse)
    return df
def cmn_df(mandarin_data_path, spanish_data_path, task):
    '''
    Index the word-level clips of the Mandarin and Spanish datasets for one task.

    Both dataset roots are scanned. A sub-folder is used only if its name does
    not start with a dot, does not contain 'CMN_CMN', 'SHS_SPA' or 'NWS', is a
    directory and contains the task string; its 'word_split' sub-folder is then
    listed (folders without one are reported and skipped). All metadata columns
    are parsed from each file's base name.

    mandarin_data_path: path to the mandarin data
    spanish_data_path: path to the spanish data
    task: task to run the baseline on (HT1/HT2/DHR/LPP)
    returns: dataframe with one row per file and the columns file_path, l1,
             sent_lng, sent_type, sent_id, word_id, word_txt, exp_type, sent,
             speaker
    raises: AssertionError if no file was collected
    '''
    excluded_tags = ('CMN_CMN', 'SHS_SPA', 'NWS')
    clip_paths = []
    for corpus_root in (mandarin_data_path, spanish_data_path):
        for entry in os.listdir(corpus_root):
            # avoid NWS task and use only sentences in english
            if entry.startswith('.'):
                continue
            if any(tag in entry for tag in excluded_tags):
                continue
            if not os.path.isdir(os.path.join(corpus_root, entry)):
                continue
            if task not in entry:
                continue
            word_dir = os.path.join(corpus_root, entry, 'word_split')
            if os.path.exists(word_dir):
                for fname in os.listdir(word_dir):
                    clip_paths.append(os.path.join(word_dir, fname))
            else:
                print('path doesnt exist ', word_dir, '\t...skipping')
    assert len(clip_paths) > 0, "No files found in the path"

    df = pd.DataFrame(clip_paths, columns=['file_path'])
    names = df['file_path'].apply(os.path.basename)
    # (column, parser) pairs, applied in this order to the file's base name.
    field_parsers = [
        ('l1', lambda name: name.split("_")[3]),
        ('sent_lng', lambda name: name.split("_")[4]),
        ('sent_type', lambda name: "_".join(name.split("_")[3:5])),
        ('sent_id', lambda name: name.split("sent")[1].split("_")[0]),
        ('word_id', lambda name: name.split("word")[1].split("_")[0]),
        ('word_txt', lambda name: name.split("_")[-1].split('.wav')[0]),
        ('exp_type', lambda name: name.split("_")[5]),
        ('sent', lambda name: "_".join(
            name.split("sent")[1].split("_")[1:-1])),
        ('speaker', lambda name: name.split("_")[1]),
    ]
    for column, parse in field_parsers:
        df[column] = names.apply(parse)
    return df
def compute_speakers_distance(file1, file2, n_fft, hop, n_bins, group):
    '''
    DTW distance between the normalised MFCCs of two word recordings.

    Both recordings are loaded at SR. Each MFCC matrix is standardised with
    the mean and standard deviation taken over the whole matrix, and the
    distance is the last cell ([-1, -1]) of the first array returned by
    lib_dtw.

    file1: df row of a speaker (reads .file_path and, for 'BT', .l1)
    file2: df row of a speaker (reads .file_path)
    n_fft: FFT size passed to librosa.feature.mfcc
    hop: hop length passed to librosa.feature.mfcc
    n_bins: number of MFCC coefficients (n_mfcc)
    group: 'BT' if between groups comparison, otherwise a within-group label
    returns: distance between the two speakers, or -1 if either recording
             is empty
    '''
    wave_a, _ = librosa.load(file1.file_path, sr=SR)
    wave_b, _ = librosa.load(file2.file_path, sr=SR)
    for wave, row in ((wave_a, file1), (wave_b, file2)):
        if len(wave) == 0:
            print("Skipping Empty audio: ", row.file_path)
            return -1

    def normalised_mfcc(wave):
        # MFCC matrix standardised with its global mean / std.
        feats = librosa.feature.mfcc(
            y=wave, sr=SR, n_fft=n_fft, hop_length=hop, n_mfcc=n_bins)
        return (feats - feats.mean()) / feats.std()

    feats_a = normalised_mfcc(wave_a)
    feats_b = normalised_mfcc(wave_b)
    # For a between-group pair whose first row is the 'ENG' speaker, the
    # arguments are swapped so the other speaker's features come first.
    if group == 'BT' and file1.l1 == 'ENG':
        first, second = feats_b, feats_a
    else:
        first, second = feats_a, feats_b
    return lib_dtw(first, second)[0][-1, -1]
def dist_cmn(mandarin_data_path, spanish_data_path, task, n_fft, n_bins, hop, save_path):
    '''
    Compute the DTW distance between the Mandarin/Spanish and English speakers
    based on MFCC features and write the result to
    <save_path>/cmn_shs_mfcc_baseline_<task>.csv.

    Each word recording is compared with the recordings of the same word
    (same sentence id, sentence language and experiment type) by every other
    speaker that is either 'ENG' or shares its L1. A key with the
    lower-numbered speaker first is kept for every compared pair so that a
    pair is only computed once, in a fixed order. Speaker ids must be
    numeric strings (they are ordered with float()).

    mandarin_data_path: path to the mandarin data
    spanish_data_path: path to the spanish data
    task: task to run the baseline on (HT1/HT2/DHR/LPP)
    n_fft: FFT size used for the MFCCs
    n_bins: number of mfcc bins
    hop: hop length
    save_path: directory in which the distance csv file is saved (created if
               it does not exist)
    '''
    print("Running baseline with the mandarin dataset with task, ", task)
    # Create the output folder up front so a missing directory does not make
    # the final to_csv fail after the whole pairwise loop has run.
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    df = cmn_df(mandarin_data_path, spanish_data_path, task)

    def label(row):
        # "<speaker>_<word id>_<word text>_<sentence id>" identifier of a row.
        return f"{row.speaker}_{row.word_id}_{row.word_txt}_{row.sent_id}"

    seen_pairs = set()  # set: O(1) membership instead of scanning a list
    pair_rows = []      # rows of this call only
    for _, file1 in df.iterrows():
        # The mask already guarantees: different speaker, same sentence id,
        # sentence language, experiment type and word id, and an L1 that is
        # either 'ENG' or file1's own, so no per-pair re-check is needed.
        candidates = df[(df.sent_id == file1.sent_id) & (df.speaker != file1.speaker)
                        & (df.sent_lng == file1.sent_lng) & (df.exp_type == file1.exp_type)
                        & ((df.l1 == 'ENG') | (df.l1 == file1.l1))
                        & (df.word_id == file1.word_id)]
        for _, file2 in candidates.iterrows():
            if float(file1.speaker) < float(file2.speaker):
                pair_key = f"{file1.speaker}_{file1.word_id}_{file1.sent_id}_{file2.speaker}"
            else:
                pair_key = f"{file2.speaker}_{file2.word_id}_{file2.sent_id}_{file1.speaker}"
            if pair_key in seen_pairs:
                continue
            group = file1.sent_type if file1.l1 == file2.l1 else 'BT'
            # Keyword arguments: the callee's order is (n_fft, hop, n_bins),
            # so passing (n_fft, n_bins, hop) positionally swapped the two.
            dist = compute_speakers_distance(
                file1, file2, n_fft=n_fft, hop=hop, n_bins=n_bins, group=group)
            if dist == -1:
                continue
            first, second = label(file1), label(file2)
            # Between-group rows list the non-English speaker first.
            if group == 'BT' and file1.l1 == 'ENG':
                first, second = second, first
            pair_rows.append([first, second, group, dist])
            seen_pairs.add(pair_key)
    # The module-level list is still extended for code that inspects it, but
    # the CSV is built from this call's rows only, so repeated calls do not
    # leak rows from earlier runs into the file.
    rows.extend(pair_rows)
    dist_df = pd.DataFrame(pair_rows, columns=columns)
    dist_df.to_csv(os.path.join(
        save_path, f'cmn_shs_mfcc_baseline_{task.lower()}.csv'))
def dist_kr(kr_mfa_path, n_fft, n_bins, hop, save_path):
    '''
    Compute the DTW distance between the Korean and English speakers based on
    MFCC features and write the result to <save_path>/kr_mfcc_baseline.csv.

    Each word recording is compared with the recordings of the same word
    (same sentence id and word id) by every other speaker. Speakers are
    ranked with the non-'E' speakers first, then the 'E' speakers, each group
    sorted by id; a key with the lower-ranked speaker first is kept for every
    handled pair so that a pair is only computed once, in a fixed order.

    kr_mfa_path: path to the Korean MFA output
    n_fft: FFT size used for the MFCCs
    n_bins: number of mfcc bins
    hop: hop length
    save_path: directory in which the distance csv file is saved (created if
               it does not exist)
    '''
    print("Running baseline with the Korean dataset")
    # Create the output folder up front so a missing directory does not make
    # the final to_csv fail after the whole pairwise loop has run.
    if save_path:
        os.makedirs(save_path, exist_ok=True)
    df = kr_df(kr_mfa_path)
    ranked_speakers = (sorted(df[df.l1 != 'E'].speaker.unique())
                       + sorted(df[df.l1 == 'E'].speaker.unique()))
    spkrs_rnk = {s: i for i, s in enumerate(ranked_speakers)}

    def label(row):
        # "<speaker>_<word id>_<word text>_<sentence id>" identifier of a row.
        return f"{row.speaker}_{row.word_id}_{row.word_txt}_{row.sent_id}"

    seen_pairs = set()  # set: O(1) membership instead of scanning a list
    pair_rows = []      # rows of this call only
    for _, file1 in df.iterrows():
        # The mask already guarantees a different speaker and the same
        # sentence id and word id.
        candidates = df[(df.sent_id == file1.sent_id)
                        & (df.speaker != file1.speaker)
                        & (df.word_id == file1.word_id)]
        for _, file2 in candidates.iterrows():
            if spkrs_rnk[file1.speaker] < spkrs_rnk[file2.speaker]:
                pair_key = f"{file1.speaker}_{file1.word_id}_{file1.sent_id}_{file2.speaker}"
            else:
                pair_key = f"{file2.speaker}_{file2.word_id}_{file2.sent_id}_{file1.speaker}"
            if pair_key in seen_pairs:
                continue
            if file1.word_txt != file2.word_txt:
                print("ERROR: Invalid comparison between different words.")
                print(os.path.basename(file1.file_path),
                      " XXX ", os.path.basename(file2.file_path))
                print("skiping...")
                # Actually skip the pair (the message was printed but the
                # distance was still computed), and remember it so the
                # mirrored pair is not reported a second time.
                seen_pairs.add(pair_key)
                continue
            group = file1.l1 if file1.l1 == file2.l1 else 'BT'
            # kr_df stores a one-character L1, so English is 'E' (not 'ENG').
            # Between-group pairs are oriented here, Korean speaker first, so
            # the features and the row labels follow the same order.
            if group == 'BT' and file1.l1 == 'E':
                first, second = file2, file1
            else:
                first, second = file1, file2
            # Keyword arguments: the callee's order is (n_fft, hop, n_bins),
            # so passing (n_fft, n_bins, hop) positionally swapped the two.
            dist = compute_speakers_distance(
                first, second, n_fft=n_fft, hop=hop, n_bins=n_bins, group=group)
            if dist == -1:
                continue
            pair_rows.append([label(first), label(second), group, dist])
            seen_pairs.add(pair_key)
    # The module-level list is still extended for code that inspects it, but
    # the CSV is built from this call's rows only, so repeated calls do not
    # leak rows from earlier runs into the file.
    rows.extend(pair_rows)
    dist_df = pd.DataFrame(pair_rows, columns=columns)
    dist_df.to_csv(os.path.join(save_path, 'kr_mfcc_baseline.csv'))
# Command-line interface of the DTW baseline. The parser is built at import
# time, but arguments are only parsed inside main(), so importing this module
# from other code neither reads sys.argv nor exits.
parser = argparse.ArgumentParser(
    description='Parser for DTW baseline evaluation', )
parser.add_argument('--kr', action='store_true',
                    help='run korean dataset analysis')
parser.add_argument('--task', type=str,
                    help='reading task for the mandarin dataset')
# Defaults in samples at 16 kHz: 20 ms analysis window, 5 ms hop.
parser.add_argument('--n_fft', type=int, default=int(0.02 * 16000))
parser.add_argument('--hop', type=int, default=int((0.01 * 16000) / 2))
parser.add_argument('--cmn_data_path', type=str,
                    help="path of the mandarin dataset")
parser.add_argument('--spn_data_path', type=str,
                    help="path of the spanish dataset")
parser.add_argument('--kr_mfa_path', type=str,
                    help="path of the korean dataset")
parser.add_argument('--save_path', type=str, required=True,
                    help="Path to directory in which the csv will be saved")
parser.add_argument('--n_bins', type=int, default=80)


def main(argv=None):
    '''
    Parse the command line and run the selected baseline.

    argv: list of argument strings; None makes argparse read sys.argv[1:]
    Missing dataset paths or task are reported through parser.error, which
    prints the usage message and exits with status 2.
    '''
    args = parser.parse_args(argv)
    if args.kr:
        # The option defaults to None, so test for "not provided" rather
        # than comparing with the empty string.
        if not args.kr_mfa_path:
            parser.error("Please provide the path to the Korean dataset")
        dist_kr(args.kr_mfa_path, args.n_fft,
                args.n_bins, args.hop, args.save_path)
    else:
        if args.cmn_data_path is None or args.spn_data_path is None:
            parser.error(
                "Please provide the path to the Mandarin and Spanish dataset")
        if args.task is None:
            parser.error("Please provide a reading task to evaluate")
        dist_cmn(args.cmn_data_path, args.spn_data_path, args.task,
                 args.n_fft, args.n_bins, args.hop, args.save_path)


if __name__ == '__main__':
    main()