import os
import gc
import numpy as np
from random import randint
from pydub import AudioSegment
from vad_model import VadModel
from keras.utils import Sequence


class SpectrogramDataGenerator(Sequence):
    def __init__(self, x_set, y_set, batch_size):
        '''
        `x_set` is a list of paths to the spectrogram .npy files and
        `y_set` is the list of the associated truth-vector .npy files.
        '''
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size

    def __len__(self):
        return int(np.ceil(len(self.x) / float(self.batch_size)))

    def __getitem__(self, idx):
        ''' This method returns a complete batch. '''
        batch_x = self.x[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_y = self.y[idx * self.batch_size:(idx + 1) * self.batch_size]
        return (np.array([np.load(filename) for filename in batch_x]),
                np.array([np.load(filename) for filename in batch_y]))

    def on_epoch_end(self):
        ''' If you want to modify the dataset between epochs, implement it here. '''
        pass
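
# Illustrative usage sketch (not part of the module): assuming x/y .npy files like
# the ones written by Dataset.create_dev_dataset_files below, and `model` being some
# compiled Keras model with a matching input shape, the generator could be fed to
# Keras training roughly like this:
#
#   x_files = ['data/dev_set/x_spectrogram_{}.npy'.format(i) for i in range(5000)]
#   y_files = ['data/dev_set/y_{}.npy'.format(i) for i in range(5000)]
#   train_generator = SpectrogramDataGenerator(x_files, y_files, batch_size=32)
#   model.fit_generator(train_generator, epochs=10)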


class Dataset(object):
    def __init__(self, Tx, Ty, n_freq, dialog_dir, noise_dir, music_dir, audio_length_ms=2000.0, verbose=False):
        """
        Arguments:
        Tx -- integer, number of time steps input to the model from the spectrogram
        Ty -- integer, number of time steps in the output of the model
        n_freq -- integer, number of frequencies input to the model at each time step of the spectrogram
        dialog_dir -- string, path to the dialog wav files
        noise_dir -- string, path to the noise wav files
        music_dir -- string, path to the music wav files
        audio_length_ms -- float, length of each training clip in milliseconds
        verbose -- boolean, debug verbose flag
        """
        self.Tx = Tx
        self.Ty = Ty
        self.n_freq = n_freq
        self.audio_length_ms = audio_length_ms
        self.verbose = verbose
        self.dialogs, self.noises, self.musics = self._load_raw_audio(dialog_dir, noise_dir, music_dir)

    def create_dev_dataset(self, n_x, output_x_filename=None, output_y_filename=None):
        X = np.zeros((n_x, self.Tx, self.n_freq))
        Y = np.zeros((n_x, self.Ty))
        print('number of training samples to generate =', n_x)
        for i in range(n_x):
            if i % 100 == 0:
                print('sample {0}/{1}'.format(i, n_x))
                gc.collect()
            music_index = randint(0, len(self.musics)-1)
            noise_index = randint(0, len(self.noises)-1)
            x, y = self.create_training_example(self.musics[music_index], self.dialogs, self.noises[noise_index], verbose=False)
            X[i] = x.transpose()
            Y[i] = y.transpose()
        if output_x_filename is not None:
            if self.verbose: print('saving dev set X ...')
            np.save(output_x_filename, X)
        if output_y_filename is not None:
            if self.verbose: print('saving dev set Y ...')
            np.save(output_y_filename, Y)
        return X, Y

    def create_dev_dataset_files(self, n_x, output_folder, start_index=0):
        print('number of training samples to generate =', n_x)
        voice_sample_count = 0
        for i in range(start_index, start_index + n_x):
            if i % 100 == 0:
                print('sample {0}/{1}'.format(i, start_index + n_x))
                gc.collect()
            music_index = randint(0, len(self.musics)-1)
            noise_index = randint(0, len(self.noises)-1)
            x_wav_filename = '{}/x_{}.wav'.format(output_folder, i)
            x_sample_filename = '{}/x_spectrogram_{}.npy'.format(output_folder, i)
            y_sample_filename = '{}/y_{}.npy'.format(output_folder, i)
            x, y = self.create_training_example(self.musics[music_index], self.dialogs, self.noises[noise_index],
                                                output_wav_filename=x_wav_filename, verbose=False)
            X = x.transpose()
            Y = y.transpose()
            np.save(x_sample_filename, X)
            np.save(y_sample_filename, Y)
            if y[0] == 1: voice_sample_count += 1
            del X, Y, x, y
        print('voice_sample_count = ', voice_sample_count)

    @staticmethod
    def load_dataset(x_filename, y_filename):
        X = np.load(x_filename)
        Y = np.load(y_filename)
        return X, Y
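
    # Illustrative call (the .npy paths below are hypothetical; use the files saved
    # by create_dev_dataset):
    #   X, Y = Dataset.load_dataset('data/dev_set_x.npy', 'data/dev_set_y.npy')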

    def create_training_example(self, music, dialogs, noise, output_wav_filename='train.wav', verbose=False):
        """
        Creates a training example from a given music clip, noise clip, and the dialog pool.
        Arguments:
        music -- a 2 second music audio recording
        dialogs -- a list of audio segments of a conversation between two persons
        noise -- a 2 second noise audio recording
        Returns:
        x -- the spectrogram of the training example
        y -- the label at each time step of the spectrogram
        """
        dB_reduction = np.random.randint(5, 20)
        noise = noise - dB_reduction
        if np.random.randint(0, 10) == 0:
            # 10% of the time, use only noise (no music)
            mixed_audio = noise
            if verbose: print("noise {0} dB".format(dB_reduction))
        else:
            # Make the music quieter or louder
            dB_reduction = np.random.randint(-5, 20)
            mixed_audio = music + dB_reduction
            if verbose: print("music {0} dB".format(dB_reduction))
        # Overlay the (attenuated) noise on top of mixed_audio
        mixed_audio = mixed_audio.overlay(noise, position=0)
        # Initialize y (label vector) with zeros
        y = np.zeros((self.Ty))
        # Select random "dialog" audio clips from the entire list of "dialogs" recordings
        number_of_dialogs = np.random.randint(0, 2)
        random_indices = np.random.randint(len(dialogs), size=number_of_dialogs)
        random_dialogs = [dialogs[i] for i in random_indices]
        # Loop over the randomly selected "conversation" clips and insert them in mixed_audio
        for random_dialog in random_dialogs:
            # Make the dialog quieter or louder
            dB_reduction = np.random.randint(0, 10)
            random_dialog = random_dialog + dB_reduction
            # Insert the audio clip over mixed_audio
            mixed_audio, segment_time = self._insert_audio_clip(mixed_audio, random_dialog)
            # Retrieve segment_start and segment_end from segment_time
            segment_start, segment_end = segment_time
            # Set the labels in "y" (a dialog is present)
            y = np.ones((self.Ty))
            if verbose: print("dialog inserted {0}dB - [{1}, {2}]".format(dB_reduction, segment_start, segment_end))
        # Standardize the volume of the audio clip
        mixed_audio = self._match_target_amplitude(mixed_audio, -20.0)
        # Export the new training example
        file_handle = mixed_audio.export(output_wav_filename, format="wav")
        file_handle.close()
        del mixed_audio
        # Get the spectrogram of the new recording (mixed_audio with superposition of music, noise and dialog)
        x = VadModel.graph_spectrogram(output_wav_filename)
        if verbose: print("-----------------------")
        return x, y

    def _load_raw_audio(self, dialog_dir, noise_dir, music_dir):
        ''' Load raw audio files. '''
        i = 0
        loading_list = (
            {
                'dir': dialog_dir,
                'list': []
            },
            {
                'dir': noise_dir,
                'list': []
            },
            {
                'dir': music_dir,
                'list': []
            }
        )
        for loading in loading_list:
            for filename in os.listdir(loading['dir']):
                if filename.endswith("wav"):
                    try:
                        audio = AudioSegment.from_wav('{}/{}'.format(loading['dir'], filename))
                        loading['list'].append(audio)
                        if self.verbose and i % 500 == 0: print('raw audio count = {0}'.format(i))
                        i += 1
                    except Exception:
                        print('Error decoding audio file: ', filename)
        return loading_list[0]['list'], loading_list[1]['list'], loading_list[2]['list']

    def _insert_audio_clip(self, background, audio_clip, previous_segments=None):
        """
        Insert a new audio segment over the background at a random time step, ensuring that the
        audio segment does not overlap with existing segments.
        Arguments:
        background -- a 2 second background audio recording
        audio_clip -- the audio clip to be inserted/overlaid
        previous_segments -- times where audio segments have already been placed, or None
        Returns:
        new_background -- the updated background audio
        segment_time -- a tuple of (segment_start, segment_end) in ms where the clip was inserted
        """
        # Get the duration of the audio clip in ms
        segment_ms = len(audio_clip)
        if segment_ms == len(background):
            segment_time = (0, segment_ms-1)
        else:
            # Pick a random time segment onto which to insert the new audio clip
            segment_time = self._get_random_time_segment(segment_ms)
            # Check whether the new segment_time overlaps with one of the previous_segments.
            # If so, keep picking a new segment_time at random until it doesn't overlap.
            if previous_segments is not None:
                while self._is_overlapping(segment_time, previous_segments):
                    segment_time = self._get_random_time_segment(segment_ms)
                # Add the new segment_time to the list of previous_segments
                previous_segments.append(segment_time)
        # Superpose the audio segment and the background
        new_background = background.overlay(audio_clip, position=segment_time[0])
        return new_background, segment_time

    def _get_random_time_segment(self, segment_ms, audio_length_ms=2000.0):
        """
        Gets a random time segment of duration segment_ms inside an audio clip of length
        audio_length_ms (2000 ms by default).
        Arguments:
        segment_ms -- the duration of the audio segment in ms ("ms" stands for "milliseconds")
        audio_length_ms -- the duration of the full audio clip in ms
        Returns:
        segment_time -- a tuple of (segment_start, segment_end) in ms
        """
        # Make sure the segment doesn't run past the end of the background clip
        segment_start = np.random.randint(low=0, high=audio_length_ms-segment_ms)
        segment_end = segment_start + segment_ms - 1
        return (segment_start, segment_end)

    def _is_overlapping(self, segment_time, previous_segments):
        """
        Checks if the time of a segment overlaps with the times of existing segments.
        Arguments:
        segment_time -- a tuple of (segment_start, segment_end) for the new segment
        previous_segments -- a list of tuples of (segment_start, segment_end) for the existing segments
        Returns:
        True if the time segment overlaps with any of the existing segments, False otherwise
        """
        segment_start, segment_end = segment_time
        # Step 1: Initialize the overlap flag to False
        overlap = False
        # Step 2: Loop over the previous_segments start and end times.
        # Compare start/end times and set the flag to True if there is an overlap.
        for previous_start, previous_end in previous_segments:
            if segment_start <= previous_end and segment_end >= previous_start:
                overlap = True
        return overlap
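
    # Worked illustration of the overlap test above (example numbers only):
    #   new (940, 1430) vs previous (1000, 1800): 940 <= 1800 and 1430 >= 1000 -> overlap
    #   new (100, 300)  vs previous (500, 1800): 100 <= 1800 but 300 <  500    -> no overlap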

    def _match_target_amplitude(self, sound, target_dBFS):
        ''' Used to standardize the volume of an audio clip. '''
        change_in_dBFS = target_dBFS - sound.dBFS
        return sound.apply_gain(change_in_dBFS)
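
    # Worked illustration of the gain computation (example numbers only):
    #   a clip measured at -12.3 dBFS with target_dBFS = -20.0 gets
    #   apply_gain(-20.0 - (-12.3)) = apply_gain(-7.7), i.e. it is attenuated
    #   by 7.7 dB so its average loudness lands at -20 dBFS.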


if __name__ == '__main__':
    dataset = Dataset(Tx=1101, Ty=1, n_freq=101,
                      dialog_dir='data/dev_set_wav/dialog',
                      noise_dir='data/dev_set_wav/noise',
                      music_dir='data/dev_set_wav/music',
                      verbose=True)
    print("music[0]: " + str(len(dataset.musics[0])))     # Should be 2000, since it is a 2 sec clip
    print("dialogs[0]: " + str(len(dataset.dialogs[0])))  # Should be 2000, since it is a 2 sec clip
    print("noises[0]: " + str(len(dataset.noises[0])))    # Should be 2000, since it is a 2 sec clip
    print('music 2s audio count = ', len(dataset.musics))
    print('dialogs audio count = ', len(dataset.dialogs))
    print('noises 2s audio count = ', len(dataset.noises))
    # x, y = dataset.create_training_example(dataset.musics[1], dataset.dialogs, dataset.noises[1])
    # print('x.shape =', x.shape)
    # print('y.shape =', y.shape)
    dataset.create_dev_dataset_files(5000, 'data/dev_set')
    # dataset.create_dev_dataset_files(5000, 'data/dev_set', start_index=5000)
    # dataset.create_dev_dataset_files(5000, 'data/dev_set', start_index=10000)
    # dataset.create_dev_dataset_files(5000, 'data/dev_set', start_index=15000)