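'''
separake_make_samples.py

Simulate a speech mixture (two samples) in a 5-wall room with pyroomacoustics,
separate it with multichannel NMF (multiplicative updates or EM), and evaluate
the result with mir_eval's bss_eval_images. Optionally plays the signals,
saves the separated audio, and plots a typical room impulse response.
'''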
import numpy as np
from scipy.io import wavfile
import matplotlib.pyplot as plt
import pyroomacoustics as pra
import datetime
from itertools import product, combinations
import shutil
import time
import os
import json
from multinmf_conv_mu import multinmf_conv_mu_wrapper
from multinmf_conv_em import multinmf_conv_em_wrapper
from utilities import partial_rir, reverse_simulate, reverse_simulate_all_single_sources
from sim_tools import json_append
from mir_eval.separation import bss_eval_images
base_dir = os.path.abspath(os.path.split(__file__)[0])
print('Base dir', base_dir)
output_dir = "/data/results/"
if not os.path.exists(base_dir+output_dir):
os.mkdir(base_dir+output_dir)
# output directory name format; {timestamp} and {method} are filled in later
data_dir_format = base_dir + output_dir + '{timestamp}_near_wall_{method}'
data_file_format = '/data_{}.json' # The {} is replaced by node pid
param_file_format = '/parameters.json' # We store the parameters in a json file
args_file_format = '/arguments.json' # We store the arguments list in a json file
error_file_format = '/error_{}.json' # We store some debug info on failed instances
fs = 16000
# room parameters
max_order = 8 # max image sources order in simulation
floorplan = [[0, 6, 6, 2, 0],   # x-coordinates
             [0, 0, 5, 5, 3]]   # y-coordinates
height = 4.
absorption = 0.4
# planar circular array with three microphones and 30 cm inter-mic dist
# placed in bottom right corner of the room
mics_locs = [[5.61047449, 5.53282877, 5.32069674],   # x-coordinates
             [0.38952551, 0.67930326, 0.46717123],   # y-coordinates
             [0.70000000, 0.70000000, 0.70000000]]   # z-coordinates
speech_files = ['data/Speech/fq_sample3.wav', 'data/Speech/fq_sample2.wav',]
dist_src_mic = [2.5, 4]  # keep all sources in an annulus between 2.5 and 4 around the mics
min_dist_src_src = 1. # minimum distance between two sources
n_src_locations = len(speech_files) # number of different source locations to consider
# optimal gamma set empirically
gamma_opt = {'learn': 0.1, 'anechoic': 10., 0: 10., 1: 0.0001,
             2: 0., 3: 0., 4: 0., 5: 0., 6: 0., 7: 0.}
def get_gamma(n_echoes):
    if n_echoes == 'learn':
        return 0.1
    elif n_echoes == 'anechoic':
        return 10.
    elif n_echoes == 0:
        return 0.0001
    elif n_echoes > 0:
        return 0.
    else:
        raise ValueError('Negative number of echoes')
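# For example: get_gamma('learn') -> 0.1, get_gamma(0) -> 0.0001, get_gamma(3) -> 0.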
# convolutive separation parameters
dict_files = {'spkr': 'W_dictionary_em.npz', 'univ': 'W_dictionary_sqmag_mu.npz'}
stft_win_len = 2048  # supposedly optimal at 16 kHz (Ozerov and Févotte, 2010)
use_dict = True
n_latent_var = 4  # number of latent variables (ignored when a dictionary is used)
if __name__ == '__main__':

    import argparse
    parser = argparse.ArgumentParser(description='Separake it!')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-l', '--learn', action='store_true',
                       help='Learn the TF from the data')
    group.add_argument('-a', '--anechoic', action='store_true',
                       help='Anechoic conditions')
    group.add_argument('-e', '--echoes', type=int, metavar='N',
                       help='Use %(metavar)s echoes to form the TF')
    parser.add_argument('-m', '--method', type=str, default='mu', choices=['mu', 'em'],
                        help='The algorithm to use')
    parser.add_argument('-d', '--dict', type=str, default='spkr', choices=['spkr', 'univ'],
                        help='The dictionary to use: speaker dependent (default) or universal')
    parser.add_argument('-p', '--play', action='store_true',
                        help='Play the signals after separation')
    parser.add_argument('-i', '--iter', type=int, default=200,
                        help='Number of iterations of the algorithm')
    parser.add_argument('-r', '--rng_seed', type=int, default=None,
                        help='The seed for the random number generator')
    parser.add_argument('-s', '--save', metavar='DIR', type=str,
                        help='Save the audio files to %(metavar)s')
    parser.add_argument('--mono', action='store_true',
                        help='Only save the first channel')
    parser.add_argument('--save_rir', type=str, metavar='DIR',
                        help='Plot and save a typical RIR to %(metavar)s')
    args = parser.parse_args()
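    # Example invocations (hypothetical paths; one of -l/-a/-e should be given,
    # otherwise args.echoes is None and the transfer function cannot be formed):
    #   python separake_make_samples.py --anechoic -m mu -d univ
    #   python separake_make_samples.py -e 3 -m em -i 300 -r 0 -s out --mono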
    # prepare the dictionary
    if use_dict:
        W_dict = np.load(dict_files[args.dict])['W_dictionary']
        n_latent_var = W_dict.shape[1]  # set by dictionary
        print('Using dictionary with', n_latent_var, 'latent variables')
    else:
        W_dict = None
    # the speech samples
    speech_data = []
    n_speech = len(speech_files)
    for sp_fn in speech_files:
        r, audio = wavfile.read(sp_fn)
        if r != fs:
            raise ValueError('The speech samples should have the same sample rate as the simulation')
        audio = audio.astype(np.float64)  # wav files are integer-valued; convert before normalizing
        audio /= np.std(audio)
        speech_data.append(audio)
    # a 5-wall room
    fp = np.array(floorplan)
    room = pra.Room.from_corners(fp,
                                 fs=fs,
                                 absorption=absorption,
                                 max_order=max_order)

    # add the third dimension
    room.extrude(height, absorption=absorption)

    # place the microphones; they are added as *sources* because the
    # simulation is run in reverse (see reverse_simulate below)
    mics_locs = np.array(mics_locs)
    n_mics = mics_locs.shape[1]
    for m in range(n_mics):
        room.add_source(mics_locs[:, m])
    # set the RNG seed if needed
    if args.rng_seed is not None:
        np.random.seed(args.rng_seed)

    # generate sources at random locations in the room,
    # but ensure they are not too close to the microphones
    bbox = np.array(
        [[min(fp[0]), min(fp[1]), 0],
         [max(fp[0]), max(fp[1]), height]]).T
    n_src_locs = n_src_locations  # number of sources to place
    sources_locs = np.zeros((3, 0))
    while sources_locs.shape[1] < n_src_locs:

        # draw a candidate location in the bounding box
        new_source = np.random.rand(3, 1) * (bbox[:, 1] - bbox[:, 0])[:, None] + bbox[:, 0, None]

        # check that the source is inside the room
        is_in_room = room.is_inside(new_source[:, 0])

        # check that the source is neither too close nor too far from the microphones
        mic_dist = pra.distance(mics_locs, new_source).min()
        distance_mic_ok = (dist_src_mic[0] < mic_dist and mic_dist < dist_src_mic[1])

        select = is_in_room and distance_mic_ok

        # also enforce a minimum distance to the sources already placed
        if sources_locs.shape[1] > 0:
            distance_src_ok = (min_dist_src_src
                               < pra.distance(sources_locs, new_source).min())
            select = select and distance_src_ok

        if select:
            sources_locs = np.concatenate([sources_locs, new_source], axis=1)

    print('Source distances', np.linalg.norm(sources_locs[:, 0] - sources_locs[:, 1]))
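    # Note: the roles of sources and microphones are swapped in the room object:
    # the microphones were added as sources above, and the source locations are
    # registered as a microphone array here. The reverse_simulate utilities
    # (defined in utilities, not shown here) presumably rely on the reciprocity
    # of the room impulse response.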
    source_array = pra.MicrophoneArray(sources_locs, fs)
    room.add_microphone_array(source_array)
    if args.anechoic:
        # 1) We let the room be anechoic and simulate all
        #    microphone signals
        room.max_order = 0  # never reflect!
        room.image_source_model()
        room.compute_rir()
        single_sources = reverse_simulate_all_single_sources(room, speech_data)
        # a flat, unit transfer function at all frequencies models anechoic propagation
        partial_rirs = np.ones((n_mics, n_speech, stft_win_len // 2 + 1))
        gamma = get_gamma('anechoic')

    else:
        # 2) Let the room have echoes and recompute all microphone signals
        room.max_order = max_order
        room.image_source_model()
        room.compute_rir()

        # simulate the propagation of the sources individually;
        # mixing is done later by simple addition
        # shape of single_sources: (n_speech, n_src_locs, n_samples, n_mics)
        single_sources = reverse_simulate_all_single_sources(room, speech_data)

        # compute the partial rirs
        if args.learn:
            partial_rirs = None
            gamma = get_gamma('learn')
        else:
            freqvec = np.fft.rfftfreq(stft_win_len, 1 / room.fs)
            # args.echoes + 1 image sources: presumably the direct path plus N echoes
            partial_rirs = np.swapaxes(
                partial_rir(room, args.echoes + 1, freqvec), 0, 1)
            gamma = get_gamma(args.echoes)
    # mix the signals by simple addition; source i plays speech sample i
    mic_signals = np.zeros(single_sources.shape[-2:])  # (n_samples, n_mics)
    for speech_index in range(n_speech):
        mic_signals += single_sources[speech_index, speech_index, :, :]
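    # sanity check: the mix should be (n_samples, n_mics), per the shape comment above
    assert mic_signals.shape == (single_sources.shape[2], n_mics)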
    # run the method
    if args.method == 'mu':
        # separate using MU
        sep_sources = multinmf_conv_mu_wrapper(
            mic_signals, n_speech, n_latent_var, stft_win_len,
            partial_rirs=partial_rirs,
            W_dict=W_dict, l1_reg=gamma,
            n_iter=args.iter, verbose=True)
    elif args.method == 'em':
        # separate using EM
        sep_sources = multinmf_conv_em_wrapper(
            mic_signals, n_speech, stft_win_len,
            n_latent_var, n_iter=args.iter,
            A_init=partial_rirs, W_init=W_dict,
            update_a=False, update_w=False,
            verbose=True)
    else:
        raise ValueError('Unknown algorithm {} requested'.format(args.method))
    # evaluate separation quality against the single-source reference signals
    n_samples = np.minimum(single_sources.shape[2], sep_sources.shape[1])
    reference_signals = []
    for speech_ind in range(n_speech):
        reference_signals.append(single_sources[speech_ind, speech_ind, :n_samples, :])
    reference_signals = np.array(reference_signals)

    ret = bss_eval_images(reference_signals, sep_sources[:, :n_samples, :])
    print('SDR={} ISR={} SIR={} SAR={}'.format(*ret[:4]))
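    # bss_eval_images also returns the permutation (ret[4]) that matches
    # the estimated sources to the references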
    # normalization factors bringing the peak amplitude to 0.7
    mic_norm = 0.7 / np.max(np.abs(mic_signals))
    sep_src_norm = 0.7 / np.max(np.abs(sep_sources))

    if args.play:
        import sounddevice as sd
        sd.play(mic_signals[:, :2] * mic_norm, samplerate=fs, blocking=True)
        for s in range(n_speech):
            sd.play(sep_sources[s, :, :2] * sep_src_norm, samplerate=fs, blocking=True)
    if args.save is not None:
        if args.mono:
            save_mix = mic_signals[:, 0]
            save_sep = sep_sources[:, :, 0]
        else:
            save_mix = mic_signals
            save_sep = sep_sources

        if args.learn:
            scenario = 'learn'
        elif args.anechoic:
            scenario = 'anechoic'
        else:
            scenario = '{}echoes'.format(args.echoes)

        description = '{}_{}_{}'.format(scenario, args.method, args.dict)
        bnames = [os.path.splitext(os.path.basename(name))[0] for name in speech_files]

        filename = args.save + '/separake_{}_mix_'.format(description) + '_'.join(bnames) + '.wav'
        wavfile.write(filename, fs, save_mix)

        for i, name in enumerate(bnames):
            filename = args.save + '/separake_{}_sep_'.format(description) + name + '.wav'
            wavfile.write(filename, fs, save_sep[i])
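        # For example, running with '-e 3' and the defaults would produce
        #   separake_3echoes_mu_spkr_mix_fq_sample3_fq_sample2.wav
        #   separake_3echoes_mu_spkr_sep_fq_sample3.wav (and one file per source)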
    if args.save_rir is not None:
        n_taps = len(room.rir[0][0])

        import seaborn as sns
        sns.set(style='white', context='paper', font_scale=0.8,
                rc={
                    'figure.figsize': (1.5748, 1.29921),  # 40 x 33 mm
                    'lines.linewidth': 0.5,
                    #'font.family': u'Roboto',
                    #'font.sans-serif': [u'Roboto Thin'],
                    'text.usetex': False,
                })

        plt.figure()
        plt.plot(np.arange(n_taps) / room.fs, room.rir[0][0])
        plt.xlabel('Time [s]')
        plt.yticks([])
        sns.despine(left=True, bottom=True)
        plt.tight_layout(pad=0.5)
        plt.savefig(args.save_rir + '/typical_rir.pdf')