-
Notifications
You must be signed in to change notification settings - Fork 303
/
intensity.py
221 lines (160 loc) · 7.22 KB
/
intensity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# pyre-unsafe
from typing import Any, Callable, Dict, Optional, Tuple
import numpy as np
"""
This file contains 'intensity' functions for each of our augmentations.
Intensity functions give a float representation of how intense a particular
transform called with particular parameters is.
Each intensity function expects as parameters the kwargs the corresponding
transform was called with, as well as the metadata dictionary computed by the
transform (e.g. metadata[-1] after passing a list 'metadata' into the transform).
All intensity functions are normalized to be in [0, 100]. This means we are
assuming a range of valid values for each param - e.g. for pitch_shift we
assume we will never pitch shift more than an octave, meaning the range of
valid values for n_steps is [-12.0, 12.0], which you can see is assumed in the
intensity function below.
"""
def add_background_noise_intensity(snr_level_db: float = 10.0, **kwargs) -> float:
assert isinstance(snr_level_db, (float, int)), "snr_level_db must be a number"
max_snr_level_db_val = 110.0
return min(
((max_snr_level_db_val - snr_level_db) / max_snr_level_db_val) * 100.0, 100.0
)
def apply_lambda_intensity(
aug_function: Callable[..., Tuple[np.ndarray, int]],
**kwargs,
) -> float:
intensity_func = globals().get(f"{aug_function}_intensity")
return intensity_func(**kwargs) if intensity_func else 100.0
def change_volume_intensity(volume_db: float = 0.0, **kwargs) -> float:
assert isinstance(volume_db, (float, int)), "volume_db must be a nonnegative number"
max_volume_db_val = 110.0
return min((abs(volume_db) / max_volume_db_val) * 100.0, 100.0)
def clicks_intensity(
seconds_between_clicks: float = 0.5, snr_level_db: float = 1.0, **kwargs
) -> float:
assert (
isinstance(seconds_between_clicks, (float, int)) and seconds_between_clicks >= 0
), "seconds_between_clicks must be a nonnegative number"
assert isinstance(snr_level_db, (float, int)), "snr_level_db must be a number"
max_seconds_between_clicks_val = 60.0
max_snr_level_db_val = 110.0
seconds_between_clicks_intensity = (
max_seconds_between_clicks_val - seconds_between_clicks
) / max_seconds_between_clicks_val
snr_level_db_intensity = (
max_snr_level_db_val - snr_level_db
) / max_snr_level_db_val
return min(
(seconds_between_clicks_intensity * snr_level_db_intensity) * 100.0, 100.0
)
def clip_intensity(duration_factor: float = 1.0, **kwargs) -> float:
assert 0 < duration_factor <= 1, "duration_factor must be a number in (0, 1]"
max_duration_factor = 1.0
return min(
((max_duration_factor - duration_factor) / max_duration_factor) * 100.0, 100.0
)
def harmonic_intensity(**kwargs) -> float:
return 100.0
def high_pass_filter_intensity(cutoff_hz: float = 3000.0, **kwargs) -> float:
assert (
isinstance(cutoff_hz, (float, int)) and cutoff_hz >= 0
), "cutoff_hz must be a nonnegative number"
max_cutoff_hz_val = 20000.0
return min((cutoff_hz / max_cutoff_hz_val) * 100.0, 100.0)
def insert_in_background_intensity(metadata: Dict[str, Any], **kwargs) -> float:
bg_to_src_duration_ratio = (
metadata["dst_duration"] - metadata["src_duration"]
) / metadata["dst_duration"]
return bg_to_src_duration_ratio * 100.0
def invert_channels_intensity(metadata: Dict[str, Any], **kwargs) -> float:
return 0.0 if metadata["src_num_channels"] == 1 else 100.0
def loop_intensity(n: int = 1, **kwargs) -> float:
assert isinstance(n, int) and n >= 0, "Expected 'n' to be a nonnegative integer"
max_num_loops = 100
return min((n / max_num_loops) * 100.0, 100.0)
def low_pass_filter_intensity(cutoff_hz: float = 500.0, **kwargs) -> float:
assert (
isinstance(cutoff_hz, (float, int)) and cutoff_hz >= 0
), "cutoff_hz must be a nonnegative number"
max_cutoff_hz_val = 20000.0
return min(((max_cutoff_hz_val - cutoff_hz) / max_cutoff_hz_val) * 100.0, 100.0)
def normalize_intensity(norm: Optional[float] = np.inf, **kwargs) -> float:
return 100.0 if norm else 0.0
def peaking_equalizer_intensity(q: float, gain_db: float, **kwargs) -> float:
assert isinstance(q, (int, float)) and q > 0, "Expected 'q' to be a positive number"
assert isinstance(gain_db, (int, float)), "Expected 'gain_db' to be a number"
max_q_val, max_gain_db_val = 46, 110.0
q_intensity = (max_q_val - q) / max_q_val
gain_db_intensity = abs(gain_db) / max_gain_db_val
return min((q_intensity * gain_db_intensity) * 100.0, 100.0)
def percussive_intensity(**kwargs) -> float:
return 100.0
def pitch_shift_intensity(n_steps: float = 2.0, **kwargs) -> float:
assert isinstance(n_steps, (float, int)), "n_steps must be a number"
max_nsteps_val = 84.0
return min((abs(n_steps) / max_nsteps_val) * 100.0, 100.0)
def reverb_intensity(
reverberance: float = 50.0,
wet_only: bool = False,
room_scale: float = 100.0,
**kwargs,
) -> float:
assert (
isinstance(reverberance, (float, int))
and 0 <= reverberance <= 100
and isinstance(room_scale, (float, int))
and 0 <= room_scale <= 100
), "reverberance & room_scale must be numbers in [0, 100]"
if wet_only:
return 100.0
max_reverberance_val = 100.0
max_room_scale_val = 100.0
return min(
(reverberance / max_reverberance_val)
* (room_scale / max_room_scale_val)
* 100.0,
100.0,
)
def speed_intensity(factor: float = 2.0, **kwargs) -> float:
assert (
isinstance(factor, (float, int)) and factor > 0
), "factor must be a positive number"
if factor == 1.0:
return 0.0
max_factor_val = 10.0
# We want the intensity of factor = 2 to be the same as the intensity of
# factor = 0.5, since they both change the speed by 2x.
# speed_change_factor represents how much the speed of the audio has changed,
# with a value in [1, inf).
speed_change_factor = factor if factor >= 1 else 1 / factor
return min((speed_change_factor / max_factor_val) * 100.0, 100.0)
def tempo_intensity(factor: float = 2.0, **kwargs) -> float:
assert (
isinstance(factor, (float, int)) and factor > 0
), "factor must be a positive number"
if factor == 1.0:
return 0.0
max_factor_val = 10.0
speed_change_factor = factor if factor >= 1 else 1 / factor
return min((speed_change_factor / max_factor_val) * 100.0, 100.0)
def time_stretch_intensity(rate: float = 1.5, **kwargs) -> float:
assert (
isinstance(rate, (float, int)) and rate > 0
), "factor must be a positive number"
if rate == 1.0:
return 0.0
max_rate_val = 10.0
speed_change_rate = rate if rate >= 1 else 1 / rate
return min((speed_change_rate / max_rate_val) * 100.0, 100.0)
def to_mono_intensity(metadata: Dict[str, Any], **kwargs) -> float:
return 0.0 if metadata["src_num_channels"] == 1 else 100.0
def fft_convolve_intensity(**kwargs) -> float:
# This is a full convolution, so return 100.0 here.
return 100.0