-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathspectrogram.py
149 lines (119 loc) · 3.31 KB
/
spectrogram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import matplotlib as mpl
import argparse
import pydub
import numpy as np
from matplotlib.colors import LinearSegmentedColormap, to_rgba_array
# Default values for the optional command-line options. Each one is shown in
# the matching option's help text and used as that option's argparse default.
c_window_size = 2048  # --size: window length handed to np.hanning / specgram
c_freq_min = 0  # --freq-min: lower limit applied to the plot's y axis
c_freq_max = 20000  # --freq-max: upper limit applied to the plot's y axis
c_channel = 0  # --channel: row of the waveform array that gets plotted
c_factor = 16  # --factor: value the largest ratio component is scaled to
def audio_to_array(audio):
    """Convert an audio segment into a normalized, per-channel sample array.

    Args:
        audio: Object exposing ``frame_rate``, ``channels``, ``sample_width``
            (bytes per sample) and ``get_array_of_samples()``, such as a
            ``pydub.AudioSegment``.

    Returns:
        A ``(waveform, duration, sample_rate)`` tuple. ``waveform`` has shape
        ``(channels, frames)`` with samples divided by half of the integer
        range implied by ``sample_width``. ``duration`` is the number of
        frames divided by ``sample_rate``. ``sample_rate`` is
        ``audio.frame_rate`` unchanged.
    """
    sample_rate = audio.frame_rate
    channels = audio.channels

    # Half of the full integer range for this sample width, e.g. 32768 for
    # 2-byte samples, so the scaled values fall within roughly [-1, 1).
    divisor = 2 ** (audio.sample_width * 8) / 2

    # Multi-channel samples arrive interleaved frame by frame
    # (L, R, L, R, ...). Group them per frame first, then transpose so that
    # each row holds exactly one channel. Reshaping straight to
    # (channels, -1) would instead cut the interleaved stream into
    # contiguous halves and mix the channels together.
    samples = np.array(audio.get_array_of_samples()).reshape((-1, channels)).T

    waveform = samples / divisor
    duration = samples.size / sample_rate / channels
    return waveform, duration, sample_rate
def parse_ratio(ratio):
    """Turn a colon-separated ratio string into a float array.

    Args:
        ratio: Text such as ``"16:9"``; every ``:``-separated piece is
            converted with ``float``.

    Returns:
        A NumPy array holding the components in the order they were written.

    Raises:
        ValueError: If a component cannot be converted by ``float``.
    """
    components = [float(piece) for piece in ratio.split(':')]
    return np.array(components)
def normalize_ratio(ratio, factor):
    """Rescale a ratio so that its largest component equals ``factor``.

    Args:
        ratio: NumPy array of ratio components.
        factor: Value the largest component is scaled to.

    Returns:
        A new array with every component divided by the maximum and then
        multiplied by ``factor``.
    """
    largest = ratio.max()
    unit_ratio = ratio / largest
    return unit_ratio * factor
def increase_saturation(cmap, factor=2.0):
    """Build a copy of a colormap with its HSV saturation multiplied.

    Args:
        cmap: Colormap object that exposes ``N`` and can be called with an
            array of integer indices to obtain colour rows.
        factor: Multiplier applied to the saturation channel; the result is
            clipped to the range 0..1.

    Returns:
        A ``LinearSegmentedColormap`` named ``"saturated"`` with the same
        number of entries as ``cmap``. Only the first three colour columns
        are carried over, so any fourth (alpha) column is not preserved.
    """
    entry_count = cmap.N
    table = cmap(np.arange(entry_count))

    # Work in HSV so that only the saturation column needs to change.
    hsv = colors.rgb_to_hsv(table[:, :3])
    hsv[:, 1] = np.clip(hsv[:, 1] * factor, 0, 1)
    boosted_rgb = colors.hsv_to_rgb(hsv)

    return LinearSegmentedColormap.from_list("saturated", boosted_rgb, entry_count)
def parse_arguments():
    """Define the command-line interface and parse the process arguments.

    Returns:
        The ``argparse.Namespace`` produced by ``parse_args()``, with the
        attributes ``input``, ``output``, ``ratio``, ``view``, ``factor``,
        ``channel``, ``size``, ``freq_min`` and ``freq_max``.
    """
    parser = argparse.ArgumentParser(description="A CLI tool to transform audio into spectrograms")

    # Required string options: (flags, help text).
    required_options = (
        (("--input", "-i"), "The audio file to use"),
        (("--output", "-o"), "The output filename"),
        (("--ratio", "-r"), "The image ratio, format should be X:Y"),
    )
    for flags, help_text in required_options:
        parser.add_argument(*flags, help=help_text, required=True, type=str)

    # Optional Arguments
    parser.add_argument("--view", "-v", help="Open a window with the result", action="store_true")

    # Optional numeric options: (flags, help text, converter, default).
    numeric_options = (
        (("--factor", "-f"), f"Scaling factor to use (defaults to {c_factor})", float, c_factor),
        (("--channel", "-c"), f"Which channel to use (defaults to {c_channel})", int, c_channel),
        (("--size", "-s"), f"Sets the window size (defaults to {c_window_size})", int, c_window_size),
        (("--freq-min",), f"Sets the minimum frequency (defaults to {c_freq_min})", int, c_freq_min),
        (("--freq-max",), f"Sets the maximum frequency (defaults to {c_freq_max})", int, c_freq_max),
    )
    for flags, help_text, converter, fallback in numeric_options:
        parser.add_argument(*flags, help=help_text, type=converter, default=fallback)

    return parser.parse_args()
def main():
    """Render one channel of an audio file as a spectrogram image.

    Parses the command-line options, loads the input file through pydub,
    draws the selected channel with ``Axes.specgram`` on a figure whose axes
    fill the whole canvas, optionally opens a window with the result, and
    then saves the figure to the requested output path.
    """
    args = parse_arguments()

    segment = pydub.AudioSegment.from_file(args.input)
    waveform, _duration, sample_rate = audio_to_array(segment)

    # Scale the requested ratio so its largest component equals the factor.
    figure_dims = normalize_ratio(parse_ratio(args.ratio), args.factor)

    fft_size = args.size
    half_window = fft_size // 2
    hann_window = np.hanning(fft_size)

    # NOTE(review): the ratio components are handed to figsize in reverse
    # order, so the second component of "X:Y" becomes the first figsize
    # entry — confirm this orientation is intended.
    fig, ax = plt.subplots(figsize=tuple(figure_dims[::-1]))

    # Let the axes occupy the full figure so no margins end up in the image.
    fig.subplots_adjust(left=0, bottom=0, right=1, top=1)

    ax.specgram(
        waveform[args.channel] * 20,
        NFFT=fft_size,
        Fs=sample_rate,
        Fc=0,
        window=hann_window,
        noverlap=half_window,
        cmap=increase_saturation(mpl.colormaps["gray"], 2),
        vmin=-100,
        vmax=0,
    )
    ax.set_ylim(args.freq_min, args.freq_max)
    ax.axis("off")

    if args.view:
        plt.show()

    fig.savefig(args.output, pad_inches=0)
    plt.close(fig)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()