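# test_words.py
# Train a spoken-keyword classifier on the mini_speech_commands dataset:
# each WAV is converted to a dB-scaled STFT spectrogram, split into time
# chunks encoded by a TimeDistributed CNN, and classified with an LSTM +
# softmax head over the word labels.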
import glob
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (LSTM, Input, Dense, BatchNormalization, Conv2D,
                                     MaxPooling2D, Dropout, Flatten, TimeDistributed, Reshape)
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
tf.config.set_visible_devices([], 'GPU')  # force CPU execution

dataFolder = "mini_speech_commands/"
words = [os.path.basename(x) for x in glob.glob(dataFolder + "*")]
words.remove("_background_noise_")  # the noise folder is not a keyword class
batch_size = 4
epochs = 16
block_length = 0.500                       # 500 ms; only used to derive audio_max_length
audio_max_length = int(2 / block_length)   # 2 / 0.5 = 4 (used below as a length in seconds)
frame_length = 512                         # STFT window size in samples
fft_size = int(frame_length // 2 + 1)      # number of STFT frequency bins
step_length = 0.008                        # STFT hop in seconds (8 ms)
split_count = 7                            # time chunks fed to the TimeDistributed CNN
latent_dim = 512                           # LSTM state size
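# audioToTensor: WAV file -> fixed-size log spectrogram. The waveform is
# left-padded with zeros (or truncated from the front) to audioSR *
# audio_max_length samples, transformed with an STFT, and converted to dB as
# 20 * log10(|STFT|) - 60; NaN/Inf values produced by log(0) are zeroed out.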
def audioToTensor(filepath: str):
    audio_binary = tf.io.read_file(filepath)
    audio, audioSR = tf.audio.decode_wav(audio_binary)
    audioSR = tf.get_static_value(audioSR)
    audio = tf.squeeze(audio, axis=-1)
    frame_step = int(audioSR * step_length)  # 16000 * 0.008 = 128
    if len(audio) < audioSR * audio_max_length:
        # tf.zeros keeps the float32 dtype of the decoded audio (np.zeros would be float64)
        audio = tf.concat([tf.zeros([int(audioSR * audio_max_length) - len(audio)]), audio], 0)
    else:
        audio = audio[-int(audioSR * audio_max_length):]
    spectrogram = tf.signal.stft(audio, frame_length=frame_length, frame_step=frame_step)
    spect_real = tf.abs(spectrogram)  # magnitude spectrogram
    spect_real = (tf.math.log(spect_real) / tf.math.log(tf.constant(10, dtype=tf.float32)) * 20) - 60
    spect_real = tf.where(tf.math.is_nan(spect_real), tf.zeros_like(spect_real), spect_real)
    spect_real = tf.where(tf.math.is_inf(spect_real), tf.zeros_like(spect_real), spect_real)
    #spect_real = tf.image.resize(spect_real, (spect_real.shape[0]//2, spect_real.shape[1]//2))#We resize all to be more efficient
    return spect_real
wordToId, idToWord = {}, {}
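# Probe one known file so the spectrogram shape is available for sizing the
# batch arrays and the model's input layer.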
testParts = audioToTensor(os.path.join(dataFolder, 'go/0a9f9af7_nohash_0.wav'))
print("Test", testParts.shape)
X_audio, Y_word = [], []
for i, word in enumerate(words):
    wordToId[word], idToWord[i] = i, word  # lookup tables used at inference time
    for file in glob.glob(os.path.join(dataFolder, word) + '/*.wav'):
        X_audio.append(file)
        Y_word.append(to_categorical([i], num_classes=len(words))[0])
X_audio, Y_word = np.asarray(X_audio), np.asarray(Y_word)
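# Spectrograms are computed lazily, batch by batch: the Sequence keeps only
# file paths and labels in memory and calls audioToTensor when a batch is
# actually requested by fit()/evaluate().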
class MySequence(tf.keras.utils.Sequence):
    def __init__(self, x_audio, y_word, batch_size):
        self.x_audio, self.y_word = x_audio, y_word
        self.batch_size = batch_size

    def __len__(self):
        return len(self.x_audio) // self.batch_size

    def __getitem__(self, idx):
        batch_y = self.y_word[idx * self.batch_size:(idx + 1) * self.batch_size]
        batch_x = np.zeros((self.batch_size, testParts.shape[0], testParts.shape[1]))
        for i in range(self.batch_size):  # was the global batch_size; use the instance's own
            batch_x[i] = audioToTensor(self.x_audio[idx * self.batch_size + i])
        return batch_x, batch_y
X_audio, X_audio_test, Y_word, Y_word_test = train_test_split(X_audio, Y_word)
print("X_audio.shape: ", X_audio.shape)
print("Y_word.shape: ", Y_word.shape)
print("X_audio_test.shape: ", X_audio_test.shape)
print("Y_word_test.shape: ", Y_word_test.shape)
encoder_inputs = Input(shape=(testParts.shape[0], testParts.shape[1]))
normalization = BatchNormalization()(encoder_inputs)
split = Reshape((normalization.shape[1]//split_count, -1, normalization.shape[2], 1))(normalization)
conv2d = TimeDistributed(Conv2D(34, 3, activation='relu'))(split)
conv2d = TimeDistributed(Conv2D(64, 3, activation='relu'))(conv2d)
maxpool = TimeDistributed(MaxPooling2D())(conv2d)
dropout = TimeDistributed(Dropout(0.25))(maxpool)
flatten = TimeDistributed(Flatten())(dropout)
encoder_lstm = LSTM(units=latent_dim)(flatten)
decoder_dense = Dense(len(words), activation='softmax')(encoder_lstm)
model = Model(encoder_inputs, decoder_dense)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()
tf.keras.utils.plot_model(model, to_file='model_words.png', show_shapes=True)
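# With a Sequence the batch size is defined by the generator itself, so fit()
# needs no batch_size argument; shuffle=True only shuffles the batch order.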
history = model.fit(MySequence(X_audio, Y_word, batch_size), shuffle=True, epochs=epochs,
                    validation_data=MySequence(X_audio_test, Y_word_test, batch_size))
model.save("model_words")
metrics = history.history
plt.plot(history.epoch, metrics['loss'], history.epoch, metrics['acc'])
plt.legend(['loss', 'acc'])
plt.savefig("learning-words.png")
plt.show()
plt.close()
score = model.evaluate(MySequence(X_audio_test, Y_word_test, batch_size), verbose=0)  # X_audio_test holds file paths, so evaluate through the Sequence
print('Test loss:', score[0])
print('Test accuracy:', score[1])
print("Test voice recognition")
for test_path, test_string in [('mini_speech_commands/go/0a9f9af7_nohash_0.wav', 'go'), ('mini_speech_commands/right/0c2ca723_nohash_0.wav', 'right')]:
    print("test_string: ", test_string)
    test_audio = audioToTensor(test_path)
    result = model.predict(np.array([test_audio]))
    maxIndex = np.argmax(result)
    print("predicted word: ", result, maxIndex, idToWord[maxIndex])