From d2265f3ac5918b67695d7fd6ef4e169341ad0735 Mon Sep 17 00:00:00 2001
From: topcuemre-analog <141146278+topcuemre-analog@users.noreply.github.com>
Date: Thu, 24 Aug 2023 12:05:49 +0300
Subject: [PATCH 1/2] kws20.py: apply each augmentation with a 66% chance

Each augmentation (noise, time_stretch, shift) is applied independently
with a random chance of 2/3.
---
 datasets/kws20.py | 97 ++++++++++++++++++++++++++++-------------------
 1 file changed, 57 insertions(+), 40 deletions(-)

diff --git a/datasets/kws20.py b/datasets/kws20.py
index e49410fb9..99c08bfaa 100644
--- a/datasets/kws20.py
+++ b/datasets/kws20.py
@@ -150,10 +150,10 @@ def __parse_augmentation(self, augmentation):
             print('No key `shift` in input augmentation dictionary! '
                   'Using defaults: [Min:-0.1, Max: 0.1]')
             self.augmentation['shift'] = {'min': -0.1, 'max': 0.1}
-        if 'strech' not in augmentation:
-            print('No key `strech` in input augmentation dictionary! '
+        if 'stretch' not in augmentation:
+            print('No key `stretch` in input augmentation dictionary! '
                   'Using defaults: [Min: 0.8, Max: 1.3]')
-            self.augmentation['strech'] = {'min': 0.8, 'max': 1.3}
+            self.augmentation['stretch'] = {'min': 0.8, 'max': 1.3}
 
     def __download(self):
@@ -201,11 +201,7 @@ def __gen_datasets(self, exp_len=16384):
             record_list = sorted(os.listdir(os.path.join(self.raw_folder, label)))
             record_len = len(record_list)
 
-            if not self.save_unquantized:
-                data_in = np.empty((record_len,
-                                    exp_len), dtype=np.uint8)
-            else:
-                data_in = np.empty((record_len,
+            data_in = np.empty((record_len,
                                     exp_len), dtype=np.float32)
 
@@ -234,15 +230,7 @@ def __gen_datasets(self, exp_len=16384):
                     record = np.pad(record, [0, exp_len - record.size])
 
                 data_type[r, 0] = d_typ
-
-                if not self.save_unquantized:
-                    data_in[r] = \
-                        KWS.quantize_audio(record,
-                                           num_bits=self.quantization['bits'],
-                                           compand=self.quantization['compand'],
-                                           mu=self.quantization['mu'])
-                else:
-                    data_in[r] = record
+                data_in[r] = record
 
         dur = time.time() - time_s
         print(f'Finished in {dur:.3f} seconds.')
@@ -269,27 +257,36 @@ def __gen_datasets(self, exp_len=16384):
     def __dynamic_augment(self, record, fs = 16000, verbose=False, exp_len=16384, row_len=128, overlap_ratio=0):
         audio = self.augment(record, fs)
-        audio = np.array(audio, np.uint8)
+        data_in = self.reshape_file(audio)
+
+        return data_in
+
+    def reshape_file(self, audio, row_len = 128, exp_len=16384, overlap_ratio=0):
         overlap = int(np.ceil(row_len * overlap_ratio))
         num_rows = int(np.ceil(exp_len / (row_len - overlap)))
-        data_len = int((num_rows * row_len - (num_rows - 1) * overlap))
 
         if not self.save_unquantized:
-            data_in = np.empty((row_len, num_rows), dtype=np.uint8)
+            data_in = np.empty((row_len, num_rows), dtype=np.uint8)
         else:
             data_in = np.empty((row_len, num_rows), dtype=np.float32)
-
+
         for n_r in range(num_rows):
            start_idx = n_r * (row_len - overlap)
            end_idx = start_idx + row_len
            audio_chunk = audio[start_idx:end_idx]
-            audio_chunk = np.pad(audio_chunk, [0, row_len - audio_chunk.size])
+            audio_chunk = np.pad(audio_chunk, [0, row_len - audio_chunk.shape[0]])
 
-            data_in[:, n_r] = audio_chunk
+            if not self.save_unquantized:
+                data_in[:, n_r] = \
+                    KWS.quantize_audio(audio_chunk,
+                                       num_bits=self.quantization['bits'],
+                                       compand=self.quantization['compand'],
+                                       mu=self.quantization['mu'])
+            else:
+                data_in[:, n_r] = audio_chunk
 
         data_in = torch.from_numpy(data_in)
-
         return data_in
@@ -463,7 +460,7 @@ def __filter_dtype(self):
         self.data = self.data[idx_to_select, :]
         self.targets = self.targets[idx_to_select, :]
-        del self.data_type
+        self.data_type = self.data_type[idx_to_select, :]
 
     def __filter_classes(self):
@@ -487,9 +484,12 @@ def __len__(self):
         return len(self.data)
 
     def __getitem__(self, index):
-        inp, target = self.data[index], int(self.targets[index])
+        inp, target, data_type = self.data[index], int(self.targets[index]), self.data_type[index]
 
-        inp = self.__dynamic_augment(inp)
+        if data_type == 0:
+            inp = self.__dynamic_augment(inp)
+        else:
+            inp = self.reshape_file(inp)
 
         inp = inp.type(torch.FloatTensor)
@@ -504,21 +504,22 @@ def __getitem__(self, index):
     def add_white_noise(audio, noise_var_coeff):
         """Adds zero mean Gaussian noise to image with specified variance.
         """
-        coeff = noise_var_coeff * np.mean(np.abs(audio))
-        noisy_audio = audio + coeff * np.random.randn(len(audio))
+        audio_mean = torch.mean(torch.abs(audio))
+        coeff = noise_var_coeff * audio_mean
+        noisy_audio = audio + coeff * torch.randn(len(audio))
         return noisy_audio
 
     @staticmethod
-    def shift(audio, shift_sec, fs):
+    def shift(audio, shift_sec, fs=16000):
         """Shifts audio.
         """
         shift_count = int(shift_sec * fs)
-        return np.roll(audio, shift_count)
+        return torch.roll(audio, shift_count)
 
     @staticmethod
     def stretch(audio, rate=1):
         """Stretches audio with specified ratio.
-        """
+        """
         input_length = 16000
         audio2 = librosa.effects.time_stretch(audio, rate)
         if len(audio2) > input_length:
@@ -527,6 +528,11 @@ def stretch(audio, rate=1):
             audio2 = np.pad(audio2, (0, max(0, input_length - len(audio2))), "constant")
         return audio2
 
+
+    @staticmethod
+    def stretch_(audio, rate=1):
+        return torch.from_numpy(tsm.wsola(audio, rate))
+
     def augment(self, audio, fs, verbose=False):
         """Augments audio by adding random noise, shift and stretch ratio.
@@ -535,17 +541,28 @@
                                                   self.augmentation['noise_var']['max'])
         random_shift_time = np.random.uniform(self.augmentation['shift']['min'],
                                               self.augmentation['shift']['max'])
-        random_strech_coeff = np.random.uniform(self.augmentation['strech']['min'],
-                                                self.augmentation['strech']['max'])
+        random_stretch_coeff = np.random.uniform(self.augmentation['stretch']['min'],
+                                                 self.augmentation['stretch']['max'])
+
+        augment_methods = {
+            "noise_var": [self.add_white_noise, random_noise_var_coeff],
+            "shift": [self.shift, random_shift_time],
+            "stretch": [self.stretch_, random_stretch_coeff]
+        }
+
+        for option in augment_methods:
+            # 66% (2 in 3) chance to apply each augmentation
+            if np.random.randint(3) > 0:
+                aug_func = augment_methods[option][0]
+                audio = aug_func(audio, augment_methods[option][1])
+            else:
+                continue
 
-        aug_audio = tsm.wsola(audio, random_strech_coeff)
-        aug_audio = self.shift(aug_audio, random_shift_time, fs)
-        aug_audio = self.add_white_noise(aug_audio, random_noise_var_coeff)
         if verbose:
             print(f'random_noise_var_coeff: {random_noise_var_coeff:.2f}\nrandom_shift_time: \
-                    {random_shift_time:.2f}\nrandom_strech_coeff: {random_strech_coeff:.2f}')
-        return aug_audio
+                    {random_shift_time:.2f}\nrandom_stretch_coeff: {random_stretch_coeff:.2f}')
+        return audio
 
     def augment_multiple(self, audio, fs, n_augment, verbose=False):
         """Calls `augment` function for n_augment times for given audio data.
@@ -628,7 +645,7 @@ def KWS_get_datasets(data, load_train=True, load_test=True, num_classes=6):
        raise ValueError(f'Unsupported num_classes {num_classes}')
 
    augmentation = {'aug_num': 2, 'shift': {'min': -0.15, 'max': 0.15},
-                   'noise_var': {'min': 0, 'max': 1}}
+                   'noise_var': {'min': 0, 'max': 0.05}}
    quantization_scheme = {'compand': False, 'mu': 10}
 
    if load_train:
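Note on the gating introduced above: each entry in augment_methods is applied
independently with probability 2/3, because np.random.randint(3) returns 0, 1,
or 2 with equal probability and the augmentation fires whenever the draw is
greater than 0. A minimal, self-contained sketch of that gating follows; the
stand-in lambdas are hypothetical and only mimic the shape of the real
add_white_noise/shift helpers, they are not the dataset code itself.

import numpy as np
import torch

def apply_with_two_thirds_chance(audio, methods):
    """Apply each (func, coeff) pair independently with probability 2/3."""
    for _name, (func, coeff) in methods.items():
        # randint(3) is 0, 1 or 2; values > 0 occur 2 out of 3 times
        if np.random.randint(3) > 0:
            audio = func(audio, coeff)
    return audio

record = torch.randn(16384)  # dummy one-second record at 16 kHz
methods = {
    'noise_var': (lambda a, c: a + c * torch.randn(len(a)), 0.02),  # crude white noise
    'shift': (lambda a, s: torch.roll(a, int(s * 16000)), 0.1),     # crude time shift
}
augmented = apply_with_two_thirds_chance(record, methods)
print(augmented.shape)

With three independent 2/3 gates, roughly 1 sample in 27 passes through with no
augmentation at all, so some unmodified examples remain in the training stream.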
From 62114bb74c1b398c62a52ccbd401497457a951b0 Mon Sep 17 00:00:00 2001
From: topcuemre-analog <141146278+topcuemre-analog@users.noreply.github.com>
Date: Thu, 24 Aug 2023 12:29:15 +0300
Subject: [PATCH 2/2] Add a training log parser

Parses a training log file so its outputs are easier to interpret:
epoch, loss, and Top-1 accuracy values are extracted and plotted.
---
 utils/log_parser | 73 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 73 insertions(+)
 create mode 100644 utils/log_parser

diff --git a/utils/log_parser b/utils/log_parser
new file mode 100644
index 000000000..3541825d2
--- /dev/null
+++ b/utils/log_parser
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+
+"""
+Training and validation log parser
+"""
+
+import re
+
+import matplotlib.pyplot as plt
+
+
+def log_parser(log_path, plot_loss=True, plot_acc=True):
+    # Regular expressions for the target values to extract
+    epoch_pattern = re.compile(r'Epoch: \[(\d+)\]')
+    loss_pattern = re.compile(r'Overall Loss (\d+\.\d+)')
+    validation_loss_pattern = re.compile(r'Loss\s*([\d.]+)')
+    top1_pattern = re.compile(r'Top1\s*([\d.]+)')
+
+    # Open and read the log file
+    with open(log_path, 'r') as log_file:
+        log_contents = log_file.read()
+
+    # Find the corresponding values
+    epoch_matches = re.findall(epoch_pattern, log_contents)
+    loss_matches = re.findall(loss_pattern, log_contents)
+    validation_loss_matches = re.findall(validation_loss_pattern, log_contents)
+    top1_matches = re.findall(top1_pattern, log_contents)
+
+    # Convert the extracted data to the appropriate data types
+    epochs = [int(match) for match in epoch_matches]
+    losses = [float(match) for match in loss_matches]
+    validation_losses = [float(match) for match in validation_loss_matches]
+    top1_accuracies = [float(match) for match in top1_matches]
+
+    # Make sure epochs, training losses and validation losses have the same length
+    min_length = min(len(epochs), len(losses), len(validation_losses))
+    epochs = epochs[:min_length]
+    training_losses = losses[:min_length]
+    validation_losses = validation_losses[:min_length]
+
+    # Make sure epochs and Top-1 accuracies have the same length
+    top1_length = min(len(epochs), len(top1_accuracies))
+    top1_epochs = epochs[:top1_length]
+    top1_accuracies = top1_accuracies[:top1_length]
+
+    # Plot training and validation loss vs. epoch
+    if plot_loss:
+        plt.plot(epochs, training_losses, label='Training Loss')
+        plt.plot(epochs, validation_losses, label='Validation Loss', color='r')
+        plt.legend(loc='upper right')
+        plt.xlabel('Epoch')
+        plt.ylabel('Objective Loss')
+        plt.title('Training Objective Loss Over Epochs')
+        plt.grid(True)
+        plt.show()
+
+    # Plot Top-1 accuracy vs. epoch
+    if plot_acc:
+        plt.figure(figsize=(10, 5))
+        plt.scatter(top1_epochs, top1_accuracies, label='Top-1 Accuracy')
+        plt.xlabel('Epoch')
+        plt.ylabel('Top-1 Accuracy (%)')
+        plt.title('Top-1 Accuracy vs Epoch')
+        plt.grid(True)
+        plt.legend()
+        plt.show()
+
+    return 0
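The new script has no command-line entry point and no .py extension, so one way
to try it is to load it by path from a Python session. A minimal usage sketch,
assuming a training log whose lines contain the patterns the regular
expressions above target ('Epoch: [N]', 'Overall Loss x.xxx', 'Top1 xx.x'); the
log path shown is illustrative only, not a real file in the repository.

import importlib.util
from importlib.machinery import SourceFileLoader

# Load the extensionless script as a module named 'log_parser'
loader = SourceFileLoader('log_parser', 'utils/log_parser')
spec = importlib.util.spec_from_loader('log_parser', loader)
log_parser_mod = importlib.util.module_from_spec(spec)
loader.exec_module(log_parser_mod)

# Plot only the loss curves for one (hypothetical) training run
log_parser_mod.log_parser('logs/2023.08.24-120549.log',
                          plot_loss=True, plot_acc=False)

Renaming the file to utils/log_parser.py would allow a plain
"from utils.log_parser import log_parser" instead of loading it by path.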