-
Notifications
You must be signed in to change notification settings - Fork 0
/
create_audio_tfrecords.py
90 lines (70 loc) · 3.55 KB
/
create_audio_tfrecords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import os
import numpy as np
import librosa
import cv2
import tensorflow as tf
def _bytestring_feature(list_of_bytestrings):
    """Wrap a sequence of byte strings in a ``tf.train.Feature``.

    The sequence is passed unchanged as the ``value`` of a
    ``tf.train.BytesList``.
    """
    bytes_list = tf.train.BytesList(value=list_of_bytestrings)
    return tf.train.Feature(bytes_list=bytes_list)
def _bytes_feature(value):
    """Wrap a single byte string in a ``tf.train.Feature``.

    ``value`` is placed in a one-element list before being handed to
    ``tf.train.BytesList``.
    """
    single_item = [value]
    bytes_list = tf.train.BytesList(value=single_item)
    return tf.train.Feature(bytes_list=bytes_list)
def _floats_feature(value):
    """Flatten ``value`` and wrap it in a float-list ``tf.train.Feature``.

    ``value`` must expose ``reshape`` (e.g. a NumPy array); it is flattened
    to one dimension with ``reshape(-1)`` before being stored.
    """
    flattened = value.reshape(-1)
    float_list = tf.train.FloatList(value=flattened)
    return tf.train.Feature(float_list=float_list)
def _int_feature(list_of_ints):
    """Wrap a sequence of integers in an int64-list ``tf.train.Feature``."""
    int64_list = tf.train.Int64List(value=list_of_ints)
    return tf.train.Feature(int64_list=int64_list)
def _float_feature(list_of_floats):
    """Wrap a sequence of floats in a float-list ``tf.train.Feature``.

    Unlike ``_floats_feature``, the input is stored as given, without any
    flattening.
    """
    float_list = tf.train.FloatList(value=list_of_floats)
    return tf.train.Feature(float_list=float_list)
def to_tfrecord(audio, label, width, height, channels):
    """Build a ``tf.train.Example`` for one spectrogram window.

    The resulting Example carries five features:

    * ``audio``    - ``audio`` flattened into a float list
      (via ``_floats_feature``),
    * ``label``    - the label index as a one-element int64 list,
    * ``width``, ``height``, ``channels`` - each stored as a one-element
      int64 list, exactly as passed in.

    Returns the (unserialized) ``tf.train.Example`` message.
    """
    # Scalar metadata; each value is wrapped in a list because Int64List
    # expects a sequence.
    scalar_fields = {
        'label': label,
        'width': width,
        'height': height,
        'channels': channels,
    }

    feature = {'audio': _floats_feature(audio)}
    for key, scalar in scalar_fields.items():
        feature[key] = _int_feature([scalar])

    features = tf.train.Features(feature=feature)
    return tf.train.Example(features=features)
def scale_minmax(X, min=0.0, max=1.0):
    """Linearly rescale ``X`` so its values span ``[min, max]``.

    Args:
        X: Array exposing ``.min()`` / ``.max()`` (e.g. a NumPy array).
        min: Lower bound of the output range.
        max: Upper bound of the output range.
            (Both names shadow the builtins; they are kept because callers
            may pass them by keyword.)

    Returns:
        An array of the same shape as ``X``. The smallest input value maps
        to ``min`` and the largest to ``max``. If every element of ``X`` is
        identical, all outputs equal ``min``.
    """
    lowest = X.min()
    span = X.max() - lowest
    # A constant input has zero span; dividing by it would turn the whole
    # result into NaN. Use a unit span so the output is uniformly ``min``.
    if span == 0:
        span = 1
    X_std = (X - lowest) / span
    X_scaled = X_std * (max - min) + min
    return X_scaled
if __name__ == "__main__":
    # Script entry point: for one slice of the ASVspoof2019 LA dev protocol,
    # turn every "spoof" utterance into fixed-width log-mel spectrogram
    # windows and write them to a single TFRecord shard.
    audio_dir = "/content/drive/My Drive/Deepfake Detection/data/deepfake_audio/LA_Full_Dataset/LA/ASVspoof2019_LA_dev/"
    protocol_path = "/content/drive/My Drive/Deepfake Detection/data/deepfake_audio/LA_Full_Dataset/LA/ASVspoof2019_LA_cm_protocols/ASVspoof2019.LA.cm.dev.trl.txt"

    # Number of spectrogram frames (columns) per written example.
    window_width = 60

    # Read the protocol inside a context manager so the handle is closed.
    with open(protocol_path) as f:
        lines = f.readlines()

    # Shard selection: drop the first 2548 protocol lines, then keep only the
    # last quarter of what remains.
    lines = lines[2548:]
    lines = lines[3 * len(lines) // 4:]

    c = 0  # number of utterances accepted for windowing
    with tf.io.TFRecordWriter('/content/drive/My Drive/Deepfake Detection/data/deepfake_audio/audio_tfrecords/dev_spoof_4.tfrecord') as out:
        for line in lines:
            fields = line.split()
            # Skip blank or truncated protocol lines instead of crashing.
            if len(fields) < 2:
                continue

            # Second column is the utterance id; last column is the class.
            filename = fields[1]
            if fields[-1] != "spoof":
                continue
            label = 0  # every example in this shard is stored with label 0

            audio_data = audio_dir + "flac/" + filename + ".flac"
            audio_array, sr = librosa.load(audio_data, sr=16000)
            trim_audio_array, _ = librosa.effects.trim(audio_array)

            # `y` is passed by keyword because newer librosa releases reject
            # a positional signal argument.
            # NOTE(review): `sr` is not forwarded, so the mel filterbank uses
            # librosa's default sample rate rather than the 16 kHz the audio
            # was loaded at. Left unchanged so features stay consistent with
            # shards that were already generated - confirm before changing.
            S = librosa.feature.melspectrogram(y=trim_audio_array)
            mels = np.log(S + 1e-9)  # add small number to avoid log(0)

            # min-max scale to fit inside 8-bit range
            img = scale_minmax(mels, 0, 255).astype(np.uint8)
            img = np.flip(img, axis=0)  # put low frequencies at the bottom in image
            img = 255 - img  # invert. make black==more energy

            n_frames = img.shape[1]
            if n_frames < window_width:
                continue

            c += 1
            print(c, filename)

            # Emit every full, non-overlapping window. The crop is kept in its
            # own variable so the full spectrogram is still available for the
            # next window, and the range bound includes a final window that
            # ends exactly on the last frame.
            for start in range(0, n_frames - window_width + 1, window_width):
                window = cv2.cvtColor(img[:, start:start + window_width], cv2.COLOR_GRAY2RGB)
                # NOTE(review): shape[0] (rows) is stored as 'width' and
                # shape[1] (columns) as 'height'. Kept as-is because readers
                # of existing shards presumably rely on it - confirm.
                tfexample = to_tfrecord(window, label, window.shape[0], window.shape[1], window.shape[2])
                out.write(tfexample.SerializeToString())