-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathphone_embedding.py
46 lines (35 loc) · 1.49 KB
/
phone_embedding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
from deepspeech.utils import audioToInputVector
from DeepSpeech import create_inference_graph, initialize_globals
import tensorflow as tf
import scipy.io.wavfile as wav
model_path = "./data/models/output_graph.pb"
def load_graph(frozen_graph_filename):
# We load the protobuf file from the disk and parse it to retrieve the
# unserialized graph_def
with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
# Then, we import the graph_def into a new Graph and returns it
with tf.Graph().as_default() as graph:
# The name var will prefix every op/nodes in your graph
# Since we load everything in a new graph, this is not needed
tf.import_graph_def(graph_def, name="prefix")
return graph
def infer_audio(input_file_path):
initialize_globals()
n_input = 26
n_context = 9
graph = load_graph(model_path)
# We access the input and output nodes
inp = graph.get_tensor_by_name('prefix/input_node:0')
inp_len = graph.get_tensor_by_name('prefix/input_lengths:0')
logits = graph.get_tensor_by_name('prefix/logits:0')
op = graph.get_tensor_by_name('prefix/output_node:0')
with tf.Session(graph=graph) as session:
fs, audio = wav.read(input_file_path)
mfcc = audioToInputVector(audio, fs, n_input, n_context)
output = session.run(logits, feed_dict={
inp: [mfcc],
inp_len: [len(mfcc)],
})
return output