-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathconvo.py
154 lines (116 loc) · 5.52 KB
/
convo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Imports
import os
import time
import re
import torch
import requests
import torchaudio
import wave
import pyaudio
from glob import glob #globe is require for batching audio input for STT
import speech_recognition as sr
from omegaconf import OmegaConf #used by Silaro
from silero.utils import (init_jit_model, split_into_batches, read_audio, read_batch, prepare_model_input)
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Scratch WAV files: one holds the latest microphone capture, the other the
# latest synthesized bot reply. Both are deleted again by the main loop.
audio_input_file = "micAudioInput.wav"
audio_output_file = "ttsAudioOutput.wav"

# Silero model selection (language is shared by the TTS loader below).
language = 'en'
speaker = 'lj_v2'

# Audio parameters: sample rate is used for both the speaker stream and the
# TTS output; CHUNK is the number of frames read per playback write.
sample_rate = 16000
CHUNK = 1024

# Last transcription produced by the speech-to-text step.
you_said = ""

# Default Rasa REST channel endpoint.
rasa_bot_url = 'http://localhost:5005/webhooks/rest/webhook'

# Any PyTorch device can be used here.
device = torch.device('cpu')

# ---------------------------------------------------------------------------
# Audio input / output handles
# ---------------------------------------------------------------------------
# Recognizer used to capture microphone audio.
r_audio = sr.Recognizer()

# PyAudio instance and output stream used to play the synthesized speech.
# pyaudio.paInt16 has the numeric value 8 (16-bit samples, mono).
p = pyaudio.PyAudio()
SpeakerStream = p.open(
    format=pyaudio.paInt16,
    channels=1,
    rate=sample_rate,
    output=True,
)

# ---------------------------------------------------------------------------
# Silero models - https://github.com/snakers4/silero-models
# ---------------------------------------------------------------------------
# Speech-to-text model together with its decoder and helper utilities.
stt_model, decoder, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_stt',
    jit_model='jit_xlarge',
    language='en',
    device=device,
)
stt_model.to(device)

# Text-to-speech model (the loader also returns an example text).
tts_model, example_text = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_tts',
    language=language,
    speaker=speaker,
)
tts_model.to(device)
# Main conversation loop: mic capture -> speech-to-text -> Rasa bot ->
# text-to-speech -> playback. It keeps running until the user says something
# containing "exit", "stop" or "quit". No separate hotword detection is used.

# One precompiled pattern; plain substring matching on the transcription.
_STOP_WORDS = re.compile(r"exit|stop|quit")
# Upper bound on waiting for the bot so a hung server cannot freeze the loop.
_BOT_TIMEOUT_SECONDS = 30
# Spoken when the bot returns no messages at all.
_FALLBACK_REPLY = "I did not quite catch that..."

try:
    while True:
        loop_start_time = time.time()

        # Step 1 - Take mic input.
        # If the mic needs tuning, call
        # r_audio.adjust_for_ambient_noise(SR_AudioSource, duration=1.0) or set
        # r_audio.energy_threshold (roughly 50-4000) before listening.
        try:
            with sr.Microphone() as SR_AudioSource:
                print("Say something...")
                mic_audio = r_audio.listen(
                    SR_AudioSource, timeout=1, phrase_time_limit=4
                )
        except Exception:
            # Covers "nobody spoke before the timeout" as well as device
            # errors. Deliberately not a bare except, so that Ctrl+C
            # (KeyboardInterrupt) still stops the program.
            print("Cound not capture mic input or no audio...")
            continue

        # Save the captured audio to a file; the STT helpers below read from
        # file paths. The with-block flushes and closes the file.
        with open(audio_input_file, "wb") as wav_file:
            wav_file.write(mic_audio.get_wav_data())

        audio_capture_time = time.time()
        print("Audio Capture time = ", audio_capture_time - loop_start_time)

        # Step 2 - Convert recorded speech to text.
        # The file was written just above, so it is read once here and then
        # removed immediately so no path (continue / break) leaves it behind.
        batches = split_into_batches(glob(audio_input_file), batch_size=10)
        model_input = prepare_model_input(read_batch(batches[0]), device=device)
        output = stt_model(model_input)
        you_said = decoder(output[0].cpu())
        if os.path.exists(audio_input_file):
            os.remove(audio_input_file)

        print(you_said)
        if not you_said:
            print("No speech recognized...")
            continue

        # Check if the user wants to stop (could also be done with hotword
        # detection).
        if _STOP_WORDS.search(you_said):
            break

        stt_time = time.time()
        print("time for Speech to Text conversion = ", stt_time - audio_capture_time)

        # Step 3 - Send the text to the bot.
        # Network and decoding failures skip this turn instead of crashing.
        try:
            rasa_call_response = requests.post(
                rasa_bot_url,
                json={"sender": "Ashutosh", "message": you_said},
                timeout=_BOT_TIMEOUT_SECONDS,
            )
            rasa_call_response.raise_for_status()
            bot_response = rasa_call_response.json()
        except (requests.RequestException, ValueError) as err:
            print("Could not get a response from the bot:", err)
            continue
        print("bot_response=", bot_response)

        # The REST webhook is expected to return a list of message dicts.
        # Speak the first message that carries text; messages without a
        # "text" key (e.g. images or buttons) are skipped.
        # TODO: handle other kinds of bot output.
        if isinstance(bot_response, list) and bot_response:
            bot_said = next(
                (
                    message["text"]
                    for message in bot_response
                    if isinstance(message, dict) and message.get("text")
                ),
                "",
            )
        else:
            bot_said = _FALLBACK_REPLY
        print(bot_said)

        bot_response_time = time.time()
        print("time for bot response = ", bot_response_time - stt_time)

        # Nothing to speak if the bot produced no text.
        if not bot_said:
            continue

        # Step 4 - Text to speech.
        tts_audio = tts_model.apply_tts(texts=[bot_said], sample_rate=sample_rate)
        torchaudio.save(
            audio_output_file,
            tts_audio[0].unsqueeze(0),
            sample_rate=sample_rate,
            bits_per_sample=16,
        )
        tts_time = time.time()
        print("time for Text to Speech conversion = ", tts_time - bot_response_time)
        print("time for overall loop = ", tts_time - loop_start_time)

        # Step 5 - Speak the output by streaming the WAV to the speaker in
        # CHUNK-sized reads; always clear the output file afterwards.
        try:
            with wave.open(audio_output_file, 'rb') as wf:
                for frames in iter(lambda: wf.readframes(CHUNK), b""):
                    SpeakerStream.write(frames)
        finally:
            if os.path.exists(audio_output_file):
                os.remove(audio_output_file)
finally:
    # Release the audio device whether the loop ended via a stop word,
    # Ctrl+C, or an unexpected error.
    SpeakerStream.stop_stream()
    SpeakerStream.close()
    p.terminate()