app.py
# Gradio app
# A chatbot that supports audio inputs (the user can upload an audio file).
import gradio as gr
from transformers import AutoModel

# Load the model once; gr.NO_RELOAD keeps this block from re-running when
# Gradio's auto-reload restarts the script during development.
if gr.NO_RELOAD:
    model = AutoModel.from_pretrained("DeSTA-ntu/DeSTA2-8B-beta", trust_remote_code=True)
    model.to("cuda")
    model.eval()
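# A CUDA GPU is assumed above. A defensive variant (a sketch, not part of the
# original app) could pick the device at runtime:
#
#   import torch
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model.to(device)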
def reset_chat(history, chatbot):
    # Fresh system-prompt-only history, cleared chat window, Send disabled,
    # Upload re-enabled.
    history = [{"role": "system", "content": "Focus on the input audio. You are a helpful voice assistant."}]
    return (history, None, gr.update(interactive=False), gr.update(interactive=True))
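# Note: the return above maps onto [history, chatbot, chat_button,
# upload_button], so the audio widget keeps its last file across resets. If
# clearing it were wanted, a sketch (assuming `speech` is appended to that
# outputs list) is:
#
#   return (history, None, gr.update(interactive=False),
#           gr.update(interactive=True), gr.update(value=None))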
def upload_audio(history, speech, text_box, chatbot, chat_button, upload_button):
    if speech is None:
        gr.Warning("⚠️ Please upload an audio file first!", duration=5)
        return (history, speech, text_box, chatbot, chat_button, upload_button)
    # Record the audio turn in the history and echo the file path in the chat.
    history.append({"role": "audio", "content": speech})
    chatbot.append([f"Speech: \n\n{speech}", None])
    return (
        history,
        gr.update(interactive=True),  # speech box
        gr.update(interactive=True, placeholder="Start chatting!"),  # text_box
        chatbot,
        gr.update(interactive=True),  # chat_button
        gr.update(interactive=False),  # upload_button
    )
def user_send_message(history, speech, text_box, chatbot):
    # Append the user's text turn; the assistant slot stays None until
    # model_response fills it in via the .then() chain below.
    history.append({"role": "user", "content": text_box})
    chatbot.append([f"{text_box}", None])
    return (
        history,
        speech,
        gr.update(interactive=True, placeholder="Start chatting!", value=""),  # text_box
        chatbot,
    )
def model_response(history, speech, text_box, chatbot):
    # Greedy decoding over the full multimodal history; attach the reply to
    # the pending chat row created by user_send_message.
    messages = history
    generated_ids = model.chat(messages, max_new_tokens=128, do_sample=False, temperature=1.0, top_p=1.0)
    response = model.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    history.append({"role": "assistant", "content": response})
    chatbot[-1][1] = response
    return (
        history,
        speech,
        gr.update(interactive=True, placeholder="Start chatting!"),  # text_box
        chatbot,
    )
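# For reference, after one full turn the history handed to model.chat looks
# roughly like this (the path and text are illustrative, taken from the
# example lists below):
#
#   [{"role": "system",    "content": "Focus on the input audio. You are a helpful voice assistant."},
#    {"role": "audio",     "content": "assets/audios/p284_159.wav"},
#    {"role": "user",      "content": "Transcribe the speech accurately."},
#    {"role": "assistant", "content": "..."}]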
with gr.Blocks() as demo:
    gr.Markdown("# DeSTA2 demo page")
    message_box = gr.Markdown(value="Have fun!", label="Message")
    history = gr.State([{"role": "system", "content": "Focus on the input audio. You are a helpful voice assistant."}])
    with gr.Row():
        chatbot = gr.Chatbot(label="DeSTA2", height="100%", min_height="400px")
    with gr.Row():
        with gr.Column():
            speech = gr.Audio(label="Audio", type="filepath", sources=["microphone", "upload"])
            upload_button = gr.Button("Upload")
        with gr.Column():
            text_box = gr.Textbox(label="User", interactive=False, placeholder="Upload an audio first!")
            chat_button = gr.Button("Send", interactive=False)
    with gr.Row():
        gr.Button("Reset chat").click(
            reset_chat,
            inputs=[history, chatbot],
            outputs=[history, chatbot, chat_button, upload_button],
        )
    upload_button.click(
        upload_audio,
        inputs=[history, speech, text_box, chatbot, chat_button, upload_button],
        outputs=[history, speech, text_box, chatbot, chat_button, upload_button],
    )
    # Two-step chain: post the user turn immediately, then run generation.
    chat_button.click(
        user_send_message,
        inputs=[history, speech, text_box, chatbot],
        outputs=[history, speech, text_box, chatbot],
    ).then(
        model_response,
        inputs=[history, speech, text_box, chatbot],
        outputs=[history, speech, text_box, chatbot],
    )
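    # Optional wiring (a sketch, not in the original): gr.Textbox also exposes
    # a .submit() event, so pressing Enter could mirror the Send button chain:
    #
    #   text_box.submit(user_send_message,
    #                   inputs=[history, speech, text_box, chatbot],
    #                   outputs=[history, speech, text_box, chatbot]).then(
    #       model_response,
    #       inputs=[history, speech, text_box, chatbot],
    #       outputs=[history, speech, text_box, chatbot])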
    with gr.Row():
        examples_prompt = gr.Examples(
            examples=[
                "Transcribe the speech accurately.",
                "What is the primary emotion conveyed by the speaker?",
                "Describe the content and tone of the audio in detail.",
                "Provide a summary of the audio content.",
                "Identify the language spoken in the recording.",
                "What does the background noise in the audio indicate?",
                "Identify if the speaker has a specific accent and describe it.",
                "What is the gender and approximate age of the speaker?",
                "Summarize the conversation happening in this audio.",
                "Classify the type of audio: speech, music, noise, or mixed.",
                "Assess the clarity and intelligibility of the speech.",
                "What is the emotional state of the speaker, and why do you think so?",
                "Provide a timestamped breakdown of key events in the audio.",
                # Mandarin prompts (English glosses in the trailing comments):
                "將這段語音轉成文字,請確保準確的時間點。",  # Transcribe this speech to text; make sure the timestamps are accurate.
                "你能辨認出這段語音的情感是什麼嗎?",  # Can you identify the emotion of this speech?
                "這段聲音中的說話者有什麼情緒?",  # What emotion does the speaker in this audio show?
                "從這段聲音中提取關鍵詞。",  # Extract the keywords from this audio.
                "請翻譯這段語音的內容。",  # Please translate the content of this speech.
                "從這段聲音中找出說話者的性別和口音。",  # Identify the speaker's gender and accent from this audio.
            ],
            inputs=[text_box],
            label="Example prompts",
        )
    with gr.Row():
        examples = gr.Examples(
            examples=[
                ["assets/audios/0_000307.wav"],
                ["assets/audios/4_0_d47.wav"],
                ["assets/audios/7_1_d7.wav"],
                ["assets/audios/AccentClassification_AccentdbExtended_0193_british_s01_176.wav"],
                ["assets/audios/DialogueEmotionClassification_DailyTalk_0196_7_1_d756.wav"],
                ["assets/audios/EmotionRecognition_MultimodalEmotionlinesDataset_0026_dia382_utt0.wav"],
                ["assets/audios/LanguageIdentification_VoxForge_0000_de143-43.flac"],
                ["assets/audios/MUL0608_120.98_148.92.wav"],
                ["assets/audios/NoiseDetection_LJSpeech_MUSAN-Music_0199_music_LJSpeech-1.1_16k_LJ050-0033.wav"],
                ["assets/audios/Ses01F_script03_1_F029.wav"],
                ["assets/audios/Ses01M_script01_1_F014.wav"],
                ["assets/audios/Ses04F_impro02_M004.wav"],
                ["assets/audios/SpeakerVerification_LibriSpeech-TestClean_0046_3575-170457-0038.flac"],
                ["assets/audios/SpeechTextMatching_LJSpeech_0001_LJ001-0107.wav"],
                ["assets/audios/common_voice_en_34980360.mp3"],
                ["assets/audios/p284_159.wav"],
                ["assets/audios/p287_162.wav"],
            ],
            inputs=[speech],
            label="Example audios",
        )
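# Serving note (an assumption, not part of the original file): generation can
# take several seconds per turn, so enabling Gradio's request queue before
# launching is a common pattern, e.g.:
#
#   demo.queue(max_size=16)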
if __name__ == "__main__":
    demo.launch(share=True)