WhoIsTheHuman.py
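"""Who's the Human?

A voice-controlled party game: AI players acting as historical figures
(Einstein, Genghis Khan, Cleopatra) and one human player introduce themselves
in character, then everyone votes to identify the human among them.
"""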
import random
import warnings
import wave
import webrtcvad
import whisper
import pyaudio
import threading
from openai import OpenAI
import torch
from fuzzywuzzy import fuzz  # Fuzzy string matching for spoken-name recognition
from pydub import AudioSegment
from pydub.playback import play
from pydub.effects import speedup
import os
from TTS.api import TTS
import re
# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning, module='fuzzywuzzy')
warnings.filterwarnings("ignore", category=FutureWarning, module='TTS')
warnings.filterwarnings("ignore", category=UserWarning, module='whisper')
warnings.filterwarnings("ignore", category=FutureWarning, module='whisper')
# Suppress PyTorch warnings
os.environ['TORCH_CPP_LOG_LEVEL'] = 'ERROR'
# Suppress TensorFlow warnings (if applicable)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Initialize TTS models
# TODO: These models sound good, but they suffer from hallucinations; a different TTS system should be used, since Coqui TTS is no longer actively developed.
try:
thorsten_model = TTS("tts_models/de/thorsten/tacotron2-DDC")
jenny_model = TTS("tts_models/en/jenny/jenny")
male_model = TTS("tts_models/en/ljspeech/tacotron2-DDC")
tts_model = TTS("tts_models/en/ljspeech/tacotron2-DDC")
# Move models to GPU if available
tts_device = "cuda" if torch.cuda.is_available() else "cpu"
thorsten_model.to(tts_device)
jenny_model.to(tts_device)
male_model.to(tts_device)
tts_model.to(tts_device)
except Exception as e:
print(f"Failed to initialize TTS models: {e}")
thorsten_model = None
jenny_model = None
male_model = None
tts_model = None
# Initialize the OpenAI client against a local LLM server on port 1234 (replace with your actual setup)
client = OpenAI(base_url="http://localhost:1234/v1", api_key="lm-studio")
modelID = "mradermacher/Reflection-Llama-3.1-8B-GGUF" # Specify the model you are using
# Set up WebRTC Voice Activity Detection (VAD)
vad = webrtcvad.Vad()
vad.set_mode(3) # Set VAD to the most aggressive mode, useful for noisy environments
# Threading flags for controlling recording
recording = threading.Event() # Tracks if recording is active
user_introduced = threading.Event() # Tracks if the user's introduction has been recorded
gm_history = [{"role": "system",
"content": (
"You are the Game Master for the game 'Who's the Human?'. Your responsibilities include:"
"\n1. Guiding the game and explaining the rules clearly."
"\n2. Conducting only two rounds: Introductions and Voting."
"\n3. After the Voting round, conclude the game by announcing the results and declaring the winner."
"\n4. No additional rounds should be initiated under any circumstances."
"\nRules of the Game (explained in one short sentence): "
"In 'Who's the Human?', players introduce themselves in character, then vote to identify the human among them. "
"The player suspected the most as human loses the game.")}]
player_histories = {
"Einstein": [{"role": "system",
"content": "Du spielst Einstein. Bleibe und spiele immer in deinem Charakter. Antworte immer in Deutsch auch wenn du Englisch angesprochen wirst."}],
"Genghis Khan": [{"role": "system",
"content": "You are playing the role of Genghis Khan. Always stay in character and respond as Genghis Khan would."}],
"Cleopatra": [{"role": "system",
"content": "You are playing the role of Cleopatra. Always stay in character and respond as Cleopatra would."}]
}
# Fuzzy-match spoken names, since transcriptions rarely match a name exactly.
def match_name(input_text, valid_names, threshold=60):
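    """Return the valid name that best fuzzy-matches input_text, or None if no match clears the threshold."""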
    # Score every valid name against the input text (compute each ratio once)
    potential_matches = []
    for name in valid_names:
        score = fuzz.partial_ratio(name.lower(), input_text.lower())
        if score >= threshold:
            potential_matches.append((name, score))
    # Return the highest-scoring match, if any cleared the threshold
    if potential_matches:
        return max(potential_matches, key=lambda x: x[1])[0]
    return None
def extract_vote_from_response(response, valid_names):
"""
Extract a vote from an AI response using the same fuzzy matching as for user votes.
Returns the matched name or None if no valid match is found.
"""
# Common vote patterns in responses
patterns = [
r"I think (?:the human is|it is) ([A-Za-z\s]+)", # English pattern
r"Ich denke (?:der Mensch ist|es ist) ([A-Za-z\s]+)", # German pattern
r"vote for ([A-Za-z\s]+)",
r"([A-Za-z\s]+) must be the human",
r"<([^>]+)/>", # Bracket format
]
# Try each pattern
for pattern in patterns:
match = re.search(pattern, response, re.IGNORECASE)
if match:
vote = match.group(1).strip()
# Use the same fuzzy matching as used for user votes
matched_name = match_name(vote, valid_names)
if matched_name:
return matched_name
# If no pattern matches, try fuzzy matching on the whole response
# This is a fallback for cases where the AI response doesn't follow expected patterns
for name in valid_names:
if name.lower() in response.lower():
return name
return None
# Load the Whisper model for speech-to-text, and set up GPU/CPU usage
whisper_model = whisper.load_model("base")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") # Move model to GPU if available, otherwise use CPU
print(f"Is CUDA available? {torch.cuda.is_available()}")
print(f"Using device: {device}")
whisper_model = whisper_model.to(device) # Assign the model to the appropriate device
def speak(text, character, speed=1):
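    """Synthesize text with the TTS voice assigned to the given character and play it, optionally sped up."""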
if tts_model is None or thorsten_model is None or jenny_model is None or male_model is None:
print("Error: One or more TTS models not initialized.")
return
filtered_text = re.sub(r"<user_data>.*?</user_data>", "", text).strip()
output_file = "temp_tts.wav"
try:
if character == "Einstein":
thorsten_model.tts_to_file(text=filtered_text, file_path=output_file)
elif character == "Genghis Khan":
male_model.tts_to_file(text=filtered_text, file_path=output_file)
elif character == "Cleopatra":
jenny_model.tts_to_file(text=filtered_text, file_path=output_file)
else:
tts_model.tts_to_file(text=filtered_text, file_path=output_file)
audio = AudioSegment.from_wav(output_file)
if speed != 1.0:
audio = speedup(audio, playback_speed=speed, chunk_size=50, crossfade=25)
play(audio)
except Exception as e:
print(f"Error in text-to-speech: {e}")
finally:
if os.path.exists(output_file):
os.remove(output_file)
def get_response(history):
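    """Send the chat history to the local LLM and return its cleaned response."""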
completion = client.chat.completions.create(
model=modelID,
messages=history,
temperature=0.7,
max_tokens=350,
stream=False,
)
response = completion.choices[0].message.content.strip()
cleaned_response = clean_text(response)
return cleaned_response
def explain_rules():
    gm_history.append({"role": "user", "content": "Briefly explain the rules in one sentence: the human in the group needs to be found; the AI overlords can stay."})
response = get_response(gm_history)
speak(response, "Game Master") # Use TTS to speak the rules
return response
def is_speech(data):
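    """Return True if the 20 ms PCM frame contains speech; webrtcvad expects 16-bit mono audio at 16 kHz."""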
try:
return vad.is_speech(data, 16000) # Check for speech in 16kHz audio frame
except Exception as e:
print(f"Error in speech detection: {e}")
return False # If there's an error, assume no speech is detected
def record_audio(valid_names, votes, validate_name=True):
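    """Capture microphone audio until VAD detects that an utterance has ended, then transcribe it.

    With validate_name=True the transcription is fuzzy-matched against valid_names
    and counted as a vote; otherwise it is stored as the user's introduction.
    """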
print("Starting continuous audio recording...")
mic = pyaudio.PyAudio() # Initialize PyAudio for capturing audio
stream = mic.open(format=pyaudio.paInt16, channels=1, rate=16000, input=True, frames_per_buffer=4096)
frames = [] # List to hold recorded audio frames
silence_counter = 0 # Track how long silence has been detected
speech_counter = 0 # Track how long speech has been detected
silence_threshold = 100 # Number of silent chunks to trigger the end of recording (increased for less sensitivity)
speech_threshold = 10 # Number of speech chunks to confirm speech has started (increased for less sensitivity)
try:
while True: # Continuous loop to capture audio
if user_introduced.is_set():
break # Stop recording if the user's introduction has been recorded
try:
data = stream.read(320, exception_on_overflow=False) # Read 20ms of audio (320 samples at 16kHz)
except IOError as e:
print(f"Input overflow: {e}") # Handle potential input errors
continue
# Check if the audio frame contains speech
if is_speech(data):
speech_counter += 1 # Increment speech counter
silence_counter = 0 # Reset silence counter
if speech_counter >= speech_threshold and not recording.is_set(): # If speech is confirmed
print("Speech detected, starting recording...")
recording.set() # Set the recording flag to true
frames = [] # Clear previous audio frames
frames.append(data) # Store the recorded audio
else:
silence_counter += 1 # Increment silence counter if no speech
speech_counter = 0 # Reset speech counter
if silence_counter >= silence_threshold and recording.is_set(): # If silence is confirmed
print("Speech ended, processing...")
audio_data = b''.join(frames) # Combine audio frames into a single audio blob
user_input = transcribe_audio(audio_data) # Transcribe the audio data
if validate_name:
matched_name = match_name(user_input, valid_names)
if matched_name:
player_histories[user_role].append({"role": "user", "content": user_input})
print(f"{user_role} (YOU): {user_input}")
votes[matched_name] += 1 # Ensure the vote is counted
user_introduced.set() # Set the flag to indicate the user's introduction has been recorded
break # Break out of the loop if the correct name is detected
else:
print("Invalid input, please try again.")
else:
player_histories[user_role].append({"role": "user", "content": user_input})
print(f"{user_role} (YOU): {user_input}")
user_introduced.set() # Set the flag to indicate the user's introduction has been recorded
break # Break out of the loop after recording the introduction
recording.clear() # Clear recording flag
silence_counter = 0 # Reset counters
speech_counter = 0
except Exception as e:
print(f"Error in audio recording: {e}")
finally:
stream.stop_stream() # Stop the audio stream on exit
stream.close()
mic.terminate() # Terminate PyAudio instance
def transcribe_audio(audio_data):
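    """Write the raw PCM bytes to a temporary WAV file and transcribe it with Whisper."""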
audio_path = "temp_audio.wav"
with wave.open(audio_path, 'wb') as wf:
wf.setnchannels(1)
wf.setsampwidth(pyaudio.PyAudio().get_sample_size(pyaudio.paInt16))
wf.setframerate(16000)
wf.writeframes(audio_data)
audio = whisper.load_audio(audio_path)
audio = whisper.pad_or_trim(audio)
mel = whisper.log_mel_spectrogram(audio).to(device)
# Set decoding options to explicitly use English
    options = whisper.DecodingOptions(fp16=torch.cuda.is_available(), language="en")  # fp16 decoding is only supported on GPU
result = whisper.decode(whisper_model, mel, options)
transcribed_text = result.text.strip()
print(f"Transcribed text: {transcribed_text}")
return transcribed_text
def clean_text(text):
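    """Remove symbols that trip up the TTS models, keeping letters, digits, whitespace, and common punctuation."""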
# Define a regular expression pattern to match unwanted symbols
pattern = r'[^\w\s,.!?-]' # This pattern keeps letters, numbers, whitespace, and common punctuation
cleaned_text = re.sub(pattern, '', text)
return cleaned_text
def process_audio(audio_data):
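    """Transcribe a finished utterance and record it in the user's player history."""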
user_input = transcribe_audio(audio_data)
if user_input: # Check if transcription is not empty
player_histories[user_role].append({"role": "user", "content": user_input})
print(f"{user_role} (YOU): {user_input}")
user_introduced.set() # Set the flag to indicate the user's introduction has been recorded
else:
print("Transcription failed or was empty.")
def main():
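    """Run one full game: assign roles, collect introductions, hold the voting round, and announce the result."""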
# print("Game Master:", explain_rules())
roles = ["Einstein", "Genghis Khan", "Cleopatra"]
random.shuffle(roles)
global user_role
user_role = random.choice(roles)
roles.remove(user_role)
roles.append(user_role)
print(f"\nYou are playing the role of {user_role}.")
valid_names = [role for role in roles if role != user_role]
print("\nFirst Round: Introductions")
for role in roles:
if role == user_role:
record_audio(valid_names, votes={}, validate_name=False) # No name validation in the first round
else:
player_histories[role].append(
{"role": "user", "content": "Introduce yourself in character. In 1 sentence: "})
response = get_response(player_histories[role])
player_histories[role].append({"role": "assistant", "content": response})
print(f"{role} (AI): {response}")
speak(response, role) # TTS for AI responses
# Reset the flag to allow user input in the second round
user_introduced.clear()
print("\nSecond Round: Voting")
votes = {role: 0 for role in roles}
introductions = "\n".join([f"{role}: {player_histories[role][-1]['content']}" for role in roles])
print(f"Introductions: {introductions}")
for role in roles:
if role == user_role:
print(f"\nYour turn to vote, {role}! Please say the name of who you think is the human player.")
record_audio(valid_names, votes)
else:
if role == "Einstein":
voting_prompt = f"""Du bist {role}. Hier sind die Einführungen: {introductions}
WICHTIG: Du MUSST deine Antwort in diesem Format geben:
'Ich denke der Mensch ist [Name]' - zum Beispiel 'Ich denke der Mensch ist Cleopatra'
Danach erkläre kurz warum. Stimme nicht für dich selbst ab!"""
else:
voting_prompt = f"""You are {role}. Here are the introductions: {introductions}
IMPORTANT: You MUST give your answer in this format:
'I think the human is [Name]' - for example 'I think the human is Cleopatra'
Then briefly explain why. Do not vote for yourself!"""
player_histories[role].append({"role": "user", "content": voting_prompt})
response = get_response(player_histories[role])
print(f"\n{role} (AI) says: {response}")
speak(response, role)
# Process AI vote using the same fuzzy matching logic as user votes
voted_for = extract_vote_from_response(response, [r for r in roles if r != role])
if voted_for:
votes[voted_for] += 1
print(f"Vote recorded: {role} voted for {voted_for}")
else:
print(f"Warning: Could not detect a valid vote from {role}")
# Print final vote tally
print("\nFinal Vote Count:")
for role, vote_count in votes.items():
print(f"{role}: {vote_count} votes")
    max_votes = max(votes.values())
    top_voted = [role for role, count in votes.items() if count == max_votes]
    # Declare a loser only when a single player uniquely holds the most votes; a tie leaves most_voted empty
    most_voted = top_voted if len(top_voted) == 1 else []
human_identified = user_role in most_voted
gm_history.append({"role": "user",
"content": f"The votes are: {votes}. The human player was {user_role}. The player(s) with the most votes: {most_voted} looses the game because he is called the not so welcomed human. Announce the result and declare if the AI players successfully identified the human: {'Yes they did' if human_identified else 'No they did not'}."})
print(f"The votes are: {votes}")
print("\nGame Master:", get_response(gm_history))
speak(get_response(gm_history), "Game Master") # TTS for the final game result
if __name__ == "__main__":
main()