-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathurl2speech.py
124 lines (99 loc) · 3.95 KB
/
url2speech.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import subprocess
import sys
import tempfile
from io import BytesIO

import replicate
import requests
from PIL import Image
# Placeholder Replicate credential. It is applied only when the caller has not
# already exported REPLICATE_API_TOKEN, so a real token supplied through the
# environment is never overwritten by this hard-coded value.
os.environ.setdefault('REPLICATE_API_TOKEN', 'abc')
def download_image(image_url):
    """Download an image, cap its width at 1028 px, and save it as ``out.png``.

    Args:
        image_url: URL of the image, fetched with an HTTP GET request.

    Returns:
        The path of the saved PNG (``"out.png"``), or ``None`` when the
        download, decoding, resizing, or saving step fails. The failure
        reason is printed instead of being raised.
    """
    max_width = 1028
    image_path = "out.png"
    # Modes the PNG writer accepts; anything else is converted before saving.
    png_modes = ("1", "L", "LA", "I", "I;16", "P", "RGB", "RGBA")

    try:
        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(image_url, timeout=30)
        if response.status_code != 200:
            raise Exception("Failed to download the image")

        image = Image.open(BytesIO(response.content))

        if image.width > max_width:
            # Scale the height by the same factor to keep the aspect ratio.
            # Clamp to 1 so an extremely wide, short image never gets a
            # zero-pixel height.
            new_height = max(1, int((max_width / image.width) * image.height))
            image = image.resize((max_width, new_height), Image.LANCZOS)

        # Sources such as CMYK JPEGs cannot be written as PNG directly.
        if image.mode not in png_modes:
            image = image.convert("RGB")

        image.save(image_path, format='PNG')
        return image_path
    except Exception as e:
        # Deliberate catch-all: callers rely on a None return, not an
        # exception, to detect failure.
        print(f"An error occurred: {e}")
        return None
def process_image(
    image_path,
    *,
    prompt="Describe this image",
    text_language="Norwegian Nynorsk",
    speech_language="Turkish",
):
    """Describe an image with BakLLaVA, then turn the description into speech.

    Args:
        image_path: Path of the image file to describe.
        prompt: Instruction sent to the image-description model.
        text_language: Value passed as ``target_language_text_only`` to the
            Seamless Communication model.
        speech_language: Value passed as ``target_language_with_speech`` to
            the Seamless Communication model.

    Returns:
        Whatever ``replicate.run`` returns for the Seamless Communication
        model.
    """
    # The file only needs to stay open for the model call that uploads it.
    with open(image_path, "rb") as image:
        description = replicate.run(
            "lucataco/bakllava:452b2fa0b66d8acdf40e05a7f0af948f9c6065f6da5af22fce4cead99a26ff3d",
            input={
                "image": image,
                "prompt": prompt,
                "max_sequence": 512
            }
        )

    # NOTE(review): the description may arrive as chunks (list/iterator of
    # strings) rather than one string, depending on the model and client
    # version - confirm. Joining is a no-op concern for plain strings, which
    # are passed through unchanged.
    if not isinstance(description, str):
        description = "".join(str(chunk) for chunk in description)

    # Translate the English description and synthesize speech from it.
    output = replicate.run(
        "cjwbw/seamless_communication:668a4fec05a887143e5fe8d45df25ec4c794dd43169b9a11562309b2d45873b0",
        input={
            "task_name": "T2ST (Text to Speech translation)",
            "input_text": description,
            "input_text_language": "English",
            "max_input_audio_length": 60,
            "target_language_text_only": text_language,
            "target_language_with_speech": speech_language
        }
    )
    return output
def download_and_play_audio(json_output):
    """Download the audio referenced by *json_output* and play it with ``aplay``.

    Args:
        json_output: Mapping whose ``'audio_output'`` entry holds the URL of
            the audio file to play.

    Returns:
        None. Any failure (missing URL, failed download, playback error) is
        printed rather than raised.
    """
    audio_file_path = None
    try:
        audio_url = json_output.get('audio_output')
        if not audio_url:
            raise ValueError("Audio URL not found in the JSON output")

        # A timeout keeps an unresponsive server from hanging the script.
        response = requests.get(audio_url, timeout=60)
        if response.status_code != 200:
            raise Exception("Failed to download the audio file")

        # Use a uniquely named temp file instead of a fixed, predictable
        # path. delete=False because aplay must reopen the file by name
        # after it has been closed here.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as audio_file:
            audio_file_path = audio_file.name
            audio_file.write(response.content)

        result = subprocess.run(["aplay", audio_file_path])
        if result.returncode != 0:
            raise Exception(f"aplay exited with status {result.returncode}")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        # Remove the temp file even when playback failed or aplay is missing.
        if audio_file_path is not None and os.path.exists(audio_file_path):
            os.remove(audio_file_path)
def main():
    """Command-line entry point: describe the image at a URL and speak it.

    Expects exactly one command-line argument, the image URL. The image is
    downloaded, passed through the description/speech models, and the
    resulting audio is played.

    Exits with status 1 on a usage error or when a pipeline step fails, so
    calling shells and scripts can detect the failure.
    """
    if len(sys.argv) != 2:
        print("Usage: python script_name.py <image_url>")
        sys.exit(1)

    image_url = sys.argv[1]
    try:
        print("Downloading the image...")
        image_path = download_image(image_url)
        if not image_path:
            raise Exception("Failed to download or process the image")

        print("Processing the image...")
        audio_output = process_image(image_path)
        if not audio_output:
            raise Exception("Failed to process the image or get the audio output")

        print("Playing the audio...")
        download_and_play_audio(audio_output)
    except Exception as e:
        print(f"An error occurred in the main function: {e}")
        # Report the failure through the exit status instead of exiting 0.
        sys.exit(1)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()