-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtranslate_lecture.py
184 lines (159 loc) · 5.55 KB
/
translate_lecture.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
"""This module is used to translate the videos."""
import argparse
import logging
from pathlib import Path
from typing import Optional

from rtpt import RTPT

from src.silence import Silence
from src.speaker import SegmentsSpeaker
from src.whisper_wrapper import Transcriber
from utils import file_handler
from utils.path_handler import (
    AUDIO_DIRECTORY,
    AUDIO_TRANSLATED_SPEED_DIRECTORY,
    ORIGINAL_VIDEO_DIRECTORY,
    SUBTITLES_DIRECTORY,
    VIDEO_DEST_DIRECTORY,
    VIDEO_DIRECTORY,
    VIDEO_SUBTITLES_DIRECTORY,
    create_folders,
)
def main(
    max_segment_duration: Optional[int],
    video_directory: Path = ORIGINAL_VIDEO_DIRECTORY,
    use_rtpt: bool = True,
    use_cuda: bool = True,
    no_cache: bool = False,
):
    """Translate all lecture videos found in *video_directory*.

    For each video the pipeline:
      - splits the video into an audio file and a video file without audio
      - translates and transcribes the audio file
      - generates a subtitle (.srt) file
      - synthesizes the translated audio
      - merges the synthesized audio with the silent video
      - embeds the subtitles in the merged video

    Args:
        max_segment_duration: Maximum duration (in seconds) of a single
            segment passed to the TTS step, or ``None`` for no limit.
        video_directory: Directory containing the original videos.
        use_rtpt: Report progress / remaining time via RTPT.
        use_cuda: Use the GPU for the text-to-speech synthesizer.
        no_cache: Ignore previously stored transcription/translation results.
    """
    logging.info(
        f"Starting the translation process for all videos in {video_directory}."
    )
    rtpt = None
    if use_rtpt:
        logging.debug("Initializing and starting the RTPT process.")
        rtpt = RTPT(
            name_initials="DP",
            experiment_name="Translating:IntroAI",
            max_iterations=len(list(video_directory.iterdir())),
        )
        rtpt.start()
    create_folders()
    # Created lazily so the (large) Whisper model is loaded at most once,
    # instead of once per video as before, and not at all when every video
    # has already been processed.
    transcriber = None
    for original_video in video_directory.iterdir():
        lecture_name = original_video.stem
        # Skip videos whose final output already exists in VIDEO_DEST_DIRECTORY.
        if (VIDEO_DEST_DIRECTORY / f"{lecture_name}.mp4").exists():
            logging.warning(
                f"{lecture_name}: Skipped, since a video with the same name "
                f"exists at {VIDEO_DEST_DIRECTORY}."
            )
            if rtpt is not None:
                rtpt.step()
            continue
        logging.info(lecture_name)
        file_handler.split_video(str(original_video))
        if transcriber is None:
            transcriber = Transcriber(model="large", fp16_settings=True)
        # If the audio file has already been transcribed, this method uses
        # the stored results (unless no_cache is set).
        result = transcriber.transcribe_and_translate(
            str(AUDIO_DIRECTORY / f"{lecture_name}.wav"), no_cache=no_cache
        )
        # Write the subtitle file.
        Transcriber.write_srt(
            result=result, output_dir=str(SUBTITLES_DIRECTORY / f"{lecture_name}.srt")
        )
        # Prepare the results for TTS: pad the segments with silence so the
        # synthesized audio keeps the original timing.
        segments = Silence.add_silence_segments_pydub_whisper(
            result["segments"],
            AUDIO_DIRECTORY / f"{lecture_name}.wav",
            max_duration=max_segment_duration,
        )
        # Synthesize the translated segments.
        speaker = SegmentsSpeaker(lecture_name=lecture_name, segments=segments)
        speaker.speak(use_gpu=use_cuda)
        # Merge the synthesized audio with the silent video.
        file_handler.merge_audio_and_video_to_mp4(
            video_file=str(VIDEO_DIRECTORY / f"{lecture_name}.mp4"),
            audio_file=str(AUDIO_TRANSLATED_SPEED_DIRECTORY / f"{lecture_name}.wav"),
            output_path=str(VIDEO_DEST_DIRECTORY / f"{lecture_name}.mp4"),
        )
        # Embed the subtitles in the merged video.
        file_handler.embed_subtitles_in_mp4(
            video_file=VIDEO_DEST_DIRECTORY / f"{lecture_name}.mp4",
            subtitles_file=SUBTITLES_DIRECTORY / f"{lecture_name}.srt",
            output_path=str(VIDEO_SUBTITLES_DIRECTORY / f"{lecture_name}.mp4"),
            language="eng",
        )
        logging.info(f"{lecture_name}: Finished.")
        if rtpt is not None:
            rtpt.step()
    logging.info(f"Finished processing all videos in {video_directory}.")
if __name__ == "__main__":
    # CLI entry point: parse flags and hand off to main().
    parser = argparse.ArgumentParser(description="Translate lecture videos.")
    parser.add_argument(
        "-v",
        "--verbose",
        help="increase output verbosity to logging level INFO",
        action="store_true",
    )
    parser.add_argument(
        "-disable_rtpt",
        "--disable_rtpt",
        help="disable RTPT (which shows the remaining time of the process)",
        action="store_true",
    )
    parser.add_argument(
        "-disable_cuda",
        "--disable_cuda",
        help="disable CUDA for the text-to-speech synthesizer",
        action="store_true",
    )
    parser.add_argument(
        "-max_segment_duration",
        "--max_segment_duration",
        # type=int so the value reaches main() as an integer, not a string.
        type=int,
        default=30,
        help="specify the maximum duration of the segments in seconds (default: 30)",
    )
    parser.add_argument(
        "-disable_max_duration",
        "--disable_max_duration",
        help="disable the maximum duration for segments",
        action="store_true",
    )
    parser.add_argument(
        "-no_cache",
        "--no_cache",
        help="disable the use of stored translation results",
        action="store_true",
    )
    args = parser.parse_args()

    if args.verbose:
        logging.basicConfig(level=logging.INFO)

    # --disable_max_duration overrides any explicit --max_segment_duration.
    max_segment_duration = (
        None if args.disable_max_duration else args.max_segment_duration
    )

    main(
        max_segment_duration=max_segment_duration,
        use_rtpt=not args.disable_rtpt,
        use_cuda=not args.disable_cuda,
        no_cache=args.no_cache,
    )