-
Notifications
You must be signed in to change notification settings - Fork 1
/
split_media_by_srt.py
301 lines (235 loc) · 8.85 KB
/
split_media_by_srt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
#!/usr/bin/env python3
import os
import subprocess
import re
import argparse
from pathvalidate import sanitize_filename
from openpyxl import Workbook
from datetime import datetime, timedelta
def is_colab():
    """Report whether the ``google.colab`` package can be imported.

    Returns:
    bool: True when the import succeeds, False when it raises ImportError.
    """
    try:
        import google.colab  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True
# Media directory used when the script runs inside Google Colab
# (presumably a mounted Google Drive folder -- confirm against the notebook setup).
COLAB_MEDIA_DIR = "/content/drive/My Drive/trans/subs"
# Constants
# Folder that holds the input video and its .srt file: the Colab directory when
# is_colab() is true, otherwise a local downloads folder.
MEDIA_FOLDER = COLAB_MEDIA_DIR if is_colab() else "./downloads/subs"
# strptime/strftime pattern shared by parse_srt_timestamp and timestamp_string.
TIME_STAMP_FORMAT = "%H:%M:%S.%f"
# Only referenced from commented-out code in split_video; no live code reads it.
OFFSET_LEFT_MS = 700
# Milliseconds added to every subtitle's end time before cutting.
OFFSET_RIGHT_MS = 100
# MEDIA_FOLDER = "downloads/subs"
# Not referenced by any code visible in this file.
OUT_MEDIA = "videos-out"
# When True, split_video runs ffmpeg for each subtitle entry.
SPLIT_VIDEOS = True
# Not referenced by any code visible in this file.
TRIM_VIDEOS = False
# A progress line is printed whenever a subtitle index is a multiple of this.
SPLIT_REPORT = 50
# When DEBUG_NOW is True, split_video stops after MAX_DEBUG subtitle entries.
DEBUG_NOW = True
MAX_DEBUG = 2000000
# Maximum number of characters kept from the sanitized subtitle text in a clip name.
MAX_FILENAME_LENGTH = 100
def parse_time(time_str):
    """
    Convert an ``HH:MM:SS[.fraction]`` string into a ``timedelta``.

    The fractional part is optional and may be introduced by either a dot or
    the comma used in .srt files. It is interpreted as a decimal fraction of a
    second, so ``.5`` means 500 ms and ``.500`` means the same.

    Parameters:
    time_str (str): Time text such as "01:02:03.500" or "00:00:10,250".

    Returns:
    timedelta: The duration described by the string.

    Raises:
    ValueError: If the text does not have three colon-separated fields or a
        field is not numeric.
    """
    hours, minutes, seconds = time_str.strip().split(":")
    # Accept the SRT comma as well as the dot; partition() also copes with a
    # value that has no fractional part at all.
    whole_seconds, _, fraction = seconds.replace(",", ".").partition(".")
    # Scale by digit count: pad/truncate to six digits so the fraction is
    # always expressed in microseconds ("5" -> 500000, "500" -> 500000).
    microseconds = int(fraction[:6].ljust(6, "0")) if fraction else 0
    return timedelta(
        hours=int(hours),
        minutes=int(minutes),
        seconds=int(whole_seconds),
        microseconds=microseconds,
    )
def split_video(video_file, srt_file, report_data):
    """
    Split a video into one clip per subtitle entry of an .srt file.

    Clips are written to a directory named after the video file (same location,
    extension removed), which is created when missing. Each clip is named
    "<index>_<sanitized subtitle text><video extension>". ffmpeg is only run
    when SPLIT_VIDEOS is true; a report row is appended for every entry whose
    clip did not fail.

    Parameters:
    video_file (str): Path to the video file.
    srt_file (str): Path to the corresponding .srt subtitle file.
    report_data (list): List that receives one row per entry:
        [video_file, index, chinese_text, english_text, output_file,
        length_in_seconds, "", ""].
    """
    # "utf-8-sig" reads plain UTF-8 as well, but drops a leading byte-order
    # mark. With plain "utf-8" the mark stays glued to the first cue number,
    # the ^\d+$ anchor fails, and the first subtitle is silently skipped.
    with open(srt_file, "r", encoding="utf-8-sig") as f:
        text = f.read()

    path, filename = os.path.split(video_file)
    filebase, fileext = os.path.splitext(filename)
    new_dir = os.path.join(path, filebase)
    os.makedirs(new_dir, exist_ok=True)

    # Groups: (1) cue number, (2) "start --> end" timing line, (3) every
    # following line up to the next line made only of digits.
    regex = r"(?m)^(\d+)$\n^(\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3})$\n((?:(?!^\d+$).*\n?)+)"

    # NOTE(review): as written the first two replacements map each parenthesis
    # to itself; presumably they were meant to normalize full-width
    # parentheses -- confirm against the original file. Only the " )" -> ")"
    # replacement changes the text here.
    text = text.replace("(", "(").replace(")", ")").replace(" )", ")")
    matches = re.finditer(regex, text)

    for num, match in enumerate(matches, start=1):
        # Optional cap on the number of processed entries while debugging.
        if DEBUG_NOW and num > MAX_DEBUG:
            break

        index = int(match.group(1))
        if index % SPLIT_REPORT == 0:
            print(f"Processed {index} records")

        timestamp = match.group(2).strip()
        text_content = match.group(3).strip()

        start, end = parse_srt_timestamp(timestamp)
        # Extend the clip slightly past the subtitle's end time.
        end += timedelta(milliseconds=OFFSET_RIGHT_MS)

        # Subtitle lines are read positionally: Chinese, pinyin, English.
        # Missing lines become empty strings.
        items = parse_text(text_content)
        chinese_text, pinyin_text, english_text = (
            items[0],
            items[1] if len(items) > 1 else "",
            items[2] if len(items) > 2 else "",
        )

        # Build a filesystem-safe name; spaces are replaced with hyphens.
        sanitized_text = sanitize_filename(
            f"{chinese_text}_{pinyin_text}_{english_text}".replace(" )", ")").replace(
                " ", "-"
            )
        )
        output_file = f"{index:03d}_{sanitized_text[:MAX_FILENAME_LENGTH]}{fileext}"
        output_file = os.path.join(new_dir, output_file)

        if SPLIT_VIDEOS:
            # Stream-copy (no re-encode) the [start, end] range into the clip.
            cmd = [
                "ffmpeg",
                "-loglevel",
                "error",
                "-y",
                "-i",
                video_file,
                "-ss",
                timestamp_string(start),
                "-to",
                timestamp_string(end),
                "-c",
                "copy",
                output_file,
            ]
            try:
                subprocess.run(
                    cmd,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.PIPE,
                    check=True,
                )
            except subprocess.CalledProcessError as e:
                # Report the failure and move on; failed clips get no report row.
                print(
                    f"Error splitting video segment {index}: {e.stderr.decode('utf-8').strip()} with is command"
                )
                print(f"\n{' '.join(cmd)}")
                continue

        # Clip duration in seconds, including the right-hand offset.
        length = (end - start).total_seconds()

        report_data.append(
            [
                video_file,
                index,
                chinese_text,
                english_text,
                output_file,
                length,
                "",
                "",
            ]
        )
def parse_srt_timestamp(srt_line):
    """
    Turn an .srt timing line into a pair of datetime objects.

    Parameters:
    srt_line (str): Timing line of a subtitle entry, e.g.
        "00:00:10,000 --> 00:00:15,000".

    Returns:
    tuple: (start, end) as datetime values parsed with TIME_STAMP_FORMAT.
    """
    begin_text, finish_text = srt_line.strip().split(" --> ")
    # .srt separates milliseconds with a comma; TIME_STAMP_FORMAT expects a dot.
    begin, finish = (
        datetime.strptime(stamp.replace(",", "."), TIME_STAMP_FORMAT)
        for stamp in (begin_text, finish_text)
    )
    return begin, finish
def timestamp_string(time_value):
    """Format *time_value* with TIME_STAMP_FORMAT and return the resulting text."""
    formatted = time_value.strftime(TIME_STAMP_FORMAT)
    return formatted
def parse_text(text):
    """
    Break subtitle text into its individual lines.

    Parameters:
    text (str): Text of one subtitle entry; lines are separated by "\\n".

    Returns:
    list: One entry per line, each with surrounding whitespace removed.
    """
    raw_lines = text.split("\n")
    return list(map(str.strip, raw_lines))
def generate_report(report_data, output_filename="video_split_report.xlsx"):
    """
    Write the collected rows to an Excel workbook.

    The workbook has a single sheet titled "Video Report" with a header row
    followed by one row per item of report_data.

    Parameters:
    report_data (list): Rows to write, each in the header's column order.
    output_filename (str): Path of the Excel file to create.
    """
    column_titles = [
        "Input Filename",
        "Index",
        "Chinese Text",
        "English Text",
        "Filename",
        "Length",
        "Vietnamese from English",
        "Vietnamese from Chinese",
    ]

    book = Workbook()
    worksheet = book.active
    worksheet.title = "Video Report"

    # Header first, then the data rows in the order they were collected.
    worksheet.append(column_titles)
    for record in report_data:
        worksheet.append(record)

    book.save(output_filename)
    print(f"Report generated: {output_filename}")
def main():
    """
    Command-line entry point.

    Takes the name of a video inside MEDIA_FOLDER, renames the video and its
    .srt file to a sanitized name (spaces replaced by underscores), splits the
    video with split_video and writes an .xlsx report next to it.

    Raises:
    FileNotFoundError: If the video exists under neither its original nor its
        sanitized name.
    """
    import shutil

    parser = argparse.ArgumentParser(
        description="Split and trim a video based on SRT subtitles."
    )
    parser.add_argument(
        "input_file",
        nargs="?",
        default="吃饭_|_Chinese_Daily_Dialogues_|_Upper_Beginner_|_Chinese_Listening_Practice_HSK_3⧸4.webm",
        help="Input video file name inside the media folder (default: %(default)s)",
    )
    args = parser.parse_args()

    file_name = args.input_file
    original_video = os.path.join(MEDIA_FOLDER, file_name)
    input_file = os.path.join(
        MEDIA_FOLDER, sanitize_filename(file_name).replace(" ", "_")
    )
    # Rename only when there is something to rename; this keeps a second run
    # (after the file already carries its sanitized name) from failing.
    if original_video != input_file and os.path.exists(original_video):
        shutil.move(original_video, input_file)
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input video not found: {original_video}")

    original_srt = os.path.join(
        MEDIA_FOLDER, os.path.splitext(file_name)[0] + ".srt"
    )
    filepathbase = os.path.splitext(input_file)[0]
    srt_file = filepathbase + ".srt"
    # A missing subtitle file is reported below instead of raising from
    # shutil.move, so the "no .srt" branch can actually be reached.
    if original_srt != srt_file and os.path.exists(original_srt):
        shutil.move(original_srt, srt_file)
    print(srt_file)

    report_data = []
    if os.path.exists(srt_file):
        print(f"Splitting {input_file} using {srt_file}")
        split_video(input_file, srt_file, report_data)
        # Generate report
        report_file = filepathbase + ".xlsx"
        generate_report(report_data, report_file)
    else:
        print(f"No corresponding .srt file found for {input_file}")
    print("All videos processed!")
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()