-
Notifications
You must be signed in to change notification settings - Fork 192
/
Copy pathtranslate_readme.py
152 lines (130 loc) · 5.71 KB
/
translate_readme.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import os
import json
import re
import markdown
from bs4 import BeautifulSoup
from googletrans import Translator
import logging
# Configure root logging: INFO level, timestamped "time - level - message" lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
def load_translation_cache(cache_file):
    """Load the translation cache from a JSON file.

    The cache is only an optimisation, so an unusable cache file must never
    abort a translation run: it is reported and replaced by an empty cache.

    Args:
        cache_file: Path of the JSON cache file.

    Returns:
        The cached mapping as a dict. An empty dict is returned when the file
        does not exist, is not valid UTF-8 JSON, or its top-level value is
        not a JSON object.
    """
    if not os.path.exists(cache_file):
        return {}

    try:
        with open(cache_file, 'r', encoding='utf-8') as f:
            cached = json.load(f)
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
        # subclasses; either one means the file is truncated or corrupted.
        logging.warning(f"缓存文件 '{cache_file}' 无法解析,将使用空缓存: {e}")
        return {}

    if not isinstance(cached, dict):
        # Callers look keys up and assign new entries by key, which only
        # works on a mapping.
        logging.warning(f"缓存文件 '{cache_file}' 的内容不是 JSON 对象,将使用空缓存。")
        return {}
    return cached
def save_translation_cache(cache_file, translations):
    """Write ``translations`` to ``cache_file`` as UTF-8 encoded JSON.

    The file is overwritten. Non-ASCII characters are stored verbatim
    (not ``\\u`` escaped) and the output is indented by four spaces.
    """
    with open(cache_file, mode='w', encoding='utf-8') as cache_fp:
        json.dump(
            translations,
            cache_fp,
            indent=4,
            ensure_ascii=False,
        )
def get_cache_key(text, target_lang):
    """Build the cache key for a text: ``"<target_lang>::<text>"``."""
    return "{}::{}".format(target_lang, text)
def translate_text(text, target_lang, translator, translations):
    """Translate ``text`` into ``target_lang``, consulting the cache first.

    Args:
        text: Source text to translate.
        target_lang: Language code passed to the translator as ``dest``.
        translator: Object exposing ``translate(text, dest=...)`` whose
            result has a ``text`` attribute.
        translations: Cache dict; updated in place on a successful lookup
            against the translator.

    Returns:
        The cached or freshly fetched translation. If anything fails while
        translating, the failure is logged and the original ``text`` is
        returned unchanged (and nothing is cached).
    """
    key = get_cache_key(text, target_lang)

    # Cache hit: no call to the translator is needed.
    if key in translations:
        cached = translations[key]
        logging.info(f"从缓存中获取翻译: '{text}' -> '{cached}'")
        return cached

    try:
        translated = translator.translate(text, dest=target_lang).text
        translations[key] = translated
        logging.info(f"翻译成功: '{text}' -> '{translated}'")
    except Exception as exc:
        # Best effort: keep the untranslated text rather than abort the run.
        logging.error(f"翻译失败: {text}\n错误: {exc}")
        return text
    return translated
def should_translate(segment):
    """Return whether a text segment is eligible for translation.

    A segment is rejected (``False``) and logged when it contains the
    language-switch link ``[简体中文](README.md)`` or anything shaped like a
    Markdown link ``[...](...)``; every other segment yields ``True``.
    """
    skip_patterns = (
        r'\[简体中文\]\(README\.md\)',  # the language-switch link itself
        r'\[.*\]\(.*\)',  # any Markdown link
    )
    has_skip_match = any(
        re.search(pattern, segment) is not None for pattern in skip_patterns
    )
    if has_skip_match:
        logging.info(f"跳过翻译的段落: {segment}")
        return False
    return True
def extract_text_segments(content):
    """Extract the non-code text segments of a Markdown document.

    The Markdown is rendered to HTML, every ``<code>`` and ``<pre>`` element
    is removed, and the remaining plain text is split on blank lines
    (``"\\n\\n"``). Segments are stripped and empty ones are discarded.

    Returns:
        A list of non-empty, stripped text segments in document order.
    """
    soup = BeautifulSoup(markdown.markdown(content), 'html.parser')

    # Drop code (inline and block) so it is never sent for translation.
    for code_tag in soup.find_all(['code', 'pre']):
        code_tag.extract()

    segments = []
    for chunk in soup.get_text().split('\n\n'):
        cleaned = chunk.strip()
        if cleaned:
            segments.append(cleaned)

    logging.info(f"提取到 {len(segments)} 个文本段落进行翻译。")
    return segments
def replace_translation(original_content, translated_segments):
    """Rebuild the document from translated text and the original code fences.

    The original content is split at fenced code blocks (```` ``` ... ``` ````).
    Each non-blank stretch of text between fences is replaced by the next
    entry of ``translated_segments``; the fenced blocks themselves are copied
    through unchanged. Pieces are separated by a blank line.

    Blank stretches (the document starts with a fence, or two fences are
    adjacent) do not consume a translated segment. Segment extraction
    discards empty segments, so consuming one for a blank stretch would shift
    every later translation into the wrong position and lose the last ones.

    Args:
        original_content: The untranslated Markdown source.
        translated_segments: Translated text, one entry per non-blank text
            stretch, in document order.

    Returns:
        The recombined document, stripped of leading/trailing whitespace.
        If ``translated_segments`` runs out early, the remaining text
        stretches are emitted as empty strings.
    """
    fence = re.compile(r'```[\s\S]*?```')
    code_blocks = fence.findall(original_content)
    # No capture group in the pattern, so split() yields only the text
    # between fences: always len(code_blocks) + 1 entries.
    text_parts = fence.split(original_content)

    remaining = iter(translated_segments)
    pieces = []
    for index, text_part in enumerate(text_parts):
        if text_part.strip():
            pieces.append(next(remaining, ""))
        if index < len(code_blocks):
            pieces.append(code_blocks[index])
    return "\n\n".join(pieces).strip()
def translate_readme(input_file, output_dir):
    """Translate a README into each configured language and write the results.

    The text segments of ``input_file`` are extracted once. For every target
    language (``en``, ``ru``, ``fa``) each segment that passes
    ``should_translate`` and contains at least one character in the range
    U+4E00-U+9FFF is translated; all other segments are kept as they are.
    The recombined document is written to
    ``<output_dir>/languages/<lang_code>/<basename of input_file>``.

    Translations are cached in ``translation_cache.json`` (relative to the
    current working directory). The cache is written back even when the run
    is interrupted or fails part-way, so translations already fetched are
    not lost.

    Args:
        input_file: Path of the source README.
        output_dir: Directory under which ``languages/<lang_code>/`` is created.
    """
    # Load the translation cache.
    cache_file = 'translation_cache.json'
    translations = load_translation_cache(cache_file)
    logging.info(f"缓存文件 '{cache_file}' 加载完成,共有 {len(translations)} 条缓存记录。")

    # Initialise the translator.
    translator = Translator(service_urls=['translate.google.com'])
    logging.info("翻译器初始化完成。")

    # Read the source README.
    with open(input_file, 'r', encoding='utf-8') as f:
        content = f.read()
    logging.info(f"读取 '{input_file}' 完成,共 {len(content)} 个字符。")

    # Extract the text segments outside code blocks.
    text_segments = extract_text_segments(content)
    logging.info(f"提取到 {len(text_segments)} 个文本段落。")

    # Target languages: language code -> name used in log messages.
    languages = {
        'en': 'english',
        'ru': 'russian',
        'fa': 'persian'
    }

    # Output files keep the source file's name.
    source_filename = os.path.basename(input_file)

    # Only segments containing characters in this range are translated.
    cjk_pattern = re.compile(r'[\u4e00-\u9fff]')

    try:
        for lang_code, lang_name in languages.items():
            logging.info(f"开始翻译到 {lang_name} ({lang_code})")
            translated_segments = []
            for idx, segment in enumerate(text_segments, 1):
                if should_translate(segment) and cjk_pattern.search(segment):
                    translated = translate_text(segment, lang_code, translator, translations)
                    translated_segments.append(translated)
                    logging.debug(f"段落 {idx} 翻译为 {lang_name}:{translated}")
                else:
                    translated_segments.append(segment)
                    logging.debug(f"段落 {idx} 不需要翻译,保持原样。")

            # Recombine the translated text with the untouched code blocks.
            translated_content = replace_translation(content, translated_segments)
            logging.info(f"{lang_name} 翻译内容重新组合完成。")

            # One directory per target language.
            lang_dir = os.path.join(output_dir, 'languages', lang_code)
            os.makedirs(lang_dir, exist_ok=True)

            output_file = os.path.join(lang_dir, source_filename)
            with open(output_file, 'w', encoding='utf-8') as f:
                f.write(translated_content)
            logging.info(f"{lang_name} 翻译完成,文件保存至: {output_file}")
    finally:
        # Persist whatever was translated, including on Ctrl-C or an I/O
        # error, so the next run does not have to fetch it again.
        save_translation_cache(cache_file, translations)
    logging.info("所有翻译完成,并已保存缓存。")
if __name__ == "__main__":
    # Source file path and output directory (change the latter as needed).
    input_readme, output_directory = 'README.md', '.'
    translate_readme(input_file=input_readme, output_dir=output_directory)