-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathopenai_api_auto_translate.py
212 lines (175 loc) · 7.99 KB
/
openai_api_auto_translate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
# PDF-translator-OpenAI-API
# https://github.com/FlyingFathead/PDF-translator-OpenAI-API/
#
# FlyingFathead // Dec 2023
# v0.10
#
# changelog:
# v0.10 - translation combining via `combine_translation.py`
# v0.09 - token handling, naming policy
# v0.08 - more changes to the API call functionality
# v0.07 - API call updated and fixed for openai >v1.0
# v0.06 - fixes to the API call
# v0.05 - calculate the cost approximation
# v0.04 - calculate both tokens and chars
import sys
import os
import shutil
import configparser
import openai
from transformers import GPT2Tokenizer
from openai_pricing_calculator import calculate_cost
# print term width horizontal line
def hz_line(character='-'):
    """Print a horizontal rule as wide as the current terminal.

    Args:
        character: String repeated once per terminal column. Defaults to '-'.
    """
    columns = shutil.get_terminal_size().columns
    print(character * columns)
# load the config
def load_config():
    """Read ``config.ini`` from the working directory.

    Returns:
        The parser's ``DEFAULT`` section proxy. ``ConfigParser.read`` skips
        files it cannot find, so the section is simply empty when
        ``config.ini`` is absent.
    """
    parser = configparser.ConfigParser()
    parser.read('config.ini')
    default_section = parser[configparser.DEFAULTSECT]  # DEFAULTSECT == 'DEFAULT'
    return default_section
# API key reading
# First, try to get the API key from an environment variable.
# An empty or whitespace-only value is treated the same as "not set".
openai.api_key = (os.getenv('OPENAI_API_KEY') or '').strip() or None

# If the environment variable is not usable, try to read the key from a file
if not openai.api_key:
    try:
        with open('api_token.txt', 'r', encoding='utf-8') as file:
            # An empty file yields '' here, which the final check below rejects.
            openai.api_key = file.read().strip() or None
    except FileNotFoundError:
        print("Error: The OPENAI_API_KEY environment variable is not set, and api_token.txt was not found. Please set the environment variable or create this file with your OpenAI API key.")
        sys.exit(1)

# If the key is still missing/empty at this point, neither method was successful
if not openai.api_key:
    print("Error: Failed to obtain OpenAI API key. Please set the OPENAI_API_KEY environment variable or create a file named api_token.txt with your OpenAI API key.")
    sys.exit(1)
# count the tokens
# Holds the GPT-2 tokenizer after its first load so that counting many files
# does not call ``from_pretrained`` once per file.
_TOKENIZER_CACHE = {}


def _get_gpt2_tokenizer():
    """Return the shared GPT-2 tokenizer, loading it on first use only."""
    if "gpt2" not in _TOKENIZER_CACHE:
        _TOKENIZER_CACHE["gpt2"] = GPT2Tokenizer.from_pretrained("gpt2")
    return _TOKENIZER_CACHE["gpt2"]


def count_tokens_and_chars(file_path):
    """Count GPT-2 tokens and characters in a UTF-8 text file.

    The text is encoded in slices of ``tokenizer.model_max_length``
    characters so no single ``encode`` call receives the whole file. Slices
    are cut at character offsets, so the token total is an estimate.

    Args:
        file_path: Path of the text file to measure.

    Returns:
        ``(tokens_count, chars_count)`` on success, or ``(None, None)`` if
        reading or encoding the file fails (the error is printed).
    """
    tokenizer = _get_gpt2_tokenizer()
    chunk_size = tokenizer.model_max_length
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()

        tokens_count = 0
        for start in range(0, len(text), chunk_size):
            chunk = text[start:start + chunk_size]
            tokens_count += len(tokenizer.encode(chunk, add_special_tokens=False))

        # The slices partition the text, so their lengths sum to len(text).
        return tokens_count, len(text)
    except Exception as e:
        # Best-effort: the caller skips files that report (None, None).
        print(f"Error processing file: {e}")
        return None, None  # Return None for both counts in case of an error
# text translation via OpenAI API
def translate_text(text, model, instructions, max_allowed_tokens):
    """Translate one chunk of text with the OpenAI chat completions API.

    Args:
        text: The text to translate; sent as the user message.
        model: Name of the chat model to call.
        instructions: Translation instructions; sent as the system message.
        max_allowed_tokens: Token budget from which the completion's
            ``max_tokens`` is derived.

    Returns:
        The stripped content of the first completion choice, or ``None`` if
        the request or response handling raised (the error is printed).
    """
    # Use the key resolved at module import time (environment variable OR
    # api_token.txt). Reading only the environment here would ignore a key
    # that was loaded from the file.
    client = openai.OpenAI(api_key=openai.api_key or os.getenv('OPENAI_API_KEY'))

    # Whitespace-separated word count is only a rough stand-in for the real
    # token count of the input.
    input_tokens = len(text.split())

    # Leave room for the input within the budget, but always request at
    # least one token so the API call stays valid.
    max_tokens = max(1, max_allowed_tokens - input_tokens)

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": f"{instructions}"},
                {"role": "user", "content": text},
            ],
            max_tokens=max_tokens,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort per segment: report the failure and let the caller
        # decide what to do with a missing translation.
        print(f"Error during translation: {e}")
        return None
def split_text_for_translation(file_path, char_limit):
    """Split a UTF-8 text file into line-aligned sections for translation.

    Lines are accumulated into a section while the section length plus the
    next line stays below ``char_limit``. Blank lines are always attached to
    the current section. A single line longer than the limit is kept whole
    in its own section rather than being cut.

    Sections with no non-whitespace text are never emitted: leading blank
    lines are folded into the following section, and an empty or all-blank
    file yields an empty list. This avoids sending empty requests to the
    translation API.

    Args:
        file_path: Path of the text file to split.
        char_limit: Soft upper bound on section length, in characters.

    Returns:
        A list of section strings; every line in a section ends with a
        newline.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    sections = []
    current_lines = []   # lines of the section being built
    current_length = 0   # total characters in current_lines
    has_text = False     # True once current_lines holds a non-blank line

    for line in content.split('\n'):
        is_blank = not line.strip()
        fits = current_length + len(line) < char_limit
        # Start a new section only when the line does not fit AND the pending
        # section actually contains something worth translating.
        if not (fits or is_blank) and has_text:
            sections.append(''.join(current_lines))
            current_lines, current_length, has_text = [], 0, False
        current_lines.append(line + '\n')
        current_length += len(line) + 1
        has_text = has_text or not is_blank

    if has_text:
        sections.append(''.join(current_lines))
    return sections
# def main(file_path, char_limit):
def main(directory, model, char_limit, instructions, max_tokens):
    """Translate every ``.txt`` file in a directory after user confirmation.

    Prints combined token/character counts and an estimated cost, asks the
    user to confirm, then translates each file section by section and writes
    the result next to the source as ``translated_<original name>``.

    Args:
        directory: Directory containing the ``.txt`` files to translate.
        model: Chat model name passed to ``translate_text``.
        char_limit: Section size limit passed to
            ``split_text_for_translation``.
        instructions: Translation instructions passed to ``translate_text``.
        max_tokens: Token budget passed to ``translate_text``.

    Exits the process with status 1 if the directory is missing or has no
    text files, and with status 0 if the user declines to continue.
    """
    if not os.path.isdir(directory):
        print(f"Directory {directory} does not exist or is not a directory.")
        sys.exit(1)

    # Sorted for a deterministic processing order (os.listdir order is
    # arbitrary). Files this script wrote on an earlier run are skipped so a
    # re-run does not translate (and bill for) its own output.
    text_files = sorted(
        f for f in os.listdir(directory)
        if f.endswith('.txt') and not f.startswith('translated_')
    )
    if not text_files:
        print(f"No text files found in directory {directory}.")
        sys.exit(1)

    total_tokens = 0
    total_chars = 0
    print("Calculating token and character counts for files...", flush=True)
    for file in text_files:
        file_path = os.path.join(directory, file)
        token_count, char_count = count_tokens_and_chars(file_path)
        # Files that could not be measured report (None, None) and are
        # left out of the totals.
        if token_count is not None and char_count is not None:
            total_tokens += token_count
            total_chars += char_count

    hz_line()
    print(f"::: Directory: {directory}")
    print(f"::: Number of text files: {len(text_files)}")
    print(f"::: Combined token length: {total_tokens} tokens")
    print(f"::: Combined character length: {total_chars} characters")
    print(f"::: Instructions to the model: {instructions}")
    print(f"::: Model in use: {model}")

    # Cost estimate: output token count is assumed to be approximately equal
    # to the input token count.
    input_token_count = total_tokens
    output_token_count = total_tokens
    try:
        cost = calculate_cost(model, input_token_count, output_token_count)
        print(f"::: Estimated cost for translation: ${cost:.4f}")
    except ValueError as e:
        print(f"Error in cost calculation: {e}")
    hz_line()

    confirm = input("Do you wish to continue (y/n)? ")
    if confirm.lower() != 'y':
        print("Translation cancelled.")
        sys.exit(0)

    for file in text_files:
        file_path = os.path.join(directory, file)
        output_filename = os.path.join(directory, f"translated_{file}")
        print(f"Processing file: {file}", flush=True)

        sections = split_text_for_translation(file_path, char_limit)
        translated_parts = []
        for i, section in enumerate(sections, start=1):
            hz_line()
            print(f"::: Translating segment: {i}/{len(sections)} of file {file}", flush=True)
            hz_line()
            translated_section = translate_text(section, model, instructions, max_tokens)
            if translated_section:
                translated_parts.append(translated_section)
            else:
                # Make gaps in the output visible instead of dropping the
                # segment without a trace.
                print(f"Warning: segment {i}/{len(sections)} of file {file} returned no translation and was skipped.", flush=True)

        # Each translated segment comes back stripped, so re-insert the line
        # break that separated the source sections; otherwise the last word
        # of one segment runs into the first word of the next.
        with open(output_filename, 'w', encoding='utf-8') as output_file:
            output_file.write("\n".join(translated_parts))
        print(f"Translated file written to {output_filename}", flush=True)
if __name__ == "__main__":
    # Exactly one command-line argument is expected: the directory to process.
    if len(sys.argv) != 2:
        print("Usage: python openai_api_auto_translate.py <directory>")
        sys.exit(1)

    # Settings come from config.ini; each falls back to a built-in default
    # when the option is missing.
    config = load_config()
    directory = sys.argv[1]
    settings = {
        'char_limit': int(config.get('MaxCharacterLimit', 100000)),
        'max_tokens': int(config.get('MaxTokens', 16000)),
        'model': config.get('Model', 'gpt-3.5-turbo'),
        'instructions': config.get(
            'TranslationInstructions',
            'Translate this Finnish text to English, format the text properly',
        ),
    }

    main(directory, **settings)