Skip to content

Commit

Permalink
code refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
adam-pawelek committed Jan 29, 2025
1 parent a728776 commit 7850614
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 31 deletions.
25 changes: 14 additions & 11 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ print(translated_text) # Output: Hello everyone
---

### Using `AsyncTranslator` (Asynchronous)

```python
import asyncio
from llmtranslate import AsyncTranslator
Expand All @@ -63,41 +64,43 @@ from langchain_openai import ChatOpenAI
# Initialize the LLM and AsyncTranslator
llm = ChatOpenAI(model_name="gpt-4o", openai_api_key="your_openai_api_key")


async def translate_text():
translator = AsyncTranslator(
llm=llm,
max_length_text_chunk_to_translate=100,
max_length_text_chunk_to_translate_multiple_languages=50,
llm=llm,
max_translation_chunk_length=100,
max_translation_chunk_length_multilang=50,
max_concurrent_llm_calls=10
)
tasks = [
translator.get_text_language("Hi how are you?"),
translator.get_text_language("Hi how are you?"),
translator.translate("Hi how are you?", "Spanish")
]
results = await asyncio.gather(*tasks)
# Output the detected language information
text_language = results[0]
if results:
print(text_language.ISO_639_1_code) # Output: en
print(text_language.ISO_639_2_code) # Output: eng
print(text_language.ISO_639_3_code) # Output: eng
print(text_language.language_name) # Output: English
print(text_language.ISO_639_1_code) # Output: en
print(text_language.ISO_639_2_code) # Output: eng
print(text_language.ISO_639_3_code) # Output: eng
print(text_language.language_name) # Output: English

# Output the translated text
print(results[1]) # Output: Hola, ¿cómo estás?


# Run the asynchronous translation
asyncio.run(translate_text())
```

---
## Key Parameters

### `max_length_text_chunk_to_translate`
### `max_translation_chunk_length`
- **Description**: Defines the maximum length (in characters) of a text chunk to be translated in a single call when the text is in one language.
- **Recommendation**: If translations are inaccurate, try reducing this value; weaker LLMs tend to struggle with large chunks of text.

### `max_length_text_chunk_to_translate_multiple_languages`
### `max_translation_chunk_length_multilang`
- **Description**: Defines the maximum length (in characters) of a text chunk when the text contains multiple languages.
- **Recommendation**: If translations of multi-language inputs are inaccurate, try reducing this value.

Expand Down
30 changes: 15 additions & 15 deletions llmtranslate/translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,19 +94,19 @@ class HowManyLanguages(BaseModel):



def __init__(self, llm: BaseChatModel, max_length_text_chunk_to_translate: int = 200, max_length_text_chunk_to_translate_multiple_languages: int = 50):
def __init__(self, llm: BaseChatModel, max_translation_chunk_length: int = 200, max_translation_chunk_length_multilang: int = 50):
"""
Initializes the Translator with the given parameters.
Args:
llm (BaseChatModel):
The large language model used for performing language detection
and translation tasks.
max_length_text_chunk_to_translate (int, optional):
max_translation_chunk_length (int, optional):
The maximum length of text to be translated in a single chunk
when dealing with a single language. If not provided,
a default value is used.
max_length_text_chunk_to_translate_multiple_languages (int, optional):
max_translation_chunk_length_multilang (int, optional):
The maximum length of text to be translated in a single chunk
when multiple languages are present. If not provided,
a different default value is used.
Expand All @@ -115,8 +115,8 @@ def __init__(self, llm: BaseChatModel, max_length_text_chunk_to_translate: int =
raise MissingLangchainChatModelError()

self.llm = llm
self.max_length_text_chunk_to_translate = max_length_text_chunk_to_translate if max_length_text_chunk_to_translate else MAX_LENGTH
self.max_length_text_chunk_to_translate_multiple_languages = max_length_text_chunk_to_translate_multiple_languages if max_length_text_chunk_to_translate_multiple_languages else MAX_LENGTH_MINI_TEXT_CHUNK
self.max_translation_chunk_length = max_translation_chunk_length if max_translation_chunk_length else MAX_LENGTH
self.max_translation_chunk_length_multilang = max_translation_chunk_length_multilang if max_translation_chunk_length_multilang else MAX_LENGTH_MINI_TEXT_CHUNK
######## detect language ##########
structured_llm_detect_language = llm.with_structured_output(BaseTranslator.TextLanguageFormat)
self.few_shot_structured_llm_detect_language = BaseTranslator.prompt_detect_language | structured_llm_detect_language
Expand Down Expand Up @@ -156,7 +156,7 @@ def get_text_language(self, text: str) -> TextLanguage:
>>> print(language.language_name)
French
"""
text = get_first_n_words(text, self.max_length_text_chunk_to_translate)
text = get_first_n_words(text, self.max_translation_chunk_length)
response = self.few_shot_structured_llm_detect_language.invoke(text)
response_message = response.language_ISO_639_1_code
try:
Expand Down Expand Up @@ -189,13 +189,13 @@ def translate(self, text: str, to_language ="Spanish") -> str:
>>> print(result)
"Hola mundo. Hola el mundo."
"""
text_chunks = split_text_to_chunks(text, self.max_length_text_chunk_to_translate)
text_chunks = split_text_to_chunks(text, self.max_translation_chunk_length)
counted_number_of_languages = [self.how_many_languages_are_in_text(text_chunk) for text_chunk in text_chunks]

translated_list = []
for index, text_chunk in enumerate(text_chunks):
if counted_number_of_languages[index] > 1:
mini_text_chunks = split_text_to_chunks(text_chunk, self.max_length_text_chunk_to_translate_multiple_languages)
mini_text_chunks = split_text_to_chunks(text_chunk, self.max_translation_chunk_length_multilang)
for mini_text_chunk in mini_text_chunks:
translated_list.append(self.translate_chunk_of_text(mini_text_chunk, to_language))
else:
Expand All @@ -221,11 +221,11 @@ class AsyncTranslator(BaseTranslator):
def __init__(
self,
llm: BaseChatModel,
max_length_text_chunk_to_translate: int = 200,
max_length_text_chunk_to_translate_multiple_languages: int = 50,
max_translation_chunk_length: int = 200,
max_translation_chunk_length_multilang: int = 50,
max_concurrent_llm_calls: int = 100
):
super().__init__(llm, max_length_text_chunk_to_translate, max_length_text_chunk_to_translate_multiple_languages)
super().__init__(llm, max_translation_chunk_length, max_translation_chunk_length_multilang)
self.semaphore = asyncio.Semaphore(max_concurrent_llm_calls)


Expand Down Expand Up @@ -256,9 +256,9 @@ async def get_text_language(self, text) -> TextLanguage:
>>> print(language.language_name)
French
"""
text = get_first_n_words(text, self.max_length_text_chunk_to_translate)
text = get_first_n_words(text, self.max_translation_chunk_length)
async with self.semaphore:
text = get_first_n_words(text, self.max_length_text_chunk_to_translate)
text = get_first_n_words(text, self.max_translation_chunk_length)
response = await self.few_shot_structured_llm_detect_language.ainvoke(text)
response_message = response.language_ISO_639_1_code
try:
Expand Down Expand Up @@ -297,7 +297,7 @@ async def translate(self, text: str, to_language ="Spanish") -> str:
>>> print(result)
"Hola mundo. Hola el mundo."
"""
text_chunks = split_text_to_chunks(text, self.max_length_text_chunk_to_translate)
text_chunks = split_text_to_chunks(text, self.max_translation_chunk_length)

# Run how_many_languages_are_in_text concurrently
# Chunks that contain more than one language will be split (this will simplify translation for the LLM)
Expand All @@ -306,7 +306,7 @@ async def translate(self, text: str, to_language ="Spanish") -> str:
tasks = []
for index, text_chunk in enumerate(text_chunks):
if counted_number_of_languages[index] > 1:
mini_text_chunks = split_text_to_chunks(text_chunk, self.max_length_text_chunk_to_translate_multiple_languages)
mini_text_chunks = split_text_to_chunks(text_chunk, self.max_translation_chunk_length_multilang)
for mini_text_chunk in mini_text_chunks:
tasks.append(self.translate_chunk_of_text(mini_text_chunk, to_language))
else:
Expand Down
2 changes: 1 addition & 1 deletion my_notes/my_tests/new_langchain_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

llm = ChatOpenAI(model="gpt-4o-mini")

translator = AsyncTranslator(llm=llm, max_length_text_chunk_to_translate=100, max_length_text_chunk_to_translate_multiple_languages=100)
translator = AsyncTranslator(llm=llm, max_translation_chunk_length=100, max_translation_chunk_length_multilang=100)


async def run_test():
Expand Down
2 changes: 1 addition & 1 deletion my_notes/my_tests/testt_new_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model_name="gpt-4o", openai_api_key=os.getenv("OPENAI_API_KEY"))
translator = Translator(llm=llm, max_length_text_chunk_to_translate=100, max_length_text_chunk_to_translate_multiple_languages=100)
translator = Translator(llm=llm, max_translation_chunk_length=100, max_translation_chunk_length_multilang=100)
text_language = translator.get_text_language("Hi how are you?")
if text_language:
print(text_language.ISO_639_1_code)
Expand Down
2 changes: 1 addition & 1 deletion my_notes/my_tests/testt_new_translator_async.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

llm = ChatOpenAI(model_name="gpt-4o", openai_api_key=os.getenv("OPENAI_API_KEY"))
async def translate_text():
translator = AsyncTranslator(llm=llm, max_length_text_chunk_to_translate=100, max_length_text_chunk_to_translate_multiple_languages=50, max_concurrent_llm_calls=10)
translator = AsyncTranslator(llm=llm, max_translation_chunk_length=100, max_translation_chunk_length_multilang=50, max_concurrent_llm_calls=10)
tasks = [translator.get_text_language("Hi how are you?"), translator.translate("Hi how are you?", "Spanish")]
results = await asyncio.gather(*tasks)
print(results[0])
Expand Down
4 changes: 2 additions & 2 deletions tests/test_translator.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ class TestTranslator:
@pytest.fixture
def translator(self):
llm = ChatOpenAI(model_name="gpt-4o-mini", openai_api_key=os.getenv("OPENAI_API_KEY"))
return Translator(llm=llm, max_length_text_chunk_to_translate=100, max_length_text_chunk_to_translate_multiple_languages=100)
return Translator(llm=llm, max_translation_chunk_length=100, max_translation_chunk_length_multilang=100)

def test_set_api_key_success(self, translator):
assert translator.llm is not None
Expand Down Expand Up @@ -116,7 +116,7 @@ class TestAsyncTranslator:
@pytest.fixture
def translator(self):
llm = ChatOpenAI(model_name="gpt-4o-mini", openai_api_key=os.getenv("OPENAI_API_KEY"))
return AsyncTranslator(llm=llm, max_length_text_chunk_to_translate=100, max_length_text_chunk_to_translate_multiple_languages=100, max_concurrent_llm_calls=10)
return AsyncTranslator(llm=llm, max_translation_chunk_length=100, max_translation_chunk_length_multilang=100, max_concurrent_llm_calls=10)

def test_set_api_key_success(self, translator):
assert translator.llm is not None
Expand Down

0 comments on commit 7850614

Please sign in to comment.