From 2d85c7dce5ca80a7ab5be69e85263129998ae3d1 Mon Sep 17 00:00:00 2001 From: Sarang-Nambiar <101126190+Sarang-Nambiar@users.noreply.github.com> Date: Thu, 16 Jan 2025 13:47:47 +0800 Subject: [PATCH 1/6] Initial commit --- app/features/text_rewriter/__init__.py | 0 app/features/text_rewriter/core.py | 32 +++ app/features/text_rewriter/metadata.json | 19 ++ .../prompt/text-rewriter-prompt.txt | 10 + app/features/text_rewriter/tests/__init__.py | 0 app/features/text_rewriter/tests/test_core.py | 222 ++++++++++++++++++ app/features/text_rewriter/tools.py | 84 +++++++ 7 files changed, 367 insertions(+) create mode 100644 app/features/text_rewriter/__init__.py create mode 100644 app/features/text_rewriter/core.py create mode 100644 app/features/text_rewriter/metadata.json create mode 100644 app/features/text_rewriter/prompt/text-rewriter-prompt.txt create mode 100644 app/features/text_rewriter/tests/__init__.py create mode 100644 app/features/text_rewriter/tests/test_core.py create mode 100644 app/features/text_rewriter/tools.py diff --git a/app/features/text_rewriter/__init__.py b/app/features/text_rewriter/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/app/features/text_rewriter/core.py b/app/features/text_rewriter/core.py new file mode 100644 index 00000000..ad14039f --- /dev/null +++ b/app/features/text_rewriter/core.py @@ -0,0 +1,32 @@ +from app.services.logger import setup_logger +from app.utils.document_loaders import get_docs +from app.features.text_rewriter.tools import TextRewriter +from app.api.error_utilities import ToolExecutorError + +logger = setup_logger() + +def executor( + instructions: str, + file_url: str, + file_type: str, + verbose=False): + try: + if verbose: + logger.info(f"File URL loaded: {file_url}") + + if file_type and file_url: + docs = get_docs(file_url, file_type, verbose=True) + else: + docs = None + raise ToolExecutorError("File URL and file type must be provided") + + # TODO: IMPLEMENT CLASS HERE + output = TextRewriter(instructions, verbose=verbose).rewrite(docs) + + except Exception as e: + error_message = f"Error in executor: {e}" + logger.error(error_message) + raise ValueError(error_message) + + return output + diff --git a/app/features/text_rewriter/metadata.json b/app/features/text_rewriter/metadata.json new file mode 100644 index 00000000..56c6fbde --- /dev/null +++ b/app/features/text_rewriter/metadata.json @@ -0,0 +1,19 @@ +{ + "inputs": [ + { + "label": "Instructions", + "name": "instructions", + "type": "text" + }, + { + "label": "File URL", + "name": "file_url", + "type": "text" + }, + { + "label": "File Type", + "name": "file_type", + "type": "text" + } + ] +} \ No newline at end of file diff --git a/app/features/text_rewriter/prompt/text-rewriter-prompt.txt b/app/features/text_rewriter/prompt/text-rewriter-prompt.txt new file mode 100644 index 00000000..48266ce7 --- /dev/null +++ b/app/features/text_rewriter/prompt/text-rewriter-prompt.txt @@ -0,0 +1,10 @@ +You are a helpful AI assistant thatt helps users rewrite text from documents. + +You are given instructions on how to rewrite a given text from the context provided: +{instructions} + +You must respond as a JSON object: +{format_instructions} + +Context: +{context} \ No newline at end of file diff --git a/app/features/text_rewriter/tests/__init__.py b/app/features/text_rewriter/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/app/features/text_rewriter/tests/test_core.py b/app/features/text_rewriter/tests/test_core.py new file mode 100644 index 00000000..17f2fa3c --- /dev/null +++ b/app/features/text_rewriter/tests/test_core.py @@ -0,0 +1,222 @@ +import pytest + +from app.tools.ai_resistant_assignment_generator.core import executor + +# Base attributes reused across all tests +base_attributes = { + "assignment": "Math Homework", + "grade_level": "university", + "lang": "en" +} + +# PDF Tests +def test_executor_pdf_url_valid(): + ai_resistant_assignment = executor( + **base_attributes, + file_url="https://filesamples.com/samples/document/pdf/sample1.pdf", + file_type="pdf" + ) + assert isinstance(ai_resistant_assignment, dict) + +def test_executor_pdf_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://filesamples.com/samples/document/pdf/sample1.pdf", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# CSV Tests +def test_executor_csv_url_valid(): + ai_resistant_assignment = executor( + **base_attributes, + file_url="https://filesamples.com/samples/document/csv/sample1.csv", + file_type="csv" + ) + assert isinstance(ai_resistant_assignment, dict) + +def test_executor_csv_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://filesamples.com/samples/document/csv/sample1.csv", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# TXT Tests +def test_executor_txt_url_valid(): + ai_resistant_assignment = executor( + **base_attributes, + file_url="https://filesamples.com/samples/document/txt/sample1.txt", + file_type="txt" + ) + assert isinstance(ai_resistant_assignment, dict) + +def test_executor_txt_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://filesamples.com/samples/document/txt/sample1.txt", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# MD Tests +def test_executor_md_url_valid(): + ai_resistant_assignment = executor( + **base_attributes, + file_url="https://github.com/radicalxdev/kai-ai-backend/blob/main/README.md", + file_type="md" + ) + assert isinstance(ai_resistant_assignment, dict) + +def test_executor_md_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://github.com/radicalxdev/kai-ai-backend/blob/main/README.md", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# PPTX Tests +def test_executor_pptx_url_valid(): + ai_resistant_assignment = executor( + **base_attributes, + file_url="https://scholar.harvard.edu/files/torman_personal/files/samplepptx.pptx", + file_type="pptx" + ) + assert isinstance(ai_resistant_assignment, dict) + +def test_executor_pptx_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://scholar.harvard.edu/files/torman_personal/files/samplepptx.pptx", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# DOCX Tests +def test_executor_docx_url_valid(): + ai_resistant_assignment = executor( + **base_attributes, + file_url="https://filesamples.com/samples/document/docx/sample1.docx", + file_type="docx" + ) + assert isinstance(ai_resistant_assignment, dict) + +def test_executor_docx_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://filesamples.com/samples/document/docx/sample1.docx", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# XLS Tests +def test_executor_xls_url_valid(): + ai_resistant_assignment = executor( + **base_attributes, + file_url="https://filesamples.com/samples/document/xls/sample1.xls", + file_type="xls" + ) + assert isinstance(ai_resistant_assignment, dict) + +def test_executor_xls_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://filesamples.com/samples/document/xls/sample1.xls", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# XLSX Tests +def test_executor_xlsx_url_valid(): + ai_resistant_assignment = executor( + **base_attributes, + file_url="https://filesamples.com/samples/document/xlsx/sample1.xlsx", + file_type="xlsx" + ) + assert isinstance(ai_resistant_assignment, dict) + +def test_executor_xlsx_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://filesamples.com/samples/document/xlsx/sample1.xlsx", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# XML Tests +def test_executor_xml_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://filesampleshub.com/download/code/xml/dummy.xml", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# GDocs Tests +def test_executor_gdocs_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://docs.google.com/document/d/1OWQfO9LX6psGipJu9LabzNE22us1Ct/edit", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# GSheets Tests +def test_executor_gsheets_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://docs.google.com/spreadsheets/d/16OPtLLSfU/edit", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# GSlides Tests +def test_executor_gslides_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://docs.google.com/spreadsheets/d/16OPtLLSfU/edit", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# GPDFs Tests +def test_executor_gpdfs_url_valid(): + ai_resistant_assignment = executor( + **base_attributes, + file_url="https://drive.google.com/file/d/1fUj1uWIMh6QZsPkt0Vs7mEd2VEqz3O8l/view", + file_type="gpdf" + ) + assert isinstance(ai_resistant_assignment, dict) + +def test_executor_gpdfs_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://drive.google.com/file/d/1fUj1uWIMh6QZsPkt0Vs7mEd2VEqz3O8l/view", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) + +# MP3 Tests +def test_executor_mp3_url_invalid(): + with pytest.raises(ValueError) as exc_info: + executor( + **base_attributes, + file_url="https://raw.githubusercontent.com/asleem/uploaded_files/main/dummy.mp3", + file_type=1 + ) + assert isinstance(exc_info.value, ValueError) diff --git a/app/features/text_rewriter/tools.py b/app/features/text_rewriter/tools.py new file mode 100644 index 00000000..10015cac --- /dev/null +++ b/app/features/text_rewriter/tools.py @@ -0,0 +1,84 @@ +from pydantic import BaseModel, Field +from typing import List, Dict, Optional +import os +from langchain_core.documents import Document +from langchain_chroma import Chroma +from langchain_core.prompts import PromptTemplate +from langchain_core.runnables import RunnablePassthrough, RunnableParallel +from langchain_core.output_parsers import JsonOutputParser +from langchain_google_genai import GoogleGenerativeAI +from langchain_google_genai import GoogleGenerativeAIEmbeddings + +from app.services.logger import setup_logger + +logger = setup_logger(__name__) + +def read_text_file(file_path): + # Get the directory containing the script file + script_dir = os.path.dirname(os.path.abspath(__file__)) + + # Combine the script directory with the relative file path + absolute_file_path = os.path.join(script_dir, file_path) + + with open(absolute_file_path, 'r') as file: + return file.read() + +class TextRewriter: + def __init__(self, instructions, vectorstore_class=Chroma, prompt=None, embedding_model=None, model=None, parser=None, verbose=False): + default_config = { + "model": GoogleGenerativeAI(model="gemini-1.5-flash"), + "embedding_model": GoogleGenerativeAIEmbeddings(model='models/embedding-001'), + "parser": JsonOutputParser(pydantic_object=RewrittenOutput), + "prompt": read_text_file("prompt/ai-resistant-prompt.txt"), + "prompt_without_context": read_text_file("prompt/ai-resistant-without-context-prompt.txt"), + "vectorstore_class": Chroma + } + + self.prompt = prompt or default_config["prompt"] + self.prompt_without_context = default_config["prompt_without_context"] + self.model = model or default_config["model"] + self.parser = parser or default_config["parser"] + self.embedding_model = embedding_model or default_config["embedding_model"] + + self.vectorstore_class = vectorstore_class or default_config["vectorstore_class"] + self.vectorstore, self.retriever, self.runner = None, None, None + self.instructions = instructions + self.verbose = verbose + + if vectorstore_class is None: raise ValueError("Vectorstore must be provided") + + def compile_with_docs(self, documents: List[Document]): + # Return the chain + prompt = PromptTemplate( + template=self.prompt, + input_variables=["instructions"], + partial_variables={"format_instructions": self.parser.get_format_instructions()} + ) + + if self.runner is None: + logger.info(f"Creating vectorstore from {len(documents)} documents") if self.verbose else None + self.vectorstore = self.vectorstore_class.from_documents(documents, self.embedding_model) + logger.info(f"Vectorstore created") if self.verbose else None + + self.retriever = self.vectorstore.as_retriever() + logger.info(f"Retriever created successfully") if self.verbose else None + + self.runner = RunnableParallel( + {"context": self.retriever, + "instructions": RunnablePassthrough() + } + ) + + chain = self.runner | prompt | self.model | self.parser + + logger.info(f"Chain compilation complete") + + return chain + + def rewrite(self, documents: List[Document]): + chain = self.compile_with_docs(documents) + output = chain.run() + return output + +class RewrittenOutput(BaseModel): + rewritten_text: str = Field(description="The rewritten text") \ No newline at end of file From 86be7391d3fe3591665598a9d176fb5c7286e32c Mon Sep 17 00:00:00 2001 From: Sarang-Nambiar <101126190+Sarang-Nambiar@users.noreply.github.com> Date: Thu, 16 Jan 2025 23:03:43 +0800 Subject: [PATCH 2/6] chore: essential functions added in tools.py --- app/features/text_rewriter/core.py | 3 +- .../prompt/text-rewriter-prompt.txt | 2 +- app/features/text_rewriter/tools.py | 68 +++++++++---------- 3 files changed, 36 insertions(+), 37 deletions(-) diff --git a/app/features/text_rewriter/core.py b/app/features/text_rewriter/core.py index ad14039f..80c351a3 100644 --- a/app/features/text_rewriter/core.py +++ b/app/features/text_rewriter/core.py @@ -10,17 +10,18 @@ def executor( file_url: str, file_type: str, verbose=False): + try: if verbose: logger.info(f"File URL loaded: {file_url}") if file_type and file_url: + logger.info(f"Generating docs. from {file_url} with type {file_type}") docs = get_docs(file_url, file_type, verbose=True) else: docs = None raise ToolExecutorError("File URL and file type must be provided") - # TODO: IMPLEMENT CLASS HERE output = TextRewriter(instructions, verbose=verbose).rewrite(docs) except Exception as e: diff --git a/app/features/text_rewriter/prompt/text-rewriter-prompt.txt b/app/features/text_rewriter/prompt/text-rewriter-prompt.txt index 48266ce7..a686e3d7 100644 --- a/app/features/text_rewriter/prompt/text-rewriter-prompt.txt +++ b/app/features/text_rewriter/prompt/text-rewriter-prompt.txt @@ -3,7 +3,7 @@ You are a helpful AI assistant thatt helps users rewrite text from documents. You are given instructions on how to rewrite a given text from the context provided: {instructions} -You must respond as a JSON object: +You must respond as a JSON object with the below format: {format_instructions} Context: diff --git a/app/features/text_rewriter/tools.py b/app/features/text_rewriter/tools.py index 10015cac..25f5c2b0 100644 --- a/app/features/text_rewriter/tools.py +++ b/app/features/text_rewriter/tools.py @@ -1,13 +1,10 @@ from pydantic import BaseModel, Field -from typing import List, Dict, Optional +from typing import List, Dict import os from langchain_core.documents import Document -from langchain_chroma import Chroma from langchain_core.prompts import PromptTemplate -from langchain_core.runnables import RunnablePassthrough, RunnableParallel from langchain_core.output_parsers import JsonOutputParser from langchain_google_genai import GoogleGenerativeAI -from langchain_google_genai import GoogleGenerativeAIEmbeddings from app.services.logger import setup_logger @@ -24,61 +21,62 @@ def read_text_file(file_path): return file.read() class TextRewriter: - def __init__(self, instructions, vectorstore_class=Chroma, prompt=None, embedding_model=None, model=None, parser=None, verbose=False): + def __init__(self, instructions, prompt=None, model=None, parser=None, verbose=False): default_config = { "model": GoogleGenerativeAI(model="gemini-1.5-flash"), - "embedding_model": GoogleGenerativeAIEmbeddings(model='models/embedding-001'), "parser": JsonOutputParser(pydantic_object=RewrittenOutput), - "prompt": read_text_file("prompt/ai-resistant-prompt.txt"), - "prompt_without_context": read_text_file("prompt/ai-resistant-without-context-prompt.txt"), - "vectorstore_class": Chroma + "prompt": read_text_file("prompt/text-rewriter-prompt.txt"), } self.prompt = prompt or default_config["prompt"] - self.prompt_without_context = default_config["prompt_without_context"] self.model = model or default_config["model"] self.parser = parser or default_config["parser"] - self.embedding_model = embedding_model or default_config["embedding_model"] - self.vectorstore_class = vectorstore_class or default_config["vectorstore_class"] - self.vectorstore, self.retriever, self.runner = None, None, None self.instructions = instructions self.verbose = verbose - - if vectorstore_class is None: raise ValueError("Vectorstore must be provided") - def compile_with_docs(self, documents: List[Document]): + def compile(self): # Return the chain prompt = PromptTemplate( template=self.prompt, - input_variables=["instructions"], + input_variables=["instructions", "context"], partial_variables={"format_instructions": self.parser.get_format_instructions()} ) - if self.runner is None: - logger.info(f"Creating vectorstore from {len(documents)} documents") if self.verbose else None - self.vectorstore = self.vectorstore_class.from_documents(documents, self.embedding_model) - logger.info(f"Vectorstore created") if self.verbose else None + chain = prompt | self.model | self.parser - self.retriever = self.vectorstore.as_retriever() - logger.info(f"Retriever created successfully") if self.verbose else None + return chain + + def validate_output(self, response: Dict) -> bool: + # TODO: implement a response validator here + # might need to add more checks + if 'rewritten_text' in response: + return True + return False - self.runner = RunnableParallel( - {"context": self.retriever, - "instructions": RunnablePassthrough() - } - ) + def rewrite(self, documents: List[Document]): + chain = self.compile() + doc_content = "\n".join([doc.page_content for doc in documents]) - chain = self.runner | prompt | self.model | self.parser + attempts = 0 + max_attempts = 5 - logger.info(f"Chain compilation complete") + while attempts < max_attempts: + response = chain.invoke( + instructions=self.instructions, + context=doc_content + ) - return chain + if self.verbose: + logger.info(f"Generated response attempt {attempts + 1}: {response}") - def rewrite(self, documents: List[Document]): - chain = self.compile_with_docs(documents) - output = chain.run() - return output + # validate response + if self.validate_output(response): + break + + # if response is invalid, retry + attempts += 1 + return response class RewrittenOutput(BaseModel): rewritten_text: str = Field(description="The rewritten text") \ No newline at end of file From ae2515da36e93ab6c42c0cf41c0446c07d07f773 Mon Sep 17 00:00:00 2001 From: Sarang-Nambiar <101126190+Sarang-Nambiar@users.noreply.github.com> Date: Sat, 18 Jan 2025 17:31:59 +0800 Subject: [PATCH 3/6] feat: rewriter functionality complete --- app/features/text_rewriter/core.py | 13 ++++---- app/features/text_rewriter/metadata.json | 5 ++++ .../prompt/text-rewriter-prompt.txt | 4 +-- app/features/text_rewriter/tools.py | 30 +++++++++++-------- app/tools/utils/tools_config.json | 4 +++ 5 files changed, 37 insertions(+), 19 deletions(-) diff --git a/app/features/text_rewriter/core.py b/app/features/text_rewriter/core.py index 80c351a3..d4f11c8f 100644 --- a/app/features/text_rewriter/core.py +++ b/app/features/text_rewriter/core.py @@ -5,24 +5,27 @@ logger = setup_logger() +ALLOWED_FILE_TYPES = {"pptx", "pdf", "docx", "txt", "csv", "youtube_url", "url", "gsheet"} + def executor( + raw_text: str, instructions: str, file_url: str, file_type: str, verbose=False): try: - if verbose: - logger.info(f"File URL loaded: {file_url}") + if verbose: logger.info(f"File URL loaded: {file_url}") - if file_type and file_url: + if file_type and file_url and file_type in ALLOWED_FILE_TYPES: logger.info(f"Generating docs. from {file_url} with type {file_type}") docs = get_docs(file_url, file_type, verbose=True) - else: + elif raw_text: docs = None + else: raise ToolExecutorError("File URL and file type must be provided") - output = TextRewriter(instructions, verbose=verbose).rewrite(docs) + output = TextRewriter(instructions, verbose=verbose).rewrite(raw_text, docs) except Exception as e: error_message = f"Error in executor: {e}" diff --git a/app/features/text_rewriter/metadata.json b/app/features/text_rewriter/metadata.json index 56c6fbde..90d0be49 100644 --- a/app/features/text_rewriter/metadata.json +++ b/app/features/text_rewriter/metadata.json @@ -5,6 +5,11 @@ "name": "instructions", "type": "text" }, + { + "label": "Text to Rewrite", + "name": "raw_text", + "type": "text" + }, { "label": "File URL", "name": "file_url", diff --git a/app/features/text_rewriter/prompt/text-rewriter-prompt.txt b/app/features/text_rewriter/prompt/text-rewriter-prompt.txt index a686e3d7..70e85eb5 100644 --- a/app/features/text_rewriter/prompt/text-rewriter-prompt.txt +++ b/app/features/text_rewriter/prompt/text-rewriter-prompt.txt @@ -1,10 +1,10 @@ You are a helpful AI assistant thatt helps users rewrite text from documents. -You are given instructions on how to rewrite a given text from the context provided: +You must follow the instructions given below on how the text should be rewritten: {instructions} You must respond as a JSON object with the below format: {format_instructions} -Context: +Text to rewrite: {context} \ No newline at end of file diff --git a/app/features/text_rewriter/tools.py b/app/features/text_rewriter/tools.py index 25f5c2b0..e66b3605 100644 --- a/app/features/text_rewriter/tools.py +++ b/app/features/text_rewriter/tools.py @@ -36,7 +36,6 @@ def __init__(self, instructions, prompt=None, model=None, parser=None, verbose=F self.verbose = verbose def compile(self): - # Return the chain prompt = PromptTemplate( template=self.prompt, input_variables=["instructions", "context"], @@ -45,37 +44,44 @@ def compile(self): chain = prompt | self.model | self.parser + if self.verbose: logger.info(f"Chain compiled: {chain}") + return chain def validate_output(self, response: Dict) -> bool: - # TODO: implement a response validator here - # might need to add more checks if 'rewritten_text' in response: return True return False - def rewrite(self, documents: List[Document]): + def rewrite(self, raw_text: str, documents: List[Document]): chain = self.compile() - doc_content = "\n".join([doc.page_content for doc in documents]) - + if documents: + doc_content = "\n".join([doc.page_content for doc in documents]) + else: + doc_content = raw_text + attempts = 0 max_attempts = 5 while attempts < max_attempts: - response = chain.invoke( - instructions=self.instructions, - context=doc_content - ) + response = chain.invoke({ + "instructions": self.instructions, + "context": doc_content + }) if self.verbose: logger.info(f"Generated response attempt {attempts + 1}: {response}") - # validate response + # validate response incase of LLM hallucinations if self.validate_output(response): break - + + if self.verbose: logger.warning(f"Invalid response generated, retrying...") # if response is invalid, retry attempts += 1 + + if self.verbose: logger.info(f"Final response generated: {response}") + return response class RewrittenOutput(BaseModel): diff --git a/app/tools/utils/tools_config.json b/app/tools/utils/tools_config.json index 08c2e241..525b8c2e 100644 --- a/app/tools/utils/tools_config.json +++ b/app/tools/utils/tools_config.json @@ -38,5 +38,9 @@ "writing-feedback-generator": { "path": "tools.writing_feedback_generator.core", "metadata_file": "metadata.json" + }, + "text-rewriter": { + "path": "features.text_rewriter.core", + "metadata_file": "metadata.json" } } From 03fe2f97a232140ff136ac6c34c0f48010c2345c Mon Sep 17 00:00:00 2001 From: Sarang-Nambiar <101126190+Sarang-Nambiar@users.noreply.github.com> Date: Sat, 18 Jan 2025 22:02:16 +0800 Subject: [PATCH 4/6] feat: Added unit test cases + bug fixes --- app/features/text_rewriter/core.py | 2 + app/features/text_rewriter/tests/test_core.py | 165 ++++++++---------- 2 files changed, 70 insertions(+), 97 deletions(-) diff --git a/app/features/text_rewriter/core.py b/app/features/text_rewriter/core.py index d4f11c8f..80f7ee04 100644 --- a/app/features/text_rewriter/core.py +++ b/app/features/text_rewriter/core.py @@ -23,6 +23,8 @@ def executor( elif raw_text: docs = None else: + if file_type not in ALLOWED_FILE_TYPES: + raise ToolExecutorError(f"File type {file_type} not supported") raise ToolExecutorError("File URL and file type must be provided") output = TextRewriter(instructions, verbose=verbose).rewrite(raw_text, docs) diff --git a/app/features/text_rewriter/tests/test_core.py b/app/features/text_rewriter/tests/test_core.py index 17f2fa3c..9f189eaa 100644 --- a/app/features/text_rewriter/tests/test_core.py +++ b/app/features/text_rewriter/tests/test_core.py @@ -1,22 +1,26 @@ import pytest -from app.tools.ai_resistant_assignment_generator.core import executor +from app.features.text_rewriter.core import executor +from app.api.error_utilities import InputValidationError # Base attributes reused across all tests base_attributes = { - "assignment": "Math Homework", - "grade_level": "university", - "lang": "en" + "instructions": "Rewrite the text in a more formal tone.", + "raw_text": "", +} + +base_attributes_without_raw_text = { + "instructions": "Rewrite the text in a more formal tone.", } # PDF Tests def test_executor_pdf_url_valid(): - ai_resistant_assignment = executor( + rewritten_text = executor( **base_attributes, file_url="https://filesamples.com/samples/document/pdf/sample1.pdf", file_type="pdf" ) - assert isinstance(ai_resistant_assignment, dict) + assert isinstance(rewritten_text, dict) def test_executor_pdf_url_invalid(): with pytest.raises(ValueError) as exc_info: @@ -29,12 +33,12 @@ def test_executor_pdf_url_invalid(): # CSV Tests def test_executor_csv_url_valid(): - ai_resistant_assignment = executor( + rewritten_text = executor( **base_attributes, file_url="https://filesamples.com/samples/document/csv/sample1.csv", file_type="csv" ) - assert isinstance(ai_resistant_assignment, dict) + assert isinstance(rewritten_text, dict) def test_executor_csv_url_invalid(): with pytest.raises(ValueError) as exc_info: @@ -47,12 +51,12 @@ def test_executor_csv_url_invalid(): # TXT Tests def test_executor_txt_url_valid(): - ai_resistant_assignment = executor( + rewritten_text = executor( **base_attributes, file_url="https://filesamples.com/samples/document/txt/sample1.txt", file_type="txt" ) - assert isinstance(ai_resistant_assignment, dict) + assert isinstance(rewritten_text, dict) def test_executor_txt_url_invalid(): with pytest.raises(ValueError) as exc_info: @@ -63,32 +67,14 @@ def test_executor_txt_url_invalid(): ) assert isinstance(exc_info.value, ValueError) -# MD Tests -def test_executor_md_url_valid(): - ai_resistant_assignment = executor( - **base_attributes, - file_url="https://github.com/radicalxdev/kai-ai-backend/blob/main/README.md", - file_type="md" - ) - assert isinstance(ai_resistant_assignment, dict) - -def test_executor_md_url_invalid(): - with pytest.raises(ValueError) as exc_info: - executor( - **base_attributes, - file_url="https://github.com/radicalxdev/kai-ai-backend/blob/main/README.md", - file_type=1 - ) - assert isinstance(exc_info.value, ValueError) - # PPTX Tests def test_executor_pptx_url_valid(): - ai_resistant_assignment = executor( + rewritten_text = executor( **base_attributes, file_url="https://scholar.harvard.edu/files/torman_personal/files/samplepptx.pptx", file_type="pptx" ) - assert isinstance(ai_resistant_assignment, dict) + assert isinstance(rewritten_text, dict) def test_executor_pptx_url_invalid(): with pytest.raises(ValueError) as exc_info: @@ -101,12 +87,12 @@ def test_executor_pptx_url_invalid(): # DOCX Tests def test_executor_docx_url_valid(): - ai_resistant_assignment = executor( + rewritten_text = executor( **base_attributes, file_url="https://filesamples.com/samples/document/docx/sample1.docx", file_type="docx" ) - assert isinstance(ai_resistant_assignment, dict) + assert isinstance(rewritten_text, dict) def test_executor_docx_url_invalid(): with pytest.raises(ValueError) as exc_info: @@ -117,106 +103,91 @@ def test_executor_docx_url_invalid(): ) assert isinstance(exc_info.value, ValueError) -# XLS Tests -def test_executor_xls_url_valid(): - ai_resistant_assignment = executor( - **base_attributes, - file_url="https://filesamples.com/samples/document/xls/sample1.xls", - file_type="xls" - ) - assert isinstance(ai_resistant_assignment, dict) - -def test_executor_xls_url_invalid(): - with pytest.raises(ValueError) as exc_info: - executor( - **base_attributes, - file_url="https://filesamples.com/samples/document/xls/sample1.xls", - file_type=1 - ) - assert isinstance(exc_info.value, ValueError) - -# XLSX Tests -def test_executor_xlsx_url_valid(): - ai_resistant_assignment = executor( - **base_attributes, - file_url="https://filesamples.com/samples/document/xlsx/sample1.xlsx", - file_type="xlsx" - ) - assert isinstance(ai_resistant_assignment, dict) - -def test_executor_xlsx_url_invalid(): +# Invalid file type test +def test_executor_invalid_file_type(): with pytest.raises(ValueError) as exc_info: executor( **base_attributes, file_url="https://filesamples.com/samples/document/xlsx/sample1.xlsx", - file_type=1 + file_type="xlsx" ) assert isinstance(exc_info.value, ValueError) -# XML Tests -def test_executor_xml_url_invalid(): - with pytest.raises(ValueError) as exc_info: - executor( +# GSheets Tests +def test_executor_gsheets_url_valid(): + rewritten_text = executor( **base_attributes, - file_url="https://filesampleshub.com/download/code/xml/dummy.xml", - file_type=1 + file_url="https://docs.google.com/spreadsheets/d/1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms/edit?gid=0#gid=0", + file_type="gsheet" ) - assert isinstance(exc_info.value, ValueError) + assert isinstance(rewritten_text, dict) -# GDocs Tests -def test_executor_gdocs_url_invalid(): +def test_executor_gsheets_url_invalid(): with pytest.raises(ValueError) as exc_info: executor( **base_attributes, - file_url="https://docs.google.com/document/d/1OWQfO9LX6psGipJu9LabzNE22us1Ct/edit", + file_url="https://docs.google.com/spreadsheets/d/1BxiMVs0XRA5nFMdKvBdBZjgmUUqptlbs74OgvE2upms/edit?gid=0#gid=0", file_type=1 ) assert isinstance(exc_info.value, ValueError) -# GSheets Tests -def test_executor_gsheets_url_invalid(): - with pytest.raises(ValueError) as exc_info: - executor( +# Youtube URL Tests +def test_executor_youtube_url_valid(): + rewritten_text = executor( **base_attributes, - file_url="https://docs.google.com/spreadsheets/d/16OPtLLSfU/edit", - file_type=1 + file_url="https://www.youtube.com/watch?v=HgBpFaATdoA", + file_type="youtube_url" ) - assert isinstance(exc_info.value, ValueError) + assert isinstance(rewritten_text, dict) -# GSlides Tests -def test_executor_gslides_url_invalid(): +def test_executor_youtube_url_invalid(): with pytest.raises(ValueError) as exc_info: executor( **base_attributes, - file_url="https://docs.google.com/spreadsheets/d/16OPtLLSfU/edit", + file_url="https://www.youtube.com/watch?v=HgBpFaATdoA", file_type=1 ) assert isinstance(exc_info.value, ValueError) -# GPDFs Tests -def test_executor_gpdfs_url_valid(): - ai_resistant_assignment = executor( +# PPTX Tests +def test_executor_pptx_url_valid(): + rewritten_text = executor( **base_attributes, - file_url="https://drive.google.com/file/d/1fUj1uWIMh6QZsPkt0Vs7mEd2VEqz3O8l/view", - file_type="gpdf" + file_url = "https://getsamplefiles.com/download/pptx/sample-1.pptx", + file_type = "pptx", ) - assert isinstance(ai_resistant_assignment, dict) -def test_executor_gpdfs_url_invalid(): + assert isinstance(rewritten_text, dict) + +def test_executor_pptx_url_invalid(): + with pytest.raises(ValueError) as exc_info: executor( **base_attributes, - file_url="https://drive.google.com/file/d/1fUj1uWIMh6QZsPkt0Vs7mEd2VEqz3O8l/view", - file_type=1 + file_url = "https://getsamplefiles.com/download/pptx/sample-1.pptx", + file_type = "pptx", ) + assert isinstance(exc_info.value, ValueError) -# MP3 Tests -def test_executor_mp3_url_invalid(): - with pytest.raises(ValueError) as exc_info: +# Plain text through text box +def test_executor_plain_text_valid(): + rewritten_text = executor( + **base_attributes_without_raw_text, + raw_text="The quick brown fox jumps over the lazy dog.", + file_url="", + file_type="", + ) + + assert isinstance(rewritten_text, dict) + +def test_executor_plain_text_invalid(): + with pytest.raises(InputValidationError) as exc_info: executor( - **base_attributes, - file_url="https://raw.githubusercontent.com/asleem/uploaded_files/main/dummy.mp3", - file_type=1 + **base_attributes_without_raw_text, + raw_text=1, + file_url="", + file_type="", ) - assert isinstance(exc_info.value, ValueError) + + assert isinstance(exc_info.value, InputValidationError) \ No newline at end of file From 0bad428f2babd8e8814a8beb8b517ec02ea3047b Mon Sep 17 00:00:00 2001 From: Sarang-Nambiar <101126190+Sarang-Nambiar@users.noreply.github.com> Date: Sat, 18 Jan 2025 22:23:58 +0800 Subject: [PATCH 5/6] fix: test case bugs --- app/features/text_rewriter/tests/test_core.py | 26 +++---------------- 1 file changed, 4 insertions(+), 22 deletions(-) diff --git a/app/features/text_rewriter/tests/test_core.py b/app/features/text_rewriter/tests/test_core.py index 9f189eaa..edcca330 100644 --- a/app/features/text_rewriter/tests/test_core.py +++ b/app/features/text_rewriter/tests/test_core.py @@ -68,14 +68,6 @@ def test_executor_txt_url_invalid(): assert isinstance(exc_info.value, ValueError) # PPTX Tests -def test_executor_pptx_url_valid(): - rewritten_text = executor( - **base_attributes, - file_url="https://scholar.harvard.edu/files/torman_personal/files/samplepptx.pptx", - file_type="pptx" - ) - assert isinstance(rewritten_text, dict) - def test_executor_pptx_url_invalid(): with pytest.raises(ValueError) as exc_info: executor( @@ -149,16 +141,6 @@ def test_executor_youtube_url_invalid(): ) assert isinstance(exc_info.value, ValueError) -# PPTX Tests -def test_executor_pptx_url_valid(): - rewritten_text = executor( - **base_attributes, - file_url = "https://getsamplefiles.com/download/pptx/sample-1.pptx", - file_type = "pptx", - ) - - assert isinstance(rewritten_text, dict) - def test_executor_pptx_url_invalid(): with pytest.raises(ValueError) as exc_info: @@ -182,12 +164,12 @@ def test_executor_plain_text_valid(): assert isinstance(rewritten_text, dict) def test_executor_plain_text_invalid(): - with pytest.raises(InputValidationError) as exc_info: + with pytest.raises(ValueError) as exc_info: executor( **base_attributes_without_raw_text, - raw_text=1, + raw_text="", file_url="", - file_type="", + file_type=1, ) - assert isinstance(exc_info.value, InputValidationError) \ No newline at end of file + assert isinstance(exc_info.value, ValueError) \ No newline at end of file From f59a1beaa022b155f13419adc2b74eab82d8a7f4 Mon Sep 17 00:00:00 2001 From: Sarang-Nambiar <101126190+Sarang-Nambiar@users.noreply.github.com> Date: Thu, 23 Jan 2025 16:34:52 +0800 Subject: [PATCH 6/6] chore: imported langchain in core.py + reverted tools_config.json --- app/features/text_rewriter/core.py | 1 + app/tools/utils/tools_config.json | 4 ---- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/app/features/text_rewriter/core.py b/app/features/text_rewriter/core.py index 80f7ee04..515c470a 100644 --- a/app/features/text_rewriter/core.py +++ b/app/features/text_rewriter/core.py @@ -2,6 +2,7 @@ from app.utils.document_loaders import get_docs from app.features.text_rewriter.tools import TextRewriter from app.api.error_utilities import ToolExecutorError +import langchain logger = setup_logger() diff --git a/app/tools/utils/tools_config.json b/app/tools/utils/tools_config.json index 525b8c2e..08c2e241 100644 --- a/app/tools/utils/tools_config.json +++ b/app/tools/utils/tools_config.json @@ -38,9 +38,5 @@ "writing-feedback-generator": { "path": "tools.writing_feedback_generator.core", "metadata_file": "metadata.json" - }, - "text-rewriter": { - "path": "features.text_rewriter.core", - "metadata_file": "metadata.json" } }