From e69a00ab84b1d286bae979fef7b84dfe75ae12fa Mon Sep 17 00:00:00 2001 From: Emmett McFaralne Date: Wed, 4 Sep 2024 15:56:09 -0400 Subject: [PATCH] added json block formatting removal --- setup.py | 2 +- tests/test_extractor.py | 2 +- thepipe/extract.py | 13 ++++++++++--- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 0ddbe13..136948c 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ def read_git_requirements(file): setup( name='thepipe_api', - version='1.2.5', + version='1.2.6', author='Emmett McFarlane', author_email='emmett@thepi.pe', description='AI-native extractor, powered by multimodal LLMs.', diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 8210d41..9f77706 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -60,7 +60,7 @@ def test_extract_json_from_response(self): def test_extract(self): results, total_tokens_used = extract( - chunks=self.chunks, + chunks=self.chunks, # receipt schema=self.schema, ) diff --git a/thepipe/extract.py b/thepipe/extract.py index 270bb82..7323f64 100644 --- a/thepipe/extract.py +++ b/thepipe/extract.py @@ -14,14 +14,21 @@ def extract_json_from_response(llm_response: str) -> Union[Dict, List[Dict], None]: def clean_response_text(llm_response: str) -> str: - return llm_response.encode('utf-8', 'ignore').decode('utf-8') + return llm_response.encode('utf-8', 'ignore').decode('utf-8').strip() - llm_response = llm_response.strip() + # try to match inside of code block code_block_pattern = r'^```(?:json)?\s*([\s\S]*?)\s*```$' match = re.match(code_block_pattern, llm_response, re.MULTILINE | re.DOTALL) if match: - llm_response = match.group(1).strip() + llm_response = match.group(1) llm_response = clean_response_text(llm_response) + + # try to remove code block formatting if still present + if llm_response.startswith("```json") and llm_response.endswith("```"): + llm_response = llm_response[len("```json"):-len("```")] + llm_response = clean_response_text(llm_response) + + # parse json by matching curly braces try: parsed_json = json.loads(llm_response) return parsed_json