From e69a00ab84b1d286bae979fef7b84dfe75ae12fa Mon Sep 17 00:00:00 2001
From: Emmett McFaralne <staminacode@gmail.com>
Date: Wed, 4 Sep 2024 15:56:09 -0400
Subject: [PATCH] Remove JSON code-block formatting from LLM responses

---
 setup.py                |  2 +-
 tests/test_extractor.py |  2 +-
 thepipe/extract.py      | 13 ++++++++++---
 3 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 0ddbe13..136948c 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ def read_git_requirements(file):
 
 setup(
     name='thepipe_api',
-    version='1.2.5',
+    version='1.2.6',
     author='Emmett McFarlane',
     author_email='emmett@thepi.pe',
     description='AI-native extractor, powered by multimodal LLMs.',
diff --git a/tests/test_extractor.py b/tests/test_extractor.py
index 8210d41..9f77706 100644
--- a/tests/test_extractor.py
+++ b/tests/test_extractor.py
@@ -60,7 +60,7 @@ def test_extract_json_from_response(self):
 
     def test_extract(self):
         results, total_tokens_used = extract(
-            chunks=self.chunks,
+            chunks=self.chunks, # receipt
             schema=self.schema,
         )
 
diff --git a/thepipe/extract.py b/thepipe/extract.py
index 270bb82..7323f64 100644
--- a/thepipe/extract.py
+++ b/thepipe/extract.py
@@ -14,14 +14,21 @@
 
 def extract_json_from_response(llm_response: str) -> Union[Dict, List[Dict], None]:
     def clean_response_text(llm_response: str) -> str:
-        return llm_response.encode('utf-8', 'ignore').decode('utf-8')
+        return llm_response.encode('utf-8', 'ignore').decode('utf-8').strip()
     
-    llm_response = llm_response.strip()
+    # try to match inside of code block
     code_block_pattern = r'^```(?:json)?\s*([\s\S]*?)\s*```$'
     match = re.match(code_block_pattern, llm_response, re.MULTILINE | re.DOTALL)
     if match:
-        llm_response = match.group(1).strip()
+        llm_response = match.group(1)
     llm_response = clean_response_text(llm_response)
+
+    # try to remove code block formatting if still present
+    if llm_response.startswith("```json") and llm_response.endswith("```"):
+        llm_response = llm_response[len("```json"):-len("```")]
+    llm_response = clean_response_text(llm_response)
+
+    # parse json by matching curly braces
     try:
         parsed_json = json.loads(llm_response)
         return parsed_json