Skip to content

Commit

Permalink
Fix scraper error response parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
emcf committed Jan 2, 2025
1 parent 5674c1b commit 8172ef1
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 27 deletions.
49 changes: 28 additions & 21 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,45 @@
from setuptools import setup, find_packages


def read_requirements(file):
    """Parse a pip requirements file into a list of requirement strings.

    Blank lines and lines beginning with '#' (comments) or 'git+' (VCS
    requirements) are skipped; 'git+' lines are collected separately by
    read_git_requirements().

    :param file: path to the requirements file
    :return: list of stripped requirement strings
    """
    with open(file, encoding="utf-8") as f:
        return [
            line.strip()
            for line in f
            # note: the '#'/'git+' prefix checks run on the *unstripped*
            # line, so indented comments are NOT filtered out (historical
            # behavior, preserved here)
            if line.strip() and not line.startswith("#") and not line.startswith("git+")
        ]


def read_git_requirements(file):
    """Return the VCS requirement lines ('git+...') from a pip requirements file.

    These lines are excluded by read_requirements() and are passed to
    setuptools' dependency_links instead.

    :param file: path to the requirements file
    :return: list of stripped 'git+...' requirement strings
    """
    with open(file, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip().startswith("git+")]


# Read the long description up front inside a context manager so the file
# handle is closed promptly (a bare open(...).read() inline in the setup()
# call below would leave the handle open until GC).
with open("README.md", encoding="utf-8") as readme:
    long_description = readme.read()

setup(
    name="thepipe_api",
    version="1.4.0",
    author="Emmett McFarlane",
    author_email="emmett@thepi.pe",
    description="Document extraction, powered by multimodal LLMs.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/emcf/thepipe",
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.9",
    # Core dependencies; 'git+' lines are excluded here and surfaced via
    # dependency_links below.
    install_requires=read_requirements("requirements.txt"),
    include_package_data=True,
    entry_points={
        "console_scripts": [
            "thepipe=thepipe.__init__:main",
        ],
    },
    # Optional extras for fully-local extraction: pip install thepipe_api[local]
    extras_require={
        "local": read_requirements("local.txt"),
    },
    # NOTE(review): dependency_links is deprecated in modern pip/setuptools
    # and ignored by default — consider PEP 508 direct references instead.
    dependency_links=read_git_requirements("local.txt"),
)
24 changes: 18 additions & 6 deletions thepipe/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,16 @@ def scrape_file(
),
},
)
if "error" in response.content.decode("utf-8"):
error_message = json.loads(response.content.decode("utf-8"))["error"]
raise ValueError(f"Error scraping {filepath}: {error_message}")
response.raise_for_status()
for line in response.iter_lines(decode_unicode=True):
# each line is its own JSON object
if not line.strip():
continue # skip blank lines
data = json.loads(line)
# If the server sent an error for this chunk, handle it
if "error" in data:
raise ValueError(f"Error scraping: {data['error']}")

chunks = []
for line in response.iter_lines():
if line:
Expand Down Expand Up @@ -729,10 +735,16 @@ def scrape_url(
}
data["urls"] = url
response = requests.post(endpoint, headers=headers, data=data, stream=True)
if "error" in response.content.decode("utf-8"):
error_message = json.loads(response.content.decode("utf-8"))["error"]
raise ValueError(f"Error scraping {url}: {error_message}")
response.raise_for_status()
for line in response.iter_lines(decode_unicode=True):
# each line is its own JSON object
if not line.strip():
continue # skip blank lines
data = json.loads(line)
# If the server sent an error for this chunk, handle it
if "error" in data:
raise ValueError(f"Error scraping: {data['error']}")

results = []
for line in response.iter_lines():
if line:
Expand Down

0 comments on commit 8172ef1

Please sign in to comment.