Skip to content

Commit

Permalink
Fix scraper error response parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
emcf committed Jan 2, 2025
1 parent 5674c1b commit 8172ef1
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 27 deletions.
49 changes: 28 additions & 21 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,45 @@
from setuptools import setup, find_packages


def read_requirements(file):
    """Parse a pip requirements file into a list of requirement strings.

    Blank lines and lines beginning with '#' (comments) or 'git+' (VCS
    requirements) are skipped; 'git+' lines are collected separately by
    read_git_requirements().

    :param file: path to the requirements file
    :return: list of stripped requirement strings
    """
    with open(file, encoding="utf-8") as f:
        return [
            line.strip()
            for line in f
            # note: the '#'/'git+' prefix checks run on the *unstripped*
            # line, so indented comments are NOT filtered out (historical
            # behavior, preserved here)
            if line.strip() and not line.startswith("#") and not line.startswith("git+")
        ]


def read_git_requirements(file):
    """Return the VCS requirement lines ('git+...') from a pip requirements file.

    These lines are excluded by read_requirements() and are passed to
    setuptools' dependency_links instead.

    :param file: path to the requirements file
    :return: list of stripped 'git+...' requirement strings
    """
    with open(file, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip().startswith("git+")]


# Read the long description up front inside a context manager so the file
# handle is closed promptly (a bare open(...).read() inline in the setup()
# call below would leave the handle open until GC).
with open("README.md", encoding="utf-8") as readme:
    long_description = readme.read()

setup(
    name="thepipe_api",
    version="1.4.0",
    author="Emmett McFarlane",
    author_email="emmett@thepi.pe",
    description="Document extraction, powered by multimodal LLMs.",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/emcf/thepipe",
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.9",
    # Core dependencies; 'git+' lines are excluded here and surfaced via
    # dependency_links below.
    install_requires=read_requirements("requirements.txt"),
    include_package_data=True,
    entry_points={
        "console_scripts": [
            "thepipe=thepipe.__init__:main",
        ],
    },
    # Optional extras for fully-local extraction: pip install thepipe_api[local]
    extras_require={
        "local": read_requirements("local.txt"),
    },
    # NOTE(review): dependency_links is deprecated in modern pip/setuptools
    # and ignored by default — consider PEP 508 direct references instead.
    dependency_links=read_git_requirements("local.txt"),
)
24 changes: 18 additions & 6 deletions thepipe/scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,16 @@ def scrape_file(
),
},
)
if "error" in response.content.decode("utf-8"):
error_message = json.loads(response.content.decode("utf-8"))["error"]
raise ValueError(f"Error scraping {filepath}: {error_message}")
response.raise_for_status()
for line in response.iter_lines(decode_unicode=True):
# each line is its own JSON object
if not line.strip():
continue # skip blank lines
data = json.loads(line)
# If the server sent an error for this chunk, handle it
if "error" in data:
raise ValueError(f"Error scraping: {data['error']}")

chunks = []
for line in response.iter_lines():
if line:
Expand Down Expand Up @@ -729,10 +735,16 @@ def scrape_url(
}
data["urls"] = url
response = requests.post(endpoint, headers=headers, data=data, stream=True)
if "error" in response.content.decode("utf-8"):
error_message = json.loads(response.content.decode("utf-8"))["error"]
raise ValueError(f"Error scraping {url}: {error_message}")
response.raise_for_status()
for line in response.iter_lines(decode_unicode=True):
# each line is its own JSON object
if not line.strip():
continue # skip blank lines
data = json.loads(line)
# If the server sent an error for this chunk, handle it
if "error" in data:
raise ValueError(f"Error scraping: {data['error']}")

results = []
for line in response.iter_lines():
if line:
Expand Down

0 comments on commit 8172ef1

Please sign in to comment.