diff --git a/CHANGELOG.md b/CHANGELOG.md index 3c2e8fa9..58e8e26c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.0.82 + +* Bump to `unstructured` 0.16.11 +* No longer attempts to download NLTK asset from S3 which could result in a 403 + ## 0.0.81 * Update `strategy` parameter to allow `'` and `"` as input surrounding the value. diff --git a/prepline_general/api/app.py b/prepline_general/api/app.py index 60247c30..798d1815 100644 --- a/prepline_general/api/app.py +++ b/prepline_general/api/app.py @@ -13,7 +13,7 @@ app = FastAPI( title="Unstructured Pipeline API", summary="Partition documents with the Unstructured library", - version="0.0.81", + version="0.0.82", docs_url="/general/docs", openapi_url="/general/openapi.json", servers=[ diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index 7623702b..e378ab04 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -43,7 +43,6 @@ elements_from_json, ) from unstructured_inference.models.base import UnknownModelException -from unstructured_inference.models.chipper import MODEL_TYPES as CHIPPER_MODEL_TYPES app = FastAPI() router = APIRouter() @@ -214,37 +213,6 @@ def partition_pdf_splits( return results -is_chipper_processing = False - - -class ChipperMemoryProtection: - """Chipper calls are expensive, and right now we can only do one call at a time. - - If the model is in use, return a 503 error. The API should scale up and the user can try again - on a different server. - """ - - def __enter__(self): - global is_chipper_processing - if is_chipper_processing: - # Log here so we can track how often it happens - logger.error("Chipper is already is use") - raise HTTPException( - status_code=503, detail="Server is under heavy load. Please try again later." - ) - - is_chipper_processing = True - - def __exit__( - self, - exc_type: Optional[type[BaseException]], - exc_value: Optional[BaseException], - exc_tb: Optional[TracebackType], - ): - global is_chipper_processing - is_chipper_processing = False - - def pipeline_api( file: IO[bytes], request: Request, @@ -331,7 +299,6 @@ def pipeline_api( if file_content_type == "application/pdf": _check_pdf(file) - hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates) strategy = _validate_strategy(strategy) pdf_infer_table_structure = _set_pdf_infer_table_structure( pdf_infer_table_structure, @@ -417,9 +384,6 @@ def pipeline_api( coordinates=coordinates, **partition_kwargs, # type: ignore # pyright: ignore[reportGeneralTypeIssues] ) - elif hi_res_model_name and hi_res_model_name in CHIPPER_MODEL_TYPES: - with ChipperMemoryProtection(): - elements = partition(**partition_kwargs) # type: ignore # pyright: ignore[reportGeneralTypeIssues] else: elements = partition(**partition_kwargs) # type: ignore # pyright: ignore[reportGeneralTypeIssues] @@ -533,21 +497,6 @@ def _validate_strategy(strategy: str) -> str: return strategy -def _validate_hi_res_model_name( - hi_res_model_name: Optional[str], show_coordinates: bool -) -> Optional[str]: - # Make sure chipper aliases to the latest model - if hi_res_model_name and hi_res_model_name == "chipper": - hi_res_model_name = "chipperv2" - - if hi_res_model_name and hi_res_model_name in CHIPPER_MODEL_TYPES and show_coordinates: - raise HTTPException( - status_code=400, - detail=f"coordinates aren't available when using the {hi_res_model_name} model type", - ) - return hi_res_model_name - - def _validate_chunking_strategy(chunking_strategy: Optional[str]) -> Optional[str]: """Raise on `chunking_strategy` is not a valid chunking strategy name. @@ -653,7 +602,7 @@ def return_content_type(filename: str): @router.get("/general/v0/general", include_in_schema=False) -@router.get("/general/v0.0.81/general", include_in_schema=False) +@router.get("/general/v0.0.82/general", include_in_schema=False) async def handle_invalid_get_request(): raise HTTPException( status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported." @@ -668,7 +617,7 @@ async def handle_invalid_get_request(): description="Description", operation_id="partition_parameters", ) -@router.post("/general/v0.0.81/general", include_in_schema=False) +@router.post("/general/v0.0.82/general", include_in_schema=False) def general_partition( request: Request, # cannot use annotated type here because of a bug described here: diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml index 01a64677..116f88e1 100644 --- a/preprocessing-pipeline-family.yaml +++ b/preprocessing-pipeline-family.yaml @@ -1,2 +1,2 @@ name: general -version: 0.0.81 +version: 0.0.82 diff --git a/requirements/base.txt b/requirements/base.txt index 47934cf0..61458431 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,14 +1,16 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # -# pip-compile requirements/base.in +# pip-compile --config=pyproject.toml requirements/base.in # +aiofiles==24.1.0 + # via unstructured-client annotated-types==0.7.0 # via pydantic antlr4-python3-runtime==4.9.3 # via omegaconf -anyio==4.6.0 +anyio==4.7.0 # via # httpx # starlette @@ -25,16 +27,14 @@ certifi==2024.8.30 # httpcore # httpx # requests - # unstructured-client cffi==1.17.1 # via cryptography chardet==5.2.0 # via unstructured -charset-normalizer==3.3.2 +charset-normalizer==3.4.0 # via # pdfminer-six # requests - # unstructured-client click==8.1.3 # via # -r requirements/base.in @@ -43,30 +43,26 @@ click==8.1.3 # uvicorn coloredlogs==15.0.1 # via onnxruntime -contourpy==1.3.0 +contourpy==1.3.1 # via matplotlib -cryptography==43.0.1 +cryptography==44.0.0 # via # pdfminer-six # unstructured-client cycler==0.12.1 # via matplotlib dataclasses-json==0.6.7 - # via - # unstructured - # unstructured-client -deepdiff==8.0.1 - # via unstructured-client -deprecated==1.2.14 + # via unstructured +deprecated==1.2.15 # via pikepdf effdet==0.4.1 # via unstructured -emoji==2.13.0 +emoji==2.14.0 # via unstructured -et-xmlfile==1.1.0 +et-xmlfile==2.0.0 # via openpyxl -exceptiongroup==1.2.2 - # via anyio +eval-type-backport==0.2.0 + # via unstructured-client fastapi==0.113.0 # via -r requirements/base.in filelock==3.16.1 @@ -78,39 +74,41 @@ filetype==1.2.0 # via unstructured flatbuffers==24.3.25 # via onnxruntime -fonttools==4.54.0 +fonttools==4.55.3 # via matplotlib -fsspec==2024.9.0 +fsspec==2024.10.0 # via # huggingface-hub # torch -google-api-core[grpc]==2.20.0 +google-api-core[grpc]==2.24.0 # via google-cloud-vision -google-auth==2.35.0 +google-auth==2.37.0 # via # google-api-core # google-cloud-vision -google-cloud-vision==3.7.4 +google-cloud-vision==3.9.0 # via unstructured -googleapis-common-protos==1.65.0 +googleapis-common-protos==1.66.0 # via # google-api-core # grpcio-status -grpcio==1.66.1 +grpcio==1.68.1 # via # google-api-core # grpcio-status -grpcio-status==1.66.1 +grpcio-status==1.68.1 # via google-api-core h11==0.14.0 # via # httpcore # uvicorn -httpcore==1.0.5 +html5lib==1.1 + # via unstructured +httpcore==1.0.7 # via httpx -httpx==0.27.2 +httpx==0.28.1 # via unstructured-client -huggingface-hub==0.25.1 +huggingface-hub==0.26.5 # via # timm # tokenizers @@ -123,7 +121,6 @@ idna==3.10 # anyio # httpx # requests - # unstructured-client iopath==0.1.10 # via layoutparser jinja2==3.1.4 @@ -146,25 +143,21 @@ lxml==5.3.0 # unstructured markdown==3.7 # via unstructured -markupsafe==2.1.5 +markupsafe==3.0.2 # via jinja2 -marshmallow==3.22.0 - # via - # dataclasses-json - # unstructured-client -matplotlib==3.9.2 +marshmallow==3.23.1 + # via dataclasses-json +matplotlib==3.9.4 # via # pycocotools # unstructured-inference mpmath==1.3.0 # via sympy mypy-extensions==1.0.0 - # via - # typing-inspect - # unstructured-client + # via typing-inspect nest-asyncio==1.6.0 # via unstructured-client -networkx==3.3 +networkx==3.4.2 # via # torch # unstructured @@ -185,15 +178,16 @@ numpy==1.26.4 # torchvision # transformers # unstructured + # unstructured-inference olefile==0.47 # via python-oxmsg omegaconf==2.3.0 # via effdet -onnx==1.16.2 +onnx==1.17.0 # via # unstructured # unstructured-inference -onnxruntime==1.19.2 +onnxruntime==1.20.1 # via unstructured-inference opencv-python==4.10.0.84 # via @@ -201,9 +195,7 @@ opencv-python==4.10.0.84 # unstructured-inference openpyxl==3.1.5 # via unstructured -orderly-set==5.2.2 - # via deepdiff -packaging==24.1 +packaging==24.2 # via # huggingface-hub # marshmallow @@ -211,7 +203,6 @@ packaging==24.1 # onnxruntime # pikepdf # transformers - # unstructured-client # unstructured-pytesseract pandas==2.2.3 # via @@ -227,11 +218,11 @@ pdfminer-six==20231228 # unstructured pdfplumber==0.11.4 # via layoutparser -pi-heif==0.18.0 +pi-heif==0.21.0 # via unstructured -pikepdf==9.2.1 +pikepdf==9.4.2 # via unstructured -pillow==10.4.0 +pillow==11.0.0 # via # layoutparser # matplotlib @@ -242,13 +233,13 @@ pillow==10.4.0 # python-pptx # torchvision # unstructured-pytesseract -portalocker==2.10.1 +portalocker==3.0.0 # via iopath -proto-plus==1.24.0 +proto-plus==1.25.0 # via # google-api-core # google-cloud-vision -protobuf==5.28.2 +protobuf==5.29.1 # via # google-api-core # google-cloud-vision @@ -257,7 +248,7 @@ protobuf==5.28.2 # onnx # onnxruntime # proto-plus -psutil==6.0.0 +psutil==6.1.0 # via # -r requirements/base.in # unstructured @@ -271,17 +262,19 @@ pycocotools==2.0.8 # via effdet pycparser==2.22 # via cffi -pycryptodome==3.20.0 +pycryptodome==3.21.0 # via -r requirements/base.in pydantic==2.9.2 - # via fastapi + # via + # fastapi + # unstructured-client pydantic-core==2.23.4 # via pydantic -pypandoc==1.13 +pypandoc==1.14 # via unstructured -pyparsing==3.1.4 +pyparsing==3.2.0 # via matplotlib -pypdf==5.0.0 +pypdf==5.1.0 # via # -r requirements/base.in # unstructured @@ -295,11 +288,11 @@ python-dateutil==2.9.0.post0 # unstructured-client python-docx==1.1.2 # via unstructured -python-iso639==2024.4.27 +python-iso639==2024.10.22 # via unstructured python-magic==0.4.27 # via unstructured -python-multipart==0.0.10 +python-multipart==0.0.19 # via unstructured-inference python-oxmsg==0.0.1 # via unstructured @@ -314,13 +307,13 @@ pyyaml==6.0.2 # omegaconf # timm # transformers -rapidfuzz==3.10.0 +rapidfuzz==3.10.1 # via # unstructured # unstructured-inference ratelimit==2.2.1 # via -r requirements/base.in -regex==2024.9.11 +regex==2024.11.6 # via # nltk # transformers @@ -332,7 +325,6 @@ requests==2.32.3 # requests-toolbelt # transformers # unstructured - # unstructured-client requests-toolbelt==1.0.0 # via unstructured-client rsa==4.9 @@ -343,89 +335,81 @@ safetensors==0.4.5 # transformers scipy==1.14.1 # via layoutparser -six==1.16.0 +six==1.17.0 # via + # html5lib # langdetect # python-dateutil - # unstructured-client sniffio==1.3.1 - # via - # anyio - # httpx + # via anyio soupsieve==2.6 # via beautifulsoup4 starlette==0.38.6 # via fastapi -sympy==1.13.3 +sympy==1.13.1 # via # onnxruntime # torch -tabulate==0.9.0 - # via unstructured -timm==1.0.9 +timm==1.0.12 # via # effdet # unstructured-inference -tokenizers==0.19.1 +tokenizers==0.21.0 # via transformers -torch==2.4.1 +torch==2.5.1 # via # effdet # timm # torchvision # unstructured-inference -torchvision==0.19.1 +torchvision==0.20.1 # via # effdet # timm -tqdm==4.66.5 +tqdm==4.67.1 # via # huggingface-hub # iopath # nltk # transformers # unstructured -transformers==4.44.2 +transformers==4.47.0 # via unstructured-inference typing-extensions==4.12.2 # via # anyio - # emoji # fastapi # huggingface-hub # iopath # pydantic # pydantic-core - # pypdf # python-docx # python-oxmsg # python-pptx # torch # typing-inspect # unstructured - # unstructured-client - # uvicorn typing-inspect==0.9.0 # via # dataclasses-json # unstructured-client -tzdata==2024.1 +tzdata==2024.2 # via pandas -unstructured[all-docs]==0.15.13 +unstructured[all-docs]==0.16.11 # via -r requirements/base.in -unstructured-client==0.25.9 +unstructured-client==0.28.1 # via unstructured -unstructured-inference==0.7.36 +unstructured-inference==0.8.1 # via unstructured unstructured-pytesseract==0.3.13 # via unstructured urllib3==2.2.3 - # via - # requests - # unstructured-client -uvicorn==0.30.6 + # via requests +uvicorn==0.32.1 # via -r requirements/base.in -wrapt==1.16.0 +webencodings==0.5.1 + # via html5lib +wrapt==1.17.0 # via # deprecated # unstructured diff --git a/requirements/test.txt b/requirements/test.txt index cbfbd943..efbac57c 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,9 +1,13 @@ # -# This file is autogenerated by pip-compile with Python 3.10 +# This file is autogenerated by pip-compile with Python 3.11 # by the following command: # -# pip-compile --output-file=requirements/test.txt requirements/base.txt requirements/test.in +# pip-compile --config=pyproject.toml --output-file=requirements/test.txt requirements/base.txt requirements/test.in # +aiofiles==24.1.0 + # via + # -r requirements/base.txt + # unstructured-client annotated-types==0.7.0 # via # -r requirements/base.txt @@ -12,7 +16,7 @@ antlr4-python3-runtime==4.9.3 # via # -r requirements/base.txt # omegaconf -anyio==4.6.0 +anyio==4.7.0 # via # -r requirements/base.txt # httpx @@ -26,7 +30,7 @@ argon2-cffi-bindings==21.2.0 # via argon2-cffi arrow==1.3.0 # via isoduration -asttokens==2.4.1 +asttokens==3.0.0 # via # nbdev # stack-data @@ -49,9 +53,9 @@ beautifulsoup4==4.12.3 # -r requirements/base.txt # nbconvert # unstructured -black==24.8.0 +black==24.10.0 # via -r requirements/test.in -bleach==6.1.0 +bleach==6.2.0 # via nbconvert cachetools==5.5.0 # via @@ -63,7 +67,6 @@ certifi==2024.8.30 # httpcore # httpx # requests - # unstructured-client cffi==1.17.1 # via # -r requirements/base.txt @@ -73,12 +76,11 @@ chardet==5.2.0 # via # -r requirements/base.txt # unstructured -charset-normalizer==3.3.2 +charset-normalizer==3.4.0 # via # -r requirements/base.txt # pdfminer-six # requests - # unstructured-client click==8.1.3 # via # -r requirements/base.txt @@ -95,13 +97,13 @@ comm==0.2.2 # via # ipykernel # ipywidgets -contourpy==1.3.0 +contourpy==1.3.1 # via # -r requirements/base.txt # matplotlib -coverage[toml]==7.6.1 +coverage[toml]==7.6.9 # via pytest-cov -cryptography==43.0.1 +cryptography==44.0.0 # via # -r requirements/base.txt # pdfminer-six @@ -114,19 +116,15 @@ dataclasses-json==0.6.7 # via # -r requirements/base.txt # unstructured - # unstructured-client -debugpy==1.8.5 +debugpy==1.8.11 # via ipykernel decorator==5.1.1 # via ipython deepdiff==8.0.1 - # via - # -r requirements/base.txt - # -r requirements/test.in - # unstructured-client + # via -r requirements/test.in defusedxml==0.7.1 # via nbconvert -deprecated==1.2.14 +deprecated==1.2.15 # via # -r requirements/base.txt # pikepdf @@ -134,32 +132,30 @@ effdet==0.4.1 # via # -r requirements/base.txt # unstructured -emoji==2.13.0 +emoji==2.14.0 # via # -r requirements/base.txt # unstructured -et-xmlfile==1.1.0 +et-xmlfile==2.0.0 # via # -r requirements/base.txt # openpyxl -exceptiongroup==1.2.2 +eval-type-backport==0.2.0 # via # -r requirements/base.txt - # anyio - # ipython - # pytest -execnb==0.1.6 + # unstructured-client +execnb==0.1.11 # via nbdev executing==2.1.0 # via stack-data fastapi==0.113.0 # via -r requirements/base.txt -fastcore==1.7.9 +fastcore==1.7.26 # via # execnb # ghapi # nbdev -fastjsonschema==2.20.0 +fastjsonschema==2.21.1 # via nbformat filelock==3.16.1 # via @@ -177,43 +173,43 @@ flatbuffers==24.3.25 # via # -r requirements/base.txt # onnxruntime -fonttools==4.54.0 +fonttools==4.55.3 # via # -r requirements/base.txt # matplotlib fqdn==1.5.1 # via jsonschema -fsspec==2024.9.0 +fsspec==2024.10.0 # via # -r requirements/base.txt # huggingface-hub # torch ghapi==1.0.6 # via nbdev -google-api-core[grpc]==2.20.0 +google-api-core[grpc]==2.24.0 # via # -r requirements/base.txt # google-cloud-vision -google-auth==2.35.0 +google-auth==2.37.0 # via # -r requirements/base.txt # google-api-core # google-cloud-vision -google-cloud-vision==3.7.4 +google-cloud-vision==3.9.0 # via # -r requirements/base.txt # unstructured -googleapis-common-protos==1.65.0 +googleapis-common-protos==1.66.0 # via # -r requirements/base.txt # google-api-core # grpcio-status -grpcio==1.66.1 +grpcio==1.68.1 # via # -r requirements/base.txt # google-api-core # grpcio-status -grpcio-status==1.66.1 +grpcio-status==1.68.1 # via # -r requirements/base.txt # google-api-core @@ -222,17 +218,21 @@ h11==0.14.0 # -r requirements/base.txt # httpcore # uvicorn -httpcore==1.0.5 +html5lib==1.1 + # via + # -r requirements/base.txt + # unstructured +httpcore==1.0.7 # via # -r requirements/base.txt # httpx -httpx==0.27.2 +httpx==0.28.1 # via # -r requirements/base.txt # -r requirements/test.in # jupyterlab # unstructured-client -huggingface-hub==0.25.1 +huggingface-hub==0.26.5 # via # -r requirements/base.txt # timm @@ -250,7 +250,6 @@ idna==3.10 # httpx # jsonschema # requests - # unstructured-client iniconfig==2.0.0 # via pytest iopath==0.1.10 @@ -262,7 +261,7 @@ ipykernel==6.29.5 # jupyter # jupyter-console # jupyterlab -ipython==8.27.0 +ipython==8.30.0 # via # execnb # ipykernel @@ -272,7 +271,7 @@ ipywidgets==8.1.5 # via jupyter isoduration==20.11.0 # via jsonschema -jedi==0.19.1 +jedi==0.19.2 # via ipython jinja2==3.1.4 # via @@ -286,7 +285,7 @@ joblib==1.4.2 # via # -r requirements/base.txt # nltk -json5==0.9.25 +json5==0.10.0 # via jupyterlab-server jsonpath-python==1.0.6 # via @@ -299,7 +298,7 @@ jsonschema[format-nongpl]==4.23.0 # jupyter-events # jupyterlab-server # nbformat -jsonschema-specifications==2023.12.1 +jsonschema-specifications==2024.10.1 # via jsonschema jupyter==1.1.1 # via -r requirements/test.in @@ -334,7 +333,7 @@ jupyter-server==2.14.2 # notebook-shim jupyter-server-terminals==0.5.3 # via jupyter-server -jupyterlab==4.2.5 +jupyterlab==4.3.3 # via # jupyter # notebook @@ -369,17 +368,16 @@ markdown==3.7 # via # -r requirements/base.txt # unstructured -markupsafe==2.1.5 +markupsafe==3.0.2 # via # -r requirements/base.txt # jinja2 # nbconvert -marshmallow==3.22.0 +marshmallow==3.23.1 # via # -r requirements/base.txt # dataclasses-json - # unstructured-client -matplotlib==3.9.2 +matplotlib==3.9.4 # via # -r requirements/base.txt # pycocotools @@ -396,7 +394,7 @@ mpmath==1.3.0 # via # -r requirements/base.txt # sympy -mypy==1.11.2 +mypy==1.13.0 # via -r requirements/test.in mypy-extensions==1.0.0 # via @@ -404,14 +402,13 @@ mypy-extensions==1.0.0 # black # mypy # typing-inspect - # unstructured-client -nbclient==0.10.0 +nbclient==0.10.1 # via nbconvert nbconvert==7.16.4 # via # jupyter # jupyter-server -nbdev==2.3.31 +nbdev==2.3.34 # via -r requirements/test.in nbformat==5.10.4 # via @@ -423,7 +420,7 @@ nest-asyncio==1.6.0 # -r requirements/base.txt # ipykernel # unstructured-client -networkx==3.3 +networkx==3.4.2 # via # -r requirements/base.txt # torch @@ -432,7 +429,7 @@ nltk==3.9.1 # via # -r requirements/base.txt # unstructured -notebook==7.2.2 +notebook==7.3.1 # via jupyter notebook-shim==0.2.4 # via @@ -454,6 +451,7 @@ numpy==1.26.4 # torchvision # transformers # unstructured + # unstructured-inference olefile==0.47 # via # -r requirements/base.txt @@ -462,12 +460,12 @@ omegaconf==2.3.0 # via # -r requirements/base.txt # effdet -onnx==1.16.2 +onnx==1.17.0 # via # -r requirements/base.txt # unstructured # unstructured-inference -onnxruntime==1.19.2 +onnxruntime==1.20.1 # via # -r requirements/base.txt # unstructured-inference @@ -481,12 +479,10 @@ openpyxl==3.1.5 # -r requirements/base.txt # unstructured orderly-set==5.2.2 - # via - # -r requirements/base.txt - # deepdiff + # via deepdiff overrides==7.7.0 # via jupyter-server -packaging==24.1 +packaging==24.2 # via # -r requirements/base.txt # black @@ -505,7 +501,6 @@ packaging==24.1 # pikepdf # pytest # transformers - # unstructured-client # unstructured-pytesseract pandas==2.2.3 # via @@ -534,15 +529,15 @@ pdfplumber==0.11.4 # layoutparser pexpect==4.9.0 # via ipython -pi-heif==0.18.0 +pi-heif==0.21.0 # via # -r requirements/base.txt # unstructured -pikepdf==9.2.1 +pikepdf==9.4.2 # via # -r requirements/base.txt # unstructured -pillow==10.4.0 +pillow==11.0.0 # via # -r requirements/base.txt # layoutparser @@ -560,22 +555,22 @@ platformdirs==4.3.6 # jupyter-core pluggy==1.5.0 # via pytest -portalocker==2.10.1 +portalocker==3.0.0 # via # -r requirements/base.txt # iopath -prometheus-client==0.21.0 +prometheus-client==0.21.1 # via jupyter-server -prompt-toolkit==3.0.47 +prompt-toolkit==3.0.48 # via # ipython # jupyter-console -proto-plus==1.24.0 +proto-plus==1.25.0 # via # -r requirements/base.txt # google-api-core # google-cloud-vision -protobuf==5.28.2 +protobuf==5.29.1 # via # -r requirements/base.txt # google-api-core @@ -585,7 +580,7 @@ protobuf==5.28.2 # onnx # onnxruntime # proto-plus -psutil==6.0.0 +psutil==6.1.0 # via # -r requirements/base.txt # ipykernel @@ -615,12 +610,13 @@ pycparser==2.22 # via # -r requirements/base.txt # cffi -pycryptodome==3.20.0 +pycryptodome==3.21.0 # via -r requirements/base.txt pydantic==2.9.2 # via # -r requirements/base.txt # fastapi + # unstructured-client pydantic-core==2.23.4 # via # -r requirements/base.txt @@ -632,15 +628,15 @@ pygments==2.18.0 # ipython # jupyter-console # nbconvert -pypandoc==1.13 +pypandoc==1.14 # via # -r requirements/base.txt # unstructured -pyparsing==3.1.4 +pyparsing==3.2.0 # via # -r requirements/base.txt # matplotlib -pypdf==5.0.0 +pypdf==5.1.0 # via # -r requirements/base.txt # unstructured @@ -649,11 +645,11 @@ pypdfium2==4.30.0 # via # -r requirements/base.txt # pdfplumber -pytest==8.3.3 +pytest==8.3.4 # via # pytest-cov # pytest-mock -pytest-cov==5.0.0 +pytest-cov==6.0.0 # via -r requirements/test.in pytest-mock==3.14.0 # via -r requirements/test.in @@ -669,17 +665,17 @@ python-docx==1.1.2 # via # -r requirements/base.txt # unstructured -python-iso639==2024.4.27 +python-iso639==2024.10.22 # via # -r requirements/base.txt # unstructured -python-json-logger==2.0.7 +python-json-logger==3.2.0 # via jupyter-events python-magic==0.4.27 # via # -r requirements/base.txt # unstructured -python-multipart==0.0.10 +python-multipart==0.0.19 # via # -r requirements/base.txt # unstructured-inference @@ -711,7 +707,7 @@ pyzmq==26.2.0 # jupyter-client # jupyter-console # jupyter-server -rapidfuzz==3.10.0 +rapidfuzz==3.10.1 # via # -r requirements/base.txt # unstructured @@ -723,7 +719,7 @@ referencing==0.35.1 # jsonschema # jsonschema-specifications # jupyter-events -regex==2024.9.11 +regex==2024.11.6 # via # -r requirements/base.txt # nltk @@ -737,7 +733,6 @@ requests==2.32.3 # requests-toolbelt # transformers # unstructured - # unstructured-client requests-toolbelt==1.0.0 # via # -r requirements/base.txt @@ -750,7 +745,7 @@ rfc3986-validator==0.1.1 # via # jsonschema # jupyter-events -rpds-py==0.20.0 +rpds-py==0.22.3 # via # jsonschema # referencing @@ -769,21 +764,18 @@ scipy==1.14.1 # layoutparser send2trash==1.8.3 # via jupyter-server -six==1.16.0 +six==1.17.0 # via # -r requirements/base.txt - # asttokens # astunparse - # bleach + # html5lib # langdetect # python-dateutil # rfc3339-validator - # unstructured-client sniffio==1.3.1 # via # -r requirements/base.txt # anyio - # httpx soupsieve==2.6 # via # -r requirements/base.txt @@ -794,50 +786,39 @@ starlette==0.38.6 # via # -r requirements/base.txt # fastapi -sympy==1.13.3 +sympy==1.13.1 # via # -r requirements/base.txt # onnxruntime # torch -tabulate==0.9.0 - # via - # -r requirements/base.txt - # unstructured terminado==0.18.1 # via # jupyter-server # jupyter-server-terminals -timm==1.0.9 +timm==1.0.12 # via # -r requirements/base.txt # effdet # unstructured-inference -tinycss2==1.3.0 +tinycss2==1.4.0 # via nbconvert -tokenizers==0.19.1 +tokenizers==0.21.0 # via # -r requirements/base.txt # transformers -tomli==2.0.1 - # via - # black - # coverage - # jupyterlab - # mypy - # pytest -torch==2.4.1 +torch==2.5.1 # via # -r requirements/base.txt # effdet # timm # torchvision # unstructured-inference -torchvision==0.19.1 +torchvision==0.20.1 # via # -r requirements/base.txt # effdet # timm -tornado==6.4.1 +tornado==6.4.2 # via # ipykernel # jupyter-client @@ -845,7 +826,7 @@ tornado==6.4.1 # jupyterlab # notebook # terminado -tqdm==4.66.5 +tqdm==4.67.1 # via # -r requirements/base.txt # huggingface-hub @@ -869,19 +850,16 @@ traitlets==5.14.3 # nbclient # nbconvert # nbformat -transformers==4.44.2 +transformers==4.47.0 # via # -r requirements/base.txt # unstructured-inference -types-python-dateutil==2.9.0.20240906 +types-python-dateutil==2.9.0.20241206 # via arrow typing-extensions==4.12.2 # via # -r requirements/base.txt # anyio - # async-lru - # black - # emoji # fastapi # huggingface-hub # iopath @@ -889,31 +867,28 @@ typing-extensions==4.12.2 # mypy # pydantic # pydantic-core - # pypdf # python-docx # python-oxmsg # python-pptx # torch # typing-inspect # unstructured - # unstructured-client - # uvicorn typing-inspect==0.9.0 # via # -r requirements/base.txt # dataclasses-json # unstructured-client -tzdata==2024.1 +tzdata==2024.2 # via # -r requirements/base.txt # pandas -unstructured[all-docs]==0.15.13 +unstructured[all-docs]==0.16.11 # via -r requirements/base.txt -unstructured-client==0.25.9 +unstructured-client==0.28.1 # via # -r requirements/base.txt # unstructured -unstructured-inference==0.7.36 +unstructured-inference==0.8.1 # via # -r requirements/base.txt # unstructured @@ -927,26 +902,27 @@ urllib3==2.2.3 # via # -r requirements/base.txt # requests - # unstructured-client -uvicorn==0.30.6 +uvicorn==0.32.1 # via -r requirements/base.txt -watchdog==5.0.2 +watchdog==6.0.0 # via nbdev wcwidth==0.2.13 # via prompt-toolkit -webcolors==24.8.0 +webcolors==24.11.1 # via jsonschema webencodings==0.5.1 # via + # -r requirements/base.txt # bleach + # html5lib # tinycss2 websocket-client==1.8.0 # via jupyter-server -wheel==0.44.0 +wheel==0.45.1 # via astunparse widgetsnbextension==4.0.13 # via ipywidgets -wrapt==1.16.0 +wrapt==1.17.0 # via # -r requirements/base.txt # deprecated diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py index 4f08cb17..4a2db763 100644 --- a/test_general/api/test_app.py +++ b/test_general/api/test_app.py @@ -1,9 +1,7 @@ import io import os import tempfile -import time import uuid -from concurrent.futures import ThreadPoolExecutor from pathlib import Path from unittest.mock import ANY, Mock @@ -981,42 +979,6 @@ def test_general_api_returns_400_bad_json(tmpdir): assert response.status_code == 400 -def test_chipper_memory_protection(monkeypatch, mocker): - """ - For now, only 1 Chipper call is allowed at a time. - Assert that we return a 503 while it's in use. - """ - - def mock_partition(*args, **kwargs): - time.sleep(2) - return {} - - monkeypatch.setattr( - general, - "partition", - mock_partition, - ) - - client = TestClient(app) - test_file = Path("sample-docs") / "layout-parser-paper-fast.pdf" - - def make_request(*args): - return client.post( - MAIN_API_ROUTE, - files=[("files", (str(test_file), open(test_file, "rb"), "application/pdf"))], - data={"strategy": "hi_res", "hi_res_model_name": "chipper"}, - ) - - with ThreadPoolExecutor() as executor: - responses = list(executor.map(make_request, range(3))) - - status_codes = [response.status_code for response in responses] - - # Assert only one call got through - assert status_codes.count(200) == 1 - assert status_codes.count(503) == 2 - - def test_invalid_strategy_for_image_file(): """ Verify that we get a 400 error if we use "strategy=fast" with an image file