diff --git a/CHANGELOG.md b/CHANGELOG.md index ef9ccd39..1018fc7f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.0.59 + +* Bump unstructured to 0.11.0 + ## 0.0.58 * Bump unstructured to 0.10.30 diff --git a/Dockerfile b/Dockerfile index ddb95dcb..723baa46 100644 --- a/Dockerfile +++ b/Dockerfile @@ -32,12 +32,9 @@ RUN python3.10 -m pip install pip==${PIP_VERSION} \ USER ${NB_USER} FROM python-deps as model-deps -# Note(yuming): quick workaround for ingest import error -# should import initialize within unstructured but out of ingest dir -COPY --chown=${NB_USER}:${NB_USER} scripts/hi_res_model_initialize.py hi_res_model_initialize.py RUN python3.10 -c "import nltk; nltk.download('punkt')" && \ python3.10 -c "import nltk; nltk.download('averaged_perceptron_tagger')" && \ - python3.10 -c "from hi_res_model_initialize import initialize; initialize()" + python3.10 -c "from unstructured.partition.model_init import initialize; initialize()" FROM model-deps as code COPY --chown=${NB_USER}:${NB_USER} CHANGELOG.md CHANGELOG.md diff --git a/prepline_general/api/app.py b/prepline_general/api/app.py index 68c04fcc..3acf46d6 100644 --- a/prepline_general/api/app.py +++ b/prepline_general/api/app.py @@ -11,7 +11,7 @@ app = FastAPI( title="Unstructured Pipeline API", description="""""", - version="0.0.58", + version="0.0.59", docs_url="/general/docs", openapi_url="/general/openapi.json", ) diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py index ebb04d15..677459d1 100644 --- a/prepline_general/api/general.py +++ b/prepline_general/api/general.py @@ -636,7 +636,7 @@ def return_content_type(filename): @router.post("/general/v0/general") -@router.post("/general/v0.0.58/general") +@router.post("/general/v0.0.59/general") def pipeline_1( request: Request, gz_uncompressed_content_type: Optional[str] = Form(default=None), diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml index 00af3b19..17679714 100644 --- a/preprocessing-pipeline-family.yaml +++ b/preprocessing-pipeline-family.yaml @@ -1,2 +1,2 @@ name: general -version: 0.0.58 +version: 0.0.59 diff --git a/requirements/base.txt b/requirements/base.txt index 3d3f3c25..2b67a6c9 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile requirements/base.in @@ -16,7 +16,7 @@ backoff==2.2.1 # unstructured beautifulsoup4==4.12.2 # via unstructured -certifi==2023.7.22 +certifi==2023.11.17 # via requests cffi==1.16.0 # via cryptography @@ -47,6 +47,8 @@ emoji==2.8.0 # via unstructured et-xmlfile==1.1.0 # via openpyxl +exceptiongroup==1.1.3 + # via anyio fastapi==0.104.1 # via -r requirements/base.in filelock==3.13.1 @@ -58,7 +60,7 @@ filetype==1.2.0 # via unstructured flatbuffers==23.5.26 # via onnxruntime -fonttools==4.44.3 +fonttools==4.45.0 # via matplotlib fsspec==2023.10.0 # via @@ -66,7 +68,7 @@ fsspec==2023.10.0 # torch h11==0.14.0 # via uvicorn -huggingface-hub==0.19.3 +huggingface-hub==0.19.4 # via # timm # tokenizers @@ -101,7 +103,7 @@ markupsafe==2.1.3 # via jinja2 marshmallow==3.20.1 # via dataclasses-json -matplotlib==3.8.1 +matplotlib==3.8.2 # via pycocotools mpmath==1.3.0 # via sympy @@ -180,7 +182,7 @@ pillow==10.1.0 # unstructured-pytesseract portalocker==2.8.2 # via iopath -protobuf==4.25.0 +protobuf==4.25.1 # via # onnx # onnxruntime @@ -251,7 +253,7 @@ safetensors==0.3.2 # -c requirements/constraints.in # timm # transformers -scipy==1.11.3 +scipy==1.11.4 # via layoutparser six==1.16.0 # via @@ -303,15 +305,14 @@ typing-extensions==4.8.0 # torch # typing-inspect # unstructured + # uvicorn typing-inspect==0.9.0 # via dataclasses-json tzdata==2023.3 # via pandas -unstructured[local-inference]==0.10.30 - # via - # -r requirements/base.in - # unstructured -unstructured-inference==0.7.11 +unstructured[local-inference]==0.11.0 + # via -r requirements/base.in +unstructured-inference==0.7.15 # via unstructured unstructured-pytesseract==0.3.12 # via unstructured @@ -319,6 +320,8 @@ urllib3==2.1.0 # via requests uvicorn==0.24.0.post1 # via -r requirements/base.in +wrapt==1.16.0 + # via unstructured xlrd==2.0.1 # via unstructured xlsxwriter==3.1.9 diff --git a/requirements/test.txt b/requirements/test.txt index 36fb4493..f94b89ff 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -1,5 +1,5 @@ # -# This file is autogenerated by pip-compile with Python 3.11 +# This file is autogenerated by pip-compile with Python 3.10 # by the following command: # # pip-compile --output-file=requirements/test.txt requirements/base.txt requirements/test.in @@ -52,7 +52,7 @@ black==23.11.0 # via -r requirements/test.in bleach==6.1.0 # via nbconvert -certifi==2023.7.22 +certifi==2023.11.17 # via # -r requirements/base.txt # httpcore @@ -90,9 +90,7 @@ contourpy==1.2.0 # -r requirements/base.txt # matplotlib coverage[toml]==7.3.2 - # via - # coverage - # pytest-cov + # via pytest-cov cryptography==41.0.5 # via # -r requirements/base.txt @@ -123,6 +121,12 @@ et-xmlfile==1.1.0 # via # -r requirements/base.txt # openpyxl +exceptiongroup==1.1.3 + # via + # -r requirements/base.txt + # anyio + # ipython + # pytest execnb==0.1.5 # via nbdev executing==2.0.1 @@ -152,7 +156,7 @@ flatbuffers==23.5.26 # via # -r requirements/base.txt # onnxruntime -fonttools==4.44.3 +fonttools==4.45.0 # via # -r requirements/base.txt # matplotlib @@ -174,7 +178,7 @@ httpcore==1.0.2 # via httpx httpx==0.25.1 # via -r requirements/test.in -huggingface-hub==0.19.3 +huggingface-hub==0.19.4 # via # -r requirements/base.txt # timm @@ -235,7 +239,7 @@ json5==0.9.14 # via jupyterlab-server jsonpointer==2.4 # via jsonschema -jsonschema[format-nongpl]==4.19.2 +jsonschema[format-nongpl]==4.20.0 # via # jupyter-events # jupyterlab-server @@ -277,11 +281,11 @@ jupyter-server==2.10.1 # notebook-shim jupyter-server-terminals==0.4.4 # via jupyter-server -jupyterlab==4.0.8 +jupyterlab==4.0.9 # via notebook jupyterlab-pygments==0.2.2 # via nbconvert -jupyterlab-server==2.25.1 +jupyterlab-server==2.25.2 # via # jupyterlab # notebook @@ -298,7 +302,6 @@ langdetect==1.0.9 layoutparser[layoutmodels,tesseract]==0.3.4 # via # -r requirements/base.txt - # layoutparser # unstructured-inference lxml==4.9.3 # via @@ -319,7 +322,7 @@ marshmallow==3.20.1 # via # -r requirements/base.txt # dataclasses-json -matplotlib==3.8.1 +matplotlib==3.8.2 # via # -r requirements/base.txt # pycocotools @@ -495,7 +498,7 @@ prompt-toolkit==3.0.41 # via # ipython # jupyter-console -protobuf==4.25.0 +protobuf==4.25.1 # via # -r requirements/base.txt # onnx @@ -528,7 +531,7 @@ pydantic==1.10.13 # fastapi pyflakes==3.1.0 # via flake8 -pygments==2.16.1 +pygments==2.17.1 # via # ipython # jupyter-console @@ -610,7 +613,7 @@ pyzmq==25.1.1 # jupyter-console # jupyter-server # qtconsole -qtconsole==5.5.0 +qtconsole==5.5.1 # via jupyter qtpy==2.4.1 # via qtconsole @@ -647,7 +650,7 @@ rfc3986-validator==0.1.1 # via # jsonschema # jupyter-events -rpds-py==0.12.0 +rpds-py==0.13.1 # via # jsonschema # referencing @@ -657,7 +660,7 @@ safetensors==0.3.2 # -r requirements/base.txt # timm # transformers -scipy==1.11.3 +scipy==1.11.4 # via # -r requirements/base.txt # layoutparser @@ -710,6 +713,13 @@ tokenizers==0.15.0 # via # -r requirements/base.txt # transformers +tomli==2.0.1 + # via + # black + # coverage + # jupyterlab + # mypy + # pytest torch==2.1.1 # via # -r requirements/base.txt @@ -764,6 +774,8 @@ types-python-dateutil==2.8.19.14 typing-extensions==4.8.0 # via # -r requirements/base.txt + # async-lru + # black # fastapi # huggingface-hub # iopath @@ -773,6 +785,7 @@ typing-extensions==4.8.0 # torch # typing-inspect # unstructured + # uvicorn typing-inspect==0.9.0 # via # -r requirements/base.txt @@ -781,11 +794,9 @@ tzdata==2023.3 # via # -r requirements/base.txt # pandas -unstructured[local-inference]==0.10.30 - # via - # -r requirements/base.txt - # unstructured -unstructured-inference==0.7.11 +unstructured[local-inference]==0.11.0 + # via -r requirements/base.txt +unstructured-inference==0.7.15 # via # -r requirements/base.txt # unstructured @@ -817,6 +828,10 @@ wheel==0.41.3 # via astunparse widgetsnbextension==4.0.9 # via ipywidgets +wrapt==1.16.0 + # via + # -r requirements/base.txt + # unstructured xlrd==2.0.1 # via # -r requirements/base.txt diff --git a/scripts/hi_res_model_initialize.py b/scripts/hi_res_model_initialize.py deleted file mode 100644 index eb006620..00000000 --- a/scripts/hi_res_model_initialize.py +++ /dev/null @@ -1,16 +0,0 @@ -import os - -from unstructured_inference.models.base import get_model - - -def initialize(): - """Download default model or model specified by UNSTRUCTURED_HI_RES_MODEL_NAME environment - variable (avoids subprocesses all doing the same)""" - - # If more than one model will be supported and left up to user selection - supported_model = os.environ.get("UNSTRUCTURED_HI_RES_SUPPORTED_MODEL", "") - if supported_model: - for model_name in supported_model.split(","): - get_model(model_name=model_name) - - get_model(os.environ.get("UNSTRUCTURED_HI_RES_MODEL_NAME"))