fix/Fix MS Office filetype errors and harden docker smoketest (#436)

# Changes

**Fix for docx and other office files returning `{"detail":"File type None is not supported."}`**

After moving to the wolfi base image, the `mimetypes` lib no longer knows about these file extensions. To avoid issues like this, let's add an explicit mapping for all the file extensions we care about. I added a `filetypes.py` and moved `get_validated_mimetype` over. When this file is imported, we'll call `mimetypes.add_type` for all file extensions we support.

**Update smoke test coverage**

This bug slipped past because the docker smoke test was already providing the mimetype on every request. I updated `test_happy_path` to test against the container with and without passing `content_type`. I added some missing filetypes, and sorted the test params by extension so we can see when new types are missing.

# Testing

The new smoke test will verify that all filetypes are working. You can also run `make docker-build && make docker-start-api`, and test out the docx in the sample docs dir. On `main`, this file will give you the error above.

```
curl 'http://localhost:8000/general/v0/general' \
  --form 'files=@"fake.docx"'
```
Unstructured-IO · Jun 28, 2024 · 6710df0 · 6710df0
1 parent d5a878f
commit 6710df0
Show file tree

Hide file tree

Showing 9 changed files with 187 additions and 114 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+## 0.0.72
+
+* Fix certain filetypes failing mimetype lookup in the new base image
+
 ## 0.0.71
 
 * replace rockylinux with chainguard/wolfi as a base image for `amd64`

diff --git a/prepline_general/api/app.py b/prepline_general/api/app.py
@@ -13,7 +13,7 @@
 app = FastAPI(
     title="Unstructured Pipeline API",
     summary="Partition documents with the Unstructured library",
-    version="0.0.71",
+    version="0.0.72",
     docs_url="/general/docs",
     openapi_url="/general/openapi.json",
     servers=[

diff --git a/prepline_general/api/filetypes.py b/prepline_general/api/filetypes.py
@@ -0,0 +1,107 @@
+import mimetypes
+import os
+from fastapi import UploadFile, HTTPException
+from typing import Optional
+
+# Comma-separated list of the MIME types accepted by default. It is installed as
+# UNSTRUCTURED_ALLOWED_MIMETYPES below when the environment does not already provide a value,
+# and get_validated_mimetype() splits it on "," to build its allow-list.
+# NOTE(review): "text/html" and "image/png" appear twice, and the trailing comma after
+# "text/org" yields an empty entry once split. Both look harmless for a membership test, but
+# confirm before cleaning up, since this exact value is exported through the environment.
+DEFAULT_MIMETYPES = (
+    "application/pdf,application/msword,image/jpeg,image/png,text/markdown,"
+    "text/x-markdown,text/html,"
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document,"
+    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,"
+    "application/vnd.ms-excel,application/vnd.openxmlformats-officedocument."
+    "presentationml.presentation,"
+    "application/json,"
+    "application/vnd.ms-powerpoint,"
+    "text/html,message/rfc822,text/plain,image/png,"
+    "application/epub,application/epub+zip,"
+    "application/rtf,text/rtf,"
+    "application/vnd.oasis.opendocument.text,"
+    "text/csv,text/x-csv,application/csv,application/x-csv,"
+    "text/comma-separated-values,text/x-comma-separated-values,"
+    "application/xml,text/xml,text/x-rst,text/prs.fallenstein.rst,"
+    "text/tsv,text/tab-separated-values,"
+    "application/x-ole-storage,application/vnd.ms-outlook,"
+    "application/yaml,"
+    "application/x-yaml,"
+    "text/x-yaml,"
+    "text/yaml,"
+    "image/bmp,"
+    "image/heic,"
+    "image/tiff,"
+    "text/org,"
+)
+
+# Fall back to the defaults when the variable is unset or empty (an empty string is falsy),
+# so an allow-list is present in the environment once this module has been imported.
+if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES", None):
+    os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES
+
+
+def _load_mimetypes() -> None:
+    """Register every file extension the API expects with the stdlib `mimetypes` registry.
+
+    Runs once at import time so `mimetypes.guess_type` can resolve these extensions even when
+    the base image does not know about them.
+    """
+    mimetype_by_extension = {
+        ".bmp": "image/bmp",
+        ".csv": "application/csv",
+        ".doc": "application/msword",
+        ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        ".eml": "message/rfc822",
+        ".epub": "application/epub",
+        ".gz": "application/gzip",
+        ".heic": "image/heic",
+        ".html": "text/html",
+        ".jpeg": "image/jpeg",
+        ".jpg": "image/jpeg",
+        ".json": "application/json",
+        ".md": "text/markdown",
+        ".msg": "application/x-ole-storage",
+        ".odt": "application/vnd.oasis.opendocument.text",
+        ".org": "text/org",
+        ".pdf": "application/pdf",
+        ".png": "image/png",
+        ".ppt": "application/vnd.ms-powerpoint",
+        ".pptx": "application/vnd.openxmlformats-officedocument.presentationml.presentation",
+        ".rst": "text/prs.fallenstein.rst",
+        ".rtf": "application/rtf",
+        ".tiff": "image/tiff",
+        ".tsv": "text/tab-separated-values",
+        ".txt": "text/plain",
+        ".xlsx": "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        ".xml": "text/xml",
+    }
+
+    # -- dicts keep insertion order, so registration order matches the listing above --
+    for extension, mimetype in mimetype_by_extension.items():
+        mimetypes.add_type(mimetype, extension)
+
+
+# Populate the registry as a side effect of importing this module.
+_load_mimetypes()
+
+
+def get_validated_mimetype(file: UploadFile) -> Optional[str]:
+    """Return the MIME type of `file`, validated against the allowed types.
+
+    `file.content_type` is used when the client supplied a specific one. When it is missing or
+    the generic "application/octet-stream", the type is guessed from the filename with the
+    `mimetypes` lib, which `_load_mimetypes()` has populated with the extensions we support.
+
+    The result is then checked against the comma-separated UNSTRUCTURED_ALLOWED_MIMETYPES list.
+    This module installs DEFAULT_MIMETYPES when that variable is unset or empty, so in practice
+    the check always runs.
+
+    Raises:
+        HTTPException: status 400 when the resolved type (possibly None, when nothing could be
+            inferred) is not in the allowed list.
+    """
+    content_type = file.content_type
+    filename = str(file.filename)  # -- "None" when file.filename is None --
+    if not content_type or content_type == "application/octet-stream":
+        content_type = mimetypes.guess_type(filename)[0]
+
+        # -- Last-resort fallback for names `guess_type` cannot resolve (e.g. a bare ".md",
+        # -- which has no extension as far as path splitting is concerned). These values must
+        # -- stay identical to the mapping registered in `_load_mimetypes()`: ".msg" is an
+        # -- Outlook message (application/x-ole-storage), not an RFC 822 email.
+        if not content_type:
+            if filename.endswith(".md"):
+                content_type = "text/markdown"
+            elif filename.endswith(".msg"):
+                content_type = "application/x-ole-storage"
+
+    allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
+    if allowed_mimetypes_str is not None:
+        # -- Tolerate whitespace around entries and drop empty ones (DEFAULT_MIMETYPES ends
+        # -- with a trailing comma, which would otherwise produce an empty entry) --
+        allowed_mimetypes = {
+            entry.strip() for entry in allowed_mimetypes_str.split(",") if entry.strip()
+        }
+
+        if content_type not in allowed_mimetypes:
+            raise HTTPException(
+                status_code=400,
+                detail=(f"File type {content_type} is not supported."),
+            )
+
+    return content_type
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
@@ -34,6 +34,7 @@
 from starlette.types import Send
 
 from prepline_general.api.models.form_params import GeneralFormParams
+from prepline_general.api.filetypes import get_validated_mimetype
 from unstructured.documents.elements import Element
 from unstructured.partition.auto import partition
 from unstructured.staging.base import (
@@ -59,37 +60,6 @@ def is_compatible_response_type(media_type: str, response_type: type) -> bool:
 
 logger = logging.getLogger("unstructured_api")
 
-DEFAULT_MIMETYPES = (
-    "application/pdf,application/msword,image/jpeg,image/png,text/markdown,"
-    "text/x-markdown,text/html,"
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document,"
-    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,"
-    "application/vnd.ms-excel,application/vnd.openxmlformats-officedocument."
-    "presentationml.presentation,"
-    "application/json,"
-    "application/vnd.ms-powerpoint,"
-    "text/html,message/rfc822,text/plain,image/png,"
-    "application/epub,application/epub+zip,"
-    "application/rtf,text/rtf,"
-    "application/vnd.oasis.opendocument.text,"
-    "text/csv,text/x-csv,application/csv,application/x-csv,"
-    "text/comma-separated-values,text/x-comma-separated-values,"
-    "application/xml,text/xml,text/x-rst,text/prs.fallenstein.rst,"
-    "text/tsv,text/tab-separated-values,"
-    "application/x-ole-storage,application/vnd.ms-outlook,"
-    "application/yaml,"
-    "application/x-yaml,"
-    "text/x-yaml,"
-    "text/yaml,"
-    "image/bmp,"
-    "image/heic,"
-    "image/tiff,"
-    "text/org,"
-)
-
-if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES", None):
-    os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES
-
 
 def get_pdf_splits(pdf_pages: Sequence[PageObject], split_size: int = 1):
     """Given a pdf (PdfReader) with n pages, split it into pdfs each with split_size # of pages.
@@ -609,38 +579,6 @@ def _set_pdf_infer_table_structure(
     return strategy in ("hi_res", "auto") and pdf_infer_table_structure
 
 
-def get_validated_mimetype(file: UploadFile) -> Optional[str]:
-    """The MIME-type of `file`.
-
-    The mimetype is computed based on `file.content_type`, or the mimetypes lib if that's too
-    generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
-    return HTTP 400 for an invalid type.
-    """
-    content_type = file.content_type
-    filename = str(file.filename)  # -- "None" when file.filename is None --
-    if not content_type or content_type == "application/octet-stream":
-        content_type = mimetypes.guess_type(filename)[0]
-
-        # Some filetypes missing for this library, just hardcode them for now
-        if not content_type:
-            if filename.endswith(".md"):
-                content_type = "text/markdown"
-            elif filename.endswith(".msg"):
-                content_type = "message/rfc822"
-
-    allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
-    if allowed_mimetypes_str is not None:
-        allowed_mimetypes = allowed_mimetypes_str.split(",")
-
-        if content_type not in allowed_mimetypes:
-            raise HTTPException(
-                status_code=400,
-                detail=(f"File type {content_type} is not supported."),
-            )
-
-    return content_type
-
-
 class MultipartMixedResponse(StreamingResponse):
     CRLF = b"\r\n"
 
@@ -713,7 +651,7 @@ def return_content_type(filename: str):
 
 
 @router.get("/general/v0/general", include_in_schema=False)
-@router.get("/general/v0.0.71/general", include_in_schema=False)
+@router.get("/general/v0.0.72/general", include_in_schema=False)
 async def handle_invalid_get_request():
     raise HTTPException(
         status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
@@ -728,7 +666,7 @@ async def handle_invalid_get_request():
     description="Description",
     operation_id="partition_parameters",
 )
-@router.post("/general/v0.0.71/general", include_in_schema=False)
+@router.post("/general/v0.0.72/general", include_in_schema=False)
 def general_partition(
     request: Request,
     # cannot use annotated type here because of a bug described here:

diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml
@@ -1,2 +1,2 @@
 name: general
-version: 0.0.71
+version: 0.0.72
diff --git a/sample-docs/DA-1p.bmp b/sample-docs/DA-1p.bmp
diff --git a/sample-docs/DA-1p.heic b/sample-docs/DA-1p.heic
diff --git a/sample-docs/layout-parser-paper-fast.tiff b/sample-docs/layout-parser-paper-fast.tiff
diff --git a/scripts/smoketest.py b/scripts/smoketest.py
@@ -49,72 +49,96 @@ def send_document(
 
 
 @pytest.mark.parametrize(
-    "example_filename, content_type",
+    ("extension", "example_filename", "content_type"),
     [
-        # Note(yuming): Please sort filetypes alphabetically according to
-        # https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/auto.py#L14
-        ("stanley-cups.csv", "application/csv"),
-        ("fake.doc", "application/msword"),
-        ("fake.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
-        ("alert.eml", "message/rfc822"),
-        ("announcement.eml", "message/rfc822"),
-        ("fake-email-attachment.eml", "message/rfc822"),
-        ("fake-email-image-embedded.eml", "message/rfc822"),
-        ("fake-email.eml", "message/rfc822"),
-        ("family-day.eml", "message/rfc822"),
-        ("winter-sports.epub", "application/epub"),
-        ("fake-html.html", "text/html"),
-        pytest.param(
-            "layout-parser-paper-fast.jpg",
-            "image/jpeg",
-            marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
-        ),
-        ("spring-weather.html.json", "application/json"),
-        ("README.md", "text/markdown"),
-        ("fake-email.msg", "application/x-ole-storage"),
-        ("fake.odt", "application/vnd.oasis.opendocument.text"),
-        # Note(austin) The two inference calls will hang on mac with unsupported hardware error
-        # Skip these with SKIP_INFERENCE_TESTS=true make docker-test
-        pytest.param(
-            "layout-parser-paper.pdf.gz",
-            "application/gzip",
-            marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
-        ),
-        pytest.param(
-            "layout-parser-paper.pdf",
-            "application/pdf",
-            marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
+        (".bmp", "DA-1p.bmp", "image/bmp"),
+        (".csv", "stanley-cups.csv", "application/csv"),
+        (".doc", "fake.doc", "application/msword"),
+        (
+            ".docx",
+            "fake.docx",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
         ),
-        ("fake-power-point.ppt", "application/vnd.ms-powerpoint"),
+        (".eml", "fake-email-attachment.eml", "message/rfc822"),
+        (".epub", "winter-sports.epub", "application/epub"),
+        (".heic", "DA-1p.heic", "image/heic"),
+        (".html", "fake-html.html", "text/html"),
+        (".jpeg", "layout-parser-paper-fast.jpg", "image/jpeg"),
+        (".md", "README.md", "text/markdown"),
+        (".msg", "fake-email.msg", "application/x-ole-storage"),
+        (".odt", "fake.odt", "application/vnd.oasis.opendocument.text"),
+        (".pdf", "layout-parser-paper.pdf", "application/pdf"),
+        (".png", "english-and-korean.png", "image/png"),
+        (".ppt", "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
         (
+            ".pptx",
             "fake-power-point.pptx",
             "application/vnd.openxmlformats-officedocument.presentationml.presentation",
         ),
-        ("README.rst", "text/prs.fallenstein.rst"),
-        ("fake-doc.rtf", "application/rtf"),
-        ("fake-text.txt", "text/plain"),
-        ("stanley-cups.tsv", "text/tab-separated-values"),
+        (".rst", "README.rst", "text/prs.fallenstein.rst"),
+        (".rtf", "fake-doc.rtf", "application/rtf"),
+        (".tiff", "layout-parser-paper-fast.tiff", "image/tiff"),
+        (".tsv", "stanley-cups.tsv", "text/tab-separated-values"),
+        (".txt", "fake-text.txt", "text/plain"),
         (
+            ".xlsx",
             "stanley-cups.xlsx",
             "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
         ),
-        ("fake-xml.xml", "text/xml"),
+        (".xml", "fake-xml.xml", "text/xml"),
+        (".json", "spring-weather.html.json", "application/json"),
+        (
+            ".gz",
+            "layout-parser-paper.pdf.gz",
+            "application/gzip",
+        ),
     ],
 )
-def test_happy_path(example_filename: str, content_type: str):
+def test_happy_path_all_types(extension, example_filename: str, content_type: str):
     """
     For the files in sample-docs, verify that we get a 200
     and some structured response
     """
+    # The auto strategy will run ocr on these files
+    # This doesn't always work on our macs
+    if skip_inference_tests and extension in [
+        ".bmp",
+        ".heic",
+        ".jpeg",
+        ".pdf",
+        ".png",
+        ".tiff",
+        ".gz",  # Since we're using a gzipped pdf...
+    ]:
+        pytest.skip("emulated hardware")
+
     test_file = str(Path("sample-docs") / example_filename)
-    print(f"sending {content_type}")
-    json_response = send_document(filenames=[test_file], content_type=content_type)
-    assert json_response.status_code == 200
-    assert len(json_response.json()) > 0
-    assert len("".join(elem["text"] for elem in json_response.json())) > 20
 
+    # Verify we can send with explicit content type
+    response = send_document(filenames=[test_file], content_type=content_type)
+
+    if response.status_code != 200:
+        assert False, response.text
+
+    assert len(response.json()) > 0
+    assert len("".join(elem["text"] for elem in response.json())) > 20
+
+    # Verify we can infer the filetype on the server
+    response = send_document(filenames=[test_file], content_type=None)
+
+    if response.status_code != 200:
+        assert False, response.text
+
+    assert len(response.json()) > 0
+    assert len("".join(elem["text"] for elem in response.json())) > 20
+
+    json_response = response
+
+    # Verify we can set output type to csv
     csv_response = send_document(
-        filenames=[test_file], content_type=content_type, output_format="text/csv"
+        filenames=[test_file],
+        content_type=content_type,
+        output_format="text/csv",
     )
     assert csv_response.status_code == 200
     assert len(csv_response.text) > 0