feat: enhance API filetype detection (#445)

# Use the library for filetype detection

The mimetype detection has always been very naive in the API - we rely on the file extension. If the user doesn't include a filename, we return an error that `Filetype None is not supported`. The library has a `detect_filetype` that actually inspects the file bytes, so let's reuse this.

# Add a `content_type` param to override filetype detection

Add an optional `content_type` param that allows the user to override the filetype detection. We'll use this value if it's set, or take the `file.content_type` which is based on the multipart `Content-Type` header. This provides an alternative when clients are unable to modify the header.

# Testing

The important thing is that `test_happy_path_all_types` passes in the docker smoke test - this contains all filetypes that we want the API to support.

To test manually, you can try sending files to the server with and without the filename/content_type defined. Check out this branch and run `make run-web-app`.

Example sending with no extension in filename. This correctly processes a pdf.

```
import requests

filename = "sample-docs/layout-parser-paper-fast.pdf"
url = "http://localhost:8000/general/v0/general"

with open(filename, 'rb') as f:
    files = {'files': ("sample-doc", f)}
    response = requests.post(url, files=files)

print(response.text)
```

For the new param, you can try modifying the content type for a text-based file. Verify that you can change the `metadata.filetype` of the response using the new param:

```
curl --location 'http://localhost:8000/general/v0/general' \
--form 'files=@"sample-docs/family-day.eml"' \
--form 'content_type="text/plain"'

[
  {
    "type": "UncategorizedText",
    "element_id": "5cafe1ce2b0a96f8e3eba232e790db19",
    "text": "MIME-Version: 1.0 Date: Wed, 21 Dec 2022 10:28:53 -0600 Message-ID: <CAPgNNXQKR=o6AsOTr74VMrsDNhUJW0Keou9n3vLa2UO_Nv+tZw@mail.gmail.com> Subject: Family Day From: Mallori Harrell <mallori@unstructured.io> To: Mallori Harrell <mallori@unstructured.io> Content-Type: multipart/alternative; boundary=\"0000000000005c115405f0590ce4\"",
    "metadata": {
      "filename": "family-day.eml",
      "languages": [
        "eng"
      ],
      "filetype": "text/plain"
    }
  },
  ...
]
```
Unstructured-IO · Aug 6, 2024 · 7468938 · 7468938
1 parent d5502d0
commit 7468938
Show file tree

Hide file tree

Showing 9 changed files with 152 additions and 302 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,8 @@
+## 0.0.76
+* Use the library's `detect_filetype` in API to determine mimetype
+* Add `content_type` API parameter
+* Bump to `unstructured` 0.15.1
+
 ## 0.0.75
 
 * Remove constraint on `safetensors` that was preventing us from bumping `transformers`.

diff --git a/prepline_general/api/app.py b/prepline_general/api/app.py
@@ -13,7 +13,7 @@
 app = FastAPI(
     title="Unstructured Pipeline API",
     summary="Partition documents with the Unstructured library",
-    version="0.0.75",
+    version="0.0.76",
     docs_url="/general/docs",
     openapi_url="/general/openapi.json",
     servers=[

diff --git a/prepline_general/api/filetypes.py b/prepline_general/api/filetypes.py
@@ -1,107 +1,55 @@
-import mimetypes
 import os
-from fastapi import UploadFile, HTTPException
 from typing import Optional
+from io import BytesIO
 
-DEFAULT_MIMETYPES = (
-    "application/pdf,application/msword,image/jpeg,image/png,text/markdown,"
-    "text/x-markdown,text/html,"
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document,"
-    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,"
-    "application/vnd.ms-excel,application/vnd.openxmlformats-officedocument."
-    "presentationml.presentation,"
-    "application/json,"
-    "application/vnd.ms-powerpoint,"
-    "text/html,message/rfc822,text/plain,image/png,"
-    "application/epub,application/epub+zip,"
-    "application/rtf,text/rtf,"
-    "application/vnd.oasis.opendocument.text,"
-    "text/csv,text/x-csv,application/csv,application/x-csv,"
-    "text/comma-separated-values,text/x-comma-separated-values,"
-    "application/xml,text/xml,text/x-rst,text/prs.fallenstein.rst,"
-    "text/tsv,text/tab-separated-values,"
-    "application/x-ole-storage,application/vnd.ms-outlook,"
-    "application/yaml,"
-    "application/x-yaml,"
-    "text/x-yaml,"
-    "text/yaml,"
-    "image/bmp,"
-    "image/heic,"
-    "image/tiff,"
-    "text/org,"
-)
+from fastapi import HTTPException, UploadFile
 
-if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES", None):
-    os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES
+from unstructured.file_utils.filetype import detect_filetype
+from unstructured.file_utils.model import FileType
 
 
-def _load_mimetypes() -> None:
-    """Call this on startup to ensure that all expected file extensions are present in the mimetypes
-    lib"""
-    expected_mimetypes = [
-        (".bmp", "image/bmp"),
-        (".csv", "application/csv"),
-        (".doc", "application/msword"),
-        (".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
-        (".eml", "message/rfc822"),
-        (".epub", "application/epub"),
-        (".gz", "application/gzip"),
-        (".heic", "image/heic"),
-        (".html", "text/html"),
-        (".jpeg", "image/jpeg"),
-        (".jpg", "image/jpeg"),
-        (".json", "application/json"),
-        (".md", "text/markdown"),
-        (".msg", "application/x-ole-storage"),
-        (".odt", "application/vnd.oasis.opendocument.text"),
-        (".org", "text/org"),
-        (".pdf", "application/pdf"),
-        (".png", "image/png"),
-        (".ppt", "application/vnd.ms-powerpoint"),
-        (".pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
-        (".rst", "text/prs.fallenstein.rst"),
-        (".rtf", "application/rtf"),
-        (".tiff", "image/tiff"),
-        (".tsv", "text/tab-separated-values"),
-        (".txt", "text/plain"),
-        (".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
-        (".xml", "text/xml"),
-    ]
+def _remove_optional_info_from_mime_type(content_type: str | None) -> str | None:
+    """removes charset information from mime types, e.g.,
+    "application/json; charset=utf-8" -> "application/json"
+    """
+    if not content_type:
+        return content_type
+    return content_type.split(";")[0]
+
 
-    for extension, mimetype in expected_mimetypes:
-        mimetypes.add_type(mimetype, extension)
+def get_validated_mimetype(file: UploadFile, content_type_hint: str | None = None) -> Optional[str]:
+    """Given the incoming file, identify and return the correct mimetype.
 
+    Order of operations:
+    - If user passed content_type as a form param, take it as truth.
+    - Otherwise, use file.content_type (as set by the Content-Type header)
+    - If no content_type was passed and the header wasn't useful, call the library's detect_filetype
 
-_load_mimetypes()
+    Once we have a filetype, check is_partitionable and return 400 if we don't support this file.
+    """
+    content_type: str | None = None
 
+    if content_type_hint is not None:
+        content_type = content_type_hint
+    else:
+        content_type = _remove_optional_info_from_mime_type(file.content_type)
 
-def get_validated_mimetype(file: UploadFile) -> Optional[str]:
-    """The MIME-type of `file`.
+    filetype = FileType.from_mime_type(content_type)
 
-    The mimetype is computed based on `file.content_type`, or the mimetypes lib if that's too
-    generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
-    return HTTP 400 for an invalid type.
-    """
-    content_type = file.content_type
-    filename = str(file.filename)  # -- "None" when file.filename is None --
-    if not content_type or content_type == "application/octet-stream":
-        content_type = mimetypes.guess_type(filename)[0]
+    # If content_type was not specified, use the library to identify the file
+    # We inspect the bytes to do this, so we need to buffer the file
+    if not filetype or filetype == FileType.UNK:
+        file_buffer = BytesIO(file.file.read())
+        file.file.seek(0)
 
-        # Some filetypes missing for this library, just hardcode them for now
-        if not content_type:
-            if filename.endswith(".md"):
-                content_type = "text/markdown"
-            elif filename.endswith(".msg"):
-                content_type = "message/rfc822"
+        file_buffer.name = file.filename
 
-    allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
-    if allowed_mimetypes_str is not None:
-        allowed_mimetypes = allowed_mimetypes_str.split(",")
+        filetype = detect_filetype(file=file_buffer)
 
-        if content_type not in allowed_mimetypes:
-            raise HTTPException(
-                status_code=400,
-                detail=(f"File type {content_type} is not supported."),
-            )
+    if not filetype.is_partitionable:
+        raise HTTPException(
+            status_code=400,
+            detail=(f"File type {filetype.mime_type} is not supported."),
+        )
 
-    return content_type
+    return filetype.mime_type
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
@@ -649,7 +649,7 @@ def return_content_type(filename: str):
 
 
 @router.get("/general/v0/general", include_in_schema=False)
-@router.get("/general/v0.0.75/general", include_in_schema=False)
+@router.get("/general/v0.0.76/general", include_in_schema=False)
 async def handle_invalid_get_request():
     raise HTTPException(
         status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
@@ -664,7 +664,7 @@ async def handle_invalid_get_request():
     description="Description",
     operation_id="partition_parameters",
 )
-@router.post("/general/v0.0.75/general", include_in_schema=False)
+@router.post("/general/v0.0.76/general", include_in_schema=False)
 def general_partition(
     request: Request,
     # cannot use annotated type here because of a bug described here:
@@ -683,13 +683,13 @@ def general_partition(
                 detail=f"API key {api_key} is invalid", status_code=status.HTTP_401_UNAUTHORIZED
             )
 
-    content_type = request.headers.get("Accept")
+    accept_type = request.headers.get("Accept")
 
     # -- detect response content-type conflict when multiple files are uploaded --
     if (
         len(files) > 1
-        and content_type
-        and content_type
+        and accept_type
+        and accept_type
         not in [
             "*/*",
             "multipart/mixed",
@@ -698,7 +698,7 @@ def general_partition(
         ]
     ):
         raise HTTPException(
-            detail=f"Conflict in media type {content_type} with response type 'multipart/mixed'.\n",
+            detail=f"Conflict in media type {accept_type} with response type 'multipart/mixed'.\n",
             status_code=status.HTTP_406_NOT_ACCEPTABLE,
         )
 
@@ -714,7 +714,9 @@ def general_partition(
 
     def response_generator(is_multipart: bool):
         for file in files:
-            file_content_type = get_validated_mimetype(file)
+            file_content_type = get_validated_mimetype(
+                file, content_type_hint=form_params.content_type
+            )
 
             _file = file.file
 
@@ -781,7 +783,7 @@ def join_responses(
         MultipartMixedResponse(
             response_generator(is_multipart=True), content_type=form_params.output_format
         )
-        if content_type == "multipart/mixed"
+        if accept_type == "multipart/mixed"
         else (
             list(response_generator(is_multipart=False))[0]
             if len(files) == 1

diff --git a/prepline_general/api/models/form_params.py b/prepline_general/api/models/form_params.py
@@ -20,6 +20,7 @@ class GeneralFormParams(BaseModel):
     output_format: str
     coordinates: bool
     encoding: str
+    content_type: Optional[str]
     hi_res_model_name: Optional[str]
     include_page_breaks: bool
     pdf_infer_table_structure: bool
@@ -100,6 +101,15 @@ def as_form(
             ),
             BeforeValidator(SmartValueParser[bool]().value_or_first_element),
         ] = False,
+        content_type: Annotated[
+            Optional[str],
+            Form(
+                title="Content type",
+                description="A hint about the content type to use (such as text/markdown), when there are problems processing a specific file. This value is a MIME type in the format type/subtype.",
+                example="text/markdown",
+            ),
+            BeforeValidator(SmartValueParser[str]().value_or_first_element),
+        ] = None,
         encoding: Annotated[
             str,
             Form(
@@ -245,6 +255,7 @@ def as_form(
             gz_uncompressed_content_type=gz_uncompressed_content_type,
             output_format=output_format,
             coordinates=coordinates,
+            content_type=content_type,
             encoding=encoding,
             hi_res_model_name=hi_res_model_name,
             include_page_breaks=include_page_breaks,

diff --git a/preprocessing-pipeline-family.yaml b/preprocessing-pipeline-family.yaml
@@ -1,2 +1,2 @@
 name: general
-version: 0.0.75
+version: 0.0.76