Skip to content

Commit

Permalink
feat: enhance API filetype detection (#445)
Browse files Browse the repository at this point in the history
# Use the library for filetype detection 

The mimetype detection has always been very naive in the API - we rely
on the file extension. If the user doesn't include a filename, we return
an error that `File type None is not supported.` The library has a
`detect_filetype` function that actually inspects the file bytes, so
let's reuse this.

# Add a `content_type` param to override filetype detection

Add an optional `content_type` param that allows the user to override
the filetype detection. We'll use this value if it's set, or take the
`file.content_type` which is based on the multipart `Content-Type`
header. This provides an alternative when clients are unable to modify
the header.

# Testing

The important thing is that `test_happy_path_all_types` passes in the
docker smoke test - this contains all filetypes that we want the API to
support.

To test manually, you can try sending files to the server with and
without the filename/content_type defined.

Check out this branch and run `make run-web-app`.

Example sending with no extension in filename. This correctly processes
a pdf.
```
import requests

filename = "sample-docs/layout-parser-paper-fast.pdf"
url = "http://localhost:8000/general/v0/general"

with open(filename, 'rb') as f:
    files = {'files': ("sample-doc", f)}
    response = requests.post(url, files=files)
    print(response.text)
```

To exercise the new param, try overriding the content type of a
text-based file.

Verify that you can change the `metadata.filetype` of the response using
the new param:

```
 curl --location 'http://localhost:8000/general/v0/general' \
--form 'files=@"sample-docs/family-day.eml"' \
--form 'content_type="text/plain"'

[
    {
        "type": "UncategorizedText",
        "element_id": "5cafe1ce2b0a96f8e3eba232e790db19",
        "text": "MIME-Version: 1.0 Date: Wed, 21 Dec 2022 10:28:53 -0600 Message-ID: <CAPgNNXQKR=o6AsOTr74VMrsDNhUJW0Keou9n3vLa2UO_Nv+tZw@mail.gmail.com> Subject: Family Day From: Mallori Harrell <mallori@unstructured.io> To: Mallori Harrell <mallori@unstructured.io> Content-Type: multipart/alternative; boundary=\"0000000000005c115405f0590ce4\"",
        "metadata": {
            "filename": "family-day.eml",
            "languages": [
                "eng"
            ],
            "filetype": "text/plain"
        }
    },
    ...
]
```
  • Loading branch information
awalker4 authored Aug 6, 2024
1 parent d5502d0 commit 7468938
Show file tree
Hide file tree
Showing 9 changed files with 152 additions and 302 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
## 0.0.76
* Use the library's `detect_filetype` in API to determine mimetype
* Add content_type api parameter
* Bump to `unstructured` 0.15.1

## 0.0.75

* Remove constraint on `safetensors` that was preventing us from bumping `transformers`.
Expand Down
2 changes: 1 addition & 1 deletion prepline_general/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
app = FastAPI(
title="Unstructured Pipeline API",
summary="Partition documents with the Unstructured library",
version="0.0.75",
version="0.0.76",
docs_url="/general/docs",
openapi_url="/general/openapi.json",
servers=[
Expand Down
130 changes: 39 additions & 91 deletions prepline_general/api/filetypes.py
Original file line number Diff line number Diff line change
@@ -1,107 +1,55 @@
import mimetypes
import os
from fastapi import UploadFile, HTTPException
from typing import Optional
from io import BytesIO

DEFAULT_MIMETYPES = (
"application/pdf,application/msword,image/jpeg,image/png,text/markdown,"
"text/x-markdown,text/html,"
"application/vnd.openxmlformats-officedocument.wordprocessingml.document,"
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,"
"application/vnd.ms-excel,application/vnd.openxmlformats-officedocument."
"presentationml.presentation,"
"application/json,"
"application/vnd.ms-powerpoint,"
"text/html,message/rfc822,text/plain,image/png,"
"application/epub,application/epub+zip,"
"application/rtf,text/rtf,"
"application/vnd.oasis.opendocument.text,"
"text/csv,text/x-csv,application/csv,application/x-csv,"
"text/comma-separated-values,text/x-comma-separated-values,"
"application/xml,text/xml,text/x-rst,text/prs.fallenstein.rst,"
"text/tsv,text/tab-separated-values,"
"application/x-ole-storage,application/vnd.ms-outlook,"
"application/yaml,"
"application/x-yaml,"
"text/x-yaml,"
"text/yaml,"
"image/bmp,"
"image/heic,"
"image/tiff,"
"text/org,"
)
from fastapi import HTTPException, UploadFile

if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES", None):
os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES
from unstructured.file_utils.filetype import detect_filetype
from unstructured.file_utils.model import FileType


def _load_mimetypes() -> None:
"""Call this on startup to ensure that all expected file extensions are present in the mimetypes
lib"""
expected_mimetypes = [
(".bmp", "image/bmp"),
(".csv", "application/csv"),
(".doc", "application/msword"),
(".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
(".eml", "message/rfc822"),
(".epub", "application/epub"),
(".gz", "application/gzip"),
(".heic", "image/heic"),
(".html", "text/html"),
(".jpeg", "image/jpeg"),
(".jpg", "image/jpeg"),
(".json", "application/json"),
(".md", "text/markdown"),
(".msg", "application/x-ole-storage"),
(".odt", "application/vnd.oasis.opendocument.text"),
(".org", "text/org"),
(".pdf", "application/pdf"),
(".png", "image/png"),
(".ppt", "application/vnd.ms-powerpoint"),
(".pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
(".rst", "text/prs.fallenstein.rst"),
(".rtf", "application/rtf"),
(".tiff", "image/tiff"),
(".tsv", "text/tab-separated-values"),
(".txt", "text/plain"),
(".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
(".xml", "text/xml"),
]
def _remove_optional_info_from_mime_type(content_type: str | None) -> str | None:
"""removes charset information from mime types, e.g.,
"application/json; charset=utf-8" -> "application/json"
"""
if not content_type:
return content_type
return content_type.split(";")[0]


for extension, mimetype in expected_mimetypes:
mimetypes.add_type(mimetype, extension)
def get_validated_mimetype(file: UploadFile, content_type_hint: str | None = None) -> Optional[str]:
"""Given the incoming file, identify and return the correct mimetype.
Order of operations:
- If user passed content_type as a form param, take it as truth.
- Otherwise, use file.content_type (as set by the Content-Type header)
- If no content_type was passed and the header wasn't useful, call the library's detect_filetype
_load_mimetypes()
Once we have a filetype, check is_partitionable and return 400 if we don't support this file.
"""
content_type: str | None = None

if content_type_hint is not None:
content_type = content_type_hint
else:
content_type = _remove_optional_info_from_mime_type(file.content_type)

def get_validated_mimetype(file: UploadFile) -> Optional[str]:
"""The MIME-type of `file`.
filetype = FileType.from_mime_type(content_type)

The mimetype is computed based on `file.content_type`, or the mimetypes lib if that's too
generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
return HTTP 400 for an invalid type.
"""
content_type = file.content_type
filename = str(file.filename) # -- "None" when file.filename is None --
if not content_type or content_type == "application/octet-stream":
content_type = mimetypes.guess_type(filename)[0]
# If content_type was not specified, use the library to identify the file
# We inspect the bytes to do this, so we need to buffer the file
if not filetype or filetype == FileType.UNK:
file_buffer = BytesIO(file.file.read())
file.file.seek(0)

# Some filetypes missing for this library, just hardcode them for now
if not content_type:
if filename.endswith(".md"):
content_type = "text/markdown"
elif filename.endswith(".msg"):
content_type = "message/rfc822"
file_buffer.name = file.filename

allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
if allowed_mimetypes_str is not None:
allowed_mimetypes = allowed_mimetypes_str.split(",")
filetype = detect_filetype(file=file_buffer)

if content_type not in allowed_mimetypes:
raise HTTPException(
status_code=400,
detail=(f"File type {content_type} is not supported."),
)
if not filetype.is_partitionable:
raise HTTPException(
status_code=400,
detail=(f"File type {filetype.mime_type} is not supported."),
)

return content_type
return filetype.mime_type
18 changes: 10 additions & 8 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -649,7 +649,7 @@ def return_content_type(filename: str):


@router.get("/general/v0/general", include_in_schema=False)
@router.get("/general/v0.0.75/general", include_in_schema=False)
@router.get("/general/v0.0.76/general", include_in_schema=False)
async def handle_invalid_get_request():
raise HTTPException(
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
Expand All @@ -664,7 +664,7 @@ async def handle_invalid_get_request():
description="Description",
operation_id="partition_parameters",
)
@router.post("/general/v0.0.75/general", include_in_schema=False)
@router.post("/general/v0.0.76/general", include_in_schema=False)
def general_partition(
request: Request,
# cannot use annotated type here because of a bug described here:
Expand All @@ -683,13 +683,13 @@ def general_partition(
detail=f"API key {api_key} is invalid", status_code=status.HTTP_401_UNAUTHORIZED
)

content_type = request.headers.get("Accept")
accept_type = request.headers.get("Accept")

# -- detect response content-type conflict when multiple files are uploaded --
if (
len(files) > 1
and content_type
and content_type
and accept_type
and accept_type
not in [
"*/*",
"multipart/mixed",
Expand All @@ -698,7 +698,7 @@ def general_partition(
]
):
raise HTTPException(
detail=f"Conflict in media type {content_type} with response type 'multipart/mixed'.\n",
detail=f"Conflict in media type {accept_type} with response type 'multipart/mixed'.\n",
status_code=status.HTTP_406_NOT_ACCEPTABLE,
)

Expand All @@ -714,7 +714,9 @@ def general_partition(

def response_generator(is_multipart: bool):
for file in files:
file_content_type = get_validated_mimetype(file)
file_content_type = get_validated_mimetype(
file, content_type_hint=form_params.content_type
)

_file = file.file

Expand Down Expand Up @@ -781,7 +783,7 @@ def join_responses(
MultipartMixedResponse(
response_generator(is_multipart=True), content_type=form_params.output_format
)
if content_type == "multipart/mixed"
if accept_type == "multipart/mixed"
else (
list(response_generator(is_multipart=False))[0]
if len(files) == 1
Expand Down
11 changes: 11 additions & 0 deletions prepline_general/api/models/form_params.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ class GeneralFormParams(BaseModel):
output_format: str
coordinates: bool
encoding: str
content_type: Optional[str]
hi_res_model_name: Optional[str]
include_page_breaks: bool
pdf_infer_table_structure: bool
Expand Down Expand Up @@ -100,6 +101,15 @@ def as_form(
),
BeforeValidator(SmartValueParser[bool]().value_or_first_element),
] = False,
content_type: Annotated[
Optional[str],
Form(
title="Content type",
description="A hint about the content type to use (such as text/markdown), when there are problems processing a specific file. This value is a MIME type in the format type/subtype.",
example="text/markdown",
),
BeforeValidator(SmartValueParser[str]().value_or_first_element),
] = None,
encoding: Annotated[
str,
Form(
Expand Down Expand Up @@ -245,6 +255,7 @@ def as_form(
gz_uncompressed_content_type=gz_uncompressed_content_type,
output_format=output_format,
coordinates=coordinates,
content_type=content_type,
encoding=encoding,
hi_res_model_name=hi_res_model_name,
include_page_breaks=include_page_breaks,
Expand Down
2 changes: 1 addition & 1 deletion preprocessing-pipeline-family.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name: general
version: 0.0.75
version: 0.0.76
Loading

0 comments on commit 7468938

Please sign in to comment.