Skip to content

Commit

Permalink
fix/Fix MS Office filetype errors and harden docker smoketest (#436)
Browse files Browse the repository at this point in the history
# Changes
**Fix for docx and other office files returning `{"detail":"File type
None is not supported."}`**
After moving to the wolfi base image, the `mimetypes` lib no longer
knows about these file extensions. To avoid issues like this, let's add
an explicit mapping for all the file extensions we care about. I added a
`filetypes.py` and moved `get_validated_mimetype` over. When this file
is imported, we'll call `mimetypes.add_type` for all file extensions we
support.

**Update smoke test coverage**
This bug snuck past because we were already providing the mimetype in
the docker smoke test. I updated `test_happy_path` to test against the
container with and without passing `content_type`. I added some missing
filetypes, and sorted the test params by extension so we can see when
new types are missing.

# Testing
The new smoke test will verify that all filetypes are working. You can
also `make docker-build && make docker-start-api`, and test out the docx
in the sample docs dir. On `main`, this file will give you the error
above.
```
curl 'http://localhost:8000/general/v0/general' \
--form 'files=@"fake.docx"'
```
  • Loading branch information
awalker4 authored Jun 28, 2024
1 parent d5a878f commit 6710df0
Show file tree
Hide file tree
Showing 9 changed files with 187 additions and 114 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## 0.0.72

* Fix certain filetypes failing mimetype lookup in the new base image

## 0.0.71

* replace rockylinux with chainguard/wolfi as a base image for `amd64`
Expand Down
2 changes: 1 addition & 1 deletion prepline_general/api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
app = FastAPI(
title="Unstructured Pipeline API",
summary="Partition documents with the Unstructured library",
version="0.0.71",
version="0.0.72",
docs_url="/general/docs",
openapi_url="/general/openapi.json",
servers=[
Expand Down
107 changes: 107 additions & 0 deletions prepline_general/api/filetypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import mimetypes
import os
from fastapi import UploadFile, HTTPException
from typing import Optional

# Comma-separated allow-list of MIME types the API accepts when the operator
# has not configured one. The value is consumed by splitting on ",", so the
# order, the repeated entries and the trailing comma are kept exactly as-is.
DEFAULT_MIMETYPES = (
    "application/pdf,"
    "application/msword,"
    "image/jpeg,"
    "image/png,"
    "text/markdown,"
    "text/x-markdown,"
    "text/html,"
    "application/vnd.openxmlformats-officedocument.wordprocessingml.document,"
    "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,"
    "application/vnd.ms-excel,"
    "application/vnd.openxmlformats-officedocument.presentationml.presentation,"
    "application/json,"
    "application/vnd.ms-powerpoint,"
    "text/html,"
    "message/rfc822,"
    "text/plain,"
    "image/png,"
    "application/epub,"
    "application/epub+zip,"
    "application/rtf,"
    "text/rtf,"
    "application/vnd.oasis.opendocument.text,"
    "text/csv,"
    "text/x-csv,"
    "application/csv,"
    "application/x-csv,"
    "text/comma-separated-values,"
    "text/x-comma-separated-values,"
    "application/xml,"
    "text/xml,"
    "text/x-rst,"
    "text/prs.fallenstein.rst,"
    "text/tsv,"
    "text/tab-separated-values,"
    "application/x-ole-storage,"
    "application/vnd.ms-outlook,"
    "application/yaml,"
    "application/x-yaml,"
    "text/x-yaml,"
    "text/yaml,"
    "image/bmp,"
    "image/heic,"
    "image/tiff,"
    "text/org,"
)

# Install the default only when the variable is unset or empty, so an explicit
# operator-provided allow-list always wins.
if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES"):
    os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES


def _load_mimetypes() -> None:
    """Register every supported file extension with the stdlib `mimetypes` registry.

    Call this on startup. The registry's contents depend on the host image, so
    rather than trusting it we explicitly add each (extension, mimetype) pair
    we care about. `mimetypes.add_type` overrides any existing mapping for the
    same extension.
    """
    # Keep this table sorted by extension so a missing type is easy to spot.
    expected_mimetypes = (
        (".bmp", "image/bmp"),
        (".csv", "application/csv"),
        (".doc", "application/msword"),
        (".docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
        (".eml", "message/rfc822"),
        (".epub", "application/epub"),
        # NOTE(review): the stdlib also treats ".gz" as an encoding suffix, so
        # guess_type("x.pdf.gz") is expected to report the inner type - confirm
        # this entry is actually reachable.
        (".gz", "application/gzip"),
        (".heic", "image/heic"),
        (".html", "text/html"),
        (".jpeg", "image/jpeg"),
        (".jpg", "image/jpeg"),
        (".json", "application/json"),
        (".md", "text/markdown"),
        (".msg", "application/x-ole-storage"),
        (".odt", "application/vnd.oasis.opendocument.text"),
        (".org", "text/org"),
        (".pdf", "application/pdf"),
        (".png", "image/png"),
        (".ppt", "application/vnd.ms-powerpoint"),
        (".pptx", "application/vnd.openxmlformats-officedocument.presentationml.presentation"),
        (".rst", "text/prs.fallenstein.rst"),
        (".rtf", "application/rtf"),
        (".tiff", "image/tiff"),
        (".tsv", "text/tab-separated-values"),
        (".txt", "text/plain"),
        # "application/vnd.ms-excel" is in the default allow-list, so legacy
        # Excel files need an explicit mapping like every other Office type.
        (".xls", "application/vnd.ms-excel"),
        (".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"),
        (".xml", "text/xml"),
    )

    for extension, mimetype in expected_mimetypes:
        mimetypes.add_type(mimetype, extension)


# Run at import time so the mappings exist before any request is handled.
_load_mimetypes()


def get_validated_mimetype(file: UploadFile) -> Optional[str]:
    """Return the MIME type of `file`, validated against the configured allow-list.

    The type is taken from `file.content_type`. When that is missing or is the
    generic "application/octet-stream", it is guessed from the filename via the
    `mimetypes` lib instead.

    If UNSTRUCTURED_ALLOWED_MIMETYPES is set (a comma-separated list), the
    resulting type must appear in it. Whitespace around entries and empty
    entries (e.g. from a trailing comma) are ignored.

    Raises:
        HTTPException: status 400 when the allow-list is set and the resolved
            type (possibly None) is not in it.
    """
    content_type = file.content_type
    filename = str(file.filename)  # -- "None" when file.filename is None --

    # A missing or generic content type tells us nothing; use the extension.
    if not content_type or content_type == "application/octet-stream":
        content_type = mimetypes.guess_type(filename)[0]

        # Last-resort mapping for the rare case where the mimetypes lib cannot
        # resolve the name. Compared case-insensitively, and ".msg" uses the
        # same type as the extension table registered with mimetypes.
        if not content_type:
            lowered_filename = filename.lower()
            if lowered_filename.endswith(".md"):
                content_type = "text/markdown"
            elif lowered_filename.endswith(".msg"):
                content_type = "application/x-ole-storage"

    allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
    if allowed_mimetypes_str is not None:
        # Tolerate "a, b" style lists and trailing commas in the env var.
        allowed_mimetypes = {
            entry.strip() for entry in allowed_mimetypes_str.split(",") if entry.strip()
        }

        if content_type not in allowed_mimetypes:
            raise HTTPException(
                status_code=400,
                detail=(f"File type {content_type} is not supported."),
            )

    return content_type
68 changes: 3 additions & 65 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
from starlette.types import Send

from prepline_general.api.models.form_params import GeneralFormParams
from prepline_general.api.filetypes import get_validated_mimetype
from unstructured.documents.elements import Element
from unstructured.partition.auto import partition
from unstructured.staging.base import (
Expand All @@ -59,37 +60,6 @@ def is_compatible_response_type(media_type: str, response_type: type) -> bool:

logger = logging.getLogger("unstructured_api")

DEFAULT_MIMETYPES = (
"application/pdf,application/msword,image/jpeg,image/png,text/markdown,"
"text/x-markdown,text/html,"
"application/vnd.openxmlformats-officedocument.wordprocessingml.document,"
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,"
"application/vnd.ms-excel,application/vnd.openxmlformats-officedocument."
"presentationml.presentation,"
"application/json,"
"application/vnd.ms-powerpoint,"
"text/html,message/rfc822,text/plain,image/png,"
"application/epub,application/epub+zip,"
"application/rtf,text/rtf,"
"application/vnd.oasis.opendocument.text,"
"text/csv,text/x-csv,application/csv,application/x-csv,"
"text/comma-separated-values,text/x-comma-separated-values,"
"application/xml,text/xml,text/x-rst,text/prs.fallenstein.rst,"
"text/tsv,text/tab-separated-values,"
"application/x-ole-storage,application/vnd.ms-outlook,"
"application/yaml,"
"application/x-yaml,"
"text/x-yaml,"
"text/yaml,"
"image/bmp,"
"image/heic,"
"image/tiff,"
"text/org,"
)

if not os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES", None):
os.environ["UNSTRUCTURED_ALLOWED_MIMETYPES"] = DEFAULT_MIMETYPES


def get_pdf_splits(pdf_pages: Sequence[PageObject], split_size: int = 1):
"""Given a pdf (PdfReader) with n pages, split it into pdfs each with split_size # of pages.
Expand Down Expand Up @@ -609,38 +579,6 @@ def _set_pdf_infer_table_structure(
return strategy in ("hi_res", "auto") and pdf_infer_table_structure


def get_validated_mimetype(file: UploadFile) -> Optional[str]:
"""The MIME-type of `file`.
The mimetype is computed based on `file.content_type`, or the mimetypes lib if that's too
generic. If the user has set UNSTRUCTURED_ALLOWED_MIMETYPES, validate against this list and
return HTTP 400 for an invalid type.
"""
content_type = file.content_type
filename = str(file.filename) # -- "None" when file.filename is None --
if not content_type or content_type == "application/octet-stream":
content_type = mimetypes.guess_type(filename)[0]

# Some filetypes missing for this library, just hardcode them for now
if not content_type:
if filename.endswith(".md"):
content_type = "text/markdown"
elif filename.endswith(".msg"):
content_type = "message/rfc822"

allowed_mimetypes_str = os.environ.get("UNSTRUCTURED_ALLOWED_MIMETYPES")
if allowed_mimetypes_str is not None:
allowed_mimetypes = allowed_mimetypes_str.split(",")

if content_type not in allowed_mimetypes:
raise HTTPException(
status_code=400,
detail=(f"File type {content_type} is not supported."),
)

return content_type


class MultipartMixedResponse(StreamingResponse):
CRLF = b"\r\n"

Expand Down Expand Up @@ -713,7 +651,7 @@ def return_content_type(filename: str):


@router.get("/general/v0/general", include_in_schema=False)
@router.get("/general/v0.0.71/general", include_in_schema=False)
@router.get("/general/v0.0.72/general", include_in_schema=False)
async def handle_invalid_get_request():
raise HTTPException(
status_code=status.HTTP_405_METHOD_NOT_ALLOWED, detail="Only POST requests are supported."
Expand All @@ -728,7 +666,7 @@ async def handle_invalid_get_request():
description="Description",
operation_id="partition_parameters",
)
@router.post("/general/v0.0.71/general", include_in_schema=False)
@router.post("/general/v0.0.72/general", include_in_schema=False)
def general_partition(
request: Request,
# cannot use annotated type here because of a bug described here:
Expand Down
2 changes: 1 addition & 1 deletion preprocessing-pipeline-family.yaml
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
name: general
version: 0.0.71
version: 0.0.72
Binary file added sample-docs/DA-1p.bmp
Binary file not shown.
Binary file added sample-docs/DA-1p.heic
Binary file not shown.
Binary file added sample-docs/layout-parser-paper-fast.tiff
Binary file not shown.
118 changes: 71 additions & 47 deletions scripts/smoketest.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,72 +49,96 @@ def send_document(


@pytest.mark.parametrize(
"example_filename, content_type",
("extension", "example_filename", "content_type"),
[
# Note(yuming): Please sort filetypes alphabetically according to
# https://github.com/Unstructured-IO/unstructured/blob/main/unstructured/partition/auto.py#L14
("stanley-cups.csv", "application/csv"),
("fake.doc", "application/msword"),
("fake.docx", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"),
("alert.eml", "message/rfc822"),
("announcement.eml", "message/rfc822"),
("fake-email-attachment.eml", "message/rfc822"),
("fake-email-image-embedded.eml", "message/rfc822"),
("fake-email.eml", "message/rfc822"),
("family-day.eml", "message/rfc822"),
("winter-sports.epub", "application/epub"),
("fake-html.html", "text/html"),
pytest.param(
"layout-parser-paper-fast.jpg",
"image/jpeg",
marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
),
("spring-weather.html.json", "application/json"),
("README.md", "text/markdown"),
("fake-email.msg", "application/x-ole-storage"),
("fake.odt", "application/vnd.oasis.opendocument.text"),
# Note(austin) The two inference calls will hang on mac with unsupported hardware error
# Skip these with SKIP_INFERENCE_TESTS=true make docker-test
pytest.param(
"layout-parser-paper.pdf.gz",
"application/gzip",
marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
),
pytest.param(
"layout-parser-paper.pdf",
"application/pdf",
marks=pytest.mark.skipif(skip_inference_tests, reason="emulated architecture"),
(".bmp", "DA-1p.bmp", "image/bmp"),
(".csv", "stanley-cups.csv", "application/csv"),
(".doc", "fake.doc", "application/msword"),
(
".docx",
"fake.docx",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document",
),
("fake-power-point.ppt", "application/vnd.ms-powerpoint"),
(".eml", "fake-email-attachment.eml", "message/rfc822"),
(".epub", "winter-sports.epub", "application/epub"),
(".heic", "DA-1p.heic", "image/heic"),
(".html", "fake-html.html", "text/html"),
(".jpeg", "layout-parser-paper-fast.jpg", "image/jpeg"),
(".md", "README.md", "text/markdown"),
(".msg", "fake-email.msg", "application/x-ole-storage"),
(".odt", "fake.odt", "application/vnd.oasis.opendocument.text"),
(".pdf", "layout-parser-paper.pdf", "application/pdf"),
(".png", "english-and-korean.png", "image/png"),
(".ppt", "fake-power-point.ppt", "application/vnd.ms-powerpoint"),
(
".pptx",
"fake-power-point.pptx",
"application/vnd.openxmlformats-officedocument.presentationml.presentation",
),
("README.rst", "text/prs.fallenstein.rst"),
("fake-doc.rtf", "application/rtf"),
("fake-text.txt", "text/plain"),
("stanley-cups.tsv", "text/tab-separated-values"),
(".rst", "README.rst", "text/prs.fallenstein.rst"),
(".rtf", "fake-doc.rtf", "application/rtf"),
(".tiff", "layout-parser-paper-fast.tiff", "image/tiff"),
(".tsv", "stanley-cups.tsv", "text/tab-separated-values"),
(".txt", "fake-text.txt", "text/plain"),
(
".xlsx",
"stanley-cups.xlsx",
"application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
),
("fake-xml.xml", "text/xml"),
(".xml", "fake-xml.xml", "text/xml"),
(".json", "spring-weather.html.json", "application/json"),
(
".gz",
"layout-parser-paper.pdf.gz",
"application/gzip",
),
],
)
def test_happy_path(example_filename: str, content_type: str):
def test_happy_path_all_types(extension, example_filename: str, content_type: str):
"""
For the files in sample-docs, verify that we get a 200
and some structured response
"""
# The auto strategy will run ocr on these files
# This doesn't always work on our macs
if skip_inference_tests and extension in [
".bmp",
".heic",
".jpeg",
".pdf",
".png",
".tiff",
".gz", # Since we're using a gzipped pdf...
]:
pytest.skip("emulated hardware")

test_file = str(Path("sample-docs") / example_filename)
print(f"sending {content_type}")
json_response = send_document(filenames=[test_file], content_type=content_type)
assert json_response.status_code == 200
assert len(json_response.json()) > 0
assert len("".join(elem["text"] for elem in json_response.json())) > 20

# Verify we can send with explicit content type
response = send_document(filenames=[test_file], content_type=content_type)

if response.status_code != 200:
assert False, response.text

assert len(response.json()) > 0
assert len("".join(elem["text"] for elem in response.json())) > 20

# Verify we can infer the filetype on the server
response = send_document(filenames=[test_file], content_type=None)

if response.status_code != 200:
assert False, response.text

assert len(response.json()) > 0
assert len("".join(elem["text"] for elem in response.json())) > 20

json_response = response

# Verify we can set output type to csv
csv_response = send_document(
filenames=[test_file], content_type=content_type, output_format="text/csv"
filenames=[test_file],
content_type=content_type,
output_format="text/csv",
)
assert csv_response.status_code == 200
assert len(csv_response.text) > 0
Expand Down

0 comments on commit 6710df0

Please sign in to comment.