diff --git a/CHANGELOG.md b/CHANGELOG.md index 3337a69957..b8c778e3af 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,16 @@ +## 0.14.5-dev0 + +### Enhancements + +* **Use `python-oxmsg` for `partition_msg()`.** Outlook MSG emails are now partitioned using the `python-oxmsg` package which resolves some shortcomings of the prior MSG parser. + +### Features + +### Fixes + +* **8-bit string Outlook MSG files are parsed.** `partition_msg()` is now able to parse non-unicode Outlook MSG emails. +* **Attachments to Outlook MSG files are extracted intact.** `partition_msg()` is now able to extract attachments without corruption. + ## 0.14.4 ### Enhancements diff --git a/pyproject.toml b/pyproject.toml index a7537170ed..ea90b03210 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -31,6 +31,7 @@ lint.select = [ ] lint.ignore = [ "COM812", # -- over aggressively insists on trailing commas where not desireable -- + "PT001", # -- wants empty parens on @pytest.fixture where not used (essentially always) -- "PT005", # -- flags mock fixtures with names intentionally matching private method name -- "PT011", # -- pytest.raises({exc}) too broad, use match param or more specific exception -- "PT012", # -- pytest.raises() block should contain a single simple statement -- diff --git a/requirements/extra-msg.in b/requirements/extra-msg.in index 909f4cb923..b7293bda0d 100644 --- a/requirements/extra-msg.in +++ b/requirements/extra-msg.in @@ -1,4 +1,4 @@ -c ./deps/constraints.txt -c base.txt -msg_parser +python-oxmsg diff --git a/requirements/extra-msg.txt b/requirements/extra-msg.txt index ae9de328e4..d0857bd3e1 100644 --- a/requirements/extra-msg.txt +++ b/requirements/extra-msg.txt @@ -4,7 +4,15 @@ # # pip-compile ./extra-msg.in # -msg-parser==1.2.0 - # via -r ./extra-msg.in +click==8.1.7 + # via + # -c ./base.txt + # python-oxmsg olefile==0.47 - # via msg-parser + # via python-oxmsg +python-oxmsg==0.0.1 + # via -r ./extra-msg.in +typing-extensions==4.12.0 + # via + # -c 
./base.txt + # python-oxmsg diff --git a/test_unstructured/partition/test_msg.py b/test_unstructured/partition/test_msg.py index f6e9ded9ca..cef496cf6d 100644 --- a/test_unstructured/partition/test_msg.py +++ b/test_unstructured/partition/test_msg.py @@ -2,17 +2,19 @@ from __future__ import annotations -import pathlib +import io +from typing import Any -import msg_parser import pytest -from pytest_mock import MockFixture +from oxmsg import Message from test_unstructured.unit_utils import ( + FixtureRequest, LogCaptureFixture, - MonkeyPatch, + Mock, assert_round_trips_through_JSON, example_doc_path, + property_mock, ) from unstructured.chunking.title import chunk_by_title from unstructured.documents.elements import ( @@ -21,9 +23,8 @@ NarrativeText, Title, ) -from unstructured.partition.msg import extract_msg_attachment_info, partition_msg -from unstructured.partition.text import partition_text -from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA +from unstructured.partition.msg import partition_msg +from unstructured.partition.new_msg import MsgPartitionerOptions EXPECTED_MSG_OUTPUT = [ NarrativeText(text="This is a test email to use for unit tests."), @@ -32,15 +33,6 @@ ListItem(text="Violets are blue"), ] -ATTACH_EXPECTED_OUTPUT = [ - { - "filename": "fake-attachment.txt", - "extension": ".txt", - "file_size": "unknown", - "payload": b"Hey this is a fake attachment!", - }, -] - def test_partition_msg_from_filename(): filename = example_doc_path("fake-email.msg") @@ -53,21 +45,18 @@ def test_partition_msg_from_filename(): == ElementMetadata( coordinates=None, filename=filename, - last_modified="2022-12-16T17:04:16-05:00", + last_modified="2023-03-28T17:00:31+00:00", page_number=None, url=None, - sent_from=["Matthew Robinson "], - sent_to=["Matthew Robinson (None)"], + sent_from=['"Matthew Robinson" '], + sent_to=["mrobinson@unstructured.io"], subject="Test Email", filetype="application/vnd.ms-outlook", parent_id=parent_id, 
languages=["eng"], ).to_dict() ) - for element in elements: - assert element.metadata.filename == "fake-email.msg" - if UNSTRUCTURED_INCLUDE_DEBUG_METADATA: - assert {element.metadata.detection_origin for element in elements} == {"msg"} + assert all(e.metadata.filename == "fake-email.msg" for e in elements) def test_partition_msg_from_filename_returns_uns_elements(): @@ -82,17 +71,12 @@ def test_partition_msg_from_filename_with_metadata_filename(): assert all(element.metadata.filename == "test" for element in elements) -class MockMsOxMessage: - def __init__(self, filename: str): - self.body = "Here is an email with plain text." - self.header_dict = {"Content-Type": "text/plain"} - - -def test_partition_msg_from_filename_with_text_content(monkeypatch: MonkeyPatch): - monkeypatch.setattr(msg_parser, "MsOxMessage", MockMsOxMessage) +def test_partition_msg_from_filename_with_text_content(): filename = example_doc_path("fake-email.msg") + elements = partition_msg(filename=filename) - assert str(elements[0]) == "Here is an email with plain text." + + assert str(elements[0]) == "This is a test email to use for unit tests." 
assert elements[0].metadata.filename == "fake-email.msg" assert elements[0].metadata.file_directory == example_doc_path("") @@ -121,17 +105,9 @@ def test_partition_msg_from_file_with_metadata_filename(): assert element.metadata.filename == "test" -def test_extract_attachment_info(): - filename = example_doc_path("fake-email-attachment.msg") - attachment_info = extract_msg_attachment_info(filename) - assert len(attachment_info) > 0 - assert attachment_info == ATTACH_EXPECTED_OUTPUT - - -def test_partition_msg_raises_with_both_specified(): - filename = example_doc_path("fake-email.msg") - with open(filename, "rb") as f, pytest.raises(ValueError): - partition_msg(filename=filename, file=f) +def test_partition_msg_uses_file_path_when_both_are_specified(): + elements = partition_msg(example_doc_path("fake-email.msg"), file=io.BytesIO(b"abcde")) + assert elements == EXPECTED_MSG_OUTPUT def test_partition_msg_raises_with_neither(): @@ -156,102 +132,49 @@ def test_partition_msg_from_file_exclude_metadata(): assert elements[i].metadata.to_dict() == {} -def test_partition_msg_can_process_attachments(tmp_path: pathlib.Path): - file_path = example_doc_path("fake-email-attachment.msg") - tmp_dir_path = str(tmp_path) - extract_msg_attachment_info(filename=file_path, output_dir=tmp_dir_path) - attachment_filename = str(tmp_path / str(ATTACH_EXPECTED_OUTPUT[0]["filename"])) - - mocked_last_modification_date = "2029-07-05T09:24:28" - - attachment_elements = partition_text( - filename=attachment_filename, - metadata_filename=attachment_filename, - metadata_last_modified=mocked_last_modification_date, - ) - expected_metadata = attachment_elements[0].metadata - expected_metadata.file_directory = None - expected_metadata.attached_to_filename = file_path - - elements = partition_msg( - filename=file_path, - attachment_partitioner=partition_text, - process_attachments=True, - metadata_last_modified=mocked_last_modification_date, - ) - - # This test does not need to validate if hierarchy 
is working - # Patch to nullify parent_id - expected_metadata.parent_id = None - elements[-1].metadata.parent_id = None - - assert elements[0].text.startswith("Hello!") - for element in elements[:-1]: - assert element.metadata.filename == "fake-email-attachment.msg" - assert element.metadata.subject == "Fake email with attachment" - assert elements[-1].text == "Hey this is a fake attachment!" - assert elements[-1].metadata == expected_metadata - - -def test_partition_msg_can_process_min_max_wtih_attachments(tmp_path: pathlib.Path): - file_path = example_doc_path("fake-email-attachment.msg") - tmp_dir_path = str(tmp_path) - extract_msg_attachment_info(filename=file_path, output_dir=tmp_dir_path) - attachment_filename = str(tmp_path / str(ATTACH_EXPECTED_OUTPUT[0]["filename"])) - - attachment_elements = partition_text( - filename=attachment_filename, - metadata_filename=attachment_filename, - min_partition=6, - max_partition=12, - ) - +def test_partition_msg_can_process_attachments(): elements = partition_msg( - filename=file_path, - attachment_partitioner=partition_text, - process_attachments=True, - min_partition=6, - max_partition=12, - ) - - assert elements[0].text.startswith("Hello!") - assert elements[-1].text == attachment_elements[-1].text - assert elements[-2].text == attachment_elements[-2].text - for element in elements: - if element.metadata.attached_to_filename is not None: - assert len(element.text) <= 12 - assert len(element.text) >= 6 - - -def test_partition_msg_raises_with_no_partitioner(): - with pytest.raises(ValueError): - partition_msg(example_doc_path("fake-email-attachment.msg"), process_attachments=True) - - -def test_partition_msg_metadata_date_from_header(mocker: MockFixture): - expected_last_modification_date = "2022-12-16T17:04:16-05:00" - - mocker.patch( - "unstructured.partition.msg.get_last_modified_date", - return_value=None, - ) - mocker.patch( - "unstructured.partition.msg.get_last_modified_date_from_file", - return_value=None, + 
example_doc_path("fake-email-attachment.msg"), process_attachments=True ) + assert all(e.metadata.filename == "fake-email-attachment.msg" for e in elements[:8]) + assert elements[8].metadata.filename == "fake-attachment.txt" + assert [e.text for e in elements] == [ + "Hello!\xa0", + "Here's the attachments!", + "It includes:", + "Lots of whitespace", + "Little\xa0to no content", + "and is a quick read", + "Best,", + "Mallori", + "Hey this is a fake attachment!", + ] + assert [type(e).__name__ for e in elements] == [ + "HTMLTitle", + "HTMLNarrativeText", + "HTMLNarrativeText", + "HTMLListItem", + "HTMLListItem", + "HTMLListItem", + "HTMLText", + "HTMLTitle", + "NarrativeText", + ] + + +def test_partition_msg_pulls_last_modified_from_message_sent_date(): elements = partition_msg(example_doc_path("fake-email.msg")) - - assert elements[0].metadata.last_modified == expected_last_modification_date + assert all(e.metadata.last_modified == "2023-03-28T17:00:31+00:00" for e in elements) -def test_partition_msg_from_file_custom_metadata_date(): - expected_last_modification_date = "2020-07-05T09:24:28" +def test_partition_msg_from_file_prefers_metadata_last_modified_when_provided(): + metadata_last_modified = "2020-07-05T09:24:28" with open(example_doc_path("fake-email.msg"), "rb") as f: - elements = partition_msg(file=f, metadata_last_modified=expected_last_modification_date) + elements = partition_msg(file=f, metadata_last_modified=metadata_last_modified) - assert elements[0].metadata.last_modified == expected_last_modification_date + assert all(e.metadata.last_modified == metadata_last_modified for e in elements) def test_partition_msg_custom_metadata_date(): @@ -288,6 +211,9 @@ def test_add_chunking_strategy_by_title_on_partition_msg(): assert chunk_elements == chunks +# -- language behaviors -------------------------------------------------------------------------- + + def test_partition_msg_element_metadata_has_languages(): filename = "example-docs/fake-email.msg" 
elements = partition_msg(filename=filename) @@ -303,4 +229,184 @@ def test_partition_msg_respects_languages_arg(): def test_partition_msg_raises_TypeError_for_invalid_languages(): with pytest.raises(TypeError): filename = "example-docs/fake-email.msg" - partition_msg(filename=filename, languages="eng") # pyright: ignore[reportArgumentType] + partition_msg(filename=filename, languages="eng") + + +# ================================================================================================ +# ISOLATED UNIT TESTS +# ================================================================================================ +# These test components used by `partition_msg()` in isolation such that all edge cases can be +# exercised. +# ================================================================================================ + + +class DescribeMsgPartitionerOptions: + """Unit-test suite for `unstructured.partition.msg.MsgPartitionerOptions` objects.""" + + # -- .is_encrypted --------------------------- + + @pytest.mark.parametrize( + ("file_name", "expected_value"), [("fake-encrypted.msg", True), ("fake-email.msg", False)] + ) + def it_knows_when_the_msg_is_encrypted( + self, file_name: str, expected_value: bool, opts_args: dict[str, Any] + ): + opts_args["file_path"] = example_doc_path(file_name) + opts = MsgPartitionerOptions(**opts_args) + + assert opts.is_encrypted is expected_value + + # -- .metadata_file_path --------------------- + + def it_uses_the_user_provided_metadata_file_path_when_provided(self, opts_args: dict[str, Any]): + opts_args["file_path"] = "x/y/z.msg" + opts_args["metadata_file_path"] = "a/b/c.msg" + opts = MsgPartitionerOptions(**opts_args) + + assert opts.metadata_file_path == "a/b/c.msg" + + @pytest.mark.parametrize("file_path", ["u/v/w.msg", None]) + def and_it_falls_back_to_the_document_file_path_otherwise_including_when_the_file_path_is_None( + self, file_path: str | None, opts_args: dict[str, Any] + ): + opts_args["file_path"] = file_path + 
opts_args["metadata_file_path"] = None + opts = MsgPartitionerOptions(**opts_args) + + assert opts.metadata_file_path == file_path + + # -- .metadata_last_modified ----------------- + + @pytest.mark.parametrize("metadata_last_modified", ["2024-03-05T17:02:53", None]) + def it_knows_the_metadata_last_modified_date_provided_by_the_caller( + self, metadata_last_modified: str | None, opts_args: dict[str, Any] + ): + opts_args["metadata_last_modified"] = metadata_last_modified + opts = MsgPartitionerOptions(**opts_args) + + assert opts.metadata_last_modified == metadata_last_modified + + # -- .msg ------------------------------------ + + def it_loads_the_msg_document_from_a_file_path_when_provided(self, opts_args: dict[str, Any]): + opts_args["file_path"] = example_doc_path("fake-email.msg") + opts = MsgPartitionerOptions(**opts_args) + + assert isinstance(opts.msg, Message) + + def and_it_loads_the_msg_document_from_a_file_like_object_when_provided( + self, opts_args: dict[str, Any] + ): + with open(example_doc_path("fake-email.msg"), "rb") as f: + opts_args["file"] = io.BytesIO(f.read()) + opts = MsgPartitionerOptions(**opts_args) + + assert isinstance(opts.msg, Message) + + def but_it_raises_when_neither_is_provided(self, opts_args: dict[str, Any]): + with pytest.raises(ValueError, match="one of `file` or `filename` arguments must be prov"): + MsgPartitionerOptions(**opts_args).msg + + # -- .msg_metadata --------------------------- + + def it_provides_a_unique_metadata_instance_for_each_element(self, opts_args: dict[str, Any]): + opts_args["file_path"] = example_doc_path("fake-email.msg") + opts = MsgPartitionerOptions(**opts_args) + + assert opts.msg_metadata is not opts.msg_metadata + + # -- .metadata.filename ---------------------- + + def it_uses_the_metadata_file_path_value_for_msg_metadata( + self, opts_args: dict[str, Any], metadata_file_path_prop_: Mock + ): + metadata_file_path_prop_.return_value = "a/b/c.msg" + opts_args["file_path"] = 
example_doc_path("fake-email.msg") + opts = MsgPartitionerOptions(**opts_args) + + assert opts.msg_metadata.filename == "c.msg" + assert opts.msg_metadata.file_directory == "a/b" + + # -- .metadata.last_modified ----------------- + + def it_uses_metadata_last_modified_when_provided_by_caller(self, opts_args: dict[str, Any]): + opts_args["file_path"] = example_doc_path("fake-email.msg") + opts_args["metadata_last_modified"] = "2024-06-03T20:07:31+00:00" + opts = MsgPartitionerOptions(**opts_args) + + assert opts.msg_metadata.last_modified == "2024-06-03T20:07:31+00:00" + + def and_it_uses_the_sent_date_of_the_email_when_metadata_last_modified_is_not_provided( + self, opts_args: dict[str, Any] + ): + opts_args["file_path"] = example_doc_path("fake-email.msg") + opts = MsgPartitionerOptions(**opts_args) + + assert opts.msg_metadata.last_modified == "2023-03-28T17:00:31+00:00" + + @pytest.mark.parametrize("file_last_modified", ["2024-06-03T20:12:53", None]) + def and_it_uses_the_last_modified_date_from_the_source_file_when_the_message_has_no_sent_date( + self, + opts_args: dict[str, Any], + file_last_modified: str | None, + Message_sent_date_: Mock, + _last_modified_prop_: Mock, + ): + Message_sent_date_.return_value = None + _last_modified_prop_.return_value = file_last_modified + opts_args["file_path"] = example_doc_path("fake-email.msg") + opts = MsgPartitionerOptions(**opts_args) + + assert opts.msg_metadata.last_modified == file_last_modified + + # -- .metadata (email-specific) -------------- + + def it_adds_email_specific_fields_to_the_msg_element_metadata(self, opts_args: dict[str, Any]): + opts_args["file_path"] = example_doc_path("fake-email.msg") + opts = MsgPartitionerOptions(**opts_args) + + assert opts.msg_metadata.sent_from == ['"Matthew Robinson" '] + assert opts.msg_metadata.sent_to == ["mrobinson@unstructured.io"] + assert opts.msg_metadata.subject == "Test Email" + + # -- .partition_attachments ------------------ + + 
@pytest.mark.parametrize("partition_attachments", [True, False]) + def it_knows_whether_attachments_should_also_be_partitioned( + self, partition_attachments: bool, opts_args: dict[str, Any] + ): + opts_args["file_path"] = example_doc_path("fake-email.msg") + opts_args["partition_attachments"] = partition_attachments + opts = MsgPartitionerOptions(**opts_args) + + assert opts.partition_attachments is partition_attachments + + # -- fixtures -------------------------------------------------------------------------------- + + @pytest.fixture + def _last_modified_prop_(self, request: FixtureRequest): + return property_mock(request, MsgPartitionerOptions, "_last_modified") + + @pytest.fixture + def Message_sent_date_(self, request: FixtureRequest): + return property_mock(request, Message, "sent_date") + + @pytest.fixture + def metadata_file_path_prop_(self, request: FixtureRequest): + return property_mock(request, MsgPartitionerOptions, "metadata_file_path") + + @pytest.fixture + def opts_args(self) -> dict[str, Any]: + """All default arguments for `MsgPartitionerOptions`. + + Individual argument values can be changed to suit each test. Makes construction of opts more + compact for testing purposes. 
+ """ + return { + "date_from_file_object": False, + "file": None, + "file_path": None, + "metadata_file_path": None, + "metadata_last_modified": None, + "partition_attachments": False, + } diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 9398730982..0951c31533 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.14.4" # pragma: no cover +__version__ = "0.14.5-dev0" # pragma: no cover diff --git a/unstructured/partition/auto.py b/unstructured/partition/auto.py index 544ef345e2..6a57340f74 100644 --- a/unstructured/partition/auto.py +++ b/unstructured/partition/auto.py @@ -72,7 +72,7 @@ PARTITION_WITH_EXTRAS_MAP["md"] = partition_md -if dependency_exists("msg_parser"): +if dependency_exists("oxmsg"): from unstructured.partition.msg import partition_msg PARTITION_WITH_EXTRAS_MAP["msg"] = partition_msg diff --git a/unstructured/partition/msg.py b/unstructured/partition/msg.py index fe1d7f0a25..4cc36c4d3d 100644 --- a/unstructured/partition/msg.py +++ b/unstructured/partition/msg.py @@ -1,239 +1,17 @@ -from __future__ import annotations - -import os -import tempfile -from typing import IO, Any, Callable, Optional +# pyright: reportPrivateUsage=false -import msg_parser +from __future__ import annotations -from unstructured.chunking import add_chunking_strategy -from unstructured.documents.elements import Element, ElementMetadata, process_metadata -from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype -from unstructured.logger import logger -from unstructured.partition.common import ( - exactly_one, - get_last_modified_date, - get_last_modified_date_from_file, +from unstructured.partition.new_msg import ( + MsgPartitionerOptions, + _AttachmentPartitioner, + _MsgPartitioner, + partition_msg, ) -from unstructured.partition.email import convert_to_iso_8601 -from unstructured.partition.html import partition_html -from unstructured.partition.lang import 
apply_lang_metadata -from unstructured.partition.text import partition_text - - -@process_metadata() -@add_metadata_with_filetype(FileType.MSG) -@add_chunking_strategy -def partition_msg( - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - max_partition: Optional[int] = 1500, - include_metadata: bool = True, - metadata_filename: Optional[str] = None, - metadata_last_modified: Optional[str] = None, - process_attachments: bool = False, - attachment_partitioner: Optional[Callable[..., list[Element]]] = None, - min_partition: Optional[int] = 0, - chunking_strategy: Optional[str] = None, - languages: Optional[list[str]] = ["auto"], - detect_language_per_element: bool = False, - date_from_file_object: bool = False, - **kwargs: Any, -) -> list[Element]: - """Partitions a MSFT Outlook .msg file - - Parameters - ---------- - filename - A string defining the target filename path. - file - A file-like object using "rb" mode --> open(filename, "rb"). - max_partition - The maximum number of characters to include in a partition. If None is passed, - no maximum is applied. Only applies if processing text/plain content. - metadata_filename - The filename to use for the metadata. - process_attachments - If True, partition_email will process email attachments in addition to - processing the content of the email itself. - attachment_partitioner - The partitioning function to use to process attachments. - metadata_last_modified - The last modified date for the document. - min_partition - The minimum number of characters to include in a partition. Only applies if - processing text/plain content. - languages - User defined value for `metadata.languages` if provided. Otherwise language is detected - using naive Bayesian filter via `langdetect`. Multiple languages indicates text could be - in either language. - Additional Parameters: - detect_language_per_element - Detect language per element instead of at the document level. 
- date_from_file_object - Applies only when providing file via `file` parameter. If this option is True and inference - from message header failed, attempt to infer last_modified metadata from bytes, - otherwise set it to None. - """ - exactly_one(filename=filename, file=file) - - if filename is not None: - msg_obj = msg_parser.MsOxMessage(filename) - # -- `exactly_one()` call above guarantees `file` is present when `filename` is None -- - else: - assert file is not None - tmp = tempfile.NamedTemporaryFile(delete=False) - tmp.write(file.read()) - tmp.close() - msg_obj = msg_parser.MsOxMessage(tmp.name) - - # NOTE(robinson) - Per RFC 2015, the content type for emails with PGP encrypted - # content is multipart/encrypted - # ref: https://www.ietf.org/rfc/rfc2015.txt - content_type = msg_obj.header_dict.get("Content-Type", "") - is_encrypted = "encrypted" in content_type - text = msg_obj.body - elements: list[Element] = [] - if is_encrypted: - logger.warning( - "Encrypted email detected. Partition function will return an empty list.", - ) - elif text is None: - pass - elif "" in text or "" in text: - elements = partition_html( - text=text, - languages=[""], - include_metadata=False, # metadata is overwritten later, so no need to compute it here - detection_origin="msg", - ) - else: - elements = partition_text( - text=text, - max_partition=max_partition, - min_partition=min_partition, - languages=[""], - include_metadata=False, # metadata is overwritten later, so no need to compute it here - detection_origin="msg", - ) - - last_modification_date = None - if filename is not None: - last_modification_date = get_last_modified_date(filename) - elif file is not None: - last_modification_date = ( - get_last_modified_date_from_file(file) if date_from_file_object else None - ) - for element in elements: - metadata = build_msg_metadata( - msg_obj, - metadata_filename or filename, - metadata_last_modified=metadata_last_modified, - last_modification_date=last_modification_date, - 
) - element.metadata = metadata - - if process_attachments: - with tempfile.TemporaryDirectory() as tmpdir: - extract_msg_attachment_info(msg_obj=msg_obj, output_dir=tmpdir) - attached_files = os.listdir(tmpdir) - for attached_file in attached_files: - attached_filename = os.path.join(tmpdir, attached_file) - if attachment_partitioner is None: - raise ValueError( - "Specify the attachment_partitioner kwarg to process attachments.", - ) - attached_elements = attachment_partitioner( - filename=attached_filename, - metadata_last_modified=metadata_last_modified, - max_partition=max_partition, - min_partition=min_partition, - ) - for element in attached_elements: - element.metadata.filename = attached_file - element.metadata.file_directory = None - element.metadata.attached_to_filename = metadata_filename or filename - elements.append(element) - - elements = list( - apply_lang_metadata( - elements=elements, - languages=languages, - detect_language_per_element=detect_language_per_element, - ), - ) - return elements - - -def build_msg_metadata( - msg_obj: msg_parser.MsOxMessage, - filename: Optional[str], - metadata_last_modified: Optional[str], - last_modification_date: Optional[str], - languages: Optional[list[str]] = ["auto"], -) -> ElementMetadata: - """Creates an ElementMetadata object from the header information in the email.""" - email_date = getattr(msg_obj, "sent_date", None) - if email_date is not None: - email_date = convert_to_iso_8601(email_date) - - sent_from = getattr(msg_obj, "sender", None) - if sent_from is not None: - sent_from = [str(sender) for sender in sent_from] - - sent_to = getattr(msg_obj, "recipients", None) - if sent_to is not None: - sent_to = [str(recipient) for recipient in sent_to] - - element_metadata = ElementMetadata( - sent_to=sent_to, - sent_from=sent_from, - subject=getattr(msg_obj, "subject", None), - last_modified=metadata_last_modified or email_date or last_modification_date, - filename=filename, - languages=languages, - ) - 
element_metadata.detection_origin = "msg" - return element_metadata - - -def extract_msg_attachment_info( - filename: Optional[str] = None, - file: Optional[IO[bytes]] = None, - output_dir: Optional[str] = None, - msg_obj: Optional[msg_parser.MsOxMessage] = None, -) -> list[dict[str, str]]: - """Extracts information from email message attachments and returns a list of dictionaries. - If 'output_dir' is provided, attachments are also saved to that directory. - """ - exactly_one(filename=filename, file=file, msg_obj=msg_obj) - - if filename is not None: - msg_obj = msg_parser.MsOxMessage(filename) - elif file is not None: - tmp = tempfile.NamedTemporaryFile(delete=False) - tmp.write(file.read()) - tmp.close() - msg_obj = msg_parser.MsOxMessage(tmp.name) - else: - assert msg_obj is not None - msg_obj = msg_obj - - list_attachments: list[dict[str, Any]] = [] - - for attachment in msg_obj.attachments: - attachment_info: dict[str, Any] = {} - - attachment_info["filename"] = attachment.AttachLongFilename - attachment_info["extension"] = attachment.AttachExtension - attachment_info["file_size"] = attachment.AttachmentSize - attachment_info["payload"] = attachment.data - - list_attachments.append(attachment_info) - - if output_dir is not None: - output_filename = output_dir + "/" + (attachment_info["filename"] or "unknown") - with open(output_filename, "wb") as f: - f.write(attachment.data) - return list_attachments +__all__ = [ + "MsgPartitionerOptions", + "_AttachmentPartitioner", + "_MsgPartitioner", + "partition_msg", +] diff --git a/unstructured/partition/new_msg.py b/unstructured/partition/new_msg.py new file mode 100644 index 0000000000..ccb87a2366 --- /dev/null +++ b/unstructured/partition/new_msg.py @@ -0,0 +1,309 @@ +from __future__ import annotations + +import copy +import os +import tempfile +from typing import IO, Any, Iterator, Optional + +from oxmsg import Message +from oxmsg.attachment import Attachment + +from unstructured.chunking import 
add_chunking_strategy +from unstructured.documents.elements import Element, ElementMetadata, process_metadata +from unstructured.file_utils.filetype import FileType, add_metadata_with_filetype +from unstructured.logger import logger +from unstructured.partition.common import ( + get_last_modified_date, + get_last_modified_date_from_file, +) +from unstructured.partition.html import partition_html +from unstructured.partition.lang import apply_lang_metadata +from unstructured.partition.text import partition_text +from unstructured.utils import is_temp_file_path, lazyproperty + + +@process_metadata() +@add_metadata_with_filetype(FileType.MSG) +@add_chunking_strategy +def partition_msg( + filename: Optional[str] = None, + *, + file: Optional[IO[bytes]] = None, + date_from_file_object: bool = False, + metadata_filename: Optional[str] = None, + metadata_last_modified: Optional[str] = None, + process_attachments: bool = False, + **kwargs: Any, +) -> list[Element]: + """Partitions a MSFT Outlook .msg file + + Parameters + ---------- + filename + A string defining the target filename path. + file + A file-like object using "rb" mode --> open(filename, "rb"). + date_from_file_object + Applies only when providing file via `file` parameter. If this option is True and inference + from message header failed, attempt to infer last_modified metadata from bytes, + otherwise set it to None. + metadata_filename + The filename to use for the metadata. + metadata_last_modified + The last modified date for the document. + process_attachments + If True, partition_msg will process email attachments in addition to + processing the content of the email itself.
+ """ + opts = MsgPartitionerOptions( + date_from_file_object=date_from_file_object, + file=file, + file_path=filename, + metadata_file_path=metadata_filename, + metadata_last_modified=metadata_last_modified, + partition_attachments=process_attachments, + ) + + return list( + apply_lang_metadata( + elements=_MsgPartitioner.iter_message_elements(opts), + languages=kwargs.get("languages", ["auto"]), + detect_language_per_element=kwargs.get("detect_language_per_element", False), + ) + ) + + +class MsgPartitionerOptions: + """Encapsulates partitioning option validation, computation, and application of defaults.""" + + def __init__( + self, + *, + date_from_file_object: bool, + file: IO[bytes] | None, + file_path: str | None, + metadata_file_path: str | None, + metadata_last_modified: str | None, + partition_attachments: bool, + ): + self._date_from_file_object = date_from_file_object + self._file = file + self._file_path = file_path + self._metadata_file_path = metadata_file_path + self._metadata_last_modified = metadata_last_modified + self._partition_attachments = partition_attachments + + @lazyproperty + def is_encrypted(self) -> bool: + """True when message is encrypted.""" + # NOTE(robinson) - Per RFC 2015, the content type for emails with PGP encrypted content + # is multipart/encrypted (ref: https://www.ietf.org/rfc/rfc2015.txt) + if "encrypted" in self.msg.message_headers.get("Content-Type", ""): + return True + # -- pretty sure we're going to want to dig deeper to discover messages that are encrypted + # -- with something other than PGP. + # - might be able to distinguish based on PID_MESSAGE_CLASS = 'IPM.Note.Signed' + # - Content-Type header might include "application/pkcs7-mime" for Microsoft S/MIME + # encryption. + return False + + @lazyproperty + def metadata_file_path(self) -> str | None: + """Best available path for MSG file. 
+ + The value is the caller supplied `metadata_filename` if present, falling back to the + source file-path if that was provided, otherwise `None`. + """ + return self._metadata_file_path or self._file_path + + @lazyproperty + def metadata_last_modified(self) -> str | None: + """Caller override for `.metadata.last_modified` to be applied to all elements.""" + return self._metadata_last_modified + + @lazyproperty + def msg(self) -> Message: + """The `oxmsg.Message` object loaded from file or filename.""" + return Message.load(self._msg_file) + + @property + def msg_metadata(self) -> ElementMetadata: + """ElementMetadata suitable for use on an element formed from message content. + + A distinct instance is returned on each reference such that downstream changes to the + metadata of one element is not also reflected in another element. + """ + return copy.copy(self._msg_metadata) + + @lazyproperty + def partition_attachments(self) -> bool: + """True when message attachments should also be partitioned.""" + return self._partition_attachments + + @lazyproperty + def partitioning_kwargs(self) -> dict[str, Any]: + """Partitioning keyword-arguments to be passed along to attachment partitioner.""" + # TODO: no good reason we can't accept and pass along any file-type specific kwargs + # the caller might want to send along. 
+ return {} + + @lazyproperty + def _last_modified(self) -> str | None: + """The best last-modified date available from source-file, None if not available.""" + if self._file_path: + return ( + None + if is_temp_file_path(self._file_path) + else get_last_modified_date(self._file_path) + ) + + if self._file: + return ( + get_last_modified_date_from_file(self._file) + if self._date_from_file_object + else None + ) + + return None + + @lazyproperty + def _msg_file(self) -> str | IO[bytes]: + """The source for the bytes of the message, either a file-path or a file-like object.""" + if file_path := self._file_path: + return file_path + + if file := self._file: + return file + + raise ValueError("one of `file` or `filename` arguments must be provided") + + @property + def _msg_metadata(self) -> ElementMetadata: + """ElementMetadata "template" for elements of this message. + + None of these metadata fields change based on the element, so compute it once here and then + just make a separate copy for each element. + """ + msg = self.msg + + email_date = sent_date.isoformat() if (sent_date := msg.sent_date) else None + sent_from = [s.strip() for s in sender.split(",")] if (sender := msg.sender) else None + sent_to = [r.email_address for r in msg.recipients] or None + + element_metadata = ElementMetadata( + filename=self.metadata_file_path, + last_modified=self._metadata_last_modified or email_date or self._last_modified, + sent_from=sent_from, + sent_to=sent_to, + subject=msg.subject or None, + ) + element_metadata.detection_origin = "msg" + + return element_metadata + + +class _MsgPartitioner: + """Partitions Outlook email message (MSG) files.""" + + def __init__(self, opts: MsgPartitionerOptions): + self._opts = opts + + @classmethod + def iter_message_elements(cls, opts: MsgPartitionerOptions) -> Iterator[Element]: + """Partition MS Outlook email messages (.msg files) into elements.""" + if opts.is_encrypted: + logger.warning("Encrypted email detected. 
Partitioner will return an empty list.") + return + + yield from cls(opts)._iter_message_elements() + + def _iter_message_elements(self) -> Iterator[Element]: + """Partition MS Outlook email messages (.msg files) into elements.""" + yield from self._iter_message_body_elements() + + if not self._opts.partition_attachments: + return + + for attachment in self._attachments: + yield from _AttachmentPartitioner.iter_elements(attachment, self._opts) + + @lazyproperty + def _attachments(self) -> tuple[Attachment, ...]: + """The `oxmsg.attachment.Attachment` objects for this message.""" + return tuple(self._opts.msg.attachments) + + def _iter_message_body_elements(self) -> Iterator[Element]: + """Partition the message body (but not the attachments).""" + msg = self._opts.msg + + if html_body := msg.html_body: + elements = partition_html(text=html_body, languages=[""]) + elif msg.body: + elements = partition_text(text=msg.body, languages=[""]) + else: + elements: list[Element] = [] + + # -- replace the element metadata with email-specific values -- + for e in elements: + e.metadata = self._opts.msg_metadata + yield e + + +class _AttachmentPartitioner: + """Partitions an attachment to a MSG file.""" + + def __init__(self, attachment: Attachment, opts: MsgPartitionerOptions): + self._attachment = attachment + self._opts = opts + + @classmethod + def iter_elements( + cls, attachment: Attachment, opts: MsgPartitionerOptions + ) -> Iterator[Element]: + """Partition an `oxmsg.attachment.Attachment` from an Outlook email message (.msg file).""" + return cls(attachment, opts)._iter_elements() + + def _iter_elements(self) -> Iterator[Element]: + """Partition the file in an `oxmsg.attachment.Attachment` into elements.""" + from unstructured.partition.auto import partition + + with tempfile.TemporaryDirectory() as tmp_dir_path: + # -- save attachment as file in this temporary directory -- + detached_file_path = os.path.join(tmp_dir_path, self._attachment_file_name) + with 
open(detached_file_path, "wb") as f: + f.write(self._file_bytes) + + # -- partition the attachment -- + for element in partition( + detached_file_path, + metadata_filename=self._attachment_file_name, + metadata_last_modified=self._attachment_last_modified, + **self._opts.partitioning_kwargs, + ): + element.metadata.attached_to_filename = self._opts.metadata_file_path + yield element + + @lazyproperty + def _attachment_file_name(self) -> str: + """The original name of the attached file, no path. + + This value is 'unknown' if it is not present in the MSG file (not expected). + """ + return self._attachment.file_name or "unknown" + + @lazyproperty + def _attachment_last_modified(self) -> str | None: + """ISO8601 string timestamp of attachment last-modified date. + + This value generally available on the attachment and will be the most reliable last-modifed + time. There are fallbacks for when it is not present, ultimately `None` if we have no way + of telling. + """ + if last_modified := self._attachment.last_modified: + return last_modified.isoformat() + return self._opts.metadata_last_modified + + @lazyproperty + def _file_bytes(self) -> bytes: + """The bytes of the attached file.""" + return self._attachment.file_bytes or b""