-
Notifications
You must be signed in to change notification settings - Fork 16.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
384 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "9fdbd55d", | ||
"metadata": {}, | ||
"source": [ | ||
"# Email\n", | ||
"\n", | ||
"This notebook shows how to load email (`.eml`) files." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "40cd9806", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders import UnstructuredEmailLoader" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "2d20b852", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader = UnstructuredEmailLoader('example_data/fake-email.eml')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "579fa702", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "90c1d899", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"[Document(page_content='This is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)]" | ||
] | ||
}, | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "4ef9a5f4", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.9" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
9 changes: 9 additions & 0 deletions
9
docs/modules/document_loaders/examples/example_data/fake-content.html
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
<!DOCTYPE html> | ||
<html> | ||
<body> | ||
|
||
<h1>My First Heading</h1> | ||
<p>My first paragraph.</p> | ||
|
||
</body> | ||
</html> |
20 changes: 20 additions & 0 deletions
20
docs/modules/document_loaders/examples/example_data/fake-email.eml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
MIME-Version: 1.0 | ||
Date: Fri, 16 Dec 2022 17:04:16 -0500 | ||
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com> | ||
Subject: Test Email | ||
From: Matthew Robinson <mrobinson@unstructured.io> | ||
To: Matthew Robinson <mrobinson@unstructured.io> | ||
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630" | ||
|
||
--00000000000095c9b205eff92630 | ||
Content-Type: text/plain; charset="UTF-8" | ||
This is a test email to use for unit tests. | ||
Important points: | ||
- Roses are red | ||
- Violets are blue | ||
--00000000000095c9b205eff92630 | ||
Content-Type: text/html; charset="UTF-8" | ||
<div dir="ltr"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div> | ||
--00000000000095c9b205eff92630-- |
Binary file added
BIN
+37.5 KB
docs/modules/document_loaders/examples/example_data/fake-power-point.pptx
Binary file not shown.
Binary file not shown.
Binary file added
BIN
+4.47 MB
docs/modules/document_loaders/examples/example_data/layout-parser-paper.pdf
Binary file not shown.
94 changes: 94 additions & 0 deletions
94
docs/modules/document_loaders/examples/microsoft_word.ipynb
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "34c90eed", | ||
"metadata": {}, | ||
"source": [ | ||
"# Microsoft Word\n", | ||
"\n", | ||
"This notebook shows how to load text from Microsoft Word documents." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "28ded768", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders import UnstructuredDocxLoader" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "f1f26035", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader = UnstructuredDocxLoader('example_data/fake.docx')" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "2c87dde9", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"data = loader.load()" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 4, | ||
"id": "0e4a884c", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"data": { | ||
"text/plain": [ | ||
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'example_data/fake.docx'}, lookup_index=0)]" | ||
] | ||
}, | ||
"execution_count": 4, | ||
"metadata": {}, | ||
"output_type": "execute_result" | ||
} | ||
], | ||
"source": [ | ||
"data" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "61953c83", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.9" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "1dc7df1d", | ||
"metadata": {}, | ||
"source": [ | ||
"# Obsidian\n", | ||
"This notebook covers how to load documents from an Obsidian database.\n", | ||
"\n", | ||
"Since Obsidian is just stored on disk as a folder of Markdown files, the loader just takes a path to this directory." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 1, | ||
"id": "007c5cbf", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"from langchain.document_loaders import ObsidianLoader" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "a1caec59", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"loader = ObsidianLoader(\"<path-to-obsidian>\")" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": null, | ||
"id": "b1c30ff7", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"docs = loader.load()" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.10.9" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
"""Loader that loads Microsoft Word files.""" | ||
from typing import List | ||
|
||
from langchain.docstore.document import Document | ||
from langchain.document_loaders.base import BaseLoader | ||
|
||
|
||
class UnstructuredDocxLoader(BaseLoader):
    """Load a Microsoft Word file into one Document using ``unstructured``."""

    def __init__(self, file_path: str):
        """Remember ``file_path`` once ``unstructured`` is known to import.

        Raises:
            ValueError: if the ``unstructured`` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError:
            message = (
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
            raise ValueError(message)
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Partition the file and return its text as a one-element list.

        The single Document records the originating path under the
        ``"source"`` metadata key.
        """
        from unstructured.partition.docx import partition_docx

        parts = partition_docx(filename=self.file_path)
        # Each element is stringified; a blank line separates consecutive ones.
        body = "\n\n".join(str(part) for part in parts)
        return [Document(page_content=body, metadata={"source": self.file_path})]
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
"""Loader that loads email files.""" | ||
from typing import List | ||
|
||
from langchain.docstore.document import Document | ||
from langchain.document_loaders.base import BaseLoader | ||
|
||
|
||
class UnstructuredEmailLoader(BaseLoader):
    """Load an email file into one Document using ``unstructured``."""

    def __init__(self, file_path: str):
        """Remember ``file_path`` once ``unstructured`` is known to import.

        Raises:
            ValueError: if the ``unstructured`` package cannot be imported.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError:
            message = (
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            )
            raise ValueError(message)
        self.file_path = file_path

    def load(self) -> List[Document]:
        """Partition the file and return its text as a one-element list.

        The single Document records the originating path under the
        ``"source"`` metadata key.
        """
        from unstructured.partition.email import partition_email

        parts = partition_email(filename=self.file_path)
        # Each element is stringified; a blank line separates consecutive ones.
        body = "\n\n".join(str(part) for part in parts)
        return [Document(page_content=body, metadata={"source": self.file_path})]
Oops, something went wrong.