Skip to content

Commit

Permalink
Harrison/obsidian (#920)
Browse files Browse the repository at this point in the history
  • Loading branch information
hwchase17 authored Feb 7, 2023
1 parent 1e56879 commit 637c0d6
Show file tree
Hide file tree
Showing 15 changed files with 384 additions and 4 deletions.
94 changes: 94 additions & 0 deletions docs/modules/document_loaders/examples/email.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "9fdbd55d",
"metadata": {},
"source": [
"# Email\n",
"\n",
"This notebook shows how to load email (`.eml`) files."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "40cd9806",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import UnstructuredEmailLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2d20b852",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredEmailLoader('example_data/fake-email.eml')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "579fa702",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "90c1d899",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='This is a test email to use for unit tests.\\n\\nImportant points:\\n\\nRoses are red\\n\\nViolets are blue', lookup_str='', metadata={'source': 'example_data/fake-email.eml'}, lookup_index=0)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ef9a5f4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
<!DOCTYPE html>
<html>
<body>

<h1>My First Heading</h1>
<p>My first paragraph.</p>

</body>
</html>
20 changes: 20 additions & 0 deletions docs/modules/document_loaders/examples/example_data/fake-email.eml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
MIME-Version: 1.0
Date: Fri, 16 Dec 2022 17:04:16 -0500
Message-ID: <CADc-_xaLB2FeVQ7mNsoX+NJb_7hAJhBKa_zet-rtgPGenj0uVw@mail.gmail.com>
Subject: Test Email
From: Matthew Robinson <mrobinson@unstructured.io>
To: Matthew Robinson <mrobinson@unstructured.io>
Content-Type: multipart/alternative; boundary="00000000000095c9b205eff92630"

--00000000000095c9b205eff92630
Content-Type: text/plain; charset="UTF-8"
This is a test email to use for unit tests.
Important points:
- Roses are red
- Violets are blue
--00000000000095c9b205eff92630
Content-Type: text/html; charset="UTF-8"
<div dir="ltr"><div>This is a test email to use for unit tests.</div><div><br></div><div>Important points:</div><div><ul><li>Roses are red</li><li>Violets are blue</li></ul></div></div>
--00000000000095c9b205eff92630--
Binary file not shown.
Binary file not shown.
Binary file not shown.
94 changes: 94 additions & 0 deletions docs/modules/document_loaders/examples/microsoft_word.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "34c90eed",
"metadata": {},
"source": [
"# Microsoft Word\n",
"\n",
"This notebook shows how to load text from Microsoft word documents."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "28ded768",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import UnstructuredDocxLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f1f26035",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredDocxLoader('example_data/fake.docx')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "2c87dde9",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0e4a884c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 'example_data/fake.docx'}, lookup_index=0)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "61953c83",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
66 changes: 66 additions & 0 deletions docs/modules/document_loaders/examples/obsidian.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "1dc7df1d",
"metadata": {},
"source": [
"# Obsidian\n",
"This notebook covers how to load documents from an Obsidian database.\n",
"\n",
"Since Obsidian is just stored on disk as a folder of Markdown files, the loader just takes a path to this directory."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "007c5cbf",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import ObsidianLoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a1caec59",
"metadata": {},
"outputs": [],
"source": [
"loader = ObsidianLoader(\"<path-to-obsidian>\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b1c30ff7",
"metadata": {},
"outputs": [],
"source": [
"docs = loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
8 changes: 8 additions & 0 deletions docs/modules/document_loaders/how_to_guides.rst
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,14 @@ There are a lot of different document loaders that LangChain supports. Below are

`PowerPoint <./examples/powerpoint.html>`_: A walkthrough of how to load data from a powerpoint file.

`Email <./examples/email.html>`_: A walkthrough of how to load data from an email (`.eml`) file.

`GoogleDrive <./examples/googledrive.html>`_: A walkthrough of how to load data from Google drive.

`Microsoft Word <./examples/microsoft_word.html>`_: A walkthrough of how to load data from Microsoft Word files.

`Obsidian <./examples/obsidian.html>`_: A walkthrough of how to load data from an Obsidian file dump.

.. toctree::
:maxdepth: 1
:glob:
Expand Down
6 changes: 6 additions & 0 deletions langchain/document_loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
"""All different types of document loaders."""

from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders.docx import UnstructuredDocxLoader
from langchain.document_loaders.email import UnstructuredEmailLoader
from langchain.document_loaders.googledrive import GoogleDriveLoader
from langchain.document_loaders.html import UnstructuredHTMLLoader
from langchain.document_loaders.notion import NotionDirectoryLoader
from langchain.document_loaders.obsidian import ObsidianLoader
from langchain.document_loaders.pdf import UnstructuredPDFLoader
from langchain.document_loaders.powerpoint import UnstructuredPowerPointLoader
from langchain.document_loaders.readthedocs import ReadTheDocsLoader
Expand All @@ -18,4 +21,7 @@
"UnstructuredHTMLLoader",
"UnstructuredPowerPointLoader",
"UnstructuredPDFLoader",
"ObsidianLoader",
"UnstructuredDocxLoader",
"UnstructuredEmailLoader",
]
29 changes: 29 additions & 0 deletions langchain/document_loaders/docx.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Loader that loads Microsoft Word files."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class UnstructuredDocxLoader(BaseLoader):
"""Loader that uses unstructured to load Microsoft Word files."""

def __init__(self, file_path: str):
"""Initialize with file path."""
try:
import unstructured # noqa:F401
except ImportError:
raise ValueError(
"unstructured package not found, please install it with "
"`pip install unstructured`"
)
self.file_path = file_path

def load(self) -> List[Document]:
"""Load file."""
from unstructured.partition.docx import partition_docx

elements = partition_docx(filename=self.file_path)
text = "\n\n".join([str(el) for el in elements])
metadata = {"source": self.file_path}
return [Document(page_content=text, metadata=metadata)]
29 changes: 29 additions & 0 deletions langchain/document_loaders/email.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Loader that loads email files."""
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class UnstructuredEmailLoader(BaseLoader):
"""Loader that uses unstructured to load email files."""

def __init__(self, file_path: str):
"""Initialize with file path."""
try:
import unstructured # noqa:F401
except ImportError:
raise ValueError(
"unstructured package not found, please install it with "
"`pip install unstructured`"
)
self.file_path = file_path

def load(self) -> List[Document]:
"""Load file."""
from unstructured.partition.email import partition_email

elements = partition_email(filename=self.file_path)
text = "\n\n".join([str(el) for el in elements])
metadata = {"source": self.file_path}
return [Document(page_content=text, metadata=metadata)]
Loading

0 comments on commit 637c0d6

Please sign in to comment.