Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add GoogleDriveLoader #914

Merged
merged 4 commits into from
Feb 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions docs/modules/document_loaders/examples/googledrive.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
"metadata": {},
"source": [
"# Google Drive\n",
"This notebook covers how to load documents from Google Drive. Currently, only Google Docs are supported.\n",
"\n",
"## Prerequisites\n",
"\n",
"1. Create a Google Cloud project or use an existing project\n",
"1. Enable the [Google Drive API](https://console.cloud.google.com/flows/enableapi?apiid=drive.googleapis.com)\n",
"1. [Authorize credentials for desktop app](https://developers.google.com/drive/api/quickstart/python#authorize_credentials_for_a_desktop_application)\n",
"1. `pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib`\n",
"\n",
"## 🧑 Instructions for ingesting your Google Docs data\n",
"By default, the `GoogleDriveLoader` expects the `credentials.json` file to be at `~/.credentials/credentials.json`, but this is configurable using the `credentials_path` keyword argument. The same applies to `token.json`, which defaults to `~/.credentials/token.json` and is configurable using the `token_path` keyword argument. Note that `token.json` will be created automatically the first time you use the loader.\n",
"\n",
"`GoogleDriveLoader` can load from a list of Google Docs document ids or a folder id. You can obtain your folder and document id from the URL:\n",
"* Folder: https://drive.google.com/drive/u/0/folders/1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5 -> folder id is `\"1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5\"`\n",
"* Document: https://docs.google.com/document/d/1bfaMQ18_i56204VaQDVeAFpqEijJTgvurupdEDiaUQw/edit -> document id is `\"1bfaMQ18_i56204VaQDVeAFpqEijJTgvurupdEDiaUQw\"`"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "878928a6-a5ae-4f74-b351-64e3b01733fe",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.document_loaders import GoogleDriveLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2216c83f-68e4-4d2f-8ea2-5878fb18bbe7",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"loader = GoogleDriveLoader(folder_id=\"1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8f3b6aa0-b45d-4e37-8c50-5bebe70fdb9d",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"docs = loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 2 additions & 0 deletions langchain/document_loaders/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""All different types of document loaders."""

from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders.googledrive import GoogleDriveLoader
from langchain.document_loaders.html import UnstructuredHTMLLoader
from langchain.document_loaders.notion import NotionDirectoryLoader
from langchain.document_loaders.pdf import UnstructuredPDFLoader
Expand All @@ -13,6 +14,7 @@
"DirectoryLoader",
"NotionDirectoryLoader",
"ReadTheDocsLoader",
"GoogleDriveLoader",
"UnstructuredHTMLLoader",
"UnstructuredPowerPointLoader",
"UnstructuredPDFLoader",
Expand Down
141 changes: 141 additions & 0 deletions langchain/document_loaders/googledrive.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
"""Loader that loads data from Google Drive."""

# Prerequisites:
# 1. Create a Google Cloud project
# 2. Enable the Google Drive API:
# https://console.cloud.google.com/flows/enableapi?apiid=drive.googleapis.com
# 3. Authorize credentials for desktop app:
# https://developers.google.com/drive/api/quickstart/python#authorize_credentials_for_a_desktop_application # noqa: E501


from pathlib import Path
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, root_validator, validator

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]


class GoogleDriveLoader(BaseLoader, BaseModel):
    """Loader that loads Google Docs from Google Drive.

    Exactly one of ``folder_id`` or ``document_ids`` must be given. Documents
    are exported as plain text; files in a folder that are not Google Docs
    are skipped.
    """

    # OAuth client secrets file downloaded from the Google Cloud console.
    credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
    # Cached user token; read if present and (re)written after authorization.
    token_path: Path = Path.home() / ".credentials" / "token.json"
    # ID of a Drive folder whose Google Docs should be loaded.
    folder_id: Optional[str] = None
    # Explicit list of Google Docs document IDs to load.
    document_ids: Optional[List[str]] = None

    @root_validator
    def validate_folder_id_or_document_ids(
        cls, values: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Validate that either folder_id or document_ids is set, but not both.

        Raises:
            ValueError: if both or neither of the two fields are provided.
        """
        has_folder = bool(values.get("folder_id"))
        has_documents = bool(values.get("document_ids"))
        if has_folder and has_documents:
            raise ValueError("Cannot specify both folder_id and document_ids")
        if not has_folder and not has_documents:
            raise ValueError("Must specify either folder_id or document_ids")
        return values

    @validator("credentials_path")
    def validate_credentials_path(cls, v: Any, **kwargs: Any) -> Any:
        """Validate that credentials_path exists.

        Raises:
            ValueError: if the given path does not exist on disk.
        """
        # NOTE(review): presumably this only runs for explicitly passed values,
        # not for the default path -- confirm against pydantic validator rules.
        if not v.exists():
            raise ValueError(f"credentials_path {v} does not exist")
        return v

    def _load_credentials(self) -> Any:
        """Load user credentials, running the OAuth flow when necessary.

        A token stored at ``token_path`` is reused when present. If it is
        missing or invalid it is refreshed (when expired and a refresh token
        is available) or a new local-server authorization flow is started
        from ``credentials_path``. The resulting token is written back to
        ``token_path``.

        Returns:
            The credentials object to pass to the Drive API client.

        Raises:
            ImportError: if the Google auth client libraries are not installed.
        """
        # Adapted from https://developers.google.com/drive/api/v3/quickstart/python
        try:
            from google.auth.transport.requests import Request
            from google.oauth2.credentials import Credentials
            from google_auth_oauthlib.flow import InstalledAppFlow
        except ImportError as err:
            # Adjacent literals are concatenated verbatim, so each fragment
            # carries its own separating space.
            raise ImportError(
                "You must run "
                "`pip install --upgrade "
                "google-api-python-client google-auth-httplib2 "
                "google-auth-oauthlib` "
                "to use the Google Drive loader."
            ) from err

        creds = None
        if self.token_path.exists():
            creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            else:
                flow = InstalledAppFlow.from_client_secrets_file(
                    str(self.credentials_path), SCOPES
                )
                creds = flow.run_local_server(port=0)
            # Make sure the target directory exists so a completed
            # authorization is not lost to a failed token write.
            self.token_path.parent.mkdir(parents=True, exist_ok=True)
            with open(self.token_path, "w") as token:
                token.write(creds.to_json())

        return creds

    def _build_service(self) -> Any:
        """Build a Drive v3 API client from freshly loaded credentials."""
        from googleapiclient.discovery import build

        creds = self._load_credentials()
        return build("drive", "v3", credentials=creds)

    def _load_document_from_id(self, id: str, service: Any = None) -> Document:
        """Load a document from an ID.

        Args:
            id: Google Docs document ID.
            service: Optional Drive API client to reuse. When omitted, a new
                client is built for this call.

        Returns:
            A Document holding the plain-text export of the Google Doc, with
            the document's edit URL stored under the ``source`` metadata key.
        """
        from io import BytesIO

        from googleapiclient.http import MediaIoBaseDownload

        if service is None:
            service = self._build_service()

        request = service.files().export_media(fileId=id, mimeType="text/plain")
        fh = BytesIO()
        downloader = MediaIoBaseDownload(fh, request)
        done = False
        # next_chunk() returns (status, done); only the completion flag is used.
        while not done:
            _, done = downloader.next_chunk()
        text = fh.getvalue().decode("utf-8")
        metadata = {"source": f"https://docs.google.com/document/d/{id}/edit"}
        return Document(page_content=text, metadata=metadata)

    def _load_documents_from_folder(self) -> List[Document]:
        """Load every Google Doc that is a direct child of ``folder_id``.

        All result pages of the listing are followed, so folders with more
        entries than a single page holds are loaded completely.
        """
        service = self._build_service()

        docs: List[Document] = []
        page_token: Optional[str] = None
        while True:
            results = (
                service.files()
                .list(
                    q=f"'{self.folder_id}' in parents",
                    pageSize=1000,
                    fields="nextPageToken, files(id, name, mimeType)",
                    pageToken=page_token,
                )
                .execute()
            )
            for item in results.get("files", []):
                # Only support Google Docs for now
                if item["mimeType"] == "application/vnd.google-apps.document":
                    docs.append(
                        self._load_document_from_id(item["id"], service=service)
                    )
            # An absent token means the last page has been consumed.
            page_token = results.get("nextPageToken")
            if not page_token:
                break
        return docs

    def _load_documents_from_ids(self) -> List[Document]:
        """Load documents from a list of IDs.

        Raises:
            ValueError: if ``document_ids`` is unset or empty.
        """
        if not self.document_ids:
            raise ValueError("document_ids must be set")

        # Build the API client once and share it across all documents.
        service = self._build_service()
        return [
            self._load_document_from_id(doc_id, service=service)
            for doc_id in self.document_ids
        ]

    def load(self) -> List[Document]:
        """Load documents from the configured folder or document IDs."""
        if self.folder_id:
            return self._load_documents_from_folder()
        return self._load_documents_from_ids()