langchain-ai · hwchase17 · Feb 7, 2023 · Feb 6, 2023 · Feb 7, 2023 · Feb 7, 2023
diff --git a/docs/modules/document_loaders/examples/googledrive.ipynb b/docs/modules/document_loaders/examples/googledrive.ipynb
@@ -0,0 +1,84 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "b0ed136e-6983-4893-ae1b-b75753af05f8",
+   "metadata": {},
+   "source": [
+    "# Google Drive\n",
+    "This notebook covers how to load documents from Google Drive. Currently, only Google Docs are supported.\n",
+    "\n",
+    "## Prerequisites\n",
+    "\n",
+    "1. Create a Google Cloud project or use an existing project\n",
+    "1. Enable the [Google Drive API](https://console.cloud.google.com/flows/enableapi?apiid=drive.googleapis.com)\n",
+    "1. [Authorize credentials for desktop app](https://developers.google.com/drive/api/quickstart/python#authorize_credentials_for_a_desktop_application)\n",
+    "1. `pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib`\n",
+    "\n",
+    "## 🧑 Instructions for ingesting your Google Docs data\n",
+    "By default, the `GoogleDriveLoader` expects the `credentials.json` file to be at `~/.credentials/credentials.json`, but this is configurable using the `credentials_path` keyword argument. The same applies to `token.json`, which is configurable using the `token_path` keyword argument. Note that `token.json` will be created automatically the first time you use the loader.\n",
+    "\n",
+    "`GoogleDriveLoader` can load from a list of Google Docs document ids or a folder id. You can obtain your folder and document id from the URL:\n",
+    "* Folder: https://drive.google.com/drive/u/0/folders/1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5 -> folder id is `\"1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5\"`\n",
+    "* Document: https://docs.google.com/document/d/1bfaMQ18_i56204VaQDVeAFpqEijJTgvurupdEDiaUQw/edit -> document id is `\"1bfaMQ18_i56204VaQDVeAFpqEijJTgvurupdEDiaUQw\"`"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "878928a6-a5ae-4f74-b351-64e3b01733fe",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import GoogleDriveLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "2216c83f-68e4-4d2f-8ea2-5878fb18bbe7",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "loader = GoogleDriveLoader(folder_id=\"1yucgL9WGgWZdM1TOuKkeghlPizuzMYb5\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8f3b6aa0-b45d-4e37-8c50-5bebe70fdb9d",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "docs = loader.load()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
@@ -1,6 +1,7 @@
 """All different types of document loaders."""
 
 from langchain.document_loaders.directory import DirectoryLoader
+from langchain.document_loaders.googledrive import GoogleDriveLoader
 from langchain.document_loaders.html import UnstructuredHTMLLoader
 from langchain.document_loaders.notion import NotionDirectoryLoader
 from langchain.document_loaders.pdf import UnstructuredPDFLoader
@@ -13,6 +14,7 @@
     "DirectoryLoader",
     "NotionDirectoryLoader",
     "ReadTheDocsLoader",
+    "GoogleDriveLoader",
     "UnstructuredHTMLLoader",
     "UnstructuredPowerPointLoader",
     "UnstructuredPDFLoader",

diff --git a/langchain/document_loaders/googledrive.py b/langchain/document_loaders/googledrive.py
@@ -0,0 +1,215 @@
+"""Loader that loads data from Google Drive."""
+
+# Prerequisites:
+# 1. Create a Google Cloud project
+# 2. Enable the Google Drive API:
+#   https://console.cloud.google.com/flows/enableapi?apiid=drive.googleapis.com
+# 3. Authorize credentials for desktop app:
+#   https://developers.google.com/drive/api/quickstart/python#authorize_credentials_for_a_desktop_application # noqa: E501
+# 4. Install the client libraries:
+#   pip install --upgrade google-api-python-client google-auth-httplib2 google-auth-oauthlib # noqa: E501
+
+
+from pathlib import Path
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, root_validator, validator
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+# Read-only scope: the loader only lists and exports files.
+SCOPES = ["https://www.googleapis.com/auth/drive.readonly"]
+
+# Raised as the ImportError message when a Google client library is missing.
+_MISSING_DEPENDENCY_MESSAGE = (
+    "You must run "
+    "`pip install --upgrade "
+    "google-api-python-client google-auth-httplib2 "
+    "google-auth-oauthlib` "
+    "to use the Google Drive loader."
+)
+
+
+class GoogleDriveLoader(BaseLoader, BaseModel):
+    """Loader that loads Google Docs from Google Drive.
+
+    Exactly one of ``folder_id`` or ``document_ids`` must be set. With
+    ``folder_id`` the Google Docs directly inside that folder are loaded and
+    other file types are skipped; with ``document_ids`` the listed documents
+    are loaded.
+    """
+
+    # OAuth client secrets file used to run the installed-app flow.
+    credentials_path: Path = Path.home() / ".credentials" / "credentials.json"
+    # Cached user token; written by _load_credentials when it is (re)created.
+    token_path: Path = Path.home() / ".credentials" / "token.json"
+    # ID of the Drive folder to load Google Docs from.
+    folder_id: Optional[str] = None
+    # IDs of the individual Google Docs to load.
+    document_ids: Optional[List[str]] = None
+
+    @root_validator
+    def validate_folder_id_or_document_ids(
+        cls, values: Dict[str, Any]
+    ) -> Dict[str, Any]:
+        """Validate that either folder_id or document_ids is set, but not both."""
+        if values.get("folder_id") and values.get("document_ids"):
+            raise ValueError("Cannot specify both folder_id and document_ids")
+        if not values.get("folder_id") and not values.get("document_ids"):
+            raise ValueError("Must specify either folder_id or document_ids")
+        return values
+
+    @validator("credentials_path")
+    def validate_credentials_path(cls, v: Any, **kwargs: Any) -> Any:
+        """Validate that credentials_path exists."""
+        # NOTE(review): pydantic v1 skips field validators for default values
+        # unless always=True, so the default path looks unchecked - confirm.
+        if not v.exists():
+            raise ValueError(f"credentials_path {v} does not exist")
+        return v
+
+    def _load_credentials(self) -> Any:
+        """Load OAuth user credentials, refreshing or creating them if needed.
+
+        A token cached at ``token_path`` is reused while it is valid. An expired
+        token that has a refresh token is refreshed; otherwise the installed-app
+        flow is run from ``credentials_path``. Newly obtained credentials are
+        written back to ``token_path``.
+
+        Returns:
+            The authorized credentials.
+
+        Raises:
+            ImportError: If the Google auth libraries are not installed.
+        """
+        # Adapted from https://developers.google.com/drive/api/v3/quickstart/python
+        try:
+            from google.auth.transport.requests import Request
+            from google.oauth2.credentials import Credentials
+            from google_auth_oauthlib.flow import InstalledAppFlow
+        except ImportError as err:
+            raise ImportError(_MISSING_DEPENDENCY_MESSAGE) from err
+
+        creds = None
+        if self.token_path.exists():
+            creds = Credentials.from_authorized_user_file(str(self.token_path), SCOPES)
+
+        if not creds or not creds.valid:
+            if creds and creds.expired and creds.refresh_token:
+                creds.refresh(Request())
+            else:
+                flow = InstalledAppFlow.from_client_secrets_file(
+                    str(self.credentials_path), SCOPES
+                )
+                creds = flow.run_local_server(port=0)
+            # A custom token_path may point into a directory that does not exist.
+            self.token_path.parent.mkdir(parents=True, exist_ok=True)
+            with open(self.token_path, "w", encoding="utf-8") as token:
+                token.write(creds.to_json())
+
+        return creds
+
+    def _build_service(self) -> Any:
+        """Build an authorized Drive v3 API client.
+
+        Raises:
+            ImportError: If the Google client libraries are not installed.
+        """
+        try:
+            from googleapiclient.discovery import build
+        except ImportError as err:
+            raise ImportError(_MISSING_DEPENDENCY_MESSAGE) from err
+
+        return build("drive", "v3", credentials=self._load_credentials())
+
+    def _load_document_from_id(self, id: str, service: Any = None) -> Document:
+        """Export one Google Doc as plain text and wrap it in a Document.
+
+        Args:
+            id: Drive file ID of the Google Doc.
+            service: Drive API client to reuse. One is built when omitted, so
+                callers loading several documents should pass a shared client.
+
+        Returns:
+            A Document holding the exported text, with the document's edit URL
+            as its ``source`` metadata.
+
+        Raises:
+            ImportError: If the Google client libraries are not installed.
+        """
+        from io import BytesIO
+
+        try:
+            from googleapiclient.http import MediaIoBaseDownload
+        except ImportError as err:
+            raise ImportError(_MISSING_DEPENDENCY_MESSAGE) from err
+
+        if service is None:
+            service = self._build_service()
+
+        request = service.files().export_media(fileId=id, mimeType="text/plain")
+        fh = BytesIO()
+        downloader = MediaIoBaseDownload(fh, request)
+        done = False
+        while not done:
+            # next_chunk returns (status, done); only completion matters here.
+            _, done = downloader.next_chunk()
+        # utf-8-sig also strips a leading byte-order mark if the export has one.
+        text = fh.getvalue().decode("utf-8-sig")
+        metadata = {"source": f"https://docs.google.com/document/d/{id}/edit"}
+        return Document(page_content=text, metadata=metadata)
+
+    def _load_documents_from_folder(self) -> List[Document]:
+        """Load every Google Doc that is a direct child of ``folder_id``.
+
+        The folder listing is paginated, so every result page is fetched before
+        any document is exported. Files that are not Google Docs are skipped.
+        """
+        service = self._build_service()
+
+        items: List[Dict[str, Any]] = []
+        page_token = None
+        while True:
+            results = (
+                service.files()
+                .list(
+                    q=f"'{self.folder_id}' in parents",
+                    pageSize=1000,
+                    fields="nextPageToken, files(id, name, mimeType)",
+                    pageToken=page_token,
+                )
+                .execute()
+            )
+            items.extend(results.get("files", []))
+            page_token = results.get("nextPageToken")
+            if not page_token:
+                break
+
+        # Only support Google Docs for now
+        return [
+            self._load_document_from_id(item["id"], service=service)
+            for item in items
+            if item["mimeType"] == "application/vnd.google-apps.document"
+        ]
+
+    def _load_documents_from_ids(self) -> List[Document]:
+        """Load each document listed in ``document_ids``, in order.
+
+        Raises:
+            ValueError: If ``document_ids`` is unset or empty.
+        """
+        if not self.document_ids:
+            raise ValueError("document_ids must be set")
+
+        service = self._build_service()
+        return [
+            self._load_document_from_id(doc_id, service=service)
+            for doc_id in self.document_ids
+        ]
+
+    def load(self) -> List[Document]:
+        """Load documents from the configured folder or document IDs."""
+        if self.folder_id:
+            return self._load_documents_from_folder()
+        return self._load_documents_from_ids()