From a233d1a9d253816062dad2d2f8ddc72eb01d9e28 Mon Sep 17 00:00:00 2001
From: Alexis VIALARET
Date: Tue, 19 Mar 2024 09:58:07 +0100
Subject: [PATCH] doc: add document loading notebook example

---
 README.md                                  |  36 ++-----
 backend/config.yaml                        |  10 +-
 backend/main.py                            |   1 -
 backend/rag_components/rag.py              |  36 +++----
 data_sample/add_data_sample_to_rag.py      |   9 --
 docs/cookbook/configs/llms_configs.md      |   4 +-
 docs/index.md                              |  35 ++-----
 {data_sample => examples}/billionaires.csv |   0
 examples/load_documents.ipynb              | 108 +++++++++++++++++++++
 9 files changed, 151 insertions(+), 88 deletions(-)
 delete mode 100644 data_sample/add_data_sample_to_rag.py
 rename {data_sample => examples}/billionaires.csv (100%)
 create mode 100644 examples/load_documents.ipynb

diff --git a/README.md b/README.md
index c76ae5c..5f1c60b 100644
--- a/README.md
+++ b/README.md
@@ -17,27 +17,25 @@ This is a starter kit to deploy a modularizable RAG locally or on the cloud (or
 
 ## Quickstart
 
-This quickstart will guide you through the steps to serve a RAG fully locally. You will run the API backend and frontend on your machine, which should allow you to run your first queries against the RAG.
+This quickstart will guide you through the steps to serve the RAG and load a few documents.
+
+You will run both the back and front on your machine.
+
+For this example, we will be using GPT-4, the `BAAI/bge-base-en-v1.5` embedding model, and Chroma for the vector store.
 
-For this exemple, we will be using the `tinyllama` LLM, the `BAAI/bge-base-en-v1.5` embedding model, and Chroma for the vector store. This allows this setup to be fully local, and independent of any external API (and thus, free). However, the relevance of answers will not be impressive.
 Duration: ~15 minutes.
 
 ### Pre-requisites
 
-- Ollama, to serve the LLM locally ([Download and install](https://ollama.com/))
-- A few GB of disk space to host the models
+- An `OPENAI_API_KEY` for the Artefact GPT-4 deployment on Azure. Contact alexis.vialaret@artefact.com if you do not have one.
+- A few GB of disk space
 - Tested with python 3.11 (may work with other versions)
 
 ### Run using docker compose
 
 If you have docker installed and running you can run the whole RAG app using it. [Otherwise, skip to the "Run directly" section](#run-directly)
 
-Start the LLM server:
-```shell
-ollama run tinyllama
-```
-
 Start the service:
 ```shell
 docker compose up -d
 ```
@@ -53,11 +51,6 @@ Go to http://localhost:9000/ to query your RAG.
 
 ### Run directly
 
-Start the LLM server:
-```shell
-ollama run tinyllama
-```
-
 In a fresh env:
 ```shell
 pip install -r requirements-dev.txt
@@ -79,20 +72,9 @@ Start the frontend demo
 python -m streamlit run frontend/front.py
 ```
 
-### Querying and loading the RAG
-
-You should then be able to login and chat to the bot:
-
-![](docs/login_and_chat.gif)
-
-Right now the RAG does not have any document loaded, let's add a sample:
-```shell
-python data_sample/add_data_sample_to_rag.py
-```
-
-The RAG now has access to the information from your loaded documents:
+### Loading documents in the RAG
 
-![](docs/query_with_knowledge.gif)
+Right now the RAG does not have any documents loaded. You can use the notebook in the `examples` folder to transform a file into documents and load them into the vector store.
 
 ## Documentation
 
diff --git a/backend/config.yaml b/backend/config.yaml
index 7831db3..326bff2 100644
--- a/backend/config.yaml
+++ b/backend/config.yaml
@@ -1,9 +1,11 @@
 LLMConfig: &LLMConfig
-  source: ChatOllama
+  source: AzureChatOpenAI
   source_config:
-    model: tinyllama
-    temperature: 0
-    # base_url: http://host.docker.internal:11434 # Uncomment this line if you are running the RAG through Docker Compose
+    openai_api_type: azure
+    openai_api_key: {{ OPENAI_API_KEY }}
+    openai_api_base: https://genai-ds.openai.azure.com/openai/deployments/gpt4
+    openai_api_version: 2023-07-01-preview
+    temperature: 0.1
 
 VectorStoreConfig: &VectorStoreConfig
   source: Chroma
diff --git a/backend/main.py b/backend/main.py
index 4b2ad58..1595601 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -11,7 +11,6 @@
 rag = RAG(config=Path(__file__).parent / "config.yaml")
 chain = rag.get_chain()
 
-
 # Create a minimal RAG server based on langserve
 # Learn how to extend this configuration to add authentication and session management
 # https://artefactory.github.io/skaff-rag-accelerator/backend/plugins/plugins/
diff --git a/backend/rag_components/rag.py b/backend/rag_components/rag.py
index e47ae0c..a7a74f6 100644
--- a/backend/rag_components/rag.py
+++ b/backend/rag_components/rag.py
@@ -1,7 +1,6 @@
 from pathlib import Path
 from typing import List, Union
 
-import sqlalchemy
 from langchain.chat_models.base import BaseChatModel
 from langchain.docstore.document import Document
 from langchain.indexes import SQLRecordManager, index
@@ -67,25 +66,26 @@ def load_file(self, file_path: Path) -> List[Document]:
         filtered_documents = filter_complex_metadata(documents)
         return self.load_documents(filtered_documents)
 
-    def load_documents(self, documents: List[Document], insertion_mode: str = None):
+    def load_documents(self, documents: List[Document], insertion_mode: str = None, namespace: str = "default"):
         insertion_mode = insertion_mode or self.config.vector_store.insertion_mode
 
         record_manager = SQLRecordManager(
-            namespace="vector_store/my_docs", db_url=self.config.database.database_url
+            namespace=namespace, db_url=self.config.database.database_url
         )
-        try:
-            record_manager.create_schema()
-        except sqlalchemy.exc.OperationalError:
-            with Database() as connection:
-                connection.initialize_schema()
-            record_manager.create_schema()
-
-        indexing_output = index(
-            documents,
-            record_manager,
-            self.vector_store,
-            cleanup=insertion_mode,
-            source_id_key="source",
-        )
-        self.logger.info({"event": "load_documents", **indexing_output})
+        record_manager.create_schema()
+
+        self.logger.info(f"Indexing {len(documents)} documents.")
+
+        batch_size = 100
+        for batch in range(0, len(documents), batch_size):
+            self.logger.info(f"Indexing batch {batch} to {min(len(documents), batch + batch_size)}.")
+
+            indexing_output = index(
+                documents[batch : min(len(documents), batch + batch_size)],
+                record_manager,
+                self.vector_store,
+                cleanup=insertion_mode,
+                source_id_key="source",
+            )
+            self.logger.info({"event": "load_documents", **indexing_output})
 
diff --git a/data_sample/add_data_sample_to_rag.py b/data_sample/add_data_sample_to_rag.py
deleted file mode 100644
index 3c73905..0000000
--- a/data_sample/add_data_sample_to_rag.py
+++ /dev/null
@@ -1,9 +0,0 @@
-from pathlib import Path
-
-from backend.rag_components.rag import RAG
-
-config_directory = Path("backend/config.yaml")
-rag = RAG(config_directory)
-
-data_sample_path = Path("data_sample/billionaires_csv.csv")
-print(rag.load_file(data_sample_path))
diff --git a/docs/cookbook/configs/llms_configs.md b/docs/cookbook/configs/llms_configs.md
index 9e949af..e7ca9c4 100644
--- a/docs/cookbook/configs/llms_configs.md
+++ b/docs/cookbook/configs/llms_configs.md
@@ -10,7 +10,7 @@ LLMConfig: &LLMConfig
     openai_api_base: https://genai-ds.openai.azure.com/
     openai_api_version: 2023-07-01-preview
     deployment_name: gpt4
-    temperature: 0.1
+    temperature: 0.1
 ```
 
 ## Local llama2
@@ -58,5 +58,5 @@ LLMConfig: &LLMConfig
   source: ChatVertexAI
   source_config:
     model_name: gemini-pro
-    temperature: 0.1
+    temperature: 0.1
 ```
diff --git a/docs/index.md b/docs/index.md
index 903c68d..9d6967b 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -17,27 +17,24 @@ This is a starter kit to deploy a modularizable RAG locally or on the cloud (or
 
 ## Quickstart
 
-This quickstart will guide you through the steps to serve a RAG fully locally. You will run the API backend and frontend on your machine, which should allow you to run your first queries against the RAG.
+This quickstart will guide you through the steps to serve the RAG and load a few documents.
 
-For this exemple, we will be using the `tinyllama` LLM, the `BAAI/bge-base-en-v1.5` embedding model, and Chroma for the vector store. This allows this setup to be fully local, and independent of any external API (and thus, free). However, the relevance of answers will not be impressive.
+You will run both the back and front on your machine.
+
+For this example, we will be using GPT-4, the `BAAI/bge-base-en-v1.5` embedding model, and Chroma for the vector store.
 
 Duration: ~15 minutes.
 
 ### Pre-requisites
 
-- Ollama, to serve the LLM locally ([Download and install](https://ollama.com/))
-- A few GB of disk space to host the models
+- An `OPENAI_API_KEY` for the Artefact GPT-4 deployment on Azure. Contact alexis.vialaret@artefact.com if you do not have one.
+- A few GB of disk space
 - Tested with python 3.11 (may work with other versions)
 
 ### Run using docker compose
 
 If you have docker installed and running you can run the whole RAG app using it. [Otherwise, skip to the "Run directly" section](index.md#run-directly)
 
-Start the LLM server:
-```shell
-ollama run tinyllama
-```
-
 Start the service:
 ```shell
 docker compose up -d
@@ -53,11 +50,6 @@ Go to http://localhost:9000/ to query your RAG.
 
 ### Run directly
 
-Start the LLM server:
-```shell
-ollama run tinyllama
-```
-
 In a fresh env:
 ```shell
 pip install -r requirements-dev.txt
@@ -79,20 +71,9 @@ Start the frontend demo
 python -m streamlit run frontend/front.py
 ```
 
-### Querying and loading the RAG
-
-You should then be able to login and chat to the bot:
-
-![](login_and_chat.gif)
-
-Right now the RAG does not have any document loaded, let's add a sample:
-```shell
-python data_sample/add_data_sample_to_rag.py
-```
-
-The RAG now has access to the information from your loaded documents:
+### Loading documents in the RAG
 
-![](query_with_knowledge.gif)
+Right now the RAG does not have any documents loaded. You can use the notebook in the `examples` folder to transform a file into documents and load them into the vector store.
 
 ## Documentation
 
diff --git a/data_sample/billionaires.csv b/examples/billionaires.csv
similarity index 100%
rename from data_sample/billionaires.csv
rename to examples/billionaires.csv
diff --git a/examples/load_documents.ipynb b/examples/load_documents.ipynb
new file mode 100644
index 0000000..524aa21
--- /dev/null
+++ b/examples/load_documents.ipynb
@@ -0,0 +1,108 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This is an interactive example that will walk you through the initialization of a RAG and the basic embedding of a few documents."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "import sys\n",
+    "import os\n",
+    "repo_root = Path(os.getcwd()).parent\n",
+    "sys.path.append(str(repo_root))\n",
+    "\n",
+    "from backend.config import RagConfig\n",
+    "from backend.rag_components.rag import RAG\n",
+    "\n",
+    "rag_config = RagConfig.from_yaml(repo_root / \"backend\" / \"config.yaml\")\n",
+    "rag_config.database.database_url = f\"sqlite:////{repo_root}/database/rag.sqlite3\"\n",
+    "\n",
+    "rag = RAG(config=rag_config)\n",
+    "\n",
+    "print(\"LLM:\", rag.llm.__class__.__name__)\n",
+    "print(\"Embedding model:\", rag.embeddings.__class__.__name__)\n",
+    "print(\"Vector store:\", rag.vector_store.__class__.__name__)\n",
+    "print(\"Retriever:\", rag.retriever.__class__.__name__)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Here we transform our CSV into standalone embeddable documents that we will be able to feed to the vector store.\n",
+    "\n",
+    "We generate one document for each row, and each document will contain header:value pairs for all the columns.\n",
+    "\n",
+    "This is a very simplistic example, but vector store data models can get more advanced to support more [powerful retrieval methods](https://python.langchain.com/docs/modules/data_connection/retrievers/)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.document_loaders.csv_loader import CSVLoader\n",
+    "from langchain.vectorstores.utils import filter_complex_metadata\n",
+    "\n",
+    "\n",
+    "data_sample_path = repo_root / \"examples\" / \"billionaires.csv\"\n",
+    "\n",
+    "loader = CSVLoader(\n",
+    "    file_path=str(data_sample_path),\n",
+    "    csv_args={\"delimiter\": \",\", \"quotechar\": '\"', \"escapechar\": \"\\\\\"},\n",
+    "    encoding=\"utf-8-sig\",\n",
+    ")\n",
+    "\n",
+    "raw_documents = loader.load()\n",
+    "documents = filter_complex_metadata(raw_documents)\n",
+    "documents[:5]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To load the docs into the vector store, we recommend using the `load_documents` method, as it [indexes previously embedded docs](https://python.langchain.com/docs/modules/data_connection/indexing), making the process idempotent."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "rag.load_documents(documents)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
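
For reference, the flow in `examples/load_documents.ipynb` can also be run as a plain script. The sketch below is a condensed, hypothetical version of the notebook, assuming the `RagConfig`/`RAG` interfaces and repository layout shown in this patch and that it is run from the repository root; adjust the paths if your setup differs.

```python
# Sketch of the notebook flow from this patch, not a definitive implementation.
# Assumes execution from the repository root with the backend package importable.
from pathlib import Path

from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores.utils import filter_complex_metadata

from backend.config import RagConfig
from backend.rag_components.rag import RAG

repo_root = Path.cwd()  # adjust if you run this from another directory

# Build the RAG from the same config file the backend uses.
rag_config = RagConfig.from_yaml(repo_root / "backend" / "config.yaml")
rag_config.database.database_url = f"sqlite:////{repo_root}/database/rag.sqlite3"
rag = RAG(config=rag_config)

# Turn each CSV row into a standalone document, then drop metadata types
# the vector store cannot serialize.
loader = CSVLoader(
    file_path=str(repo_root / "examples" / "billionaires.csv"),
    csv_args={"delimiter": ",", "quotechar": '"', "escapechar": "\\"},
    encoding="utf-8-sig",
)
documents = filter_complex_metadata(loader.load())

# Index the documents. load_documents records what has already been embedded,
# so re-running the script does not duplicate entries.
rag.load_documents(documents)
```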
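
The patch also extends the `RAG.load_documents` signature with a `namespace` argument and batched indexing. As a hedged usage note based on the code above: `insertion_mode` is forwarded to LangChain's `index()` as its `cleanup` strategy (`None`, `"incremental"`, or `"full"`), and `namespace` keys the `SQLRecordManager` records, so separate datasets can be re-indexed independently. The values below are illustrative examples, not project conventions.

```python
# Hypothetical call showing the parameters introduced by this patch.
rag.load_documents(
    documents,
    insertion_mode="incremental",           # becomes the cleanup mode passed to index()
    namespace="vector_store/billionaires",  # record-manager namespace for this dataset
)
```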