Commit 134d3e6

add rag_2

baptiste-pasquier committed Mar 13, 2024
1 parent f1efcc4 commit 134d3e6
Showing 15 changed files with 1,328 additions and 49 deletions.
52 changes: 50 additions & 2 deletions README.md
@@ -9,6 +9,7 @@

- [Features](#features)
  - [RAG Option 1](#rag-option-1)
  - [RAG Option 2](#rag-option-2)
  - [RAG Option 3](#rag-option-3)
- [Installation](#installation)
- [Usage](#usage)
@@ -25,6 +26,20 @@ including text, images, and tables. It utilizes a retriever to store and manage

![Overview of the three multimodal RAG options](https://blog.langchain.dev/content/images/size/w1600/2023/10/image-22.png)

- **Option 1**: This option retrieves the raw image directly from the dataset and combines it with the raw table and text data. The combined raw data is then processed by a multimodal LLM to generate an answer, so the complete, unprocessed image is used alongside the textual information.
  - Ingestion: multimodal embeddings
  - RAG chain: multimodal LLM

- **Option 2**: Instead of using the raw image, this option retrieves an image summary. The summary, along with the raw table and text data, is fed to a text LLM to generate an answer.
  - Ingestion: multimodal LLM (for summarization) + text embeddings
  - RAG chain: text LLM

- **Option 3**: This option also retrieves an image summary, but unlike Option 2, it passes the raw image to a multimodal LLM for synthesis along with the raw table and text data.
  - Ingestion: multimodal LLM (for summarization) + text embeddings
  - RAG chain: multimodal LLM

For all options, tables can be treated either as text or as images.

### RAG Option 1

Folder: [backend/rag_1](backend/rag_1)
@@ -48,6 +63,32 @@ Parameters:
- `ingest.export_extracted`: Whether to export extracted elements to a local folder.
- `metadata_keys`: Metadata keys from Unstructured to use.

### RAG Option 2

Folder: [backend/rag_2](backend/rag_2)

Method:

- Use a multimodal LLM (such as GPT-4V, LLaVA, or FUYU-8b) to produce text summaries from images (sketched below).
- Embed and retrieve the image summaries and text chunks.
- Pass the image summaries and text chunks to a text LLM for answer synthesis.
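
For illustration, here is a minimal sketch of the image-summarization step using LangChain's multimodal message format. The deployment name and prompt text are assumptions for this example; the project's actual prompt and model settings live in `backend/rag_2/prompts.py` and `config.yaml`, and the usual Azure OpenAI environment variables are assumed to be set.

```python
import base64

from langchain_core.messages import HumanMessage
from langchain_openai import AzureChatOpenAI

# Hypothetical deployment name; the project reads this from config.yaml.
vision_llm = AzureChatOpenAI(deployment_name="gpt-4-vision", max_tokens=1024)


def summarize_image(image_path: str) -> str:
    """Ask the multimodal LLM for a retrieval-friendly summary of one image."""
    with open(image_path, "rb") as f:
        image_b64 = base64.b64encode(f.read()).decode()
    message = HumanMessage(
        content=[
            {"type": "text", "text": "Summarize this image for retrieval."},
            {
                "type": "image_url",
                "image_url": {"url": f"data:image/jpeg;base64,{image_b64}"},
            },
        ]
    )
    return vision_llm.invoke([message]).content
```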

Backend:

- Use the [multi-vector retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector)
  with [Chroma](https://www.trychroma.com/) to store raw text (or tables) and images along with their summaries for retrieval (see the sketch after this list).
- Use GPT-4V for image summarization.
- Use GPT-4 for final answer synthesis from a joint review of the image summaries and texts (or tables).
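
A sketch of that indexing pattern with illustrative data: summaries are embedded in Chroma while the raw elements sit in a docstore, linked by a shared `doc_id`. The project uses `LocalFileStore` and Azure embeddings per its `config.yaml`; the in-memory store below just keeps the example self-contained.

```python
import uuid

from langchain.retrievers.multi_vector import MultiVectorRetriever
from langchain.storage import InMemoryByteStore
from langchain_community.vectorstores import Chroma
from langchain_core.documents import Document
from langchain_openai import AzureOpenAIEmbeddings

vectorstore = Chroma(
    collection_name="summaries",
    embedding_function=AzureOpenAIEmbeddings(deployment="ada"),  # as in config.yaml
)
retriever = MultiVectorRetriever(
    vectorstore=vectorstore,
    byte_store=InMemoryByteStore(),  # the project uses LocalFileStore instead
    id_key="doc_id",
)

# One summary per raw element; both values here are illustrative.
summaries = ["Bar chart of quarterly revenue by region."]
raw_elements = [Document(page_content="<original table, text chunk, or image>")]
doc_ids = [str(uuid.uuid4()) for _ in summaries]

# Embed the summaries for similarity search...
retriever.vectorstore.add_documents(
    [
        Document(page_content=s, metadata={"doc_id": i})
        for s, i in zip(summaries, doc_ids)
    ]
)
# ...and return the linked raw element when a summary matches.
retriever.docstore.mset(list(zip(doc_ids, raw_elements)))
```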

Parameters:

- `ingest.clear_database`: Whether to clear the database before ingesting new data.
- `ingest.table_format`: How to extract tables with Unstructured (`text`, `html` or `image`).
- `ingest.summarize_text`: Whether to summarize texts with an LLM or use raw texts for retrieval.
- `ingest.summarize_table`: Whether to summarize tables with an LLM or use raw tables for retrieval.
- `ingest.export_extracted`: Whether to export extracted elements to a local folder.
- `metadata_keys`: Metadata keys from Unstructured to use.

### RAG Option 3

Folder: [backend/rag_3](backend/rag_3)
@@ -101,6 +142,12 @@ To use the RAG Multimodal Demo, follow these steps:

For RAG Option 1:

```bash
make ingest_rag_1
```

For RAG Option 2:

```bash
make ingest_rag_2
```

For RAG Option 3:

```bash
make ingest_rag_3
```

@@ -119,8 +166,9 @@

```bash
make serve
```

This command will launch the backend server, allowing you to access the FastAPI documentation and playground interfaces:

- FastAPI documentation: <http://0.0.0.0:8000/docs>
- RAG Option 1 playground interface: <http://0.0.0.0:8000/rag-1/playground/>
- RAG Option 2 playground interface: <http://0.0.0.0:8000/rag-2/playground/>
- RAG Option 3 playground interface: <http://0.0.0.0:8000/rag-3/playground/>
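
Beyond the playgrounds, LangServe also exposes each chain over plain HTTP. A hedged example of calling the Option 2 route from Python (the question is illustrative):

```python
from langserve import RemoteRunnable

# Each add_routes() call in app/server.py exposes /invoke, /stream, etc.
rag_2 = RemoteRunnable("http://0.0.0.0:8000/rag-2/")
print(rag_2.invoke("What does the table on page 3 report?"))
```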

## Development

12 changes: 12 additions & 0 deletions app/server.py
@@ -7,6 +7,8 @@

from backend.rag_1.chain import get_chain as get_chain_rag_1
from backend.rag_1.config import validate_config as validate_config_1
from backend.rag_2.chain import get_chain as get_chain_rag_2
from backend.rag_2.config import validate_config as validate_config_2
from backend.rag_3.chain import get_chain as get_chain_rag_3
from backend.rag_3.config import validate_config as validate_config_3

@@ -26,6 +28,13 @@ async def redirect_root_to_docs() -> RedirectResponse:
# validate config
_ = validate_config_1(config_1)

with initialize(config_path="../backend/rag_2", version_base=None):
config_2 = compose(config_name="config")
print(config_2)

# validate config
_ = validate_config_2(config_2)

with initialize(config_path="../backend/rag_3", version_base=None):
config_3 = compose(config_name="config")
print(config_3)
@@ -37,6 +46,9 @@ async def redirect_root_to_docs() -> RedirectResponse:
chain_rag_1 = get_chain_rag_1(config_1)
add_routes(app, chain_rag_1, path="/rag-1")

chain_rag_2 = get_chain_rag_2(config_2)
add_routes(app, chain_rag_2, path="/rag-2")

chain_rag_3 = get_chain_rag_3(config_3)
add_routes(app, chain_rag_3, path="/rag-3")

2 changes: 1 addition & 1 deletion backend/rag_1/notebook.ipynb
@@ -4,7 +4,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# RAG 1 : PDF ingestion and RAG"
+    "# RAG Option 1 : PDF ingestion and RAG"
   ]
  },
  {
1 change: 1 addition & 0 deletions backend/rag_2/__init__.py
@@ -0,0 +1 @@
"""RAG Option 2."""
47 changes: 47 additions & 0 deletions backend/rag_2/chain.py
@@ -0,0 +1,47 @@
"""RAG chain for Option 2."""

from langchain_core.output_parsers.string import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.runnables.base import RunnableSequence
from omegaconf.dictconfig import DictConfig

from backend.utils.llm import get_text_llm
from backend.utils.retriever import get_retriever

from . import prompts


def get_chain(config: DictConfig) -> RunnableSequence:
    """Construct a RAG pipeline that retrieves text data from documents.

    The pipeline consists of the following steps:
    1. Retrieval of documents using a retriever object.
    2. Prompting the model with the text data.
    3. Generating responses using a text language model.
    4. Parsing the string output.

    Args:
        config (DictConfig): Configuration object.

    Returns:
        RunnableSequence: RAG pipeline.
    """
    retriever = get_retriever(config)
    model = get_text_llm(config)

    # Prompt template
    prompt = ChatPromptTemplate.from_template(prompts.RAG_PROMPT)

    # Define the RAG pipeline
    chain = (
        {
            "context": retriever,
            "question": RunnablePassthrough(),
        }
        | prompt
        | model
        | StrOutputParser()
    )

    return chain
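
For context, the chain can be exercised outside the server in the same way `app/server.py` wires it up. A minimal sketch (the relative `config_path` assumes the script runs from `app/`, and the question is illustrative):

```python
from hydra import compose, initialize

from backend.rag_2.chain import get_chain
from backend.rag_2.config import validate_config

# Mirrors app/server.py: compose the Hydra config, validate it, build the chain.
with initialize(config_path="../backend/rag_2", version_base=None):
    config = compose(config_name="config")
validate_config(config)

chain = get_chain(config)
print(chain.invoke("Summarize the main findings of the document."))
```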
107 changes: 107 additions & 0 deletions backend/rag_2/config.py
@@ -0,0 +1,107 @@
"""Configuration schema for the RAG Option 2."""

from typing import Literal

from omegaconf import OmegaConf
from omegaconf.dictconfig import DictConfig
from pydantic import BaseModel, ConfigDict, Field, root_validator
from pydantic.dataclasses import dataclass


class HydraObject(BaseModel):
    """Configuration for objects to be instantiated by Hydra."""

    # Pydantic v2 field aliases replace the v1-only `Config.fields` mapping.
    model_config = ConfigDict(extra="allow", populate_by_name=True)

    target: str = Field(alias="_target_")
    partial: bool | None = Field(default=None, alias="_partial_")


@dataclass(config=ConfigDict(extra="forbid"))
class PathConfig:
    """Configuration for paths."""

    docs: str
    database: str
    export_extracted: str


@dataclass(config=ConfigDict(extra="forbid"))
class IngestConfig:
    """Configuration for PDF ingestion."""

    clear_database: bool

    chunking_enable: bool
    chunking_func: HydraObject

    table_format: Literal["text", "html", "image"]
    summarize_text: bool
    summarize_table: bool

    metadata_keys: list[str]

    export_extracted: bool

    @root_validator(pre=False, skip_on_failure=True)
    def validate_table_format(cls, values: dict) -> dict:
        """Validate the 'table_format' field in relation to 'summarize_table'.

        This validator ensures that if 'table_format' is set to 'image',
        then 'summarize_table' must also be set to True. It enforces the rule
        that image tables require summarization.

        Args:
            values (dict): Dictionary of field values for the IngestConfig class.

        Raises:
            ValueError: If 'table_format' is 'image' and 'summarize_table' is not True.

        Returns:
            dict: The validated field values.
        """
        table_format = values.get("table_format")
        summarize_table = values.get("summarize_table")

        if table_format == "image" and not summarize_table:
            raise ValueError("summarize_table must be True for table_format=image")

        return values


@dataclass(config=ConfigDict(extra="forbid"))
class Config:
    """Configuration for the RAG Option 2."""

    name: str

    path: PathConfig

    text_llm: HydraObject
    vision_llm: HydraObject
    embedding: HydraObject
    vectorstore: HydraObject
    store: HydraObject
    retriever: HydraObject

    ingest: IngestConfig


def validate_config(config: DictConfig) -> Config:
    """Validate the configuration.

    Args:
        config (DictConfig): Configuration object.

    Returns:
        Config: Validated configuration object.
    """
    # Resolve the DictConfig to a native Python object
    cfg_obj = OmegaConf.to_object(config)
    # Instantiate the Config class
    validated_config = Config(**cfg_obj)
    return validated_config
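
As a quick sanity check of the validator above, an invalid combination is rejected (field values are illustrative):

```python
# table_format="image" without table summarization must fail validation.
try:
    IngestConfig(
        clear_database=True,
        chunking_enable=False,
        chunking_func={
            "_target_": "unstructured.chunking.title.chunk_by_title",
            "_partial_": True,
        },
        table_format="image",
        summarize_text=False,
        summarize_table=False,  # invalid: image tables require summarization
        metadata_keys=["filename"],
        export_extracted=False,
    )
except ValueError as err:
    print(err)  # summarize_table must be True for table_format=image
```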
78 changes: 78 additions & 0 deletions backend/rag_2/config.yaml
@@ -0,0 +1,78 @@
name: rag_2

path:
  docs: "docs"
  export_extracted: "${.docs}/extracted/${..name}"
  database: "database/${..name}"

text_llm:
  _target_: langchain_openai.AzureChatOpenAI
  azure_endpoint: ${oc.env:TEXT_OPENAI_ENDPOINT}
  openai_api_key: ${oc.env:TEXT_OPENAI_API_KEY}
  openai_api_version: "2024-02-15-preview"
  deployment_name: "gpt4"
  temperature: 0.0
  max_tokens: 1024

vision_llm:
  _target_: langchain_openai.AzureChatOpenAI
  azure_endpoint: ${oc.env:VISION_OPENAI_ENDPOINT}
  openai_api_key: ${oc.env:VISION_OPENAI_API_KEY}
  openai_api_version: "2024-02-15-preview"
  deployment_name: "gpt-4-vision"
  temperature: 0.0
  max_tokens: 1024

embedding:
  _target_: langchain_openai.AzureOpenAIEmbeddings
  azure_endpoint: ${oc.env:EMBEDDING_OPENAI_ENDPOINT}
  openai_api_key: ${oc.env:EMBEDDING_OPENAI_API_KEY}
  deployment: "ada"
  chunk_size: 500

vectorstore:
  _target_: langchain_community.vectorstores.Chroma
  collection_name: "summaries"
  embedding_function: ${..embedding}
  persist_directory: "${..path.database}/chroma_db"

store:
  _target_: langchain.storage.LocalFileStore
  root_path: "${..path.database}/multi_vector_retriever_metadata/"

retriever:
  _target_: langchain.retrievers.multi_vector.MultiVectorRetriever
  vectorstore: ${..vectorstore}
  byte_store: ${..store}
  id_key: "doc_id"

ingest:
  clear_database: True

  chunking_enable: True
  chunking_func:
    _target_: unstructured.chunking.title.chunk_by_title
    _partial_: True
    max_characters: 4000
    new_after_n_chars: 3800
    combine_text_under_n_chars: 2000

  table_format: "html" # "text" or "html" or "image"
  summarize_text: False
  summarize_table: True

  vectorstore_source: # retrieval step
    text: "content" # "content" or "summary" if enabled
    table: "summary"
    image: "summary"

  docstore_source: # RAG step
    text: "content"
    table: "content"
    image: "summary"

  export_extracted: True

  metadata_keys:
    - filename
    - page_number
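
The `${...}` values above are OmegaConf interpolations: `${oc.env:VAR}` reads an environment variable, a single leading dot resolves against the current node, and each extra dot climbs one level up. A small sketch of the mechanism with toy keys (not the project's full schema):

```python
from omegaconf import OmegaConf

cfg = OmegaConf.create(
    {
        "name": "rag_2",
        "path": {
            "docs": "docs",
            # "${.docs}" is a sibling key; "${..name}" climbs to the root "name".
            "export_extracted": "${.docs}/extracted/${..name}",
        },
    }
)
print(cfg.path.export_extracted)  # docs/extracted/rag_2
```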
