Merge pull request #6 from artefactory/rag-2

Add rag 2
artefactory · Mar 14, 2024 · 868d99d · 868d99d
2 parents 992e4c1 + 82ce197
commit 868d99d
Show file tree

Hide file tree

Showing 10 changed files with 1,229 additions and 1 deletion.
diff --git a/Makefile b/Makefile
@@ -1,6 +1,9 @@
 ingest_rag_1:
 	poetry run python -m backend.rag_1.ingest
 
+ingest_rag_2:
+	poetry run python -m backend.rag_2.ingest
+
 ingest_rag_3:
 	poetry run python -m backend.rag_3.ingest
 

diff --git a/README.md b/README.md
@@ -9,6 +9,7 @@
 
 - [Features](#features)
   - [RAG Option 1](#rag-option-1)
+  - [RAG Option 2](#rag-option-2)
   - [RAG Option 3](#rag-option-3)
 - [Installation](#installation)
 - [Usage](#usage)
@@ -62,6 +63,36 @@ Parameters:
 - `ingest.table_format` : How to extract tables with Unstructured (`text`, `html` or `image`).
 - `ingest.export_extracted` : Whether to export extracted elements in local folder.
 
+### RAG Option 2
+
+Folder: [backend/rag_2](backend/rag_2)
+
+Method:
+
+- Use a multimodal LLM (such as GPT-4V, LLaVA, or FUYU-8b) to produce text summaries from images.
+- Embed and retrieve image summaries and text chunks.
+- Pass image summaries and text chunks to a text LLM for answer synthesis.
+
+Backend:
+
+- Use the [multi-vector retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector)
+  with [Chroma](https://www.trychroma.com/) to store raw text (or tables) and images (in a docstore) along with their summaries (in a vectorstore) for retrieval.
+- Use GPT-4V for image summarization.
+- Use GPT-4 for final answer synthesis from joint review of image summaries and texts (or tables).
+
+Parameters:
+
+- `ingest.clear_database` : Whether to clear the database before ingesting new data.
+- `ingest.metadata_keys` : Unstructured metadata to use.
+- `ingest.table_format` : How to extract tables with Unstructured (`text`, `html` or `image`).
+- `ingest.summarize_text` : Whether to summarize texts with an LLM or use raw texts for retrieval.
+- `ingest.summarize_table` : Whether to summarize tables with an LLM or use raw tables for retrieval.
+- `ingest.vectorstore_source` : The field of documents to add into the vectorstore (`content` or `summary`).
+- `ingest.docstore_source` : The field of documents to add into the docstore (`content` or `summary`).
+- `ingest.export_extracted` : Whether to export extracted elements to a local folder.
+
+In option 2, the vectorstore and docstore must be populated with text documents (text content or summary).
+
 ### RAG Option 3
 
 Folder: [backend/rag_3](backend/rag_3)
@@ -75,7 +106,7 @@ Method:
 Backend:
 
 - Use the [multi-vector retriever](https://python.langchain.com/docs/modules/data_connection/retrievers/multi_vector)
-  with [Chroma](https://www.trychroma.com/) to store raw text and images (in a docstore) along with their summaries (in a vectorstore) for retrieval.
+  with [Chroma](https://www.trychroma.com/) to store raw text (or tables) and images (in a docstore) along with their summaries (in a vectorstore) for retrieval.
 - Use GPT-4V for both image summarization (for retrieval) as well as final answer synthesis from joint review of images and texts (or tables).
 
 Parameters:
@@ -89,6 +120,8 @@ Parameters:
 - `ingest.docstore_source` : The field of documents to add into the docstore (`content` or `summary`).
 - `ingest.export_extracted` : Whether to export extracted elements to a local folder.
 
+In option 3, the vectorstore must be populated with text documents (text content or summary) as in option 2. However, the docstore can be populated with either text or image documents.
+
 ## Installation
 
 To set up the project, ensure you have Python version between 3.10 and 3.11. Then install the dependencies using Poetry:
@@ -117,6 +150,12 @@ To use the RAG Multimodal Demo, follow these steps:
     make ingest_rag_1
     ```
 
+    For RAG Option 2:
+
+    ```bash
+    make ingest_rag_2
+    ```
+
     For RAG Option 3:
 
     ```bash
@@ -136,6 +175,7 @@ This command will launch the backend server, allowing you to access the FastAPI
 
 - FastAPI documentation: <http://0.0.0.0:8000/docs>
 - RAG Option 1 playground interface: <http://0.0.0.0:8000/rag-1/playground/>
+- RAG Option 2 playground interface: <http://0.0.0.0:8000/rag-2/playground/>
 - RAG Option 3 playground interface: <http://0.0.0.0:8000/rag-3/playground/>
 
 ## Development

diff --git a/app/server.py b/app/server.py
@@ -7,6 +7,8 @@
 
 from backend.rag_1.chain import get_chain as get_chain_rag_1
 from backend.rag_1.config import validate_config as validate_config_1
+from backend.rag_2.chain import get_chain as get_chain_rag_2
+from backend.rag_2.config import validate_config as validate_config_2
 from backend.rag_3.chain import get_chain as get_chain_rag_3
 from backend.rag_3.config import validate_config as validate_config_3
 
@@ -26,6 +28,13 @@ async def redirect_root_to_docs() -> RedirectResponse:
     # validate config
     _ = validate_config_1(config_1)
 
+with initialize(config_path="../backend/rag_2", version_base=None):
+    config_2 = compose(config_name="config")
+    print(config_2)
+
+    # validate config
+    _ = validate_config_2(config_2)
+
 with initialize(config_path="../backend/rag_3", version_base=None):
     config_3 = compose(config_name="config")
     print(config_3)
@@ -37,6 +46,9 @@ async def redirect_root_to_docs() -> RedirectResponse:
 chain_rag_1 = get_chain_rag_1(config_1)
 add_routes(app, chain_rag_1, path="/rag-1")
 
+chain_rag_2 = get_chain_rag_2(config_2)
+add_routes(app, chain_rag_2, path="/rag-2")
+
 chain_rag_3 = get_chain_rag_3(config_3)
 add_routes(app, chain_rag_3, path="/rag-3")
 

diff --git a/backend/rag_2/__init__.py b/backend/rag_2/__init__.py
@@ -0,0 +1 @@
+"""RAG Option 2."""
diff --git a/backend/rag_2/chain.py b/backend/rag_2/chain.py
@@ -0,0 +1,47 @@
+"""RAG chain for Option 2."""
+
+from langchain_core.output_parsers.string import StrOutputParser
+from langchain_core.prompts import ChatPromptTemplate
+from langchain_core.runnables import RunnablePassthrough
+from langchain_core.runnables.base import RunnableSequence
+from omegaconf.dictconfig import DictConfig
+
+from backend.utils.llm import get_text_llm
+from backend.utils.retriever import get_retriever
+
+from . import prompts
+
+
+def get_chain(config: DictConfig) -> RunnableSequence:
+    """Constructs a RAG pipeline that retrieves text data from documents.
+
+    The pipeline consists of the following steps:
+    1. Retrieval of documents using a retriever object.
+    2. Prompting the model with the text data.
+    3. Generating responses using a text language model.
+    4. Parsing the string output.
+
+    Args:
+        config (DictConfig): Configuration object.
+
+    Returns:
+        RunnableSequence: RAG pipeline.
+    """
+    retriever = get_retriever(config)
+    model = get_text_llm(config)
+
+    # Prompt template
+    prompt = ChatPromptTemplate.from_template(prompts.RAG_PROMPT)
+
+    # Define the RAG pipeline
+    chain = (
+        {
+            "context": retriever,
+            "question": RunnablePassthrough(),
+        }
+        | prompt
+        | model
+        | StrOutputParser()
+    )
+
+    return chain
diff --git a/backend/rag_2/config.py b/backend/rag_2/config.py
@@ -0,0 +1,153 @@
+"""Configuration schema for the RAG Option 2."""
+
+from typing import Literal
+
+from omegaconf import OmegaConf
+from omegaconf.dictconfig import DictConfig
+from pydantic import BaseModel, ConfigDict, root_validator
+from pydantic.dataclasses import dataclass
+
+
+class HydraObject(BaseModel):
+    """Configuration for objects to be instantiated by Hydra."""
+
+    target: str
+    partial: bool | None
+
+    class Config:
+        """Pydantic configuration."""
+
+        extra = "allow"
+        fields = {"target": "_target_", "partial": "_partial_"}
+
+
+@dataclass(config=ConfigDict(extra="forbid"))
+class PathConfig:
+    """Configuration for paths."""
+
+    docs: str
+    database: str
+    export_extracted: str
+
+
+@dataclass(config=ConfigDict(extra="forbid"))
+class SourceConfig:
+    """Configuration for the vectorstore or docstore source."""
+
+    text: Literal["content", "summary"]
+    table: Literal["content", "summary"]
+    image: Literal["content", "summary"]
+
+
+@dataclass(config=ConfigDict(extra="forbid"))
+class IngestConfig:
+    """Configuration for PDF ingestion."""
+
+    clear_database: bool
+
+    chunking_enable: bool
+    chunking_func: HydraObject
+
+    metadata_keys: list[str]
+    table_format: Literal["text", "html", "image"]
+
+    summarize_text: bool
+    summarize_table: bool
+
+    vectorstore_source: SourceConfig
+    docstore_source: SourceConfig
+
+    export_extracted: bool
+
+    @root_validator(pre=True)
+    def validate_fields(cls, values: dict) -> dict:
+        """Various checks on the fields.
+
+        Args:
+            values (dict): Field values.
+
+        Returns:
+            dict: Validated field values.
+        """
+        table_format = values["table_format"]
+        summarize_text = values["summarize_text"]
+        summarize_table = values["summarize_table"]
+        vectorstore_source = values["vectorstore_source"]
+        docstore_source = values["docstore_source"]
+
+        # Check that summary is enabled when the source is set to "summary"
+        if vectorstore_source["text"] == "summary" and not summarize_text:
+            raise ValueError(
+                "vectorstore_source.text cannot be 'summary' when summarize_text is"
+                " False"
+            )
+        if vectorstore_source["table"] == "summary" and not summarize_table:
+            raise ValueError(
+                "vectorstore_source.table cannot be 'summary' when summarize_table is"
+                " False"
+            )
+        if docstore_source["text"] == "summary" and not summarize_text:
+            raise ValueError(
+                "docstore_source.text cannot be 'summary' when summarize_text is False"
+            )
+        if docstore_source["table"] == "summary" and not summarize_table:
+            raise ValueError(
+                "docstore_source.table cannot be 'summary' when summarize_table is"
+                " False"
+            )
+
+        # Check that the source of vectorstore is not set to "content" when the content
+        # is an image
+        if vectorstore_source["image"] == "content":
+            raise ValueError("vectorstore_source.image cannot be 'content'")
+        if table_format == "image" and vectorstore_source["table"] == "content":
+            raise ValueError(
+                "vectorstore_source.table cannot be 'content' when table_format is"
+                " 'image'"
+            )
+
+        # Check that the source of docstore is not set to "content" when the content
+        # is an image (option 2)
+        if docstore_source["image"] == "content":
+            raise ValueError("docstore_source.image cannot be 'content'")
+        if table_format == "image" and docstore_source["table"] == "content":
+            raise ValueError(
+                "docstore_source.table cannot be 'content' when table_format is"
+                " 'image'"
+            )
+
+        return values
+
+
+@dataclass(config=ConfigDict(extra="forbid"))
+class Config:
+    """Configuration for the RAG Option 2."""
+
+    name: str
+
+    path: PathConfig
+
+    text_llm: HydraObject
+    vision_llm: HydraObject
+    embedding: HydraObject
+    vectorstore: HydraObject
+    store: HydraObject
+    retriever: HydraObject
+
+    ingest: IngestConfig
+
+
+def validate_config(config: DictConfig) -> Config:
+    """Validate the configuration.
+
+    Args:
+        config (DictConfig): Configuration object.
+
+    Returns:
+        Config: Validated configuration object.
+    """
+    # Resolve the DictConfig to a native Python object
+    cfg_obj = OmegaConf.to_object(config)
+    # Instantiate the Config class
+    validated_config = Config(**cfg_obj)
+    return validated_config