artefactory · baptiste-pasquier · Mar 26, 2024 · Mar 26, 2024 · Mar 26, 2024 · Mar 26, 2024
diff --git a/README.md b/README.md
@@ -44,6 +44,8 @@ For all options, we can choose to treat tables as text or images.
 **Common parameters**:
 
 - `ingest.clear_database` : Whether to clear the database before ingesting new data.
+- `ingest.partition_pdf_func` : Parameters for the Unstructured `partition_pdf` function.
+- `ingest.chunking_func` : Parameters for the Unstructured chunking function.
 - `ingest.metadata_keys` : Unstructured metadata to use.
 - `ingest.table_format` : How to extract table with Unstructured (`text`, `html` or `image`).
 - `ingest.image_min_size` : Minimum relative size for images to be considered.

diff --git a/backend/rag_1/config.py b/backend/rag_1/config.py
@@ -4,7 +4,7 @@
 
 from omegaconf import OmegaConf
 from omegaconf.dictconfig import DictConfig
-from pydantic import BaseModel, ConfigDict, validator
+from pydantic import BaseModel, ConfigDict, root_validator, validator
 from pydantic.dataclasses import dataclass
 
 
@@ -36,6 +36,8 @@ class IngestConfig:
 
     clear_database: bool
 
+    partition_pdf_func: HydraObject
+
     chunking_enable: bool
     chunking_func: HydraObject
 
@@ -46,6 +48,43 @@ class IngestConfig:
 
     export_extracted: bool
 
+    @root_validator(pre=True)
+    def validate_fields(cls, values: dict) -> dict:
+        """Check that `partition_pdf_func` is consistent with `table_format`.
+
+        Args:
+            values (dict): Raw field values (this validator runs with `pre=True`).
+
+        Returns:
+            dict: The field values, returned unchanged when the checks pass.
+        """
+        partition_pdf_func = values["partition_pdf_func"]
+        table_format = values["table_format"]
+
+        # Check that the table structure is to be inferred when the table format is set
+        # to "html"
+        if table_format == "html" and (
+            "infer_table_structure" not in partition_pdf_func
+            or not partition_pdf_func["infer_table_structure"]
+        ):
+            raise ValueError(
+                "partition_pdf_func.infer_table_structure must be True when"
+                " table_format is 'html'"
+            )
+
+        # Check that tables are to be extracted as images when the table format is set
+        # to "image"
+        if table_format == "image" and (
+            "extract_image_block_types" not in partition_pdf_func
+            or "table" not in partition_pdf_func["extract_image_block_types"]
+        ):
+            raise ValueError(
+                "partition_pdf_func.extract_image_block_types must contain 'table'"
+                " when table_format is 'image'"
+            )
+
+        return values
+
     @validator("image_min_size", "table_min_size")
     def validate_size(cls, value: list[float]) -> list[float]:
         """Check that the value is between 0 and 1."""

diff --git a/backend/rag_1/config.yaml b/backend/rag_1/config.yaml
@@ -44,6 +44,16 @@ retriever:
 ingest:
   clear_database: True
 
+  partition_pdf_func:
+    _target_: unstructured.partition.pdf.partition_pdf
+    _partial_: True
+    _convert_: all
+    strategy: "hi_res"
+    infer_table_structure: True
+    hi_res_model_name: "yolox"
+    extract_image_block_types: ["image"]
+    extract_image_block_to_payload: True
+
   chunking_enable: True
   chunking_func:
     _target_: unstructured.chunking.title.chunk_by_title

diff --git a/backend/rag_1/ingest.py b/backend/rag_1/ingest.py
@@ -7,11 +7,11 @@
 import hydra
 from omegaconf.dictconfig import DictConfig
 from tqdm.auto import tqdm
-from unstructured.partition.pdf import partition_pdf
 
 from backend.rag_1.config import validate_config
 from backend.rag_components.unstructured import (
     load_chunking_func,
+    load_partition_pdf_func,
     select_images,
     select_tables,
     select_texts,
@@ -31,12 +31,8 @@ def ingest_pdf(file_path: str | Path, config: DictConfig) -> None:
     logger.info(f"Processing {file_path}")
 
     # Get elements
-    raw_pdf_elements = partition_pdf(
-        filename=file_path,
-        infer_table_structure=True,
-        extract_image_block_types=["image", "table"],
-        extract_image_block_to_payload=True,
-    )
+    partition_pdf = load_partition_pdf_func(config)
+    raw_pdf_elements = partition_pdf(filename=file_path)
 
     # Get images
     images = select_images(

diff --git a/backend/rag_1/notebook.ipynb b/backend/rag_1/notebook.ipynb
@@ -69,14 +69,14 @@
     "from pathlib import Path\n",
     "\n",
     "from hydra import compose, initialize\n",
-    "from unstructured.partition.pdf import partition_pdf\n",
     "\n",
     "from backend.rag_1.chain import get_chain\n",
     "from backend.rag_1.config import validate_config\n",
     "from backend.rag_components.elements import convert_documents_to_elements\n",
     "from backend.rag_components.retriever import get_retriever\n",
     "from backend.rag_components.unstructured import (\n",
     "    load_chunking_func,\n",
+    "    load_partition_pdf_func,\n",
     "    select_images,\n",
     "    select_tables,\n",
     "    select_texts,\n",
@@ -159,12 +159,8 @@
     "t_partition = time.time()\n",
     "\n",
     "# Get elements\n",
-    "raw_pdf_elements = partition_pdf(\n",
-    "    filename=file_path,\n",
-    "    infer_table_structure=True,\n",
-    "    extract_image_block_types=[\"image\", \"table\"],\n",
-    "    extract_image_block_to_payload=True,\n",
-    ")\n",
+    "partition_pdf = load_partition_pdf_func(config)\n",
+    "raw_pdf_elements = partition_pdf(filename=file_path)\n",
     "\n",
     "print(f\"Partition time: {format_time_delta(time.time() - t_partition)}\")"
    ]

diff --git a/backend/rag_2/config.py b/backend/rag_2/config.py
@@ -45,6 +45,8 @@ class IngestConfig:
 
     clear_database: bool
 
+    partition_pdf_func: HydraObject
+
     chunking_enable: bool
     chunking_func: HydraObject
 
@@ -71,12 +73,35 @@ def validate_fields(cls, values: dict) -> dict:
         Returns:
             dict: Validated field values.
         """
+        partition_pdf_func = values["partition_pdf_func"]
         table_format = values["table_format"]
         summarize_text = values["summarize_text"]
         summarize_table = values["summarize_table"]
         vectorstore_source = values["vectorstore_source"]
         docstore_source = values["docstore_source"]
 
+        # Check that the table structure is to be inferred when the table format is set
+        # to "html"
+        if table_format == "html" and (
+            "infer_table_structure" not in partition_pdf_func
+            or not partition_pdf_func["infer_table_structure"]
+        ):
+            raise ValueError(
+                "partition_pdf_func.infer_table_structure must be True when"
+                " table_format is 'html'"
+            )
+
+        # Check that tables are to be extracted as images when the table format is set
+        # to "image"
+        if table_format == "image" and (
+            "extract_image_block_types" not in partition_pdf_func
+            or "table" not in partition_pdf_func["extract_image_block_types"]
+        ):
+            raise ValueError(
+                "partition_pdf_func.extract_image_block_types must contain 'table'"
+                " when table_format is 'image'"
+            )
+
         # Check that summary is enabled when the source is set to "summary"
         if vectorstore_source["text"] == "summary" and not summarize_text:
             raise ValueError(

diff --git a/backend/rag_2/config.yaml b/backend/rag_2/config.yaml
@@ -53,6 +53,16 @@ retriever:
 ingest:
   clear_database: True
 
+  partition_pdf_func:
+    _target_: unstructured.partition.pdf.partition_pdf
+    _partial_: True
+    _convert_: all
+    strategy: "hi_res"
+    infer_table_structure: True
+    hi_res_model_name: "yolox"
+    extract_image_block_types: ["image"]
+    extract_image_block_to_payload: True
+
   chunking_enable: True
   chunking_func:
     _target_: unstructured.chunking.title.chunk_by_title

diff --git a/backend/rag_2/ingest.py b/backend/rag_2/ingest.py
@@ -8,7 +8,6 @@
 import hydra
 from omegaconf.dictconfig import DictConfig
 from tqdm.auto import tqdm
-from unstructured.partition.pdf import partition_pdf
 
 from backend.rag_2 import prompts
 from backend.rag_2.config import validate_config
@@ -21,6 +20,7 @@
 from backend.rag_components.retriever import get_retriever
 from backend.rag_components.unstructured import (
     load_chunking_func,
+    load_partition_pdf_func,
     select_images,
     select_tables,
     select_texts,
@@ -39,12 +39,8 @@ async def ingest_pdf(file_path: str | Path, config: DictConfig) -> None:
     logger.info(f"Processing {file_path}")
 
     # Get elements
-    raw_pdf_elements = partition_pdf(
-        filename=file_path,
-        infer_table_structure=True,
-        extract_image_block_types=["image", "table"],
-        extract_image_block_to_payload=True,
-    )
+    partition_pdf = load_partition_pdf_func(config)
+    raw_pdf_elements = partition_pdf(filename=file_path)
 
     # Get images
     images = select_images(

diff --git a/backend/rag_2/notebook.ipynb b/backend/rag_2/notebook.ipynb
@@ -70,7 +70,6 @@
     "from pathlib import Path\n",
     "\n",
     "from hydra import compose, initialize\n",
-    "from unstructured.partition.pdf import partition_pdf\n",
     "\n",
     "from backend.rag_2 import prompts\n",
     "from backend.rag_2.chain import get_chain\n",
@@ -85,6 +84,7 @@
     "from backend.rag_components.retriever import get_retriever\n",
     "from backend.rag_components.unstructured import (\n",
     "    load_chunking_func,\n",
+    "    load_partition_pdf_func,\n",
     "    select_images,\n",
     "    select_tables,\n",
     "    select_texts,\n",
@@ -166,12 +166,8 @@
     "t_partition = time.time()\n",
     "\n",
     "# Get elements\n",
-    "raw_pdf_elements = partition_pdf(\n",
-    "    filename=file_path,\n",
-    "    infer_table_structure=True,\n",
-    "    extract_image_block_types=[\"image\", \"table\"],\n",
-    "    extract_image_block_to_payload=True,\n",
-    ")\n",
+    "partition_pdf = load_partition_pdf_func(config)\n",
+    "raw_pdf_elements = partition_pdf(filename=file_path)\n",
     "\n",
     "print(f\"Partition time: {format_time_delta(time.time() - t_partition)}\")"
    ]

diff --git a/backend/rag_3/config.py b/backend/rag_3/config.py
@@ -44,6 +44,9 @@ class IngestConfig:
     """Configuration for PDF ingestion."""
 
     clear_database: bool
+
+    partition_pdf_func: HydraObject
+
     chunking_enable: bool
     chunking_func: HydraObject
 
@@ -70,12 +73,35 @@ def validate_fields(cls, values: dict) -> dict:
         Returns:
             dict: Validated field values.
         """
+        partition_pdf_func = values["partition_pdf_func"]
         table_format = values["table_format"]
         summarize_text = values["summarize_text"]
         summarize_table = values["summarize_table"]
         vectorstore_source = values["vectorstore_source"]
         docstore_source = values["docstore_source"]
 
+        # Check that the table structure is to be inferred when the table format is set
+        # to "html"
+        if table_format == "html" and (
+            "infer_table_structure" not in partition_pdf_func
+            or not partition_pdf_func["infer_table_structure"]
+        ):
+            raise ValueError(
+                "partition_pdf_func.infer_table_structure must be True when"
+                " table_format is 'html'"
+            )
+
+        # Check that tables are to be extracted as images when the table format is set
+        # to "image"
+        if table_format == "image" and (
+            "extract_image_block_types" not in partition_pdf_func
+            or "table" not in partition_pdf_func["extract_image_block_types"]
+        ):
+            raise ValueError(
+                "partition_pdf_func.extract_image_block_types must contain 'table'"
+                " when table_format is 'image'"
+            )
+
         # Check that summary is enabled when the source is set to "summary"
         if vectorstore_source["text"] == "summary" and not summarize_text:
             raise ValueError(

diff --git a/backend/rag_3/config.yaml b/backend/rag_3/config.yaml
@@ -55,6 +55,16 @@ retriever:
 ingest:
   clear_database: True
 
+  partition_pdf_func:
+    _target_: unstructured.partition.pdf.partition_pdf
+    _partial_: True
+    _convert_: all
+    strategy: "hi_res"
+    infer_table_structure: False
+    hi_res_model_name: "yolox"
+    extract_image_block_types: ["image", "table"]
+    extract_image_block_to_payload: True
+
   chunking_enable: True
   chunking_func:
     _target_: unstructured.chunking.title.chunk_by_title

diff --git a/backend/rag_3/ingest.py b/backend/rag_3/ingest.py
@@ -8,7 +8,6 @@
 import hydra
 from omegaconf.dictconfig import DictConfig
 from tqdm.auto import tqdm
-from unstructured.partition.pdf import partition_pdf
 
 from backend.rag_3 import prompts
 from backend.rag_3.config import validate_config
@@ -21,6 +20,7 @@
 from backend.rag_components.retriever import get_retriever
 from backend.rag_components.unstructured import (
     load_chunking_func,
+    load_partition_pdf_func,
     select_images,
     select_tables,
     select_texts,
@@ -39,12 +39,8 @@ async def ingest_pdf(file_path: str | Path, config: DictConfig) -> None:
     logger.info(f"Processing {file_path}")
 
     # Get elements
-    raw_pdf_elements = partition_pdf(
-        filename=file_path,
-        infer_table_structure=True,
-        extract_image_block_types=["image", "table"],
-        extract_image_block_to_payload=True,
-    )
+    partition_pdf = load_partition_pdf_func(config)
+    raw_pdf_elements = partition_pdf(filename=file_path)
 
     # Get images
     images = select_images(

diff --git a/backend/rag_3/notebook.ipynb b/backend/rag_3/notebook.ipynb
@@ -70,7 +70,6 @@
     "from pathlib import Path\n",
     "\n",
     "from hydra import compose, initialize\n",
-    "from unstructured.partition.pdf import partition_pdf\n",
     "\n",
     "from backend.rag_3 import prompts\n",
     "from backend.rag_3.chain import get_chain\n",
@@ -85,6 +84,7 @@
     "from backend.rag_components.retriever import get_retriever\n",
     "from backend.rag_components.unstructured import (\n",
     "    load_chunking_func,\n",
+    "    load_partition_pdf_func,\n",
     "    select_images,\n",
     "    select_tables,\n",
     "    select_texts,\n",
@@ -166,12 +166,8 @@
     "t_partition = time.time()\n",
     "\n",
     "# Get elements\n",
-    "raw_pdf_elements = partition_pdf(\n",
-    "    filename=file_path,\n",
-    "    infer_table_structure=True,\n",
-    "    extract_image_block_types=[\"image\", \"table\"],\n",
-    "    extract_image_block_to_payload=True,\n",
-    ")\n",
+    "partition_pdf = load_partition_pdf_func(config)\n",
+    "raw_pdf_elements = partition_pdf(filename=file_path)\n",
     "\n",
     "print(f\"Partition time: {format_time_delta(time.time() - t_partition)}\")"
    ]