diff --git a/README.md b/README.md index ecccc38..7a56505 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,8 @@ For all options, we can choose to treat tables as text or images. **Common parameters**: - `ingest.clear_database` : Whether to clear the database before ingesting new data. +- `ingest.partition_pdf_func` : Parameters for Unstructured `partition_pdf` function. +- `ingest.chunking_func` : Parameters for Unstructured chunking function. - `ingest.metadata_keys` : Unstructured metadata to use. - `ingest.table_format` : How to extract table with Unstructured (`text`, `html` or `image`). - `ingest.image_min_size` : Minimum relative size for images to be considered. diff --git a/backend/rag_1/config.py b/backend/rag_1/config.py index 92af83b..84072fb 100644 --- a/backend/rag_1/config.py +++ b/backend/rag_1/config.py @@ -4,7 +4,7 @@ from omegaconf import OmegaConf from omegaconf.dictconfig import DictConfig -from pydantic import BaseModel, ConfigDict, validator +from pydantic import BaseModel, ConfigDict, root_validator, validator from pydantic.dataclasses import dataclass @@ -36,6 +36,8 @@ class IngestConfig: clear_database: bool + partition_pdf_func: HydraObject + chunking_enable: bool chunking_func: HydraObject @@ -46,6 +48,43 @@ class IngestConfig: export_extracted: bool + @root_validator(pre=True) + def validate_fields(cls, values: dict) -> dict: + """Various checks on the fields. + + Args: + values (dict): Field values. + + Returns: + dict: Validated field values. + """ + partition_pdf_func = values["partition_pdf_func"] + table_format = values["table_format"] + + # Check that the table structure is to be inferred when the table format is set + # to "html" + if table_format == "html" and ( + "infer_table_structure" not in partition_pdf_func + or not partition_pdf_func["infer_table_structure"] + ): + raise ValueError( + "partition_pdf_func.infer_table_structure must be True when" + " table_format is 'html'" + ) + + # Check that tables are to be extracted as images when the table format is set + # to "image" + if table_format == "image" and ( + "extract_image_block_types" not in partition_pdf_func + or "table" not in partition_pdf_func["extract_image_block_types"] + ): + raise ValueError( + "partition_pdf_func.extract_image_block_types must contain 'table'" + " when table_format is 'image'" + ) + + return values + @validator("image_min_size", "table_min_size") def validate_size(cls, value: list[float]) -> list[float]: """Check that the value is between 0 and 1.""" diff --git a/backend/rag_1/config.yaml b/backend/rag_1/config.yaml index 79900ab..a03c56e 100644 --- a/backend/rag_1/config.yaml +++ b/backend/rag_1/config.yaml @@ -44,6 +44,16 @@ retriever: ingest: clear_database: True + partition_pdf_func: + _target_: unstructured.partition.pdf.partition_pdf + _partial_: True + _convert_: all + strategy: "hi_res" + infer_table_structure: True + hi_res_model_name: "yolox" + extract_image_block_types: ["image"] + extract_image_block_to_payload: True + chunking_enable: True chunking_func: _target_: unstructured.chunking.title.chunk_by_title diff --git a/backend/rag_1/ingest.py b/backend/rag_1/ingest.py index a09d5f1..3e2ea75 100644 --- a/backend/rag_1/ingest.py +++ b/backend/rag_1/ingest.py @@ -7,11 +7,11 @@ import hydra from omegaconf.dictconfig import DictConfig from tqdm.auto import tqdm -from unstructured.partition.pdf import partition_pdf from backend.rag_1.config import validate_config from backend.rag_components.unstructured import ( load_chunking_func, + load_partition_pdf_func, select_images, select_tables, select_texts, @@ -31,12 +31,8 @@ def ingest_pdf(file_path: str | Path, config: DictConfig) -> None: logger.info(f"Processing {file_path}") # Get elements - raw_pdf_elements = partition_pdf( - filename=file_path, - infer_table_structure=True, - extract_image_block_types=["image", "table"], - extract_image_block_to_payload=True, - ) + partition_pdf = load_partition_pdf_func(config) + raw_pdf_elements = partition_pdf(filename=file_path) # Get images images = select_images( diff --git a/backend/rag_1/notebook.ipynb b/backend/rag_1/notebook.ipynb index abced02..81526fb 100644 --- a/backend/rag_1/notebook.ipynb +++ b/backend/rag_1/notebook.ipynb @@ -69,7 +69,6 @@ "from pathlib import Path\n", "\n", "from hydra import compose, initialize\n", - "from unstructured.partition.pdf import partition_pdf\n", "\n", "from backend.rag_1.chain import get_chain\n", "from backend.rag_1.config import validate_config\n", @@ -77,6 +76,7 @@ "from backend.rag_components.retriever import get_retriever\n", "from backend.rag_components.unstructured import (\n", " load_chunking_func,\n", + " load_partition_pdf_func,\n", " select_images,\n", " select_tables,\n", " select_texts,\n", @@ -159,12 +159,8 @@ "t_partition = time.time()\n", "\n", "# Get elements\n", - "raw_pdf_elements = partition_pdf(\n", - " filename=file_path,\n", - " infer_table_structure=True,\n", - " extract_image_block_types=[\"image\", \"table\"],\n", - " extract_image_block_to_payload=True,\n", - ")\n", + "partition_pdf = load_partition_pdf_func(config)\n", + "raw_pdf_elements = partition_pdf(filename=file_path)\n", "\n", "print(f\"Partition time: {format_time_delta(time.time() - t_partition)}\")" ] diff --git a/backend/rag_2/config.py b/backend/rag_2/config.py index 907fe90..c497c0d 100644 --- a/backend/rag_2/config.py +++ b/backend/rag_2/config.py @@ -45,6 +45,8 @@ class IngestConfig: clear_database: bool + partition_pdf_func: HydraObject + chunking_enable: bool chunking_func: HydraObject @@ -71,12 +73,35 @@ def validate_fields(cls, values: dict) -> dict: Returns: dict: Validated field values. """ + partition_pdf_func = values["partition_pdf_func"] table_format = values["table_format"] summarize_text = values["summarize_text"] summarize_table = values["summarize_table"] vectorstore_source = values["vectorstore_source"] docstore_source = values["docstore_source"] + # Check that the table structure is to be inferred when the table format is set + # to "html" + if table_format == "html" and ( + "infer_table_structure" not in partition_pdf_func + or not partition_pdf_func["infer_table_structure"] + ): + raise ValueError( + "partition_pdf_func.infer_table_structure must be True when" + " table_format is 'html'" + ) + + # Check that tables are to be extracted as images when the table format is set + # to "image" + if table_format == "image" and ( + "extract_image_block_types" not in partition_pdf_func + or "table" not in partition_pdf_func["extract_image_block_types"] + ): + raise ValueError( + "partition_pdf_func.extract_image_block_types must contain 'table'" + " when table_format is 'image'" + ) + # Check that summary is enabled when the source is set to "summary" if vectorstore_source["text"] == "summary" and not summarize_text: raise ValueError( diff --git a/backend/rag_2/config.yaml b/backend/rag_2/config.yaml index 0482eb5..65d0d48 100644 --- a/backend/rag_2/config.yaml +++ b/backend/rag_2/config.yaml @@ -53,6 +53,16 @@ retriever: ingest: clear_database: True + partition_pdf_func: + _target_: unstructured.partition.pdf.partition_pdf + _partial_: True + _convert_: all + strategy: "hi_res" + infer_table_structure: True + hi_res_model_name: "yolox" + extract_image_block_types: ["image"] + extract_image_block_to_payload: True + chunking_enable: True chunking_func: _target_: unstructured.chunking.title.chunk_by_title diff --git a/backend/rag_2/ingest.py b/backend/rag_2/ingest.py index 90dca1b..bda0397 100644 --- a/backend/rag_2/ingest.py +++ b/backend/rag_2/ingest.py @@ -8,7 +8,6 @@ import hydra from omegaconf.dictconfig import DictConfig from tqdm.auto import tqdm -from unstructured.partition.pdf import partition_pdf from backend.rag_2 import prompts from backend.rag_2.config import validate_config @@ -21,6 +20,7 @@ from backend.rag_components.retriever import get_retriever from backend.rag_components.unstructured import ( load_chunking_func, + load_partition_pdf_func, select_images, select_tables, select_texts, @@ -39,12 +39,8 @@ async def ingest_pdf(file_path: str | Path, config: DictConfig) -> None: logger.info(f"Processing {file_path}") # Get elements - raw_pdf_elements = partition_pdf( - filename=file_path, - infer_table_structure=True, - extract_image_block_types=["image", "table"], - extract_image_block_to_payload=True, - ) + partition_pdf = load_partition_pdf_func(config) + raw_pdf_elements = partition_pdf(filename=file_path) # Get images images = select_images( diff --git a/backend/rag_2/notebook.ipynb b/backend/rag_2/notebook.ipynb index d227b42..8b193b6 100644 --- a/backend/rag_2/notebook.ipynb +++ b/backend/rag_2/notebook.ipynb @@ -70,7 +70,6 @@ "from pathlib import Path\n", "\n", "from hydra import compose, initialize\n", - "from unstructured.partition.pdf import partition_pdf\n", "\n", "from backend.rag_2 import prompts\n", "from backend.rag_2.chain import get_chain\n", @@ -85,6 +84,7 @@ "from backend.rag_components.retriever import get_retriever\n", "from backend.rag_components.unstructured import (\n", " load_chunking_func,\n", + " load_partition_pdf_func,\n", " select_images,\n", " select_tables,\n", " select_texts,\n", @@ -166,12 +166,8 @@ "t_partition = time.time()\n", "\n", "# Get elements\n", - "raw_pdf_elements = partition_pdf(\n", - " filename=file_path,\n", - " infer_table_structure=True,\n", - " extract_image_block_types=[\"image\", \"table\"],\n", - " extract_image_block_to_payload=True,\n", - ")\n", + "partition_pdf = load_partition_pdf_func(config)\n", + "raw_pdf_elements = partition_pdf(filename=file_path)\n", "\n", "print(f\"Partition time: {format_time_delta(time.time() - t_partition)}\")" ] diff --git a/backend/rag_3/config.py b/backend/rag_3/config.py index 20d7e42..9189d07 100644 --- a/backend/rag_3/config.py +++ b/backend/rag_3/config.py @@ -44,6 +44,9 @@ class IngestConfig: """Configuration for PDF ingestion.""" clear_database: bool + + partition_pdf_func: HydraObject + chunking_enable: bool chunking_func: HydraObject @@ -70,12 +73,35 @@ def validate_fields(cls, values: dict) -> dict: Returns: dict: Validated field values. """ + partition_pdf_func = values["partition_pdf_func"] table_format = values["table_format"] summarize_text = values["summarize_text"] summarize_table = values["summarize_table"] vectorstore_source = values["vectorstore_source"] docstore_source = values["docstore_source"] + # Check that the table structure is to be inferred when the table format is set + # to "html" + if table_format == "html" and ( + "infer_table_structure" not in partition_pdf_func + or not partition_pdf_func["infer_table_structure"] + ): + raise ValueError( + "partition_pdf_func.infer_table_structure must be True when" + " table_format is 'html'" + ) + + # Check that tables are to be extracted as images when the table format is set + # to "image" + if table_format == "image" and ( + "extract_image_block_types" not in partition_pdf_func + or "table" not in partition_pdf_func["extract_image_block_types"] + ): + raise ValueError( + "partition_pdf_func.extract_image_block_types must contain 'table'" + " when table_format is 'image'" + ) + # Check that summary is enabled when the source is set to "summary" if vectorstore_source["text"] == "summary" and not summarize_text: raise ValueError( diff --git a/backend/rag_3/config.yaml b/backend/rag_3/config.yaml index 9605d7d..d74f349 100644 --- a/backend/rag_3/config.yaml +++ b/backend/rag_3/config.yaml @@ -55,6 +55,16 @@ retriever: ingest: clear_database: True + partition_pdf_func: + _target_: unstructured.partition.pdf.partition_pdf + _partial_: True + _convert_: all + strategy: "hi_res" + infer_table_structure: False + hi_res_model_name: "yolox" + extract_image_block_types: ["image", "table"] + extract_image_block_to_payload: True + chunking_enable: True chunking_func: _target_: unstructured.chunking.title.chunk_by_title diff --git a/backend/rag_3/ingest.py b/backend/rag_3/ingest.py index 0fad870..38e484f 100644 --- a/backend/rag_3/ingest.py +++ b/backend/rag_3/ingest.py @@ -8,7 +8,6 @@ import hydra from omegaconf.dictconfig import DictConfig from tqdm.auto import tqdm -from unstructured.partition.pdf import partition_pdf from backend.rag_3 import prompts from backend.rag_3.config import validate_config @@ -21,6 +20,7 @@ from backend.rag_components.retriever import get_retriever from backend.rag_components.unstructured import ( load_chunking_func, + load_partition_pdf_func, select_images, select_tables, select_texts, @@ -39,12 +39,8 @@ async def ingest_pdf(file_path: str | Path, config: DictConfig) -> None: logger.info(f"Processing {file_path}") # Get elements - raw_pdf_elements = partition_pdf( - filename=file_path, - infer_table_structure=True, - extract_image_block_types=["image", "table"], - extract_image_block_to_payload=True, - ) + partition_pdf = load_partition_pdf_func(config) + raw_pdf_elements = partition_pdf(filename=file_path) # Get images images = select_images( diff --git a/backend/rag_3/notebook.ipynb b/backend/rag_3/notebook.ipynb index 64ff469..2ed163c 100644 --- a/backend/rag_3/notebook.ipynb +++ b/backend/rag_3/notebook.ipynb @@ -70,7 +70,6 @@ "from pathlib import Path\n", "\n", "from hydra import compose, initialize\n", - "from unstructured.partition.pdf import partition_pdf\n", "\n", "from backend.rag_3 import prompts\n", "from backend.rag_3.chain import get_chain\n", @@ -85,6 +84,7 @@ "from backend.rag_components.retriever import get_retriever\n", "from backend.rag_components.unstructured import (\n", " load_chunking_func,\n", + " load_partition_pdf_func,\n", " select_images,\n", " select_tables,\n", " select_texts,\n", @@ -166,12 +166,8 @@ "t_partition = time.time()\n", "\n", "# Get elements\n", - "raw_pdf_elements = partition_pdf(\n", - " filename=file_path,\n", - " infer_table_structure=True,\n", - " extract_image_block_types=[\"image\", \"table\"],\n", - " extract_image_block_to_payload=True,\n", - ")\n", + "partition_pdf = load_partition_pdf_func(config)\n", + "raw_pdf_elements = partition_pdf(filename=file_path)\n", "\n", "print(f\"Partition time: {format_time_delta(time.time() - t_partition)}\")" ] diff --git a/backend/rag_components/unstructured.py b/backend/rag_components/unstructured.py index 3a61802..a54c5ed 100644 --- a/backend/rag_components/unstructured.py +++ b/backend/rag_components/unstructured.py @@ -160,6 +160,24 @@ def select_tables( return tables +def load_partition_pdf_func(config: DictConfig) -> Callable: + """Load the partition_pdf function from the configuration. + + Args: + config (DictConfig): Configuration object. + + Raises: + ValueError: If the partition_pdf function is not callable. + + Returns: + Callable: The partition_pdf function. + """ + func = instantiate(config.ingest.partition_pdf_func) + if not callable(func): + raise ValueError("partition_pdf function must be callable") + return func + + def load_chunking_func(config: DictConfig) -> Callable: """Load the chunking function from the configuration.