Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add config for partition pdf #15

Merged
merged 3 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ For all options, we can choose to treat tables as text or images.
**Common parameters**:

- `ingest.clear_database` : Whether to clear the database before ingesting new data.
- `ingest.partition_pdf_func` : Parameters for Unstructured `partition_pdf` function.
- `ingest.chunking_func` : Parameters for Unstructured chunking function.
- `ingest.metadata_keys` : Unstructured metadata to use.
- `ingest.table_format` : How to extract tables with Unstructured (`text`, `html` or `image`).
- `ingest.image_min_size` : Minimum relative size for images to be considered.
Expand Down
41 changes: 40 additions & 1 deletion backend/rag_1/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from omegaconf import OmegaConf
from omegaconf.dictconfig import DictConfig
from pydantic import BaseModel, ConfigDict, validator
from pydantic import BaseModel, ConfigDict, root_validator, validator
from pydantic.dataclasses import dataclass


Expand Down Expand Up @@ -36,6 +36,8 @@ class IngestConfig:

clear_database: bool

partition_pdf_func: HydraObject

chunking_enable: bool
chunking_func: HydraObject

Expand All @@ -46,6 +48,43 @@ class IngestConfig:

export_extracted: bool

@root_validator(pre=True)
def validate_fields(cls, values: dict) -> dict:
    """Cross-check the table format against the PDF partitioning options.

    Args:
        values (dict): Field values as handed to this pre root validator.

    Returns:
        dict: The same field values, returned unchanged once all checks pass.

    Raises:
        ValueError: If ``table_format`` is "html" while
            ``partition_pdf_func.infer_table_structure`` is missing or falsy, or
            if ``table_format`` is "image" while "table" is not listed in
            ``partition_pdf_func.extract_image_block_types``.
    """
    pdf_options = values["partition_pdf_func"]
    fmt = values["table_format"]

    if fmt == "html":
        # The "html" table format requires table structure inference to be
        # explicitly enabled in the partitioning options.
        infers_structure = (
            "infer_table_structure" in pdf_options
            and pdf_options["infer_table_structure"]
        )
        if not infers_structure:
            raise ValueError(
                "partition_pdf_func.infer_table_structure must be True when"
                " table_format is 'html'"
            )

    if fmt == "image":
        # The "image" table format requires tables to be among the block types
        # that are extracted as images.
        extracts_tables = (
            "extract_image_block_types" in pdf_options
            and "table" in pdf_options["extract_image_block_types"]
        )
        if not extracts_tables:
            raise ValueError(
                "partition_pdf_func.extract_image_block_types must contain 'table'"
                " when table_format is 'image'"
            )

    return values

@validator("image_min_size", "table_min_size")
def validate_size(cls, value: list[float]) -> list[float]:
"""Check that the value is between 0 and 1."""
Expand Down
10 changes: 10 additions & 0 deletions backend/rag_1/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,16 @@ retriever:
ingest:
clear_database: True

partition_pdf_func:
_target_: unstructured.partition.pdf.partition_pdf
_partial_: True
_convert_: all
strategy: "hi_res"
infer_table_structure: True
hi_res_model_name: "yolox"
extract_image_block_types: ["image"]
extract_image_block_to_payload: True

chunking_enable: True
chunking_func:
_target_: unstructured.chunking.title.chunk_by_title
Expand Down
10 changes: 3 additions & 7 deletions backend/rag_1/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@
import hydra
from omegaconf.dictconfig import DictConfig
from tqdm.auto import tqdm
from unstructured.partition.pdf import partition_pdf

from backend.rag_1.config import validate_config
from backend.rag_components.unstructured import (
load_chunking_func,
load_partition_pdf_func,
select_images,
select_tables,
select_texts,
Expand All @@ -31,12 +31,8 @@ def ingest_pdf(file_path: str | Path, config: DictConfig) -> None:
logger.info(f"Processing {file_path}")

# Get elements
raw_pdf_elements = partition_pdf(
filename=file_path,
infer_table_structure=True,
extract_image_block_types=["image", "table"],
extract_image_block_to_payload=True,
)
partition_pdf = load_partition_pdf_func(config)
raw_pdf_elements = partition_pdf(filename=file_path)

# Get images
images = select_images(
Expand Down
10 changes: 3 additions & 7 deletions backend/rag_1/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,14 @@
"from pathlib import Path\n",
"\n",
"from hydra import compose, initialize\n",
"from unstructured.partition.pdf import partition_pdf\n",
"\n",
"from backend.rag_1.chain import get_chain\n",
"from backend.rag_1.config import validate_config\n",
"from backend.rag_components.elements import convert_documents_to_elements\n",
"from backend.rag_components.retriever import get_retriever\n",
"from backend.rag_components.unstructured import (\n",
" load_chunking_func,\n",
" load_partition_pdf_func,\n",
" select_images,\n",
" select_tables,\n",
" select_texts,\n",
Expand Down Expand Up @@ -159,12 +159,8 @@
"t_partition = time.time()\n",
"\n",
"# Get elements\n",
"raw_pdf_elements = partition_pdf(\n",
" filename=file_path,\n",
" infer_table_structure=True,\n",
" extract_image_block_types=[\"image\", \"table\"],\n",
" extract_image_block_to_payload=True,\n",
")\n",
"partition_pdf = load_partition_pdf_func(config)\n",
"raw_pdf_elements = partition_pdf(filename=file_path)\n",
"\n",
"print(f\"Partition time: {format_time_delta(time.time() - t_partition)}\")"
]
Expand Down
25 changes: 25 additions & 0 deletions backend/rag_2/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ class IngestConfig:

clear_database: bool

partition_pdf_func: HydraObject

chunking_enable: bool
chunking_func: HydraObject

Expand All @@ -71,12 +73,35 @@ def validate_fields(cls, values: dict) -> dict:
Returns:
dict: Validated field values.
"""
partition_pdf_func = values["partition_pdf_func"]
table_format = values["table_format"]
summarize_text = values["summarize_text"]
summarize_table = values["summarize_table"]
vectorstore_source = values["vectorstore_source"]
docstore_source = values["docstore_source"]

# Check that the table structure is to be inferred when the table format is set
# to "html"
if table_format == "html" and (
"infer_table_structure" not in partition_pdf_func
or not partition_pdf_func["infer_table_structure"]
):
raise ValueError(
"partition_pdf_func.infer_table_structure must be True when"
" table_format is 'html'"
)

# Check that tables are to be extracted as images when the table format is set
# to "image"
if table_format == "image" and (
"extract_image_block_types" not in partition_pdf_func
or "table" not in partition_pdf_func["extract_image_block_types"]
):
raise ValueError(
"partition_pdf_func.extract_image_block_types must contain 'table'"
" when table_format is 'image'"
)

# Check that summary is enabled when the source is set to "summary"
if vectorstore_source["text"] == "summary" and not summarize_text:
raise ValueError(
Expand Down
10 changes: 10 additions & 0 deletions backend/rag_2/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,16 @@ retriever:
ingest:
clear_database: True

partition_pdf_func:
_target_: unstructured.partition.pdf.partition_pdf
_partial_: True
_convert_: all
strategy: "hi_res"
infer_table_structure: True
hi_res_model_name: "yolox"
extract_image_block_types: ["image"]
extract_image_block_to_payload: True

chunking_enable: True
chunking_func:
_target_: unstructured.chunking.title.chunk_by_title
Expand Down
10 changes: 3 additions & 7 deletions backend/rag_2/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import hydra
from omegaconf.dictconfig import DictConfig
from tqdm.auto import tqdm
from unstructured.partition.pdf import partition_pdf

from backend.rag_2 import prompts
from backend.rag_2.config import validate_config
Expand All @@ -21,6 +20,7 @@
from backend.rag_components.retriever import get_retriever
from backend.rag_components.unstructured import (
load_chunking_func,
load_partition_pdf_func,
select_images,
select_tables,
select_texts,
Expand All @@ -39,12 +39,8 @@ async def ingest_pdf(file_path: str | Path, config: DictConfig) -> None:
logger.info(f"Processing {file_path}")

# Get elements
raw_pdf_elements = partition_pdf(
filename=file_path,
infer_table_structure=True,
extract_image_block_types=["image", "table"],
extract_image_block_to_payload=True,
)
partition_pdf = load_partition_pdf_func(config)
raw_pdf_elements = partition_pdf(filename=file_path)

# Get images
images = select_images(
Expand Down
10 changes: 3 additions & 7 deletions backend/rag_2/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@
"from pathlib import Path\n",
"\n",
"from hydra import compose, initialize\n",
"from unstructured.partition.pdf import partition_pdf\n",
"\n",
"from backend.rag_2 import prompts\n",
"from backend.rag_2.chain import get_chain\n",
Expand All @@ -85,6 +84,7 @@
"from backend.rag_components.retriever import get_retriever\n",
"from backend.rag_components.unstructured import (\n",
" load_chunking_func,\n",
" load_partition_pdf_func,\n",
" select_images,\n",
" select_tables,\n",
" select_texts,\n",
Expand Down Expand Up @@ -166,12 +166,8 @@
"t_partition = time.time()\n",
"\n",
"# Get elements\n",
"raw_pdf_elements = partition_pdf(\n",
" filename=file_path,\n",
" infer_table_structure=True,\n",
" extract_image_block_types=[\"image\", \"table\"],\n",
" extract_image_block_to_payload=True,\n",
")\n",
"partition_pdf = load_partition_pdf_func(config)\n",
"raw_pdf_elements = partition_pdf(filename=file_path)\n",
"\n",
"print(f\"Partition time: {format_time_delta(time.time() - t_partition)}\")"
]
Expand Down
26 changes: 26 additions & 0 deletions backend/rag_3/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@ class IngestConfig:
"""Configuration for PDF ingestion."""

clear_database: bool

partition_pdf_func: HydraObject

chunking_enable: bool
chunking_func: HydraObject

Expand All @@ -70,12 +73,35 @@ def validate_fields(cls, values: dict) -> dict:
Returns:
dict: Validated field values.
"""
partition_pdf_func = values["partition_pdf_func"]
table_format = values["table_format"]
summarize_text = values["summarize_text"]
summarize_table = values["summarize_table"]
vectorstore_source = values["vectorstore_source"]
docstore_source = values["docstore_source"]

# Check that the table structure is to be inferred when the table format is set
# to "html"
if table_format == "html" and (
"infer_table_structure" not in partition_pdf_func
or not partition_pdf_func["infer_table_structure"]
):
raise ValueError(
"partition_pdf_func.infer_table_structure must be True when"
" table_format is 'html'"
)

# Check that tables are to be extracted as images when the table format is set
# to "image"
if table_format == "image" and (
"extract_image_block_types" not in partition_pdf_func
or "table" not in partition_pdf_func["extract_image_block_types"]
):
raise ValueError(
"partition_pdf_func.extract_image_block_types must contain 'table'"
" when table_format is 'image'"
)

# Check that summary is enabled when the source is set to "summary"
if vectorstore_source["text"] == "summary" and not summarize_text:
raise ValueError(
Expand Down
10 changes: 10 additions & 0 deletions backend/rag_3/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,16 @@ retriever:
ingest:
clear_database: True

partition_pdf_func:
_target_: unstructured.partition.pdf.partition_pdf
_partial_: True
_convert_: all
strategy: "hi_res"
infer_table_structure: False
hi_res_model_name: "yolox"
extract_image_block_types: ["image", "table"]
extract_image_block_to_payload: True

chunking_enable: True
chunking_func:
_target_: unstructured.chunking.title.chunk_by_title
Expand Down
10 changes: 3 additions & 7 deletions backend/rag_3/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import hydra
from omegaconf.dictconfig import DictConfig
from tqdm.auto import tqdm
from unstructured.partition.pdf import partition_pdf

from backend.rag_3 import prompts
from backend.rag_3.config import validate_config
Expand All @@ -21,6 +20,7 @@
from backend.rag_components.retriever import get_retriever
from backend.rag_components.unstructured import (
load_chunking_func,
load_partition_pdf_func,
select_images,
select_tables,
select_texts,
Expand All @@ -39,12 +39,8 @@ async def ingest_pdf(file_path: str | Path, config: DictConfig) -> None:
logger.info(f"Processing {file_path}")

# Get elements
raw_pdf_elements = partition_pdf(
filename=file_path,
infer_table_structure=True,
extract_image_block_types=["image", "table"],
extract_image_block_to_payload=True,
)
partition_pdf = load_partition_pdf_func(config)
raw_pdf_elements = partition_pdf(filename=file_path)

# Get images
images = select_images(
Expand Down
10 changes: 3 additions & 7 deletions backend/rag_3/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@
"from pathlib import Path\n",
"\n",
"from hydra import compose, initialize\n",
"from unstructured.partition.pdf import partition_pdf\n",
"\n",
"from backend.rag_3 import prompts\n",
"from backend.rag_3.chain import get_chain\n",
Expand All @@ -85,6 +84,7 @@
"from backend.rag_components.retriever import get_retriever\n",
"from backend.rag_components.unstructured import (\n",
" load_chunking_func,\n",
" load_partition_pdf_func,\n",
" select_images,\n",
" select_tables,\n",
" select_texts,\n",
Expand Down Expand Up @@ -166,12 +166,8 @@
"t_partition = time.time()\n",
"\n",
"# Get elements\n",
"raw_pdf_elements = partition_pdf(\n",
" filename=file_path,\n",
" infer_table_structure=True,\n",
" extract_image_block_types=[\"image\", \"table\"],\n",
" extract_image_block_to_payload=True,\n",
")\n",
"partition_pdf = load_partition_pdf_func(config)\n",
"raw_pdf_elements = partition_pdf(filename=file_path)\n",
"\n",
"print(f\"Partition time: {format_time_delta(time.time() - t_partition)}\")"
]
Expand Down
Loading
Loading