Skip to content

Commit

Permalink
✨ add validation for infer_table_structure
Browse files Browse the repository at this point in the history
  • Loading branch information
baptiste-pasquier committed Mar 26, 2024
1 parent 16f726f commit 98f2138
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 1 deletion.
39 changes: 38 additions & 1 deletion backend/rag_1/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from omegaconf import OmegaConf
from omegaconf.dictconfig import DictConfig
from pydantic import BaseModel, ConfigDict, validator
from pydantic import BaseModel, ConfigDict, root_validator, validator
from pydantic.dataclasses import dataclass


Expand Down Expand Up @@ -48,6 +48,43 @@ class IngestConfig:

export_extracted: bool

@root_validator(pre=True)
def validate_fields(cls, values: dict) -> dict:
    """Check that the table format is consistent with the PDF partition options.

    This validator is registered with ``pre=True``, so it receives the input
    values before per-field validation. Keys may therefore be absent; lookups
    are tolerant so that a missing key never surfaces as a bare ``KeyError``.

    Args:
        values (dict): Field values.

    Returns:
        dict: Validated field values (returned unchanged).

    Raises:
        ValueError: If ``table_format`` is ``"html"`` and
            ``partition_pdf_func.infer_table_structure`` is missing or falsy,
            or if ``table_format`` is ``"image"`` and
            ``partition_pdf_func.extract_image_block_types`` is missing or
            does not contain ``"table"``.
    """
    # A missing table_format yields None, which matches neither branch below.
    table_format = values.get("table_format")
    # Treat a missing or None partition_pdf_func as "no options set".
    partition_pdf_func = values.get("partition_pdf_func") or {}

    # Check that the table structure is to be inferred when the table format is set
    # to "html"
    if table_format == "html" and not partition_pdf_func.get(
        "infer_table_structure"
    ):
        raise ValueError(
            "partition_pdf_func.infer_table_structure must be True when"
            " table_format is 'html'"
        )

    # Check that tables are to be extracted as images when the table format is set
    # to "image"
    if table_format == "image":
        # Fall back to an empty tuple so an absent or None entry fails the
        # membership test instead of raising TypeError.
        block_types = partition_pdf_func.get("extract_image_block_types") or ()
        if "table" not in block_types:
            raise ValueError(
                "partition_pdf_func.extract_image_block_types must contain 'table'"
                " when table_format is 'image'"
            )

    return values

@validator("image_min_size", "table_min_size")
def validate_size(cls, value: list[float]) -> list[float]:
"""Check that the value is between 0 and 1."""
Expand Down
23 changes: 23 additions & 0 deletions backend/rag_2/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,35 @@ def validate_fields(cls, values: dict) -> dict:
Returns:
dict: Validated field values.
"""
partition_pdf_func = values["partition_pdf_func"]
table_format = values["table_format"]
summarize_text = values["summarize_text"]
summarize_table = values["summarize_table"]
vectorstore_source = values["vectorstore_source"]
docstore_source = values["docstore_source"]

# Check that the table structure is to be inferred when the table format is set
# to "html"
if table_format == "html" and (
"infer_table_structure" not in partition_pdf_func
or not partition_pdf_func["infer_table_structure"]
):
raise ValueError(
"partition_pdf_func.infer_table_structure must be True when"
" table_format is 'html'"
)

# Check that tables are to be extracted as images when the table format is set
# to "image"
if table_format == "image" and (
"extract_image_block_types" not in partition_pdf_func
or "table" not in partition_pdf_func["extract_image_block_types"]
):
raise ValueError(
"partition_pdf_func.extract_image_block_types must contain 'table'"
" when table_format is 'image'"
)

# Check that summary is enabled when the source is set to "summary"
if vectorstore_source["text"] == "summary" and not summarize_text:
raise ValueError(
Expand Down
23 changes: 23 additions & 0 deletions backend/rag_3/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,12 +73,35 @@ def validate_fields(cls, values: dict) -> dict:
Returns:
dict: Validated field values.
"""
partition_pdf_func = values["partition_pdf_func"]
table_format = values["table_format"]
summarize_text = values["summarize_text"]
summarize_table = values["summarize_table"]
vectorstore_source = values["vectorstore_source"]
docstore_source = values["docstore_source"]

# Check that the table structure is to be inferred when the table format is set
# to "html"
if table_format == "html" and (
"infer_table_structure" not in partition_pdf_func
or not partition_pdf_func["infer_table_structure"]
):
raise ValueError(
"partition_pdf_func.infer_table_structure must be True when"
" table_format is 'html'"
)

# Check that tables are to be extracted as images when the table format is set
# to "image"
if table_format == "image" and (
"extract_image_block_types" not in partition_pdf_func
or "table" not in partition_pdf_func["extract_image_block_types"]
):
raise ValueError(
"partition_pdf_func.extract_image_block_types must contain 'table'"
" when table_format is 'image'"
)

# Check that summary is enabled when the source is set to "summary"
if vectorstore_source["text"] == "summary" and not summarize_text:
raise ValueError(
Expand Down

0 comments on commit 98f2138

Please sign in to comment.