Skip to content

Commit

Permalink
Add image_min_size parameter
Browse files Browse the repository at this point in the history
  • Loading branch information
baptiste-pasquier committed Mar 21, 2024
1 parent 3bb8635 commit 4c0138d
Show file tree
Hide file tree
Showing 13 changed files with 193 additions and 59 deletions.
13 changes: 12 additions & 1 deletion backend/rag_1/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from omegaconf import OmegaConf
from omegaconf.dictconfig import DictConfig
from pydantic import BaseModel, ConfigDict
from pydantic import BaseModel, ConfigDict, validator
from pydantic.dataclasses import dataclass


Expand Down Expand Up @@ -41,9 +41,20 @@ class IngestConfig:

metadata_keys: list[str]
table_format: Literal["text", "html", "image"]
image_min_size: list[float]
table_min_size: list[float]

export_extracted: bool

@validator("image_min_size", "table_min_size")
def validate_size(cls, value: list[float]) -> list[float]:
    """Validate a two-element minimum-size setting.

    Args:
        value: The candidate setting; it must hold exactly two numbers,
            each within the closed interval [0, 1].

    Returns:
        The value unchanged when it passes both checks.

    Raises:
        ValueError: If the list does not have exactly two items, or if any
            item lies outside [0, 1] (NaN included).
    """
    if len(value) != 2:
        raise ValueError("Size must be a list of two floats.")
    # Test each item with a chained comparison rather than min()/max():
    # every ordering comparison against NaN is false, so the min/max form
    # would let a NaN entry through unnoticed.
    if not all(0 <= item <= 1 for item in value):
        raise ValueError("Size must be a list of floats between 0 and 1.")
    return value


@dataclass(config=ConfigDict(extra="forbid"))
class Config:
Expand Down
2 changes: 2 additions & 0 deletions backend/rag_1/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,5 +48,7 @@ ingest:
- filename
- page_number
table_format: "html" # "text" or "html" or "image"
image_min_size: [0.1, 0.1]
table_min_size: [0.0, 0.0]

export_extracted: True
13 changes: 10 additions & 3 deletions backend/rag_1/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,11 @@ def ingest_pdf(file_path: str | Path, config: DictConfig) -> None:
)

# Get images
images = select_images(raw_pdf_elements, config.ingest.metadata_keys)
images = select_images(
raw_pdf_elements,
metadata_keys=config.ingest.metadata_keys,
min_size=config.ingest.image_min_size,
)

# Get chunks
if config.ingest.chunking_enable:
Expand All @@ -49,9 +53,12 @@ def ingest_pdf(file_path: str | Path, config: DictConfig) -> None:
chunks = raw_pdf_elements

# Get text, tables
texts = select_texts(chunks, config.ingest.metadata_keys)
texts = select_texts(chunks, metadata_keys=config.ingest.metadata_keys)
tables = select_tables(
chunks, config.ingest.table_format, config.ingest.metadata_keys
chunks,
table_format=config.ingest.table_format,
metadata_keys=config.ingest.metadata_keys,
min_size=config.ingest.table_min_size,
)

vectorstore = get_vectorstore(config)
Expand Down
15 changes: 12 additions & 3 deletions backend/rag_1/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,11 @@
"outputs": [],
"source": [
"# Get images\n",
"images = select_images(raw_pdf_elements)\n",
"images = select_images(\n",
" raw_pdf_elements,\n",
" metadata_keys=config.ingest.metadata_keys,\n",
" min_size=config.ingest.image_min_size,\n",
")\n",
"for image in images[:N_DISPLAY]:\n",
" display(image)"
]
Expand Down Expand Up @@ -233,8 +237,13 @@
"outputs": [],
"source": [
"# Get text, tables\n",
"texts = select_texts(chunks, config.ingest.metadata_keys)\n",
"tables = select_tables(chunks, config.ingest.table_format, config.ingest.metadata_keys)"
"texts = select_texts(chunks, metadata_keys=config.ingest.metadata_keys)\n",
"tables = select_tables(\n",
" chunks,\n",
" table_format=config.ingest.table_format,\n",
" metadata_keys=config.ingest.metadata_keys,\n",
" min_size=config.ingest.table_min_size,\n",
")"
]
},
{
Expand Down
13 changes: 12 additions & 1 deletion backend/rag_2/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from omegaconf import OmegaConf
from omegaconf.dictconfig import DictConfig
from pydantic import BaseModel, ConfigDict, root_validator
from pydantic import BaseModel, ConfigDict, root_validator, validator
from pydantic.dataclasses import dataclass


Expand Down Expand Up @@ -50,6 +50,8 @@ class IngestConfig:

metadata_keys: list[str]
table_format: Literal["text", "html", "image"]
image_min_size: list[float]
table_min_size: list[float]

summarize_text: bool
summarize_table: bool
Expand Down Expand Up @@ -118,6 +120,15 @@ def validate_fields(cls, values: dict) -> dict:

return values

@validator("image_min_size", "table_min_size")
def validate_size(cls, value: list[float]) -> list[float]:
    """Validate a two-element minimum-size setting.

    Args:
        value: The candidate setting; it must hold exactly two numbers,
            each within the closed interval [0, 1].

    Returns:
        The value unchanged when it passes both checks.

    Raises:
        ValueError: If the list does not have exactly two items, or if any
            item lies outside [0, 1] (NaN included).
    """
    if len(value) != 2:
        raise ValueError("Size must be a list of two floats.")
    # Test each item with a chained comparison rather than min()/max():
    # every ordering comparison against NaN is false, so the min/max form
    # would let a NaN entry through unnoticed.
    if not all(0 <= item <= 1 for item in value):
        raise ValueError("Size must be a list of floats between 0 and 1.")
    return value


@dataclass(config=ConfigDict(extra="forbid"))
class Config:
Expand Down
2 changes: 2 additions & 0 deletions backend/rag_2/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ ingest:
- filename
- page_number
table_format: "html" # "text" or "html" or "image"
image_min_size: [0.1, 0.1]
table_min_size: [0.0, 0.0]

summarize_text: False
summarize_table: True
Expand Down
13 changes: 10 additions & 3 deletions backend/rag_2/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,11 @@ async def ingest_pdf(file_path: str | Path, config: DictConfig) -> None:
)

# Get images
images = select_images(raw_pdf_elements, config.ingest.metadata_keys)
images = select_images(
raw_pdf_elements,
metadata_keys=config.ingest.metadata_keys,
min_size=config.ingest.image_min_size,
)

# Get chunks
if config.ingest.chunking_enable:
Expand All @@ -160,9 +164,12 @@ async def ingest_pdf(file_path: str | Path, config: DictConfig) -> None:
chunks = raw_pdf_elements

# Get text, tables
texts = select_texts(chunks, config.ingest.metadata_keys)
texts = select_texts(chunks, metadata_keys=config.ingest.metadata_keys)
tables = select_tables(
chunks, config.ingest.table_format, config.ingest.metadata_keys
chunks,
table_format=config.ingest.table_format,
metadata_keys=config.ingest.metadata_keys,
min_size=config.ingest.table_min_size,
)

# Summarize text
Expand Down
15 changes: 12 additions & 3 deletions backend/rag_2/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,11 @@
"outputs": [],
"source": [
"# Get images\n",
"images = select_images(raw_pdf_elements)\n",
"images = select_images(\n",
" raw_pdf_elements,\n",
" metadata_keys=config.ingest.metadata_keys,\n",
" min_size=config.ingest.image_min_size,\n",
")\n",
"for image in images[:N_DISPLAY]:\n",
" display(image)"
]
Expand Down Expand Up @@ -239,8 +243,13 @@
"outputs": [],
"source": [
"# Get text, tables\n",
"texts = select_texts(chunks, config.ingest.metadata_keys)\n",
"tables = select_tables(chunks, config.ingest.table_format, config.ingest.metadata_keys)"
"texts = select_texts(chunks, metadata_keys=config.ingest.metadata_keys)\n",
"tables = select_tables(\n",
" chunks,\n",
" table_format=config.ingest.table_format,\n",
" metadata_keys=config.ingest.metadata_keys,\n",
" min_size=config.ingest.table_min_size,\n",
")"
]
},
{
Expand Down
14 changes: 12 additions & 2 deletions backend/rag_3/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from omegaconf import OmegaConf
from omegaconf.dictconfig import DictConfig
from pydantic import BaseModel, ConfigDict, root_validator
from pydantic import BaseModel, ConfigDict, root_validator, validator
from pydantic.dataclasses import dataclass


Expand Down Expand Up @@ -44,12 +44,13 @@ class IngestConfig:
"""Configuration for PDF ingestion."""

clear_database: bool

chunking_enable: bool
chunking_func: HydraObject

metadata_keys: list[str]
table_format: Literal["text", "html", "image"]
image_min_size: list[float]
table_min_size: list[float]

summarize_text: bool
summarize_table: bool
Expand Down Expand Up @@ -108,6 +109,15 @@ def validate_fields(cls, values: dict) -> dict:

return values

@validator("image_min_size", "table_min_size")
def validate_size(cls, value: list[float]) -> list[float]:
    """Validate a two-element minimum-size setting.

    Args:
        value: The candidate setting; it must hold exactly two numbers,
            each within the closed interval [0, 1].

    Returns:
        The value unchanged when it passes both checks.

    Raises:
        ValueError: If the list does not have exactly two items, or if any
            item lies outside [0, 1] (NaN included).
    """
    if len(value) != 2:
        raise ValueError("Size must be a list of two floats.")
    # Test each item with a chained comparison rather than min()/max():
    # every ordering comparison against NaN is false, so the min/max form
    # would let a NaN entry through unnoticed.
    if not all(0 <= item <= 1 for item in value):
        raise ValueError("Size must be a list of floats between 0 and 1.")
    return value


@dataclass(config=ConfigDict(extra="forbid"))
class Config:
Expand Down
2 changes: 2 additions & 0 deletions backend/rag_3/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ ingest:
- filename
- page_number
table_format: "image" # "text" or "html" or "image"
image_min_size: [0.1, 0.1]
table_min_size: [0.0, 0.0]

summarize_text: False
summarize_table: True
Expand Down
13 changes: 10 additions & 3 deletions backend/rag_3/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,11 @@ async def ingest_pdf(file_path: str | Path, config: DictConfig) -> None:
)

# Get images
images = select_images(raw_pdf_elements, config.ingest.metadata_keys)
images = select_images(
raw_pdf_elements,
metadata_keys=config.ingest.metadata_keys,
min_size=config.ingest.image_min_size,
)

# Get chunks
if config.ingest.chunking_enable:
Expand All @@ -160,9 +164,12 @@ async def ingest_pdf(file_path: str | Path, config: DictConfig) -> None:
chunks = raw_pdf_elements

# Get text, tables
texts = select_texts(chunks, config.ingest.metadata_keys)
texts = select_texts(chunks, metadata_keys=config.ingest.metadata_keys)
tables = select_tables(
chunks, config.ingest.table_format, config.ingest.metadata_keys
chunks,
table_format=config.ingest.table_format,
metadata_keys=config.ingest.metadata_keys,
min_size=config.ingest.table_min_size,
)

# Summarize text
Expand Down
18 changes: 15 additions & 3 deletions backend/rag_3/notebook.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,11 @@
"outputs": [],
"source": [
"# Get images\n",
"images = select_images(raw_pdf_elements)\n",
"images = select_images(\n",
" raw_pdf_elements,\n",
" metadata_keys=config.ingest.metadata_keys,\n",
" min_size=config.ingest.image_min_size,\n",
")\n",
"for image in images[:N_DISPLAY]:\n",
" display(image)"
]
Expand Down Expand Up @@ -239,8 +243,16 @@
"outputs": [],
"source": [
"# Get text, tables\n",
"texts = select_texts(chunks, config.ingest.metadata_keys)\n",
"tables = select_tables(chunks, config.ingest.table_format, config.ingest.metadata_keys)"
"texts = select_texts(\n",
" chunks,\n",
" metadata_keys=config.ingest.metadata_keys,\n",
")\n",
"tables = select_tables(\n",
" chunks,\n",
" table_format=config.ingest.table_format,\n",
" metadata_keys=config.ingest.metadata_keys,\n",
" min_size=config.ingest.table_min_size,\n",
")"
]
},
{
Expand Down
Loading

0 comments on commit 4c0138d

Please sign in to comment.