Skip to content

Commit

Permalink
misc
Browse files Browse the repository at this point in the history
  • Loading branch information
Filimoa committed Sep 17, 2024
1 parent 00f02a2 commit 5555589
Show file tree
Hide file tree
Showing 7 changed files with 192 additions and 78 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -37,4 +37,5 @@ notebooks/
sample-docs/
weights/
.env
uv.lock

97 changes: 59 additions & 38 deletions src/openparse/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,9 @@
from collections import defaultdict, namedtuple
from enum import Enum
from functools import cached_property
from io import BytesIO
from typing import Any, List, Literal, Optional, Set, Tuple, Union

from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
from pydantic_core import core_schema

from openparse import consts
from openparse.utils import num_tokens
Expand Down Expand Up @@ -86,9 +84,9 @@ def is_heading(self) -> bool:
return self.size >= MIN_HEADING_SIZE and self.is_bold

def formatted_text(
self,
previous_span: Optional["TextSpan"] = None,
next_span: Optional["TextSpan"] = None,
self,
previous_span: Optional["TextSpan"] = None,
next_span: Optional["TextSpan"] = None,
) -> str:
"""Format text considering adjacent spans to avoid redundant markdown symbols."""
formatted = self.text
Expand Down Expand Up @@ -190,19 +188,19 @@ def _clean_markdown_formatting(self, text: str) -> str:

def overlaps(self, other: "LineElement", error_margin: float = 0.0) -> bool:
x_overlap = not (
self.bbox[0] - error_margin > other.bbox[2] + error_margin
or other.bbox[0] - error_margin > self.bbox[2] + error_margin
self.bbox[0] - error_margin > other.bbox[2] + error_margin
or other.bbox[0] - error_margin > self.bbox[2] + error_margin
)

y_overlap = not (
self.bbox[1] - error_margin > other.bbox[3] + error_margin
or other.bbox[1] - error_margin > self.bbox[3] + error_margin
self.bbox[1] - error_margin > other.bbox[3] + error_margin
or other.bbox[1] - error_margin > self.bbox[3] + error_margin
)

return x_overlap and y_overlap

def is_at_similar_height(
self, other: "LineElement", error_margin: float = 0.0
self, other: "LineElement", error_margin: float = 0.0
) -> bool:
y_distance = abs(self.bbox[1] - other.bbox[1])

Expand Down Expand Up @@ -261,27 +259,29 @@ def area(self) -> float:
return (self.bbox.x1 - self.bbox.x0) * (self.bbox.y1 - self.bbox.y0)

def is_at_similar_height(
self, other: Union["TableElement", "TextElement", "ImageElement"], error_margin: float = 1
self,
other: Union["TableElement", "TextElement", "ImageElement"],
error_margin: float = 1,
) -> bool:
y_distance = abs(self.bbox.y1 - other.bbox.y1)

return y_distance <= error_margin

def overlaps(
self,
other: "TextElement",
x_error_margin: float = 0.0,
y_error_margin: float = 0.0,
self,
other: "TextElement",
x_error_margin: float = 0.0,
y_error_margin: float = 0.0,
) -> bool:
if self.page != other.page:
return False
x_overlap = not (
self.bbox.x0 - x_error_margin > other.bbox.x1 + x_error_margin
or other.bbox.x0 - x_error_margin > self.bbox.x1 + x_error_margin
self.bbox.x0 - x_error_margin > other.bbox.x1 + x_error_margin
or other.bbox.x0 - x_error_margin > self.bbox.x1 + x_error_margin
)
y_overlap = not (
self.bbox.y0 - y_error_margin > other.bbox.y1 + y_error_margin
or other.bbox.y0 - y_error_margin > self.bbox.y1 + y_error_margin
self.bbox.y0 - y_error_margin > other.bbox.y1 + y_error_margin
or other.bbox.y0 - y_error_margin > self.bbox.y1 + y_error_margin
)

return x_overlap and y_overlap
Expand Down Expand Up @@ -321,7 +321,9 @@ def tokens(self) -> int:
return num_tokens(self.text)

def is_at_similar_height(
self, other: Union["TableElement", "TextElement", "ImageElement"], error_margin: float = 1
self,
other: Union["TableElement", "TextElement", "ImageElement"],
error_margin: float = 1,
) -> bool:
y_distance = abs(self.bbox.y1 - other.bbox.y1)

Expand All @@ -331,25 +333,37 @@ def is_at_similar_height(
######################
### IMAGE ELEMENTS ###
######################


class ImageElement(BaseModel):
text: str
ext: str
bbox: Bbox
image: BytesIO # type: ignore
block: Optional[dict] = None
image: str # base64 encoded image
image_mimetype: Union[
Literal[
"image/jpeg",
"image/png",
"image/bmp",
"image/jbig2",
"image/webp",
"unknown",
],
str,
]
ocr_context: Optional[dict] = None
_embed_text: Optional[str] = None
variant: Literal[NodeVariant.IMAGE] = NodeVariant.IMAGE

class Config:
arbitrary_types_allowed = True
model_config = ConfigDict(frozen=True)

@computed_field # type: ignore
@cached_property
def embed_text(self) -> str:
if self._embed_text:
return self._embed_text

return self.text

@cached_property
def area(self) -> float:
return (self.bbox.x1 - self.bbox.x0) * (self.bbox.y1 - self.bbox.y0)
Expand All @@ -363,7 +377,9 @@ def tokens(self) -> int:
return 512 # Placeholder for image tokenization

def is_at_similar_height(
self, other: Union["TableElement", "TextElement", "ImageElement"], error_margin: float = 1
self,
other: Union["TableElement", "TextElement", "ImageElement"],
error_margin: float = 1,
) -> bool:
y_distance = abs(self.bbox.y1 - other.bbox.y1)

Expand All @@ -376,10 +392,10 @@ def is_at_similar_height(


def _determine_relationship(
elem1: Union["TextElement", "TableElement"],
elem2: Union["TextElement", "TableElement"],
line_threshold: float = 1,
paragraph_threshold: float = 12,
elem1: Union["TextElement", "TableElement"],
elem2: Union["TextElement", "TableElement"],
line_threshold: float = 1,
paragraph_threshold: float = 12,
) -> Literal["same-line", "same-paragraph", None]:
"""
Determines the relationship between two elements (either TextElement or TableElement).
Expand Down Expand Up @@ -428,14 +444,19 @@ def node_id(self) -> str:

@computed_field # type: ignore
@cached_property
def variant(self) -> Set[Literal["text", "table"]]:
def variant(self) -> Set[Literal["text", "table", "image"]]:
return {e.variant.value for e in self.elements}

@computed_field # type: ignore
@cached_property
def tokens(self) -> int:
return sum([e.tokens for e in self.elements])

@computed_field # type: ignore
@cached_property
def images(self) -> List[ImageElement]:
return [e for e in self.elements if e.variant == NodeVariant.IMAGE]

@computed_field # type: ignore
@cached_property
def bbox(self) -> List[Bbox]:
Expand Down Expand Up @@ -568,7 +589,7 @@ def reading_order(self) -> ReadingOrder:
return ReadingOrder(min_page=min_page, y_position=y_position, min_x0=min_x0)

def overlaps(
self, other: "Node", x_error_margin: float = 0.0, y_error_margin: float = 0.0
self, other: "Node", x_error_margin: float = 0.0, y_error_margin: float = 0.0
) -> bool:
for bbox in self.bbox:
other_bboxes = [
Expand All @@ -577,13 +598,13 @@ def overlaps(

for other_bbox in other_bboxes:
x_overlap = not (
bbox.x0 - x_error_margin > other_bbox.x1 + x_error_margin
or other_bbox.x0 - x_error_margin > bbox.x1 + x_error_margin
bbox.x0 - x_error_margin > other_bbox.x1 + x_error_margin
or other_bbox.x0 - x_error_margin > bbox.x1 + x_error_margin
)

y_overlap = not (
bbox.y0 - y_error_margin > other_bbox.y1 + y_error_margin
or other_bbox.y0 - y_error_margin > bbox.y1 + y_error_margin
bbox.y0 - y_error_margin > other_bbox.y1 + y_error_margin
or other_bbox.y0 - y_error_margin > bbox.y1 + y_error_margin
)

if x_overlap and y_overlap:
Expand Down Expand Up @@ -612,7 +633,7 @@ def __lt__(self, other: "Node") -> bool:
return NotImplemented

assert (
self.coordinate_system == other.coordinate_system
self.coordinate_system == other.coordinate_system
), "Coordinate systems must match."

return self.reading_order < other.reading_order
Expand Down Expand Up @@ -705,7 +726,7 @@ def _nodes_to_llama_index(self, llama_index_doc):
for i in range(len(li_nodes) - 1):
li_nodes[i].relationships[NodeRelationship.NEXT] = li_nodes[
i + 1
].as_related_node_info()
].as_related_node_info()

li_nodes[i + 1].relationships[NodeRelationship.PREVIOUS] = li_nodes[
i
Expand Down
79 changes: 50 additions & 29 deletions src/openparse/text/pdfminer/core.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
import uuid
import base64
from io import BytesIO
from typing import Any, Iterable, List, Tuple, Union

from pdfminer.layout import LTAnno, LTChar, LTTextContainer, LTTextLine, LTImage, LTFigure, LTLine, LTRect
from pdfminer.layout import (
LTAnno,
LTChar,
LTFigure,
LTImage,
LTTextContainer,
LTTextLine,
)
from pydantic import BaseModel, model_validator

from openparse.pdf import Pdf
from openparse.schemas import Bbox, LineElement, TextElement, TextSpan, ImageElement
from openparse.schemas import Bbox, ImageElement, LineElement, TextElement, TextSpan


class CharElement(BaseModel):
Expand Down Expand Up @@ -56,6 +63,24 @@ def _extract_chars(text_line: LTTextLine) -> List[CharElement]:
return chars


def get_mime_type(pdf_object: "LTImage") -> Union[str, None]:
    """Guess the MIME type of an embedded PDF image from its stream attributes.

    The stream's ``Subtype`` entry must name ``Image``; the ``Filter`` entry
    is then mapped to a MIME type.

    Args:
        pdf_object: An object exposing ``stream.attrs`` as a mapping whose
            values carry a ``name`` attribute.

    Returns:
        The MIME type string for a recognised image filter, or ``None`` when
        the object is not an image, has no filter, or uses an unknown filter.
    """
    attrs = pdf_object.stream.attrs

    def _literal_name(key: str) -> Union[str, None]:
        # A missing key must yield None rather than raise: the previous
        # dict fallback had no ``.name`` attribute and crashed on lookup.
        value = attrs.get(key)
        if isinstance(value, (list, tuple)):
            # NOTE(review): a filter chain is assumed to end with the codec
            # that defines the image format -- confirm against the PDF spec.
            value = value[-1] if value else None
        return getattr(value, "name", None)

    if _literal_name("Subtype") != "Image":
        return None

    mime_by_filter = {
        "DCTDecode": "image/jpeg",
        "FlateDecode": "image/png",  # Most likely, but could also be TIFF
        "JPXDecode": "image/jp2",
        "CCITTFaxDecode": "image/tiff",
        "JBIG2Decode": "image/jbig2",
    }
    return mime_by_filter.get(_literal_name("Filter"))


def _group_chars_into_spans(chars: Iterable[CharElement]) -> List[TextSpan]:
spans = []
current_text = ""
Expand Down Expand Up @@ -117,8 +142,8 @@ def _get_bbox(lines: List[LineElement]) -> Tuple[float, float, float, float]:
return x0, y0, x1, y1


def ingest(pdf_input: Union[Pdf]) -> List[TextElement]:
"""Parse PDF and return a list of LineElement objects."""
def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]:
"""Parse PDF and return a list of TextElement and ImageElement objects."""
elements = []
page_layouts = pdf_input.extract_layout_pages()

Expand Down Expand Up @@ -151,30 +176,26 @@ def ingest(pdf_input: Union[Pdf]) -> List[TextElement]:
)
)
elif isinstance(element, LTFigure):
element = element._objs
if element is None:
continue
for e in element:
for e in element._objs:
if isinstance(e, LTImage):
elements.append(
ImageElement(
bbox=Bbox(
x0=e.bbox[0],
y0=e.bbox[1],
x1=e.bbox[2],
y1=e.bbox[3],
page=page_num,
page_width=e.width,
page_height=e.height,
),
image=BytesIO(e.stream.get_data()),
ext="png",
text='',
mime_type = get_mime_type(e)
if mime_type:
img_data = BytesIO(e.stream.get_data()).getvalue()
base64_string = base64.b64encode(img_data).decode("utf-8")
elements.append(
ImageElement(
bbox=Bbox(
x0=e.bbox[0],
y0=e.bbox[1],
x1=e.bbox[2],
y1=e.bbox[3],
page=page_num,
page_width=page_width,
page_height=page_height,
),
image=base64_string,
image_mimetype=mime_type or "unknown",
text="",
)
)
)
elif isinstance(element, LTLine):
pass
elif isinstance(element, LTRect):
pass
# This is a placeholder, actual method may vary
return elements
Loading

0 comments on commit 5555589

Please sign in to comment.