misc

Filimoa · Sep 17, 2024 · 5555589 · 5555589
1 parent 00f02a2
commit 5555589
Show file tree

Hide file tree

Showing 7 changed files with 192 additions and 78 deletions.
diff --git a/.gitignore b/.gitignore
@@ -37,4 +37,5 @@ notebooks/
 sample-docs/
 weights/
 .env
+uv.lock
 
diff --git a/src/openparse/schemas.py b/src/openparse/schemas.py
@@ -4,11 +4,9 @@
 from collections import defaultdict, namedtuple
 from enum import Enum
 from functools import cached_property
-from io import BytesIO
 from typing import Any, List, Literal, Optional, Set, Tuple, Union
 
 from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator
-from pydantic_core import core_schema
 
 from openparse import consts
 from openparse.utils import num_tokens
@@ -86,9 +84,9 @@ def is_heading(self) -> bool:
         return self.size >= MIN_HEADING_SIZE and self.is_bold
 
     def formatted_text(
-            self,
-            previous_span: Optional["TextSpan"] = None,
-            next_span: Optional["TextSpan"] = None,
+        self,
+        previous_span: Optional["TextSpan"] = None,
+        next_span: Optional["TextSpan"] = None,
     ) -> str:
         """Format text considering adjacent spans to avoid redundant markdown symbols."""
         formatted = self.text
@@ -190,19 +188,19 @@ def _clean_markdown_formatting(self, text: str) -> str:
 
     def overlaps(self, other: "LineElement", error_margin: float = 0.0) -> bool:
         x_overlap = not (
-                self.bbox[0] - error_margin > other.bbox[2] + error_margin
-                or other.bbox[0] - error_margin > self.bbox[2] + error_margin
+            self.bbox[0] - error_margin > other.bbox[2] + error_margin
+            or other.bbox[0] - error_margin > self.bbox[2] + error_margin
         )
 
         y_overlap = not (
-                self.bbox[1] - error_margin > other.bbox[3] + error_margin
-                or other.bbox[1] - error_margin > self.bbox[3] + error_margin
+            self.bbox[1] - error_margin > other.bbox[3] + error_margin
+            or other.bbox[1] - error_margin > self.bbox[3] + error_margin
         )
 
         return x_overlap and y_overlap
 
     def is_at_similar_height(
-            self, other: "LineElement", error_margin: float = 0.0
+        self, other: "LineElement", error_margin: float = 0.0
     ) -> bool:
         y_distance = abs(self.bbox[1] - other.bbox[1])
 
@@ -261,27 +259,29 @@ def area(self) -> float:
         return (self.bbox.x1 - self.bbox.x0) * (self.bbox.y1 - self.bbox.y0)
 
     def is_at_similar_height(
-            self, other: Union["TableElement", "TextElement", "ImageElement"], error_margin: float = 1
+        self,
+        other: Union["TableElement", "TextElement", "ImageElement"],
+        error_margin: float = 1,
     ) -> bool:
         y_distance = abs(self.bbox.y1 - other.bbox.y1)
 
         return y_distance <= error_margin
 
     def overlaps(
-            self,
-            other: "TextElement",
-            x_error_margin: float = 0.0,
-            y_error_margin: float = 0.0,
+        self,
+        other: "TextElement",
+        x_error_margin: float = 0.0,
+        y_error_margin: float = 0.0,
     ) -> bool:
         if self.page != other.page:
             return False
         x_overlap = not (
-                self.bbox.x0 - x_error_margin > other.bbox.x1 + x_error_margin
-                or other.bbox.x0 - x_error_margin > self.bbox.x1 + x_error_margin
+            self.bbox.x0 - x_error_margin > other.bbox.x1 + x_error_margin
+            or other.bbox.x0 - x_error_margin > self.bbox.x1 + x_error_margin
         )
         y_overlap = not (
-                self.bbox.y0 - y_error_margin > other.bbox.y1 + y_error_margin
-                or other.bbox.y0 - y_error_margin > self.bbox.y1 + y_error_margin
+            self.bbox.y0 - y_error_margin > other.bbox.y1 + y_error_margin
+            or other.bbox.y0 - y_error_margin > self.bbox.y1 + y_error_margin
         )
 
         return x_overlap and y_overlap
@@ -321,7 +321,9 @@ def tokens(self) -> int:
         return num_tokens(self.text)
 
     def is_at_similar_height(
-            self, other: Union["TableElement", "TextElement", "ImageElement"], error_margin: float = 1
+        self,
+        other: Union["TableElement", "TextElement", "ImageElement"],
+        error_margin: float = 1,
     ) -> bool:
         y_distance = abs(self.bbox.y1 - other.bbox.y1)
 
@@ -331,25 +333,37 @@ def is_at_similar_height(
 ######################
 ### IMAGE ELEMENTS ###
 ######################
+
+
 class ImageElement(BaseModel):
     text: str
-    ext: str
     bbox: Bbox
-    image: BytesIO # type: ignore
-    block: Optional[dict] = None
+    image: str  # base64 encoded image
+    image_mimetype: Union[
+        Literal[
+            "image/jpeg",
+            "image/png",
+            "image/bmp",
+            "image/jbig2",
+            "image/webp",
+            "unknown",
+        ],
+        str,
+    ]
     ocr_context: Optional[dict] = None
     _embed_text: Optional[str] = None
     variant: Literal[NodeVariant.IMAGE] = NodeVariant.IMAGE
 
-    class Config:
-        arbitrary_types_allowed = True
+    model_config = ConfigDict(frozen=True)
+
     @computed_field  # type: ignore
     @cached_property
     def embed_text(self) -> str:
         if self._embed_text:
             return self._embed_text
 
         return self.text
+
     @cached_property
     def area(self) -> float:
         return (self.bbox.x1 - self.bbox.x0) * (self.bbox.y1 - self.bbox.y0)
@@ -363,7 +377,9 @@ def tokens(self) -> int:
         return 512  # Placeholder for image tokenization
 
     def is_at_similar_height(
-            self, other: Union["TableElement", "TextElement", "ImageElement"], error_margin: float = 1
+        self,
+        other: Union["TableElement", "TextElement", "ImageElement"],
+        error_margin: float = 1,
     ) -> bool:
         y_distance = abs(self.bbox.y1 - other.bbox.y1)
 
@@ -376,10 +392,10 @@ def is_at_similar_height(
 
 
 def _determine_relationship(
-        elem1: Union["TextElement", "TableElement"],
-        elem2: Union["TextElement", "TableElement"],
-        line_threshold: float = 1,
-        paragraph_threshold: float = 12,
+    elem1: Union["TextElement", "TableElement"],
+    elem2: Union["TextElement", "TableElement"],
+    line_threshold: float = 1,
+    paragraph_threshold: float = 12,
 ) -> Literal["same-line", "same-paragraph", None]:
     """
     Determines the relationship between two elements (either TextElement or TableElement).
@@ -428,14 +444,19 @@ def node_id(self) -> str:
 
     @computed_field  # type: ignore
     @cached_property
-    def variant(self) -> Set[Literal["text", "table"]]:
+    def variant(self) -> Set[Literal["text", "table", "image"]]:
         return {e.variant.value for e in self.elements}
 
     @computed_field  # type: ignore
     @cached_property
     def tokens(self) -> int:
         return sum([e.tokens for e in self.elements])
 
+    @computed_field  # type: ignore
+    @cached_property
+    def images(self) -> List[ImageElement]:
+        return [e for e in self.elements if e.variant == NodeVariant.IMAGE]
+
     @computed_field  # type: ignore
     @cached_property
     def bbox(self) -> List[Bbox]:
@@ -568,7 +589,7 @@ def reading_order(self) -> ReadingOrder:
         return ReadingOrder(min_page=min_page, y_position=y_position, min_x0=min_x0)
 
     def overlaps(
-            self, other: "Node", x_error_margin: float = 0.0, y_error_margin: float = 0.0
+        self, other: "Node", x_error_margin: float = 0.0, y_error_margin: float = 0.0
     ) -> bool:
         for bbox in self.bbox:
             other_bboxes = [
@@ -577,13 +598,13 @@ def overlaps(
 
             for other_bbox in other_bboxes:
                 x_overlap = not (
-                        bbox.x0 - x_error_margin > other_bbox.x1 + x_error_margin
-                        or other_bbox.x0 - x_error_margin > bbox.x1 + x_error_margin
+                    bbox.x0 - x_error_margin > other_bbox.x1 + x_error_margin
+                    or other_bbox.x0 - x_error_margin > bbox.x1 + x_error_margin
                 )
 
                 y_overlap = not (
-                        bbox.y0 - y_error_margin > other_bbox.y1 + y_error_margin
-                        or other_bbox.y0 - y_error_margin > bbox.y1 + y_error_margin
+                    bbox.y0 - y_error_margin > other_bbox.y1 + y_error_margin
+                    or other_bbox.y0 - y_error_margin > bbox.y1 + y_error_margin
                 )
 
                 if x_overlap and y_overlap:
@@ -612,7 +633,7 @@ def __lt__(self, other: "Node") -> bool:
             return NotImplemented
 
         assert (
-                self.coordinate_system == other.coordinate_system
+            self.coordinate_system == other.coordinate_system
         ), "Coordinate systems must match."
 
         return self.reading_order < other.reading_order
@@ -705,7 +726,7 @@ def _nodes_to_llama_index(self, llama_index_doc):
         for i in range(len(li_nodes) - 1):
             li_nodes[i].relationships[NodeRelationship.NEXT] = li_nodes[
                 i + 1
-                ].as_related_node_info()
+            ].as_related_node_info()
 
             li_nodes[i + 1].relationships[NodeRelationship.PREVIOUS] = li_nodes[
                 i

diff --git a/src/openparse/text/pdfminer/core.py b/src/openparse/text/pdfminer/core.py
@@ -1,12 +1,19 @@
-import uuid
+import base64
 from io import BytesIO
 from typing import Any, Iterable, List, Tuple, Union
 
-from pdfminer.layout import LTAnno, LTChar, LTTextContainer, LTTextLine, LTImage, LTFigure, LTLine, LTRect
+from pdfminer.layout import (
+    LTAnno,
+    LTChar,
+    LTFigure,
+    LTImage,
+    LTTextContainer,
+    LTTextLine,
+)
 from pydantic import BaseModel, model_validator
 
 from openparse.pdf import Pdf
-from openparse.schemas import Bbox, LineElement, TextElement, TextSpan, ImageElement
+from openparse.schemas import Bbox, ImageElement, LineElement, TextElement, TextSpan
 
 
 class CharElement(BaseModel):
@@ -56,6 +63,24 @@ def _extract_chars(text_line: LTTextLine) -> List[CharElement]:
     return chars
 
 
+def get_mime_type(pdf_object: LTImage) -> Union[str, None]:
+    """Return the MIME type implied by the image stream's /Filter, else None."""
+    attrs = pdf_object.stream.attrs
+    # Absent keys fall back to None; a dict default would break on `.name`.
+    if getattr(attrs.get("Subtype"), "name", None) != "Image":
+        return None
+    filter_ = attrs.get("Filter")
+    if isinstance(filter_, (list, tuple)):  # /Filter may be a chain of names
+        filter_ = filter_[-1] if filter_ else None  # last = innermost codec
+    return {
+        "DCTDecode": "image/jpeg",
+        "FlateDecode": "image/png",  # Most likely, but could also be TIFF
+        "JPXDecode": "image/jp2",
+        "CCITTFaxDecode": "image/tiff",
+        "JBIG2Decode": "image/jbig2",
+    }.get(getattr(filter_, "name", None))
+
+
 def _group_chars_into_spans(chars: Iterable[CharElement]) -> List[TextSpan]:
     spans = []
     current_text = ""
@@ -117,8 +142,8 @@ def _get_bbox(lines: List[LineElement]) -> Tuple[float, float, float, float]:
     return x0, y0, x1, y1
 
 
-def ingest(pdf_input: Union[Pdf]) -> List[TextElement]:
-    """Parse PDF and return a list of LineElement objects."""
+def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]:
+    """Parse PDF and return a list of TextElement and ImageElement objects."""
     elements = []
     page_layouts = pdf_input.extract_layout_pages()
 
@@ -151,30 +176,26 @@ def ingest(pdf_input: Union[Pdf]) -> List[TextElement]:
                     )
                 )
             elif isinstance(element, LTFigure):
-                element = element._objs
-                if element is None:
-                    continue
-                for e in element:
+                for e in element._objs:
                     if isinstance(e, LTImage):
-                        elements.append(
-                            ImageElement(
-                                bbox=Bbox(
-                                    x0=e.bbox[0],
-                                    y0=e.bbox[1],
-                                    x1=e.bbox[2],
-                                    y1=e.bbox[3],
-                                    page=page_num,
-                                    page_width=e.width,
-                                    page_height=e.height,
-                                ),
-                                image=BytesIO(e.stream.get_data()),
-                                ext="png",
-                                text='',
+                        mime_type = get_mime_type(e)
+                        if mime_type:
+                            img_data = BytesIO(e.stream.get_data()).getvalue()
+                            base64_string = base64.b64encode(img_data).decode("utf-8")
+                            elements.append(
+                                ImageElement(
+                                    bbox=Bbox(
+                                        x0=e.bbox[0],
+                                        y0=e.bbox[1],
+                                        x1=e.bbox[2],
+                                        y1=e.bbox[3],
+                                        page=page_num,
+                                        page_width=page_width,
+                                        page_height=page_height,
+                                    ),
+                                    image=base64_string,
+                                    image_mimetype=mime_type or "unknown",
+                                    text="",
+                                )
                             )
-                        )
-            elif isinstance(element, LTLine):
-                pass
-            elif isinstance(element, LTRect):
-                pass
-                # This is a placeholder, actual method may vary
     return elements
-Original file line number
+Diff line change
@@ Expand Up / @@ -37,4 +37,5 @@ notebooks/ @@
     sample-docs/
     weights/
     .env
+    uv.lock