diff --git a/.gitignore b/.gitignore index ff8db49..bcdd21d 100644 --- a/.gitignore +++ b/.gitignore @@ -37,4 +37,5 @@ notebooks/ sample-docs/ weights/ .env +uv.lock diff --git a/src/openparse/schemas.py b/src/openparse/schemas.py index 3f801e7..de85531 100644 --- a/src/openparse/schemas.py +++ b/src/openparse/schemas.py @@ -4,11 +4,9 @@ from collections import defaultdict, namedtuple from enum import Enum from functools import cached_property -from io import BytesIO from typing import Any, List, Literal, Optional, Set, Tuple, Union from pydantic import BaseModel, ConfigDict, Field, computed_field, model_validator -from pydantic_core import core_schema from openparse import consts from openparse.utils import num_tokens @@ -86,9 +84,9 @@ def is_heading(self) -> bool: return self.size >= MIN_HEADING_SIZE and self.is_bold def formatted_text( - self, - previous_span: Optional["TextSpan"] = None, - next_span: Optional["TextSpan"] = None, + self, + previous_span: Optional["TextSpan"] = None, + next_span: Optional["TextSpan"] = None, ) -> str: """Format text considering adjacent spans to avoid redundant markdown symbols.""" formatted = self.text @@ -190,19 +188,19 @@ def _clean_markdown_formatting(self, text: str) -> str: def overlaps(self, other: "LineElement", error_margin: float = 0.0) -> bool: x_overlap = not ( - self.bbox[0] - error_margin > other.bbox[2] + error_margin - or other.bbox[0] - error_margin > self.bbox[2] + error_margin + self.bbox[0] - error_margin > other.bbox[2] + error_margin + or other.bbox[0] - error_margin > self.bbox[2] + error_margin ) y_overlap = not ( - self.bbox[1] - error_margin > other.bbox[3] + error_margin - or other.bbox[1] - error_margin > self.bbox[3] + error_margin + self.bbox[1] - error_margin > other.bbox[3] + error_margin + or other.bbox[1] - error_margin > self.bbox[3] + error_margin ) return x_overlap and y_overlap def is_at_similar_height( - self, other: "LineElement", error_margin: float = 0.0 + self, other: "LineElement", error_margin: float = 0.0 ) -> bool: y_distance = abs(self.bbox[1] - other.bbox[1]) @@ -261,27 +259,29 @@ def area(self) -> float: return (self.bbox.x1 - self.bbox.x0) * (self.bbox.y1 - self.bbox.y0) def is_at_similar_height( - self, other: Union["TableElement", "TextElement", "ImageElement"], error_margin: float = 1 + self, + other: Union["TableElement", "TextElement", "ImageElement"], + error_margin: float = 1, ) -> bool: y_distance = abs(self.bbox.y1 - other.bbox.y1) return y_distance <= error_margin def overlaps( - self, - other: "TextElement", - x_error_margin: float = 0.0, - y_error_margin: float = 0.0, + self, + other: "TextElement", + x_error_margin: float = 0.0, + y_error_margin: float = 0.0, ) -> bool: if self.page != other.page: return False x_overlap = not ( - self.bbox.x0 - x_error_margin > other.bbox.x1 + x_error_margin - or other.bbox.x0 - x_error_margin > self.bbox.x1 + x_error_margin + self.bbox.x0 - x_error_margin > other.bbox.x1 + x_error_margin + or other.bbox.x0 - x_error_margin > self.bbox.x1 + x_error_margin ) y_overlap = not ( - self.bbox.y0 - y_error_margin > other.bbox.y1 + y_error_margin - or other.bbox.y0 - y_error_margin > self.bbox.y1 + y_error_margin + self.bbox.y0 - y_error_margin > other.bbox.y1 + y_error_margin + or other.bbox.y0 - y_error_margin > self.bbox.y1 + y_error_margin ) return x_overlap and y_overlap @@ -321,7 +321,9 @@ def tokens(self) -> int: return num_tokens(self.text) def is_at_similar_height( - self, other: Union["TableElement", "TextElement", "ImageElement"], error_margin: float = 1 + self, + other: Union["TableElement", "TextElement", "ImageElement"], + error_margin: float = 1, ) -> bool: y_distance = abs(self.bbox.y1 - other.bbox.y1) @@ -331,18 +333,29 @@ def is_at_similar_height( ###################### ### IMAGE ELEMENTS ### ###################### + + class ImageElement(BaseModel): text: str - ext: str bbox: Bbox - image: BytesIO # type: ignore - block: Optional[dict] = None + image: str # base64 encoded image + image_mimetype: Union[ + Literal[ + "image/jpeg", + "image/png", + "image/bmp", + "image/jbig2", + "image/webp", + "unknown", + ], + str, + ] ocr_context: Optional[dict] = None _embed_text: Optional[str] = None variant: Literal[NodeVariant.IMAGE] = NodeVariant.IMAGE - class Config: - arbitrary_types_allowed = True + model_config = ConfigDict(frozen=True) + @computed_field # type: ignore @cached_property def embed_text(self) -> str: @@ -350,6 +363,7 @@ def embed_text(self) -> str: return self._embed_text return self.text + @cached_property def area(self) -> float: return (self.bbox.x1 - self.bbox.x0) * (self.bbox.y1 - self.bbox.y0) @@ -363,7 +377,9 @@ def tokens(self) -> int: return 512 # Placeholder for image tokenization def is_at_similar_height( - self, other: Union["TableElement", "TextElement", "ImageElement"], error_margin: float = 1 + self, + other: Union["TableElement", "TextElement", "ImageElement"], + error_margin: float = 1, ) -> bool: y_distance = abs(self.bbox.y1 - other.bbox.y1) @@ -376,10 +392,10 @@ def is_at_similar_height( def _determine_relationship( - elem1: Union["TextElement", "TableElement"], - elem2: Union["TextElement", "TableElement"], - line_threshold: float = 1, - paragraph_threshold: float = 12, + elem1: Union["TextElement", "TableElement"], + elem2: Union["TextElement", "TableElement"], + line_threshold: float = 1, + paragraph_threshold: float = 12, ) -> Literal["same-line", "same-paragraph", None]: """ Determines the relationship between two elements (either TextElement or TableElement). @@ -428,7 +444,7 @@ def node_id(self) -> str: @computed_field # type: ignore @cached_property - def variant(self) -> Set[Literal["text", "table"]]: + def variant(self) -> Set[Literal["text", "table", "image"]]: return {e.variant.value for e in self.elements} @computed_field # type: ignore @@ -436,6 +452,11 @@ def variant(self) -> Set[Literal["text", "table"]]: def tokens(self) -> int: return sum([e.tokens for e in self.elements]) + @computed_field # type: ignore + @cached_property + def images(self) -> List[ImageElement]: + return [e for e in self.elements if e.variant == NodeVariant.IMAGE] + @computed_field # type: ignore @cached_property def bbox(self) -> List[Bbox]: @@ -568,7 +589,7 @@ def reading_order(self) -> ReadingOrder: return ReadingOrder(min_page=min_page, y_position=y_position, min_x0=min_x0) def overlaps( - self, other: "Node", x_error_margin: float = 0.0, y_error_margin: float = 0.0 + self, other: "Node", x_error_margin: float = 0.0, y_error_margin: float = 0.0 ) -> bool: for bbox in self.bbox: other_bboxes = [ @@ -577,13 +598,13 @@ def overlaps( for other_bbox in other_bboxes: x_overlap = not ( - bbox.x0 - x_error_margin > other_bbox.x1 + x_error_margin - or other_bbox.x0 - x_error_margin > bbox.x1 + x_error_margin + bbox.x0 - x_error_margin > other_bbox.x1 + x_error_margin + or other_bbox.x0 - x_error_margin > bbox.x1 + x_error_margin ) y_overlap = not ( - bbox.y0 - y_error_margin > other_bbox.y1 + y_error_margin - or other_bbox.y0 - y_error_margin > bbox.y1 + y_error_margin + bbox.y0 - y_error_margin > other_bbox.y1 + y_error_margin + or other_bbox.y0 - y_error_margin > bbox.y1 + y_error_margin ) if x_overlap and y_overlap: @@ -612,7 +633,7 @@ def __lt__(self, other: "Node") -> bool: return NotImplemented assert ( - self.coordinate_system == other.coordinate_system + self.coordinate_system == other.coordinate_system ), "Coordinate systems must match." return self.reading_order < other.reading_order @@ -705,7 +726,7 @@ def _nodes_to_llama_index(self, llama_index_doc): for i in range(len(li_nodes) - 1): li_nodes[i].relationships[NodeRelationship.NEXT] = li_nodes[ i + 1 - ].as_related_node_info() + ].as_related_node_info() li_nodes[i + 1].relationships[NodeRelationship.PREVIOUS] = li_nodes[ i diff --git a/src/openparse/text/pdfminer/core.py b/src/openparse/text/pdfminer/core.py index 7a0a209..173fb94 100644 --- a/src/openparse/text/pdfminer/core.py +++ b/src/openparse/text/pdfminer/core.py @@ -1,12 +1,19 @@ -import uuid +import base64 from io import BytesIO from typing import Any, Iterable, List, Tuple, Union -from pdfminer.layout import LTAnno, LTChar, LTTextContainer, LTTextLine, LTImage, LTFigure, LTLine, LTRect +from pdfminer.layout import ( + LTAnno, + LTChar, + LTFigure, + LTImage, + LTTextContainer, + LTTextLine, +) from pydantic import BaseModel, model_validator from openparse.pdf import Pdf -from openparse.schemas import Bbox, LineElement, TextElement, TextSpan, ImageElement +from openparse.schemas import Bbox, ImageElement, LineElement, TextElement, TextSpan class CharElement(BaseModel): @@ -56,6 +63,24 @@ def _extract_chars(text_line: LTTextLine) -> List[CharElement]: return chars +def get_mime_type(pdf_object: LTImage) -> str | None: + subtype = pdf_object.stream.attrs.get("Subtype", {"name": None}).name + filter_ = pdf_object.stream.attrs.get("Filter", {"name": None}).name + if subtype == "Image": + if filter_ == "DCTDecode": + return "image/jpeg" + elif filter_ == "FlateDecode": + return "image/png" # Most likely, but could also be TIFF + elif filter_ == "JPXDecode": + return "image/jp2" + elif filter_ == "CCITTFaxDecode": + return "image/tiff" + elif filter_ == "JBIG2Decode": + return "image/jbig2" + + return None + + def _group_chars_into_spans(chars: Iterable[CharElement]) -> List[TextSpan]: spans = [] current_text = "" @@ -117,8 +142,8 @@ def _get_bbox(lines: List[LineElement]) -> Tuple[float, float, float, float]: return x0, y0, x1, y1 -def ingest(pdf_input: Union[Pdf]) -> List[TextElement]: - """Parse PDF and return a list of LineElement objects.""" +def ingest(pdf_input: Pdf) -> List[Union[TextElement, ImageElement]]: + """Parse PDF and return a list of TextElement and ImageElement objects.""" elements = [] page_layouts = pdf_input.extract_layout_pages() @@ -151,30 +176,26 @@ def ingest(pdf_input: Union[Pdf]) -> List[TextElement]: ) ) elif isinstance(element, LTFigure): - element = element._objs - if element is None: - continue - for e in element: + for e in element._objs: if isinstance(e, LTImage): - elements.append( - ImageElement( - bbox=Bbox( - x0=e.bbox[0], - y0=e.bbox[1], - x1=e.bbox[2], - y1=e.bbox[3], - page=page_num, - page_width=e.width, - page_height=e.height, - ), - image=BytesIO(e.stream.get_data()), - ext="png", - text='', + mime_type = get_mime_type(e) + if mime_type: + img_data = BytesIO(e.stream.get_data()).getvalue() + base64_string = base64.b64encode(img_data).decode("utf-8") + elements.append( + ImageElement( + bbox=Bbox( + x0=e.bbox[0], + y0=e.bbox[1], + x1=e.bbox[2], + y1=e.bbox[3], + page=page_num, + page_width=page_width, + page_height=page_height, + ), + image=base64_string, + image_mimetype=mime_type or "unknown", + text="", + ) ) - ) - elif isinstance(element, LTLine): - pass - elif isinstance(element, LTRect): - pass - # This is a placeholder, actual method may vary return elements diff --git a/src/openparse/text/pymupdf/core.py b/src/openparse/text/pymupdf/core.py index 91f8346..0cc151a 100644 --- a/src/openparse/text/pymupdf/core.py +++ b/src/openparse/text/pymupdf/core.py @@ -1,7 +1,10 @@ +import base64 from typing import List +import fitz + from openparse.pdf import Pdf -from openparse.schemas import Bbox, LineElement, TextElement, TextSpan, ImageElement +from openparse.schemas import Bbox, ImageElement, LineElement, TextElement, TextSpan def flags_decomposer(flags: int) -> str: @@ -66,6 +69,17 @@ def _lines_from_ocr_output(lines: dict, error_margin: float = 0) -> List[LineEle return combined +def _extract_base64_image(doc: fitz.Document, xref: int) -> tuple[str, str]: + img = doc.extract_image(xref) + image_bytes = img["image"] + image_ext = img["ext"] + + base64_image = base64.b64encode(image_bytes).decode("utf-8") + + mime_type = f"image/{image_ext}" + return base64_image, mime_type + + def ingest( doc: Pdf, ) -> List[TextElement]: @@ -76,14 +90,18 @@ def ingest( page_ocr = page.get_textpage_ocr(flags=0, full=False) for node in page.get_text("dict", textpage=page_ocr, sort=True)["blocks"]: # Flip y-coordinates to match the top-left origin system - fy0 = page.rect.height - node["bbox"][1] - fy1 = page.rect.height - node["bbox"][3] + fy0 = page.rect.height - node["bbox"][3] + fy1 = page.rect.height - node["bbox"][1] if node["type"] == 1: img_info = page.get_images(node["bbox"]) - if len(img_info) > 0: - name = f'{img_info[0][7]}.{node["ext"]}' + if img_info: + xref = img_info[0][0] + base64_image, mime_type = _extract_base64_image(pdoc, xref) + name = f'{img_info[0][7]}.{mime_type.split("/")[-1]}' else: + base64_image, mime_type = "", "unknown" name = f'test.{node["ext"]}' + elements.append( ImageElement( bbox=Bbox( @@ -95,8 +113,8 @@ def ingest( page_width=page.rect.width, page_height=page.rect.height, ), - image=node["image"], - ext=node["ext"], + image=base64_image, + image_mimetype=mime_type, text=name, ) ) diff --git a/src/tests/sample_data/europe.jpg b/src/tests/sample_data/europe.jpg new file mode 100644 index 0000000..ea9ef34 Binary files /dev/null and b/src/tests/sample_data/europe.jpg differ diff --git a/src/tests/sample_data/pdf-with-image.pdf b/src/tests/sample_data/pdf-with-image.pdf new file mode 100644 index 0000000..da89750 Binary files /dev/null and b/src/tests/sample_data/pdf-with-image.pdf differ diff --git a/src/tests/text/pdf_miner/test_core.py b/src/tests/text/pdf_miner/test_core.py index 3b2c64e..9d2b87c 100644 --- a/src/tests/text/pdf_miner/test_core.py +++ b/src/tests/text/pdf_miner/test_core.py @@ -1,16 +1,21 @@ -from typing import Tuple, List +import base64 +import io +from pathlib import Path +from typing import List, Tuple from unittest.mock import MagicMock from pdfminer.layout import LTAnno, LTChar +from PIL import Image, ImageChops -from openparse.schemas import TextSpan +from openparse.pdf import Pdf +from openparse.schemas import NodeVariant, TextSpan from openparse.text.pdfminer.core import ( CharElement, - _group_chars_into_spans, _extract_chars, + _group_chars_into_spans, + ingest, ) - raw_chars = [ CharElement(text="1", fontname="bold", size=9.0), CharElement(text=".", fontname="bold", size=9.0), @@ -198,3 +203,51 @@ def test_extract_chars_with_ltannos(): # Assert the result matches the expected output assert result == expected_output + + +def _images_are_similar(img1_bytes, img2_bytes, max_pct_diff=1.0, pixel_threshold=10): + """ + Compare two images and determine if the percentage of differing pixels is below a threshold. + + :param img1_bytes: Byte content of the first image. + :param img2_bytes: Byte content of the second image. + :param max_pct_diff: Maximum allowed percentage of differing pixels. + :param pixel_threshold: Per-pixel difference threshold to consider a pixel as different. + :return: Boolean indicating if images are similar within the allowed percentage difference. + """ + img1 = Image.open(io.BytesIO(img1_bytes)).convert("RGB") + img2 = Image.open(io.BytesIO(img2_bytes)).convert("RGB") + + if img1.size != img2.size: + print(f"Image sizes do not match: {img1.size} vs {img2.size}") + return False + + diff = ImageChops.difference(img1, img2) + + diff_gray = diff.convert("L") + + differing_pixels = sum( + 1 for pixel in diff_gray.getdata() if pixel > pixel_threshold + ) + total_pixels = img1.size[0] * img1.size[1] + pct_diff = (differing_pixels / total_pixels) * 100 + + print(f"Percentage of differing pixels: {pct_diff:.2f}%") + return pct_diff <= max_pct_diff + + +def test_parse_pdf_with_images(): + doc_with_image_path = Path("src/tests/sample_data/pdf-with-image.pdf") + pdf = Pdf(doc_with_image_path) + + elems = ingest(pdf) + assert elems[-1].variant == NodeVariant.IMAGE + assert elems[-1].image_mimetype == "image/jpeg" + extracted_image_data = base64.b64decode(elems[-1].image) + + # Read the raw image data + raw_image_path = Path("src/tests/sample_data/europe.jpg") + with raw_image_path.open("rb") as img_file: + raw_image_data = img_file.read() + + assert _images_are_similar(raw_image_data, extracted_image_data)