Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Whitespace Issues #51

Merged
merged 2 commits into from
Jun 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 25 additions & 6 deletions src/openparse/text/pdfminer/core.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Any, Iterable, List, Union, Tuple

from pdfminer.layout import LTChar, LTTextContainer, LTTextLine
from pdfminer.layout import LTAnno, LTChar, LTTextContainer, LTTextLine
from pydantic import BaseModel, model_validator

from openparse.pdf import Pdf
Expand Down Expand Up @@ -28,11 +28,30 @@ def round_size(cls, data: Any) -> Any:


def _extract_chars(text_line: LTTextLine) -> List[CharElement]:
    """Convert the items of a pdfminer text line into CharElements.

    LTChar items carry their own fontname and size. LTAnno items do not have
    font or size information, so each one is given the fontname and size of
    the most recent LTChar seen before it. LTAnno items that appear before
    any LTChar take the values of the first LTChar in the line, falling back
    to "" and 0.0 when the line contains no LTChar at all. Items that are
    neither LTChar nor LTAnno are skipped.

    Args:
        text_line: Iterable of pdfminer layout items making up one line.

    Returns:
        One CharElement per LTChar/LTAnno item, in the original order.
    """
    # Keep only the item kinds that yield text. Materialising the list once
    # lets us look ahead for the first LTChar without re-scanning text_line.
    items = [item for item in text_line if isinstance(item, (LTChar, LTAnno))]

    # Seed the "last seen" font info so leading LTAnno items get a value.
    first_char = next((item for item in items if isinstance(item, LTChar)), None)
    last_fontname = first_char.fontname if first_char is not None else ""
    last_size = first_char.size if first_char is not None else 0.0

    chars = []
    for item in items:
        if isinstance(item, LTChar):
            last_fontname = item.fontname
            last_size = item.size
        chars.append(
            CharElement(text=item.get_text(), fontname=last_fontname, size=last_size)
        )

    return chars


def _group_chars_into_spans(chars: Iterable[CharElement]) -> List[TextSpan]:
Expand Down
117 changes: 116 additions & 1 deletion src/tests/text/pdf_miner/test_core.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
from typing import Tuple, List
from unittest.mock import MagicMock

from pdfminer.layout import LTAnno, LTChar

from openparse.schemas import TextSpan
from openparse.text.pdfminer.core import CharElement, _group_chars_into_spans
from openparse.text.pdfminer.core import (
CharElement,
_group_chars_into_spans,
_extract_chars,
)


raw_chars = [
CharElement(text="1", fontname="bold", size=9.0),
Expand Down Expand Up @@ -83,3 +93,108 @@ def test_group_chars_into_spans():
), f"Expected size {expected_span.size}, got {result_span.size} in mixed styles"

# Add more tests here for additional scenarios like empty inputs, inputs with only spaces, etc.


def _char_data_to_text_line(char_data: List[Tuple[str, str, float]]):
    """Build a stand-in text line made of mocked pdfminer items.

    Each (text, fontname, size) tuple becomes a MagicMock whose get_text()
    returns text. A tuple whose fontname and size are both None is mocked
    with the LTAnno spec and receives no font attributes; every other tuple
    is mocked with the LTChar spec and has fontname and size assigned.
    """

    def _mock_item(text, fontname, size):
        # LTAnno does not have fontname and size attributes
        is_anno = fontname is None and size is None
        item = MagicMock(spec=LTAnno if is_anno else LTChar)
        item.get_text.return_value = text
        if not is_anno:
            item.fontname = fontname
            item.size = size
        return item

    return [_mock_item(text, fontname, size) for text, fontname, size in char_data]


def test_extract_chars():
    """A line made only of LTChar items maps one-to-one onto CharElements."""
    font, size = "Arial-Bold", 12.0
    letters = "About who"

    char_data = [(letter, font, size) for letter in letters]
    expected_output = [
        CharElement(text=letter, fontname=font, size=size) for letter in letters
    ]

    text_line = _char_data_to_text_line(char_data)
    result = _extract_chars(text_line)
    assert len(result) == 9

    # The extracted elements must equal the expected ones, in order.
    assert result == expected_output


def test_extract_chars_with_ltannos():
    """LTAnno items interleaved with LTChar items inherit font and size."""
    font, size = "Arial-Bold", 12.0

    # Runs of text in line order; the flag marks runs made of LTAnno items.
    runs = [
        ("About", False),
        (" ", True),
        ("who", False),
        ("  ", True),
        ("Author", False),
        ("\n", True),
    ]

    # (text, None, None) makes the helper build an LTAnno mock for that item.
    char_data = []
    for run, is_anno in runs:
        for ch in run:
            char_data.append((ch, None, None) if is_anno else (ch, font, size))

    text_line = _char_data_to_text_line(char_data)

    # Every item, LTAnno included, is expected to carry the LTChar font info.
    full_text = "".join(run for run, _ in runs)
    expected_output = [
        CharElement(text=ch, fontname=font, size=size) for ch in full_text
    ]

    result = _extract_chars(text_line)
    assert len(result) == 18

    # The extracted elements must equal the expected ones, in order.
    assert result == expected_output