Skip to content

Commit

Permalink
Merge pull request #51 from Filimoa/whitespace
Browse files (browse the repository at this point in the history)
Whitespace Issues
  • Loading branch information
Filimoa authored Jun 13, 2024
2 parents a925f95 + 1b24e8e commit 4e9936f
Show file tree
Hide file tree
Showing 2 changed files with 141 additions and 7 deletions.
31 changes: 25 additions & 6 deletions src/openparse/text/pdfminer/core.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Any, Iterable, List, Union, Tuple

from pdfminer.layout import LTChar, LTTextContainer, LTTextLine
from pdfminer.layout import LTAnno, LTChar, LTTextContainer, LTTextLine
from pydantic import BaseModel, model_validator

from openparse.pdf import Pdf
Expand Down Expand Up @@ -28,11 +28,30 @@ def round_size(cls, data: Any) -> Any:


def _extract_chars(text_line: LTTextLine) -> List[CharElement]:
    """Flatten a pdfminer text line into a list of ``CharElement`` objects.

    Both ``LTChar`` and ``LTAnno`` items are kept, so whitespace and newlines
    that pdfminer inserts as annotations are preserved. ``LTAnno`` items carry
    no font information of their own, so each one inherits the fontname and
    size of the most recent ``LTChar`` seen before it.

    Args:
        text_line: Iterable of pdfminer layout items making up a single line.

    Returns:
        One ``CharElement`` per ``LTChar``/``LTAnno`` item, in original order.
        Items of any other type are ignored.
    """
    # Seed the running style from the first LTChar so that any LTAnno that
    # appears before it still gets a style; fall back to ""/0.0 when the line
    # holds no LTChar at all.
    first_char = next(
        (item for item in text_line if isinstance(item, LTChar)), None
    )
    last_fontname = first_char.fontname if first_char is not None else ""
    last_size = first_char.size if first_char is not None else 0.0

    chars: List[CharElement] = []
    for item in text_line:
        if isinstance(item, LTChar):
            # A real glyph: it becomes the style source for following LTAnnos.
            last_fontname = item.fontname
            last_size = item.size
        elif not isinstance(item, LTAnno):
            continue
        chars.append(
            CharElement(
                text=item.get_text(), fontname=last_fontname, size=last_size
            )
        )

    return chars


def _group_chars_into_spans(chars: Iterable[CharElement]) -> List[TextSpan]:
Expand Down
117 changes: 116 additions & 1 deletion src/tests/text/pdf_miner/test_core.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
from typing import Tuple, List
from unittest.mock import MagicMock

from pdfminer.layout import LTAnno, LTChar

from openparse.schemas import TextSpan
from openparse.text.pdfminer.core import CharElement, _group_chars_into_spans
from openparse.text.pdfminer.core import (
CharElement,
_group_chars_into_spans,
_extract_chars,
)


raw_chars = [
CharElement(text="1", fontname="bold", size=9.0),
Expand Down Expand Up @@ -83,3 +93,108 @@ def test_group_chars_into_spans():
), f"Expected size {expected_span.size}, got {result_span.size} in mixed styles"

# Add more tests here for additional scenarios like empty inputs, inputs with only spaces, etc.


def _char_data_to_text_line(char_data: List[Tuple[str, str, float]]):
text_line = []
for text, fontname, size in char_data:
# LTAnno does not have fontname and size attributes
if fontname is None and size is None:
anno = MagicMock(spec=LTAnno)
anno.get_text.return_value = text
text_line.append(anno)
else:
char = MagicMock(spec=LTChar)
char.get_text.return_value = text
char.fontname = fontname
char.size = size
text_line.append(char)
return text_line


def test_extract_chars():
    """A line made only of LTChar items is returned character for character."""
    fontname, size = "Arial-Bold", 12.0
    text = "About who"

    char_data = [(letter, fontname, size) for letter in text]
    expected_output = [
        CharElement(text=letter, fontname=fontname, size=size) for letter in text
    ]

    result = _extract_chars(_char_data_to_text_line(char_data))

    assert len(result) == 9
    # Every extracted element must match the expected text and style.
    assert result == expected_output


def test_extract_chars_with_ltannos():
    """LTAnno items interleaved with LTChars inherit the preceding font/size."""
    fontname, size = "Arial-Bold", 12.0

    def glyphs(word):
        # Tuples that become LTChar mocks.
        return [(letter, fontname, size) for letter in word]

    def annos(text):
        # Tuples that become LTAnno mocks (no fontname, no size).
        return [(letter, None, None) for letter in text]

    char_data = (
        glyphs("About")
        + annos(" ")
        + glyphs("who")
        + annos("  ")
        + glyphs("Author")
        + annos("\n")
    )
    text_line = _char_data_to_text_line(char_data)

    # Every element, annotation or not, is expected to carry the same style.
    expected_output = [
        CharElement(text=letter, fontname=fontname, size=size)
        for letter, _, _ in char_data
    ]

    result = _extract_chars(text_line)

    assert len(result) == 18
    assert result == expected_output

0 comments on commit 4e9936f

Please sign in to comment.