Skip to content

Commit

Permalink
Report the actual device space bounding box for rotated text (#20)
Browse files Browse the repository at this point in the history
* feat: compute *actual* bbox of rotated characters (unlike pdfminer.six)

* fix: caching glyphs is a bad idea since textstate is mutable

* fix: correct bbox in lazy API

* fix: test and docs
  • Loading branch information
dhdaines authored Nov 27, 2024
1 parent 03ca1b1 commit 2772928
Show file tree
Hide file tree
Showing 6 changed files with 93 additions and 74 deletions.
2 changes: 1 addition & 1 deletion playa/font.py
Original file line number Diff line number Diff line change
Expand Up @@ -1146,7 +1146,7 @@ def decode(self, data: bytes) -> Iterable[int]:
return self.cmap.decode(data)

def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
    """Return the displacement recorded for character ID `cid`.

    The value is looked up in `self.disps`; when the CID has no entry
    the font's `self.default_disp` is returned instead.  Per the
    existing documentation of this method the result is 0 for
    horizontal fonts and a tuple for vertical fonts.
    """
    return self.disps.get(cid, self.default_disp)

def to_unichr(self, cid: int) -> str:
Expand Down
100 changes: 39 additions & 61 deletions playa/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@
apply_matrix_norm,
decode_text,
get_bound,
get_transformed_bound,
make_compat_bytes,
mult_matrix,
parse_rect,
Expand Down Expand Up @@ -276,16 +277,17 @@ class TextState:
user space.
charspace: Extra spacing to add between each glyph, in
text space units.
wordspace: The width of a space, defined curiously as `cid=32`
wordspace: The width of a space, defined curiously as `cid==32`
(But PDF Is A prESeNTaTion fORmAT sO ThERe maY NOt Be aNY
SpACeS!!) in text space units.
scaling: The scaling factor as defined by the PDF standard.
SpACeS!!), in text space units.
scaling: The horizontal scaling factor as defined by the PDF
standard.
leading: The leading as defined by the PDF standard.
render_mode: The PDF rendering mode. The really important one
here is 3, which means "don't render the text". You might
want to use this to detect invisible text.
rise: The rise as defined by the PDF standard.
rise: The text rise (superscript or subscript position), in text
space units.
"""

line_matrix: Matrix = MATRIX_IDENTITY
Expand Down Expand Up @@ -1360,10 +1362,7 @@ def render_image(self, xobjid: str, stream: ContentStream) -> LayoutDict:
# unit high in user space, regardless of the number of samples
# in the image. To be painted, an image shall be mapped to a
# region of the page by temporarily altering the CTM.
bounds = ((0, 0), (1, 0), (0, 1), (1, 1))
x0, y0, x1, y1 = get_bound(
apply_matrix_pt(self.ctm, (p, q)) for (p, q) in bounds
)
x0, y0, x1, y1 = get_transformed_bound(self.ctm, (0, 0, 1, 1))
if stream.objid is not None and stream.genno is not None:
stream_id = (stream.objid, stream.genno)
else:
Expand Down Expand Up @@ -1564,33 +1563,26 @@ def render_char(
log.debug("undefined char: %r, %r", font, cid)
text = None
textwidth = font.char_width(cid)
textdisp = font.char_disp(cid)
adv = textwidth * fontsize * scaling
if font.vertical:
# vertical
textdisp = font.char_disp(cid)
assert isinstance(textdisp, tuple)
(vx, vy) = textdisp
if vx is None:
vx = fontsize * 0.5
else:
vx = vx * fontsize * 0.001
vy = (1000 - vy) * fontsize * 0.001
bbox_lower_left = (-vx, vy + rise + adv)
bbox_upper_right = (-vx + fontsize, vy + rise)
x0, y0 = (-vx, vy + rise + adv)
x1, y1 = (-vx + fontsize, vy + rise)
else:
# horizontal
descent = font.get_descent() * fontsize
bbox_lower_left = (0, descent + rise)
bbox_upper_right = (adv, descent + rise + fontsize)
x0, y0 = (0, descent + rise)
x1, y1 = (adv, descent + rise + fontsize)
(a, b, c, d, e, f) = matrix
upright = a * d * scaling > 0 and b * c <= 0
# FIXME: This is **not** the bounding box if rotation is involved!
x0, y0, x1, y1 = get_bound(
(
apply_matrix_pt(matrix, bbox_lower_left),
apply_matrix_pt(matrix, bbox_upper_right),
)
)
x0, y0, x1, y1 = get_transformed_bound(matrix, (x0, y0, x1, y1))
# NOTE: This is not right at all for rotated text, but we'll live with it
if font.vertical:
size = x1 - x0
else:
Expand Down Expand Up @@ -1692,7 +1684,8 @@ def object_type(self):
@property
def bbox(self) -> Rect:
points = itertools.chain.from_iterable(
((x0, y0), (x1, y1)) for x0, y0, x1, y1 in (item.bbox for item in self)
((x0, y0), (x0, y1), (x1, y1), (x1, y0))
for x0, y0, x1, y1 in (item.bbox for item in self)
)
return get_bound(points)

Expand Down Expand Up @@ -1740,8 +1733,7 @@ def bbox(self) -> Rect:
# unit high in user space, regardless of the number of samples
# in the image. To be painted, an image shall be mapped to a
# region of the page by temporarily altering the CTM.
bounds = ((0, 0), (1, 0), (0, 1), (1, 1))
return get_bound(apply_matrix_pt(self.ctm, (p, q)) for (p, q) in bounds)
return get_transformed_bound(self.ctm, (0, 0, 1, 1))


@dataclass
Expand Down Expand Up @@ -1778,11 +1770,7 @@ def __getitem__(self, name: str) -> PDFObject:
def bbox(self) -> Rect:
"""Get the bounding box of this XObject in device space."""
# It is a required attribute!
x0, y0, x1, y1 = parse_rect(self.stream["BBox"])
# FIXME: This is *not* the bbox in the case of rotation
return get_bound(
[apply_matrix_pt(self.ctm, (x0, y0)), apply_matrix_pt(self.ctm, (x1, y1))]
)
return get_transformed_bound(self.ctm, parse_rect(self.stream["BBox"]))

@property
def buffer(self) -> bytes:
Expand Down Expand Up @@ -1896,14 +1884,11 @@ def bbox(self) -> Rect:
"""Get bounding box of path in device space as defined by its
points and control points."""
# First get the bounding box in user space (fast)
x0, y0, x1, y1 = get_bound(
bbox = get_bound(
itertools.chain.from_iterable(seg.points for seg in self.raw_segments)
)
# Now transform it
x0, y0 = apply_matrix_pt(self.ctm, (x0, y0))
x1, y1 = apply_matrix_pt(self.ctm, (x1, y1))
# And get the new bounds (also normalizes)
return get_bound(((x0, y0), (x1, y1)))
# Transform it and get the new bounding box
return get_transformed_bound(self.ctm, bbox)


class TextItem(NamedTuple):
Expand Down Expand Up @@ -1934,20 +1919,19 @@ class GlyphObject(ContentObject):
context of iteration over the parent `TextObject`.
cid: Character ID for this glyph.
text: Unicode mapping of this glyph, if any.
adv: glyph displacement in user space units.
bbox: glyph bounding box in device space.
"""

textstate: TextState
cid: int
text: Union[str, None]
# FIXME: Subject to change here as not the most useful info
lower_left: Point
upper_right: Point
adv: float
_bbox: Rect

@property
def bbox(self) -> Rect:
    """Return the glyph's bounding box in device space.

    The rectangle is the one stored in the `_bbox` field when the
    glyph was created; nothing is recomputed here.  (Rebuilding it from
    only the lower-left and upper-right corners does not give the
    bounding box once rotation is involved.)
    """
    return self._bbox


@dataclass
Expand All @@ -1972,7 +1956,7 @@ def _render_char(
cid: int,
matrix: Matrix,
scaling: float,
) -> Tuple[GlyphObject, float]:
) -> GlyphObject:
font = self.textstate.font
assert font is not None
fontsize = self.textstate.fontsize
Expand All @@ -1984,35 +1968,33 @@ def _render_char(
log.debug("undefined char: %r, %r", font, cid)
text = None
textwidth = font.char_width(cid)
textdisp = font.char_disp(cid)
adv = textwidth * fontsize * scaling
if font.vertical:
# vertical
textdisp = font.char_disp(cid)
assert isinstance(textdisp, tuple)
(vx, vy) = textdisp
if vx is None:
vx = fontsize * 0.5
else:
vx = vx * fontsize * 0.001
vy = (1000 - vy) * fontsize * 0.001
bbox_lower_left = (-vx, vy + rise + adv)
bbox_upper_right = (-vx + fontsize, vy + rise)
x0, y0 = (-vx, vy + rise + adv)
x1, y1 = (-vx + fontsize, vy + rise)
else:
# horizontal
descent = font.get_descent() * fontsize
bbox_lower_left = (0, descent + rise)
bbox_upper_right = (adv, descent + rise + fontsize)
item = GlyphObject(
x0, y0 = (0, descent + rise)
x1, y1 = (adv, descent + rise + fontsize)
bbox = get_transformed_bound(matrix, (x0, y0, x1, y1))
return GlyphObject(
self.gstate,
self.ctm,
self.mcs,
self.textstate,
cid,
text,
apply_matrix_pt(matrix, bbox_lower_left),
apply_matrix_pt(matrix, bbox_upper_right),
adv,
bbox
)
return item, adv

def _render_string(self, item: TextItem) -> Iterator[GlyphObject]:
assert self.textstate.font is not None
Expand Down Expand Up @@ -2040,12 +2022,12 @@ def _render_string(self, item: TextItem) -> Iterator[GlyphObject]:
if needcharspace:
pos += charspace
self.textstate.glyph_offset = (x, pos) if vert else (pos, y)
glyph, adv = self._render_char(
glyph = self._render_char(
cid=cid,
matrix=translate_matrix(matrix, self.textstate.glyph_offset),
scaling=scaling,
)
pos += adv
pos += glyph.adv
yield glyph
if cid == 32 and wordspace:
pos += wordspace
Expand Down Expand Up @@ -2079,14 +2061,10 @@ def chars(self) -> str:

def __iter__(self) -> Iterator[GlyphObject]:
    """Generate glyphs for this text object.

    Walks `self.items` in order: each `TJ` item is rendered through
    `self._render_string` and its glyphs are yielded; every other item
    is applied to `self.textstate` via its `update` method.

    Glyphs are produced afresh on every iteration instead of being
    cached.  The text state handed to the glyphs is mutable, and
    replaying a stored list and then rendering again would yield every
    glyph twice on a second pass.
    """
    for item in self.items:
        if item.operator == "TJ":
            yield from self._render_string(item)
        else:
            self.textstate.update(item.operator, *item.args)

Expand Down
29 changes: 27 additions & 2 deletions playa/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,16 +214,27 @@ def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:


def translate_matrix(m: Matrix, v: Point) -> Matrix:
    """Pre-translate a matrix by `v == (dx, dy)`.

    Specifically this translates the *input space* of the matrix by
    `(dx, dy)`, so that for all `x, y, dx, dy`:

        apply_matrix_pt(m, (x + dx, y + dy))
            == apply_matrix_pt(translate_matrix(m, (dx, dy)), (x, y))

    Only the two translation components of the result differ from `m`;
    `a, b, c, d` are returned unchanged.
    """
    (a, b, c, d, e, f) = m
    (x, y) = v
    return a, b, c, d, x * a + y * c + e, x * b + y * d + f


def apply_matrix_pt(m: Matrix, v: Point) -> Point:
    """Apply a matrix to a point.

    With `m == (a, b, c, d, e, f)` and `v == (x, y)` the result is
    `(a * x + c * y + e, b * x + d * y + f)`.
    """
    (a, b, c, d, e, f) = m
    (x, y) = v
    return a * x + c * y + e, b * x + d * y + f


Expand Down Expand Up @@ -258,6 +269,20 @@ def get_bound(pts: Iterable[Point]) -> Rect:
return x0, y0, x1, y1


def get_transformed_bound(matrix: Matrix, bbox: Rect) -> Rect:
    """Return the rectangle covering `bbox` after it is mapped by `matrix`.

    All four corners of `bbox` are transformed with `apply_matrix_pt`
    and handed to `get_bound`, so the result covers the whole
    transformed shape rather than just two opposite corners.
    """
    xa, ya, xb, yb = bbox
    # Visit the corners in the order (xa, ya), (xa, yb), (xb, yb), (xb, ya).
    corners = ((xa, ya), (xa, yb), (xb, yb), (xb, ya))
    return get_bound(tuple(apply_matrix_pt(matrix, corner) for corner in corners))


def choplist(n: int, seq: Iterable[_T]) -> Iterator[Tuple[_T, ...]]:
"""Groups every n elements of the list."""
r = []
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,10 @@ ban-relative-imports = "all"
testpaths = [ "tests" ]

[tool.hatch.envs.hatch-test]
extra-dependencies = [ "cryptography", "pdfminer.six", "pandas", "polars" ]
extra-dependencies = [ "cryptography", "pdfminer.six", "pandas", "polars-lts-cpu" ]

[tool.hatch.envs.default]
dependencies = [ "cryptography", "pytest", "mypy", "pdfminer.six", "pandas", "polars" ]
dependencies = [ "cryptography", "pytest", "mypy", "pdfminer.six", "pandas", "polars-lts-cpu" ]

[tool.hatch.envs.default.scripts]
bench = [
Expand Down
15 changes: 15 additions & 0 deletions tests/test_lazy_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,5 +95,20 @@ def test_uncoloured_tiling() -> None:
assert path.gstate.ncolor == Color((0.5, 0.2, 1.0), "P1")


def test_rotated_glyphs() -> None:
    """Check the bounding boxes reported for rotated text.

    Unlike pdfminer, the bbox of a rotated glyph is expected to be
    computed properly: every selected glyph must be more than 6 units
    wide, and together the selected glyphs must spell "R18,00".
    """
    pdf_path = TESTDIR / "contrib" / "issue_495_pdfobjref.pdf"
    with playa.open(pdf_path) as pdf:
        rotated_text = []
        for text_obj in pdf.pages[0].texts:
            for glyph in text_obj:
                # Only glyphs whose line matrix contains no 1 entry are
                # checked (presumably the rotated ones -- the identity
                # matrix has 1s on its diagonal).
                if 1 in glyph.textstate.line_matrix:
                    continue
                rotated_text.append(glyph.text)
                left, _bottom, right, _top = glyph.bbox
                assert right - left > 6
        assert "".join(rotated_text) == "R18,00"


if __name__ == "__main__":
test_content_objects()
17 changes: 9 additions & 8 deletions tests/test_open.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,11 @@
"aes-256-m.pdf": ["foo"],
"aes-256-r6.pdf": ["usersecret", "ownersecret"],
}
# File names of test PDFs for which the comparison against pdfminer.six
# is skipped: where this set is consulted the test calls `pytest.skip`
# with the message "pdfminer.six has a bug".
PDFMINER_BUGS = {
"issue-449-vertical.pdf",
"issue_495_pdfobjref.pdf",
"issue-1008-inline-ascii85.pdf",
}


@pytest.mark.skipif(pdfminer is None, reason="pdfminer.six is not installed")
Expand All @@ -45,12 +50,15 @@ def convert_miner(layout):
if itype == "figure":
yield from convert_miner(ltitem)
else:
yield ((itype, ltitem.bbox))
yield (itype, ltitem.bbox)

passwords = PASSWORDS.get(path.name, [""])
for password in passwords:
miner = []
with open(path, "rb") as infh:
if path.name in PDFMINER_BUGS:
pytest.skip("pdfminer.six has a bug, skipping %s" % path.name)
break
try:
rsrc = PDFResourceManager()
agg = PDFPageAggregator(rsrc, pageno=1)
Expand Down Expand Up @@ -175,10 +183,3 @@ def test_tiff_predictor() -> None:
image = next(doc.pages[0].images)
# Decoded TIFF: 600 x 600 + a header
assert len(image.stream.buffer) == 360600


if __name__ == "__main__":
import logging

logging.basicConfig(level=logging.DEBUG)
test_xobjects()

0 comments on commit 2772928

Please sign in to comment.