Report the actual device space bounding box for rotated text (#20)

* feat: compute *actual* bbox of rotated characters (unlike pdfminer.six)
* fix: caching glyphs is a bad idea since textstate is mutable
* fix: correct bbox in lazy API
* fix: test and docs
dhdaines · Nov 27, 2024 · 2772928 · 2772928
1 parent 03ca1b1
commit 2772928
Show file tree

Hide file tree

Showing 6 changed files with 93 additions and 74 deletions.
diff --git a/playa/font.py b/playa/font.py
@@ -1146,7 +1146,7 @@ def decode(self, data: bytes) -> Iterable[int]:
         return self.cmap.decode(data)
 
     def char_disp(self, cid: int) -> Union[float, Tuple[Optional[float], float]]:
-        """Returns an integer for horizontal fonts, a tuple for vertical fonts."""
+        """Returns 0 for horizontal fonts, a tuple for vertical fonts."""
         return self.disps.get(cid, self.default_disp)
 
     def to_unichr(self, cid: int) -> str:

diff --git a/playa/page.py b/playa/page.py
@@ -64,6 +64,7 @@
     apply_matrix_norm,
     decode_text,
     get_bound,
+    get_transformed_bound,
     make_compat_bytes,
     mult_matrix,
     parse_rect,
@@ -276,16 +277,17 @@ class TextState:
         user space.
       charspace: Extra spacing to add between each glyph, in
         text space units.
-      wordspace: The width of a space, defined curiously as `cid=32`
+      wordspace: The width of a space, defined curiously as `cid==32`
         (But PDF Is A prESeNTaTion fORmAT sO ThERe maY NOt Be aNY
-        SpACeS!!) in text space units.
-      scaling: The scaling factor as defined by the PDF standard.
+        SpACeS!!), in text space units.
+      scaling: The horizontal scaling factor as defined by the PDF
+        standard.
       leading: The leading as defined by the PDF standard.
       render_mode: The PDF rendering mode.  The really important one
         here is 3, which means "don't render the text".  You might
         want to use this to detect invisible text.
-      rise: The rise as defined by the PDF standard.
-
+      rise: The text rise (superscript or subscript position), in text
+        space units.
     """
 
     line_matrix: Matrix = MATRIX_IDENTITY
@@ -1360,10 +1362,7 @@ def render_image(self, xobjid: str, stream: ContentStream) -> LayoutDict:
         # unit high in user space, regardless of the number of samples
         # in the image. To be painted, an image shall be mapped to a
         # region of the page by temporarily altering the CTM.
-        bounds = ((0, 0), (1, 0), (0, 1), (1, 1))
-        x0, y0, x1, y1 = get_bound(
-            apply_matrix_pt(self.ctm, (p, q)) for (p, q) in bounds
-        )
+        x0, y0, x1, y1 = get_transformed_bound(self.ctm, (0, 0, 1, 1))
         if stream.objid is not None and stream.genno is not None:
             stream_id = (stream.objid, stream.genno)
         else:
@@ -1564,33 +1563,26 @@ def render_char(
             log.debug("undefined char: %r, %r", font, cid)
             text = None
         textwidth = font.char_width(cid)
-        textdisp = font.char_disp(cid)
         adv = textwidth * fontsize * scaling
         if font.vertical:
-            # vertical
+            textdisp = font.char_disp(cid)
             assert isinstance(textdisp, tuple)
             (vx, vy) = textdisp
             if vx is None:
                 vx = fontsize * 0.5
             else:
                 vx = vx * fontsize * 0.001
             vy = (1000 - vy) * fontsize * 0.001
-            bbox_lower_left = (-vx, vy + rise + adv)
-            bbox_upper_right = (-vx + fontsize, vy + rise)
+            x0, y0 = (-vx, vy + rise + adv)
+            x1, y1 = (-vx + fontsize, vy + rise)
         else:
-            # horizontal
             descent = font.get_descent() * fontsize
-            bbox_lower_left = (0, descent + rise)
-            bbox_upper_right = (adv, descent + rise + fontsize)
+            x0, y0 = (0, descent + rise)
+            x1, y1 = (adv, descent + rise + fontsize)
         (a, b, c, d, e, f) = matrix
         upright = a * d * scaling > 0 and b * c <= 0
-        # FIXME: This is **not** the bounding box if rotation is involved!
-        x0, y0, x1, y1 = get_bound(
-            (
-                apply_matrix_pt(matrix, bbox_lower_left),
-                apply_matrix_pt(matrix, bbox_upper_right),
-            )
-        )
+        x0, y0, x1, y1 = get_transformed_bound(matrix, (x0, y0, x1, y1))
+        # NOTE: This is not right at all for rotated text, but we'll live with it
         if font.vertical:
             size = x1 - x0
         else:
@@ -1692,7 +1684,8 @@ def object_type(self):
     @property
     def bbox(self) -> Rect:
         points = itertools.chain.from_iterable(
-            ((x0, y0), (x1, y1)) for x0, y0, x1, y1 in (item.bbox for item in self)
+            ((x0, y0), (x0, y1), (x1, y1), (x1, y0))
+            for x0, y0, x1, y1 in (item.bbox for item in self)
         )
         return get_bound(points)
 
@@ -1740,8 +1733,7 @@ def bbox(self) -> Rect:
         # unit high in user space, regardless of the number of samples
         # in the image. To be painted, an image shall be mapped to a
         # region of the page by temporarily altering the CTM.
-        bounds = ((0, 0), (1, 0), (0, 1), (1, 1))
-        return get_bound(apply_matrix_pt(self.ctm, (p, q)) for (p, q) in bounds)
+        return get_transformed_bound(self.ctm, (0, 0, 1, 1))
 
 
 @dataclass
@@ -1778,11 +1770,7 @@ def __getitem__(self, name: str) -> PDFObject:
     def bbox(self) -> Rect:
         """Get the bounding box of this XObject in device space."""
         # It is a required attribute!
-        x0, y0, x1, y1 = parse_rect(self.stream["BBox"])
-        # FIXME: This is *not* the bbox in the case of rotation
-        return get_bound(
-            [apply_matrix_pt(self.ctm, (x0, y0)), apply_matrix_pt(self.ctm, (x1, y1))]
-        )
+        return get_transformed_bound(self.ctm, parse_rect(self.stream["BBox"]))
 
     @property
     def buffer(self) -> bytes:
@@ -1896,14 +1884,11 @@ def bbox(self) -> Rect:
         """Get bounding box of path in device space as defined by its
         points and control points."""
         # First get the bounding box in user space (fast)
-        x0, y0, x1, y1 = get_bound(
+        bbox = get_bound(
             itertools.chain.from_iterable(seg.points for seg in self.raw_segments)
         )
-        # Now transform it
-        x0, y0 = apply_matrix_pt(self.ctm, (x0, y0))
-        x1, y1 = apply_matrix_pt(self.ctm, (x1, y1))
-        # And get the new bounds (also normalizes)
-        return get_bound(((x0, y0), (x1, y1)))
+        # Transform it and get the new bounding box
+        return get_transformed_bound(self.ctm, bbox)
 
 
 class TextItem(NamedTuple):
@@ -1934,20 +1919,19 @@ class GlyphObject(ContentObject):
         context of iteration over the parent `TextObject`.
       cid: Character ID for this glyph.
       text: Unicode mapping of this glyph, if any.
-
+      adv: glyph displacement in user space units.
+      bbox: glyph bounding box in device space.
     """
 
     textstate: TextState
     cid: int
     text: Union[str, None]
-    # FIXME: Subject to change here as not the most useful info
-    lower_left: Point
-    upper_right: Point
+    adv: float
+    _bbox: Rect
 
     @property
     def bbox(self) -> Rect:
-        # FIXME: This is not the bounding box in case of rotation!
-        return get_bound((self.lower_left, self.upper_right))
+        return self._bbox
 
 
 @dataclass
@@ -1972,7 +1956,7 @@ def _render_char(
         cid: int,
         matrix: Matrix,
         scaling: float,
-    ) -> Tuple[GlyphObject, float]:
+    ) -> GlyphObject:
         font = self.textstate.font
         assert font is not None
         fontsize = self.textstate.fontsize
@@ -1984,35 +1968,33 @@ def _render_char(
             log.debug("undefined char: %r, %r", font, cid)
             text = None
         textwidth = font.char_width(cid)
-        textdisp = font.char_disp(cid)
         adv = textwidth * fontsize * scaling
         if font.vertical:
-            # vertical
+            textdisp = font.char_disp(cid)
             assert isinstance(textdisp, tuple)
             (vx, vy) = textdisp
             if vx is None:
                 vx = fontsize * 0.5
             else:
                 vx = vx * fontsize * 0.001
             vy = (1000 - vy) * fontsize * 0.001
-            bbox_lower_left = (-vx, vy + rise + adv)
-            bbox_upper_right = (-vx + fontsize, vy + rise)
+            x0, y0 = (-vx, vy + rise + adv)
+            x1, y1 = (-vx + fontsize, vy + rise)
         else:
-            # horizontal
             descent = font.get_descent() * fontsize
-            bbox_lower_left = (0, descent + rise)
-            bbox_upper_right = (adv, descent + rise + fontsize)
-        item = GlyphObject(
+            x0, y0 = (0, descent + rise)
+            x1, y1 = (adv, descent + rise + fontsize)
+        bbox = get_transformed_bound(matrix, (x0, y0, x1, y1))
+        return GlyphObject(
             self.gstate,
             self.ctm,
             self.mcs,
             self.textstate,
             cid,
             text,
-            apply_matrix_pt(matrix, bbox_lower_left),
-            apply_matrix_pt(matrix, bbox_upper_right),
+            adv,
+            bbox
         )
-        return item, adv
 
     def _render_string(self, item: TextItem) -> Iterator[GlyphObject]:
         assert self.textstate.font is not None
@@ -2040,12 +2022,12 @@ def _render_string(self, item: TextItem) -> Iterator[GlyphObject]:
                     if needcharspace:
                         pos += charspace
                     self.textstate.glyph_offset = (x, pos) if vert else (pos, y)
-                    glyph, adv = self._render_char(
+                    glyph = self._render_char(
                         cid=cid,
                         matrix=translate_matrix(matrix, self.textstate.glyph_offset),
                         scaling=scaling,
                     )
-                    pos += adv
+                    pos += glyph.adv
                     yield glyph
                     if cid == 32 and wordspace:
                         pos += wordspace
@@ -2079,14 +2061,10 @@ def chars(self) -> str:
 
     def __iter__(self) -> Iterator[GlyphObject]:
         """Generate glyphs for this text object"""
-        if self._glyphs is not None:
-            yield from self._glyphs
-        self._glyphs = []
         for item in self.items:
             if item.operator == "TJ":
                 for glyph in self._render_string(item):
                     yield glyph
-                    self._glyphs.append(glyph)
             else:
                 self.textstate.update(item.operator, *item.args)
 

diff --git a/playa/utils.py b/playa/utils.py
@@ -214,16 +214,27 @@ def mult_matrix(m1: Matrix, m0: Matrix) -> Matrix:
 
 
 def translate_matrix(m: Matrix, v: Point) -> Matrix:
-    """Translates a matrix by (x, y)."""
+    """Pre-translates a matrix by `v == (dx, dy)`.
+
+    Specifically this translates the *input space* of the matrix by
+    `(dx, dy)`, so:
+
+       (apply_matrix_pt(matrix, (x + dx, y + dy))
+        == apply_matrix_pt(translate_matrix(matrix,
+                                            (dx, dy)),
+                           (x, y)))
+
+    For all `x, y, dx, dy`.
+    """
     (a, b, c, d, e, f) = m
     (x, y) = v
     return a, b, c, d, x * a + y * c + e, x * b + y * d + f
 
 
 def apply_matrix_pt(m: Matrix, v: Point) -> Point:
+    """Applies a matrix to a point."""
     (a, b, c, d, e, f) = m
     (x, y) = v
-    """Applies a matrix to a point."""
     return a * x + c * y + e, b * x + d * y + f
 
 
@@ -258,6 +269,20 @@ def get_bound(pts: Iterable[Point]) -> Rect:
     return x0, y0, x1, y1
 
 
+def get_transformed_bound(matrix: Matrix, bbox: Rect) -> Rect:
+    """Transform a bounding box and return the rectangle that covers
+    the points of the resulting shape."""
+    x0, y0, x1, y1 = bbox
+    return get_bound(
+        (
+            apply_matrix_pt(matrix, (x0, y0)),
+            apply_matrix_pt(matrix, (x0, y1)),
+            apply_matrix_pt(matrix, (x1, y1)),
+            apply_matrix_pt(matrix, (x1, y0)),
+        )
+    )
+
+
 def choplist(n: int, seq: Iterable[_T]) -> Iterator[Tuple[_T, ...]]:
     """Groups every n elements of the list."""
     r = []

diff --git a/pyproject.toml b/pyproject.toml
@@ -57,10 +57,10 @@ ban-relative-imports = "all"
 testpaths = [ "tests" ]
 
 [tool.hatch.envs.hatch-test]
-extra-dependencies = [ "cryptography", "pdfminer.six", "pandas", "polars" ]
+extra-dependencies = [ "cryptography", "pdfminer.six", "pandas", "polars-lts-cpu" ]
 
 [tool.hatch.envs.default]
-dependencies = [ "cryptography", "pytest", "mypy", "pdfminer.six", "pandas", "polars" ]
+dependencies = [ "cryptography", "pytest", "mypy", "pdfminer.six", "pandas", "polars-lts-cpu" ]
 
 [tool.hatch.envs.default.scripts]
 bench = [

diff --git a/tests/test_lazy_api.py b/tests/test_lazy_api.py
@@ -95,5 +95,20 @@ def test_uncoloured_tiling() -> None:
         assert path.gstate.ncolor == Color((0.5, 0.2, 1.0), "P1")
 
 
+def test_rotated_glyphs() -> None:
+    """Verify that we (unlike pdfminer) properly calculate the bbox
+    for rotated text."""
+    with playa.open(TESTDIR / "contrib" / "issue_495_pdfobjref.pdf") as pdf:
+        chars = []
+        for text in pdf.pages[0].texts:
+            for glyph in text:
+                if 1 not in glyph.textstate.line_matrix:
+                    chars.append(glyph.text)
+                    x0, y0, x1, y1 = glyph.bbox
+                    width = x1 - x0
+                    assert width > 6
+        assert "".join(chars) == "R18,00"
+
+
 if __name__ == "__main__":
     test_content_objects()
diff --git a/tests/test_open.py b/tests/test_open.py
@@ -27,6 +27,11 @@
     "aes-256-m.pdf": ["foo"],
     "aes-256-r6.pdf": ["usersecret", "ownersecret"],
 }
+PDFMINER_BUGS = {
+    "issue-449-vertical.pdf",
+    "issue_495_pdfobjref.pdf",
+    "issue-1008-inline-ascii85.pdf",
+}
 
 
 @pytest.mark.skipif(pdfminer is None, reason="pdfminer.six is not installed")
@@ -45,12 +50,15 @@ def convert_miner(layout):
             if itype == "figure":
                 yield from convert_miner(ltitem)
             else:
-                yield ((itype, ltitem.bbox))
+                yield (itype, ltitem.bbox)
 
     passwords = PASSWORDS.get(path.name, [""])
     for password in passwords:
         miner = []
         with open(path, "rb") as infh:
+            if path.name in PDFMINER_BUGS:
+                pytest.skip("pdfminer.six has a bug, skipping %s" % path.name)
+                break
             try:
                 rsrc = PDFResourceManager()
                 agg = PDFPageAggregator(rsrc, pageno=1)
@@ -175,10 +183,3 @@ def test_tiff_predictor() -> None:
         image = next(doc.pages[0].images)
         # Decoded TIFF: 600 x 600 + a header
         assert len(image.stream.buffer) == 360600
-
-
-if __name__ == "__main__":
-    import logging
-
-    logging.basicConfig(level=logging.DEBUG)
-    test_xobjects()