BUG: BI in text content identified as image tag (#2459)

Fixes #2456
py-pdf · Feb 20, 2024 · 9245c6a
1 parent cc306ad
commit 9245c6a
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 6 deletions.
diff --git a/pypdf/_page.py b/pypdf/_page.py
@@ -440,12 +440,17 @@ def _get_ids_image(
         else:
             call_stack.append(_i)
         if self.inline_images_keys is None:
-            nb_inlines = len(
-                re.findall(
-                    WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP,
-                    self._get_contents_as_bytes() or b"",
-                )
-            )
+            content = self._get_contents_as_bytes() or b""
+            nb_inlines = 0
+            for matching in re.finditer(
+                WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP,
+                content,
+            ):
+                start_of_string = content[: matching.start()]
+                if len(re.findall(b"[^\\\\]\\(", start_of_string)) == len(
+                    re.findall(b"[^\\\\]\\)", start_of_string)
+                ):
+                    nb_inlines += 1
             self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)]
         if obj is None:
             obj = self

diff --git a/tests/test_images.py b/tests/test_images.py
@@ -236,3 +236,13 @@ def test_devicen_cmyk_black_only():
     name = "iss2321_img1.pdf"
     img = Image.open(BytesIO(get_data_from_url(url, name=name)))
     assert image_similarity(reader.pages[10].images[0].image, img) >= 0.99
+
+
+@pytest.mark.enable_socket()
+def test_bi_in_text():
+    """A "BI" inside page text must not be counted as an inline image (cf. #2456)."""
+    url = "https://github.com/py-pdf/pypdf/files/14322910/BI_text_with_one_image.pdf"
+    name = "BI_text_with_one_image.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    assert reader.pages[0].images.keys() == ["~0~"]
+    assert reader.pages[0].images[0].name == "~0~.png"