diff --git a/pypdf/_page.py b/pypdf/_page.py
index 3c656a978..cad362506 100644
--- a/pypdf/_page.py
+++ b/pypdf/_page.py
@@ -440,12 +440,25 @@ def _get_ids_image(
         else:
             call_stack.append(_i)
         if self.inline_images_keys is None:
-            nb_inlines = len(
-                re.findall(
-                    WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP,
-                    self._get_contents_as_bytes() or b"",
-                )
-            )
+            content = self._get_contents_as_bytes() or b""
+            # Count "BI" operators that sit outside literal strings. A single
+            # pass tracks the parenthesis depth and skips backslash-escaped
+            # bytes, so text such as "(a BI b)" is not taken for an image.
+            nb_inlines = 0
+            depth = 0
+            for matching in re.finditer(
+                b"\\\\.|(?P<paren>[()])|(?P<bi>"
+                + WHITESPACES_AS_REGEXP
+                + b"BI"
+                + WHITESPACES_AS_REGEXP
+                + b")",
+                content,
+                re.DOTALL,
+            ):
+                if matching.lastgroup == "paren":
+                    depth += 1 if matching.group() == b"(" else -1
+                elif matching.lastgroup == "bi" and depth == 0:
+                    nb_inlines += 1
             self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)]
         if obj is None:
             obj = self
diff --git a/tests/test_images.py b/tests/test_images.py
index b256efe9f..7e9686fac 100644
--- a/tests/test_images.py
+++ b/tests/test_images.py
@@ -236,3 +236,13 @@ def test_devicen_cmyk_black_only():
     name = "iss2321_img1.pdf"
     img = Image.open(BytesIO(get_data_from_url(url, name=name)))
     assert image_similarity(reader.pages[10].images[0].image, img) >= 0.99
+
+
+@pytest.mark.enable_socket()
+def test_bi_in_text():
+    """A 'BI' inside a text string is not an inline image (cf. #2456)."""
+    url = "https://github.com/py-pdf/pypdf/files/14322910/BI_text_with_one_image.pdf"
+    name = "BI_text_with_one_image.pdf"
+    reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
+    assert reader.pages[0].images.keys() == ["~0~"]
+    assert reader.pages[0].images[0].name == "~0~.png"