Skip to content

Commit

Permalink
BUG: BI in text content identified as image tag (#2459)
Browse files Browse the repository at this point in the history
Fixes #2456
  • Loading branch information
pubpub-zz authored Feb 20, 2024
1 parent cc306ad commit 9245c6a
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 6 deletions.
17 changes: 11 additions & 6 deletions pypdf/_page.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,12 +440,17 @@ def _get_ids_image(
else:
call_stack.append(_i)
if self.inline_images_keys is None:
nb_inlines = len(
re.findall(
WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP,
self._get_contents_as_bytes() or b"",
)
)
content = self._get_contents_as_bytes() or b""
nb_inlines = 0
for matching in re.finditer(
WHITESPACES_AS_REGEXP + b"BI" + WHITESPACES_AS_REGEXP,
content,
):
start_of_string = content[: matching.start()]
if len(re.findall(b"[^\\\\]\\(", start_of_string)) == len(
re.findall(b"[^\\\\]\\)", start_of_string)
):
nb_inlines += 1
self.inline_images_keys = [f"~{x}~" for x in range(nb_inlines)]
if obj is None:
obj = self
Expand Down
10 changes: 10 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,3 +236,13 @@ def test_devicen_cmyk_black_only():
name = "iss2321_img1.pdf"
img = Image.open(BytesIO(get_data_from_url(url, name=name)))
assert image_similarity(reader.pages[10].images[0].image, img) >= 0.99


@pytest.mark.enable_socket()
def test_bi_in_text():
    """
    Regression test for #2456.

    The sample document is expected to expose exactly one inline image on
    its first page, keyed "~0~" and named "~0~.png".
    """
    url = "https://github.com/py-pdf/pypdf/files/14322910/BI_text_with_one_image.pdf"
    name = "BI_text_with_one_image.pdf"
    # Fetch the sample document and wrap it in an in-memory stream.
    pdf_stream = BytesIO(get_data_from_url(url, name=name))
    first_page = PdfReader(pdf_stream).pages[0]
    # Only one inline image key must be reported for the page.
    assert first_page.images.keys() == ["~0~"]
    assert first_page.images[0].name == "~0~.png"

0 comments on commit 9245c6a

Please sign in to comment.