Skip to content

Commit

Permalink
ROB: Cope with some image extraction issues (py-pdf#2591)
Browse files Browse the repository at this point in the history
Closes py-pdf#2343:
1st case: images in 1-byte encoding with a Separation color space

2nd case: similar, plus a trailing \n at the end of the image data that must be ignored
  • Loading branch information
pubpub-zz authored Apr 10, 2024
1 parent 5c6a7b6 commit ced67e1
Show file tree
Hide file tree
Showing 3 changed files with 61 additions and 6 deletions.
17 changes: 16 additions & 1 deletion pypdf/_xobj_image_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,21 @@ def _get_imagemode(
return mode, mode == "CMYK"


def _extended_image_frombytes(
    mode: str, size: Tuple[int, int], data: bytes
) -> Image.Image:
    """
    Build an image from raw bytes, expanding under-sized data when possible.

    ``Image.frombytes`` is tried first. If it rejects the buffer with a
    ``ValueError`` and the buffer length is a whole multiple of the pixel
    count, every byte is repeated so that the buffer reaches
    ``len(mode)`` bytes per pixel, and the conversion is retried.

    Args:
        mode: Image mode passed to ``Image.frombytes``; its character count
            is used as the target number of bytes per pixel.
        size: ``(width, height)`` of the image in pixels.
        data: Raw sample data.

    Returns:
        The image created from ``data`` (possibly after expansion).

    Raises:
        ValueError: If ``Image.frombytes`` rejects the data and the data
            cannot be expanded (empty data, zero pixel count, or a length
            that is not a multiple of the pixel count), or if the expanded
            data is rejected as well.
    """
    try:
        return Image.frombytes(mode, size, data)
    except ValueError:
        nb_pix = size[0] * size[1]
        data_length = len(data)
        # Empty data or a zero-area image cannot be expanded. Guarding here
        # keeps the original ValueError instead of masking it with a
        # ZeroDivisionError from the modulo/division below.
        if nb_pix == 0 or data_length == 0 or data_length % nb_pix != 0:
            raise
        # Number of times each input byte must be repeated.
        repeat = nb_pix * len(mode) // data_length
        expanded = b"".join(bytes((x,) * repeat) for x in data)
        return Image.frombytes(mode, size, expanded)


def _handle_flate(
size: Tuple[int, int],
data: bytes,
Expand Down Expand Up @@ -168,7 +183,7 @@ def bits2byte(data: bytes, size: Tuple[int, int], bits: int) -> bytes:
elif mode == "4bits":
mode = "P"
data = bits2byte(data, size, 4)
img = Image.frombytes(mode, size, data)
img = _extended_image_frombytes(mode, size, data)
if color_space == "/Indexed":
from .generic import TextStringObject

Expand Down
22 changes: 17 additions & 5 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,9 @@ def decode(
index = 0
while True:
if index >= len(data):
logger_warning("missing EOD in ASCIIHexDecode, check if output is OK", __name__)
logger_warning(
"missing EOD in ASCIIHexDecode, check if output is OK", __name__
)
break # reach End Of String even if no EOD
char = data[index : index + 1]
if char == b">":
Expand Down Expand Up @@ -341,7 +343,9 @@ def decode(
index = 0
while True:
if index >= len(data):
logger_warning("missing EOD in RunLengthDecode, check if output is OK", __name__)
logger_warning(
"missing EOD in RunLengthDecode, check if output is OK", __name__
)
break # reach End Of String even if no EOD
length = data[index]
index += 1
Expand Down Expand Up @@ -733,6 +737,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
"""
from ._xobj_image_helpers import (
Image,
_extended_image_frombytes,
_get_imagemode,
_handle_flate,
_handle_jpx,
Expand All @@ -747,10 +752,12 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
else:
obj_as_text = x_object_obj.__repr__()

size = (x_object_obj[IA.WIDTH], x_object_obj[IA.HEIGHT])
size = (cast(int, x_object_obj[IA.WIDTH]), cast(int, x_object_obj[IA.HEIGHT]))
data = x_object_obj.get_data() # type: ignore
if isinstance(data, str): # pragma: no cover
data = data.encode()
if len(data) % (size[0] * size[1]) == 1 and data[-1] == 0x0A: # ie. '\n'
data = data[:-1]
colors = x_object_obj.get("/Colors", 1)
color_space: Any = x_object_obj.get("/ColorSpace", NullObject()).get_object()
if isinstance(color_space, list) and len(color_space) == 1:
Expand Down Expand Up @@ -819,7 +826,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
)
elif mode == "CMYK":
img, image_format, extension, invert_color = (
Image.frombytes(mode, size, data),
_extended_image_frombytes(mode, size, data),
"TIFF",
".tif",
False,
Expand All @@ -828,7 +835,7 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
raise PdfReadError(f"ColorSpace field not found in {x_object_obj}")
else:
img, image_format, extension, invert_color = (
Image.frombytes(mode, size, data),
_extended_image_frombytes(mode, size, data),
"PNG",
".png",
False,
Expand All @@ -849,6 +856,11 @@ def _xobj_to_image(x_object_obj: Dict[str, Any]) -> Tuple[Optional[str], bytes,
and color_space[0].get_object() == "/Indexed"
):
decode = None # decode is meanless of Indexed
if (
isinstance(color_space, ArrayObject)
and color_space[0].get_object() == "/Separation"
):
decode = [1.0, 0.0] * len(img.getbands())
if decode is not None and not all(decode[i] == i % 2 for i in range(len(decode))):
lut: List[int] = []
for i in range(0, len(decode), 2):
Expand Down
28 changes: 28 additions & 0 deletions tests/test_images.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,3 +255,31 @@ def test_cmyk_no_filter():
name = "iss2522.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
reader.pages[0].images[0].image


@pytest.mark.enable_socket()
def test_separation_1byte_to_rgb_inverted():
    """
    Compare the first image of the issue #2343 sample with a reference PNG.

    Afterwards one extra byte is appended to the image stream and accessing
    the image is expected to raise ``ValueError``.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/13679585/test2_P038-038.pdf",
        name="iss2343.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))

    reference_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/assets/4083478/b7f41897-96ef-4ea6-b165-5ef307a92b87",
        name="iss2343.png",
    )
    expected = Image.open(BytesIO(reference_bytes))
    assert image_similarity(reader.pages[0].images[0].image, expected) >= 0.99

    # Corrupt the stream length with a single trailing byte.
    stream = reader.pages[0].images[0].indirect_reference.get_object()
    stream.set_data(stream.get_data() + b"\x00")
    with pytest.raises(ValueError):
        reader.pages[0].images[0]


@pytest.mark.enable_socket()
def test_data_with_lf():
    """
    Check an image from the second issue #2343 sample against its reference.

    The extracted image (page index 8, image index 9) must match the
    reference PNG with a similarity of exactly 1.0.
    """
    pdf_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/files/13946477/panda.pdf",
        name="iss2343b.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))

    reference_bytes = get_data_from_url(
        "https://github.com/py-pdf/pypdf/assets/4083478/1120b0cf-a67a-403f-aa1a-9a191cbc087f",
        name="iss2343b0.png",
    )
    expected = Image.open(BytesIO(reference_bytes))
    assert image_similarity(reader.pages[8].images[9].image, expected) == 1.0

0 comments on commit ced67e1

Please sign in to comment.