Skip to content

Commit

Permalink
TST: Fix test_get_images (#730)
Browse files Browse the repository at this point in the history
BUG: Two tests with the same name existed

MAINT: Move _xobj_to_image from Scripts (image extractor) into PyPDF2. This reduces the overall code size and could be useful for others. I make it private for the moment as I'm uncertain if it should be in PyPDF2.
  • Loading branch information
MartinThoma authored Apr 10, 2022
1 parent d123c63 commit 984841c
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 89 deletions.
45 changes: 45 additions & 0 deletions PyPDF2/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,3 +420,48 @@ def decodeStreamData(stream):
# unsupported filter
raise NotImplementedError("unsupported filter %s" % filterType)
return data


def _xobj_to_image(x_object_obj):
"""
Users need to have the pillow package installed.
It's unclear if PyPDF2 will keep this function here, hence it's private.
It might get removed at any point.
:return: Tuple[file extension, bytes]
"""
import io
from PIL import Image

size = (x_object_obj["/Width"], x_object_obj["/Height"])
data = x_object_obj.getData()
if x_object_obj["/ColorSpace"] == "/DeviceRGB":
mode = "RGB"
else:
mode = "P"
extension = None
if "/Filter" in x_object_obj:
if x_object_obj["/Filter"] == "/FlateDecode":
extension = ".png"
img = Image.frombytes(mode, size, data)
if "/SMask" in x_object_obj: # add alpha channel
alpha = Image.frombytes("L", size, x_object_obj["/SMask"].getData())
img.putalpha(alpha)
img_byte_arr = io.BytesIO()
img.save(img_byte_arr, format="PNG")
data = img_byte_arr.getvalue()
elif x_object_obj["/Filter"] == "/DCTDecode":
extension = ".jpg"
elif x_object_obj["/Filter"] == "/JPXDecode":
extension = ".jp2"
elif x_object_obj["/Filter"] == "/CCITTFaxDecode":
extension = ".tiff"
else:
extension = ".png"
img = Image.frombytes(mode, size, data)
img_byte_arr = io.BytesIO()
img.save(img_byte_arr, format="PNG")
data = img_byte_arr.getvalue()

return extension, data
66 changes: 23 additions & 43 deletions Scripts/pdf-image-extractor.py
Original file line number Diff line number Diff line change
@@ -1,57 +1,37 @@
'''
"""
Extract images from PDF without resampling or altering.
Adapted from work by Sylvain Pelissier
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
'''
"""

import sys
import PyPDF2
from PIL import Image
from PyPDF2.filters import _xobj_to_image

if (len(sys.argv) != 2):
print("\nUsage: python {} input_file\n".format(sys.argv[0]))
sys.exit(1)

pdf = sys.argv[1]
def main(pdf: str):
reader = PyPDF2.PdfFileReader(open(pdf, "rb"))
page = reader.pages[30]

if __name__ == '__main__':
input1 = PyPDF2.PdfFileReader(open(pdf, "rb"))
page0 = input1.getPage(30)

if '/XObject' in page0['/Resources']:
xObject = page0['/Resources']['/XObject'].getObject()
if "/XObject" in page["/Resources"]:
xObject = page["/Resources"]["/XObject"].getObject()

for obj in xObject:
if xObject[obj]['/Subtype'] == '/Image':
size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
data = xObject[obj].getData()
if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
mode = "RGB"
else:
mode = "P"

if '/Filter' in xObject[obj]:
if xObject[obj]['/Filter'] == '/FlateDecode':
img = Image.frombytes(mode, size, data)
if "/SMask" in xObject[obj]: # add alpha channel
alpha = Image.frombytes("L", size, xObject[obj]["/SMask"].getData())
img.putalpha(alpha)
img.save(obj[1:] + ".png")
elif xObject[obj]['/Filter'] == '/DCTDecode':
img = open(obj[1:] + ".jpg", "wb")
img.write(data)
img.close()
elif xObject[obj]['/Filter'] == '/JPXDecode':
img = open(obj[1:] + ".jp2", "wb")
img.write(data)
img.close()
elif xObject[obj]['/Filter'] == '/CCITTFaxDecode':
img = open(obj[1:] + ".tiff", "wb")
img.write(data)
img.close()
else:
img = Image.frombytes(mode, size, data)
img.save(obj[1:] + ".png")
if xObject[obj]["/Subtype"] == "/Image":
extension, byte_stream = _xobj_to_image(xObject[obj])
if extension is not None:
filename = obj[1:] + ".png"
with open(filename, "wb") as img:
img.write(byte_stream)
else:
print("No image found.")


if __name__ == "__main__":
if len(sys.argv) != 2:
print("\nUsage: python {} input_file\n".format(sys.argv[0]))
sys.exit(1)

pdf = sys.argv[1]
main(pdf)
58 changes: 12 additions & 46 deletions Tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import pytest
import PyPDF2
from PyPDF2.filters import decodeStreamData, _xobj_to_image

TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
Expand Down Expand Up @@ -84,61 +85,26 @@ def test_get_outlines(src, outline_elements):
],
)
def test_get_images(src, nb_images):
from PIL import Image

input1 = PyPDF2.PdfFileReader(open(src, "rb"))
reader = PyPDF2.PdfFileReader(open(src, "rb"))

with pytest.raises(TypeError):
page0 = input1.pages["0"]
page = reader.pages["0"]

page0 = input1.pages[-1]
page0 = input1.pages[0]
page = reader.pages[-1]
page = reader.pages[0]

images_extracted = []

if "/XObject" in page0["/Resources"]:
xObject = page0["/Resources"]["/XObject"].getObject()
if "/XObject" in page["/Resources"]:
xObject = page["/Resources"]["/XObject"].getObject()

for obj in xObject:
if xObject[obj]["/Subtype"] == "/Image":
size = (xObject[obj]["/Width"], xObject[obj]["/Height"])
data = xObject[obj].getData()
if xObject[obj]["/ColorSpace"] == "/DeviceRGB":
mode = "RGB"
else:
mode = "P"

filename = None
if "/Filter" in xObject[obj]:
if xObject[obj]["/Filter"] == "/FlateDecode":
img = Image.frombytes(mode, size, data)
if "/SMask" in xObject[obj]: # add alpha channel
alpha = Image.frombytes(
"L", size, xObject[obj]["/SMask"].getData()
)
img.putalpha(alpha)
filename = obj[1:] + ".png"
img.save(filename)
elif xObject[obj]["/Filter"] == "/DCTDecode":
filename = obj[1:] + ".jpg"
img = open(filename, "wb")
img.write(data)
img.close()
elif xObject[obj]["/Filter"] == "/JPXDecode":
filename = obj[1:] + ".jp2"
img = open(filename, "wb")
img.write(data)
img.close()
elif xObject[obj]["/Filter"] == "/CCITTFaxDecode":
filename = obj[1:] + ".tiff"
img = open(filename, "wb")
img.write(data)
img.close()
else:
img = Image.frombytes(mode, size, data)
extension, byte_stream = _xobj_to_image(xObject[obj])
if extension is not None:
filename = obj[1:] + ".png"
img.save(filename)
if filename is not None:
with open(filename, "wb") as img:
img.write(byte_stream)
images_extracted.append(filename)
else:
print("No image found.")
Expand All @@ -155,7 +121,7 @@ def test_get_images(src, nb_images):
(False, False, False),
],
)
def test_get_images(strict, with_prev_0, should_fail):
def test_get_images_raw(strict, with_prev_0, should_fail):
pdf_data = b"%%PDF-1.7\n" \
b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n" \
b"2 0 obj << >> endobj\n" \
Expand Down

0 comments on commit 984841c

Please sign in to comment.