TST: Fix test_get_images (#730)

BUG: Two tests with the same name existed. MAINT: Move _xobj_to_image from Scripts (image extractor) into PyPDF2. This reduces the overall code size and could be useful for others. I made it private for the moment, as I'm uncertain whether it should be in PyPDF2.
py-pdf · Apr 10, 2022 · 984841c · 984841c
1 parent d123c63
commit 984841c
Show file tree

Hide file tree

Showing 3 changed files with 80 additions and 89 deletions.
diff --git a/PyPDF2/filters.py b/PyPDF2/filters.py
@@ -420,3 +420,48 @@ def decodeStreamData(stream):
                 # unsupported filter
                 raise NotImplementedError("unsupported filter %s" % filterType)
     return data
+
+
+def _xobj_to_image(x_object_obj):
+    """
+    Users need to have the pillow package installed.
+
+    It's unclear if PyPDF2 will keep this function here, hence it's private.
+    It might get removed at any point.
+
+    :return: Tuple[file extension, bytes]; the extension is None for an unhandled /Filter
+    """
+    import io
+    from PIL import Image
+
+    size = (x_object_obj["/Width"], x_object_obj["/Height"])  # (width, height) passed to Image.frombytes
+    data = x_object_obj.getData()
+    if x_object_obj["/ColorSpace"] == "/DeviceRGB":
+        mode = "RGB"
+    else:
+        mode = "P"  # NOTE(review): every colour space other than /DeviceRGB ends up here - confirm for gray/CMYK
+    extension = None  # stays None when /Filter is present but matches none of the branches below
+    if "/Filter" in x_object_obj:
+        if x_object_obj["/Filter"] == "/FlateDecode":
+            extension = ".png"
+            img = Image.frombytes(mode, size, data)  # rebuilt with PIL, then re-encoded as PNG below
+            if "/SMask" in x_object_obj:  # add alpha channel
+                alpha = Image.frombytes("L", size, x_object_obj["/SMask"].getData())
+                img.putalpha(alpha)
+            img_byte_arr = io.BytesIO()
+            img.save(img_byte_arr, format="PNG")
+            data = img_byte_arr.getvalue()
+        elif x_object_obj["/Filter"] == "/DCTDecode":
+            extension = ".jpg"  # data from getData() is returned unchanged on this branch
+        elif x_object_obj["/Filter"] == "/JPXDecode":
+            extension = ".jp2"  # data from getData() is returned unchanged on this branch
+        elif x_object_obj["/Filter"] == "/CCITTFaxDecode":
+            extension = ".tiff"  # data from getData() is returned unchanged on this branch
+    else:  # no /Filter entry: build the image from the stream bytes and encode it as PNG
+        extension = ".png"
+        img = Image.frombytes(mode, size, data)
+        img_byte_arr = io.BytesIO()
+        img.save(img_byte_arr, format="PNG")
+        data = img_byte_arr.getvalue()
+
+    return extension, data
diff --git a/Scripts/pdf-image-extractor.py b/Scripts/pdf-image-extractor.py
@@ -1,57 +1,37 @@
-'''
+"""
 Extract images from PDF without resampling or altering.
 
 Adapted from work by Sylvain Pelissier
 http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python
-'''
+"""
 
 import sys
 import PyPDF2
-from PIL import Image
+from PyPDF2.filters import _xobj_to_image
 
-if (len(sys.argv) != 2):
-    print("\nUsage: python {} input_file\n".format(sys.argv[0]))
-    sys.exit(1)
 
-pdf = sys.argv[1]
+def main(pdf: str):  # write every supported image of one page of *pdf* into the current directory
+    reader = PyPDF2.PdfFileReader(open(pdf, "rb"))  # NOTE(review): the file handle is never closed explicitly
+    page = reader.pages[30]  # hard-coded page index, carried over from the removed getPage(30) call
 
-if __name__ == '__main__':
-    input1 = PyPDF2.PdfFileReader(open(pdf, "rb"))
-    page0 = input1.getPage(30)
-
-    if '/XObject' in page0['/Resources']:
-        xObject = page0['/Resources']['/XObject'].getObject()
+    if "/XObject" in page["/Resources"]:
+        xObject = page["/Resources"]["/XObject"].getObject()
 
         for obj in xObject:
-            if xObject[obj]['/Subtype'] == '/Image':
-                size = (xObject[obj]['/Width'], xObject[obj]['/Height'])
-                data = xObject[obj].getData()
-                if xObject[obj]['/ColorSpace'] == '/DeviceRGB':
-                    mode = "RGB"
-                else:
-                    mode = "P"
-
-                if '/Filter' in xObject[obj]:
-                    if xObject[obj]['/Filter'] == '/FlateDecode':
-                        img = Image.frombytes(mode, size, data)
-                        if "/SMask" in xObject[obj]: # add alpha channel
-                            alpha = Image.frombytes("L", size, xObject[obj]["/SMask"].getData())
-                            img.putalpha(alpha)
-                        img.save(obj[1:] + ".png")
-                    elif xObject[obj]['/Filter'] == '/DCTDecode':
-                        img = open(obj[1:] + ".jpg", "wb")
-                        img.write(data)
-                        img.close()
-                    elif xObject[obj]['/Filter'] == '/JPXDecode':
-                        img = open(obj[1:] + ".jp2", "wb")
-                        img.write(data)
-                        img.close()
-                    elif xObject[obj]['/Filter'] == '/CCITTFaxDecode':
-                        img = open(obj[1:] + ".tiff", "wb")
-                        img.write(data)
-                        img.close()
-                else:
-                    img = Image.frombytes(mode, size, data)
-                    img.save(obj[1:] + ".png")
+            if xObject[obj]["/Subtype"] == "/Image":
+                extension, byte_stream = _xobj_to_image(xObject[obj])
+                if extension is not None:  # None: the filter is not handled by _xobj_to_image, skip the image
+                    filename = obj[1:] + extension  # keep the real type (.jpg/.jp2/.tiff) instead of always .png
+                    with open(filename, "wb") as img:
+                        img.write(byte_stream)
     else:
         print("No image found.")
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:  # exactly one argument is expected: the input PDF path
+        print("\nUsage: python {} input_file\n".format(sys.argv[0]))
+        sys.exit(1)  # non-zero exit status on wrong usage
+
+    pdf = sys.argv[1]
+    main(pdf)
diff --git a/Tests/test_reader.py b/Tests/test_reader.py
@@ -2,6 +2,7 @@
 import os
 import pytest
 import PyPDF2
+from PyPDF2.filters import decodeStreamData, _xobj_to_image
 
 TESTS_ROOT = os.path.abspath(os.path.dirname(__file__))
 PROJECT_ROOT = os.path.dirname(TESTS_ROOT)
@@ -84,61 +85,26 @@ def test_get_outlines(src, outline_elements):
     ],
 )
 def test_get_images(src, nb_images):
-    from PIL import Image
-
-    input1 = PyPDF2.PdfFileReader(open(src, "rb"))
+    reader = PyPDF2.PdfFileReader(open(src, "rb"))
 
     with pytest.raises(TypeError):
-        page0 = input1.pages["0"]
+        page = reader.pages["0"]
 
-    page0 = input1.pages[-1]
-    page0 = input1.pages[0]
+    page = reader.pages[-1]
+    page = reader.pages[0]
 
     images_extracted = []
 
-    if "/XObject" in page0["/Resources"]:
-        xObject = page0["/Resources"]["/XObject"].getObject()
+    if "/XObject" in page["/Resources"]:
+        xObject = page["/Resources"]["/XObject"].getObject()
 
         for obj in xObject:
             if xObject[obj]["/Subtype"] == "/Image":
-                size = (xObject[obj]["/Width"], xObject[obj]["/Height"])
-                data = xObject[obj].getData()
-                if xObject[obj]["/ColorSpace"] == "/DeviceRGB":
-                    mode = "RGB"
-                else:
-                    mode = "P"
-
-                filename = None
-                if "/Filter" in xObject[obj]:
-                    if xObject[obj]["/Filter"] == "/FlateDecode":
-                        img = Image.frombytes(mode, size, data)
-                        if "/SMask" in xObject[obj]:  # add alpha channel
-                            alpha = Image.frombytes(
-                                "L", size, xObject[obj]["/SMask"].getData()
-                            )
-                            img.putalpha(alpha)
-                        filename = obj[1:] + ".png"
-                        img.save(filename)
-                    elif xObject[obj]["/Filter"] == "/DCTDecode":
-                        filename = obj[1:] + ".jpg"
-                        img = open(filename, "wb")
-                        img.write(data)
-                        img.close()
-                    elif xObject[obj]["/Filter"] == "/JPXDecode":
-                        filename = obj[1:] + ".jp2"
-                        img = open(filename, "wb")
-                        img.write(data)
-                        img.close()
-                    elif xObject[obj]["/Filter"] == "/CCITTFaxDecode":
-                        filename = obj[1:] + ".tiff"
-                        img = open(filename, "wb")
-                        img.write(data)
-                        img.close()
-                else:
-                    img = Image.frombytes(mode, size, data)
+                extension, byte_stream = _xobj_to_image(xObject[obj])
+                if extension is not None:
                     filename = obj[1:] + ".png"
-                    img.save(filename)
-                if filename is not None:
+                    with open(filename, "wb") as img:
+                        img.write(byte_stream)
                     images_extracted.append(filename)
     else:
         print("No image found.")
@@ -155,7 +121,7 @@ def test_get_images(src, nb_images):
         (False, False, False),
     ],
 )
-def test_get_images(strict, with_prev_0, should_fail):
+def test_get_images_raw(strict, with_prev_0, should_fail):
     pdf_data = b"%%PDF-1.7\n" \
                b"1 0 obj << /Count 1 /Kids [4 0 R] /Type /Pages >> endobj\n" \
                b"2 0 obj << >> endobj\n" \