-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
BUG: Two tests with the same name existed. MAINT: Move _xobj_to_image from Scripts (image extractor) into PyPDF2. This reduces the overall code size and could be useful for others. I am making it private for the moment, as I'm uncertain whether it should be in PyPDF2.
- Loading branch information
1 parent
d123c63
commit 984841c
Showing
3 changed files
with
80 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,57 +1,37 @@ | ||
''' | ||
""" | ||
Extract images from PDF without resampling or altering. | ||
Adapted from work by Sylvain Pelissier | ||
http://stackoverflow.com/questions/2693820/extract-images-from-pdf-without-resampling-in-python | ||
''' | ||
""" | ||
|
||
import sys | ||
import PyPDF2 | ||
from PIL import Image | ||
from PyPDF2.filters import _xobj_to_image | ||
|
||
if (len(sys.argv) != 2): | ||
print("\nUsage: python {} input_file\n".format(sys.argv[0])) | ||
sys.exit(1) | ||
|
||
pdf = sys.argv[1] | ||
def main(pdf: str): | ||
reader = PyPDF2.PdfFileReader(open(pdf, "rb")) | ||
page = reader.pages[30] | ||
|
||
if __name__ == '__main__': | ||
input1 = PyPDF2.PdfFileReader(open(pdf, "rb")) | ||
page0 = input1.getPage(30) | ||
|
||
if '/XObject' in page0['/Resources']: | ||
xObject = page0['/Resources']['/XObject'].getObject() | ||
if "/XObject" in page["/Resources"]: | ||
xObject = page["/Resources"]["/XObject"].getObject() | ||
|
||
for obj in xObject: | ||
if xObject[obj]['/Subtype'] == '/Image': | ||
size = (xObject[obj]['/Width'], xObject[obj]['/Height']) | ||
data = xObject[obj].getData() | ||
if xObject[obj]['/ColorSpace'] == '/DeviceRGB': | ||
mode = "RGB" | ||
else: | ||
mode = "P" | ||
|
||
if '/Filter' in xObject[obj]: | ||
if xObject[obj]['/Filter'] == '/FlateDecode': | ||
img = Image.frombytes(mode, size, data) | ||
if "/SMask" in xObject[obj]: # add alpha channel | ||
alpha = Image.frombytes("L", size, xObject[obj]["/SMask"].getData()) | ||
img.putalpha(alpha) | ||
img.save(obj[1:] + ".png") | ||
elif xObject[obj]['/Filter'] == '/DCTDecode': | ||
img = open(obj[1:] + ".jpg", "wb") | ||
img.write(data) | ||
img.close() | ||
elif xObject[obj]['/Filter'] == '/JPXDecode': | ||
img = open(obj[1:] + ".jp2", "wb") | ||
img.write(data) | ||
img.close() | ||
elif xObject[obj]['/Filter'] == '/CCITTFaxDecode': | ||
img = open(obj[1:] + ".tiff", "wb") | ||
img.write(data) | ||
img.close() | ||
else: | ||
img = Image.frombytes(mode, size, data) | ||
img.save(obj[1:] + ".png") | ||
if xObject[obj]["/Subtype"] == "/Image": | ||
extension, byte_stream = _xobj_to_image(xObject[obj]) | ||
if extension is not None: | ||
filename = obj[1:] + ".png" | ||
with open(filename, "wb") as img: | ||
img.write(byte_stream) | ||
else: | ||
print("No image found.") | ||
|
||
|
||
if __name__ == "__main__": | ||
if len(sys.argv) != 2: | ||
print("\nUsage: python {} input_file\n".format(sys.argv[0])) | ||
sys.exit(1) | ||
|
||
pdf = sys.argv[1] | ||
main(pdf) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters