NOTE(review): this patch was rebuilt from a copy whose line breaks and
indentation had been collapsed. Context-line indentation is reconstructed
from Python syntax; check it against the target tree before applying. The
ascii85.py index hash is the original one and predates the ascii85decode fix.

diff --git a/pdfminer/ascii85.py b/pdfminer/ascii85.py
new file mode 100644
index 00000000..ee36222f
--- /dev/null
+++ b/pdfminer/ascii85.py
@@ -0,0 +1,52 @@
+"""Python implementation of ASCII85/ASCIIHex decoder (Adobe version)."""
+
+import re
+from base64 import a85decode
+from binascii import unhexlify
+
+
+def ascii85decode(data: bytes) -> bytes:
+    """Decode ASCII85 data, with or without Adobe's ``<~`` / ``~>`` framing.
+
+    In ASCII85 encoding, every four bytes are encoded with five ASCII
+    letters, using 85 different types of characters (as 256**4 < 85**5).
+    When the length of the original bytes is not a multiple of 4, a special
+    rule is used for round up.
+
+    Adobe's variant ends the data with a ``~>`` EOD marker and may start it
+    with ``<~``. The standard library's ``a85decode(..., adobe=True)`` raises
+    ``ValueError`` unless the input ends with exactly ``~>``, so the markers
+    are stripped here and the bare payload is decoded instead.
+
+    Raises ``ValueError`` if the remaining payload is not valid ASCII85.
+    """
+    payload = data.lstrip()
+    if payload.startswith(b"<~"):
+        payload = payload[2:]
+    # "~" is not an ASCII85 digit, so the first one can only open the EOD
+    # marker; anything from there on is not part of the encoded data.
+    eod = payload.find(b"~")
+    if eod != -1:
+        payload = payload[:eod]
+    return a85decode(payload)
+
+
+bws_re = re.compile(rb"\s")
+
+
+def asciihexdecode(data: bytes) -> bytes:
+    """ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
+    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
+    ASCIIHexDecode filter produces one byte of binary data. All white-space
+    characters are ignored. A right angle bracket character (>) indicates
+    EOD. Any other characters will cause an error. If the filter encounters
+    the EOD marker after reading an odd number of hexadecimal digits, it
+    will behave as if a 0 followed the last digit.
+    """
+    data = bws_re.sub(b"", data)
+    idx = data.find(b">")
+    if idx != -1:
+        data = data[:idx]
+        if idx % 2 == 1:
+            data += b"0"
+    return unhexlify(data)
diff --git a/pdfminer/pdftypes.py b/pdfminer/pdftypes.py
index 17228fd3..d333ec49 100644
--- a/pdfminer/pdftypes.py
+++ b/pdfminer/pdftypes.py
@@ -1,9 +1,6 @@
 import io
 import logging
-import re
 import zlib
-from base64 import a85decode
-from binascii import unhexlify
 from typing import (
     TYPE_CHECKING,
     Any,
@@ -19,6 +16,7 @@
 from warnings import warn
 
 from pdfminer import pdfexceptions, settings
+from pdfminer.ascii85 import ascii85decode, asciihexdecode
 from pdfminer.ccitt import ccittfaxdecode
 from pdfminer.lzw import lzwdecode
 from pdfminer.psparser import LIT, PSObject
@@ -244,27 +242,6 @@ def decompress_corrupted(data: bytes) -> bytes:
     return result_str
 
 
-bws_re = re.compile(rb"\s")
-
-
-def asciihexdecode(data: bytes) -> bytes:
-    """ASCIIHexDecode filter: PDFReference v1.4 section 3.3.1
-    For each pair of ASCII hexadecimal digits (0-9 and A-F or a-f), the
-    ASCIIHexDecode filter produces one byte of binary data. All white-space
-    characters are ignored. A right angle bracket character (>) indicates
-    EOD. Any other characters will cause an error. If the filter encounters
-    the EOD marker after reading an odd number of hexadecimal digits, it
-    will behave as if a 0 followed the last digit.
-    """
-    data = bws_re.sub(b"", data)
-    idx = data.find(b">")
-    if idx != -1:
-        data = data[:idx]
-        if idx % 2 == 1:
-            data += b"0"
-    return unhexlify(data)
-
-
 class PDFStream(PDFObject):
     def __init__(
         self,
@@ -366,7 +343,7 @@ def decode(self) -> None:
             elif f in LITERALS_LZW_DECODE:
                 data = lzwdecode(data)
             elif f in LITERALS_ASCII85_DECODE:
-                data = a85decode(data, adobe=True)
+                data = ascii85decode(data)
             elif f in LITERALS_ASCIIHEX_DECODE:
                 data = asciihexdecode(data)
             elif f in LITERALS_RUNLENGTH_DECODE:
diff --git a/tests/test_pdfminer_crypto.py b/tests/test_pdfminer_crypto.py
index 52651f27..a56ece96 100644
--- a/tests/test_pdfminer_crypto.py
+++ b/tests/test_pdfminer_crypto.py
@@ -1,11 +1,10 @@
 """Test of various compression/encoding modules (previously in doctests)"""
 
 import binascii
-from base64 import a85decode
 
 from pdfminer.arcfour import Arcfour
+from pdfminer.ascii85 import ascii85decode, asciihexdecode
 from pdfminer.lzw import lzwdecode
-from pdfminer.pdftypes import asciihexdecode
 from pdfminer.runlength import rldecode
 
 
@@ -19,13 +18,13 @@ def dehex(b):
     return binascii.unhexlify(b)
 
 
-class TestAsciiHex:
+class TestAscii85:
     def test_ascii85decode(self):
         """The sample string is taken from:
         http://en.wikipedia.org/w/index.php?title=Ascii85
         """
-        assert a85decode(b"9jqo^BlbD-BleB1DJ+*+F(f,q") == b"Man is distinguished"
-        assert a85decode(b"E,9)oF*2M7/c~>", adobe=True) == b"pleasure."
+        assert ascii85decode(b"9jqo^BlbD-BleB1DJ+*+F(f,q") == b"Man is distinguished"
+        assert ascii85decode(b"E,9)oF*2M7/c~>") == b"pleasure."
 
     def test_asciihexdecode(self):
         assert asciihexdecode(b"61 62 2e6364 65") == b"ab.cde"