Skip to content

Commit

Permalink
Prevent endless looping on bogus stream length and other EOFs (#21)
Browse files Browse the repository at this point in the history
* fix: prevent endless looping on bogus stream length and other EOFs

* fix: add xfails to benchmark (FIXME: centralize them...)
  • Loading branch information
dhdaines authored Nov 27, 2024
1 parent 2772928 commit aca421a
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 5 deletions.
15 changes: 15 additions & 0 deletions benchmarks/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,23 @@
"aes-256-m.pdf": ["foo"],
"aes-256-r6.pdf": ["usersecret", "ownersecret"],
}
# Sample file names that the benchmark_one_* functions in this file skip
# (each returns early when path.name is in this set).
# NOTE(review): the name suggests these files trip bugs in pdfminer — confirm.
PDFMINER_BUGS = {
"issue-449-vertical.pdf",
"issue_495_pdfobjref.pdf",
"issue-1008-inline-ascii85.pdf",
"rotated.pdf",
}
# Sample file names that are likewise skipped by the benchmark_one_*
# functions.  FIXME: the same set is repeated in the test modules;
# centralize it.
XFAILS = {
"bogus-stream-length.pdf",
}


def benchmark_one_pdf(path: Path):
"""Open one of the documents"""
import playa

if path.name in PDFMINER_BUGS or path.name in XFAILS:
return
passwords = PASSWORDS.get(path.name, [""])
for password in passwords:
LOG.info("Reading %s", path)
Expand All @@ -40,6 +51,8 @@ def benchmark_one_lazy(path: Path):
"""Open one of the documents"""
import playa

if path.name in PDFMINER_BUGS or path.name in XFAILS:
return
passwords = PASSWORDS.get(path.name, [""])
for password in passwords:
LOG.info("Reading %s", path)
Expand All @@ -59,6 +72,8 @@ def benchmark_one_pdfminer(path: Path):
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser

if path.name in PDFMINER_BUGS or path.name in XFAILS:
return
passwords = PASSWORDS.get(path.name, [""])
for password in passwords:
with open(path, "rb") as infh:
Expand Down
4 changes: 4 additions & 0 deletions playa/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,8 @@ def __init__(self, parser: ObjectParser) -> None:
def _load(self, parser: ObjectParser) -> None:
while True:
pos, line = parser.nextline()
if line == b"":
break
line = line.strip()
if not line:
continue
Expand All @@ -149,6 +151,8 @@ def _load(self, parser: ObjectParser) -> None:
raise ValueError(error_msg)
for objid in range(start, start + nobjs):
_, line = parser.nextline()
if line == b"":
break
line = line.strip()
f = line.split(b" ")
if len(f) != 3:
Expand Down
12 changes: 8 additions & 4 deletions playa/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,10 +623,11 @@ def __next__(self) -> Tuple[int, IndirectObject]:
linepos, line = self._parser.nextline()
log.debug("After stream data: %r %r", linepos, line)
if self.strict:
log.warning(
"Expected a newline between end of stream and 'endstream', got %r",
line,
)
# In reality there usually is no end-of-line
# marker. We will nonetheless warn if there's
# something other than 'endstream'.
if line not in (b"\n", b"\r\n", b"endstream\n", b"endstream\r\n"):
log.warning("Expected newline or 'endstream', got %r", line)
else:
# Reuse that line and read more if necessary
while True:
Expand All @@ -640,6 +641,9 @@ def __next__(self) -> Tuple[int, IndirectObject]:
data += line
linepos, line = self._parser.nextline()
log.debug("After stream data: %r %r", linepos, line)
if line == b"": # Means EOF
log.warning("Incorrect legnth for stream, no 'endstream' found")
break
doc = None if self.doc is None else self.doc()
stream = ContentStream(
dic, bytes(data), None if doc is None else doc.decipher
Expand Down
Binary file added samples/bogus-stream-length.pdf
Binary file not shown.
Binary file added samples/rotated.pdf
Binary file not shown.
5 changes: 5 additions & 0 deletions tests/test_lazy_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
"aes-256-m.pdf": ["foo"],
"aes-256-r6.pdf": ["usersecret", "ownersecret"],
}
# Sample file names for which test_open_lazy calls pytest.xfail
# ("Intentionally corrupt file") instead of opening the document.
XFAILS = {
"bogus-stream-length.pdf",
}


def test_content_objects():
Expand Down Expand Up @@ -66,6 +69,8 @@ def test_content_objects():
@pytest.mark.parametrize("path", ALLPDFS, ids=str)
def test_open_lazy(path: Path) -> None:
"""Open all the documents"""
if path.name in XFAILS:
pytest.xfail("Intentionally corrupt file: %s" % path.name)
passwords = PASSWORDS.get(path.name, [""])
for password in passwords:
beach = []
Expand Down
13 changes: 12 additions & 1 deletion tests/test_open.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
except ImportError:
pdfminer = None # type: ignore
import playa
from playa.exceptions import PDFEncryptionError
from playa.exceptions import PDFEncryptionError, PDFSyntaxError

TESTDIR = Path(__file__).parent.parent / "samples"
ALLPDFS = TESTDIR.glob("**/*.pdf")
Expand All @@ -31,13 +31,19 @@
"issue-449-vertical.pdf",
"issue_495_pdfobjref.pdf",
"issue-1008-inline-ascii85.pdf",
"rotated.pdf",
}
# Sample file names for which test_open calls pytest.xfail
# ("Intentionally corrupt file") before doing any comparison work.
XFAILS = {
"bogus-stream-length.pdf",
}


@pytest.mark.skipif(pdfminer is None, reason="pdfminer.six is not installed")
@pytest.mark.parametrize("path", ALLPDFS, ids=str)
def test_open(path: Path) -> None:
"""Open all the documents and compare with pdfplumber"""
if path.name in XFAILS:
pytest.xfail("Intentionally corrupt file: %s" % path.name)
from pdfminer.converter import PDFPageAggregator
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
Expand Down Expand Up @@ -183,3 +189,8 @@ def test_tiff_predictor() -> None:
image = next(doc.pages[0].images)
# Decoded TIFF: 600 x 600 + a header
assert len(image.stream.buffer) == 360600


def test_bogus_stream_length() -> None:
    """Check that opening the bogus-stream-length sample raises PDFSyntaxError."""
    bogus_pdf = TESTDIR / "bogus-stream-length.pdf"
    with pytest.raises(PDFSyntaxError):
        playa.open(bogus_pdf)

0 comments on commit aca421a

Please sign in to comment.