From bfcc5a98d56df5ab4b06e4cd76a8e1e720b3b8f8 Mon Sep 17 00:00:00 2001 From: pubpub-zz <4083478+pubpub-zz@users.noreply.github.com> Date: Wed, 18 Sep 2024 19:54:29 +0200 Subject: [PATCH] ROB: tolerate comments in arrays closes #2843 --- pypdf/_utils.py | 2 ++ pypdf/generic/_data_structures.py | 11 +++++------ tests/test_reader.py | 16 ++++++++++++++++ 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/pypdf/_utils.py b/pypdf/_utils.py index e0034ccc4..8cd85b609 100644 --- a/pypdf/_utils.py +++ b/pypdf/_utils.py @@ -206,6 +206,8 @@ def skip_over_comment(stream: StreamType) -> None: if tok == b"%": while tok not in (b"\n", b"\r"): tok = stream.read(1) + if tok == b"": + raise PdfStreamError("File ended unexpectedly.") def read_until_regex(stream: StreamType, regex: Pattern[bytes]) -> bytes: diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index cc4b4a032..17f5fae27 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -232,6 +232,10 @@ def read_from_stream( tok = stream.read(1) while tok.isspace(): tok = stream.read(1) + if tok == b"%": + stream.seek(-1, 1) + skip_over_comment(stream) + continue stream.seek(-1, 1) # check for array ending peek_ahead = stream.read(1) @@ -1341,12 +1345,7 @@ def read_object( return NullObject.read_from_stream(stream) elif tok == b"%": # comment - while tok not in (b"\r", b"\n"): - tok = stream.read(1) - # Prevents an infinite loop by raising an error if the stream is at - # the EOF - if len(tok) <= 0: - raise PdfStreamError("File ended unexpectedly.") + skip_over_comment(stream) tok = read_non_whitespace(stream) stream.seek(-1, 1) return read_object(stream, pdf, forced_encoding) diff --git a/tests/test_reader.py b/tests/test_reader.py index 284edf769..8d6bc2d6b 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -17,6 +17,7 @@ EmptyFileError, FileNotDecryptedError, PdfReadError, + PdfStreamError, WrongPasswordError, ) from pypdf.generic import ( @@ -1641,3 +1642,18 @@ def test_truncated_files(caplog): reader = PdfReader(BytesIO(b[:-6])) assert "CAUTION: startxref found while searching for %%EOF" in caplog.text assert reader._startxref < 100993 + + +@pytest.mark.enable_socket() +def test_comments_in_array(caplog): + """Cf #2843: this deals with comments""" + url = "https://github.com/user-attachments/files/16992416/crash-2347912aa2a6f0fab5df4ebc8a424735d5d0d128.pdf" + name = "iss2843.pdf" # reused + b = get_data_from_url(url, name=name) + reader = PdfReader(BytesIO(b)) + reader.pages[0] + assert caplog.text == "" + reader = PdfReader(BytesIO(b)) + reader.stream = BytesIO(b[:1149]) + with pytest.raises(PdfStreamError): + reader.pages[0]