Skip to content

Commit

Permalink
ROB: Repair PDF with invalid Root object (py-pdf#2880)
Browse files Browse the repository at this point in the history
Closes py-pdf#2875.
  • Loading branch information
pubpub-zz authored Sep 28, 2024
1 parent c8220c6 commit 79345ed
Show file tree
Hide file tree
Showing 4 changed files with 111 additions and 18 deletions.
5 changes: 3 additions & 2 deletions pypdf/_doc_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1148,8 +1148,9 @@ def _flatten(
# Fix issue 327: set flattened_pages attribute only for
# decrypted file
catalog = self.root_object
pages = catalog["/Pages"].get_object() # type: ignore
assert isinstance(pages, DictionaryObject)
pages = catalog.get("/Pages").get_object() # type: ignore
if not isinstance(pages, DictionaryObject):
raise PdfReadError("Invalid object in /Pages")
self.flattened_pages = []

if PA.TYPE in pages:
Expand Down
39 changes: 33 additions & 6 deletions pypdf/_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,8 @@ def __init__(
# map page indirect_reference number to page number
self._page_id2num: Optional[Dict[Any, Any]] = None

self._validated_root: Optional[DictionaryObject] = None

self._initialize_stream(stream)

self._override_encryption = False
Expand Down Expand Up @@ -197,10 +199,35 @@ def close(self) -> None:
@property
def root_object(self) -> DictionaryObject:
    """
    Provide access to "/Root". Standardized with PdfWriter.

    The catalog is validated on first access and cached in
    ``self._validated_root``. When the trailer has no usable "/Root"
    entry, the objects numbered 1 to "/Size" are scanned for a dictionary
    whose "/Type" is "/Catalog", and the first match is used.

    Returns:
        The document catalog dictionary.

    Raises:
        PdfReadError: If no catalog object can be found in the file.
    """
    if self._validated_root is not None:
        return self._validated_root

    root = self.trailer.get(TK.ROOT)
    if is_null_or_none(root):
        logger_warning('Cannot find "/Root" key in trailer', __name__)
    else:
        # Resolve the reference once. A damaged file may point "/Root" at
        # something that is not a dictionary at all, so check the type
        # before calling .get() instead of letting AttributeError escape.
        candidate = cast(PdfObject, root).get_object()
        if (
            isinstance(candidate, DictionaryObject)
            and candidate.get("/Type") == "/Catalog"
        ):
            self._validated_root = candidate
        else:
            logger_warning("Invalid Root object in trailer", __name__)

    if self._validated_root is None:
        logger_warning('Searching object with "/Catalog" key', __name__)
        nb = cast(int, self.trailer.get("/Size", 0))
        for number in range(1, nb + 1):
            try:
                o = self.get_object(number)
            except Exception:  # to be sure to capture all errors
                # Best-effort scan: an unreadable object must not abort
                # the search for the catalog.
                continue
            if isinstance(o, DictionaryObject) and o.get("/Type") == "/Catalog":
                self._validated_root = o
                logger_warning(f"Root found at {o.indirect_reference!r}", __name__)
                break

    if self._validated_root is None:
        raise PdfReadError("Cannot find Root object in pdf")
    return self._validated_root

@property
def _info(self) -> Optional[DictionaryObject]:
Expand All @@ -215,11 +242,11 @@ def _info(self) -> Optional[DictionaryObject]:
return None
else:
info = info.get_object()
if info == None: # noqa: E711
if not isinstance(info, DictionaryObject):
raise PdfReadError(
"Trailer not found or does not point to document information directory"
)
return cast(DictionaryObject, info)
return info

@property
def _ID(self) -> Optional[ArrayObject]:
Expand Down
3 changes: 2 additions & 1 deletion pypdf/generic/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -879,5 +879,6 @@ def is_null_or_none(x: Any) -> TypeGuard[Union[None, NullObject, IndirectObject]
True if x is None or NullObject.
"""
return x is None or (
isinstance(x, PdfObject) and isinstance(x.get_object(), NullObject)
isinstance(x, PdfObject)
and (x.get_object() is None or isinstance(x.get_object(), NullObject))
)
82 changes: 73 additions & 9 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -137,14 +137,14 @@ def test_iss1943():
def test_broken_meta_data(pdf_path):
    """
    Check how ``PdfReader.metadata`` reacts to unusable "/Info" entries.

    ``pdf_path`` is supplied by a parametrization that is not visible in
    this hunk; for those files the metadata is expected to be ``None``.
    """
    with open(pdf_path, "rb") as f:
        reader = PdfReader(f)
        # Accessed while the file is still open.
        assert reader.metadata is None

    with open(RESOURCE_ROOT / "crazyones.pdf", "rb") as f:
        b = f.read(-1)
    # Turn the "/Info" indirect reference into a bare number so that it no
    # longer resolves to a dictionary.
    reader = PdfReader(BytesIO(b.replace(b"/Info 2 0 R", b"/Info 2 ")))
    with pytest.raises(PdfReadError) as exc:
        reader.metadata
    # Check the exception message itself rather than the repr of the
    # ExceptionInfo wrapper.
    assert "does not point to document information directory" in str(exc.value)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -621,7 +621,7 @@ def test_read_unknown_zero_pages(caplog):
assert normalize_warnings(caplog.text) == warnings
with pytest.raises(PdfReadError) as exc:
len(reader.pages)
assert exc.value.args[0] == 'Cannot find "/Root" key in trailer'
assert exc.value.args[0] == "Invalid object in /Pages"


def test_read_encrypted_without_decryption():
Expand Down Expand Up @@ -1712,3 +1712,67 @@ def test_unbalanced_brackets_in_dictionary_object(caplog):
name = "iss2877.pdf" # reused
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
assert len(reader.pages) == 43 # note: /Count = 46 but 3 kids are None


@pytest.mark.enable_socket()
def test_repair_root(caplog):
    """Repair of a missing or invalid trailer "/Root" entry (cf. #2875)."""
    b = get_data_from_url(
        "https://github.com/user-attachments/files/17162216/crash-6620e8b1abfe3da639b654595da859b87f985748.pdf",
        name="iss2875.pdf",
    )

    def missing_from_log(*fragments):
        # Fragments that do not occur in the captured log text.
        return [text for text in fragments if text not in caplog.text]

    # /Root present in the trailer but not pointing to a catalog
    reader = PdfReader(BytesIO(b))
    assert len(reader.pages) == 1
    assert not missing_from_log(
        "Invalid Root object",
        'Searching object with "/Catalog" key',
        "Root found at IndirectObject(2, 0,",
    )

    # no /Root Entry
    without_root = b.replace(b"/Root", b"/Roo ")
    reader = PdfReader(BytesIO(without_root))
    caplog.clear()
    assert len(reader.pages) == 1
    assert not missing_from_log(
        'Cannot find "/Root" key in trailer',
        'Searching object with "/Catalog" key',
        "Root found at IndirectObject(2, 0,",
    )

    # Shared by the last two cases: /Root redirected and no /Catalog left
    no_catalog = b.replace(b"/Root 1 0 R", b"/Root 2 0 R").replace(
        b"/Catalog", b"/Catalo "
    )
    search_failed = (
        "Invalid Root object in trailer",
        'Searching object with "/Catalog" key',
    )

    # Invalid /Root Entry
    caplog.clear()
    reader = PdfReader(BytesIO(no_catalog))
    with pytest.raises(PdfReadError):
        len(reader.pages)
    assert not missing_from_log(*search_failed)

    # Invalid /Root Entry + error in get_object
    caplog.clear()
    corrupted = no_catalog[:5124] + b"A" + no_catalog[5125:]
    reader = PdfReader(BytesIO(corrupted))
    with pytest.raises(PdfReadError):
        len(reader.pages)
    assert not missing_from_log(*search_failed)

0 comments on commit 79345ed

Please sign in to comment.