Skip to content

Commit

Permalink
BUG: remove images in sub form
Browse files Browse the repository at this point in the history
  • Loading branch information
pubpub-zz committed Aug 30, 2023
1 parent fe2dfaf commit e630d5b
Show file tree
Hide file tree
Showing 2 changed files with 82 additions and 37 deletions.
94 changes: 58 additions & 36 deletions pypdf/_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2173,11 +2173,8 @@ def remove_objects_from_page(
else: # del text
jump_operators = [b"Tj", b"TJ", b"'", b'"']

images = []
forms = []

def clean(content: ContentStream) -> None:
nonlocal images, forms, to_delete
def clean(content: ContentStream, images: List[str], forms: List[str]) -> None:
nonlocal to_delete
i = 0
while i < len(content.operations):
operands, operator = content.operations[i]
Expand All @@ -2196,42 +2193,67 @@ def clean(content: ContentStream) -> None:
i += 1
content.get_data() # this ensures ._data is rebuilt from the .operations

try:
d = cast(dict, cast(DictionaryObject, page["/Resources"])["/XObject"])
except KeyError:
d = {}
for k, v in d.items():
o = v.get_object()
def clean_forms(
elt: DictionaryObject, stack: List[DictionaryObject]
) -> Tuple[List[str], List[str]]:
nonlocal to_delete
if elt in stack:
# to prevent infinite looping
return [], [] # pragma: no cover
try:
content: Any = None
if to_delete & ObjectDeletionFlag.IMAGES and o["/Subtype"] == "/Image":
content = NullObject()
images.append(k)
if o["/Subtype"] == "/Form":
forms.append(k)
if isinstance(o, ContentStream):
content = o
else:
content = ContentStream(o, self)
content.update(o.items())
for k1 in ["/Length", "/Filter", "/DecodeParms"]:
try:
del content[k1]
except KeyError:
pass
clean(content)
if content is not None:
if isinstance(v, IndirectObject):
self._objects[v.idnum - 1] = content
else:
d[k] = self._add_object(content)
except (TypeError, KeyError):
pass
d = cast(dict, cast(DictionaryObject, elt["/Resources"])["/XObject"])
except KeyError:
d = {}
images = []
forms = []
for k, v in d.items():
o = v.get_object()
try:
content: Any = None
if (
to_delete & ObjectDeletionFlag.IMAGES
and o["/Subtype"] == "/Image"
):
content = NullObject()
images.append(k)
if o["/Subtype"] == "/Form":
forms.append(k)
if isinstance(o, ContentStream):
content = o
else:
content = ContentStream(o, self)
content.update(o.items())
for k1 in ["/Length", "/Filter", "/DecodeParms"]:
try:
del content[k1]
except KeyError:
pass
clean_forms(content, stack + [elt]) # clean sub forms
if content is not None:
if isinstance(v, IndirectObject):
self._objects[v.idnum - 1] = content
else:
# should only occur with PDFs that violate the PDF specification,
# which requires every stream to be an indirect object.
d[k] = self._add_object(content) # pragma: no cover
except (TypeError, KeyError):
pass
if isinstance(elt, StreamObject): # for /Form
if not isinstance(elt, ContentStream):
e = ContentStream(elt, self)
e.update(elt.items())
elt = e
clean(elt, images, forms) # clean the content
return images, forms

if "/Contents" in page:
content = page["/Contents"].get_object()

if not isinstance(content, ContentStream):
content = ContentStream(content, page)
clean(cast(ContentStream, content))
images, forms = clean_forms(page, [])

clean(cast(ContentStream, content), images, forms)
if isinstance(page["/Contents"], ArrayObject):
for o in cast(ArrayObject, page["/Contents"]):
self._objects[o.idnum - 1] = NullObject()
Expand Down
25 changes: 24 additions & 1 deletion tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,6 +327,27 @@ def test_remove_images(pdf_file_path, input_path):
assert "Lorem ipsum dolor sit amet" in extracted_text


@pytest.mark.enable_socket()
def test_remove_images_sub_level():
    """After remove_images(), XObjects nested below a sub form must all be NullObject (cf #2035)."""
    url = "https://github.com/py-pdf/pypdf/files/12394781/2210.03142-1.pdf"
    name = "iss2103.pdf"
    pdf_bytes = get_data_from_url(url, name=name)
    writer = PdfWriter(clone_from=BytesIO(pdf_bytes))
    writer.remove_images()

    # Walk down: page -> form /Fm1 -> /Im1 -> its own XObject dictionary.
    form_fm1 = writer.pages[0]["/Resources"]["/XObject"]["/Fm1"]
    nested_im1 = form_fm1["/Resources"]["/XObject"]["/Im1"]
    innermost_xobjects = nested_im1["/Resources"]["/XObject"]

    # Resolve every reference, then keep whatever was not replaced by NullObject.
    resolved = [ref.get_object() for ref in innermost_xobjects.values()]
    survivors = [obj for obj in resolved if not isinstance(obj, NullObject)]
    assert len(survivors) == 0


@pytest.mark.parametrize(
"input_path",
[
Expand Down Expand Up @@ -1238,7 +1259,9 @@ def test_iss1601():
url = "https://github.com/py-pdf/pypdf/files/10579503/badges-38.pdf"
name = "badge-38.pdf"
reader = PdfReader(BytesIO(get_data_from_url(url, name=name)))
original_cs_operations = ContentStream(reader.pages[0].get_contents(), reader).operations
original_cs_operations = ContentStream(
reader.pages[0].get_contents(), reader
).operations
writer = PdfWriter()
page_1 = writer.add_blank_page(
reader.pages[0].mediabox[2], reader.pages[0].mediabox[3]
Expand Down

0 comments on commit e630d5b

Please sign in to comment.