Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ROB : fix image extraction #1327

Merged
merged 2 commits into from
Sep 6, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions PyPDF2/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -759,17 +759,17 @@ def _read_inline_image(self, stream: StreamType) -> Dict[str, Any]:
tok = stream.read(1)
# Check for End Image
tok2 = stream.read(1)
if tok2 == b"I":
# Data can contain EI, so check for the Q operator.
if tok2 == b"I" and buf[loc - 1 : loc] in WHITESPACES:
# Data can contain [\s]EI, so check for the whitespace separator \s before EI; together with the whitespace after EI, these 4 chars suffice and the Q operator is not required.
tok3 = stream.read(1)
info = tok + tok2
# We need to find whitespace between EI and Q.
# We need to find at least one whitespace character after EI.
has_q_whitespace = False
while tok3 in WHITESPACES:
has_q_whitespace = True
info += tok3
tok3 = stream.read(1)
if tok3 == b"Q" and has_q_whitespace:
if has_q_whitespace:
stream.seek(-1, 1)
break
else:
Expand Down
15 changes: 6 additions & 9 deletions tests/test_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from PyPDF2.constants import ImageAttributes as IA
from PyPDF2.constants import PageAttributes as PG
from PyPDF2.constants import Ressources as RES
from PyPDF2.errors import PdfReadError, PdfReadWarning
from PyPDF2.errors import PdfReadWarning
from PyPDF2.filters import _xobj_to_image

from . import get_pdf_from_url, normalize_warnings
Expand Down Expand Up @@ -425,7 +425,7 @@ def test_get_metadata(url, name):
"https://corpora.tika.apache.org/base/docs/govdocs1/938/938702.pdf",
"tika-938702.pdf",
False,
(PdfReadError, "Unexpected end of stream"),
None, # iss #1090 is now fixed
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/942/942358.pdf",
Expand Down Expand Up @@ -512,19 +512,16 @@ def test_extract_text(url, name, strict, exception):
),
(
"https://corpora.tika.apache.org/base/docs/govdocs1/957/957304.pdf",
"tika-938702.pdf",
"tika-957304.pdf",
),
],
)
def test_compress_raised(url, name):
data = BytesIO(get_pdf_from_url(url, name=name))
reader = PdfReader(data)
# TODO: which page exactly?
# TODO: Is it reasonable to have an exception here?
with pytest.raises(PdfReadError) as exc:
for page in reader.pages:
page.compress_content_streams()
assert exc.value.args[0] == "Unexpected end of stream"
# No error is raised anymore since issue #1090 was fixed.
for page in reader.pages:
page.compress_content_streams()


@pytest.mark.parametrize(
Expand Down