Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions pypdf/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -408,12 +408,17 @@ def decode(
# We should first check, if we have an inner stream from a multi-encoded
# stream with a faulty trailing newline that we can decode properly.
# We will just ignore the last byte and raise a warning ...
if (index == data_length - 1) and (data[index : index+1] == b"\n"):
if (index == data_length - 1) and (data[index : index + 1] == b"\n"):
logger_warning(
"Found trailing newline in stream data, check if output is OK", __name__
)
break
raise PdfStreamError("Early EOD in RunLengthDecode")
# Raising an exception here breaks all image extraction for this file, which might
# not be desirable. For this reason, indicate that the output is most likely wrong,
# as processing stopped after the first EOD marker. See issue #3517.
logger_warning(
"Early EOD in RunLengthDecode, check if output is OK", __name__
)
break
if length < 128:
length += 1
Expand Down
16 changes: 8 additions & 8 deletions pypdf/generic/_data_structures.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,11 @@
)
from ._fit import Fit
from ._image_inline import (
extract_inline_A85,
extract_inline_AHx,
extract_inline_DCT,
extract_inline__ascii85_decode,
extract_inline__ascii_hex_decode,
extract_inline__dct_decode,
extract_inline__run_length_decode,
extract_inline_default,
extract_inline_RL,
)
from ._utils import read_hex_string_from_stream, read_string_from_stream

Expand Down Expand Up @@ -1325,13 +1325,13 @@ def _read_inline_image(self, stream: StreamType) -> dict[str, Any]:
if isinstance(filtr, list):
filtr = filtr[0] # used for encoding
if "AHx" in filtr or "ASCIIHexDecode" in filtr:
data = extract_inline_AHx(stream)
data = extract_inline__ascii_hex_decode(stream)
elif "A85" in filtr or "ASCII85Decode" in filtr:
data = extract_inline_A85(stream)
data = extract_inline__ascii85_decode(stream)
elif "RL" in filtr or "RunLengthDecode" in filtr:
data = extract_inline_RL(stream)
data = extract_inline__run_length_decode(stream)
elif "DCT" in filtr or "DCTDecode" in filtr:
data = extract_inline_DCT(stream)
data = extract_inline__dct_decode(stream)
elif filtr == "not set":
cs = settings.get("/CS", "")
if isinstance(cs, list):
Expand Down
56 changes: 34 additions & 22 deletions pypdf/generic/_image_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
WHITESPACES,
WHITESPACES_AS_BYTES,
StreamType,
logger_warning,
read_non_whitespace,
)
from ..errors import PdfReadError
Expand All @@ -44,7 +45,14 @@
BUFFER_SIZE = 8192


def extract_inline_AHx(stream: StreamType) -> bytes:
def _check_end_image_marker(stream: StreamType) -> bool:
    """
    Check whether the next non-whitespace token in the stream is ``EI``.

    Leading whitespace is consumed through ``read_non_whitespace``. Afterwards
    the stream is rewound by exactly the bytes collected for the check, so a
    caller continues reading at the start of that token.

    Args:
        stream: Stream to inspect; it has to support ``read`` and relative
            ``seek``.

    Returns:
        True if the token is ``EI`` followed by either the end of the data
        or a whitespace character, False otherwise.
    """
    # NOTE(review): presumably ``read_non_whitespace`` returns the first
    # non-whitespace byte (or ``b""`` at the end of the data) - confirm.
    ei_tok = read_non_whitespace(stream)
    ei_tok += stream.read(2)
    # Rewind by what has actually been read. Near the end of the stream fewer
    # than three bytes are available, and a fixed offset of -3 would step back
    # into the data preceding the token.
    stream.seek(-len(ei_tok), 1)
    return ei_tok[:2] == b"EI" and (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES)


def extract_inline__ascii_hex_decode(stream: StreamType) -> bytes:
"""
Extract HexEncoded stream from inline image.
The stream will be moved onto the EI.
Expand Down Expand Up @@ -77,15 +85,12 @@ def extract_inline_AHx(stream: StreamType) -> bytes:
data_out += data_buffered[:-2]
stream.seek(-2, 1)

ei_tok = read_non_whitespace(stream)
ei_tok += stream.read(2)
stream.seek(-3, 1)
if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
if not _check_end_image_marker(stream):
raise PdfReadError("EI stream not found")
return data_out


def extract_inline_A85(stream: StreamType) -> bytes:
def extract_inline__ascii85_decode(stream: StreamType) -> bytes:
"""
Extract A85 stream from inline image.
The stream will be moved onto the EI.
Expand All @@ -109,15 +114,12 @@ def extract_inline_A85(stream: StreamType) -> bytes:
] # back up by one char in case we are in the middle of ~>
stream.seek(-2, 1)

ei_tok = read_non_whitespace(stream)
ei_tok += stream.read(2)
stream.seek(-3, 1)
if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
if not _check_end_image_marker(stream):
raise PdfReadError("EI stream not found")
return data_out


def extract_inline_RL(stream: StreamType) -> bytes:
def extract_inline__run_length_decode(stream: StreamType) -> bytes:
"""
Extract RL (RunLengthDecode) stream from inline image.
The stream will be moved onto the EI.
Expand All @@ -130,20 +132,33 @@ def extract_inline_RL(stream: StreamType) -> bytes:
raise PdfReadError("Unexpected end of stream")
pos_tok = data_buffered.find(b"\x80")
if pos_tok >= 0: # found
data_out += data_buffered[: pos_tok + 1]
stream.seek(-len(data_buffered) + pos_tok + 1, 1)
# Ideally, we could just use plain run-length decoding here, where 80_16 = 128_10
# marks the EOD. But there apparently are cases like in issue #3517, where we have
# an inline image with up to 51 EOD markers. In these cases, be resilient here and
# use the default `EI` marker detection instead. Please note that this fallback
# still omits special `EI` handling within the stream, but for now assume that having
# both of these cases occur at the same time is very unlikely (and the image stream
# is broken anyway).
# For now, do not skip over more than one whitespace character.
after_token = data_buffered[pos_tok + 1 : pos_tok + 4]
if after_token.startswith(b"EI") or after_token.endswith(b"EI"):
data_out += data_buffered[: pos_tok + 1]
stream.seek(-len(data_buffered) + pos_tok + 1, 1)
else:
logger_warning("Early EOD in RunLengthDecode of inline image, using fallback.", __name__)
ei_marker = data_buffered.find(b"EI")
if ei_marker > 0:
data_out += data_buffered[: ei_marker]
stream.seek(-len(data_buffered) + ei_marker - 1, 1)
break
data_out += data_buffered

ei_tok = read_non_whitespace(stream)
ei_tok += stream.read(2)
stream.seek(-3, 1)
if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
if not _check_end_image_marker(stream):
raise PdfReadError("EI stream not found")
return data_out


def extract_inline_DCT(stream: StreamType) -> bytes:
def extract_inline__dct_decode(stream: StreamType) -> bytes:
"""
Extract DCT (JPEG) stream from inline image.
The stream will be moved onto the EI.
Expand Down Expand Up @@ -185,10 +200,7 @@ def read(length: int) -> bytes:
sz = c[0] * 256 + c[1]
data_out += read(sz - 2)

ei_tok = read_non_whitespace(stream)
ei_tok += stream.read(2)
stream.seek(-3, 1)
if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
if not _check_end_image_marker(stream):
raise PdfReadError("EI stream not found")
return data_out

Expand Down
11 changes: 11 additions & 0 deletions tests/generic/test_image_inline.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,3 +75,14 @@ def test_extract_inline_dct__early_end_of_file():

with pytest.raises(expected_exception=PdfReadError, match=r"^Unexpected end of stream$"):
page.images[0].image.load()


@pytest.mark.enable_socket
def test_extract_inline_dct__multiple_eod():
    """Load every image of the downloaded ``issue3517.pdf`` sample; no load may raise."""
    pdf_bytes = get_data_from_url(
        "https://github.com/user-attachments/files/23900687/cedolini_esempio-1.pdf",
        name="issue3517.pdf",
    )
    reader = PdfReader(BytesIO(pdf_bytes))

    for page in reader.pages:
        for image in page.images:
            # Only the absence of an exception matters, the result is discarded.
            image.image.load()
11 changes: 7 additions & 4 deletions tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -842,19 +842,22 @@ def test_rle_decode_with_faulty_tail_byte_in_multi_encoded_stream(caplog):


@pytest.mark.enable_socket
def test_rle_decode_exception_with_corrupted_stream():
def test_rle_decode_exception_with_corrupted_stream(caplog):
"""
Additional Test to #3355

This test must raise the EOD exception during RLE decoding and ensures
This test must report the EOD warning during RLE decoding and ensures
that we do not fail during code coverage analyses in the git PR pipeline.
"""
data = get_data_from_url(
url="https://github.com/user-attachments/files/21052626/rle_stream_with_error.txt",
name="rle_stream_with_error.txt"
)
with pytest.raises(PdfStreamError, match="Early EOD in RunLengthDecode"):
RunLengthDecode.decode(data)
decoded = RunLengthDecode.decode(data)
assert decoded.startswith(b"\x01\x01\x01\x01\x01\x01\x01\x02\x02\x02\x02\x02\x02\x02\x03\x03")
assert decoded.endswith(b"\x87\x83\x83\x83\x83\x83\x83\x83]]]]]]]RRRRRRRX\xa5")
assert len(decoded) == 1048576
assert caplog.messages == ["Early EOD in RunLengthDecode, check if output is OK"]


def test_decompress():
Expand Down
34 changes: 17 additions & 17 deletions tests/test_generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,10 +41,10 @@
read_string_from_stream,
)
from pypdf.generic._image_inline import (
extract_inline_A85,
extract_inline_AHx,
extract_inline_DCT,
extract_inline_RL,
extract_inline__ascii85_decode,
extract_inline__ascii_hex_decode,
extract_inline__dct_decode,
extract_inline__run_length_decode,
)

from . import ReaderDummy, get_data_from_url
Expand Down Expand Up @@ -1158,36 +1158,36 @@ def test_array_operators():

def test_unitary_extract_inline_buffer_invalid():
with pytest.raises(PdfReadError):
extract_inline_AHx(BytesIO())
extract_inline__ascii_hex_decode(BytesIO())
with pytest.raises(PdfReadError):
extract_inline_AHx(BytesIO(4095 * b"00" + b" "))
extract_inline__ascii_hex_decode(BytesIO(4095 * b"00" + b" "))
with pytest.raises(PdfReadError):
extract_inline_AHx(BytesIO(b"00"))
extract_inline__ascii_hex_decode(BytesIO(b"00"))
with pytest.raises(PdfReadError):
extract_inline_A85(BytesIO())
extract_inline__ascii85_decode(BytesIO())
with pytest.raises(PdfReadError):
extract_inline_A85(BytesIO(a85encode(b"1")))
extract_inline__ascii85_decode(BytesIO(a85encode(b"1")))
with pytest.raises(PdfReadError):
extract_inline_A85(BytesIO(a85encode(b"1") + b"~> Q"))
extract_inline__ascii85_decode(BytesIO(a85encode(b"1") + b"~> Q"))
with pytest.raises(PdfReadError):
extract_inline_A85(BytesIO(a85encode(b"1234578" * 990)))
extract_inline__ascii85_decode(BytesIO(a85encode(b"1234578" * 990)))
with pytest.raises(PdfReadError):
extract_inline_RL(BytesIO())
extract_inline__run_length_decode(BytesIO())
with pytest.raises(PdfReadError):
extract_inline_RL(BytesIO(b"\x01\x01\x80"))
extract_inline__run_length_decode(BytesIO(b"\x01\x01\x80"))
with pytest.raises(PdfReadError):
extract_inline_DCT(BytesIO(b"\xFF\xD9"))
extract_inline__dct_decode(BytesIO(b"\xFF\xD9"))


def test_unitary_extract_inline():
# AHx
b = 16000 * b"00"
assert len(extract_inline_AHx(BytesIO(b + b" EI"))) == len(b)
assert len(extract_inline__ascii_hex_decode(BytesIO(b + b" EI"))) == len(b)
with pytest.raises(PdfReadError):
extract_inline_AHx(BytesIO(b + b"> "))
extract_inline__ascii_hex_decode(BytesIO(b + b"> "))
# RL
b = 8200 * b"\x00\xAB" + b"\x80"
assert len(extract_inline_RL(BytesIO(b + b" EI"))) == len(b)
assert len(extract_inline__run_length_decode(BytesIO(b + b" EI"))) == len(b)

# default
# EIDD instead of EI; using A85
Expand Down