diff --git a/pypdf/filters.py b/pypdf/filters.py index e8299126a..4d07956bc 100644 --- a/pypdf/filters.py +++ b/pypdf/filters.py @@ -408,12 +408,17 @@ def decode( # We should first check, if we have an inner stream from a multi-encoded # stream with a faulty trailing newline that we can decode properly. # We will just ignore the last byte and raise a warning ... - if (index == data_length - 1) and (data[index : index+1] == b"\n"): + if (index == data_length - 1) and (data[index : index + 1] == b"\n"): logger_warning( "Found trailing newline in stream data, check if output is OK", __name__ ) break - raise PdfStreamError("Early EOD in RunLengthDecode") + # Raising an exception here breaks all image extraction for this file, which might + # not be desirable. For this reason, indicate that the output is most likely wrong, + # as processing stopped after the first EOD marker. See issue #3517. + logger_warning( + "Early EOD in RunLengthDecode, check if output is OK", __name__ + ) break if length < 128: length += 1 diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index d94635bf5..b82665270 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -78,11 +78,11 @@ ) from ._fit import Fit from ._image_inline import ( - extract_inline_A85, - extract_inline_AHx, - extract_inline_DCT, + extract_inline__ascii85_decode, + extract_inline__ascii_hex_decode, + extract_inline__dct_decode, + extract_inline__run_length_decode, extract_inline_default, - extract_inline_RL, ) from ._utils import read_hex_string_from_stream, read_string_from_stream @@ -1325,13 +1325,13 @@ def _read_inline_image(self, stream: StreamType) -> dict[str, Any]: if isinstance(filtr, list): filtr = filtr[0] # used forencoding if "AHx" in filtr or "ASCIIHexDecode" in filtr: - data = extract_inline_AHx(stream) + data = extract_inline__ascii_hex_decode(stream) elif "A85" in filtr or "ASCII85Decode" in filtr: - data = extract_inline_A85(stream) + data = extract_inline__ascii85_decode(stream) elif "RL" in filtr or "RunLengthDecode" in filtr: - data = extract_inline_RL(stream) + data = extract_inline__run_length_decode(stream) elif "DCT" in filtr or "DCTDecode" in filtr: - data = extract_inline_DCT(stream) + data = extract_inline__dct_decode(stream) elif filtr == "not set": cs = settings.get("/CS", "") if isinstance(cs, list): diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py index 255707d7e..d1876936a 100644 --- a/pypdf/generic/_image_inline.py +++ b/pypdf/generic/_image_inline.py @@ -33,6 +33,7 @@ WHITESPACES, WHITESPACES_AS_BYTES, StreamType, + logger_warning, read_non_whitespace, ) from ..errors import PdfReadError @@ -44,7 +45,14 @@ BUFFER_SIZE = 8192 -def extract_inline_AHx(stream: StreamType) -> bytes: +def _check_end_image_marker(stream: StreamType) -> bool: + ei_tok = read_non_whitespace(stream) + ei_tok += stream.read(2) + stream.seek(-3, 1) + return ei_tok[:2] == b"EI" and (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES) + + +def extract_inline__ascii_hex_decode(stream: StreamType) -> bytes: """ Extract HexEncoded stream from inline image. The stream will be moved onto the EI. @@ -77,15 +85,12 @@ def extract_inline_AHx(stream: StreamType) -> bytes: data_out += data_buffered[:-2] stream.seek(-2, 1) - ei_tok = read_non_whitespace(stream) - ei_tok += stream.read(2) - stream.seek(-3, 1) - if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): + if not _check_end_image_marker(stream): raise PdfReadError("EI stream not found") return data_out -def extract_inline_A85(stream: StreamType) -> bytes: +def extract_inline__ascii85_decode(stream: StreamType) -> bytes: """ Extract A85 stream from inline image. The stream will be moved onto the EI. @@ -109,15 +114,12 @@ def extract_inline_A85(stream: StreamType) -> bytes: ] # back by one char in case of in the middle of ~> stream.seek(-2, 1) - ei_tok = read_non_whitespace(stream) - ei_tok += stream.read(2) - stream.seek(-3, 1) - if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): + if not _check_end_image_marker(stream): raise PdfReadError("EI stream not found") return data_out -def extract_inline_RL(stream: StreamType) -> bytes: +def extract_inline__run_length_decode(stream: StreamType) -> bytes: """ Extract RL (RunLengthDecode) stream from inline image. The stream will be moved onto the EI. @@ -130,20 +132,33 @@ def extract_inline_RL(stream: StreamType) -> bytes: raise PdfReadError("Unexpected end of stream") pos_tok = data_buffered.find(b"\x80") if pos_tok >= 0: # found - data_out += data_buffered[: pos_tok + 1] - stream.seek(-len(data_buffered) + pos_tok + 1, 1) + # Ideally, we could just use plain run-length decoding here, where 80_16 = 128_10 + # marks the EOD. But there apparently are cases like in issue #3517, where we have + # an inline image with up to 51 EOD markers. In these cases, be resilient here and + # use the default `EI` marker detection instead. Please note that this fallback + # still omits special `EI` handling within the stream, but for now assume that having + # both of these cases occur at the same time is very unlikely (and the image stream + # is broken anyway). + # For now, do not skip over more than one whitespace character. + after_token = data_buffered[pos_tok + 1 : pos_tok + 4] + if after_token.startswith(b"EI") or after_token.endswith(b"EI"): + data_out += data_buffered[: pos_tok + 1] + stream.seek(-len(data_buffered) + pos_tok + 1, 1) + else: + logger_warning("Early EOD in RunLengthDecode of inline image, using fallback.", __name__) + ei_marker = data_buffered.find(b"EI") + if ei_marker > 0: + data_out += data_buffered[: ei_marker] + stream.seek(-len(data_buffered) + ei_marker - 1, 1) break data_out += data_buffered - ei_tok = read_non_whitespace(stream) - ei_tok += stream.read(2) - stream.seek(-3, 1) - if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): + if not _check_end_image_marker(stream): raise PdfReadError("EI stream not found") return data_out -def extract_inline_DCT(stream: StreamType) -> bytes: +def extract_inline__dct_decode(stream: StreamType) -> bytes: """ Extract DCT (JPEG) stream from inline image. The stream will be moved onto the EI. @@ -185,10 +200,7 @@ def read(length: int) -> bytes: sz = c[0] * 256 + c[1] data_out += read(sz - 2) - ei_tok = read_non_whitespace(stream) - ei_tok += stream.read(2) - stream.seek(-3, 1) - if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES): + if not _check_end_image_marker(stream): raise PdfReadError("EI stream not found") return data_out diff --git a/tests/generic/test_image_inline.py b/tests/generic/test_image_inline.py index c1f5b7dbd..042a396fd 100644 --- a/tests/generic/test_image_inline.py +++ b/tests/generic/test_image_inline.py @@ -75,3 +75,14 @@ def test_extract_inline_dct__early_end_of_file(): with pytest.raises(expected_exception=PdfReadError, match=r"^Unexpected end of stream$"): page.images[0].image.load() + + +@pytest.mark.enable_socket +def test_extract_inline_dct__multiple_eod(): + url = "https://github.com/user-attachments/files/23900687/cedolini_esempio-1.pdf" + name = "issue3517.pdf" + reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + + for page in reader.pages: + for image in page.images: + _ = image.image.load() diff --git a/tests/test_filters.py b/tests/test_filters.py index 2d6f4c75f..c7bcbb1f7 100644 --- a/tests/test_filters.py +++ b/tests/test_filters.py @@ -842,19 +842,22 @@ def test_rle_decode_with_faulty_tail_byte_in_multi_encoded_stream(caplog): @pytest.mark.enable_socket -def test_rle_decode_exception_with_corrupted_stream(): +def test_rle_decode_exception_with_corrupted_stream(caplog): """ Additional Test to #3355 - This test must raise the EOD exception during RLE decoding and ensures + This test must report the EOD warning during RLE decoding and ensures that we do not fail during code coverage analyses in the git PR pipeline. """ data = get_data_from_url( url="https://github.com/user-attachments/files/21052626/rle_stream_with_error.txt", name="rle_stream_with_error.txt" ) - with pytest.raises(PdfStreamError, match="Early EOD in RunLengthDecode"): - RunLengthDecode.decode(data) + decoded = RunLengthDecode.decode(data) + assert decoded.startswith(b"\x01\x01\x01\x01\x01\x01\x01\x02\x02\x02\x02\x02\x02\x02\x03\x03") + assert decoded.endswith(b"\x87\x83\x83\x83\x83\x83\x83\x83]]]]]]]RRRRRRRX\xa5") + assert len(decoded) == 1048576 + assert caplog.messages == ["Early EOD in RunLengthDecode, check if output is OK"] def test_decompress(): diff --git a/tests/test_generic.py b/tests/test_generic.py index fde9ddd79..da8b3314b 100644 --- a/tests/test_generic.py +++ b/tests/test_generic.py @@ -41,10 +41,10 @@ read_string_from_stream, ) from pypdf.generic._image_inline import ( - extract_inline_A85, - extract_inline_AHx, - extract_inline_DCT, - extract_inline_RL, + extract_inline__ascii85_decode, + extract_inline__ascii_hex_decode, + extract_inline__dct_decode, + extract_inline__run_length_decode, ) from . import ReaderDummy, get_data_from_url @@ -1158,36 +1158,36 @@ def test_array_operators(): def test_unitary_extract_inline_buffer_invalid(): with pytest.raises(PdfReadError): - extract_inline_AHx(BytesIO()) + extract_inline__ascii_hex_decode(BytesIO()) with pytest.raises(PdfReadError): - extract_inline_AHx(BytesIO(4095 * b"00" + b" ")) + extract_inline__ascii_hex_decode(BytesIO(4095 * b"00" + b" ")) with pytest.raises(PdfReadError): - extract_inline_AHx(BytesIO(b"00")) + extract_inline__ascii_hex_decode(BytesIO(b"00")) with pytest.raises(PdfReadError): - extract_inline_A85(BytesIO()) + extract_inline__ascii85_decode(BytesIO()) with pytest.raises(PdfReadError): - extract_inline_A85(BytesIO(a85encode(b"1"))) + extract_inline__ascii85_decode(BytesIO(a85encode(b"1"))) with pytest.raises(PdfReadError): - extract_inline_A85(BytesIO(a85encode(b"1") + b"~> Q")) + extract_inline__ascii85_decode(BytesIO(a85encode(b"1") + b"~> Q")) with pytest.raises(PdfReadError): - extract_inline_A85(BytesIO(a85encode(b"1234578" * 990))) + extract_inline__ascii85_decode(BytesIO(a85encode(b"1234578" * 990))) with pytest.raises(PdfReadError): - extract_inline_RL(BytesIO()) + extract_inline__run_length_decode(BytesIO()) with pytest.raises(PdfReadError): - extract_inline_RL(BytesIO(b"\x01\x01\x80")) + extract_inline__run_length_decode(BytesIO(b"\x01\x01\x80")) with pytest.raises(PdfReadError): - extract_inline_DCT(BytesIO(b"\xFF\xD9")) + extract_inline__dct_decode(BytesIO(b"\xFF\xD9")) def test_unitary_extract_inline(): # AHx b = 16000 * b"00" - assert len(extract_inline_AHx(BytesIO(b + b" EI"))) == len(b) + assert len(extract_inline__ascii_hex_decode(BytesIO(b + b" EI"))) == len(b) with pytest.raises(PdfReadError): - extract_inline_AHx(BytesIO(b + b"> ")) + extract_inline__ascii_hex_decode(BytesIO(b + b"> ")) # RL b = 8200 * b"\x00\xAB" + b"\x80" - assert len(extract_inline_RL(BytesIO(b + b" EI"))) == len(b) + assert len(extract_inline__run_length_decode(BytesIO(b + b" EI"))) == len(b) # default # EIDD instead of EI; using A85