py-pdf · stefan6419846 · Dec 5, 2025 · Dec 5, 2025
diff --git a/pypdf/filters.py b/pypdf/filters.py
@@ -408,12 +408,17 @@ def decode(
                     # We should first check, if we have an inner stream from a multi-encoded
                     # stream with a faulty trailing newline that we can decode properly.
                     # We will just ignore the last byte and raise a warning ...
-                    if (index == data_length - 1) and (data[index : index+1] == b"\n"):
+                    if (index == data_length - 1) and (data[index : index + 1] == b"\n"):
                         logger_warning(
                             "Found trailing newline in stream data, check if output is OK", __name__
                         )
                         break
-                    raise PdfStreamError("Early EOD in RunLengthDecode")
+                    # Raising an exception here breaks all image extraction for this file, which might
+                    # not be desirable. For this reason, indicate that the output is most likely wrong,
+                    # as processing stopped after the first EOD marker. See issue #3517.
+                    logger_warning(
+                        "Early EOD in RunLengthDecode, check if output is OK", __name__
+                    )
                 break
             if length < 128:
                 length += 1

diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py
@@ -78,11 +78,11 @@
 )
 from ._fit import Fit
 from ._image_inline import (
-    extract_inline_A85,
-    extract_inline_AHx,
-    extract_inline_DCT,
+    extract_inline__ascii85_decode,
+    extract_inline__ascii_hex_decode,
+    extract_inline__dct_decode,
+    extract_inline__run_length_decode,
     extract_inline_default,
-    extract_inline_RL,
 )
 from ._utils import read_hex_string_from_stream, read_string_from_stream
 
@@ -1325,13 +1325,13 @@ def _read_inline_image(self, stream: StreamType) -> dict[str, Any]:
         if isinstance(filtr, list):
             filtr = filtr[0]  # used forencoding
         if "AHx" in filtr or "ASCIIHexDecode" in filtr:
-            data = extract_inline_AHx(stream)
+            data = extract_inline__ascii_hex_decode(stream)
         elif "A85" in filtr or "ASCII85Decode" in filtr:
-            data = extract_inline_A85(stream)
+            data = extract_inline__ascii85_decode(stream)
         elif "RL" in filtr or "RunLengthDecode" in filtr:
-            data = extract_inline_RL(stream)
+            data = extract_inline__run_length_decode(stream)
         elif "DCT" in filtr or "DCTDecode" in filtr:
-            data = extract_inline_DCT(stream)
+            data = extract_inline__dct_decode(stream)
         elif filtr == "not set":
             cs = settings.get("/CS", "")
             if isinstance(cs, list):

diff --git a/pypdf/generic/_image_inline.py b/pypdf/generic/_image_inline.py
@@ -33,6 +33,7 @@
     WHITESPACES,
     WHITESPACES_AS_BYTES,
     StreamType,
+    logger_warning,
     read_non_whitespace,
 )
 from ..errors import PdfReadError
@@ -44,7 +45,14 @@
 BUFFER_SIZE = 8192
 
 
-def extract_inline_AHx(stream: StreamType) -> bytes:
+def _check_end_image_marker(stream: StreamType) -> bool:
+    # Read the next non-whitespace byte plus two more, rewind three bytes, test for `EI` + whitespace/EOF.
+    token = read_non_whitespace(stream) + stream.read(2)
+    stream.seek(-3, 1)
+    return token.startswith(b"EI") and (len(token) == 2 or token[2:3] in WHITESPACES)
+
+
+def extract_inline__ascii_hex_decode(stream: StreamType) -> bytes:
     """
     Extract HexEncoded stream from inline image.
     The stream will be moved onto the EI.
@@ -77,15 +85,12 @@ def extract_inline_AHx(stream: StreamType) -> bytes:
         data_out += data_buffered[:-2]
         stream.seek(-2, 1)
 
-    ei_tok = read_non_whitespace(stream)
-    ei_tok += stream.read(2)
-    stream.seek(-3, 1)
-    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
+    if not _check_end_image_marker(stream):
         raise PdfReadError("EI stream not found")
     return data_out
 
 
-def extract_inline_A85(stream: StreamType) -> bytes:
+def extract_inline__ascii85_decode(stream: StreamType) -> bytes:
     """
     Extract A85 stream from inline image.
     The stream will be moved onto the EI.
@@ -109,15 +114,12 @@ def extract_inline_A85(stream: StreamType) -> bytes:
         ]  # back by one char in case of in the middle of ~>
         stream.seek(-2, 1)
 
-    ei_tok = read_non_whitespace(stream)
-    ei_tok += stream.read(2)
-    stream.seek(-3, 1)
-    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
+    if not _check_end_image_marker(stream):
         raise PdfReadError("EI stream not found")
     return data_out
 
 
-def extract_inline_RL(stream: StreamType) -> bytes:
+def extract_inline__run_length_decode(stream: StreamType) -> bytes:
     """
     Extract RL (RunLengthDecode) stream from inline image.
     The stream will be moved onto the EI.
@@ -130,20 +132,33 @@ def extract_inline_RL(stream: StreamType) -> bytes:
             raise PdfReadError("Unexpected end of stream")
         pos_tok = data_buffered.find(b"\x80")
         if pos_tok >= 0:  # found
-            data_out += data_buffered[: pos_tok + 1]
-            stream.seek(-len(data_buffered) + pos_tok + 1, 1)
+            # 0x80 marks the EOD, but broken files (issue #3517) repeat it inside the data. Only trust
+            # it when `EI` follows after at most one byte; otherwise fall back to searching for `EI`,
+            # which still ignores `EI` sequences occurring inside the image data itself.
+            after_token = data_buffered[pos_tok + 1 : pos_tok + 4]
+            peeked = stream.read(3 - len(after_token))  # the marker may straddle two buffers
+            stream.seek(-len(peeked), 1)
+            after_token += peeked
+            if after_token.startswith(b"EI") or after_token.endswith(b"EI"):
+                data_out += data_buffered[: pos_tok + 1]
+                stream.seek(-len(data_buffered) + pos_tok + 1, 1)
+            else:
+                logger_warning("Early EOD in RunLengthDecode of inline image, using fallback.", __name__)
+                ei_marker = data_buffered.find(b"EI")
+                if ei_marker > 0:
+                    data_out += data_buffered[:ei_marker]
+                    stream.seek(-len(data_buffered) + ei_marker - 1, 1)
+                else:  # keep the bytes instead of dropping the buffer; the `EI` check decides
+                    data_out += data_buffered
             break
         data_out += data_buffered
 
-    ei_tok = read_non_whitespace(stream)
-    ei_tok += stream.read(2)
-    stream.seek(-3, 1)
-    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
+    if not _check_end_image_marker(stream):
         raise PdfReadError("EI stream not found")
     return data_out
 
 
-def extract_inline_DCT(stream: StreamType) -> bytes:
+def extract_inline__dct_decode(stream: StreamType) -> bytes:
     """
     Extract DCT (JPEG) stream from inline image.
     The stream will be moved onto the EI.
@@ -185,10 +200,7 @@ def read(length: int) -> bytes:
             sz = c[0] * 256 + c[1]
             data_out += read(sz - 2)
 
-    ei_tok = read_non_whitespace(stream)
-    ei_tok += stream.read(2)
-    stream.seek(-3, 1)
-    if ei_tok[:2] != b"EI" or not (ei_tok[2:3] == b"" or ei_tok[2:3] in WHITESPACES):
+    if not _check_end_image_marker(stream):
         raise PdfReadError("EI stream not found")
     return data_out
 

diff --git a/tests/generic/test_image_inline.py b/tests/generic/test_image_inline.py
@@ -75,3 +75,14 @@ def test_extract_inline_dct__early_end_of_file():
 
     with pytest.raises(expected_exception=PdfReadError, match=r"^Unexpected end of stream$"):
         page.images[0].image.load()
+
+
+@pytest.mark.enable_socket
+def test_extract_inline_dct__multiple_eod():
+    # Issue #3517: run-length encoded inline images with multiple EOD markers must still load.
+    pdf_bytes = get_data_from_url(
+        "https://github.com/user-attachments/files/23900687/cedolini_esempio-1.pdf", name="issue3517.pdf"
+    )
+    for page in PdfReader(BytesIO(pdf_bytes)).pages:
+        for image in page.images:
+            _ = image.image.load()
diff --git a/tests/test_filters.py b/tests/test_filters.py
@@ -842,19 +842,22 @@ def test_rle_decode_with_faulty_tail_byte_in_multi_encoded_stream(caplog):
 
 
 @pytest.mark.enable_socket
-def test_rle_decode_exception_with_corrupted_stream():
+def test_rle_decode_exception_with_corrupted_stream(caplog):
     """
     Additional Test to #3355
 
-    This test must raise the EOD exception during RLE decoding and ensures
+    This test must log the early EOD warning instead of raising during RLE decoding and ensures
     that we do not fail during code coverage analyses in the git PR pipeline.
     """
     data = get_data_from_url(
         url="https://github.com/user-attachments/files/21052626/rle_stream_with_error.txt",
         name="rle_stream_with_error.txt"
     )
-    with pytest.raises(PdfStreamError, match="Early EOD in RunLengthDecode"):
-        RunLengthDecode.decode(data)
+    decoded = RunLengthDecode.decode(data)
+    assert decoded.startswith(b"\x01\x01\x01\x01\x01\x01\x01\x02\x02\x02\x02\x02\x02\x02\x03\x03")
+    assert decoded.endswith(b"\x87\x83\x83\x83\x83\x83\x83\x83]]]]]]]RRRRRRRX\xa5")
+    assert len(decoded) == 1048576
+    assert caplog.messages == ["Early EOD in RunLengthDecode, check if output is OK"]
 
 
 def test_decompress():

diff --git a/tests/test_generic.py b/tests/test_generic.py
@@ -41,10 +41,10 @@
     read_string_from_stream,
 )
 from pypdf.generic._image_inline import (
-    extract_inline_A85,
-    extract_inline_AHx,
-    extract_inline_DCT,
-    extract_inline_RL,
+    extract_inline__ascii85_decode,
+    extract_inline__ascii_hex_decode,
+    extract_inline__dct_decode,
+    extract_inline__run_length_decode,
 )
 
 from . import ReaderDummy, get_data_from_url
@@ -1158,36 +1158,36 @@ def test_array_operators():
 
 def test_unitary_extract_inline_buffer_invalid():
     with pytest.raises(PdfReadError):
-        extract_inline_AHx(BytesIO())
+        extract_inline__ascii_hex_decode(BytesIO())
     with pytest.raises(PdfReadError):
-        extract_inline_AHx(BytesIO(4095 * b"00" + b"   "))
+        extract_inline__ascii_hex_decode(BytesIO(4095 * b"00" + b"   "))
     with pytest.raises(PdfReadError):
-        extract_inline_AHx(BytesIO(b"00"))
+        extract_inline__ascii_hex_decode(BytesIO(b"00"))
     with pytest.raises(PdfReadError):
-        extract_inline_A85(BytesIO())
+        extract_inline__ascii85_decode(BytesIO())
     with pytest.raises(PdfReadError):
-        extract_inline_A85(BytesIO(a85encode(b"1")))
+        extract_inline__ascii85_decode(BytesIO(a85encode(b"1")))
     with pytest.raises(PdfReadError):
-        extract_inline_A85(BytesIO(a85encode(b"1") + b"~> Q"))
+        extract_inline__ascii85_decode(BytesIO(a85encode(b"1") + b"~> Q"))
     with pytest.raises(PdfReadError):
-        extract_inline_A85(BytesIO(a85encode(b"1234578" * 990)))
+        extract_inline__ascii85_decode(BytesIO(a85encode(b"1234578" * 990)))
     with pytest.raises(PdfReadError):
-        extract_inline_RL(BytesIO())
+        extract_inline__run_length_decode(BytesIO())
     with pytest.raises(PdfReadError):
-        extract_inline_RL(BytesIO(b"\x01\x01\x80"))
+        extract_inline__run_length_decode(BytesIO(b"\x01\x01\x80"))
     with pytest.raises(PdfReadError):
-        extract_inline_DCT(BytesIO(b"\xFF\xD9"))
+        extract_inline__dct_decode(BytesIO(b"\xFF\xD9"))
 
 
 def test_unitary_extract_inline():
     # AHx
     b = 16000 * b"00"
-    assert len(extract_inline_AHx(BytesIO(b + b" EI"))) == len(b)
+    assert len(extract_inline__ascii_hex_decode(BytesIO(b + b" EI"))) == len(b)
     with pytest.raises(PdfReadError):
-        extract_inline_AHx(BytesIO(b + b"> "))
+        extract_inline__ascii_hex_decode(BytesIO(b + b"> "))
     # RL
     b = 8200 * b"\x00\xAB" + b"\x80"
-    assert len(extract_inline_RL(BytesIO(b + b" EI"))) == len(b)
+    assert len(extract_inline__run_length_decode(BytesIO(b + b" EI"))) == len(b)
 
     # default
     # EIDD instead of EI; using A85