Added GZIP input support for V1

tcalmant · tcalmant · commit 77017893848c · 2021-05-14T15:10:25.000+02:00
diff --git a/javaobj/utils.py b/javaobj/utils.py
@@ -30,8 +30,10 @@
 from __future__ import absolute_import
 
 # Standard library
-from typing import Tuple  # noqa: F401
+from typing import IO, Tuple  # noqa: F401
+import gzip
 import logging
+import os
 import struct
 import sys
 
@@ -107,6 +109,36 @@ def read_string(data, length_fmt="H"):
 # ------------------------------------------------------------------------------
 
 
+def java_data_fd(original_df):
+    # type: (IO[bytes]) -> IO[bytes]
+    """
+    Returns a stream giving access to the raw Java serialized content.
+    GZipped input is detected from its magic number and uncompressed on the
+    fly; any other input is returned as is, rewound to its initial position.
+
+    :param original_df: Input file descriptor (must support tell() and seek())
+    :return: Input file descriptor or a fake one to access uncompressed data
+    :raise IOError: Error reading input file
+    """
+    # Peek at the first two bytes, then rewind so no data is consumed
+    start_idx = original_df.tell()
+    magic_header = original_df.read(2)
+    original_df.seek(start_idx, os.SEEK_SET)
+
+    # Compare byte strings instead of indexing: indexing raises IndexError
+    # on short/empty input and doesn't yield integers on Python 2
+    if magic_header == b"\x1f\x8b":
+        # GZip magic number: wrap the stream. GzipFile(fileobj=...) is used
+        # as gzip.open() only accepts file objects on Python 3
+        return gzip.GzipFile(fileobj=original_df, mode="rb")
+
+    # Raw serialized stream, or unknown content: let the parser handle it
+    return original_df
+
+
+# ------------------------------------------------------------------------------
+
+
 def hexdump(src, start_offset=0, length=16):
     # type: (str, int, int) -> str
     """
diff --git a/javaobj/v1/core.py b/javaobj/v1/core.py
@@ -47,6 +47,7 @@
 from .marshaller import JavaObjectMarshaller
 from .unmarshaller import JavaObjectUnmarshaller
 from .transformers import DefaultObjectTransformer
+from ..utils import java_data_fd
 
 # ------------------------------------------------------------------------------
 
@@ -81,6 +82,9 @@ def load(file_object, *transformers, **kwargs):
                                   trailing bytes are remaining
     :return: The deserialized object
     """
+    # Check file format (uncompress if necessary)
+    file_object = java_data_fd(file_object)
+
     # Read keyword argument
     ignore_remaining_data = kwargs.get("ignore_remaining_data", False)
 
diff --git a/tests/tests.py b/tests/tests.py
@@ -43,7 +43,7 @@
 
 # Local
 import javaobj.v1 as javaobj
-from javaobj.utils import hexdump
+from javaobj.utils import hexdump, java_data_fd
 
 # ------------------------------------------------------------------------------
 
@@ -141,6 +141,34 @@ def test_chars_rw(self):
         self.assertEqual(pobj, expected)
         self._try_marshalling(jobj, pobj)
 
+    def test_gzip_open(self):
+        """
+        Tests if the GZip auto-uncompress works
+        """
+        with self.read_file("testChars.ser", stream=True) as raw_fd:
+            base = java_data_fd(raw_fd).read()
+
+        # GzipFile.close() leaves the wrapped stream open: close it explicitly
+        with self.read_file("testChars.ser.gz", stream=True) as gz_fd:
+            with java_data_fd(gz_fd) as fd:
+                gzipped = fd.read()
+
+        self.assertEqual(
+            base, gzipped, "Uncompressed content doesn't match the original"
+        )
+
+    def test_chars_gzip(self):
+        """
+        Loads testChars.ser.gz with loads() and checks the resulting string
+        """
+        compressed = self.read_file("testChars.ser.gz")
+        loaded = javaobj.loads(compressed)
+        _logger.debug("Read char objects: %s", loaded)
+
+        # Expected value: UTF-16 (big endian) bytes of the text, as latin1
+        utf16_chars = "python-javaobj".encode("utf-16-be").decode("latin1")
+        self.assertEqual(loaded, utf16_chars)
+
     def test_double_rw(self):
         """
         Reads testDouble.ser and checks the serialization process