
Commit f233a73

Add support for gru_linear_before_reset=0
1 parent dad2054 commit f233a73

File tree

3 files changed: +327 -10 lines changed

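For context: the commit implements the ONNX GRU attribute linear_before_reset. Per the ONNX operator spec (and restated in the docstring added to onnx2pytorch/operations/gru.py below), the two modes differ only in how the candidate hidden state is formed; with g the hidden activation (tanh by default):

    \tilde{h}_t = g\left( X_t W_h^\top + (r_t \odot H_{t-1}) R_h^\top + R_{bh} + W_{bh} \right)    (linear_before_reset = 0)

    \tilde{h}_t = g\left( X_t W_h^\top + r_t \odot (H_{t-1} R_h^\top + R_{bh}) + W_{bh} \right)    (linear_before_reset = 1)

PyTorch's nn.GRU computes the second form, which is why mode 0 needs the custom cell added in this commit.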

onnx2pytorch/convert/layer.py

Lines changed: 3 additions & 6 deletions
@@ -412,7 +412,7 @@ def convert_gru_layer(node, weights):
         direction="forward",
         hidden_size=None,
         layout=0,
-        linear_before_reset=0,
+        linear_before_reset=0,  # ONNX spec default
     )
     dc.update(extract_attributes(node))
     if dc["activation_alpha"] is not None:
@@ -436,10 +436,7 @@ def convert_gru_layer(node, weights):
         raise NotImplementedError(
             "GRU not implemented for layout={}".format(dc["layout"])
         )
-    if dc["linear_before_reset"] != 0:
-        raise NotImplementedError(
-            "GRU linear_before_reset={}".format(dc["linear_before_reset"])
-        )
+    # linear_before_reset is now supported for both 0 and 1
 
     kwargs = {
         "input_size": W.shape[2],
@@ -570,5 +567,5 @@ def convert_gru_layer(node, weights):
     )
     getattr(gru_layer, "bias_hh_l0").data = Rb_rzn
 
-    layer = GRUWrapper(gru_layer)
+    layer = GRUWrapper(gru_layer, linear_before_reset=dc["linear_before_reset"])
     return layer

onnx2pytorch/operations/gru.py

Lines changed: 153 additions & 4 deletions
@@ -1,31 +1,180 @@
+import torch
 from torch import nn
 
 
 class GRUWrapper(nn.Module):
-    """Wraps a 1-layer nn.GRU to match the API of an ONNX GRU.
+    """Wraps a 1-layer nn.GRU or custom GRU to match the API of an ONNX GRU.
 
     It expects h_0 as a separate input rather than as a tuple,
     and returns h_n as a separate output rather than as a tuple.
+
+    Supports both linear_before_reset=0 and linear_before_reset=1.
     """
 
-    def __init__(self, gru_module: nn.GRU):
+    def __init__(self, gru_module, linear_before_reset=1):
         super().__init__()
         self.gru = gru_module
+        self.linear_before_reset = linear_before_reset
+
+        # For linear_before_reset=0, we need a custom forward pass
+        if linear_before_reset == 0 and isinstance(gru_module, nn.GRU):
+            # Extract parameters from the PyTorch GRU for the custom implementation
+            self.input_size = gru_module.input_size
+            self.hidden_size = gru_module.hidden_size
+            self.bidirectional = gru_module.bidirectional
 
     def forward(self, input, h_0=None):
         (seq_len, batch, input_size) = input.shape
         num_layers = 1
-        num_directions = self.gru.bidirectional + 1
+        num_directions = (
+            self.gru.bidirectional + 1 if hasattr(self.gru, "bidirectional") else 1
+        )
         hidden_size = self.gru.hidden_size
 
         if h_0 is None or h_0.numel() == 0:
             h_0 = None
 
-        output, h_n = self.gru(input, h_0)
+        if self.linear_before_reset == 1:
+            # Use the standard PyTorch GRU (linear_before_reset=1 is PyTorch's default)
+            output, h_n = self.gru(input, h_0)
+        else:
+            # Custom implementation for linear_before_reset=0
+            output, h_n = self._forward_linear_before_reset_0(input, h_0)
 
         # Y has shape (seq_length, num_directions, batch_size, hidden_size)
         Y = output.view(seq_len, batch, num_directions, hidden_size).transpose(1, 2)
         # Y_h has shape (num_directions, batch_size, hidden_size)
         Y_h = h_n.view(num_layers, num_directions, batch, hidden_size).squeeze(0)
 
         return Y, Y_h
+
+    def _forward_linear_before_reset_0(self, input, h_0):
+        """Custom GRU forward with linear_before_reset=0 (ONNX/TensorFlow default).
+
+        Key difference from linear_before_reset=1 (PyTorch's default):
+        - linear_before_reset=0: ht = g(Xt*(Wh^T) + (rt (.) Ht-1)*(Rh^T) + Rbh + Wbh)
+          The reset gate is applied to the hidden state BEFORE the matrix multiplication.
+        - linear_before_reset=1: ht = g(Xt*(Wh^T) + (rt (.) (Ht-1*(Rh^T) + Rbh)) + Wbh)
+          The reset gate is applied AFTER the matrix multiplication and bias addition.
+
+        Equations for linear_before_reset=0:
+            r_t = sigmoid(W_ir @ x_t + b_ir + W_hr @ h_{t-1} + b_hr)
+            z_t = sigmoid(W_iz @ x_t + b_iz + W_hz @ h_{t-1} + b_hz)
+            n_t = tanh(W_in @ x_t + b_in + (r_t * h_{t-1}) @ W_hn + b_hn)
+            h_t = (1 - z_t) * n_t + z_t * h_{t-1}
+        """
+        seq_len, batch, input_size = input.shape
+        hidden_size = self.hidden_size
+        num_directions = 2 if self.bidirectional else 1
+
+        if h_0 is None:
+            h_0 = torch.zeros(
+                num_directions,
+                batch,
+                hidden_size,
+                device=input.device,
+                dtype=input.dtype,
+            )
+
+        # Extract weights from the PyTorch GRU.
+        # PyTorch stores weights as: weight_ih_l0, weight_hh_l0, bias_ih_l0, bias_hh_l0
+        # For bidirectional: also weight_ih_l0_reverse, weight_hh_l0_reverse, etc.
+
+        def gru_cell_linear_before_reset_0(
+            x_t, h_prev, weight_ih, weight_hh, bias_ih, bias_hh
+        ):
+            """Single GRU cell with linear_before_reset=0."""
+            # Split weights for the gates: reset, update, new
+            # PyTorch order: [reset, update, new]
+            hidden_size = h_prev.size(1)
+
+            # Input-to-hidden weights
+            W_ir, W_iz, W_in = weight_ih.chunk(3, 0)
+            # Hidden-to-hidden weights
+            W_hr, W_hz, W_hn = weight_hh.chunk(3, 0)
+            # Input biases
+            b_ir, b_iz, b_in = (
+                bias_ih.chunk(3, 0) if bias_ih is not None else (None, None, None)
+            )
+            # Hidden biases
+            b_hr, b_hz, b_hn = (
+                bias_hh.chunk(3, 0) if bias_hh is not None else (None, None, None)
+            )
+
+            # Reset gate
+            r_t = torch.sigmoid(
+                x_t @ W_ir.t()
+                + (b_ir if b_ir is not None else 0)
+                + h_prev @ W_hr.t()
+                + (b_hr if b_hr is not None else 0)
+            )
+
+            # Update gate
+            z_t = torch.sigmoid(
+                x_t @ W_iz.t()
+                + (b_iz if b_iz is not None else 0)
+                + h_prev @ W_hz.t()
+                + (b_hz if b_hz is not None else 0)
+            )
+
+            # New gate (linear_before_reset=0 version)
+            # Note: the reset gate is applied to h_prev BEFORE the matrix multiplication
+            # ONNX spec: ht = g(Xt*(Wh^T) + (rt (.) Ht-1)*(Rh^T) + Rbh + Wbh)
+            n_t = torch.tanh(
+                x_t @ W_in.t()
+                + (b_in if b_in is not None else 0)
+                + (r_t * h_prev) @ W_hn.t()
+                + (b_hn if b_hn is not None else 0)
+            )
+
+            # Hidden state update
+            h_t = (1 - z_t) * n_t + z_t * h_prev
+
+            return h_t
+
+        # Process the sequence in the forward direction
+        outputs_forward = []
+        h_forward = h_0[0]
+
+        for t in range(seq_len):
+            h_forward = gru_cell_linear_before_reset_0(
+                input[t],
+                h_forward,
+                self.gru.weight_ih_l0,
+                self.gru.weight_hh_l0,
+                self.gru.bias_ih_l0 if self.gru.bias else None,
+                self.gru.bias_hh_l0 if self.gru.bias else None,
+            )
+            outputs_forward.append(h_forward)
+
+        if self.bidirectional:
+            # Process the backward direction
+            outputs_backward = []
+            h_backward = h_0[1]
+
+            for t in range(seq_len - 1, -1, -1):
+                h_backward = gru_cell_linear_before_reset_0(
+                    input[t],
+                    h_backward,
+                    self.gru.weight_ih_l0_reverse,
+                    self.gru.weight_hh_l0_reverse,
+                    self.gru.bias_ih_l0_reverse if self.gru.bias else None,
+                    self.gru.bias_hh_l0_reverse if self.gru.bias else None,
+                )
+                outputs_backward.append(h_backward)
+
+            outputs_backward.reverse()
+
+            # Concatenate forward and backward outputs
+            output = torch.stack(
+                [
+                    torch.cat([outputs_forward[t], outputs_backward[t]], dim=1)
+                    for t in range(seq_len)
+                ]
+            )
+            h_n = torch.stack([h_forward, h_backward])
+        else:
+            output = torch.stack(outputs_forward)
+            h_n = h_forward.unsqueeze(0)
+
+        return output, h_n
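
For orientation, a minimal usage sketch of the wrapper follows. It is not part of the commit; it assumes the import path implied by the file header above (onnx2pytorch.operations.gru) and nn.GRU's default bias=True.

    import torch
    from torch import nn

    # Assumed import path, taken from the file header above.
    from onnx2pytorch.operations.gru import GRUWrapper

    # A standard 1-layer, unidirectional PyTorch GRU supplies the weights;
    # the wrapper reuses them but evaluates the ONNX linear_before_reset=0 equations.
    gru = nn.GRU(input_size=3, hidden_size=5)
    wrapper = GRUWrapper(gru, linear_before_reset=0)

    x = torch.randn(7, 2, 3)    # (seq_len, batch, input_size)
    h_0 = torch.zeros(1, 2, 5)  # (num_directions, batch, hidden_size)

    with torch.no_grad():
        Y, Y_h = wrapper(x, h_0)

    print(Y.shape)    # torch.Size([7, 1, 2, 5]) -> (seq_len, num_directions, batch, hidden_size)
    print(Y_h.shape)  # torch.Size([1, 2, 5])    -> (num_directions, batch, hidden_size)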
Lines changed: 171 additions & 0 deletions
@@ -0,0 +1,171 @@
+import io
+import numpy as np
+import onnx
+import onnxruntime as ort
+import pytest
+import torch
+from onnx import helper, TensorProto
+
+from onnx2pytorch.convert import ConvertModel
+
+
+@pytest.mark.parametrize(
+    "bidirectional, input_size, hidden_size, seq_len, batch, test_seq_len, test_batch",
+    [
+        (False, 3, 5, 23, 4, 23, 4),
+        (False, 3, 5, 23, 4, 37, 4),
+        (False, 3, 5, 23, 4, 23, 7),
+        (True, 3, 5, 23, 4, 23, 4),
+        (True, 3, 5, 23, 4, 37, 4),
+        (True, 3, 5, 23, 4, 23, 7),
+    ],
+)
+def test_single_layer_gru(
+    bidirectional, input_size, hidden_size, seq_len, batch, test_seq_len, test_batch
+):
+    torch.manual_seed(42)
+    num_layers = 1
+    num_directions = bidirectional + 1
+    gru = torch.nn.GRU(
+        input_size=input_size,
+        hidden_size=hidden_size,
+        num_layers=num_layers,
+        bidirectional=bidirectional,
+    )
+    input = torch.randn(seq_len, batch, input_size)
+    h_0 = torch.randn(num_layers * num_directions, batch, hidden_size)
+    output, h_n = gru(input, h_0)
+    bitstream = io.BytesIO()
+    torch.onnx.export(
+        model=gru,
+        args=(input, h_0),
+        f=bitstream,
+        input_names=["input", "h_0"],
+        opset_version=11,
+        dynamo=False,  # Use the legacy exporter for GRU compatibility
+        dynamic_axes={
+            "input": {0: "seq_len", 1: "batch"},
+            "h_0": {1: "batch"},
+        },
+    )
+    bitstream_data = bitstream.getvalue()
+
+    onnx_gru = onnx.ModelProto.FromString(bitstream_data)
+    o2p_gru = ConvertModel(onnx_gru, experimental=True)
+    with torch.no_grad():
+        o2p_output, o2p_h_n = o2p_gru(input, h_0)
+    torch.testing.assert_close(o2p_output, output, rtol=1e-6, atol=1e-6)
+    torch.testing.assert_close(o2p_h_n, h_n, rtol=1e-6, atol=1e-6)
+
+    onnx_gru = onnx.ModelProto.FromString(bitstream_data)
+    o2p_gru = ConvertModel(onnx_gru, experimental=True)
+    with torch.no_grad():
+        o2p_output, o2p_h_n = o2p_gru(h_0=h_0, input=input)
+    torch.testing.assert_close(o2p_output, output, rtol=1e-6, atol=1e-6)
+    torch.testing.assert_close(o2p_h_n, h_n, rtol=1e-6, atol=1e-6)
+    with pytest.raises(KeyError):
+        o2p_output, o2p_h_n = o2p_gru(input=input)
+    with pytest.raises(Exception):
+        # Even though initial states are optional for nn.GRU(),
+        # we adhere to the onnxruntime convention that inputs are provided
+        # as either all positional or all keyword arguments.
+        o2p_output, o2p_h_n = o2p_gru(input, h_0=h_0)
+
+
+@pytest.mark.parametrize("linear_before_reset", [0, 1])
+@pytest.mark.parametrize("bidirectional", [False, True])
+def test_gru_linear_before_reset(linear_before_reset, bidirectional):
+    """Test GRU with both linear_before_reset=0 (ONNX/TensorFlow default) and =1 (PyTorch default)."""
+    torch.manual_seed(42)
+    np.random.seed(42)
+
+    input_size = 3
+    hidden_size = 4
+    seq_len = 5
+    batch = 2
+    num_directions = 2 if bidirectional else 1
+
+    # Create input and initial hidden state
+    X = np.random.randn(seq_len, batch, input_size).astype(np.float32)
+    initial_h = np.random.randn(num_directions, batch, hidden_size).astype(np.float32)
+
+    # Create random weights for the GRU
+    # W shape: [num_directions, 3*hidden_size, input_size]
+    W = np.random.randn(num_directions, 3 * hidden_size, input_size).astype(np.float32)
+    # R shape: [num_directions, 3*hidden_size, hidden_size]
+    R = np.random.randn(num_directions, 3 * hidden_size, hidden_size).astype(np.float32)
+    # B shape: [num_directions, 6*hidden_size] (Wb and Rb concatenated)
+    B = np.random.randn(num_directions, 6 * hidden_size).astype(np.float32)
+
+    # Create an ONNX graph with a GRU node
+    input_tensor = helper.make_tensor_value_info(
+        "X", TensorProto.FLOAT, [seq_len, batch, input_size]
+    )
+    initial_h_tensor = helper.make_tensor_value_info(
+        "initial_h", TensorProto.FLOAT, [num_directions, batch, hidden_size]
+    )
+    output_tensor = helper.make_tensor_value_info(
+        "Y", TensorProto.FLOAT, [seq_len, num_directions, batch, hidden_size]
+    )
+    output_h_tensor = helper.make_tensor_value_info(
+        "Y_h", TensorProto.FLOAT, [num_directions, batch, hidden_size]
+    )
+
+    W_initializer = helper.make_tensor(
+        "W", TensorProto.FLOAT, W.shape, W.flatten().tolist()
+    )
+    R_initializer = helper.make_tensor(
+        "R", TensorProto.FLOAT, R.shape, R.flatten().tolist()
+    )
+    B_initializer = helper.make_tensor(
+        "B", TensorProto.FLOAT, B.shape, B.flatten().tolist()
+    )
+
+    direction = "bidirectional" if bidirectional else "forward"
+    gru_node = helper.make_node(
+        "GRU",
+        inputs=["X", "W", "R", "B", "", "initial_h"],
+        outputs=["Y", "Y_h"],
+        hidden_size=hidden_size,
+        linear_before_reset=linear_before_reset,
+        direction=direction,
+    )
+
+    graph = helper.make_graph(
+        [gru_node],
+        "gru_test",
+        [input_tensor, initial_h_tensor],
+        [output_tensor, output_h_tensor],
+        [W_initializer, R_initializer, B_initializer],
+    )
+
+    model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 14)])
+    onnx.checker.check_model(model)
+
+    # Run with onnxruntime to get the expected output
+    ort_session = ort.InferenceSession(model.SerializeToString())
+    ort_inputs = {"X": X, "initial_h": initial_h}
+    ort_outputs = ort_session.run(None, ort_inputs)
+    expected_Y, expected_Y_h = ort_outputs
+
+    # Convert to PyTorch and run
+    o2p_gru = ConvertModel(model, experimental=True)
+    X_torch = torch.from_numpy(X)
+    initial_h_torch = torch.from_numpy(initial_h)
+
+    with torch.no_grad():
+        o2p_output, o2p_h_n = o2p_gru(X_torch, initial_h_torch)
+
+    # Compare with onnxruntime outputs
+    torch.testing.assert_close(
+        o2p_output,
+        torch.from_numpy(expected_Y),
+        rtol=1e-5,
+        atol=1e-5,
+    )
+    torch.testing.assert_close(
+        o2p_h_n,
+        torch.from_numpy(expected_Y_h),
+        rtol=1e-5,
+        atol=1e-5,
+    )
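
A note on running the new cases locally: the test file's path is not shown in this diff, so the snippet below (an assumption, not part of the commit) selects the parametrized test by keyword instead of by path.

    # Hypothetical local run from the repository root; picks up the new cases by name.
    import pytest

    raise SystemExit(pytest.main(["-q", "-k", "gru_linear_before_reset"]))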
