
Commit b4a2aa6
fix: load fused moe
1 parent 4c57054

File tree: 3 files changed, +138 -51 lines

python/sgl_jax/srt/layers/fused_moe.py
Lines changed: 5 additions & 1 deletion

@@ -184,6 +184,9 @@ def __call__(self, hidden_states: jax.Array, router_logits: jax.Array) -> jax.Array
         """
         assert hidden_states.ndim == 2
 
+        hidden_states = jax.sharding.reshard(hidden_states, P("tensor", None))
+        router_logits = jax.sharding.reshard(router_logits, P("tensor", None))
+
         output = fused_ep_moe(
             mesh=self.mesh,
             tokens=hidden_states,
@@ -212,4 +215,5 @@ def __call__(self, hidden_states: jax.Array, router_logits: jax.Array) -> jax.Array
             # tp_axis_name="data",
         )
 
-        return output
+        final_output = jax.sharding.reshard(output, P(None))
+        return final_output
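
For context on the partition specs used above: a minimal, self-contained sketch of what P("tensor", None) and P(None) request, written with the stable NamedSharding/jax.device_put API rather than the jax.sharding.reshard helper the diff calls inside its existing mesh. The mesh layout and array shapes below are illustrative assumptions, not taken from the repo.

import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Hypothetical 1-D mesh named "tensor", matching the axis name in the diff.
mesh = Mesh(np.array(jax.devices()), axis_names=("tensor",))

# Illustrative shapes: (num_tokens, hidden_size) and (num_tokens, num_experts).
# num_tokens must divide evenly across the "tensor" axis for this placement.
hidden_states = jnp.ones((8, 16))
router_logits = jnp.ones((8, 4))

# P("tensor", None): split dim 0 (tokens) across the "tensor" axis, keep dim 1 whole.
hidden_states = jax.device_put(hidden_states, NamedSharding(mesh, P("tensor", None)))
router_logits = jax.device_put(router_logits, NamedSharding(mesh, P("tensor", None)))

# P(None): no dimension is partitioned, i.e. the array is replicated on every device,
# which is the layout the new reshard of the MoE output requests.
output = jax.device_put(hidden_states, NamedSharding(mesh, P(None)))

print(hidden_states.sharding.spec)  # sharded along "tensor" on dim 0
print(output.sharding.spec)         # replicated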

python/sgl_jax/srt/models/qwen3_moe.py
Lines changed: 14 additions & 24 deletions

@@ -263,6 +263,7 @@ def __call__(
 
         if self.is_moe_layer:
             router_logits = self.moe_gate(hidden_states)
+
             if self.use_fused:
                 hidden_states = self.mlp(hidden_states, router_logits)
             else:
@@ -510,37 +511,28 @@ def _create_moe_layer_mappings(self, layer_idx: int, is_mlp_layer: bool) -> dict
             # Fused MoE Mapping
             # w1: fused gate_proj(w1) + up_proj(w3) -> (num_experts, 2, hidden, intermediate)
             # w2: down_proj(w2) -> (num_experts, intermediate, hidden)
-
-            # 1. Fused w1 (gate + up)
-            target_path_w1 = [f"{target_prefix}.mlp.w1"]
-            # Add source keys for gate_proj and up_proj
-            for name in ["gate_proj", "up_proj"]:
-                target_path_w1.extend(
-                    [f"{prefix}.mlp.experts.{i}.{name}.weight" for i in range(num_experts)]
-                )
-
+            w1_expert_keys = []
+            for expert_type in ["gate_proj", "up_proj"]:
+                w1_expert_keys = w1_expert_keys + [
+                    f"{prefix}.mlp.experts.{i}.{expert_type}.weight" for i in range(num_experts)
+                ]
             mappings[f"__MOE_EXPERTS__{prefix}.mlp.w1"] = WeightMapping(
-                target_path=target_path_w1,
+                target_path=[f"{target_prefix}.mlp.w1"] + w1_expert_keys,
                 sharding=("tensor", None, None, None),  # (E, 2, H, I)
                 transpose=True,
-                concat_axis=0,
                 fuse_moe_weights=True,
                 fuse_gate_up=("gate_proj", "up_proj"),
             )
-
-            # 2. w2 (down)
-            target_path_w2 = [f"{target_prefix}.mlp.w2"]
-            target_path_w2.extend(
-                [f"{prefix}.mlp.experts.{i}.down_proj.weight" for i in range(num_experts)]
-            )
-
+            w2_expert_keys = [
+                f"{prefix}.mlp.experts.{i}.down_proj.weight" for i in range(num_experts)
+            ]
             mappings[f"__MOE_EXPERTS__{prefix}.mlp.w2"] = WeightMapping(
-                target_path=target_path_w2,
+                target_path=[f"{target_prefix}.mlp.w2"] + w2_expert_keys,
                 sharding=("tensor", None, None),  # (E, I, H)
                 transpose=True,
-                concat_axis=-1,
             )
         else:
+            # EPMoE mapping - always use expert sharding
            for expert_type in ["gate_proj", "up_proj", "down_proj"]:
                target_name = {
                    "gate_proj": "wi_0",
@@ -553,9 +545,9 @@ def _create_moe_layer_mappings(self, layer_idx: int, is_mlp_layer: bool) -> dict
                 ]
 
                 if expert_type == "down_proj":
-                    sharding = ("tensor", None, None)
+                    sharding = ("expert", "tensor", None)
                 else:
-                    sharding = ("tensor", None, None)
+                    sharding = ("expert", None, "tensor")
 
                 mappings[f"__MOE_EXPERTS__{prefix}.mlp.{target_name}"] = WeightMapping(
                     target_path=[f"{target_prefix}.mlp.{target_name}"] + expert_keys,
@@ -598,8 +590,6 @@ def __call__(
         logits_metadata: LogitsMetadata,
     ):
         hidden_states, layers_kv_fused = self.model(forward_batch, token_to_kv_pool)
-        hidden_states = jax.sharding.reshard(hidden_states, jax.sharding.PartitionSpec(None, None))
-
         if not getattr(self.config, "tie_word_embeddings", False):
             output = self.logits_processor(hidden_states, self.lm_head, logits_metadata)
         else:
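
On the sharding tuples in the non-fused (EPMoE) branch: the change moves the expert dimension onto an "expert" mesh axis and shards the per-expert matmul dimension along "tensor". A small sketch of how such tuples map to device placement, assuming a hypothetical 2-D mesh with axes named "expert" and "tensor"; the mesh shape and weight shapes are illustrative, not taken from the repo.

import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Hypothetical 2-D mesh: first axis shards experts, second shards a matmul dimension.
devices = np.array(jax.devices())
mesh = Mesh(devices.reshape(-1, 1), axis_names=("expert", "tensor"))

num_experts, hidden, intermediate = 8, 16, 32  # must divide evenly across the mesh axes

# Gate/up style weights stacked per expert, placed as ("expert", None, "tensor").
wi_0 = jnp.ones((num_experts, hidden, intermediate))
wi_0 = jax.device_put(wi_0, NamedSharding(mesh, P("expert", None, "tensor")))

# down_proj (wo) style weights stacked per expert, placed as ("expert", "tensor", None).
wo = jnp.ones((num_experts, intermediate, hidden))
wo = jax.device_put(wo, NamedSharding(mesh, P("expert", "tensor", None)))

print(wi_0.sharding.spec)  # PartitionSpec('expert', None, 'tensor')
print(wo.sharding.spec)    # PartitionSpec('expert', 'tensor', None)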

python/sgl_jax/srt/utils/weight_utils.py
Lines changed: 119 additions & 26 deletions

@@ -292,6 +292,9 @@ def load_weights_from_safetensors(
 
         nnx.update(self.model, params)
 
+        # Final verification: check all fused MoE layers
+        self._verify_fused_moe_weights(params, moe_mappings)
+
     def _process_single_moe_group(
         self,
         params: nnx.State,
@@ -358,42 +361,48 @@ def _process_fused_moe_group(
         }
         """
         target_path = mapping.target_path[0]
+        expected_hf_keys = mapping.target_path[1:]
 
         # Step 1: Process gate and up weights separately
+        # Use the predefined order from expected_hf_keys, not sorting
         gate_weights = []
         up_weights = []
 
-        # Process gate weights (w1)
-        for hf_key in sorted(grouped_weights["gate"].keys()):
-            weights = grouped_weights["gate"][hf_key]
+        gate_id, up_id = mapping.fuse_gate_up
 
-            # Concatenate TP shards
-            if mapping.concat_axis is not None and len(weights) > 1:
-                weight = jnp.concatenate(weights, axis=mapping.concat_axis)
-            else:
-                weight = weights[0]
+        # Separate expected keys into gate and up based on fuse_gate_up config
+        for hf_key in expected_hf_keys:
+            if gate_id in hf_key:
+                # This is a gate weight
+                weights = grouped_weights["gate"][hf_key]
 
-            # Transpose
-            if mapping.transpose:
-                weight = jnp.transpose(weight, (1, 0))
+                # Concatenate TP shards
+                if mapping.concat_axis is not None and len(weights) > 1:
+                    weight = jnp.concatenate(weights, axis=mapping.concat_axis)
+                else:
+                    weight = weights[0]
 
-            gate_weights.append(weight)
+                # Transpose
+                if mapping.transpose:
+                    weight = jnp.transpose(weight, (1, 0))
 
-        # Process up weights (w3)
-        for hf_key in sorted(grouped_weights["up"].keys()):
-            weights = grouped_weights["up"][hf_key]
+                gate_weights.append(weight)
 
-            # Concatenate TP shards
-            if mapping.concat_axis is not None and len(weights) > 1:
-                weight = jnp.concatenate(weights, axis=mapping.concat_axis)
-            else:
-                weight = weights[0]
+            elif up_id in hf_key:
+                # This is an up weight
+                weights = grouped_weights["up"][hf_key]
 
-            # Transpose
-            if mapping.transpose:
-                weight = jnp.transpose(weight, (1, 0))
+                # Concatenate TP shards
+                if mapping.concat_axis is not None and len(weights) > 1:
+                    weight = jnp.concatenate(weights, axis=mapping.concat_axis)
+                else:
+                    weight = weights[0]
 
-            up_weights.append(weight)
+                # Transpose
+                if mapping.transpose:
+                    weight = jnp.transpose(weight, (1, 0))
+
+                up_weights.append(weight)
 
         # Step 2: Stack to 3D tensors
         # gate_stacked: (num_experts, hidden_size, intermediate_size)
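
The rewritten loop walks mapping.target_path[1:] in its declared order and routes each HF key to the gate or up list by substring match against fuse_gate_up, instead of sorting key names. A toy sketch of that split followed by stacking into the fused (E, 2, H, I) layout; the key names, shapes, and the final jnp.stack step are illustrative assumptions (the actual Step 2/3 code sits outside this hunk).

import jax.numpy as jnp

num_experts, hidden, intermediate = 4, 8, 16
gate_id, up_id = ("gate_proj", "up_proj")

# Hypothetical per-expert HF weights, already transposed to (hidden, intermediate).
grouped = {
    f"model.layers.0.mlp.experts.{i}.{name}.weight": jnp.ones((hidden, intermediate)) * i
    for i in range(num_experts)
    for name in (gate_id, up_id)
}

# Expected key order: all gate keys first, then all up keys, as built in the mapping.
expected_hf_keys = [
    f"model.layers.0.mlp.experts.{i}.{name}.weight"
    for name in (gate_id, up_id)
    for i in range(num_experts)
]

gate_weights, up_weights = [], []
for hf_key in expected_hf_keys:
    if gate_id in hf_key:
        gate_weights.append(grouped[hf_key])
    elif up_id in hf_key:
        up_weights.append(grouped[hf_key])

# Stack each list to (E, H, I), then pair gate/up along a new axis -> (E, 2, H, I).
gate_stacked = jnp.stack(gate_weights, axis=0)
up_stacked = jnp.stack(up_weights, axis=0)
fused_w1 = jnp.stack([gate_stacked, up_stacked], axis=1)
print(fused_w1.shape)  # (4, 2, 8, 16)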
@@ -422,9 +431,24 @@
 
         # Step 5: Assign to model parameter
         model_param = self._get_param(params, target_path)
-        model_param.value = sharded_weight.astype(model_param.value.dtype)
+        original_dtype = model_param.value.dtype
+        expected_shape = model_param.value.shape
+
+        # Validate shape before assignment
+        if fused_weight.shape != expected_shape:
+            raise ValueError(
+                f"Fused MoE weight shape mismatch for {target_path}: "
+                f"expected {expected_shape}, got {fused_weight.shape}"
+            )
+
+        model_param.value = sharded_weight.astype(original_dtype)
 
-        logger.debug("Assigned fused MoE group %s, final shape: %s", moe_key, fused_weight.shape)
+        # Verify assignment was successful
+        actual_shape = model_param.value.shape
+        if actual_shape != expected_shape:
+            raise RuntimeError(
+                f"Failed to assign fused MoE weight to {target_path}: shape mismatch"
+            )
 
     def _load_dummy_weights(
         self,
@@ -1000,3 +1024,72 @@ def _is_excluded_layer_weight(self, hf_key: str) -> bool:
 
         layer_num = int(parts[2])
         return layer_num >= self.model_config.num_hidden_layers
+
+    def _verify_fused_moe_weights(
+        self, params: nnx.State, moe_mappings: dict[str, WeightMapping]
+    ) -> None:
+        """Verify that all fused MoE weights were loaded correctly."""
+        # Get all fused w1 mappings
+        fused_w1_mappings = {
+            k: v for k, v in moe_mappings.items() if getattr(v, "fuse_moe_weights", False)
+        }
+
+        # Get corresponding w2 mappings (same layer, but w2 instead of w1)
+        w2_mappings = {}
+        for k in fused_w1_mappings:
+            w2_key = k.replace(".w1", ".w2")
+            if w2_key in moe_mappings:
+                w2_mappings[w2_key] = moe_mappings[w2_key]
+
+        if not fused_w1_mappings:
+            return
+
+        all_verified = True
+        verified_count = 0
+
+        # Verify w1 and w2 weights
+        for _, mapping in fused_w1_mappings.items():
+            target_path = mapping.target_path[0]
+            try:
+                model_param = self._get_param(params, target_path)
+                weight_shape = model_param.value.shape
+                weight_values = model_param.value
+
+                if (
+                    len(weight_shape) != 4
+                    or weight_shape[1] != 2
+                    or jnp.all(weight_values == 0)
+                    or jnp.any(jnp.isnan(weight_values))
+                ):
+                    logger.error("✗ %s: Invalid or corrupted weights", target_path)
+                    all_verified = False
+                else:
+                    verified_count += 1
+            except (KeyError, AttributeError, ValueError) as e:
+                logger.error("✗ %s: Failed to access - %s", target_path, str(e))
+                all_verified = False
+
+        for _, mapping in w2_mappings.items():
+            target_path = mapping.target_path[0]
+            try:
+                model_param = self._get_param(params, target_path)
+                weight_shape = model_param.value.shape
+                weight_values = model_param.value
+
+                if (
+                    len(weight_shape) != 3
+                    or jnp.all(weight_values == 0)
+                    or jnp.any(jnp.isnan(weight_values))
+                ):
+                    logger.error("✗ %s (w2): Invalid or corrupted weights", target_path)
+                    all_verified = False
+                else:
+                    verified_count += 1
+            except (KeyError, AttributeError, ValueError) as e:
+                logger.error("✗ %s (w2): Failed to access - %s", target_path, str(e))
+                all_verified = False
+
+        if all_verified:
+            logger.info("✓ Fused MoE weights verified: %d layers", verified_count // 2)
+        else:
+            raise RuntimeError("Fused MoE weight verification failed")
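
The value checks in the verification above are plain array reductions; a tiny standalone illustration of the conditions they flag (shapes below are made up):

import jax.numpy as jnp

def looks_corrupted(w) -> bool:
    # Mirrors the value checks in _verify_fused_moe_weights: an all-zero buffer
    # (never assigned) or any NaN (bad load/cast) counts as corrupted.
    return bool(jnp.all(w == 0) or jnp.any(jnp.isnan(w)))

unassigned = jnp.zeros((2, 2, 4, 8))                         # placeholder never written
nan_poisoned = jnp.ones((2, 4, 8)).at[0, 0, 0].set(jnp.nan)  # one bad element
healthy = jnp.ones((2, 2, 4, 8))

print(looks_corrupted(unassigned))    # True
print(looks_corrupted(nan_poisoned))  # True
print(looks_corrupted(healthy))       # False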
