
Commit 57a995b

change fused moe kernel mesh
1 parent 2cf0141 commit 57a995b

File tree

9 files changed: +133 additions, -59 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -241,3 +241,4 @@ CLAUDE.md
 
 #gemini code
 .gemini-clipboard
+GEMINI.md

python/sgl_jax/srt/kernels/fused_moe/v1/kernel.py

Lines changed: 24 additions & 14 deletions
@@ -205,6 +205,7 @@ def _fused_ep_moe_kernel(
     top_k: int,
     renormalize_topk_logits: bool,
     ep_axis_name: str,
+    tp_axis_name: str,
     act_fn: str,
     subc_quant_wsz: int | None = None,
     # Kernel tuning params.
@@ -214,8 +215,8 @@ def _fused_ep_moe_kernel(
     bd2: int,  # Block size of hidden_size in w2.
     btc: int,  # Compute size of block tokens for active expert.
     bfc: int,  # Compute size of block intermediate_size.
-    bd1c: int,  # Compute size of block hidden_size.
-    bd2c: int,  # Compute size of block hidden_size.
+    bd1c: int,
+    bd2c: int,
 ):
     my_id = lax.axis_index(ep_axis_name)
     num_devices = lax.axis_size(ep_axis_name)
@@ -260,8 +261,8 @@ def _fused_ep_moe_kernel(
     num_bd2 = cdiv(hidden_size, bd2)
 
     def get_mesh_device_id(ep_rank):
-        dp_rank = jax.lax.axis_index("data")
-        return (dp_rank, ep_rank)
+        tp_rank = jax.lax.axis_index(tp_axis_name)
+        return (ep_rank, tp_rank)
 
     def sync_barrier():
         barrier_sem = pltpu.get_barrier_semaphore()
@@ -1104,6 +1105,7 @@ def _():
         "bd1c",
         "bd2c",
         "ep_axis_name",
+        "tp_axis_name",
     ],
 )
 def fused_ep_moe(
@@ -1134,12 +1136,12 @@ def fused_ep_moe(
     bfc: int,
     bd1c: int,
     bd2c: int,
-    ep_axis_name: str = "tensor",
+    ep_axis_name: str = "expert",
+    tp_axis_name: str = "tensor",
 ):
     # TODO(jevinjiang): move all these assertions to validation function.
     # Assert all other axes have length of 1
     assert len(mesh.shape) == 2, "Expect 2D mesh"
-    assert "data" in mesh.shape and mesh.shape["data"] == 1, "Expect data axis size of 1"
 
     ep_size = mesh.shape[ep_axis_name]
     num_devices = ep_size
@@ -1294,6 +1296,7 @@ def fused_ep_moe(
         top_k=top_k,
         renormalize_topk_logits=renormalize_topk_logits,
         ep_axis_name=ep_axis_name,
+        tp_axis_name=tp_axis_name,
         act_fn=act_fn,
         subc_quant_wsz=subc_quant_wsz,
         bt=bt,
@@ -1479,16 +1482,18 @@ def fused_ep_moe(
         mesh=mesh,
         in_specs=(
             P(ep_axis_name),  # tokens_hbm
-            P(ep_axis_name),  # w1_hbm
-            P(ep_axis_name),  # w2_hbm
-            None if w1_scale is None else P(ep_axis_name),  # w1_scale_hbm
-            None if w2_scale is None else P(ep_axis_name),  # w2_scale_hbm
-            None if b1 is None else P(ep_axis_name),  # b1_hbm
-            None if b2 is None else P(ep_axis_name),  # b2_hbm
+            P(ep_axis_name, None, None, tp_axis_name),  # w1_hbm
+            P(ep_axis_name, tp_axis_name, None),  # w2_hbm
+            (
+                None if w1_scale is None else P(ep_axis_name, None, None, None, tp_axis_name)
+            ),  # w1_scale_hbm
+            None if w2_scale is None else P(ep_axis_name, None, None, tp_axis_name),  # w2_scale_hbm
+            None if b1 is None else P(ep_axis_name, None, tp_axis_name),  # b1_hbm
+            None if b2 is None else P(ep_axis_name, tp_axis_name),  # b2_hbm
             P(ep_axis_name),  # gating_output_hbm
             P(),  # a2a_g_hbm
         ),
-        out_specs=P(ep_axis_name),
+        out_specs=P(ep_axis_name, None),
         check_vma=False,
     )
     def kernel(
@@ -1502,7 +1507,7 @@ def kernel(
         gating_output,
         a2a_g_hbm_scratch,
     ):
-        return fused_moe(
+        results = fused_moe(
             pltpu.with_memory_space_constraint(tokens, pltpu.HBM),  # tokens_hbm
             pltpu.with_memory_space_constraint(w1, pltpu.HBM),  # w1_hbm
             pltpu.with_memory_space_constraint(w2, pltpu.HBM),  # w2_hbm
@@ -1522,6 +1527,11 @@ def kernel(
             pltpu.with_memory_space_constraint(a2a_g_hbm_scratch, pltpu.HBM),  # a2a_g_hbm
         )
 
+        if tp_axis_name in mesh.axis_names:
+            results = jax.lax.psum(results, tp_axis_name)
+
+        return results
+
     a2a_g_hbm_scratch = pl.empty((num_experts, bt, t_packing, hidden_size // t_packing), t_dtype)
     results = kernel(
         tokens,
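
Note: the kernel now assumes a 2D device mesh with an expert-parallel axis and a tensor-parallel axis ("expert" and "tensor" by default) instead of the previous ("data", ep) layout, and psum-reduces each shard's partial result over the tensor axis. A minimal sketch of the mesh layout this expects; the 4x2 split over 8 devices is an assumption for illustration, not something fixed by this commit:

# Sketch only: assumes 8 devices arranged as (ep_size=4, tp_size=2).
import jax
import numpy as np
from jax.sharding import Mesh

devices = np.array(jax.devices()).reshape(4, 2)
mesh = Mesh(devices, axis_names=("expert", "tensor"))

# fused_ep_moe(..., ep_axis_name="expert", tp_axis_name="tensor") then expects
# tokens sharded over "expert", the intermediate dimension of w1/w2 sharded
# over "tensor", and reduces the per-shard outputs with psum over "tensor".
print(mesh.shape)  # {'expert': 4, 'tensor': 2}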

python/sgl_jax/srt/layers/fused_moe.py

Lines changed: 73 additions & 28 deletions
@@ -1,13 +1,17 @@
 """Fused MoE layer using optimized TPU kernel."""
 
+import logging
+
 import jax
 import jax.numpy as jnp
 from flax import nnx
-from jax.sharding import Mesh
+from jax.sharding import Mesh, NamedSharding
 from jax.sharding import PartitionSpec as P
 
 from sgl_jax.srt.kernels.fused_moe.v1.kernel import fused_ep_moe
 
+logger = logging.getLogger(__name__)
+
 
 def _get_default_tile_sizes(hidden_size: int, intermediate_size: int) -> dict[str, int]:
     """
@@ -168,6 +172,33 @@ def __init__(
         self.bd1c = bd1c
         self.bd2c = bd2c
 
+        logger.info(
+            "Initializing FusedEPMoE layer %d: num_experts=%d, "
+            "num_experts_per_tok=%d, ep_size=%d, "
+            "intermediate_dim=%d, activation=%s, "
+            "renormalize_topk_logits=%s",
+            layer_id,
+            num_experts,
+            num_experts_per_tok,
+            ep_size,
+            intermediate_dim,
+            activation,
+            renormalize_topk_logits,
+        )
+        logger.info(
+            "FusedEPMoE layer %d tile sizes: bt=%d, bf=%d, bd1=%d, bd2=%d, "
+            "btc=%d, bfc=%d, bd1c=%d, bd2c=%d",
+            layer_id,
+            bt,
+            bf,
+            bd1,
+            bd2,
+            btc,
+            bfc,
+            bd1c,
+            bd2c,
+        )
+
         # Initialize weights in fused format
         with jax.sharding.use_abstract_mesh(self.updated_mesh):
             self.w1 = nnx.Param(
@@ -203,32 +234,46 @@ def __call__(self, hidden_states: jax.Array, router_logits: jax.Array) -> jax.Ar
         """
         assert hidden_states.ndim == 2
 
-        # Call the fused kernel
-        # Note: ep_size > 1 is handled internally by the kernel via mesh
-        output = fused_ep_moe(
-            mesh=self.moe_mesh,
-            tokens=hidden_states,
-            w1=self.w1.value,
-            w2=self.w2.value,
-            gating_output=router_logits,
-            top_k=self.num_experts_per_tok,
-            renormalize_topk_logits=self.renormalize_topk_logits,
-            act_fn=self.activation,
-            # Tile sizes
-            bt=self.bt,
-            bf=self.bf,
-            bd1=self.bd1,
-            bd2=self.bd2,
-            btc=self.btc,
-            bfc=self.bfc,
-            bd1c=self.bd1c,
-            bd2c=self.bd2c,
-            # Optional parameters (not used in basic case)
-            subc_quant_wsz=None,
-            w1_scale=None,
-            w2_scale=None,
-            b1=None,
-            b2=None,
+        logger.debug(
+            "FusedEPMoE layer %d: Processing %d tokens with %d experts (top-%d)",
+            self.layer_id,
+            hidden_states.shape[0],
+            self.num_experts,
+            self.num_experts_per_tok,
         )
 
-        return output
+        with jax.sharding.use_abstract_mesh(self.updated_mesh):
+            hidden_states = jax.lax.with_sharding_constraint(
+                hidden_states, NamedSharding(self.updated_mesh, P("expert", None))
+            )
+
+            output = fused_ep_moe(
+                mesh=self.moe_mesh,
+                tokens=hidden_states,
+                w1=self.w1.value,
+                w2=self.w2.value,
+                gating_output=router_logits,
+                top_k=self.num_experts_per_tok,
+                renormalize_topk_logits=self.renormalize_topk_logits,
+                act_fn=self.activation,
+                # Tile sizes
+                bt=self.bt,
+                bf=self.bf,
+                bd1=self.bd1,
+                bd2=self.bd2,
+                btc=self.btc,
+                bfc=self.bfc,
+                bd1c=self.bd1c,
+                bd2c=self.bd2c,
+                # Optional parameters (not used in basic case)
+                subc_quant_wsz=None,
+                w1_scale=None,
+                w2_scale=None,
+                b1=None,
+                b2=None,
+            )
+
+        output_pspec = P(*([None] * (output.ndim)))
+        return jax.sharding.reshard(
+            output, jax.sharding.NamedSharding(self.original_mesh, output_pspec)
+        )
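
The layer now pins its input to the expert axis of the updated mesh before calling the kernel and reshards the output back onto the original mesh. A self-contained sketch of that constrain, compute, replicate pattern; the mesh construction, token shape, and the `* 2.0` stand-in for fused_ep_moe are illustrative assumptions, not sgl_jax code:

import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

# Assumed split: all local devices on the "expert" axis, "tensor" axis of size 1.
devices = np.array(jax.devices()).reshape(-1, 1)
mesh = Mesh(devices, axis_names=("expert", "tensor"))

@jax.jit
def moe_layer(tokens):
    # Pin tokens so each expert-parallel rank owns a contiguous block of rows.
    tokens = jax.lax.with_sharding_constraint(tokens, NamedSharding(mesh, P("expert", None)))
    out = tokens * 2.0  # stand-in for the fused kernel call
    # Hand the result back replicated, as the surrounding model expects.
    return jax.lax.with_sharding_constraint(out, NamedSharding(mesh, P(None, None)))

tokens = jnp.ones((mesh.shape["expert"] * 2, 16))  # (num_tokens, hidden), assumed sizes
print(moe_layer(tokens).sharding)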

python/sgl_jax/srt/model_executor/model_runner.py

Lines changed: 1 addition & 0 deletions
@@ -240,6 +240,7 @@ def load_model(self):
         self.model_config.configure_for_tensor_parallel(self.tp_size)
         self.model_config.log_kv_heads_info(self.tp_size)
         self.model_config.hf_config.ep_size = self.ep_size
+        self.model_config.hf_config.moe_backend = self.model_config.moe_backend.value
 
         self.model = self.model_loader.load_model(
             model_config=self.model_config,
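
This copies the MoE backend selection onto the HF config as a plain string so model code can branch on it. A small illustrative sketch of that enum-to-string handoff; MoeBackend and the SimpleNamespace config below are stand-ins, not sgl_jax classes:

from enum import Enum
from types import SimpleNamespace

class MoeBackend(Enum):
    FUSED = "fused"
    EPMOE = "epmoe"

hf_config = SimpleNamespace()
hf_config.moe_backend = MoeBackend.FUSED.value  # store the plain string, not the Enum

# Downstream layer code can then branch on the string value:
use_fused = getattr(hf_config, "moe_backend", "epmoe") == "fused"
print(use_fused)  # True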

python/sgl_jax/srt/models/bailing_moe.py

Lines changed: 2 additions & 2 deletions
@@ -612,7 +612,7 @@ def _create_moe_layer_mappings(self, layer_idx: int, is_mlp_layer: bool) -> dict
             [f"{prefix}.mlp.experts.{i}.{name}.weight" for i in range(num_experts)]
         )
 
-        mappings[f"__MOE_EXPERTS__{prefix}.mlp.experts.w1"] = WeightMapping(
+        mappings[f"__MOE_EXPERTS__{prefix}.mlp.w1"] = WeightMapping(
             target_path=target_path_w1,
             sharding=("expert", None, None, "tensor"),  # (E, 2, H, I/TP)
             transpose=True,
@@ -627,7 +627,7 @@ def _create_moe_layer_mappings(self, layer_idx: int, is_mlp_layer: bool) -> dict
             [f"{prefix}.mlp.experts.{i}.down_proj.weight" for i in range(num_experts)]
         )
 
-        mappings[f"__MOE_EXPERTS__{prefix}.mlp.experts.w2"] = WeightMapping(
+        mappings[f"__MOE_EXPERTS__{prefix}.mlp.w2"] = WeightMapping(
            target_path=target_path_w2,
             sharding=("expert", "tensor", None),  # (E, I/TP, H)
             transpose=True,
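
For reference, the sharding tuples in these mappings name one mesh axis per weight dimension. A sketch of what ("expert", None, None, "tensor") means for the fused w1 when expressed as a PartitionSpec; the mesh split and sizes below are assumptions, and WeightMapping internals are not reproduced here:

import jax
import jax.numpy as jnp
import numpy as np
from jax.sharding import Mesh, NamedSharding, PartitionSpec as P

devices = np.array(jax.devices()).reshape(-1, 1)  # assumed (expert, tensor) split
mesh = Mesh(devices, axis_names=("expert", "tensor"))

E = mesh.shape["expert"] * 2  # number of experts, assumed divisible by the expert axis
H, I = 64, 128                # hidden / intermediate sizes, assumed
w1 = jnp.zeros((E, 2, H, I))  # fused gate+up weights, global shape (E, 2, H, I)
# ("expert", None, None, "tensor"): experts split across "expert", the intermediate
# dimension split across "tensor", the gate/up and hidden dimensions replicated.
w1 = jax.device_put(w1, NamedSharding(mesh, P("expert", None, None, "tensor")))
print(w1.sharding.spec)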

python/sgl_jax/srt/models/grok.py

Lines changed: 14 additions & 8 deletions
@@ -17,6 +17,7 @@
     _yarn_find_correction_range,
     _yarn_get_mscale,
 )
+from sgl_jax.srt.layers.fused_moe import FusedEPMoE
 from sgl_jax.srt.layers.layernorm import RMSNorm, dual_rmsnorm_forward
 from sgl_jax.srt.layers.linear import LinearBase
 from sgl_jax.srt.layers.logits_processor import (
@@ -206,6 +207,8 @@ class Grok1MoE(nnx.Module):
     kernel is used for the forward pass, with outputs reduced across ranks.
     """
 
+    experts: FusedEPMoE | EPMoE
+
     def __init__(
         self,
         config: PretrainedConfig,
@@ -242,8 +245,6 @@ def __init__(
         self.use_fused = self.moe_backend == "fused"
 
         if self.use_fused:
-            from sgl_jax.srt.layers.fused_moe import FusedEPMoE
-
             self.experts = FusedEPMoE(
                 config=config,
                 num_experts=num_experts,
@@ -283,12 +284,14 @@ def __call__(self, hidden_states: jax.Array) -> jax.Array:
         if self.use_fused:
             # Fused kernel: pass router_logits directly
            # Top-K selection is handled internally by the kernel
+            assert isinstance(self.experts, FusedEPMoE)
             return self.experts(hidden_states, router_logits)
         else:
             # EPMoE: compute top-k routing weights using sglang-style approach:
             # 1. Compute global softmax over ALL experts (not just top-k)
             # 2. Select top-k experts based on logits
             # 3. Extract corresponding weights (no renormalization)
+            assert isinstance(self.experts, EPMoE)
             top_k_weights, top_k_indices = self._custom_topk(
                 router_logits, self.top_k, renormalize=False
             )
@@ -939,7 +942,7 @@ def _create_layer_mappings(self, layer_idx: int) -> dict[str, WeightMapping]:
         # w2: down(w2) -> (num_experts, intermediate, hidden)
 
         # 1. Fused w1 (gate + up)
-        target_path_w1 = [f"{target_prefix}.block_sparse_moe.experts.w1"]
+        target_path_w1 = [f"{target_prefix}.block_sparse_moe.w1"]
         # Add source keys for w1 (gate) and w3 (up)
         # Note: Grok experts are 0..N-1
         for name in ["w1", "w3"]:
@@ -950,7 +953,7 @@ def _create_layer_mappings(self, layer_idx: int) -> dict[str, WeightMapping]:
                 ]
             )
 
-        mappings[f"__MOE_EXPERTS__{prefix}.block_sparse_moe.experts.w1"] = WeightMapping(
+        mappings[f"__MOE_EXPERTS__{prefix}.block_sparse_moe.w1"] = WeightMapping(
             target_path=target_path_w1,
             sharding=("expert", None, None, "tensor"),  # (E, 2, H, I/TP)
             transpose=True,
@@ -960,16 +963,15 @@ def _create_layer_mappings(self, layer_idx: int) -> dict[str, WeightMapping]:
         )
 
         # 2. w2 (down)
-        target_path_w2 = [f"{target_prefix}.block_sparse_moe.experts.w2"]
+        target_path_w2 = [f"{target_prefix}.block_sparse_moe.w2"]
         target_path_w2.extend(
             [
                 f"{prefix}.block_sparse_moe.experts.{i}.w2.weight"
                 for i in range(self.config.num_local_experts)
             ]
         )
 
-
-        mappings[f"__MOE_EXPERTS__{prefix}.block_sparse_moe.experts.w2"] = WeightMapping(
+        mappings[f"__MOE_EXPERTS__{prefix}.block_sparse_moe.w2"] = WeightMapping(
             target_path=target_path_w2,
             sharding=("expert", "tensor", None),  # (E, I/TP, H)
             transpose=True,
@@ -987,7 +989,11 @@ def _create_layer_mappings(self, layer_idx: int) -> dict[str, WeightMapping]:
                 ]
             )
 
-            sharding = ("expert", "tensor", None) if target_name == "wo" else ("expert", None, "tensor")
+            sharding = (
+                ("expert", "tensor", None)
+                if target_name == "wo"
+                else ("expert", None, "tensor")
+            )
 
             if name == "w2":
                 # w2 (down_proj) -> wo
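
The class-level `experts: FusedEPMoE | EPMoE` annotation plus the `assert isinstance(...)` in each branch is the usual way to let a type checker narrow a union-typed attribute before calling branch-specific methods. A generic sketch of the pattern; FastImpl and SlowImpl are placeholders, not the real layers:

# Union-attribute narrowing with assert isinstance (placeholder classes).
class FastImpl:
    def run_fused(self, x: int) -> int:
        return x * 2

class SlowImpl:
    def run(self, x: int) -> int:
        return x + 1

class Block:
    experts: FastImpl | SlowImpl

    def __init__(self, use_fused: bool):
        self.use_fused = use_fused
        self.experts = FastImpl() if use_fused else SlowImpl()

    def __call__(self, x: int) -> int:
        if self.use_fused:
            assert isinstance(self.experts, FastImpl)  # narrows the union for the type checker
            return self.experts.run_fused(x)
        assert isinstance(self.experts, SlowImpl)
        return self.experts.run(x)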

python/sgl_jax/srt/models/qwen2_moe.py

Lines changed: 2 additions & 2 deletions
@@ -593,7 +593,7 @@ def _create_moe_layer_mappings(self, layer_idx: int) -> dict:
             [f"{prefix}.mlp.experts.{i}.{name}.weight" for i in range(num_experts)]
         )
 
-        mappings[f"__MOE_EXPERTS__{prefix}.mlp.experts.w1"] = WeightMapping(
+        mappings[f"__MOE_EXPERTS__{prefix}.mlp.w1"] = WeightMapping(
             target_path=target_path_w1,
             sharding=("expert", None, None, "tensor"),  # (E, 2, H, I/TP)
             transpose=True,
@@ -608,7 +608,7 @@ def _create_moe_layer_mappings(self, layer_idx: int) -> dict:
             [f"{prefix}.mlp.experts.{i}.down_proj.weight" for i in range(num_experts)]
         )
 
-        mappings[f"__MOE_EXPERTS__{prefix}.mlp.experts.w2"] = WeightMapping(
+        mappings[f"__MOE_EXPERTS__{prefix}.mlp.w2"] = WeightMapping(
             target_path=target_path_w2,
             sharding=("expert", "tensor", None),  # (E, I/TP, H)
             transpose=True,
