Commit a36b48b
[#5860][autodeploy] GPT-OSS MXFP4 support (#7451)
Signed-off-by: Frida Hou <[email protected]>
Signed-off-by: Fridah-nv <[email protected]>
1 parent c33f43e commit a36b48b

13 files changed: +1071 −5 lines changed

tensorrt_llm/_torch/auto_deploy/config/default.yaml

Lines changed: 5 additions & 0 deletions
@@ -29,6 +29,8 @@ transforms:
   ############################################################################################
   match_moe_pattern:
     stage: pattern_matcher
+  match_dense_moe_pattern:
+    stage: pattern_matcher
   match_repeat_kv:
     stage: pattern_matcher
   match_eager_attention:
@@ -64,13 +66,16 @@ transforms:
     stage: pattern_matcher
   quantize_nvfp4_moe:
     stage: pattern_matcher
+  quantize_mxfp4_moe:
+    stage: pattern_matcher
   # TODO: Infer sharding parameters (tp_size, row/column sharding) from the model config.
   detect_sharding:
     stage: sharding
     simple_shard_only: false
     use_sharding_from_factory: false
     support_partial_config: false
     sharding_dims: ['tp', 'ep', 'bmm']
+    requires_shape_prop: true
   # TODO: (hg) need to ensure run_shape_prop after sharding.
   sharding_transform_executor:
     stage: sharding

tensorrt_llm/_torch/auto_deploy/custom_ops/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -6,13 +6,15 @@
 from .flashinfer_rope import *
 from .linear import *
 from .mla import *
+from .mxfp4_moe import *
 from .quant import *
 from .rms_norm import *
 from .torch_attention import *
 from .torch_backend_attention import *
 from .torch_moe import *
 from .torch_quant import *
 from .torch_rope import *
+from .torch_router import *
 from .triton_attention import *
 from .triton_rope import *
 from .trtllm_moe import *
tensorrt_llm/_torch/auto_deploy/custom_ops/mxfp4_moe.py

Lines changed: 268 additions & 0 deletions
@@ -0,0 +1,268 @@ (new file)

# Triton-kernels-based MXFP4 MoE ops (GPT-OSS style) with routing, swizzling, and fused activation

from typing import Callable, Tuple

import torch
import torch.nn.functional as F

IS_TRITON_KERNELS_AVAILABLE = True
TRITON_KERNELS_UNAVAILABLE_REASON = ""

try:
    from triton_kernels.matmul_ogs import (
        FlexCtx,
        FnSpecs,
        FusedActivation,
        PrecisionConfig,
        matmul_ogs,
    )
    from triton_kernels.numerics import InFlexData
    from triton_kernels.routing import RoutingData, routing
    from triton_kernels.swiglu import swiglu_fn
    from triton_kernels.tensor import FP4, convert_layout, wrap_torch_tensor
    from triton_kernels.tensor_details import layout
    from triton_kernels.tensor_details.layout import StridedLayout

    from tensorrt_llm._torch.modules.fused_moe.fused_moe_triton import TritonEPRouter

except Exception as _e:
    IS_TRITON_KERNELS_AVAILABLE = False
    TRITON_KERNELS_UNAVAILABLE_REASON = f"{type(_e).__name__}: {_e}"

    FlexCtx = FnSpecs = FusedActivation = PrecisionConfig = matmul_ogs = None
    InFlexData = RoutingData = routing = swiglu_fn = None
    FP4 = convert_layout = wrap_torch_tensor = None
    layout = StridedLayout = None
    TritonEPRouter = None


# copied from transformers.integrations.mxfp4::swizzle_mxfp4 with minor modification
def _swizzle_mxfp4(w, w_scale):
    value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout(mx_axis=1)
    w = convert_layout(wrap_torch_tensor(w, dtype=FP4), value_layout, **value_layout_opts)
    w_scale = convert_layout(wrap_torch_tensor(w_scale), StridedLayout)
    return w, w_scale


RouteFn = Callable[[torch.Tensor], Tuple[RoutingData, torch.Tensor, torch.Tensor]]


def _prepare_weights_scales(
    hidden_size: int,
    gate_up_blocks: torch.Tensor,  # [E_local, 2I, H//32, 16] in uint8
    gate_up_scales: torch.Tensor,  # [E_local, 2I, H//32] in uint8
    down_blocks: torch.Tensor,  # [E_local, H, I//32, 16] in uint8
    down_scales: torch.Tensor,  # [E_local, H, I//32] in uint8
):
    local_experts = gate_up_blocks.size(0)
    intermediate_size = gate_up_blocks.shape[1] // 2

    # canonicalize shapes for swizzling (use last two dims as [K, N] style)
    gate_up_blocks = gate_up_blocks.view(local_experts, intermediate_size * 2, -1)
    triton_gate_up_w, gate_up_w_scale_raw = _swizzle_mxfp4(
        gate_up_blocks.transpose(-2, -1), gate_up_scales.transpose(-2, -1)
    )
    triton_gate_up_w.shape = torch.Size([local_experts, hidden_size, intermediate_size * 2])

    down_blocks = down_blocks.view(local_experts, -1, intermediate_size // 2)
    triton_down_w, down_w_scale_raw = _swizzle_mxfp4(
        down_blocks.transpose(-2, -1), down_scales.transpose(-2, -1)
    )
    triton_down_w.shape = torch.Size([local_experts, intermediate_size, hidden_size])

    return (
        triton_gate_up_w,
        gate_up_w_scale_raw,
        triton_down_w,
        down_w_scale_raw,
    )


def _run_mxfp4_mlp_core(
    hidden_states: torch.Tensor,  # [B, S, H] or [B*S, H]
    router_weight: torch.Tensor,
    router_bias: torch.Tensor,
    gate_up_blocks: torch.Tensor,
    gate_up_bias: torch.Tensor,
    gate_up_scales: torch.Tensor,
    alpha: float,
    limit: float,
    down_blocks: torch.Tensor,
    down_bias: torch.Tensor,
    down_scales: torch.Tensor,
    route_fn: RouteFn,  # injects routing variant
) -> torch.Tensor:
    """
    Shared core for both triton_mxfp4_moe and triton_mxfp4_moe_ep.
    - route_fn encapsulates the only difference: how we produce (routing_data, gather_idx, scatter_idx).
    """
    leading_shape = hidden_states.shape[:-1]
    hidden_size = hidden_states.shape[-1]
    x = hidden_states.reshape(-1, hidden_size)

    router_logits = F.linear(x, router_weight, router_bias)
    # route (global vs EP-aware)
    with torch.cuda.device(router_logits.device):
        routing_data, gather_idx, scatter_idx = route_fn(router_logits)

    (
        triton_gate_up_w,
        gate_up_w_scale_raw,
        triton_down_w,
        down_w_scale_raw,
    ) = _prepare_weights_scales(
        hidden_size, gate_up_blocks, gate_up_scales, down_blocks, down_scales
    )

    gate_pc = PrecisionConfig(
        weight_scale=gate_up_w_scale_raw, flex_ctx=FlexCtx(rhs_data=InFlexData())
    )
    down_pc = PrecisionConfig(
        weight_scale=down_w_scale_raw, flex_ctx=FlexCtx(rhs_data=InFlexData())
    )

    act = FusedActivation(
        FnSpecs("swiglu", swiglu_fn, ("alpha", "limit")), (float(alpha), float(limit)), 2
    )

    # gate_up (with SwiGLU fused)
    inter = matmul_ogs(
        x,
        triton_gate_up_w,
        gate_up_bias.to(torch.float32),
        routing_data,
        gather_indx=gather_idx,
        precision_config=gate_pc,
        gammas=None,
        fused_activation=act,
    )

    # down
    y = matmul_ogs(
        inter,
        triton_down_w,
        down_bias.to(torch.float32),
        routing_data,
        scatter_indx=scatter_idx,
        precision_config=down_pc,
        gammas=routing_data.gate_scal,
    )

    y = y.reshape(*leading_shape, hidden_size)
    return y


@torch.library.custom_op("auto_deploy::triton_mxfp4_moe", mutates_args=())
def triton_mxfp4_moe(
    hidden_states: torch.Tensor,  # [B, S, H] or [B*S, H]
    # router
    router_weight: torch.Tensor,  # [E, H]
    router_bias: torch.Tensor,  # [E]
    top_k: int,
    # gate_up path
    gate_up_blocks: torch.Tensor,  # [E, 2I, H//32, 16] in uint8
    gate_up_bias: torch.Tensor,  # [E, 2I]
    gate_up_scales: torch.Tensor,  # [E, 2I, H//32] in uint8
    alpha: float,
    limit: float,
    # down path
    down_blocks: torch.Tensor,  # [E, H, I//32, 16] in uint8
    down_bias: torch.Tensor,  # [E, H]
    down_scales: torch.Tensor,  # [E, H, I//32] in uint8
) -> torch.Tensor:
    def _global_route_fn(logits: torch.Tensor):
        return routing(logits, top_k)

    return _run_mxfp4_mlp_core(
        hidden_states,
        router_weight,
        router_bias,
        gate_up_blocks,
        gate_up_bias,
        gate_up_scales,
        alpha,
        limit,
        down_blocks,
        down_bias,
        down_scales,
        route_fn=_global_route_fn,
    )


@triton_mxfp4_moe.register_fake
def _mxfp4_mlp_fake(
    hidden_states: torch.Tensor,
    router_weight: torch.Tensor,
    router_bias: torch.Tensor,
    top_k: int,
    gate_up_blocks: torch.Tensor,
    gate_up_bias: torch.Tensor,
    gate_up_scales: torch.Tensor,
    alpha: float,
    limit: float,
    down_blocks: torch.Tensor,
    down_bias: torch.Tensor,
    down_scales: torch.Tensor,
):
    return torch.empty_like(hidden_states)


@torch.library.custom_op("auto_deploy::triton_mxfp4_moe_ep", mutates_args=())
def triton_mxfp4_moe_ep(
    hidden_states: torch.Tensor,  # [B, S, H] or [B*S, H]
    # router (replicated across EP)
    router_weight: torch.Tensor,  # [E_total, H]
    router_bias: torch.Tensor,  # [E_total]
    top_k: int,
    # expert params (already sharded along dim 0)
    gate_up_blocks: torch.Tensor,  # [E_local, 2I, H//32, 16] in uint8
    gate_up_bias: torch.Tensor,  # [E_local, 2I]
    gate_up_scales: torch.Tensor,  # [E_local, 2I, H//32] in uint8
    alpha: float,
    limit: float,
    down_blocks: torch.Tensor,  # [E_local, H, I//32, 16] in uint8
    down_bias: torch.Tensor,  # [E_local, H]
    down_scales: torch.Tensor,  # [E_local, H, I//32] in uint8
    # EP topology
    ep_size: int,
    ep_rank: int,
) -> torch.Tensor:
    triton_ep_router = TritonEPRouter()

    def _ep_route_fn(logits: torch.Tensor):
        return triton_ep_router(logits, top_k, ep=ep_size, node_idx=ep_rank)

    return _run_mxfp4_mlp_core(
        hidden_states,
        router_weight,
        router_bias,
        gate_up_blocks,
        gate_up_bias,
        gate_up_scales,
        alpha,
        limit,
        down_blocks,
        down_bias,
        down_scales,
        route_fn=_ep_route_fn,
    )


@triton_mxfp4_moe_ep.register_fake
def _mxfp4_mlp_ep_fake(
    hidden_states: torch.Tensor,
    router_weight: torch.Tensor,
    router_bias: torch.Tensor,
    top_k: int,
    gate_up_blocks: torch.Tensor,
    gate_up_bias: torch.Tensor,
    gate_up_scales: torch.Tensor,
    alpha: float,
    limit: float,
    down_blocks: torch.Tensor,
    down_bias: torch.Tensor,
    down_scales: torch.Tensor,
    ep_size: int,
    ep_rank: int,
):
    return torch.empty_like(hidden_states)
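
For context, a minimal shape-level sketch of invoking the fused op once the package's custom ops are imported. It assumes triton_kernels is installed and a CUDA device is present; the sizes, the alpha/limit values, and the randomly packed FP4 blocks are illustrative only (a scale byte of 127 is taken here to encode an E8M0 factor of 1.0), so the output is not numerically meaningful:

import torch

import tensorrt_llm._torch.auto_deploy.custom_ops  # noqa: F401 -- registers auto_deploy::triton_mxfp4_moe

B, S, H, I, E, TOP_K = 2, 4, 64, 64, 8, 2  # illustrative sizes; H and I are multiples of 32

hidden = torch.randn(B, S, H, dtype=torch.bfloat16, device="cuda")
router_w = torch.randn(E, H, dtype=torch.bfloat16, device="cuda")
router_b = torch.zeros(E, dtype=torch.bfloat16, device="cuda")

# MXFP4 packs two FP4 values per uint8; one E8M0 scale byte covers each 32-value block.
gate_up_blocks = torch.randint(0, 256, (E, 2 * I, H // 32, 16), dtype=torch.uint8, device="cuda")
gate_up_scales = torch.full((E, 2 * I, H // 32), 127, dtype=torch.uint8, device="cuda")
gate_up_bias = torch.zeros(E, 2 * I, dtype=torch.bfloat16, device="cuda")
down_blocks = torch.randint(0, 256, (E, H, I // 32, 16), dtype=torch.uint8, device="cuda")
down_scales = torch.full((E, H, I // 32), 127, dtype=torch.uint8, device="cuda")
down_bias = torch.zeros(E, H, dtype=torch.bfloat16, device="cuda")

out = torch.ops.auto_deploy.triton_mxfp4_moe(
    hidden, router_w, router_b, TOP_K,
    gate_up_blocks, gate_up_bias, gate_up_scales, 1.702, 7.0,
    down_blocks, down_bias, down_scales,
)
print(out.shape)  # torch.Size([2, 4, 64])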

tensorrt_llm/_torch/auto_deploy/custom_ops/torch_moe.py

Lines changed: 50 additions & 0 deletions
@@ -324,3 +324,53 @@ def torch_quant_nvfp4_moe_fake(
     w3_alpha: List[torch.Tensor],
 ) -> torch.Tensor:
     return torch.empty_like(x)
+
+
+# GPT-OSS uses this style
+@torch.library.custom_op("auto_deploy::torch_moe_dense_mlp", mutates_args=())
+def torch_moe_dense_mlp(
+    hidden_states: torch.Tensor,  # [B, S, H] or [B*S, H]
+    routing_weights: torch.Tensor,  # [B*S, E]
+    gate_up_w: torch.Tensor,  # [E, H, 2I]
+    gate_up_b: torch.Tensor,  # [E, 2I]
+    down_w: torch.Tensor,  # [E, I, H]
+    down_b: torch.Tensor,  # [E, H]
+    alpha: float = 1.0,
+    limit: float = 10.0,
+) -> torch.Tensor:
+    batch_size = hidden_states.shape[0]
+    leading_shape = hidden_states.shape[:-1]
+    hidden_size = hidden_states.shape[-1]
+    hidden_states = hidden_states.reshape(-1, hidden_size)  # (num_tokens, hidden_size)
+    num_experts = routing_weights.shape[1]
+
+    hidden_states = hidden_states.repeat(num_experts, 1)
+    hidden_states = hidden_states.view(num_experts, -1, hidden_size)
+    gate_up = torch.bmm(hidden_states, gate_up_w) + gate_up_b[..., None, :]
+    gate, up = gate_up[..., ::2], gate_up[..., 1::2]
+    gate = gate.clamp(min=None, max=limit)
+    up = up.clamp(min=-limit, max=limit)
+    glu = gate * torch.sigmoid(gate * alpha)
+    next_states = torch.bmm(((up + 1) * glu), down_w)
+    next_states = next_states + down_b[..., None, :]
+    next_states = next_states.view(num_experts, batch_size, -1, hidden_size)
+    next_states = (
+        next_states * routing_weights.transpose(0, 1).view(num_experts, batch_size, -1)[..., None]
+    )
+    next_states = next_states.sum(dim=0)
+    next_states = next_states.reshape(*leading_shape, hidden_size)
+    return next_states  # [B, S, H] or [B*S, H]
+
+
+@torch_moe_dense_mlp.register_fake
+def _torch_moe_dense_mlp_fake(
+    hidden_states: torch.Tensor,
+    routing_weights: torch.Tensor,
+    gate_up_w: torch.Tensor,
+    gate_up_b: torch.Tensor,
+    down_w: torch.Tensor,
+    down_b: torch.Tensor,
+    alpha: float = 1.0,
+    limit: float = 10.0,
+) -> torch.Tensor:
+    return torch.empty_like(hidden_states)
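
For context, a minimal sketch of calling the dense-MoE reference op on toy tensors. The sizes, the 0.02 weight scaling, and the alpha/limit values are illustrative only, and the import assumes the auto_deploy custom-op package is importable in the environment:

import torch

import tensorrt_llm._torch.auto_deploy.custom_ops  # noqa: F401 -- registers auto_deploy::torch_moe_dense_mlp

B, S, H, I, E = 2, 3, 8, 16, 4  # illustrative sizes
hidden = torch.randn(B, S, H)
routing = torch.softmax(torch.randn(B * S, E), dim=-1)  # dense weights over all experts
gate_up_w = torch.randn(E, H, 2 * I) * 0.02
gate_up_b = torch.zeros(E, 2 * I)
down_w = torch.randn(E, I, H) * 0.02
down_b = torch.zeros(E, H)

out = torch.ops.auto_deploy.torch_moe_dense_mlp(
    hidden, routing, gate_up_w, gate_up_b, down_w, down_b, alpha=1.702, limit=7.0
)
print(out.shape)  # torch.Size([2, 3, 8])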
