
Commit ba8abea

[OMNIML-2336][feat] add W4A8 NVFP4 FP8 fused moe (#7968)

Signed-off-by: Shiyang Chen <[email protected]>

1 parent b77f19f commit ba8abea

6 files changed: +312 -26 lines

tensorrt_llm/_torch/modules/fused_moe/create_moe.py  (1 addition, 0 deletions)

@@ -41,6 +41,7 @@ def get_moe_cls(
             quant_config.quant_mode.has_fp8_block_scales()
             or quant_config.quant_mode.has_nvfp4()
             or quant_config.quant_mode.has_w4a16_mxfp4()
+            or quant_config.quant_mode.has_w4a8_nvfp4_fp8()
             or quant_config.quant_mode.has_w4a8_mxfp4_fp8()
             or quant_config.quant_mode.has_w4a8_mxfp4_mxfp8()):
         return TRTLLMGenFusedMoE
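
The effect of this one-line change is that checkpoints whose quant mode reports W4A8 NVFP4 FP8 are now routed to the TRTLLMGen MoE backend. Below is a minimal, hypothetical sketch of that mode-driven dispatch; FakeQuantMode and uses_trtllm_gen_backend are illustrative stand-ins, not TensorRT-LLM APIs (the real check lives inside get_moe_cls and uses quant_config.quant_mode).

# Hypothetical stand-ins to illustrate the dispatch pattern only.
class FakeQuantMode:
    def __init__(self, mode: str):
        self._mode = mode
    def has_fp8_block_scales(self): return self._mode == "fp8_block_scales"
    def has_nvfp4(self): return self._mode == "nvfp4"
    def has_w4a16_mxfp4(self): return self._mode == "w4a16_mxfp4"
    def has_w4a8_nvfp4_fp8(self): return self._mode == "w4a8_nvfp4_fp8"
    def has_w4a8_mxfp4_fp8(self): return self._mode == "w4a8_mxfp4_fp8"
    def has_w4a8_mxfp4_mxfp8(self): return self._mode == "w4a8_mxfp4_mxfp8"

def uses_trtllm_gen_backend(quant_mode) -> bool:
    # Mirrors the condition in get_moe_cls after this commit.
    return (quant_mode.has_fp8_block_scales() or quant_mode.has_nvfp4()
            or quant_mode.has_w4a16_mxfp4() or quant_mode.has_w4a8_nvfp4_fp8()
            or quant_mode.has_w4a8_mxfp4_fp8() or quant_mode.has_w4a8_mxfp4_mxfp8())

assert uses_trtllm_gen_backend(FakeQuantMode("w4a8_nvfp4_fp8"))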

tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py  (46 additions, 3 deletions)

@@ -15,6 +15,7 @@
                           NVFP4TRTLLMGenFusedMoEMethod,
                           W4A8MXFP4FP8TRTLLMGenFusedMoEMethod,
                           W4A8MXFP4MXFP8TRTLLMGenFusedMoEMethod,
+                          W4A8NVFP4FP8TRTLLMGenFusedMoEMethod,
                           W4A16MXFP4TRTLLMGenFusedMoEMethod)
 from .routing import BaseMoeRoutingMethod, DeepSeekV3MoeRoutingMethod

@@ -111,7 +112,7 @@ def __init__(

     def _check_configs(self):
         assert self.has_deepseek_fp8_block_scales \
-            or self.has_nvfp4 or self.has_w4a16_mxfp4 \
+            or self.has_nvfp4 or self.has_w4a16_mxfp4 or self.has_w4a8_nvfp4_fp8 \
             or self.has_w4a8_mxfp4_fp8 or self.has_w4a8_mxfp4_mxfp8, "TRTLLMGenFusedMoE only supports fp8_block_scaling, nvfp4, w4a16_mxfp4, w4a8_mxfp4_fp8 and w4a8_mxfp4_mxfp8 dtypes."

         if self.bias or self.swiglu_alpha is not None or self.swiglu_beta is not None or self.swiglu_limit is not None:

@@ -125,6 +126,8 @@ def _get_quant_method(self):
             return NVFP4TRTLLMGenFusedMoEMethod()
         elif self.quant_config.layer_quant_mode.has_w4a16_mxfp4():
             return W4A16MXFP4TRTLLMGenFusedMoEMethod()
+        elif self.quant_config.layer_quant_mode.has_w4a8_nvfp4_fp8():
+            return W4A8NVFP4FP8TRTLLMGenFusedMoEMethod()
         elif self.quant_config.layer_quant_mode.has_w4a8_mxfp4_fp8():
             return W4A8MXFP4FP8TRTLLMGenFusedMoEMethod()
         elif self.quant_config.layer_quant_mode.has_w4a8_mxfp4_mxfp8():

@@ -147,8 +150,8 @@ def create_weights(self):
         self._weights_created = True
         self._check_configs()

-        # TODO: FIX this.
-        if (self.has_w4a16_mxfp4 or self.has_w4a8_mxfp4_fp8
+        if (self.has_w4a16_mxfp4 or self.has_w4a8_nvfp4_fp8
+                or self.has_w4a8_mxfp4_fp8
                 or self.has_w4a8_mxfp4_mxfp8) and not self.bias:
            self.w3_w1_bias = nn.Parameter(torch.zeros(
                (self.w3_w1_weight.shape[0], self.w3_w1_weight.shape[1]),

@@ -378,6 +381,46 @@ def forward_impl(
             )
             final_hidden_states = final_hidden_states[:, :self.
                                                       hidden_size].contiguous()
+        elif self.has_w4a8_nvfp4_fp8:
+
+            if not run_post_quant_allgather:
+                hidden_states_fp8, _ = torch.ops.tensorrt_llm.static_quantize_e4m3_per_tensor(
+                    x, 1.0 / self.fc31_input_scale)
+            else:
+                hidden_states_fp8 = x
+
+            outputs = torch.ops.trtllm.fp8_fp4_block_scale_moe_runner(
+                router_logits,
+                routing_bias,
+                hidden_states_fp8,
+                self.w3_w1_weight,
+                self.w3_w1_weight_scale.view(torch.float8_e4m3fn),
+                self.w2_weight,
+                self.w2_weight_scale.view(torch.float8_e4m3fn),
+                self.fc31_scale_c.data,
+                self.fc31_alpha.data,
+                self.fc2_alpha.data,
+                self.num_slots,
+                top_k,
+                n_group,
+                topk_group,
+                self.intermediate_size_per_partition,
+                self.slot_start,  # local_expert_start; use ep_rank if stride!=1
+                self.expert_size_per_partition,  # local_expert_size
+                routed_scaling_factor,
+                self.routing_method.routing_method_type,
+                do_finalize=do_finalize,
+                act_type=0,
+                topk_ids=token_selected_experts,
+                topk_weights=token_final_scales,
+            )
+
+            if not do_finalize:
+                assert not self.reduce_results, "reduce_results must be False when do_finalize is False"
+                return outputs
+            else:
+                final_hidden_states = outputs[0]
         elif self.has_w4a8_mxfp4_fp8:
             pad_size = self.w3_w1_weight.shape[-1] * 2 - x.shape[-1]
             if not run_post_quant_allgather:
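
In this new branch, when run_post_quant_allgather is False the activations are quantized to FP8 per tensor before the fused kernel runs. A minimal sketch of the presumed semantics of static_quantize_e4m3_per_tensor follows, assuming a PyTorch build with float8_e4m3fn; the convention that the second argument is an inverse scale applied by multiplication is an assumption, and the reference function is illustrative, not the actual fused op.

import torch

E4M3_MAX = 448.0  # largest finite magnitude representable in float8_e4m3fn

def quantize_e4m3_per_tensor_ref(x: torch.Tensor, inv_scale: float):
    # Assumed convention: multiply by the inverse of the calibrated input
    # scale, saturate to the e4m3 range, then cast. The real op also returns
    # the scale it used, mirrored here by returning inv_scale.
    q = (x.float() * inv_scale).clamp_(-E4M3_MAX, E4M3_MAX).to(torch.float8_e4m3fn)
    return q, inv_scale

x = torch.randn(4, 8)
fc31_input_scale = 0.05  # illustrative per-tensor activation scale
hidden_states_fp8, _ = quantize_e4m3_per_tensor_ref(x, 1.0 / fc31_input_scale)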

tensorrt_llm/_torch/modules/fused_moe/interface.py  (6 additions, 0 deletions)

@@ -301,6 +301,12 @@ def has_nvfp4(self):
         return self.quant_config is not None and self.quant_config.layer_quant_mode.has_nvfp4(
         )

+    @property
+    def has_w4a8_nvfp4_fp8(self):
+        assert self._weights_created
+        return self.quant_config is not None and self.quant_config.layer_quant_mode.has_w4a8_nvfp4_fp8(
+        )
+
     @property
     def has_w4a8_mxfp4_fp8(self):
         assert self._weights_created

tensorrt_llm/_torch/modules/fused_moe/quantization.py  (71 additions, 12 deletions)

@@ -96,7 +96,7 @@ def trtllmgen_maybe_get_cached_w3_w1_permute_indices(
                                        torch.Tensor],
         epilogue_tile_m: int,
         num_elts_per_sf: Union[None, int] = None) -> torch.Tensor:
-    key = (dst_w3_w1_weight.shape, "w31")
+    key = (dst_w3_w1_weight.shape, "w31", int(num_elts_per_sf or -1))
     if key not in cache_permute_indices:
         # Get permute indices and chain them together
         permute0 = get_reorder_rows_for_gated_act_gemm_row_indices(

@@ -122,7 +122,7 @@ def trtllmgen_maybe_get_cached_w2_permute_indices(
                                     torch.Tensor],
         epilogue_tile_m: int,
         num_elts_per_sf: Union[None, int] = None) -> torch.Tensor:
-    key = (dst_w2_weight.shape, "w2")
+    key = (dst_w2_weight.shape, "w2", int(num_elts_per_sf or -1))
     if key not in cache_permute_indices:
         if num_elts_per_sf is None:
             permute_indices = (get_shuffle_matrix_a_row_indices(
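
Adding num_elts_per_sf to these cache keys matters because the same process-wide cache is now queried for both 16-element (plain NVFP4) and 32-element (W4A8 NVFP4 FP8) scale blocks; keying on shape and tag alone would hand back indices computed for the other block width. A generic sketch of that keying pattern, with stand-in names rather than the real TensorRT-LLM helpers:

from typing import Dict, List, Optional, Tuple

_permute_cache: Dict[Tuple, List[int]] = {}

def cached_permute_indices(shape: Tuple[int, ...],
                           tag: str,
                           num_elts_per_sf: Optional[int] = None) -> List[int]:
    # Mirror `int(num_elts_per_sf or -1)` from the patch: -1 marks "no block scales".
    key = (shape, tag, int(num_elts_per_sf or -1))
    if key not in _permute_cache:
        # Placeholder; the real code derives row-permute indices for the kernel layout.
        _permute_cache[key] = list(range(shape[0]))
    return _permute_cache[key]

a = cached_permute_indices((256, 64), "w31", num_elts_per_sf=16)
b = cached_permute_indices((256, 64), "w31", num_elts_per_sf=32)
assert a is not b  # distinct cache entries, no stale reuse across block widths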

@@ -1478,11 +1478,15 @@ class NVFP4FusedMoEMethod(FusedMoEMethodBase):
     Base class for NVFP4 fused MoE methods for all backends.
     """

-    def create_weights(self, module: torch.nn.Module, weight_dtype,
-                       weight_vec_size, block_scales_dtype,
-                       block_scales_vec_size):
+    def create_weights(self,
+                       module: torch.nn.Module,
+                       weight_dtype,
+                       weight_vec_size,
+                       block_scales_dtype,
+                       block_scales_vec_size,
+                       scaling_vector_size=16):

-        module.scaling_vector_size = 16
+        module.scaling_vector_size = scaling_vector_size
         # Divide by 16 because we use int64 to pack 16 fp4 values
         w3_w1_weight_shape = (module.expert_size_per_partition,
                               module.intermediate_size_per_partition * 2,
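
Exposing scaling_vector_size lets the same weight-creation path lay out block scales for either the 16-element NVFP4 block or the 32-element block this W4A8 NVFP4 FP8 method passes in; only the number of scale factors along the hidden dimension changes. A small sketch of that arithmetic, using illustrative sizes rather than a real model config:

# One block scale covers `scaling_vector_size` consecutive elements along the
# hidden dimension, so the scale tensor narrows as the block widens.
hidden_size = 7168  # illustrative
for scaling_vector_size in (16, 32):
    scales_per_row = hidden_size // scaling_vector_size
    print(f"scaling_vector_size={scaling_vector_size}: {scales_per_row} block scales per row")
# scaling_vector_size=16: 448 block scales per row
# scaling_vector_size=32: 224 block scales per row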

@@ -1893,9 +1897,12 @@ def load_expert_w2_weight(self, module: torch.nn.Module,
                                non_blocking=True)

     def load_expert_w3_w1_weight_scale_nvfp4(
-            self, module: torch.nn.Module, w1_weight_scale: torch.Tensor,
+            self,
+            module: torch.nn.Module,
+            w1_weight_scale: torch.Tensor,
             w3_weight_scale: torch.Tensor,
-            dst_w3_w1_weight_scale: torch.Tensor):
+            dst_w3_w1_weight_scale: torch.Tensor,
+            num_elts_per_sf: int = 16):
         device = dst_w3_w1_weight_scale.device
         assert device.type == "cuda"
         w1_weight_scale = load_weight_shard(w1_weight_scale,

@@ -1933,7 +1940,7 @@ def load_expert_w3_w1_weight_scale_nvfp4(
             dst_w3_w1_weight_scale.view(float4_sf_dtype),
             self._cache_permute_indices,
             epilogue_tile_m,
-            num_elts_per_sf=16)
+            num_elts_per_sf=num_elts_per_sf)

         # Shuffle the weight according to permute indices
         w3_w1_weight_scale = torch.ops.trtllm.shuffle_matrix(

@@ -1949,9 +1956,11 @@ def load_expert_w3_w1_weight_scale_nvfp4(
             processed_w3_w1_weight_scale.view(
                 self.block_scales_dtype).reshape(orig_shape))

-    def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,
+    def load_expert_w2_weight_scale_nvfp4(self,
+                                          module: torch.nn.Module,
                                           w2_weight_scale: torch.Tensor,
-                                          dst_w2_weight_scale: torch.Tensor):
+                                          dst_w2_weight_scale: torch.Tensor,
+                                          num_elts_per_sf: int = 16):
         device = dst_w2_weight_scale.device
         assert device.type == "cuda"
         w2_weight_scale = load_weight_shard(w2_weight_scale,

@@ -1976,7 +1985,7 @@ def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,
             dst_w2_weight_scale.view(float4_sf_dtype),
             self._cache_permute_indices,
             epilogue_tile_m,
-            num_elts_per_sf=16)
+            num_elts_per_sf=num_elts_per_sf)

         # Shuffle the weight according to permute indices
         w_shuffled = torch.ops.trtllm.shuffle_matrix(

@@ -1998,6 +2007,56 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict):
                              non_blocking=True)


+class W4A8NVFP4FP8TRTLLMGenFusedMoEMethod(NVFP4TRTLLMGenFusedMoEMethod):
+
+    def create_weights(self, module: torch.nn.Module):
+        weight_vec_size = torch.iinfo(self.weight_dtype).bits // 4
+        block_scales_vec_size = 1
+
+        NVFP4FusedMoEMethod.create_weights(self, module, self.weight_dtype,
+                                           weight_vec_size,
+                                           self.block_scales_dtype,
+                                           block_scales_vec_size, 32)
+
+        fc31_scale_c = nn.Parameter(torch.ones(module.expert_size_per_partition,
+                                               dtype=torch.float32),
+                                    requires_grad=False)
+        module.register_parameter("fc31_scale_c", fc31_scale_c)
+
+        self.setup_quant_scales(module)
+
+    def load_expert_w3_w1_weight_scale_nvfp4(
+            self, module: torch.nn.Module, w1_weight_scale: torch.Tensor,
+            w3_weight_scale: torch.Tensor,
+            dst_w3_w1_weight_scale: torch.Tensor):
+        return super().load_expert_w3_w1_weight_scale_nvfp4(
+            module, w1_weight_scale, w3_weight_scale, dst_w3_w1_weight_scale,
+            32)
+
+    def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,
+                                          w2_weight_scale: torch.Tensor,
+                                          dst_w2_weight_scale: torch.Tensor):
+        return super().load_expert_w2_weight_scale_nvfp4(
+            module, w2_weight_scale, dst_w2_weight_scale, 32)
+
+    def load_all_fp4_weight_scales_and_alphas(
+            self, module: torch.nn.Module, weights: Dict,
+            load_expert_ids: List[int], dst_w3_w1_weight_scale: torch.Tensor,
+            dst_w2_weight_scale: torch.Tensor, dst_fc31_alpha: torch.Tensor,
+            dst_fc2_alpha: torch.Tensor):
+        super().load_all_fp4_weight_scales_and_alphas(
+            module, weights, load_expert_ids, dst_w3_w1_weight_scale,
+            dst_w2_weight_scale, dst_fc31_alpha, dst_fc2_alpha)
+        # The kernel we use will convert nvfp4 to e4m3 before matmul,
+        # so the range of the scale factor can only be [0,448/6].
+        dst_w3_w1_weight_scale.copy_((dst_w3_w1_weight_scale.to(torch.float32) /
+                                      6.0).to(torch.float8_e4m3fn))
+        dst_w2_weight_scale.copy_((dst_w2_weight_scale.to(torch.float32) /
+                                   6.0).to(torch.float8_e4m3fn))
+        dst_fc31_alpha.copy_(dst_fc31_alpha * 6.0)
+        dst_fc2_alpha.copy_(dst_fc2_alpha * 6.0)
+
+
 def _get_weight_alignment(weight_alignment, scaling_vector_size, tp_size,
                           shard_dim_size):
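
As the in-code comment above notes, the kernel converts the NVFP4 weights to e4m3 before the matmul, so the block scale factors must stay within [0, 448/6]; dividing each scale by 6 while multiplying the matching alpha by 6 keeps the overall dequantization factor unchanged up to FP8 rounding of the rescaled scale. A small numeric check of that invariant, with illustrative values rather than real checkpoint data:

import torch

# Illustrative values, not taken from a real checkpoint.
block_scale = torch.tensor(288.0)  # NVFP4 block scale as produced by the base loader
alpha = torch.tensor(0.02)         # per-expert alpha paired with it

# Rescaling applied by load_all_fp4_weight_scales_and_alphas above.
block_scale_fp8 = (block_scale / 6.0).to(torch.float8_e4m3fn)  # 48.0, within [0, 448/6]
alpha_adjusted = alpha * 6.0

# The combined dequantization factor is preserved (exactly here, since 48 is
# representable in e4m3; in general only up to FP8 rounding of the scale).
assert torch.allclose(block_scale * alpha, block_scale_fp8.float() * alpha_adjusted)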
