Commit bf5c6a7

[FMDL-1222][feat] Support weight and weight_scale padding for NVFP4 MoE cutlass
Signed-off-by: Wanli Jiang <[email protected]>
1 parent 004299a commit bf5c6a7

File tree

1 file changed

+195 −32 lines changed

tensorrt_llm/_torch/modules/fused_moe/quantization.py

Lines changed: 195 additions & 32 deletions
@@ -1548,6 +1548,42 @@ class NVFP4FusedMoEMethod(FusedMoEMethodBase):
     Base class for NVFP4 fused MoE methods for all backends.
     """
 
+    def get_weights_shapes(self, module: torch.nn.Module, weight_vec_size: int,
+                           block_scales_vec_size: int):
+        # Divide by 16 because we use int64 to pack 16 fp4 values
+        w3_w1_weight_shape = (module.expert_size_per_partition,
+                              module.intermediate_size_per_partition *
+                              module.intermediate_size_expand_ratio,
+                              module.hidden_size // weight_vec_size)
+        w2_weight_shape = (module.expert_size_per_partition, module.hidden_size,
+                           module.intermediate_size_per_partition //
+                           weight_vec_size)
+
+        w3_w1_weight_scale_shape = (module.expert_size_per_partition,
+                                    module.intermediate_size_per_partition *
+                                    module.intermediate_size_expand_ratio,
+                                    module.hidden_size //
+                                    module.scaling_vector_size //
+                                    block_scales_vec_size)
+        w2_weight_scale_shape = (module.expert_size_per_partition,
+                                 module.hidden_size,
+                                 module.intermediate_size_per_partition //
+                                 module.scaling_vector_size //
+                                 block_scales_vec_size)
+
+        if module.bias:
+            w3_w1_bias_shape = (module.expert_size_per_partition,
+                                module.intermediate_size_per_partition *
+                                module.intermediate_size_expand_ratio)
+            w2_bias_shape = (module.expert_size_per_partition,
+                             module.hidden_size)
+        else:
+            w3_w1_bias_shape = None
+            w2_bias_shape = None
+
+        return (w3_w1_weight_shape, w2_weight_shape, w3_w1_bias_shape,
+                w2_bias_shape, w3_w1_weight_scale_shape, w2_weight_scale_shape)
+
     def create_weights(self,
                        module: torch.nn.Module,
                        weight_dtype,
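For reference, the shape arithmetic in the new get_weights_shapes helper hinges on packing 16 fp4 values into one int64 element and 4 fp8 block scales into one int32 element. The standalone sketch below recomputes the fused w3/w1 shapes for made-up sizes; the block-scale vector-size derivation is an assumption based on the in-code comments, and none of the dimensions come from a real model config.

import torch

# Made-up sizes, for illustration only.
expert_size_per_partition = 8
intermediate_size_per_partition = 2048
intermediate_size_expand_ratio = 2      # w3 and w1 stacked in one buffer
hidden_size = 4096
scaling_vector_size = 16                # one fp8 block scale per 16 fp4 values

weight_vec_size = torch.iinfo(torch.int64).bits // 4        # 16 fp4 per int64
block_scales_vec_size = torch.iinfo(torch.int32).bits // 8  # 4 fp8 per int32

w3_w1_weight_shape = (expert_size_per_partition,
                      intermediate_size_per_partition * intermediate_size_expand_ratio,
                      hidden_size // weight_vec_size)
w3_w1_weight_scale_shape = (expert_size_per_partition,
                            intermediate_size_per_partition * intermediate_size_expand_ratio,
                            hidden_size // scaling_vector_size // block_scales_vec_size)

print(w3_w1_weight_shape)        # (8, 4096, 256)
print(w3_w1_weight_scale_shape)  # (8, 4096, 64)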
@@ -1557,35 +1593,23 @@ def create_weights(self,
                        scaling_vector_size=16):
 
         module.scaling_vector_size = scaling_vector_size
-        # Divide by 16 because we use int64 to pack 16 fp4 values
-        w3_w1_weight_shape = (module.expert_size_per_partition,
-                              module.intermediate_size_per_partition *
-                              module.intermediate_size_expand_ratio,
-                              module.hidden_size // weight_vec_size)
-        w2_weight_shape = (module.expert_size_per_partition, module.hidden_size,
-                           module.intermediate_size_per_partition //
-                           weight_vec_size)
+
+        (w3_w1_weight_shape, w2_weight_shape, w3_w1_bias_shape, w2_bias_shape,
+         w3_w1_weight_scale_shape,
+         w2_weight_scale_shape) = self.get_weights_shapes(
+             module, weight_vec_size, block_scales_vec_size)
 
         # Divide by 4 because we use int32 to pack 4 fp8 values
         # column parallel
-        w3_w1_weight_scale = nn.Parameter(
-            torch.ones(module.expert_size_per_partition,
-                       module.intermediate_size_per_partition *
-                       module.intermediate_size_expand_ratio,
-                       module.hidden_size // module.scaling_vector_size //
-                       block_scales_vec_size,
-                       dtype=block_scales_dtype),
-            requires_grad=False)
+        w3_w1_weight_scale = nn.Parameter(torch.ones(w3_w1_weight_scale_shape,
+                                                     dtype=block_scales_dtype),
+                                          requires_grad=False)
         module.register_parameter("w3_w1_weight_scale", w3_w1_weight_scale)
 
         # row parallel
-        w2_weight_scale = nn.Parameter(
-            torch.ones(module.expert_size_per_partition,
-                       module.hidden_size,
-                       module.intermediate_size_per_partition //
-                       module.scaling_vector_size // block_scales_vec_size,
-                       dtype=block_scales_dtype),
-            requires_grad=False)
+        w2_weight_scale = nn.Parameter(torch.ones(w2_weight_scale_shape,
+                                                  dtype=block_scales_dtype),
+                                       requires_grad=False)
         module.register_parameter("w2_weight_scale", w2_weight_scale)
 
         fc31_input_scale = nn.Parameter(torch.tensor(1., dtype=torch.float32),
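The refactor above also relies on torch.ones accepting a precomputed shape tuple as well as unpacked dimensions, so building each block-scale parameter from the tuple returned by get_weights_shapes preserves the old behavior. A small sketch with invented dimensions and an int32 stand-in dtype:

import torch
from torch import nn

# Invented dimensions and dtype, for illustration only.
shape = (8, 4096, 64)
a = torch.ones(8, 4096, 64, dtype=torch.int32)  # old style: unpacked dims
b = torch.ones(shape, dtype=torch.int32)        # new style: one shape tuple
assert a.shape == b.shape

# requires_grad=False lets nn.Parameter wrap a non-floating-point tensor.
scale_param = nn.Parameter(b, requires_grad=False)
print(scale_param.shape)  # torch.Size([8, 4096, 64])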
@@ -1606,8 +1630,12 @@ def create_weights(self,
                                  requires_grad=False)
         module.register_parameter("fc2_alpha", fc2_alpha)
 
-        super().create_weights(module, weight_dtype, w3_w1_weight_shape,
-                               w2_weight_shape)
+        super().create_weights(module,
+                               weight_dtype,
+                               w3_w1_weight_shape=w3_w1_weight_shape,
+                               w2_weight_shape=w2_weight_shape,
+                               w3_w1_bias_shape=w3_w1_bias_shape,
+                               w2_bias_shape=w2_bias_shape)
 
         self.setup_quant_scales(module)
 
@@ -1816,6 +1844,55 @@ def setup_quant_scales(self, module: torch.nn.Module):
 class NVFP4CutlassFusedMoEMethod(NVFP4FusedMoEMethod):
     weight_dtype = FUSED_MOE_NVFP4_WEIGHT_DTYPE
     block_scales_dtype = FUSED_MOE_NVFP4_WEIGHT_BLOCK_SCALE_DTYPE
+    NVFP4_ROW_ALIGNMENT = 128
+    NVFP4_COL_ALIGNMENT = 4
+
+    def get_weights_shapes(self, module: torch.nn.Module, weight_vec_size: int,
+                           block_scales_vec_size: int):
+        """Override the base method to get aligned weights shapes for Cutlass nvfp4 alignment."""
+        intermediate_size_expand = module.intermediate_size_per_partition * module.intermediate_size_expand_ratio
+        intermediate_size_expand_aligned = (
+            intermediate_size_expand + self.NVFP4_ROW_ALIGNMENT -
+            1) // self.NVFP4_ROW_ALIGNMENT * self.NVFP4_ROW_ALIGNMENT
+
+        if module.hidden_size % self.NVFP4_COL_ALIGNMENT != 0:
+            raise ValueError(
+                f"hidden_size {module.hidden_size} must be divisible by {self.NVFP4_COL_ALIGNMENT}"
+            )
+        hidden_size_aligned = module.hidden_size
+
+        w3_w1_weight_shape = (module.expert_size_per_partition,
+                              intermediate_size_expand_aligned,
+                              hidden_size_aligned // weight_vec_size)
+        w2_weight_shape = (module.expert_size_per_partition,
+                           hidden_size_aligned,
+                           intermediate_size_expand_aligned //
+                           module.intermediate_size_expand_ratio //
+                           weight_vec_size)
+
+        w3_w1_weight_scale_shape = (module.expert_size_per_partition,
+                                    intermediate_size_expand_aligned,
+                                    hidden_size_aligned //
+                                    module.scaling_vector_size //
+                                    block_scales_vec_size)
+        w2_weight_scale_shape = (module.expert_size_per_partition,
+                                 hidden_size_aligned,
+                                 intermediate_size_expand_aligned //
+                                 module.intermediate_size_expand_ratio //
+                                 module.scaling_vector_size //
+                                 block_scales_vec_size)
+
+        if module.bias:
+            w3_w1_bias_shape = (module.expert_size_per_partition,
+                                intermediate_size_expand_aligned)
+            w2_bias_shape = (module.expert_size_per_partition,
+                             hidden_size_aligned)
+        else:
+            w3_w1_bias_shape = None
+            w2_bias_shape = None
+
+        return (w3_w1_weight_shape, w2_weight_shape, w3_w1_bias_shape,
+                w2_bias_shape, w3_w1_weight_scale_shape, w2_weight_scale_shape)
 
     def create_weights(self, module: torch.nn.Module):
         weight_vec_size = torch.iinfo(self.weight_dtype).bits // 4
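The intermediate_size_expand_aligned expression above is the standard round-up-to-multiple (ceil-division) idiom applied against NVFP4_ROW_ALIGNMENT; hidden_size is only validated, not padded, since it must already be a multiple of NVFP4_COL_ALIGNMENT. A tiny worked sketch with invented sizes:

NVFP4_ROW_ALIGNMENT = 128

def round_up(value: int, alignment: int) -> int:
    # (value + alignment - 1) // alignment * alignment rounds `value` up to
    # the nearest multiple of `alignment`.
    return (value + alignment - 1) // alignment * alignment

# Invented example: intermediate_size_per_partition = 1440, expand ratio = 2.
intermediate_size_expand = 1440 * 2                             # 2880
print(round_up(intermediate_size_expand, NVFP4_ROW_ALIGNMENT))  # 2944
print(round_up(2816, NVFP4_ROW_ALIGNMENT))                      # 2816, already aligned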
@@ -1842,19 +1919,34 @@ def load_expert_w3_w1_weight_scale_nvfp4(
                                              device=device)
         # Keep weights in device buffer
         # w3
-        split_length = module.intermediate_size_per_partition * module.intermediate_size_expand_ratio // 2
+        split_length = dst_w3_w1_weight_scale.shape[0] // 2
         dst_w3_weight_scale = dst_w3_w1_weight_scale.narrow(dim=0,
                                                             start=0,
                                                             length=split_length)
-        dst_w3_weight_scale.copy_(
-            w3_weight_scale.view(dst_w3_weight_scale.dtype))
+        cast_w3_weight_scale = w3_weight_scale.view(dst_w3_weight_scale.dtype)
+
+        dst_w3_row, dst_w3_col = dst_w3_weight_scale.shape
+        _w3_row, _w3_col = cast_w3_weight_scale.shape
+        if _w3_row != dst_w3_row or _w3_col != dst_w3_col:
+            cast_w3_weight_scale = torch.nn.functional.pad(
+                cast_w3_weight_scale,
+                (0, dst_w3_col - _w3_col, 0, dst_w3_row - _w3_row), "constant",
+                0)
+        dst_w3_weight_scale.copy_(cast_w3_weight_scale)
 
         # w1
         dst_w1_weight_scale = dst_w3_w1_weight_scale.narrow(dim=0,
                                                             start=split_length,
                                                             length=split_length)
-        dst_w1_weight_scale.copy_(
-            w1_weight_scale.view(dst_w1_weight_scale.dtype))
+        dst_w1_row, dst_w1_col = dst_w1_weight_scale.shape
+        cast_w1_weight_scale = w1_weight_scale.view(dst_w1_weight_scale.dtype)
+        _w1_row, _w1_col = cast_w1_weight_scale.shape
+        if _w1_row != dst_w1_row or _w1_col != dst_w1_col:
+            cast_w1_weight_scale = torch.nn.functional.pad(
+                cast_w1_weight_scale,
+                (0, dst_w1_col - _w1_col, 0, dst_w1_row - _w1_row), "constant",
+                0)
+        dst_w1_weight_scale.copy_(cast_w1_weight_scale)
 
         orig_shape = dst_w3_w1_weight_scale.shape
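The padding logic above follows torch.nn.functional.pad's convention for 2-D tensors: the pad tuple is ordered (left, right, top, bottom), with the last dimension listed first, so zero columns and rows are appended on the right and bottom until the shard matches the destination buffer. A self-contained sketch with toy shapes standing in for a sharded weight scale and its padded destination (all values invented):

import torch
import torch.nn.functional as F

src = torch.arange(6, dtype=torch.float32).reshape(2, 3)  # unpadded shard
dst_rows, dst_cols = 4, 5                                  # padded buffer shape

# Last dimension first: pad 2 columns on the right, then 2 rows at the bottom.
padded = F.pad(src,
               (0, dst_cols - src.shape[1], 0, dst_rows - src.shape[0]),
               "constant", 0)

assert padded.shape == (dst_rows, dst_cols)
print(padded)  # original 2x3 block in the top-left corner, zeros elsewhere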

@@ -1876,9 +1968,19 @@ def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,
                                              module.tp_rank,
                                              TensorParallelMode.ROW,
                                              device=device)
+
+        cast_w2_weight_scale = w2_weight_scale.view(dst_w2_weight_scale.dtype)
+        dst_row, dst_col = dst_w2_weight_scale.shape
+        _row, _col = cast_w2_weight_scale.shape
+        if _row != dst_row or _col != dst_col:
+            cast_w2_weight_scale = torch.nn.functional.pad(
+                cast_w2_weight_scale,
+                (0, dst_col - _col, 0,
+                 dst_row - _row),  # (left, right, top, bottom)
+                "constant",
+                0)
         # Keep weights in device buffer
-        dst_w2_weight_scale.copy_(
-            w2_weight_scale.view(dst_w2_weight_scale.dtype))
+        dst_w2_weight_scale.copy_(cast_w2_weight_scale)
 
         orig_shape = dst_w2_weight_scale.shape
 
@@ -1890,6 +1992,67 @@ def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,
 
         dst_w2_weight_scale.copy_(dst_w2_weight_scale_interleaved)
 
+    def load_expert_w3_w1_weight(self, module: torch.nn.Module,
+                                 w1_weight: torch.Tensor,
+                                 w3_weight: torch.Tensor,
+                                 dst_w3_w1_weight: torch.Tensor):
+        """Load and pad w1 and w3 weights for each expert, to match shape requirements for Cutlass nvfp4 alignment."""
+        device = dst_w3_w1_weight.device
+        w1_weight_shard = load_weight_shard(w1_weight,
+                                            module.tp_size,
+                                            module.tp_rank,
+                                            TensorParallelMode.COLUMN,
+                                            device=device)
+        w3_weight_shard = load_weight_shard(w3_weight,
+                                            module.tp_size,
+                                            module.tp_rank,
+                                            TensorParallelMode.COLUMN,
+                                            device=device)
+
+        cast_w1_weight_shard = w1_weight_shard.view(dst_w3_w1_weight.dtype)
+        cast_w3_weight_shard = w3_weight_shard.view(dst_w3_w1_weight.dtype)
+
+        dst_row, dst_col = dst_w3_w1_weight.shape
+        _w1_row, _w1_col = cast_w1_weight_shard.shape
+        _w3_row, _w3_col = cast_w3_weight_shard.shape
+        assert _w1_row == _w3_row and _w1_col == _w3_col, "w1 and w3 weights must have the same shape"
+        assert dst_row % 2 == 0, "dst_w3_w1_weight must have even number of rows"
+        if _w1_row != dst_row // 2 or _w1_col != dst_col:
+            _pad_row = dst_row // 2 - _w1_row
+            _pad_col = dst_col - _w1_col
+            cast_w1_weight_shard = torch.nn.functional.pad(
+                cast_w1_weight_shard, (0, _pad_col, 0, _pad_row), "constant", 0)
+            cast_w3_weight_shard = torch.nn.functional.pad(
+                cast_w3_weight_shard, (0, _pad_col, 0, _pad_row), "constant", 0)
+
+        cast_w31_weight_shard = torch.cat(
+            [cast_w3_weight_shard, cast_w1_weight_shard], dim=0)
+        dst_w3_w1_weight.copy_(cast_w31_weight_shard, non_blocking=True)
+
+    def load_expert_w2_weight(self, module: torch.nn.Module,
+                              w2_weight: torch.Tensor,
+                              dst_w2_weight: torch.Tensor):
+        """Load and pad w2 weight for each expert, to match shape requirements for Cutlass nvfp4 alignment."""
+        device = dst_w2_weight.device
+        w2_weight_shard = load_weight_shard(w2_weight,
+                                            module.tp_size,
+                                            module.tp_rank,
+                                            TensorParallelMode.ROW,
+                                            device=device)
+        cast_w2_weight_shard = w2_weight_shard.view(dst_w2_weight.dtype)
+
+        dst_row, dst_col = dst_w2_weight.shape
+        _row, _col = cast_w2_weight_shard.shape
+        if _row != dst_row or _col != dst_col:
+            cast_w2_weight_shard = torch.nn.functional.pad(
+                cast_w2_weight_shard,
+                (0, dst_col - _col, 0,
+                 dst_row - _row),  # (left, right, top, bottom)
+                "constant",
+                0)
+
+        dst_w2_weight.copy_(cast_w2_weight_shard, non_blocking=True)
+
 
 class NVFP4TRTLLMGenFusedMoEMethod(NVFP4FusedMoEMethod):
     weight_dtype = float4_sf_dtype
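The new load_expert_w3_w1_weight path boils down to a pad-then-concatenate pattern: each shard is zero-padded to half the destination rows (and the full destination width), w3 is stacked on top of w1, and the result is copied into the fused buffer. A self-contained sketch of that pattern with dummy tensors, leaving out the tensor-parallel sharding and dtype reinterpretation (all shapes invented):

import torch
import torch.nn.functional as F

w3 = torch.randn(6, 8)            # unpadded w3 shard
w1 = torch.randn(6, 8)            # unpadded w1 shard
dst = torch.zeros(16, 8)          # fused buffer: 2 * 8 padded rows

pad_rows = dst.shape[0] // 2 - w3.shape[0]
pad_cols = dst.shape[1] - w3.shape[1]
w3_padded = F.pad(w3, (0, pad_cols, 0, pad_rows), "constant", 0)
w1_padded = F.pad(w1, (0, pad_cols, 0, pad_rows), "constant", 0)

# w3 occupies the top half of the buffer, w1 starts at the padded midpoint.
dst.copy_(torch.cat([w3_padded, w1_padded], dim=0))
assert torch.equal(dst[:6], w3)
assert torch.equal(dst[8:14], w1)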
