
Commit 357d20f

[mxfp8 moe training] add triton kernel for mxfp8 dequantization
stack-info: PR: #3195, branch: danielvegamyhre/stack/78
1 parent b644211 commit 357d20f
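
The kernel dequantizes mxfp8 tensors (float8_e4m3fn data plus float8_e8m0fnu block scales, block size 32) back to a high-precision dtype along dim0. A minimal round-trip sketch, assuming only the call signatures that appear in the diffs below (to_mx returning scales then data; triton_mxfp8_dequant_dim0 taking data, scales, target dtype, block size); the tests skip below CUDA capability 10.0:

import torch
from torchao.prototype.mx_formats.mx_tensor import to_dtype, to_mx
from torchao.prototype.mx_formats.kernels import triton_mxfp8_dequant_dim0

block_size = 32
x = torch.randn(8192, 7168, dtype=torch.bfloat16, device="cuda")

# Quantize to mxfp8: e8m0 block scales + e4m3 data (same call the benchmark below uses)
e8m0_scales, e4m3_data = to_mx(x, torch.float8_e4m3fn, block_size)

# Dequantize with the new triton kernel and compare against the existing to_dtype reference
x_triton = triton_mxfp8_dequant_dim0(e4m3_data, e8m0_scales, torch.bfloat16, block_size)
x_ref = to_dtype(e4m3_data, e8m0_scales, torch.float8_e4m3fn, block_size, torch.bfloat16)
torch.testing.assert_close(x_triton, x_ref, rtol=0, atol=0)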

File tree: 6 files changed, +333 -34 lines


benchmarks/prototype/moe_training/mxfp8/bench_all_to_all_v.py

Lines changed: 0 additions & 1 deletion
@@ -84,7 +84,6 @@ def default_a2a_fwd_bwd(
 
     loss = F.mse_loss(routed_input, labels)
     loss.backward()
-
     torch.cuda.synchronize()
     return routed_input

Lines changed: 170 additions & 0 deletions
@@ -0,0 +1,170 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.
# this benchmarking script is a modified version of the original script from: https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/utils/benchmark.py

from dataclasses import dataclass
from typing import List

import torch
from tabulate import tabulate
from tqdm import tqdm

from benchmarks.utils import benchmark_cuda_function_in_microseconds
from torchao.prototype.mx_formats.kernels import triton_mxfp8_dequant_dim0
from torchao.prototype.mx_formats.mx_tensor import to_dtype, to_mx

device = torch.device("cuda")

# Needed since changing args to function causes recompiles
torch._dynamo.config.cache_size_limit = 1000


@dataclass(frozen=True)
class ExperimentConfig:
    input_shape: tuple[int]


@dataclass(frozen=True)
class ExperimentResult:
    # time
    torch_us: float
    triton_us: float
    torch_gbps: float
    triton_gbps: float


@dataclass(frozen=True)
class Experiment:
    config: ExperimentConfig
    result: ExperimentResult


def get_configs() -> List[ExperimentConfig]:
    input_shapes = [
        # (local_batch_size, seq_len, dim)
        (1, 8192, 7168),
        (2, 8192, 7168),
        (4, 8192, 7168),
        (8, 8192, 7168),
    ]
    configs = []
    for shape in input_shapes:
        configs.append(
            ExperimentConfig(
                input_shape=shape,
            )
        )
    return configs


def run_experiment(config: ExperimentConfig) -> ExperimentResult:
    block_size = 32
    input_shape = config.input_shape
    input_tensor = torch.randn(
        *input_shape,
        dtype=torch.bfloat16,
        device=device,
    )

    e8m0_scales, e4m3_data = to_mx(input_tensor, torch.float8_e4m3fn, block_size)

    # Bench torch dequant
    to_dtype_c = torch.compile(to_dtype)
    elem_dtype, target_dtype = torch.float8_e4m3fn, torch.bfloat16
    torch_output = to_dtype_c(
        e4m3_data,
        e8m0_scales,
        elem_dtype,
        block_size,
        target_dtype,
    )
    torch_us = benchmark_cuda_function_in_microseconds(
        to_dtype_c,
        e4m3_data,
        e8m0_scales,
        elem_dtype,
        block_size,
        target_dtype,
    )

    # Bench triton kernel
    _ = triton_mxfp8_dequant_dim0(
        e4m3_data,
        e8m0_scales,
        target_dtype,
        block_size,
    )
    triton_us = benchmark_cuda_function_in_microseconds(
        triton_mxfp8_dequant_dim0,
        e4m3_data,
        e8m0_scales,
        target_dtype,
        block_size,
    )

    # mem bw calculations
    bytes_per_input_el = torch.finfo(elem_dtype).bits / 8
    bytes_per_output_el = torch.finfo(target_dtype).bits / 8
    bytes_per_scale_el = torch.finfo(torch.float8_e8m0fnu).bits / 8

    read_bytes = (
        e4m3_data.numel() * bytes_per_input_el
        + e8m0_scales.numel() * bytes_per_scale_el
    )
    write_bytes = torch_output.numel() * bytes_per_output_el

    torch_gbps = ((read_bytes + write_bytes) / 1e9) / (torch_us / 1e6)
    triton_gbps = ((read_bytes + write_bytes) / 1e9) / (triton_us / 1e6)

    return ExperimentResult(
        torch_us=torch_us,
        triton_us=triton_us,
        triton_gbps=triton_gbps,
        torch_gbps=torch_gbps,
    )


def print_results(experiments: List[Experiment]):
    headers = [
        "input_shape",
        "torch_us",
        "triton_us",
        "torch_gbps",
        "triton_gbps",
        "triton_speedup",
    ]
    rows = []
    for experiment in experiments:
        triton_speedup = round(
            experiment.result.torch_us / experiment.result.triton_us, 3
        )
        rows.append(
            [
                str(experiment.config.input_shape),
                experiment.result.torch_us,
                experiment.result.triton_us,
                round(experiment.result.torch_gbps, 3),
                round(experiment.result.triton_gbps, 3),
                f"{triton_speedup}x",
            ]
        )
    print(tabulate(rows, headers=headers))


def main():
    torch.random.manual_seed(123)
    configs = get_configs()
    results = []
    for config in tqdm(configs):
        result = run_experiment(config)
        results.append(Experiment(config=config, result=result))

    # Use Tabulate to print results
    print_results(results)


if __name__ == "__main__":
    main()
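
As a sanity check on the bandwidth arithmetic above: for the smallest benchmarked shape (1, 8192, 7168), the kernel reads one byte per e4m3 element plus one byte per e8m0 scale (one scale per 32 elements) and writes two bytes per bf16 output element. A worked sketch, where the 50 µs timing is purely illustrative and not a measured result:

M, K, block_size = 8192, 7168, 32   # (1, 8192, 7168) flattened over the leading dims

read_bytes = M * K * 1 + (M * K // block_size) * 1   # e4m3 data + e8m0 scales
write_bytes = M * K * 2                              # bf16 output
total_bytes = read_bytes + write_bytes               # 177,995,776 bytes ~= 0.178 GB

elapsed_us = 50.0                                    # illustrative timing only
gbps = (total_bytes / 1e9) / (elapsed_us / 1e6)      # ~= 3560 GB/s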

test/prototype/moe_training/mxfp8/test_mxfp8_a2a.py

Lines changed: 2 additions & 2 deletions
@@ -23,8 +23,8 @@
     compute_error,
 )
 from torchao.prototype.moe_training.kernels.mxfp8.comms import (
-    mxfp8_on_device_all_to_all_v,
     to_mxfp8_a2a_dequant,
+    to_mxfp8_on_device_a2a_dequant,
 )
 
 from ..testing_utils import generate_split_sizes
@@ -88,7 +88,7 @@ def test_a2a_fwd_bwd(self):
         max_output_tokens_per_rank = tokens_per_ep_rank * self.world_size
 
         # Test forward
-        output, output_splits = mxfp8_on_device_all_to_all_v(
+        output, output_splits = to_mxfp8_on_device_a2a_dequant(
            input_tensor,
            input_splits,
            max_output_tokens_per_rank,

test/prototype/mx_formats/test_kernels.py

Lines changed: 24 additions & 1 deletion
@@ -37,12 +37,13 @@
     pack_uint6,
     triton_f6_e2m3_to_bf16,
     triton_f6_e3m2_to_bf16,
+    triton_mxfp8_dequant_dim0,
     triton_to_mxfp8_dim0,
     triton_to_mxfp8_dim1,
     triton_to_mxfp8_dim1_reference,
     unpack_uint4,
 )
-from torchao.prototype.mx_formats.mx_tensor import ScaleCalculationMode, to_mx
+from torchao.prototype.mx_formats.mx_tensor import ScaleCalculationMode, to_dtype, to_mx
 from torchao.prototype.mx_formats.utils import to_blocked
 from torchao.utils import (
     is_sm_at_least_89,
@@ -513,6 +514,28 @@ def test_triton_mxfp8_dim0_zeros():
     torch.testing.assert_close(x_s_t, x_s_ref, rtol=0, atol=0)
 
 
+@pytest.mark.skipif(not has_triton(), reason="unsupported without triton")
+@pytest.mark.skipif(
+    not is_sm_at_least_100(),
+    reason="mxfp8 requires CUDA capability 10.0 or greater",
+)
+@pytest.mark.parametrize("M", (256, 2048, 131072))
+@pytest.mark.parametrize("K", (256, 5120, 7168))
+def test_triton_mxfp8_dequant_dim0(M, K):
+    x = torch.zeros(M, K, dtype=torch.bfloat16, device="cuda")
+    block_size = 32
+    x_data, x_scales = triton_to_mxfp8_dim0_reference(x, block_size=32)
+    hp_ref = to_dtype(
+        x_data,
+        x_scales,
+        torch.float8_e4m3fn,
+        block_size,
+        torch.bfloat16,
+    )
+    hp_t = triton_mxfp8_dequant_dim0(x_data, x_scales, torch.bfloat16, block_size)
+    torch.testing.assert_close(hp_t, hp_ref, rtol=0, atol=0)
+
+
 @pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
 @pytest.mark.parametrize(
     "shape",

torchao/prototype/moe_training/kernels/mxfp8/comms.py

Lines changed: 17 additions & 19 deletions
@@ -11,8 +11,14 @@
     blockwise_barrier,
     sync_threads,
 )
-from torchao.prototype.mx_formats.config import ScaleCalculationMode
-from torchao.prototype.mx_formats.mx_tensor import to_dtype, to_mx
+from torchao.prototype.mx_formats.kernels import (
+    triton_mxfp8_dequant_dim0,
+    triton_to_mxfp8_dim0,
+)
+from torchao.prototype.mx_formats.mx_tensor import (
+    to_dtype,
+    to_mx,
+)
 
 
 # This performs dynamic mxfp8 quantization of the input tensor,
@@ -256,7 +262,7 @@ def backward(ctx, grad_output, grad_splits):
 
 
 # Alias
-mxfp8_on_device_all_to_all_v = MXFP8OnDeviceAllToAllV.apply
+to_mxfp8_on_device_a2a_dequant = MXFP8OnDeviceAllToAllV.apply
 
 
 # Triton launcher function
@@ -473,11 +479,9 @@ def forward(
         """
         # Quantize input
         block_size = 32
-        input_scales, input_data = to_mx(
+        input_data, input_scales = triton_to_mxfp8_dim0(
             input,
-            elem_dtype=torch.float8_e4m3fn,
-            block_size=block_size,
-            scaling_mode=ScaleCalculationMode.RCEIL,
+            inner_block_size=block_size,
         )
 
         # Dispatch data (async)
@@ -501,14 +505,12 @@ def forward(
         output_data = torch.ops._c10d_functional.wait_tensor(output_data)
 
         # Dequantize output
-        lowp_dtype = output_data.dtype
         hp_dtype = input.dtype
-        hp_output = to_dtype(
+        hp_output = triton_mxfp8_dequant_dim0(
             output_data,
             output_scales.view(torch.float8_e8m0fnu),
-            lowp_dtype,
-            block_size,
             hp_dtype,
+            block_size,
         )
 
         ctx.input_splits = input_splits
@@ -529,11 +531,9 @@ def backward(ctx, grad_output_hp):
 
         # Quantize grad_output
         block_size = 32
-        grad_out_scales, grad_out_data = to_mx(
+        grad_out_data, grad_out_scales = triton_to_mxfp8_dim0(
             grad_output_hp,
-            elem_dtype=torch.float8_e4m3fn,
-            block_size=block_size,
-            scaling_mode=ScaleCalculationMode.RCEIL,
+            inner_block_size=block_size,
        )
 
        # Dispatch data (async)
@@ -557,13 +557,11 @@ def backward(ctx, grad_output_hp):
         grad_input_scales = torch.ops._c10d_functional.wait_tensor(grad_input_scales)
 
         hp_dtype = grad_output_hp.dtype
-        lowp_dtype = grad_input_data.dtype
-        grad_input_hp = to_dtype(
+        grad_input_hp = triton_mxfp8_dequant_dim0(
             grad_input_data,
             grad_input_scales.view(torch.float8_e8m0fnu),
-            lowp_dtype,
-            block_size,
             hp_dtype,
+            block_size,
         )
         return grad_input_hp, None, None, None
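
Putting the comms changes together: the forward path now quantizes with triton_to_mxfp8_dim0, which returns (data, scales) where to_mx returned (scales, data), and dequantizes with triton_mxfp8_dequant_dim0, which takes the target dtype before the block size, unlike to_dtype. A condensed sketch of that pattern, with the on-device all-to-all dispatch elided and variable names taken from the hunks above:

block_size = 32

# Quantize the high-precision input (data first, then e8m0 scales)
input_data, input_scales = triton_to_mxfp8_dim0(input, inner_block_size=block_size)

# ... async on-device all-to-all dispatch of input_data / input_scales (unchanged, elided) ...

# Dequantize the received tokens back to the input's dtype with the new kernel
hp_output = triton_mxfp8_dequant_dim0(
    output_data,
    output_scales.view(torch.float8_e8m0fnu),
    input.dtype,
    block_size,
)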
