|
15 | 15 | from torch.testing._internal import common_utils |
16 | 16 | from torch.testing._internal.common_utils import run_tests |
17 | 17 |
|
| 18 | +from torchao.float8.inference import Float8MMConfig |
18 | 19 | from torchao.quantization import ( |
19 | 20 | Float8DynamicActivationFloat8WeightConfig, |
20 | 21 | Float8WeightOnlyConfig, |
| 22 | + Granularity, |
21 | 23 | PerRow, |
22 | 24 | PerTensor, |
23 | 25 | quantize_, |
@@ -82,7 +84,7 @@ def test_fp8_linear_variants( |
82 | 84 | dtype: torch.dtype, |
83 | 85 | mode: str, |
84 | 86 | compile: bool, |
85 | | - granularity, |
| 87 | + granularity: Granularity, |
86 | 88 | kernel_preference: KernelPreference, |
87 | 89 | sizes: Tuple, |
88 | 90 | ): |
@@ -148,6 +150,61 @@ def test_fp8_linear_variants( |
148 | 150 | f"Quantization error is too high got a SQNR of {error}" |
149 | 151 | ) |
150 | 152 |
|
| 153 | + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") |
| 154 | + @unittest.skipIf( |
| 155 | + not is_sm_at_least_89(), "Requires GPU with compute capability >= 8.9" |
| 156 | + ) |
| 157 | + @common_utils.parametrize("dtype", [torch.bfloat16, torch.float32]) |
| 158 | + @common_utils.parametrize("granularity", [PerTensor(), PerRow()]) |
| 159 | + @common_utils.parametrize( |
| 160 | + "kernel_preference", |
| 161 | + [KernelPreference.AUTO, KernelPreference.TORCH, KernelPreference.FBGEMM], |
| 162 | + ) |
| 163 | + # Inputs are (M,..), N, K
| 164 | + @common_utils.parametrize( |
| 165 | + "sizes", |
| 166 | + [ |
| 167 | + ((128,), 256, 128), |
| 168 | + ((32, 128), 64, 256), |
| 169 | + ], |
| 170 | + ) |
| 171 | + def test_fp8_matmul( |
| 172 | + self, |
| 173 | + dtype: torch.dtype, |
| 174 | + granularity: Granularity, |
| 175 | + kernel_preference: KernelPreference, |
| 176 | + sizes: Tuple, |
| 177 | + ): |
| 178 | + if ( |
| 179 | + isinstance(granularity, PerTensor) |
| 180 | + and kernel_preference == KernelPreference.FBGEMM |
| 181 | + ): |
| 182 | + self.skipTest(
| 183 | + "per tensor with fbgemm kernel preference does not work yet"
| 184 | + )
| 185 | + M, N, K = sizes |
| 186 | + input_tensor = torch.randn(*M, K, dtype=dtype, device="cuda") |
| 187 | + weight_tensor = torch.randn(K, N, dtype=dtype, device="cuda") |
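| | + # quantize both operands to fp8 with the same granularity, mm config,
| | + # and kernel preference so the fp8 matmul path is exercised end to end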
| 188 | + mm_config = Float8MMConfig() |
| 189 | + input_tensor_fp8 = Float8Tensor.from_hp( |
| 190 | + input_tensor, |
| 191 | + granularity=granularity, |
| 192 | + mm_config=mm_config, |
| 193 | + kernel_preference=kernel_preference, |
| 194 | + ) |
| 195 | + weight_tensor_fp8 = Float8Tensor.from_hp( |
| 196 | + weight_tensor, |
| 197 | + granularity=granularity, |
| 198 | + mm_config=mm_config, |
| 199 | + kernel_preference=kernel_preference, |
| 200 | + ) |
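| | + # run the matmul in high precision and through the fp8 subclass;
| | + # an SQNR above 20 dB means the fp8 result closely tracks the reference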
| 201 | + output_tensor = torch.matmul(input_tensor, weight_tensor) |
| 202 | + output_tensor_fp8 = torch.matmul(input_tensor_fp8, weight_tensor_fp8) |
| 203 | + error = compute_error(output_tensor, output_tensor_fp8) |
| 204 | + assert error > 20, (
| 205 | + f"Quantization error is too high, got a SQNR of {error}"
| 206 | + ) |
| 207 | + |
151 | 208 | @common_utils.parametrize("granularity", [PerTensor(), PerRow()]) |
152 | 209 | @unittest.skipIf( |
153 | 210 | not is_sm_at_least_90(), |
@@ -653,6 +710,38 @@ def test_slice_3d_operation(self, granularity, slice_dim, tensor_shape): |
653 | 710 |
|
654 | 711 | self.assertEqual(sliced_dequantized, sliced_original) |
655 | 712 |
|
| | + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
| 713 | + def test_to_dtype_layout(self):
| 714 | + x = torch.randn(128, 512, device="cuda", dtype=torch.bfloat16) |
| 715 | + x_fp8 = Float8Tensor.from_hp(x) |
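| | + # aten.to.dtype_layout should move the subclass (qdata and scale) to the
| | + # target device while preserving its dtype and layout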
| 716 | + y_fp8 = torch.ops.aten.to.dtype_layout( |
| 717 | + x_fp8, dtype=x_fp8.dtype, layout=x_fp8.layout, device="cpu" |
| 718 | + ) |
| 719 | + self.assertEqual(y_fp8.dtype, x_fp8.dtype) |
| 720 | + self.assertEqual(y_fp8.layout, x_fp8.layout) |
| 721 | + self.assertEqual(y_fp8.device, torch.device("cpu")) |
| 722 | + |
| | + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
| 723 | + def test_has_compatible_shallow_copy_type(self):
| 724 | + x1 = torch.randn(128, 512, device="cuda", dtype=torch.bfloat16) |
| 725 | + x2 = torch.randn(128, 512, device="cuda", dtype=torch.bfloat16) |
| 726 | + x3 = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16) |
| 727 | + x1_fp8 = Float8Tensor.from_hp(x1) |
| 728 | + x2_fp8 = Float8Tensor.from_hp(x2) |
| 729 | + x3_fp8 = Float8Tensor.from_hp(x3) |
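| | + # shallow copies are only compatible between tensors of the same
| | + # subclass type and shape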
| 730 | + self.assertFalse(torch._has_compatible_shallow_copy_type(x1, x2_fp8)) |
| 731 | + self.assertFalse(torch._has_compatible_shallow_copy_type(x1_fp8, x2)) |
| 732 | + self.assertTrue(torch._has_compatible_shallow_copy_type(x1_fp8, x2_fp8)) |
| 733 | + # Wrong shape |
| 734 | + self.assertFalse(torch._has_compatible_shallow_copy_type(x1_fp8, x3_fp8)) |
| 735 | + |
| | + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
| 736 | + def test_transpose(self):
| 737 | + x = torch.randn(128, 512, device="cuda", dtype=torch.bfloat16) |
| 738 | + x_fp8 = Float8Tensor.from_hp(x) |
| 739 | + x_fp8_t = x_fp8.t() |
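| | + # transposing the subclass should transpose qdata and scale and swap the
| | + # per-row block_size from (1, 512) to (512, 1)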
| 740 | + torch.testing.assert_close(x_fp8_t.qdata, x_fp8.qdata.t(), atol=0, rtol=0) |
| 741 | + torch.testing.assert_close(x_fp8_t.scale, x_fp8.scale.t(), atol=0, rtol=0) |
| 742 | + self.assertEqual(x_fp8.block_size, (1, 512))
| 743 | + self.assertEqual(x_fp8_t.block_size, (512, 1))
| 744 | + |
656 | 745 |
|
657 | 746 | common_utils.instantiate_parametrized_tests(TestFloat8Tensor) |
658 | 747 |
|