Commit e3f24d4

fix shapes
Signed-off-by: Kyle Sayers <[email protected]>
1 parent bf1b9ba commit e3f24d4

File tree (2 files changed, +42 -21 lines):
  tests/mock_observer.py
  tests/test_quantization/lifecycle/test_static_lifecycle.py

tests/mock_observer.py
Lines changed: 7 additions & 2 deletions

@@ -77,6 +77,8 @@ def flatten_for_quantization(
 
 
 def flatten_weight_for_quantization(value: torch.Tensor, args: QuantizationArgs):
+    # value.shape = (num_rows, num_cols)
+
     if args.strategy == QuantizationStrategy.TENSOR:
         # (1, 1, num_weight_elems)
         return value.reshape((1, 1, -1))

@@ -117,6 +119,8 @@ def flatten_weight_for_quantization(value: torch.Tensor, args: QuantizationArgs)
 
 
 def flatten_activation_for_quantization(value: torch.Tensor, args: QuantizationArgs):
+    # value.shape = (batch_size, seq_len, hidden_dim)
+
     if args.strategy == QuantizationStrategy.TENSOR:
         # (batch_size * seq_len, 1, hidden_dim)
         return value.reshape((-1, 1, value.size(-1)))

@@ -144,10 +148,11 @@ def flatten_activation_for_quantization(value: torch.Tensor, args: QuantizationA
 
 
 def flatten_attention_for_quantization(value: torch.Tensor, args: QuantizationArgs):
+    # value.shape = (batch_size, num_heads, seq_len, head_dim)
+
     if args.strategy == QuantizationStrategy.TENSOR:
-        # (batch_size, seq_len, num_heads, head_dim)
         # (batch_size * seq_len, 1, num_heads * head_dim)
-        return value.flatten(0, 1).flatten(-2, -1).unsqueeze(-2)
+        return value.transpose(1, 2).flatten(0, 1).flatten(-2, -1).unsqueeze(-2)
 
     if args.strategy == QuantizationStrategy.TOKEN:
         raise ValueError("Token quantization cannot be applied to attention")
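
For context, the added "# value.shape = (batch_size, num_heads, seq_len, head_dim)" comment is what motivates the extra transpose: attention states are laid out head-first, so flattening dims 0 and 1 directly would merge the batch dimension with heads rather than with sequence positions. A minimal shape walkthrough of the new op chain (sizes are arbitrary and chosen only to trace the shapes, not taken from the test below):

import torch

batch_size, num_heads, seq_len, head_dim = 2, 4, 5, 8
value = torch.randn(batch_size, num_heads, seq_len, head_dim)

flat = (
    value.transpose(1, 2)  # (batch_size, seq_len, num_heads, head_dim)
    .flatten(0, 1)         # (batch_size * seq_len, num_heads, head_dim)
    .flatten(-2, -1)       # (batch_size * seq_len, num_heads * head_dim)
    .unsqueeze(-2)         # (batch_size * seq_len, 1, num_heads * head_dim)
)
assert flat.shape == (batch_size * seq_len, 1, num_heads * head_dim)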

tests/test_quantization/lifecycle/test_static_lifecycle.py
Lines changed: 35 additions & 19 deletions

@@ -287,16 +287,24 @@ class MockAttention(torch.nn.Module):
                 strategy="tensor",
             ),
             torch.tensor([0.0]),
-            torch.tensor([11.0]),
+            torch.tensor([23.0]),
             torch.tensor(
                 [
                     [
-                        [[0.0000, 1.4688, 1.4688], [2.9375, 4.4062, 4.4062]],
-                        [[5.8750, 7.3438, 7.3438], [8.8125, 10.2500, 10.2500]],
+                        [
+                            [0.0000, 0.0000, 3.0625, 3.0625],
+                            [3.0625, 6.1250, 6.1250, 6.1250],
+                            [9.1875, 9.1875, 9.1875, 12.2500],
+                        ],
+                        [
+                            [12.2500, 12.2500, 15.3125, 15.3125],
+                            [15.3125, 18.3750, 18.3750, 18.3750],
+                            [21.5000, 21.5000, 21.5000, 21.5000],
+                        ],
                     ]
                 ]
             ),
-            0.19,
+            0.81,
         ),
         # static token is not supported
        # channel is not supported

@@ -310,35 +318,45 @@ class MockAttention(torch.nn.Module):
                 symmetric=True,
                 strategy="attn_head",
             ),
-            torch.tensor([[[0.0]], [[6.0]]]),
-            torch.tensor([[[5.0]], [[11.0]]]),
+            torch.tensor([[[0.0]], [[12.0]]]),
+            torch.tensor([[[11.0]], [[23.0]]]),
             torch.tensor(
                 [
                     [
-                        [[0.0000, 1.3359, 2.0000], [2.6719, 4.0000, 4.6875]],
-                        [[5.8750, 7.3438, 7.3438], [8.8125, 10.2500, 10.2500]],
+                        [
+                            [0.0000, 1.4688, 1.4688, 2.9375],
+                            [4.4062, 4.4062, 5.8750, 7.3438],
+                            [7.3438, 8.8125, 10.2500, 10.2500],
+                        ],
+                        [
+                            [12.2500, 12.2500, 15.3125, 15.3125],
+                            [15.3125, 18.3750, 18.3750, 18.3750],
+                            [21.5000, 21.5000, 21.5000, 21.5000],
+                        ],
                     ]
                 ]
             ),
-            0.13,
+            0.55,
         ),
     ],
 )
 def test_static_attention_quantization(
     args, exp_min_val, exp_max_val, exp_quant, exp_loss
 ):
     """
-    input = tensor([[[[ 0.,  1.,  2.],
-                      [ 3.,  4.,  5.]],
+    input = tensor([[[[ 0.,  1.,  2.,  3.],
+                      [ 4.,  5.,  6.,  7.],
+                      [ 8.,  9., 10., 11.]],
 
-                     [[ 6.,  7.,  8.],
-                      [ 9., 10., 11.]]]])
+                     [[12., 13., 14., 15.],
+                      [16., 17., 18., 19.],
+                      [20., 21., 22., 23.]]]])
     """
-    # set up activation (and identity weight)
-    batch_size, num_heads, seq_len, head_dim = 1, 2, 2, 3
+    # set up attention
+    batch_size, num_heads, seq_len, head_dim = 1, 2, 3, 4
     input = torch.arange(
-        (batch_size * seq_len * num_heads * head_dim), dtype=torch.bfloat16
-    ).reshape((batch_size, seq_len, num_heads, head_dim))
+        (batch_size * num_heads * seq_len * head_dim), dtype=torch.bfloat16
+    ).reshape((batch_size, num_heads, seq_len, head_dim))
     attention = MockAttention()
 
     # initialize quantization parameters

@@ -366,7 +384,5 @@ def test_static_attention_quantization(
     assert torch.equal(attention.k_observer.max_vals, exp_max_val)
 
     # check forward pass
-    print(output)
-    print(torch.nn.functional.mse_loss(output, input))
     assert torch.allclose(output, exp_quant.to(output.dtype))
     assert torch.nn.functional.mse_loss(output, input) <= exp_loss
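
As a quick sanity check on the updated expectations (a standalone sketch, not part of the test): the input is now arange(24) reshaped to (batch_size, num_heads, seq_len, head_dim) = (1, 2, 3, 4), so head 0 holds the values 0 through 11 and head 1 holds 12 through 23. That is where the new per-tensor min/max of 0/23 and the new per-head min/max pairs (0, 11) and (12, 23) come from; the parametrized exp_min_val/exp_max_val tensors carry the same values in a (num_heads, 1, 1) shape.

import torch

batch_size, num_heads, seq_len, head_dim = 1, 2, 3, 4
input = torch.arange(
    batch_size * num_heads * seq_len * head_dim, dtype=torch.bfloat16
).reshape((batch_size, num_heads, seq_len, head_dim))

# "tensor" strategy: one min/max over all values
assert input.amin().item() == 0.0 and input.amax().item() == 23.0

# "attn_head" strategy: one min/max per attention head
assert input.amin(dim=(0, 2, 3)).tolist() == [0.0, 12.0]
assert input.amax(dim=(0, 2, 3)).tolist() == [11.0, 23.0]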
