Commit 41eff53

Disable FlexAttention max-autotune when deterministic mode is used (#1808)
With max-autotune, FlexAttention is not deterministic even if torch.use_deterministic_algorithms(True) is set, because the autotuning search may select a different kernel on each run. When deterministic mode is requested, we should therefore also drop `max-autotune` when compiling FlexAttention.
1 parent: a6f0cfc
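
The gist of the change: the kernel behind FlexAttention is produced by torch.compile, and max-autotune runs a benchmarking search whose winning kernel can vary from run to run. Below is a minimal sketch of the before/after; the mode string on the default path is an assumption for illustration, not taken from this diff.

import torch
from torch.nn.attention.flex_attention import flex_attention

# Assumed default path: autotuning benchmarks candidate kernels and picks the
# fastest, but the pick is not guaranteed to be the same on every run.
flex_attn = torch.compile(flex_attention, mode="max-autotune")

# What this commit switches to when determinism is requested: compile with the
# default mode, so the same kernel is generated on every run.
deterministic = True  # e.g. taken from the job config
if deterministic:
    flex_attn = torch.compile(flex_attention)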

File tree

1 file changed, 8 insertions(+), 0 deletions(-)

torchtitan/distributed/utils.py

Lines changed: 8 additions & 0 deletions
@@ -106,6 +106,14 @@ def set_determinism(
         # https://pytorch.org/docs/stable/generated/torch.use_deterministic_algorithms.html
         os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
 
+        # Ensure flex_attention is compiled without max-autotune. This is needed to ensure
+        # reproducibility, since the autotune results may not be deterministic.
+        from torch.nn.attention.flex_attention import flex_attention
+
+        from torchtitan.models.attention import FlexAttention
+
+        FlexAttention.flex_attn = torch.compile(flex_attention)
+
     if not world_mesh:
         if seed is not None:
             torch.manual_seed(seed)
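
As a rough sanity check (a sketch, not part of the commit): running the snippet below twice should print the same checksum with the plain torch.compile path, whereas a max-autotune build may not reproduce across runs because its kernel selection can differ. It requires a CUDA device, and the tensor shapes are arbitrary.

import os

import torch
from torch.nn.attention.flex_attention import flex_attention

# Mirror the set_determinism() context from the diff above.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
torch.use_deterministic_algorithms(True)
torch.manual_seed(0)

flex_attn = torch.compile(flex_attention)  # deterministic path: no max-autotune

# (batch, heads, seq_len, head_dim); seq_len kept at the default block size.
q, k, v = (torch.randn(1, 2, 128, 64, device="cuda") for _ in range(3))
print(flex_attn(q, k, v).double().sum().item())  # should match run to run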
