Commit b9efec7

[MoE][compile][full ac] weave torch.compile around the FSDP(GroupedExperts) graph break
1 parent 92ed8b3 commit b9efec7

File tree

  torchtitan/models/deepseek_v3/infra/parallelize.py
  torchtitan/models/llama4/infra/parallelize.py
  torchtitan/models/qwen3/infra/parallelize.py

3 files changed: +57 -14 lines
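
For context, a minimal self-contained sketch of the strategy named in the commit title (toy module names, not torchtitan's): instead of compiling a whole MoE TransformerBlock as one region, which graph-breaks on the FSDP hooks installed around the experts, each child submodule is compiled on its own with fullgraph=True, so the hooks run in eager between compiled regions.

# Sketch only: ToyBlock and its children are stand-ins, not torchtitan modules.
import torch
import torch.nn as nn


class ToyBlock(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.attention = nn.Linear(16, 16)  # stand-in for the attention submodule
        self.moe = nn.Linear(16, 16)        # stand-in for the MoE (imagine FSDP hooks here)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.moe(self.attention(x))


block = ToyBlock()
# Per-child compile: each child becomes its own fullgraph region, so anything that
# runs between children (e.g. FSDP pre/post-forward hooks) stays outside the graphs.
for name, child in block.named_children():
    setattr(block, name, torch.compile(child, fullgraph=True))

out = block(torch.randn(2, 16))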

torchtitan/models/deepseek_v3/infra/parallelize.py
Lines changed: 1 addition & 1 deletion

@@ -118,7 +118,7 @@ def parallelize_deepseekv3(
         )
 
     if model_compile_enabled:
-        apply_compile(model, job_config.compile)
+        apply_compile(model, job_config.compile, job_config.activation_checkpoint)
 
     dp_mesh: DeviceMesh | None = None
     if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled:

torchtitan/models/llama4/infra/parallelize.py
Lines changed: 55 additions & 12 deletions

@@ -6,6 +6,9 @@
 
 import torch
 import torch.nn as nn
+from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+    CheckpointWrapper,
+)
 from torch.distributed.device_mesh import DeviceMesh
 from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy
 from torch.distributed.tensor import Partial, Replicate, Shard
@@ -18,7 +21,10 @@
     SequenceParallel,
 )
 from torchtitan.config import JobConfig, TORCH_DTYPE_MAP
-from torchtitan.config.job_config import Compile as CompileConfig
+from torchtitan.config.job_config import (
+    ActivationCheckpoint as ACConfig,
+    Compile as CompileConfig,
+)
 from torchtitan.distributed import NoParallel, ParallelDims
 from torchtitan.distributed.activation_checkpoint import apply_ac
 
@@ -30,6 +36,7 @@
 )
 from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp
 from torchtitan.models.llama3.infra.parallelize import apply_ddp
+from torchtitan.models.moe.moe import MoE
 from torchtitan.tools.logging import logger
 
 
@@ -125,7 +132,7 @@ def parallelize_llama(
 
     # turn on per-TransformerBlock compile after AC wrapping and before FSDP
     if model_compile_enabled:
-        apply_compile(model, job_config.compile)
+        apply_compile(model, job_config.compile, job_config.activation_checkpoint)
 
     dp_mesh: DeviceMesh | None = None
     if parallel_dims.fsdp_enabled or parallel_dims.ep_enabled:
@@ -503,24 +510,60 @@ def apply_moe_ep_tp(
         )
 
 
-def apply_compile(model: nn.Module, compile_config: CompileConfig):
+def apply_compile(model: nn.Module, compile_config: CompileConfig, ac_config: ACConfig):
     """
     Apply torch.compile to each TransformerBlock, which makes compilation efficient due to
     repeated structure. Alternatively one can compile the whole model (after applying DP).
     """
     # NOTE: This flag is needed for torch.compile to avoid graph breaking on dynamic shapes in token-choice MoE
     # but it is experimental.
-    # torch._dynamo.config.capture_scalar_outputs = True
+    torch._dynamo.config.capture_scalar_outputs = True
     for layer_id, transformer_block in model.layers.named_children():
-        # TODO: remove when torch.compile supports fullgraph=True for MoE
-        fullgraph = True
         if transformer_block.moe_enabled:
-            fullgraph = False
-        transformer_block = torch.compile(
-            transformer_block,
-            backend=compile_config.backend,
-            fullgraph=fullgraph,
-        )
+            # If it is a MoE layer, FSDP(GroupedExperts) will cause a graph break
+            # So we must weave compile wrappers around those FSDP hooks to
+            # prevent AC from falling back the whole graph to eager.
+            assert (
+                ac_config.mode != "selective"
+            ), "Selective Activation Checkpointing + Compile is not yet supported for MoE models."
+
+            if isinstance(transformer_block, CheckpointWrapper):
+                # unwrap so that .named_children() works
+                block = transformer_block._checkpoint_wrapped_module
+            else:
+                block = transformer_block
+
+            for attr_name, submod in block.named_children():
+                if isinstance(submod, MoE):
+                    # avoid graph breaking on the GroupedExperts' FSDP hooks
+                    # by wrapping each submod's forward instead of their __call__
+                    moe_key = attr_name
+                    moe = submod
+                    for attr_name, submod in moe.named_children():
+                        setattr(
+                            moe,
+                            attr_name,
+                            torch.compile(
+                                submod, backend=compile_config.backend, fullgraph=True
+                            ),
+                        )
+                else:
+                    setattr(
+                        transformer_block,
+                        attr_name,
+                        torch.compile(
+                            submod, backend=compile_config.backend, fullgraph=True
+                        ),
+                    )
+        else:
+            # If it's not a MoE layer, there is no FSDP(GroupedExperts)
+            # So we can compile the whole block
+            transformer_block = torch.compile(
+                transformer_block,
+                backend=compile_config.backend,
+                fullgraph=True,
+            )
+
         model.layers.register_module(layer_id, transformer_block)
 
     logger.info("Compiling each TransformerBlock with torch.compile")
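
Two mechanics the new apply_compile relies on can be illustrated outside torchtitan; the snippet below is a hedged sketch with made-up names, not the project's code. First, enabling torch._dynamo.config.capture_scalar_outputs lets data-dependent Python scalars (as produced by token-choice routing via .item()) be captured as unbacked symbols rather than graph-breaking, which fullgraph=True would otherwise reject. Second, under full activation checkpointing each TransformerBlock is a CheckpointWrapper, so the real children live on _checkpoint_wrapped_module, which is why the function unwraps before iterating named_children().

# Sketch only: function and variable names below are illustrative assumptions.
import torch
import torch.nn as nn
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    checkpoint_wrapper,
    CheckpointWrapper,
)

# 1) Data-dependent scalars are captured instead of graph-breaking.
torch._dynamo.config.capture_scalar_outputs = True


@torch.compile(fullgraph=True)
def scale_by_capacity(scores: torch.Tensor, capacity: torch.Tensor) -> torch.Tensor:
    return scores * capacity.item()  # .item() would graph-break without the flag


scale_by_capacity(torch.randn(4, 8), torch.tensor(2))

# 2) The wrapper's own children are not the block's children; the wrapped module is.
wrapped = checkpoint_wrapper(nn.Sequential(nn.Linear(8, 8), nn.ReLU()))
assert isinstance(wrapped, CheckpointWrapper)
inner = wrapped._checkpoint_wrapped_module
print([name for name, _ in inner.named_children()])  # ['0', '1']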

torchtitan/models/qwen3/infra/parallelize.py
Lines changed: 1 addition & 1 deletion

@@ -119,7 +119,7 @@ def parallelize_qwen3(
 
     # turn on per-TransformerBlock compile after AC wrapping and before FSDP
    if model_compile_enabled:
-        apply_compile(model, job_config.compile)
+        apply_compile(model, job_config.compile, job_config.activation_checkpoint)
 
     if parallel_dims.fsdp_enabled:
         # apply FSDP or HSDP, potentially with Context Parallel
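
All three call sites keep the ordering their comment states: compile is applied after AC wrapping and before FSDP. A simplified sketch of that layering on a generic block follows (not torchtitan's helpers; fully_shard is left commented because it needs an initialized process group and device mesh).

# Sketch only: generic module, not a torchtitan TransformerBlock.
import torch
import torch.nn as nn
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import checkpoint_wrapper

block = nn.Sequential(nn.Linear(8, 8), nn.ReLU())
block = checkpoint_wrapper(block)   # 1) activation checkpointing wraps the block first
block = torch.compile(block)        # 2) torch.compile wraps the AC-wrapped block
# from torch.distributed.fsdp import fully_shard
# fully_shard(block)                # 3) FSDP is applied last, on the outside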
