
Commit fd224b3

tohskaigithubsgi authored and committed
Add support for AC budget API (pytorch#1731)
Inspired by the blog post: https://pytorch.org/blog/activation-checkpointing-techniques/
1 parent bc0383d commit fd224b3

7 files changed: +48 additions, -11 deletions

torchtitan/config/job_config.py
Lines changed: 19 additions & 1 deletion

@@ -560,7 +560,7 @@ class Checkpoint:
 
 
 @dataclass
 class ActivationCheckpoint:
-    mode: Literal["selective", "full", "none"] = "selective"
+    mode: Literal["selective", "full", "memory_budget", "none"] = "selective"
     """Type of activation checkpointing to use"""
 
     selective_ac_option: str = "2"
@@ -589,6 +589,24 @@ class ActivationCheckpoint:
     rematerialized.
     """
 
+    memory_budget: float = 0.5
+    """
+    When mode is set to "memory_budget", this value determines how much the
+    partitioner in the compiler should trade off compute for memory.
+    0.0 corresponds to the activation memory from applying
+    activation checkpointing to the full compiled region, and 1.0 corresponds to
+    the activation memory from the default runtime-optimized strategy. Read more here:
+    https://pytorch.org/blog/activation-checkpointing-techniques/
+    """
+
+    visualize_memory_budget_pareto: bool = False
+    """
+    This dumps out an SVG visualization of the expected runtime vs. activation
+    memory tradeoffs for all memory budget values from 0 to 1 in increments of
+    0.05 in the {--job.dump_folder}/memory_budget_pareto folder. See an example here:
+    https://github.com/pytorch/pytorch/pull/126320#discussion_r1625104015
+    """
+
 
 @dataclass
 class Compile:
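
For reference, a minimal sketch (not part of the commit) of how the new fields could be set programmatically; torchtitan normally populates this dataclass from a job TOML file, and the import path below is inferred from the file location above:

    # Hypothetical, hand-written configuration; values mirror the defaults in the diff.
    from torchtitan.config.job_config import ActivationCheckpoint

    ac_config = ActivationCheckpoint(
        # "memory_budget" hands AC decisions to the compiler's partitioner.
        mode="memory_budget",
        # 0.0 ~ activation memory of full AC over the compiled region,
        # 1.0 ~ activation memory of the default runtime-optimized strategy.
        memory_budget=0.5,
        # Optionally dump runtime-vs-memory pareto SVGs under
        # {dump_folder}/memory_budget_pareto.
        visualize_memory_budget_pareto=True,
    )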

torchtitan/distributed/activation_checkpoint.py
Lines changed: 24 additions & 10 deletions

@@ -7,6 +7,7 @@
 # This file provides the util functions to apply activation checkpointing to the model.
 # Technically, this is not a part of distributed, but distributed module is the best place to put it.
 
+import os
 from collections import defaultdict
 
 import torch
@@ -293,6 +294,7 @@ def apply_ac(
     model_compile_enabled: bool = False,
     use_flex_attn: bool = False,
     op_sac_save_list: set[torch._ops.OpOverload] | None = None,
+    base_folder: str = "",
 ) -> None:
     """Apply activation checkpointing to the model.
 
@@ -311,15 +313,27 @@ def apply_ac(
         None
     """
 
-    for layer_id, transformer_block in model.layers.named_children():
-        transformer_block = _apply_ac_to_transformer_block(
-            transformer_block,
-            ac_config,
-            base_fqn=f"layers.{layer_id}",
-            model_compile_enabled=model_compile_enabled,
-            use_flex_attn=use_flex_attn,
-            op_sac_save_list=op_sac_save_list,
-        )
-        model.layers.register_module(layer_id, transformer_block)
+    if ac_config.mode == "memory_budget":
+        assert model_compile_enabled, "Memory budget mode requires model to be compiled"
+        if ac_config.visualize_memory_budget_pareto:
+            pareto_dir = os.path.join(base_folder, "memory_budget_pareto")
+            if not os.path.exists(pareto_dir):
+                os.makedirs(pareto_dir, exist_ok=True)
+            torch._functorch.config.memory_budget_pareto_dir = pareto_dir
+            torch._functorch.config.visualize_memory_budget_pareto = True
+
+        torch._functorch.config.activation_memory_budget = ac_config.memory_budget
+        logger.info(f"Selected {ac_config.memory_budget} budget option")
+    else:
+        for layer_id, transformer_block in model.layers.named_children():
+            transformer_block = _apply_ac_to_transformer_block(
+                transformer_block,
+                ac_config,
+                base_fqn=f"layers.{layer_id}",
+                model_compile_enabled=model_compile_enabled,
+                use_flex_attn=use_flex_attn,
+                op_sac_save_list=op_sac_save_list,
+            )
+            model.layers.register_module(layer_id, transformer_block)
 
     logger.info(f"Applied {ac_config.mode} activation checkpointing to the model")
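
In "memory_budget" mode the function no longer wraps individual transformer blocks; it configures the functorch partitioner globally and relies on torch.compile to decide what to recompute. A hedged sketch of that flow, using a stand-in model rather than torchtitan's:

    import torch

    # Stand-in model purely for illustration (not torchtitan's Transformer).
    model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU(), torch.nn.Linear(8, 8))

    # The knob apply_ac() sets in "memory_budget" mode: how aggressively the partitioner
    # trades recomputation for activation memory (0.0 = most recompute, 1.0 = default).
    torch._functorch.config.activation_memory_budget = 0.5

    # The budget only takes effect through the compiler's partitioner, which is why
    # apply_ac() asserts that model compilation is enabled.
    compiled_model = torch.compile(model)
    compiled_model(torch.randn(4, 8)).sum().backward()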

torchtitan/experiments/llama4/infra/parallelize.py
Lines changed: 1 addition & 0 deletions

@@ -120,6 +120,7 @@ def parallelize_llama(
         model_compile_enabled=model_compile_enabled,
         use_flex_attn=use_flex_attn,
         op_sac_save_list=_op_sac_save_list,
+        base_folder=job_config.job.dump_folder,
     )
 
     # turn on per-TransformerBlock compile after AC wrapping and before FSDP

torchtitan/experiments/qwen3/infra/parallelize.py
Lines changed: 1 addition & 0 deletions

@@ -114,6 +114,7 @@ def parallelize_qwen3(
         model_compile_enabled=model_compile_enabled,
         use_flex_attn=use_flex_attn,
         op_sac_save_list=_op_sac_save_list,
+        base_folder=job_config.job.dump_folder,
     )
 
     # turn on per-TransformerBlock compile after AC wrapping and before FSDP

torchtitan/experiments/simple_fsdp/llama3/parallelize.py
Lines changed: 1 addition & 0 deletions

@@ -85,6 +85,7 @@ def parallelize_llama(
         model_compile_enabled=model_compile_enabled,
         use_flex_attn=use_flex_attn,
         op_sac_save_list=_op_sac_save_list,
+        base_folder=job_config.job.dump_folder,
     )
 
     # apply data parallel

torchtitan/models/deepseek_v3/infra/parallelize.py
Lines changed: 1 addition & 0 deletions

@@ -113,6 +113,7 @@ def parallelize_deepseekv3(
         model_compile_enabled=model_compile_enabled,
         use_flex_attn=use_flex_attn,
         op_sac_save_list=_op_sac_save_list,
+        base_folder=job_config.job.dump_folder,
     )
 
     if model_compile_enabled:

torchtitan/models/llama3/infra/parallelize.py
Lines changed: 1 addition & 0 deletions

@@ -102,6 +102,7 @@ def parallelize_llama(
         model_compile_enabled=model_compile_enabled,
         use_flex_attn=use_flex_attn,
         op_sac_save_list=_op_sac_save_list,
+        base_folder=job_config.job.dump_folder,
     )
 
     # turn on per-TransformerBlock compile after AC wrapping and before FSDP
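
All five parallelize functions thread the new argument in the same way. A sketch of the shared call pattern follows; only the keyword arguments appear as diff context, so the positional arguments and the wrapper function here are assumptions:

    from torchtitan.distributed.activation_checkpoint import apply_ac

    def _apply_ac_for_model(model, job_config, model_compile_enabled, use_flex_attn, _op_sac_save_list):
        # Hypothetical wrapper mirroring the call sites touched by this commit.
        apply_ac(
            model,
            job_config.activation_checkpoint,
            model_compile_enabled=model_compile_enabled,
            use_flex_attn=use_flex_attn,
            op_sac_save_list=_op_sac_save_list,
            base_folder=job_config.job.dump_folder,  # new: root dir for the pareto SVG dump
        )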
