Changes from all commits (45 commits)
7ce8eed
Second version of debug/deterministic configs.
githubsgi Oct 7, 2025
e06e1e9
Review related updates.
githubsgi Oct 10, 2025
b23e5bb
Review 2 related changes.
githubsgi Oct 10, 2025
bcb0894
[DSV3] Offload dequantization process to DCP QuantizedHFReader (#1804)
wwwjn Oct 8, 2025
93cd4c6
Disable FlexAttention max-autotune when deterministic is used (#1808)
fegin Oct 8, 2025
44acac8
Fix num of layers for deepseek-v3 (#1845)
wwwjn Oct 9, 2025
7029661
[VLM] Add token-imbalance loss (#1803)
lkhphuc Oct 9, 2025
594fe9c
refactor TrainSpec to remove the name field (#1850)
tianyu-l Oct 10, 2025
ee49f75
Refactor attention and make attention mask an argument to the model (…
fegin Oct 10, 2025
5ba3488
minor refactor over EP (#1854)
tianyu-l Oct 12, 2025
e11ea4b
Graduate qwen3 from experiment to core (#1860)
wwwjn Oct 13, 2025
a92059b
Review related updates.
githubsgi Oct 10, 2025
e53255a
Rebasing and adding MATH attention kernel.
githubsgi Oct 13, 2025
f30caf6
Indent issue fix.
githubsgi Oct 13, 2025
f4cbf9d
Removing ipex.
githubsgi Oct 13, 2025
00c3165
Review updates.
githubsgi Oct 14, 2025
db187ff
Fixing linter error.
githubsgi Oct 14, 2025
315ea57
graduate llama4 to core (#1865)
tianyu-l Oct 14, 2025
64b77de
consolidate experiments/deepseek_v3 (#1869)
tianyu-l Oct 14, 2025
fa50840
add auto_eager_graph_pass (#1813)
ruisizhang123 Oct 14, 2025
00dbd5a
Second version of debug/deterministic configs.
githubsgi Oct 7, 2025
8bdb11d
Review related updates.
githubsgi Oct 10, 2025
a6a1bab
Review 2 related changes.
githubsgi Oct 15, 2025
2e8585c
[DSV3] Offload dequantization process to DCP QuantizedHFReader (#1804)
wwwjn Oct 8, 2025
2795956
Disable FlexAttention max-autotune when deterministic is used (#1808)
fegin Oct 8, 2025
1432c09
[VLM] Add token-imbalance loss (#1803)
lkhphuc Oct 9, 2025
1b88f57
refactor TrainSpec to remove the name field (#1850)
tianyu-l Oct 10, 2025
139926b
Refactor attention and make attention mask an argument to the model (…
fegin Oct 10, 2025
087dc88
add script to train with ft (#1812)
tushar00jain Oct 10, 2025
5032db6
Indent issue fix.
githubsgi Oct 13, 2025
409da11
Post rebase changes.
githubsgi Oct 15, 2025
2c35b95
Second version of debug/deterministic configs.
githubsgi Oct 7, 2025
27a942f
Review related updates.
githubsgi Oct 10, 2025
93e6d5e
Review 2 related changes.
githubsgi Oct 10, 2025
ff832d2
add script to train with ft (#1812)
tushar00jain Oct 10, 2025
6bb6254
minor refactor over EP (#1854)
tianyu-l Oct 12, 2025
f5dbc0f
[vlm] Add light-weight CI for experimental models (#1848)
wwwjn Oct 12, 2025
1eb5f8e
add owners and CI status for experiments (#1859)
tianyu-l Oct 13, 2025
aba26b4
TorchTitan e2e test on torchcomms device mesh (#1847)
mori360 Oct 14, 2025
09db1fe
graduate llama4 to core (#1865)
tianyu-l Oct 14, 2025
d0b1987
move PP API to model agnostic file (#1868)
tianyu-l Oct 14, 2025
6648707
[refactor] graduate custom_config_module and unify args/config naming…
tianyu-l Oct 14, 2025
e57adb7
Rebase misses.
githubsgi Oct 15, 2025
d68127a
Rebase mistakes.
githubsgi Oct 15, 2025
a251fd4
Linter error fixes.
githubsgi Oct 16, 2025
17 changes: 15 additions & 2 deletions docs/debugging.md
@@ -70,7 +70,7 @@ When debugging issues with multi-dimensional parallelism (combinations of FSDP,
Set consistent random seeds across all parallelism dimensions:

```bash
CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --training.seed 42
CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --debug.seed 42
```

**Seed behavior with parallelism:**
@@ -84,7 +84,7 @@ CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_tr
Enable deterministic algorithms to ensure bit-for-bit reproducibility across runs:

```bash
CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --training.deterministic
CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --debug.deterministic
```

**What it does:**
@@ -93,6 +93,19 @@ CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_tr
- Sets deterministic workspace configuration for CuBLAS operations
- **Note:** This will significantly reduce training performance but ensures exact reproducibility

Use `--debug.deterministic_warn_only` to only warn about (rather than error out on) kernels that lack a deterministic implementation.
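
A minimal sketch combining both flags (config file path as in the examples above):

```bash
CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh --debug.deterministic --debug.deterministic_warn_only
```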

### Activation Checkpointing Debugging

The following debug configs are available for activation checkpointing (AC); an example invocation is shown below.

`ac_preserve_rng_state` - set to true if deterministic output compared to non-checkpointed passes is required. This stashes and restores the RNG state during each checkpoint, which may be slower.

`ac_determinism_check` - a string specifying the determinism function used to verify recomputed tensors; valid values include `"default"` and `"none"`.

`ac_debug` - capture AC debug information for diagnosing checkpoint determinism issues. Will be slower.

See https://docs.pytorch.org/docs/stable/checkpoint.html for details.
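
For example, a debug run exercising these options might look like the following (a sketch; flag names mirror the fields of the `Debug` config added in this PR):

```bash
CONFIG_FILE="./torchtitan/models/llama3/train_configs/debug_model.toml" ./run_train.sh \
  --debug.ac_preserve_rng_state \
  --debug.ac_determinism_check "default" \
  --debug.ac_debug
```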

### Seed-Checkpoint-based Reproducibility

62 changes: 34 additions & 28 deletions tests/unit_tests/test_activation_checkpoint.py
@@ -11,6 +11,7 @@
from torch.utils.flop_counter import FlopCounterMode

from torchtitan.config.job_config import ActivationCheckpoint as ACConfig
from torchtitan.config.job_config import JobConfig
from torchtitan.distributed.activation_checkpoint import apply_ac


@@ -74,15 +75,16 @@ def get_bw_flops(model_fn):
# 2. SAC
# Per-op SAC's policy is to save every other mm
model_selective_ac = ToyModule()
ac_config_no_force = ACConfig(
job_config = JobConfig()
job_config.activation_checkpoint = ACConfig(
mode="selective",
selective_ac_option="op",
per_op_sac_force_recompute_mm_shapes_by_fqns=[], # Empty list
early_stop=False,
)
apply_ac(
model_selective_ac,
ac_config_no_force,
job_config,
model_compile_enabled=False,
use_flex_attn=False,
op_sac_save_list=_op_sac_save_list,
@@ -92,15 +94,15 @@ def get_bw_flops(model_fn):
# 3. Per-op SAC with force recompute "moe.router.gate"
# This leads to two mms being recomputed since they share the same shape!
model_with_force_first = ToyModule()
ac_config_with_force_first = ACConfig(
job_config.activation_checkpoint = ACConfig(
mode="selective",
selective_ac_option="op",
per_op_sac_force_recompute_mm_shapes_by_fqns=["moe.router.gate"],
early_stop=False,
)
apply_ac(
model_with_force_first,
ac_config_with_force_first,
job_config,
model_compile_enabled=False,
use_flex_attn=False,
op_sac_save_list=_op_sac_save_list,
@@ -109,15 +111,15 @@ def get_bw_flops(model_fn):

# 4. Per-op SAC with force recompute "output"
model_with_force_last = ToyModule()
ac_config_with_force_last = ACConfig(
job_config.activation_checkpoint = ACConfig(
mode="selective",
selective_ac_option="op",
per_op_sac_force_recompute_mm_shapes_by_fqns=["output"],
early_stop=False,
)
apply_ac(
model_with_force_last,
ac_config_with_force_last,
job_config,
model_compile_enabled=False,
use_flex_attn=False,
op_sac_save_list=_op_sac_save_list,
@@ -126,13 +128,13 @@ def get_bw_flops(model_fn):

# 5. Full AC
model_with_full_ac = ToyModule()
ac_config_full_ac = ACConfig(
job_config.activation_checkpoint = ACConfig(
mode="full",
early_stop=False,
)
apply_ac(
model_with_full_ac,
ac_config_full_ac,
job_config,
model_compile_enabled=False,
use_flex_attn=False,
op_sac_save_list=_op_sac_save_list,
@@ -168,14 +170,14 @@ def get_act_mem(model_fn):
# 2. SAC
# Per-op SAC's policy is to save every other mm
model_selective_ac = ToyModule().cuda()
ac_config_no_force = ACConfig(
job_config.activation_checkpoint = ACConfig(
mode="selective",
selective_ac_option="op",
per_op_sac_force_recompute_mm_shapes_by_fqns=[], # Empty list
)
apply_ac(
model_selective_ac,
ac_config_no_force,
job_config,
model_compile_enabled=False,
use_flex_attn=False,
op_sac_save_list=_op_sac_save_list,
@@ -185,14 +187,14 @@ def get_act_mem(model_fn):
# 3. Per-op SAC with force recompute "moe.router.gate"
# This leads to two mms being recomputed since they share the same shape!
model_with_force_first = ToyModule().cuda()
ac_config_with_force_first = ACConfig(
job_config.activation_checkpoint = ACConfig(
mode="selective",
selective_ac_option="op",
per_op_sac_force_recompute_mm_shapes_by_fqns=["moe.router.gate"],
)
apply_ac(
model_with_force_first,
ac_config_with_force_first,
job_config,
model_compile_enabled=False,
use_flex_attn=False,
op_sac_save_list=_op_sac_save_list,
@@ -201,14 +203,14 @@ def get_act_mem(model_fn):

# 4. Per-op SAC with force recompute "output"
model_with_force_last = ToyModule().cuda()
ac_config_with_force_last = ACConfig(
job_config.activation_checkpoint = ACConfig(
mode="selective",
selective_ac_option="op",
per_op_sac_force_recompute_mm_shapes_by_fqns=["output"],
)
apply_ac(
model_with_force_last,
ac_config_with_force_last,
job_config,
model_compile_enabled=False,
use_flex_attn=False,
op_sac_save_list=_op_sac_save_list,
@@ -217,12 +219,12 @@ def get_act_mem(model_fn):

# 5. Full AC
model_with_full_ac = ToyModule().cuda()
ac_config_full_ac = ACConfig(
job_config.activation_checkpoint = ACConfig(
mode="full",
)
apply_ac(
model_with_full_ac,
ac_config_full_ac,
job_config,
model_compile_enabled=False,
use_flex_attn=False,
op_sac_save_list=_op_sac_save_list,
@@ -243,40 +245,44 @@ def test_correctness(self):

model_selective_ac = ToyModule()
model_selective_ac.load_state_dict(model_no_ac.state_dict())
apply_ac(
model_selective_ac,
ACConfig(
job_config = JobConfig()
job_config.activation_checkpoint = ACConfig(
mode="selective",
selective_ac_option="op",
per_op_sac_force_recompute_mm_shapes_by_fqns=[],
),
)
apply_ac(
model_selective_ac,
job_config,
model_compile_enabled=False,
use_flex_attn=False,
op_sac_save_list=_op_sac_save_list,
)
model_force_first = ToyModule()
model_force_first.load_state_dict(model_no_ac.state_dict())
apply_ac(
model_force_first,
ACConfig(
job_config.activation_checkpoint = ACConfig(
mode="selective",
selective_ac_option="op",
per_op_sac_force_recompute_mm_shapes_by_fqns=["moe.router.gate"],
),
)
apply_ac(
model_force_first,
job_config,
model_compile_enabled=False,
use_flex_attn=False,
op_sac_save_list=_op_sac_save_list,
)

model_force_last = ToyModule()
model_force_last.load_state_dict(model_no_ac.state_dict())
apply_ac(
model_force_last,
ACConfig(
job_config.activation_checkpoint = ACConfig(
mode="selective",
selective_ac_option="op",
per_op_sac_force_recompute_mm_shapes_by_fqns=["output"],
),
)
apply_ac(
model_force_last,
job_config,
model_compile_enabled=False,
use_flex_attn=False,
op_sac_save_list=_op_sac_save_list,
2 changes: 2 additions & 0 deletions torchtitan/config/__init__.py
@@ -28,6 +28,7 @@
Quantize,
Training,
Validation,
Debug,
)
from .manager import ConfigManager

@@ -49,4 +50,5 @@
"Profiling",
"Training",
"Validation",
"Debug"
]
33 changes: 24 additions & 9 deletions torchtitan/config/job_config.py
@@ -253,15 +253,6 @@ class Training:
many temporary files.
"""

seed: int | None = None
"""Choose the base RNG seed used for training"""

deterministic: bool = False
"""Use deterministic algorithms wherever possible, may be slower"""

debug_moe_force_load_balance: bool = False
"""If True, we force each experts to get the same amount of tokens via round-robin. This option is for debugging usage only."""


@dataclass
class Parallelism:
@@ -880,6 +871,29 @@ def __post_init__(self):
), "validation steps must be positive or -1"


@dataclass
class Debug:
seed: int | None = None
"""Choose the base RNG seed used for training"""

deterministic: bool = False
"""Use deterministic algorithms wherever possible, may be slower"""

deterministic_warn_only: bool = False
"""Only warns about ops without deterministic implementations rather than erroring out """

ac_preserve_rng_state: bool = False
"""If deterministic output compared to non-checkpointed passes is required, set to true. Results in stashing and restoring the RNG state during each checkpoint, may be slower. See https://docs.pytorch.org/docs/stable/checkpoint.html for details."""

ac_determinism_check: str = "default"
"""A string specifying the determinism function. See https://docs.pytorch.org/docs/stable/checkpoint.html for details."""

ac_debug: bool = False
""" Capture ac debug information. Will be slower. See https://docs.pytorch.org/docs/stable/checkpoint.html for details."""

moe_force_load_balance: bool = False
"""If True, we force each experts to get the same amount of tokens via round-robin. This option is for debugging usage only."""

@dataclass
class JobConfig:
"""
@@ -905,6 +919,7 @@ class JobConfig:
fault_tolerance: FaultTolerance = field(default_factory=FaultTolerance)
experimental: Experimental = field(default_factory=Experimental)
validation: Validation = field(default_factory=Validation)
debug: Debug = field(default_factory=Debug)

def to_dict(self) -> dict[str, Any]:
return asdict(self)
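
For reference, a minimal sketch of how the new options could appear in a training TOML (the `[debug]` table mirrors the `Debug` dataclass above; values are illustrative):

```toml
[debug]
seed = 42
deterministic = true
deterministic_warn_only = true
ac_preserve_rng_state = true
ac_determinism_check = "default"
ac_debug = false
moe_force_load_balance = false
```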