Merge branch 'PR2411' into 26.04-alpha

mmarcinkiewicz · mmarcinkiewicz · commit 68d8bcc94cb5 · 2026-04-23T11:31:49.000+02:00
diff --git a/docs/training/cuda-graphs.md b/docs/training/cuda-graphs.md
@@ -162,6 +162,10 @@ Larger MoE runs can become memory-gated before graph replay pays off:
 - treat CUDA graphs as a throughput optimization for runs with margin, not as a
   fit-enabling technique
 
+## Optimizer CUDA graph
+
+The `OptimizerCudaGraphWrapper` is an experimental utility that enables CUDA graph capture of the Adam optimizer step. By encapsulating optimizer updates in a replayable CUDA graph, it can further reduce host launch overhead, especially when the optimizer incurs additional CPU overhead, such as the extra casts needed when using low-precision native parameters. Enable this feature by passing `optimizer_cuda_graph=True` to `OptimizerConfig`. Avoid using it if your optimizer step includes conditional logic or uses operations that are not graph-compatible, as these may cause capture or replay failures.
+
 ## Common Failure Modes
 
 - Missing TE RNG tracker settings causes an assertion before training starts.
diff --git a/src/megatron/bridge/training/train.py b/src/megatron/bridge/training/train.py
@@ -100,6 +100,13 @@
 from megatron.bridge.utils.common_utils import get_world_size_safe, print_rank_0
 
 
+# For Optimizer CUDA graph support
+try:
+    from megatron.core.optimizer.optimizer_cuda_graph import OptimizerCudaGraphWrapper
+
+    HAS_OPTIMIZER_CUDA_GRAPH = True
+except ImportError:
+    HAS_OPTIMIZER_CUDA_GRAPH = False
 # For Paged Stashing support
 try:
     from megatron.core.transformer.moe.paged_stash import PagedStashRunner
@@ -303,6 +310,12 @@ def train(
         forward_backward_func = FullCudaGraphWrapper(
             forward_backward_func, cuda_graph_warmup_steps=config.model.cuda_graph_warmup_steps
         )
+
+    if config.optimizer.optimizer_cuda_graph and HAS_OPTIMIZER_CUDA_GRAPH:
+        optimizer.step = OptimizerCudaGraphWrapper(
+            optimizer.step, cuda_graph_warmup_steps=config.model.cuda_graph_warmup_steps
+        )
+
     # Wrap model with PagedStashRunner when moe_expert_rank_capacity_factor padding is enabled.
     # PagedStashRunner is responsible for detecting overflow and re-running iteration in eager-mode without padding.
     if HAS_PAGED_STASHING and config.model.moe_expert_rank_capacity_factor is not None:
@@ -1581,6 +1594,11 @@ def _delete_cuda_graphs(cuda_graph_helper: TECudaGraphHelper):
     if "training" in FullCudaGraphWrapper.cuda_graph:
         del FullCudaGraphWrapper.cuda_graph["training"]
 
+    # Explicitly delete optimizer CUDA graph
+    if HAS_OPTIMIZER_CUDA_GRAPH and OptimizerCudaGraphWrapper.cuda_graph is not None:
+        del OptimizerCudaGraphWrapper.cuda_graph
+        OptimizerCudaGraphWrapper.cuda_graph = None
+
     # Cleanup CUDA graphs object for partial Cuda-graphs (implemented in TransformerEngine)
     if cuda_graph_helper is not None:
         cuda_graph_helper.delete_cuda_graphs()
diff --git a/tests/functional_tests/test_groups/recipes/test_llama_recipes_pretrain_cuda_graphs.py b/tests/functional_tests/test_groups/recipes/test_llama_recipes_pretrain_cuda_graphs.py
@@ -54,6 +54,17 @@
             "ddp": {"check_for_nan_in_grad": False},
         },
     ),
+    (
+        llama32_1b_config,
+        "llama32_1b",
+        {
+            "optimizer": {"optimizer_cuda_graph": True},
+            # Disable checkpoint saving; it is not currently supported with the optimizer CUDA graph.
+            "checkpoint": {"save": None},
+            "rerun_state_machine": {"check_for_nan_in_loss": False},
+            "ddp": {"check_for_nan_in_grad": False},
+        },
+    ),
 ]