Skip to content

Commit 68d8bcc

Browse files
Merge branch 'PR2411' into 26.04-alpha
2 parents 090da65 + d21e0d4 commit 68d8bcc

3 files changed

Lines changed: 33 additions & 0 deletions

File tree

docs/training/cuda-graphs.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,10 @@ Larger MoE runs can become memory-gated before graph replay pays off:
162162
- treat CUDA graphs as a throughput optimization for runs with margin, not as a
163163
fit-enabling technique
164164

165+
## Optimizer CUDA graph
166+
167+
The `OptimizerCudaGraphWrapper` is an experimental utility that enables CUDA graph capture of the Adam optimizer step. By encapsulating optimizer updates into a replayable CUDA graph, it can further reduce host launch overhead, especially when the optimizer has additional CPU overhead, such as extra casts when using low-precision native parameters. This feature can be enabled by passing `optimizer_cuda_graph=True` to `OptimizerConfig`. Avoid using it if your optimizer step includes conditional logic or uses operations that are not graph-compatible, as these may cause capture or replay failures.
168+
165169
## Common Failure Modes
166170

167171
- Missing TE RNG tracker settings causes an assertion before training starts.

src/megatron/bridge/training/train.py

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,13 @@
100100
from megatron.bridge.utils.common_utils import get_world_size_safe, print_rank_0
101101

102102

103+
# For Optimizer CUDA graph support
104+
try:
105+
from megatron.core.optimizer.optimizer_cuda_graph import OptimizerCudaGraphWrapper
106+
107+
HAS_OPTIMIZER_CUDA_GRAPH = True
108+
except ImportError:
109+
HAS_OPTIMIZER_CUDA_GRAPH = False
103110
# For Paged Stashing support
104111
try:
105112
from megatron.core.transformer.moe.paged_stash import PagedStashRunner
@@ -303,6 +310,12 @@ def train(
303310
forward_backward_func = FullCudaGraphWrapper(
304311
forward_backward_func, cuda_graph_warmup_steps=config.model.cuda_graph_warmup_steps
305312
)
313+
314+
if config.optimizer.optimizer_cuda_graph and HAS_OPTIMIZER_CUDA_GRAPH:
315+
optimizer.step = OptimizerCudaGraphWrapper(
316+
optimizer.step, cuda_graph_warmup_steps=config.model.cuda_graph_warmup_steps
317+
)
318+
306319
# Wrap model with PagedStashRunner when moe_expert_rank_capacity_factor padding is enabled.
307320
# PagedStashRunner is responsible for detecting overflow and re-running iteration in eager-mode without padding.
308321
if HAS_PAGED_STASHING and config.model.moe_expert_rank_capacity_factor is not None:
@@ -1581,6 +1594,11 @@ def _delete_cuda_graphs(cuda_graph_helper: TECudaGraphHelper):
15811594
if "training" in FullCudaGraphWrapper.cuda_graph:
15821595
del FullCudaGraphWrapper.cuda_graph["training"]
15831596

1597+
# Explicitly delete optimizer CUDA graph
1598+
if HAS_OPTIMIZER_CUDA_GRAPH and OptimizerCudaGraphWrapper.cuda_graph is not None:
1599+
del OptimizerCudaGraphWrapper.cuda_graph
1600+
OptimizerCudaGraphWrapper.cuda_graph = None
1601+
15841602
# Cleanup CUDA graphs object for partial Cuda-graphs (implemented in TransformerEngine)
15851603
if cuda_graph_helper is not None:
15861604
cuda_graph_helper.delete_cuda_graphs()

tests/functional_tests/test_groups/recipes/test_llama_recipes_pretrain_cuda_graphs.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,17 @@
5454
"ddp": {"check_for_nan_in_grad": False},
5555
},
5656
),
57+
(
58+
llama32_1b_config,
59+
"llama32_1b",
60+
{
61+
"optimizer": {"optimizer_cuda_graph": True},
62+
# Disable checkpoint save as it's not supported currently with OptimizerCG.
63+
"checkpoint": {"save": None},
64+
"rerun_state_machine": {"check_for_nan_in_loss": False},
65+
"ddp": {"check_for_nan_in_grad": False},
66+
},
67+
),
5768
]
5869

5970

0 commit comments

Comments
 (0)