
Commit 44c51e3

Fix Adam subgroup inconsistency (#7982)
Fix CPUAdam same-step subgroup drift in ZeRO-3 (#7819)

This PR ports the fix from #7820 to the latest DeepSpeed version. It makes `Adam_Optimizer::IncrementStep` idempotent for repeated calls at the same logical step and avoids unnecessary recomputation when the step has not changed.

ZeRO-3/SuperOffload can invoke multiple subgroup updates within a single logical step on a shared native optimizer object. The previous logic mixed multiply and recompute paths, producing non-bit-identical bias-correction metadata across subgroup calls.

This change aligns the step-transition logic in both the CPU and XPU headers, clarifies first-step and non-sequential-step behavior, and prevents unnecessary work on repeated same-step updates. It also adds CPUAdam regression tests covering subgroup-style repeated same-step updates through both `step_subgroup()` and `step()` with parameter swapping.

Signed-off-by: st_bang <st.bang@dgist.ac.kr>
1 parent 077bff5 · commit 44c51e3

3 files changed

Lines changed: 72 additions & 10 deletions
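The drift described in the commit message comes from maintaining the bias-correction terms in two numerically different ways. The standalone sketch below (illustrative only, not part of this commit) shows that a running float product of beta1 and a fresh `std::pow` recomputation are not guaranteed to agree bit-for-bit, which is why subgroups that took different paths at the same logical step could diverge:

// Illustrative sketch: "multiply" path vs. "recompute" path for beta1^step.
#include <cmath>
#include <cstdio>

int main()
{
    const float beta1 = 0.9f;
    float running = 1.0f;  // beta1^step maintained by repeated multiplication
    for (int step = 1; step <= 20; ++step) {
        running *= beta1;                                // "multiply" path
        const float recomputed = std::pow(beta1, step);  // "recompute" path
        // Exact comparison is intentional: the point is bit-identity, not closeness.
        if (running != recomputed) {
            std::printf("step %2d: multiply=%.9g recompute=%.9g (differ)\n",
                        step, running, recomputed);
        }
    }
    return 0;
}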


csrc/includes/cpu_adam.h

Lines changed: 8 additions & 5 deletions
@@ -63,14 +63,17 @@ class Adam_Optimizer {
             _betta1_t = std::pow(_betta1, step);
             _betta2_t = std::pow(_betta2, step);
         } else {
-            _step++;
-            if (_step != step) {
+            if (step == _step + 1) {  // first optimizer step increase
+                _step++;
+                _betta1_t *= _betta1;
+                _betta2_t *= _betta2;
+            } else if (step ==
+                       _step) {  // no need to update step; beta1_t and beta2_t already updated
+                return;
+            } else {  // support step increase not equal to 1
                 _betta1_t = std::pow(_betta1, step);
                 _betta2_t = std::pow(_betta2, step);
                 _step = step;
-            } else {
-                _betta1_t *= _betta1;
-                _betta2_t *= _betta2;
             }
         }
     }
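For readers skimming the hunk, the new transition logic reduces to three cases. The condensed standalone restatement below is illustrative only: the header above also contains an outer branch (partially visible in the context lines) that fully recomputes the terms when the betas themselves change, and the member names here merely mirror the real class.

// Condensed restatement of the new step-transition logic, with a tiny check
// that a repeated same-step call is a no-op.
#include <cassert>
#include <cmath>
#include <cstddef>

struct StepState {
    size_t _step = 0;
    float _betta1 = 0.9f, _betta2 = 0.999f;
    float _betta1_t = 1.0f, _betta2_t = 1.0f;

    void IncrementStep(size_t step)
    {
        if (step == _step + 1) {  // sequential advance: cheap incremental update
            _step++;
            _betta1_t *= _betta1;
            _betta2_t *= _betta2;
        } else if (step == _step) {  // repeated call at the same step (another subgroup): no-op
            return;
        } else {  // non-sequential step: recompute from scratch
            _betta1_t = std::pow(_betta1, step);
            _betta2_t = std::pow(_betta2, step);
            _step = step;
        }
    }
};

int main()
{
    StepState s;
    s.IncrementStep(1);  // first subgroup at logical step 1
    const float b1 = s._betta1_t, b2 = s._betta2_t;
    s.IncrementStep(1);  // second subgroup at the same logical step
    assert(s._step == 1 && s._betta1_t == b1 && s._betta2_t == b2);
    return 0;
}

Because the second call with an unchanged step returns immediately, every subgroup update within one logical step observes identical bias-correction terms, which is the invariant the new tests assert.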

csrc/xpu/includes/cpu_adam.h

Lines changed: 8 additions & 5 deletions
@@ -69,14 +69,17 @@ class Adam_Optimizer {
             _betta1_t = std::pow(_betta1, step);
             _betta2_t = std::pow(_betta2, step);
         } else {
-            _step++;
-            if (_step != step) {
+            if (step == _step + 1) {  // first optimizer step increase
+                _step++;
+                _betta1_t *= _betta1;
+                _betta2_t *= _betta2;
+            } else if (step ==
+                       _step) {  // no need to update step; beta1_t and beta2_t already updated
+                return;
+            } else {  // support step increase not equal to 1
                 _betta1_t = std::pow(_betta1, step);
                 _betta2_t = std::pow(_betta2, step);
                 _step = step;
-            } else {
-                _betta1_t *= _betta1;
-                _betta2_t *= _betta2;
             }
         }
     }

tests/unit/ops/adam/test_cpu_adam.py

Lines changed: 56 additions & 0 deletions
@@ -312,3 +312,59 @@ def test_multiple_subgroups(self):
         optimizer.rollback_subgroup(0)
         assert optimizer.state[0]['step'] == 1, "Subgroup 0 step count should be decremented"
         assert optimizer.state[1]['step'] == 1, "Subgroup 1 step count should be unchanged"
+
+    def test_step_subgroup_same_step_idempotent_across_subgroups(self):
+        """Repeated same-step subgroup updates should remain bit-identical."""
+        from deepspeed.ops.adam import DeepSpeedCPUAdam
+
+        model_size = 128
+        steps = 4
+        base = torch.randn(model_size, device='cpu', dtype=torch.float32)
+        param_a = torch.nn.Parameter(base.clone())
+        param_b = torch.nn.Parameter(base.clone())
+
+        optimizer = DeepSpeedCPUAdam([param_a])
+        for logical_step in range(1, steps + 1):
+            grad = torch.randn(model_size, device='cpu', dtype=torch.float32)
+
+            optimizer.param_groups[0]['params'] = [param_a]
+            param_a.grad = grad.clone()
+            optimizer.step_subgroup(0)
+
+            optimizer.param_groups[0]['params'] = [param_b]
+            param_b.grad = grad.clone()
+            optimizer.step_subgroup(1)
+
+            assert optimizer.state[0]['step'] == logical_step
+            assert optimizer.state[1]['step'] == logical_step
+            assert torch.equal(param_a.data, param_b.data)
+            assert torch.equal(optimizer.state[0]['exp_avg'], optimizer.state[1]['exp_avg'])
+            assert torch.equal(optimizer.state[0]['exp_avg_sq'], optimizer.state[1]['exp_avg_sq'])
+
+    def test_step_same_step_idempotent_across_param_keys(self):
+        """Repeated optimizer.step() with swapped param keys should be deterministic."""
+        from deepspeed.ops.adam import DeepSpeedCPUAdam
+
+        model_size = 128
+        steps = 4
+        base = torch.randn(model_size, device='cpu', dtype=torch.float32)
+        param_a = torch.nn.Parameter(base.clone())
+        param_b = torch.nn.Parameter(base.clone())
+
+        optimizer = DeepSpeedCPUAdam([param_a])
+        for logical_step in range(1, steps + 1):
+            grad = torch.randn(model_size, device='cpu', dtype=torch.float32)
+
+            optimizer.param_groups[0]['params'] = [param_a]
+            param_a.grad = grad.clone()
+            optimizer.step()
+
+            optimizer.param_groups[0]['params'] = [param_b]
+            param_b.grad = grad.clone()
+            optimizer.step()
+
+            assert optimizer.state[param_a]['step'] == logical_step
+            assert optimizer.state[param_b]['step'] == logical_step
+            assert torch.equal(param_a.data, param_b.data)
+            assert torch.equal(optimizer.state[param_a]['exp_avg'], optimizer.state[param_b]['exp_avg'])
+            assert torch.equal(optimizer.state[param_a]['exp_avg_sq'], optimizer.state[param_b]['exp_avg_sq'])
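For reference, these regression tests can typically be run from a DeepSpeed source checkout with pytest, e.g. `pytest tests/unit/ops/adam/test_cpu_adam.py -k idempotent` (assuming an environment where the CPUAdam extension can be built or is prebuilt; the exact invocation is not specified in this commit).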
