test(moe): fix stale unit tests broken by lazy DeepEP buffer + packed-param requires_grad

Daniel Shen · claude · Daniel Shen · commit 9822d2e25ac1 · 2026-06-15T05:42:24.000Z
Two L0 unit tests had gone stale relative to code changes made earlier on this branch:

- test_grouped_experts_deepep_token_dispatcher_init asserted that init_token_dispatcher
  eagerly calls _init_deepep_buffer, but buffer allocation is now lazy (deferred to
  FusedDispatch.forward) as a result of the revert that fixed the single-node load-time
  OOM. The test now asserts that _init_deepep_buffer is NOT called.
- ExpertParallel._partition_fn now constructs nn.Parameter(..., requires_grad=...) so that
  non-floating-point packed mxfp4 params (int8 / e8m0) don't trip the default
  requires_grad=True. The test's stub Parameter did not accept or store requires_grad, so
  add it (this also unblocks the requires_grad-preservation test).

Both fixes verified: tests/unit_tests/moe now 450 passed, 0 failed.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: Daniel Shen <dshen@crusoe.ai>
diff --git a/tests/unit_tests/moe/test_experts.py b/tests/unit_tests/moe/test_experts.py
@@ -759,7 +759,10 @@ def test_grouped_experts_deepep_token_dispatcher_init(self, moe_config):
             assert hasattr(experts, "token_dispatcher")
             assert experts.ep_size == 2
             assert experts.ep_rank == 0
-            mock_init_buffer.assert_called_once_with(mock_mesh.get_group.return_value)
+            # The DeepEP NVSHMEM buffer is allocated lazily (in FusedDispatch.forward)
+            # rather than eagerly in init_token_dispatcher (the revert that fixed the
+            # single-node load-time OOM), so _init_deepep_buffer must NOT be called here.
+            mock_init_buffer.assert_not_called()
 
     def test_grouped_experts_deepep_apply_bias_no_bias(self, moe_config):
         """Test _apply_bias method with no bias."""
diff --git a/tests/unit_tests/moe/test_parallelizer.py b/tests/unit_tests/moe/test_parallelizer.py
@@ -84,8 +84,9 @@ def _install_torch_and_layers_stubs(monkeypatch):
     nn_stub = types.ModuleType("torch.nn")
 
     class Parameter:
-        def __init__(self, data=None):
+        def __init__(self, data=None, requires_grad=True):
             self.data = data
+            self.requires_grad = requires_grad
 
     class Module:
         pass