Commit 66ac814

fix deepseek tp sharding error
1 parent 025a0f6 commit 66ac814

File tree

1 file changed: +11 -1 lines changed


src/MaxText/layers/moe.py

Lines changed: 11 additions & 1 deletion
@@ -29,7 +29,7 @@
 import jax.numpy as jnp
 import numpy as np
 
-from MaxText import common_types as ctypes
+from MaxText import common_types as ctypes, EP_AS_CONTEXT
 from MaxText import max_logging
 from MaxText import max_utils
 from MaxText.kernels import megablox as mblx
@@ -1833,8 +1833,18 @@ def routed_moe(self):
     return self.MoeBlock_0
 
   def __call__(self, inputs: jax.Array) -> jax.Array:
+    batch_logical_axes = (
+        "activation_batch_no_exp" if self.config.expert_shard_attention_option == EP_AS_CONTEXT
+        else "activation_batch"
+    )
+    seq_logical_axes = (
+        "activation_length" if self.config.expert_shard_attention_option == EP_AS_CONTEXT
+        else "activation_length_no_exp"
+    )
     routed_experts, _ = self.routed_moe(inputs)
+    routed_experts = nn.with_logical_constraint(routed_experts, (batch_logical_axes, seq_logical_axes, "activation_embed"))
     shared_experts = self.shared_experts(inputs)
+    shared_experts = nn.with_logical_constraint(shared_experts, (batch_logical_axes, seq_logical_axes, "activation_embed"))
     return routed_experts + shared_experts
 
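For reference, here is a minimal, self-contained sketch of the mechanism the fix relies on: Flax's nn.with_logical_constraint resolves logical activation axis names (such as "activation_batch" or "activation_embed") to physical mesh axes through a table of rules and then applies jax.lax.with_sharding_constraint. The mesh shape, the rules table, and the constrain helper below are illustrative assumptions, not MaxText's actual sharding configuration, which derives its axis rules from the training config.

import jax
import jax.numpy as jnp
import numpy as np
from flax import linen as nn
from jax.sharding import Mesh

# Hypothetical 1-D mesh over whatever devices are available
# (a single CPU device also works; the axis then has size 1).
mesh = Mesh(np.array(jax.devices()), axis_names=("tensor",))

# Hypothetical logical-to-mesh rules: shard the embedding axis over "tensor"
# and leave the batch/sequence axes unsharded (None).
rules = (
    ("activation_batch", None),
    ("activation_batch_no_exp", None),
    ("activation_length", None),
    ("activation_length_no_exp", None),
    ("activation_embed", "tensor"),
)

@jax.jit
def constrain(x):
  # Resolve the logical axis names to mesh axes via `rules`, then constrain
  # the array to the corresponding sharding on `mesh`.
  return nn.with_logical_constraint(
      x,
      ("activation_batch", "activation_length", "activation_embed"),
      rules=rules,
      mesh=mesh,
  )

x = jnp.ones((4, 8, 64))  # (batch, sequence, embed)
y = constrain(x)
print(y.shape, y.sharding)  # on a multi-device mesh, embed maps to "tensor"

In the diff itself, the switch between the plain and the *_no_exp axis names appears to track whether the expert-parallel mesh axis is folded into the sequence (context) dimension or the batch dimension, so that the routed and shared expert outputs carry matching shardings before they are summed.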