@@ -515,7 +515,7 @@ def apply_ffn_activation(self, layer_w0, layer_w1):
    intermediate_layer = jnp.multiply(layer_act, layer_w1)
    return intermediate_layer.astype(self.dtype)

-  def permute(self, inputs, gate_logits, pre_bias_logits, use_custom_sort_vjp=True, rngs=None):
+  def permute(self, inputs, gate_logits, pre_bias_logits, use_custom_sort_vjp=True, rngs=None, roll_to_expert_id=None):
    """Permute tokens to group by expert to fit gmm call."""
    # reshape inputs (batch, sequence, emb) to (batch * sequence, emb)
    inputs_shape = inputs.shape
@@ -530,6 +530,8 @@ def permute(self, inputs, gate_logits, pre_bias_logits, use_custom_sort_vjp=True
    inputs_2d = inputs_2d * router_scores.reshape(bsz_times_seq_len, -1)

    flatten_selected_experts = jnp.ravel(selected_experts)
+   if roll_to_expert_id is not None:
+     flatten_selected_experts = (flatten_selected_experts - roll_to_expert_id) % self.num_experts
    sorted_selected_experts = jnp.argsort(flatten_selected_experts)
    # sort inputs for number of selected experts
    replicated_inputs_2d = jnp.repeat(inputs_2d, self.num_experts_per_tok, axis=0)
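A minimal standalone sketch (not from this diff) of what the new `roll_to_expert_id` shift does inside `permute`: subtracting the shard's first expert id modulo `num_experts` relabels the shard-local experts as 0..k-1, so `argsort` places their tokens at the front of the sorted array. The concrete values below (`num_experts`, the token assignments) are assumptions for illustration only.

```python
# Illustration only; values are assumed, not taken from the diff.
import jax.numpy as jnp

num_experts = 8
roll_to_expert_id = 4  # first expert id owned by the current shard

# Flattened token-to-expert assignments, as in flatten_selected_experts.
flatten_selected_experts = jnp.array([6, 1, 4, 7, 0, 5])

# Shift so shard-local experts 4..7 become 0..3 and sort ahead of experts 0..3.
shifted = (flatten_selected_experts - roll_to_expert_id) % num_experts
order = jnp.argsort(shifted)

print(shifted)                          # [2 5 0 3 4 1]
print(flatten_selected_experts[order])  # [4 5 6 7 0 1]: shard-local experts first
```

Because the shift is a bijection on expert ids, group sizes computed from the shifted ids are a rotation of the per-expert counts, so the counts for this shard's experts end up in the first `num_experts_per_shard` entries.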
@@ -942,23 +944,17 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
      )

        # "Route" tokens within each shard.
-       x, sorted_selected_experts, weights, full_group_sizes, selected_experts = self.permute(
-           x, logits, pre_bias_logits, self.config.use_custom_sort_vjp)
+       num_experts_per_shard = self.config.num_experts // num_expert_parallelism
+       x, sorted_selected_experts, weights, group_sizes, selected_experts = self.permute(
+           x, logits, pre_bias_logits, self.config.use_custom_sort_vjp,
+           roll_to_expert_id=num_experts_per_shard * expert_shard_id)

        # Filter down to the group sizes that apply to only the experts in the
        # current shard.
-       full_group_sizes = jnp.reshape(full_group_sizes, (num_expert_parallelism, -1))
-       group_sizes = full_group_sizes[expert_shard_id]
-
-       # Move the tokens for the experts in the current shard to the start of
-       # the inputs array.
-       num_tokens_to_skip = (
-           jnp.cumsum(jnp.sum(full_group_sizes, axis=-1))[expert_shard_id] -
-           jnp.sum(full_group_sizes[expert_shard_id])
-       )
-       x = jnp.roll(x, shift=-num_tokens_to_skip, axis=0)
+       group_sizes = group_sizes[:num_experts_per_shard]
+       mask = jnp.arange(x.shape[0]) < jnp.sum(group_sizes)
+       x = jnp.where(mask[:, None], x, 0)
      else:
-       num_tokens_to_skip = None
        x, sorted_selected_experts, weights, group_sizes, selected_experts = self.permute(
            x, logits, pre_bias_logits, self.config.use_custom_sort_vjp, rngs)

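To make the shard-local filtering concrete, here is a small sketch (again with assumed shapes and values, not from the diff) of the new slice-and-mask step: because the permute call above already sorted this shard's tokens to the front, the shard keeps only the first `num_experts_per_shard` group sizes and zeroes every row past their sum instead of rolling the array.

```python
# Illustration only; shapes and values are assumed.
import jax.numpy as jnp

num_experts_per_shard = 2
# Per-(shifted)-expert token counts; entries 0..1 belong to this shard.
group_sizes = jnp.array([3, 2, 1, 2])
x = jnp.ones((8, 4))  # 8 routed rows (sum of group_sizes), sorted by shifted expert id

group_sizes = group_sizes[:num_experts_per_shard]     # [3 2] -> 5 local rows
mask = jnp.arange(x.shape[0]) < jnp.sum(group_sizes)  # True for rows 0..4
x = jnp.where(mask[:, None], x, 0)                    # rows 5..7 zeroed, shape unchanged
```

Zeroing rather than slicing keeps the array shape fixed, which presumably keeps the downstream gmm shapes static under jit; the same mask-and-zero pattern is applied to `intermediate_output` in the hunk below, which is why the roll back to the original positions is no longer needed there.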
@@ -1056,9 +1052,6 @@ def wrapper(x, logits, pre_bias_logits, w0, w1, wo, w0_bias, w1_bias, wo_bias, r
      mask = jnp.arange(intermediate_output.shape[0]) < jnp.sum(group_sizes)
      intermediate_output = jnp.where(mask[:, None], intermediate_output, 0)

-     # Move the tokens back to their original positions.
-     intermediate_output = jnp.roll(intermediate_output, shift=num_tokens_to_skip, axis=0)
-
      # Unsort and deduplicate the outputs locally.
      output = self.unpermute(
          intermediate_output,