@@ -53,11 +53,14 @@ def _sort_activations(
 ) -> jax.Array:
   """Sort activations by `sort_indices`.
 
-  If `use_custom_vjp` is True, then we use a custom backward pass that
+  If `use_custom_vjp=True`, then we use a custom backward pass that
   reverses the sort order. Specifically, this unsort operation is simply a sort
   with `jnp.argsort(sort_indices)` as the sort indices. This is only needed in
   the case where the compiler generates a less efficient backward pass op.
-
+
+  Note that `use_custom_vjp=True` assumes that `sort_indices` is a permutation
+  of `jnp.arange(inputs.shape[0])`.
+
   Args:
     inputs: `(tokens, ...)`-shaped array of input activations to sort.
     sort_indices: `(tokens,)`-shaped array containing the sort order.
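
For readers unfamiliar with the pattern the docstring describes, here is a minimal, self-contained sketch of a sort with a custom backward pass (illustrative only, not MaxText's `_sort_activations`; the function names are made up). It relies on the assumption stated in the new docstring note: `sort_indices` must be a permutation of `jnp.arange(inputs.shape[0])`, so the backward pass can invert the sort with `jnp.argsort(sort_indices)`.

# Sketch only: a sort whose VJP is an explicit unsort (a gather with the
# inverse permutation). Assumes `sort_indices` is a permutation of
# jnp.arange(inputs.shape[0]), per the docstring note above.
import jax
import jax.numpy as jnp

@jax.custom_vjp
def sort_rows(inputs, sort_indices):
  return jnp.take(inputs, sort_indices, axis=0)

def _sort_rows_fwd(inputs, sort_indices):
  # Keep only the indices as residuals for the backward pass.
  return jnp.take(inputs, sort_indices, axis=0), sort_indices

def _sort_rows_bwd(sort_indices, grad):
  # Unsorting is just sorting with the inverse permutation; None marks the
  # integer `sort_indices` argument as having no cotangent.
  return jnp.take(grad, jnp.argsort(sort_indices), axis=0), None

sort_rows.defvjp(_sort_rows_fwd, _sort_rows_bwd)

# Usage: cotangents are gathered back into the pre-sort row order.
x = jnp.arange(6.0).reshape(3, 2)
perm = jnp.array([2, 0, 1])
y, vjp_fn = jax.vjp(lambda v: sort_rows(v, perm), x)
(dx,) = vjp_fn(jnp.ones_like(y))

Keeping the backward pass a single gather is the point: the docstring says the custom rule is only needed when the compiler would otherwise derive a less efficient backward op for the sort.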
@@ -511,7 +514,7 @@ def apply_ffn_activation(self, layer_w0, layer_w1):
     layer_act = self.activation_fn(layer_w0)
     intermediate_layer = jnp.multiply(layer_act, layer_w1)
     return intermediate_layer.astype(self.dtype)
-
+
   def permute(self, inputs, gate_logits, pre_bias_logits, use_custom_sort_vjp=True, rngs=None):
     """Permute tokens to group by expert to fit gmm call."""
     # reshape inputs (batch, sequence, emb) to (batch * sequence, emb)
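
As an aside, the `apply_ffn_activation` context lines above implement the GLU-family gating common in transformer FFNs. A standalone sketch, with `jax.nn.silu` assumed as the activation (in MaxText, `self.activation_fn` is configurable, so the concrete choice here is an assumption):

# Sketch of the gated-FFN pattern from apply_ffn_activation above; jax.nn.silu
# is an assumption, since the real activation_fn comes from config.
import jax
import jax.numpy as jnp

def gated_ffn_activation(layer_w0, layer_w1, dtype=jnp.bfloat16):
  layer_act = jax.nn.silu(layer_w0)                       # nonlinear branch: act(x @ W0)
  intermediate_layer = jnp.multiply(layer_act, layer_w1)  # gate: act(x @ W0) * (x @ W1)
  return intermediate_layer.astype(dtype)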
@@ -528,12 +531,9 @@ def permute(self, inputs, gate_logits, pre_bias_logits, use_custom_sort_vjp=True
 
     flatten_selected_experts = jnp.ravel(selected_experts)
     sorted_selected_experts = jnp.argsort(flatten_selected_experts)
-    sorted_indices = sorted_selected_experts // self.num_experts_per_tok
     # sort inputs for number of selected experts
-    replicated_inputs_2d = jnp.reshape(
-        jnp.broadcast_to(inputs_2d[None, ...], (self.num_experts_per_tok, *inputs_2d.shape)),
-        (self.num_experts_per_tok * inputs_2d.shape[0], inputs_2d.shape[1]))
-    sorted_inputs = _sort_activations(replicated_inputs_2d, sorted_indices, use_custom_sort_vjp).astype(self.dtype)
+    replicated_inputs_2d = jnp.repeat(inputs_2d, self.num_experts_per_tok, axis=0)
+    sorted_inputs = _sort_activations(replicated_inputs_2d, sorted_selected_experts, use_custom_sort_vjp).astype(self.dtype)
 
     group_size = jnp.bincount(flatten_selected_experts, length=self.num_experts)
     # Return the experts for each sorted input.
     expert_indices = jnp.arange(self.num_experts)
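
To see why the two-line replacement is equivalent: `jnp.ravel(selected_experts)` is token-major (copy `i` of token `t` lands at row `t * num_experts_per_tok + i`), which is exactly the row layout `jnp.repeat` produces, so `sorted_selected_experts` can index the replicated rows directly. The old copy-major `broadcast_to`/`reshape` layout forced the extra `// num_experts_per_tok` step, whose output repeats values and hence is not a permutation; the new indices are a true permutation, which is what the `use_custom_vjp` docstring note above requires. A small standalone check (a sketch; the `(tokens, num_experts_per_tok)` shape of `selected_experts` is an assumption read off this diff):

# Numeric check that the jnp.repeat path reproduces the old
# broadcast_to/reshape path (standalone sketch, not MaxText code).
import jax.numpy as jnp

tokens, k, emb = 3, 2, 4
inputs_2d = jnp.arange(tokens * emb, dtype=jnp.float32).reshape(tokens, emb)
selected_experts = jnp.array([[1, 0], [2, 1], [0, 2]])  # (tokens, k) expert ids

flat = jnp.ravel(selected_experts)  # copy i of token t sits at row t*k + i
order = jnp.argsort(flat)           # permutation grouping rows by expert id

# Old path: copy-major replication, so the permutation had to be collapsed
# back to token indices with `// k` (which repeats values).
old_replicated = jnp.reshape(
    jnp.broadcast_to(inputs_2d[None, ...], (k, *inputs_2d.shape)),
    (k * inputs_2d.shape[0], inputs_2d.shape[1]))
old_sorted = jnp.take(old_replicated, order // k, axis=0)

# New path: token-major replication matches `flat`'s layout, so `order`
# indexes the replicated rows directly.
new_sorted = jnp.take(jnp.repeat(inputs_2d, k, axis=0), order, axis=0)

assert jnp.array_equal(old_sorted, new_sorted)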