import jax.numpy as jnp
from jax.ad_checkpoint import checkpoint_name
from jax.sharding import Mesh
- # from jax.experimental.pallas.ops.tpu import flash_attention

from flax import linen as nn
+ from flax import nnx

from MaxText.inference import page_manager
from MaxText.common_types import Config
- from MaxText.layers.linears import mlp_block
+ from MaxText.layers.linears import MlpBlock
+ from MaxText.layers import initializers
+ from MaxText.layers import nnx_wrappers
from MaxText.layers import quantizations
- from MaxText.layers.attentions import attention_as_linen
+ from MaxText.layers.attentions import Attention
from MaxText.layers.quantizations import AqtQuantization as Quant
- from MaxText.layers.normalizations import rms_norm
- from MaxText.common_types import MODEL_MODE_PREFILL
+ from MaxText.layers.normalizations import RMSNorm
+ from MaxText.common_types import MODEL_MODE_PREFILL, MODEL_MODE_AUTOREGRESSIVE

# -----------------------------------------
# The Decoder Layer specific for Llama2
# -----------------------------------------


- class LlamaDecoderLayer(nn.Module):
+ class LlamaDecoderLayer(nnx.Module):
  """Transformer decoder layer that attends to the encoder."""

-   config: Config
-   mesh: Mesh
-   model_mode: str
-   quant: None | Quant = None
+   def __init__(
+       self,
+       config: Config,
+       model_mode: str,
+       mesh: Mesh,
+       rngs: nnx.Rngs,
+       quant: None | Quant = None,
+   ):
+
+     self.config = config
+     self.mesh = mesh
+     self.quant = quant
+
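+     # NNX constructs submodules eagerly in __init__, so compute a dummy activation
+     # shape up front to size the attention module's query/key-value inputs.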
+     batch_size = 1 if model_mode == MODEL_MODE_PREFILL else config.micro_batch_size_to_train_on
+
+     if model_mode == MODEL_MODE_PREFILL:
+       seq_len = config.max_prefill_predict_length
+     elif model_mode == MODEL_MODE_AUTOREGRESSIVE:
+       seq_len = 1
+     else:
+       seq_len = config.max_target_length
+
+     dummy_inputs_shape = (batch_size, seq_len, config.emb_dim)
+
+     self.pre_self_attention_layer_norm = RMSNorm(
+         num_features=config.emb_dim,
+         dtype=config.dtype,
+         weight_dtype=config.weight_dtype,
+         kernel_axes=("norm",),
+         epsilon=config.normalization_layer_epsilon,
+         rngs=rngs,
+     )
+
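+     # Self-attention submodule; the cache/compute axis orders are parsed from
+     # comma-separated strings in the config.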
+     self.self_attention = Attention(
+         config=config,
+         num_query_heads=config.num_query_heads,
+         num_kv_heads=config.num_kv_heads,
+         head_dim=config.head_dim,
+         max_target_length=config.max_target_length,
+         max_prefill_predict_length=config.max_prefill_predict_length,
+         attention_kernel=config.attention,
+         inputs_q_shape=dummy_inputs_shape,
+         inputs_kv_shape=dummy_inputs_shape,
+         mesh=mesh,
+         dtype=config.dtype,
+         weight_dtype=config.weight_dtype,
+         dropout_rate=config.dropout_rate,
+         float32_qk_product=config.float32_qk_product,
+         float32_logits=config.float32_logits,
+         quant=self.quant,
+         kv_quant=quantizations.configure_kv_quant(config),
+         prefill_cache_axis_order=tuple(map(int, config.prefill_cache_axis_order.split(","))),
+         ar_cache_axis_order=tuple(map(int, config.ar_cache_axis_order.split(","))),
+         compute_axis_order=tuple(map(int, config.compute_axis_order.split(","))),
+         reshape_q=config.reshape_q,
+         use_ragged_attention=config.use_ragged_attention,
+         ragged_block_size=config.ragged_block_size,
+         model_mode=model_mode,
+         rngs=rngs,
+     )
+
+     self.post_self_attention_layer_norm = RMSNorm(
+         num_features=config.emb_dim,
+         dtype=config.dtype,
+         weight_dtype=config.weight_dtype,
+         kernel_axes=("norm",),
+         epsilon=config.normalization_layer_epsilon,
+         rngs=rngs,
+     )
+
+     self.mlp = MlpBlock(
+         in_features=config.emb_dim,
+         intermediate_dim=config.mlp_dim,
+         activations=config.mlp_activations,
+         intermediate_dropout_rate=config.dropout_rate,
+         dtype=config.dtype,
+         weight_dtype=config.weight_dtype,
+         config=config,
+         quant=self.quant,
+         model_mode=model_mode,
+         rngs=rngs,
+     )
+
+     self.dropout = nnx.Dropout(rate=config.dropout_rate, broadcast_dims=(-2,), rngs=rngs)
+
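+     # Logical axis names used to constrain activation sharding; prefill uses a
+     # distinct sequence-length axis name.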
+     if model_mode == MODEL_MODE_PREFILL:
+       self.activation_axis_names = ("activation_batch", "prefill_activation_norm_length", "activation_embed")
+     else:
+       self.activation_axis_names = ("activation_batch", "activation_norm_length", "activation_embed")
+

-   @nn.compact
  def __call__(
      self,
      inputs,
@@ -59,57 +146,15 @@ def __call__(
      previous_chunk=None,
  ):
    cfg = self.config
-     mesh = self.mesh

-     if model_mode == MODEL_MODE_PREFILL:
-       activation_axis_names = ("activation_batch", "prefill_activation_norm_length", "activation_embed")
-     else:
-       activation_axis_names = ("activation_batch", "activation_norm_length", "activation_embed")
-
-     inputs = nn.with_logical_constraint(inputs, activation_axis_names)
+     inputs = nn.with_logical_constraint(inputs, self.activation_axis_names)
    inputs = checkpoint_name(inputs, "decoder_layer_input")
-     lnx_rms = rms_norm(
-         num_features=inputs.shape[-1],
-         dtype=cfg.dtype,
-         weight_dtype=cfg.weight_dtype,
-         name="pre_self_attention_layer_norm",
-         kernel_axes=("norm",),
-         epsilon=cfg.normalization_layer_epsilon,
-     )
-     lnx = lnx_rms(inputs)
+     lnx = self.pre_self_attention_layer_norm(inputs)

-     lnx = nn.with_logical_constraint(lnx, activation_axis_names)
+     lnx = nn.with_logical_constraint(lnx, self.activation_axis_names)

    # Self-attention block
-     attention_layer = attention_as_linen(
-         config=cfg,
-         num_query_heads=cfg.num_query_heads,
-         num_kv_heads=cfg.num_kv_heads,
-         head_dim=cfg.head_dim,
-         max_target_length=cfg.max_target_length,
-         max_prefill_predict_length=cfg.max_prefill_predict_length,
-         attention_kernel=cfg.attention,
-         inputs_q_shape=lnx.shape,
-         inputs_kv_shape=lnx.shape,
-         mesh=mesh,
-         dtype=cfg.dtype,
-         weight_dtype=cfg.weight_dtype,
-         dropout_rate=cfg.dropout_rate,
-         name="self_attention",
-         float32_qk_product=cfg.float32_qk_product,
-         float32_logits=cfg.float32_logits,
-         quant=self.quant,
-         kv_quant=quantizations.configure_kv_quant(cfg),
-         prefill_cache_axis_order=tuple(map(int, cfg.prefill_cache_axis_order.split(","))),
-         ar_cache_axis_order=tuple(map(int, cfg.ar_cache_axis_order.split(","))),
-         compute_axis_order=tuple(map(int, cfg.compute_axis_order.split(","))),
-         reshape_q=cfg.reshape_q,
-         use_ragged_attention=cfg.use_ragged_attention,
-         ragged_block_size=cfg.ragged_block_size,
-         model_mode=model_mode,
-     )
-
-     attention_lnx = attention_layer(
+     attention_lnx = self.self_attention(
        lnx,
        lnx,
        decoder_positions,
@@ -121,40 +166,20 @@ def __call__(
        previous_chunk=previous_chunk,
    )

-     attention_lnx = nn.with_logical_constraint(attention_lnx, activation_axis_names)
+     attention_lnx = nn.with_logical_constraint(attention_lnx, self.activation_axis_names)
    intermediate_inputs = inputs + attention_lnx

    # Fully Connected
-     hidden_states = rms_norm(
-         num_features=intermediate_inputs.shape[-1],
-         dtype=cfg.dtype,
-         weight_dtype=cfg.weight_dtype,
-         name="post_self_attention_layer_norm",
-         kernel_axes=("norm",),
-         epsilon=cfg.normalization_layer_epsilon,
-     )(intermediate_inputs)
-     hidden_states = nn.with_logical_constraint(hidden_states, activation_axis_names)
+     hidden_states = self.post_self_attention_layer_norm(intermediate_inputs)
+     hidden_states = nn.with_logical_constraint(hidden_states, self.activation_axis_names)

    # MLP block.
-     mlp_lnx = mlp_block(
-         in_features=hidden_states.shape[-1],
-         intermediate_dim=cfg.mlp_dim,
-         activations=cfg.mlp_activations,
-         intermediate_dropout_rate=cfg.dropout_rate,
-         dtype=cfg.dtype,
-         weight_dtype=cfg.weight_dtype,
-         name="mlp",
-         config=cfg,
-         quant=self.quant,
-         model_mode=model_mode,
-     )(hidden_states, deterministic=deterministic)
-     mlp_lnx = nn.with_logical_constraint(mlp_lnx, activation_axis_names)
+     mlp_lnx = self.mlp(hidden_states, deterministic=deterministic)
+     mlp_lnx = nn.with_logical_constraint(mlp_lnx, self.activation_axis_names)

    layer_output = mlp_lnx + intermediate_inputs
-
-     layer_output = nn.Dropout(rate=cfg.dropout_rate, broadcast_dims=(-2,))(layer_output, deterministic=deterministic)
-
-     layer_output = nn.with_logical_constraint(layer_output, activation_axis_names)
+     layer_output = self.dropout(layer_output, deterministic=deterministic)
+     layer_output = nn.with_logical_constraint(layer_output, self.activation_axis_names)

    if cfg.record_internal_nn_metrics:
      self.sow("intermediates", "activation_mean", jnp.mean(layer_output))
@@ -169,3 +194,9 @@ def __call__(
      return layer_output, None
    else:
      return layer_output
+
+
+ LlamaDecoderLayerToLinen = nnx_wrappers.to_linen_class(
+     LlamaDecoderLayer,
+     base_metadata_fn=initializers.variable_to_logically_partitioned,
+ )
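# A minimal usage sketch (illustrative, not from the PR): constructing the NNX layer
# directly, assuming a MaxText `config` and a `jax.sharding.Mesh` named `mesh` are
# already available.
#
#   layer = LlamaDecoderLayer(
#       config=config,
#       model_mode=MODEL_MODE_PREFILL,
#       mesh=mesh,
#       rngs=nnx.Rngs(0),
#   )
#
# `LlamaDecoderLayerToLinen` is the Linen-compatible wrapper of the same layer, so code
# that still builds Linen module trees can presumably instantiate it with the same
# keyword arguments minus `rngs`.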