Merged
Changes from all commits
Commits
25 commits
99cca4c
initial changes adding qk norm
CYHSM Oct 23, 2025
2f3598d
feat: added qk norm
CYHSM Oct 23, 2025
d918c87
added config files
CYHSM Oct 23, 2025
f182727
feat: added qk norm to attention block
CYHSM Oct 27, 2025
6a53a31
Merge: main into current branch
Oct 27, 2025
693c57d
fix: Fix spelling mistake
Oct 27, 2025
f48aa0c
Fix: Adapt configs to latest changes
Oct 27, 2025
941be96
fix: Reverse removal of RMSNorm as its imported in the test suite
CYHSM Oct 27, 2025
329beb0
Fix: Adapt configs to latest changes
Oct 27, 2025
f1152e5
fix: Compute peak memory correctly on cpu device
Oct 27, 2025
5ab3ecf
test: Require A100 GPUs for mfu test
Oct 27, 2025
4571ffe
test: Adapt tests to latest changes
Oct 27, 2025
e8e8126
test(parallelism): Adapted fsdp2+tp+tt test to recent changes.
BlueCrescent Oct 27, 2025
ce00ba3
test: add test for qk norm
CYHSM Oct 27, 2025
e692e39
Merge branch 'Modalities:main' into qk_norm
CYHSM Oct 28, 2025
61b4a2c
fixing rebase
CYHSM Oct 28, 2025
7ce6d50
fix: fix tests by making RMSNorm backward compatible
CYHSM Oct 28, 2025
28186f7
Update tests/fsdp2_parallelization/pipeline_parallelism/test_pp_fwd_b…
rrutmann Oct 28, 2025
f375c2a
Update src/modalities/models/parallelism/pipeline_parallelism.py
rrutmann Oct 28, 2025
0c4eee8
fix: merge pp branch into qk branch
CYHSM Oct 29, 2025
f3bfb58
added compiled model configs for throughput tests
CYHSM Oct 29, 2025
6c625cd
Merge remote-tracking branch 'upstream/main' into qk_norm
CYHSM Nov 11, 2025
f25e3d3
additional fixes after review
CYHSM Nov 11, 2025
36769ee
removed config files from PR
CYHSM Nov 11, 2025
5fed18e
fix: apply pre-commit
CYHSM Nov 11, 2025
3 changes: 2 additions & 1 deletion pyproject.toml
@@ -5,7 +5,8 @@ requires-python = ">=3.10,<3.13"
description = "Modalities, a PyTorch-native framework for distributed and reproducible foundation model training."
readme = "README.md"
dependencies = [
"numpy<2.0",
"numpy",
"torch",
"packaging",
"tqdm",
"pyyaml",
19 changes: 13 additions & 6 deletions src/modalities/models/components/layer_norms.py
@@ -11,16 +11,13 @@ class RMSLayerNorm(nn.Module):
def __init__(self, ndim: int, bias: bool = True, epsilon: float = 1e-5):
"""
Initializes a LayerNorm module.

Args:
ndim (int): The number of dimensions of the input tensor.
bias (bool, optional): If True, adds a learnable bias to the normalized tensor. Defaults to True.
epsilon (float, optional): A small value added to the denominator for numerical stability. Defaults to 1e-5.

Note:
Original paper: https://arxiv.org/pdf/1910.07467.pdf
Source code adopted from https://github.com/facebookresearch/llama/blob/a0a4da8b497c566403941ceec47c2512ecf9dd20/llama/model.py#L34C1-L77C36

Returns:
None
"""
@@ -41,13 +38,10 @@ def _norm(self, x: torch.Tensor) -> torch.Tensor:
def forward(self, x: torch.Tensor) -> torch.Tensor:
"""
Forward pass of the layer normalization module.

Args:
x (torch.Tensor): Input tensor.

Returns:
torch.Tensor: Output tensor after applying layer normalization.

"""
output = self._norm(x.float()).type_as(x)
if self.bias is None:
@@ -97,3 +91,16 @@ class RMSLayerNormConfig(BaseModel):
ndim: Annotated[int, Field(strict=True, ge=1)]
epsilon: Annotated[float, Field(gt=0, default=1e-6)]
bias: Annotated[bool, Field(strict=True, default=True)]


class PytorchRMSLayerNormConfig(BaseModel):
"""
Configuration class for PyTorch's nn.RMSNorm.

Args:
normalized_shape (int): The size of the last input dimension over which RMS normalization is applied.
eps (float, optional): Small value added to the denominator for numerical stability. Defaults to 1e-5.
"""

normalized_shape: Annotated[int, Field(strict=True, ge=1)]
eps: Annotated[float, Field(strict=True, gt=0, default=1e-5)]
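
For context, a minimal sketch (not part of the diff) of what this config carries: `PytorchRMSLayerNormConfig` holds the keyword arguments for `torch.nn.RMSNorm` (available in PyTorch ≥ 2.4), which the attention block below instantiates via `norm_type.value(**dict(config))`. The `head_dim` value here is a hypothetical per-head dimension.

```python
# Sketch only, assuming PyTorch >= 2.4 (which provides torch.nn.RMSNorm).
import torch
from torch import nn

head_dim = 64  # hypothetical per-head dimension
config_kwargs = {"normalized_shape": head_dim, "eps": 1e-5}  # PytorchRMSLayerNormConfig fields

qk_norm = nn.RMSNorm(**config_kwargs)

q = torch.randn(2, 8, 16, head_dim)  # (batch, n_heads, seq_len, head_dim)
q_normed = qk_norm(q)                # RMS-normalizes over the last dimension
assert q_normed.shape == q.shape
```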
32 changes: 30 additions & 2 deletions src/modalities/models/gpt2/gpt2_model.py
@@ -10,7 +10,12 @@

from modalities.config.lookup_enum import LookupEnum
from modalities.config.utils import convert_base_model_config_to_dict
from modalities.models.components.layer_norms import LayerNormConfig, RMSLayerNorm, RMSLayerNormConfig
from modalities.models.components.layer_norms import (
LayerNormConfig,
PytorchRMSLayerNormConfig,
RMSLayerNorm,
RMSLayerNormConfig,
)
from modalities.models.model import ActivationType, NNModel, SwiGLU
from modalities.util import parse_enum_by_name

@@ -33,15 +38,17 @@ class LayerNorms(LookupEnum):
Attributes:
RMSNorm: RMSLayerNorm class.
LayerNorm: nn.LayerNorm class.
PyTorchRMSNorm: nn.RMSNorm class.
"""

rms_norm = RMSLayerNorm
layer_norm = nn.LayerNorm
pytorch_rms_norm = nn.RMSNorm


class LayerNormWrapperConfig(BaseModel):
norm_type: LayerNorms
config: LayerNormConfig | RMSLayerNormConfig
config: PytorchRMSLayerNormConfig | RMSLayerNormConfig | LayerNormConfig


class PositionTypes(str, Enum):
@@ -292,6 +299,7 @@ def parse_sharding_strategy_by_name(cls, name):
config: RotaryTransformConfig | IdentityTransformConfig

qkv_transforms: list[QueryKeyValueTransformConfig]
qk_norm_config: Optional[LayerNormWrapperConfig] = None


class GPT2LLMConfig(BaseModel):
@@ -461,6 +469,23 @@ def __init__(
for transform_config in attention_config.qkv_transforms
)

# QK norm - helpful for models >1B parameters to stabilize training.
# Baseline logits without QK norm: (Q @ K^T) / sqrt(d_h),
# or in the geometric form of the dot product: (||q_i|| * ||k_j|| * cos(θ_ij)) / sqrt(d_h).
# To spread the logits further apart, the model must either scale up q or k
# or adjust the angle between them. QK norm constrains the norms of q and k,
# so the model mostly adjusts the angle, which stabilizes training.
if attention_config.qk_norm_config is not None:
self.q_norm = attention_config.qk_norm_config.norm_type.value(

Member:
Would this be a use case for using other norms for q and k? If not, I would hardcode RMSNorm here.

Contributor (Author):
LayerNorm can also be used for QK norm, so I would leave it as is. It also keeps the experiments consistent where we compared LayerNorm vs. RMSNorm, since all layers were switched to the chosen norm. Only for an experiment that uses LayerNorm everywhere except for the QK values would we need to change this.

**dict(attention_config.qk_norm_config.config)
)
self.k_norm = attention_config.qk_norm_config.norm_type.value(
**dict(attention_config.qk_norm_config.config)
)
else:
self.q_norm = None
self.k_norm = None

def projection(self, x: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Applies projections to the input tensor to get queries, keys, and values.
@@ -632,6 +657,9 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:

# q: (B, nh_q, T, hd), k: (B, nh_kv, T, hd), v: (B, nh_kv, T, hd)
q, k, v = CausalSelfAttention.execute_qkv_transforms(q, k, v, self.qkv_transforms, self.n_head_q)
if self.q_norm is not None and self.k_norm is not None:
q = self.q_norm(q)
k = self.k_norm(k)
y = CausalSelfAttention.execute_attention(q, k, v, self.dropout, self.attention_impl) # (B, T, nh_q, hd)
y = y.reshape(B, T, -1) # (B, T, n_embd), re-assemble all head outputs side by side
return self.resid_dropout(self.c_proj(y)) # (B, T, n_embd), output projection
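
To make the geometric argument in the comment above concrete, here is a self-contained sketch of the QK-norm idea (assuming PyTorch ≥ 2.4 for `nn.RMSNorm`; this is not the modalities code path, just the bare mechanism): q and k are normalized per head before the scaled dot product, so the attention logits are driven by the angle between q and k rather than by growing norms.

```python
# Standalone illustration of QK norm, not the modalities implementation.
import math

import torch
from torch import nn

B, n_heads, T, head_dim = 2, 4, 8, 16
q = torch.randn(B, n_heads, T, head_dim)
k = torch.randn(B, n_heads, T, head_dim)

q_norm = nn.RMSNorm(head_dim)
k_norm = nn.RMSNorm(head_dim)

# Baseline logits: (Q @ K^T) / sqrt(d_h); the norms of q and k are unconstrained.
logits_plain = q @ k.transpose(-2, -1) / math.sqrt(head_dim)

# With QK norm, ||q_i|| and ||k_j|| are fixed to roughly sqrt(head_dim),
# so the logit scale is bounded and mainly reflects cos(theta_ij).
logits_qk_norm = q_norm(q) @ k_norm(k).transpose(-2, -1) / math.sqrt(head_dim)

print(logits_plain.abs().max(), logits_qk_norm.abs().max())
```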
52 changes: 51 additions & 1 deletion tests/models/test_causal_self_attention.py
@@ -7,7 +7,13 @@
import pytest
import torch

from modalities.models.gpt2.gpt2_model import AttentionConfig, CausalSelfAttention
from modalities.models.gpt2.gpt2_model import (
AttentionConfig,
CausalSelfAttention,
LayerNorms,
LayerNormWrapperConfig,
PytorchRMSLayerNormConfig,
)

torch.manual_seed(0)

@@ -222,3 +228,47 @@ def test_attention_implementation_approximate_equality(
atol=2.5e-3, # default for bfloat16: 1e-5
rtol=0.016, # default for bfloat16: 0.016
)


@pytest.mark.skipif(torch.cuda.device_count() < 1, reason="This test requires 1 GPU.")
@pytest.mark.parametrize(
"n_head_q, n_head_kv, n_embd, attention_impl",
[
(4, 4, 32, "manual"),
(8, 2, 32, "manual"),
(4, 4, 32, "pytorch_flash"),
(8, 2, 32, "pytorch_flash"),
(4, 4, 32, "dao_flash"),
(8, 2, 32, "dao_flash"),
],
)
def test_qk_norm(n_head_q, n_head_kv, n_embd, attention_impl):
batch_size = 2
block_size = 10
head_dim = n_embd // n_head_q
embedding_shape = (batch_size, block_size - 1, n_embd)
embedded_input_seq = _get_random_input_seq(embedding_shape)

attention_config_no_norm = AttentionConfig(qkv_transforms=[], use_qk_norm=False)
attention_config_with_norm = AttentionConfig(
qkv_transforms=[],
use_qk_norm=True,
qk_norm_config=LayerNormWrapperConfig(
norm_type=LayerNorms.pytorch_rms_norm, config=PytorchRMSLayerNormConfig(normalized_shape=head_dim)
),
)

# Create two separate layers with same initial weights
torch.manual_seed(0)
layer_no_norm = _get_random_attention_layer(n_head_q, n_head_kv, n_embd, attention_impl, attention_config_no_norm)

torch.manual_seed(0)
layer_with_norm = _get_random_attention_layer(
n_head_q, n_head_kv, n_embd, attention_impl, attention_config_with_norm
)

output_no_norm = layer_no_norm(embedded_input_seq)
output_with_norm = layer_with_norm(embedded_input_seq)

assert output_no_norm.shape == output_with_norm.shape == embedding_shape
assert not torch.allclose(output_no_norm, output_with_norm, atol=1e-6)
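
As a usage note, a hedged sketch of how the config objects from this PR fit together when enabling QK norm; it mirrors the test above but omits the remaining `CausalSelfAttention` constructor arguments, which are not shown in this diff. `normalized_shape` should equal the per-head dimension (`n_embd // n_head_q`), since the norm is applied to q and k after the heads are split.

```python
# Sketch based on the config classes shown in this PR; constructor usage beyond
# the config objects is omitted.
from modalities.models.gpt2.gpt2_model import (
    AttentionConfig,
    LayerNorms,
    LayerNormWrapperConfig,
    PytorchRMSLayerNormConfig,
)

n_embd, n_head_q = 32, 4
head_dim = n_embd // n_head_q  # QK norm acts per head, over the last dimension

attention_config = AttentionConfig(
    qkv_transforms=[],
    qk_norm_config=LayerNormWrapperConfig(
        norm_type=LayerNorms.pytorch_rms_norm,
        config=PytorchRMSLayerNormConfig(normalized_shape=head_dim),
    ),
)
```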