@@ -25,6 +25,7 @@ class ModelConfig:
     rmsnorm_epsilon: float = 1e-6
     use_residual_scaling: bool = True
     tie_embeddings: bool = True  # Whether to tie input and output embed
+    qknorm_epsilon: float = 1e-6
 
     dtype: jnp.dtype = jnp.float32
     attention_init: nn.initializers.Initializer = nn.initializers.normal(stddev=0.02)
@@ -116,6 +117,7 @@ def setup(self):
         cfg = self.cfg
         assert cfg.model_dim % cfg.num_heads == 0, f'D {cfg.model_dim} not divisible by H {cfg.num_heads}'
         self.Dh = cfg.model_dim // cfg.num_heads
+        self.eps = cfg.qknorm_epsilon
 
         # Initialize rotary embeddings
         self.freqs_cis = init_rope(cfg.model_dim, cfg.seq_len, cfg.num_heads)
@@ -129,10 +131,13 @@ def setup(self):
             use_bias=False,
             dtype=cfg.dtype,
         )
-
         self.multilinear_query = self.multilinear(name='query')
         self.multilinear_key = self.multilinear(name='key')
         self.multilinear_value = self.multilinear(name='value')
+        # See Henry et al. (2020) "Query-Key Normalization for Transformers"
+        seq_len = cfg.seq_len
+        attn_scale0 = jnp.log2(seq_len ** 2 - seq_len)
+        self.attn_scale = self.param('attn_scale', nn.initializers.constant(attn_scale0), ())
         self.output_projection = nn.DenseGeneral(
             features=cfg.model_dim,
             name='attn_out_proj',
@@ -153,8 +158,9 @@ def __call__(self, x_BxLxD: jax.Array):
         # Apply rotary embeddings to Q and K
         q_BxLxHxDh, k_BxLxHxDh = apply_rope(q_BxLxHxDh, k_BxLxHxDh, self.freqs_cis)
 
-        # Scale queries
-        q_BxLxHxDh /= self.Dh ** 0.5
+        # Apply QK normalization
+        q_BxLxHxDh /= jnp.linalg.norm(q_BxLxHxDh, axis=-1, keepdims=True) + self.eps
+        k_BxLxHxDh /= jnp.linalg.norm(k_BxLxHxDh, axis=-1, keepdims=True) + self.eps
 
         # Compute attention scores
         att_BxHxLxL = jnp.einsum('...qhd,...khd->...hqk', q_BxLxHxDh, k_BxLxHxDh)
@@ -166,6 +172,7 @@ def __call__(self, x_BxLxD: jax.Array):
         # Apply mask and softmax
         _NEG_INF = jnp.finfo(cfg.dtype).min
         att_BxHxLxL = jnp.where(mask_1x1xLxL, att_BxHxLxL, _NEG_INF)
+        att_BxHxLxL = self.attn_scale * att_BxHxLxL  # Learned scaling factor for QK norm
         att_BxHxLxL = jax.nn.softmax(att_BxHxLxL, axis=-1)
         att_BxHxLxL = att_BxHxLxL.astype(cfg.dtype)