40 commits

317501e  Replace cf.rank==0 with utils.distributed.is_root (Jul 16, 2025)
77de417  replace cf.rank==0 with weathergen.utils.distributed.is_root (Jul 16, 2025)
6439618  Merge branch 'ecmwf:develop' into develop (csjfwang, Jul 22, 2025)
8993875  Merge branch 'ecmwf:develop' into develop (csjfwang, Jul 25, 2025)
f4a9d85  Merge branch 'ecmwf:develop' into develop (csjfwang, Jul 28, 2025)
f8fdef4  Merge branch 'ecmwf:develop' into develop (csjfwang, Jul 29, 2025)
ca89e7b  Merge branch 'ecmwf:develop' into develop (csjfwang, Jul 30, 2025)
49d7a4d  Merge branch 'ecmwf:develop' into develop (csjfwang, Jul 31, 2025)
f39f094  Merge branch 'ecmwf:develop' into develop (csjfwang, Jul 31, 2025)
ebb03ea  Merge branch 'ecmwf:develop' into develop (csjfwang, Aug 25, 2025)
f40737d  Merge branch 'ecmwf:develop' into develop (csjfwang, Aug 28, 2025)
87fa078  Merge branch 'ecmwf:develop' into develop (csjfwang, Sep 10, 2025)
5dfe275  Merge branch 'ecmwf:develop' into develop (csjfwang, Sep 19, 2025)
b7244d9  Merge branch 'ecmwf:develop' into develop (csjfwang, Sep 22, 2025)
5be41f5  Merge branch 'ecmwf:develop' into develop (csjfwang, Sep 22, 2025)
39d3965  Merge branch 'ecmwf:develop' into develop (csjfwang, Sep 23, 2025)
015ec88  Merge branch 'ecmwf:develop' into develop (csjfwang, Sep 24, 2025)
cb1b7cc  Merge branch 'ecmwf:develop' into develop (csjfwang, Oct 1, 2025)
90da4cf  Merge branch 'ecmwf:develop' into develop (csjfwang, Oct 20, 2025)
f04891b  Merge branch 'ecmwf:develop' into develop (csjfwang, Oct 21, 2025)
105d992  Merge branch 'ecmwf:develop' into develop (csjfwang, Oct 24, 2025)
5f56073  Merge branch 'ecmwf:develop' into develop (csjfwang, Oct 26, 2025)
95ee18a  Merge branch 'ecmwf:develop' into develop (csjfwang, Nov 3, 2025)
3c702d3  Merge branch 'ecmwf:develop' into develop (csjfwang, Nov 10, 2025)
6f14a30  Merge branch 'ecmwf:develop' into develop (csjfwang, Nov 13, 2025)
5e87881  Merge branch 'ecmwf:develop' into develop (csjfwang, Nov 14, 2025)
0c7d305  Merge branch 'ecmwf:develop' into develop (csjfwang, Nov 24, 2025)
e43ac94  Merge branch 'ecmwf:develop' into develop (csjfwang, Nov 25, 2025)
5f63bcc  Merge branch 'ecmwf:develop' into develop (csjfwang, Nov 26, 2025)
c51eb94  Merge branch 'ecmwf:develop' into develop (csjfwang, Nov 26, 2025)
dd5acc2  Merge branch 'ecmwf:develop' into develop (csjfwang, Nov 26, 2025)
f03672d  Merge branch 'ecmwf:develop' into develop (csjfwang, Nov 27, 2025)
49c52e1  Merge branch 'ecmwf:develop' into develop (csjfwang, Nov 28, 2025)
c6356a2  Merge branch 'ecmwf:develop' into develop (csjfwang, Dec 1, 2025)
36c709a  Merge branch 'ecmwf:develop' into develop (csjfwang, Dec 1, 2025)
b230904  2D RoPE on Query Aggregation & Global Assimilation (Dec 1, 2025)
7f0f2a0  fix compute_mixed_cis multi-GPU training (Dec 1, 2025)
a2f5f12  2D RoPE for GlobalAssimilation only (Dec 2, 2025)
36c14e1  add QueryAggregationEngine (Dec 3, 2025)
622c342  remove 2D RoPE for forecast engine (Dec 3, 2025)
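The substance of the PR is in the last few commits: 2D rotary position embeddings (RoPE) are added to the query-aggregation and global-assimilation attention blocks, with a fix for multi-GPU use of compute_mixed_cis. For orientation, the sketch below shows the basic mechanics of 2D RoPE: query/key feature pairs are rotated by angles derived from a token's 2D coordinates. It is a minimal, self-contained illustration with assumed helper names and shapes, not the weathergen implementation, which builds on init_random_2d_freqs, compute_mixed_cis and apply_rotary_emb from weathergen.model.positional_encoding as the diff below shows.

```python
# Minimal 2D RoPE sketch (illustrative assumptions; not the weathergen code).
import torch


def rope_2d_angles(coords: torch.Tensor, dim_head: int, theta: float = 10.0) -> torch.Tensor:
    """coords: (n, 2) positions -> (n, dim_head // 2) rotation angles (half per axis)."""
    assert dim_head % 4 == 0, "dim_head must split into (x, y) rotation pairs"
    n_pairs_per_axis = dim_head // 4
    freqs = 1.0 / (theta ** (torch.arange(n_pairs_per_axis) / n_pairs_per_axis))
    ang_x = coords[:, 0:1] * freqs  # angles driven by the first coordinate
    ang_y = coords[:, 1:2] * freqs  # angles driven by the second coordinate
    return torch.cat([ang_x, ang_y], dim=-1)


def apply_rope_2d(x: torch.Tensor, angles: torch.Tensor) -> torch.Tensor:
    """x: (n, dim_head) queries or keys; rotate consecutive feature pairs by `angles`."""
    x_pairs = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
    rotation = torch.polar(torch.ones_like(angles), angles)  # unit complex numbers e^{i*angle}
    return torch.view_as_real(x_pairs * rotation).flatten(-2).to(x.dtype)


# Rotating both queries and keys by the same position-dependent phases makes the
# attention logits depend on relative 2D offsets between tokens.
q, k = torch.randn(16, 64), torch.randn(16, 64)
coords = torch.rand(16, 2)  # e.g. normalized lat/lon or local grid coordinates
angles = rope_2d_angles(coords, dim_head=64)
q_rot, k_rot = apply_rope_2d(q, angles), apply_rope_2d(k, angles)
```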
12 changes: 6 additions & 6 deletions config/default_config.yml
@@ -50,12 +50,12 @@ pred_mlp_adaln: True

# number of steps offset applied to first target window; if set to zero and forecast_steps=0 then
# one is training an auto-encoder
-forecast_offset : 0
+forecast_offset : 1
forecast_delta_hrs: 0
-forecast_steps: 0
-forecast_policy: null
+forecast_steps: 2
+forecast_policy: fixed
forecast_att_dense_rate: 1.0
-fe_num_blocks: 0
+fe_num_blocks: 8
fe_num_heads: 16
fe_dropout_rate: 0.1
fe_with_qk_lnorm: True
@@ -93,7 +93,7 @@ ema_halflife_in_thousands: 1e-3

# training mode: "forecast" or "masking" (masked token modeling)
# for "masking" to train with auto-encoder mode, forecast_offset should be 0
-training_mode: "masking"
+training_mode: "forecast"
training_mode_config: {"losses": {LossPhysical: {weight: 0.7, loss_fcts: [['mse', 0.8], ['mae', 0.2]]},}
}
# training_mode_config: {"loss": {LossPhysical: [['mse', 0.7]],
@@ -121,7 +121,7 @@ masking_strategy_config: {"strategies": ["random", "healpix", "channel"],
"same_strategy_per_batch": false
}

-num_mini_epochs: 32
+num_mini_epochs: 16
samples_per_mini_epoch: 4096
samples_per_validation: 512

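Net effect of the config changes: the defaults move from masked, auto-encoder style training (forecast_offset 0, forecast_steps 0, no forecast-engine blocks) to forecast training (offset 1, two forecast steps with a fixed policy, 8 forecast-engine blocks, half the mini-epochs). As a rough illustration of the comment quoted in the diff, here is a hedged sketch of how forecast_offset and forecast_steps could map to target windows; the exact windowing convention is an assumption, not read out of the weathergen data loader.

```python
# Illustrative sketch only: a plausible mapping from forecast_offset / forecast_steps to
# target-window indices relative to an input window t. The exact convention (e.g. whether
# forecast_steps counts windows inclusively) is an assumption, not the weathergen logic.
def target_window_indices(t: int, forecast_offset: int, forecast_steps: int) -> list[int]:
    if forecast_offset == 0 and forecast_steps == 0:
        # per the config comment above, this combination corresponds to auto-encoder training
        return [t]
    return [t + forecast_offset + s for s in range(forecast_steps + 1)]

print(target_window_indices(5, 0, 0))  # old defaults (masking): [5]
print(target_window_indices(5, 1, 2))  # new defaults (forecast): [6, 7, 8]
```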
73 changes: 70 additions & 3 deletions src/weathergen/model/attention.py
@@ -14,6 +14,25 @@
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

from weathergen.model.norms import AdaLayerNorm, RMSNorm
+from weathergen.model.positional_encoding import (
+apply_rotary_emb,
+compute_mixed_cis,
+init_random_2d_freqs,
+)


+def _maybe_init_rope(dim_head: int, num_heads: int, theta: float = 10.0, rotate: bool = True):
+dim_total = dim_head * num_heads
+return init_random_2d_freqs(dim_total, num_heads, theta=theta, rotate=rotate)


+def _compute_rope(freqs: torch.Tensor, coords: torch.Tensor, num_heads: int) -> torch.Tensor:
+coords = coords.to(freqs.device)
+coords_flat = coords.reshape(-1, coords.shape[-1])
+freqs_cis = compute_mixed_cis(freqs, coords_flat[:, 0], coords_flat[:, 1], num_heads)
+freqs_cis = torch.diagonal(freqs_cis, dim1=0, dim2=1).permute(1, 0, 2)
+freqs_cis = freqs_cis.reshape(*coords.shape[:-1], num_heads, -1)
+return freqs_cis


class MultiSelfAttentionHeadVarlen(torch.nn.Module):
@@ -197,13 +216,16 @@ def __init__(
dim_aux=None,
norm_eps=1e-5,
attention_dtype=torch.bfloat16,
+with_rope=False,
+rope_theta=10.0,
):
super(MultiSelfAttentionHeadLocal, self).__init__()

self.num_heads = num_heads
self.with_flash = with_flash
self.softcap = softcap
self.with_residual = with_residual
+self.with_rope = with_rope

assert dim_embed % num_heads == 0
self.dim_head_proj = dim_embed // num_heads if dim_head_proj is None else dim_head_proj
@@ -231,6 +253,14 @@ def __init__(

self.dtype = attention_dtype
assert with_flash, "Only flash attention supported."
+if self.with_rope:
+self.register_buffer(
+"rope_freqs",
+_maybe_init_rope(self.dim_head_proj, self.num_heads, theta=rope_theta),
+persistent=False,
+)
+else:
+self.rope_freqs = None

# define block mask
def mask_block_local(batch, head, idx_q, idx_kv):
@@ -242,7 +272,7 @@ def mask_block_local(batch, head, idx_q, idx_kv):
# compile for efficiency
self.flex_attention = torch.compile(flex_attention, dynamic=False)

-def forward(self, x, ada_ln_aux=None):
+def forward(self, x, ada_ln_aux=None, rope_coords=None):
if self.with_residual:
x_in = x
x = self.lnorm(x) if ada_ln_aux is None else self.lnorm(x, ada_ln_aux)
@@ -253,6 +283,10 @@ def forward(self, x, ada_ln_aux=None):
ks = self.lnorm_k(self.proj_heads_k(x).reshape(s)).to(self.dtype).permute([0, 2, 1, 3])
vs = self.proj_heads_v(x).reshape(s).permute([0, 2, 1, 3])

+if self.with_rope and rope_coords is not None:
+freqs = _compute_rope(self.rope_freqs, rope_coords, self.num_heads)
+qs, ks = apply_rotary_emb(qs, ks, freqs)

outs = self.flex_attention(qs, ks, vs, block_mask=self.block_mask).transpose(1, 2)

out = self.proj_out(self.dropout(outs.flatten(-2, -1)))
@@ -378,6 +412,8 @@ def __init__(
dim_aux=None,
norm_eps=1e-5,
attention_dtype=torch.bfloat16,
+with_rope=False,
+rope_theta=10.0,
):
super(MultiCrossAttentionHeadVarlenSlicedQ, self).__init__()

@@ -387,6 +423,7 @@ def __init__(
self.with_residual = with_residual
self.with_flash = with_flash
self.softcap = softcap
+self.with_rope = with_rope

if norm_type == "LayerNorm":
norm = partial(torch.nn.LayerNorm, elementwise_affine=False, eps=norm_eps)
@@ -426,8 +463,16 @@

self.dtype = attention_dtype
assert with_flash, "Only flash attention supported at the moment"
+if self.with_rope:
+self.register_buffer(
+"rope_freqs",
+_maybe_init_rope(self.dim_head_proj, self.num_heads, theta=rope_theta),
+persistent=False,
+)
+else:
+self.rope_freqs = None

-def forward(self, x_q, x_kv, x_q_lens=None, x_kv_lens=None, ada_ln_aux=None):
+def forward(self, x_q, x_kv, x_q_lens=None, x_kv_lens=None, ada_ln_aux=None, rope_coords=None):
if self.with_residual:
x_q_in = x_q
x_q = self.lnorm_in_q(x_q) if ada_ln_aux is None else self.lnorm_in_q(x_q, ada_ln_aux)
@@ -444,6 +489,13 @@ def forward(self, x_q, x_kv, x_q_lens=None, x_kv_lens=None, ada_ln_aux=None):
ks = self.lnorm_k(self.proj_heads_k(x_kv).reshape(s)).to(self.dtype)
vs = self.proj_heads_v(x_kv).reshape(s)

+if self.with_rope and rope_coords is not None:
+freqs = _compute_rope(self.rope_freqs, rope_coords, self.num_heads)
+qs = [
+apply_rotary_emb(q_i, q_i, freqs[:, idx].contiguous())[0]
+for idx, q_i in enumerate(qs)
+]

# set dropout rate according to training/eval mode as required by flash_attn
dropout_rate = self.dropout_rate if self.training else 0.0

@@ -487,6 +539,8 @@ def __init__(
dim_aux=None,
norm_eps=1e-5,
attention_dtype=torch.bfloat16,
+with_rope=False,
+rope_theta=10.0,
):
super(MultiSelfAttentionHead, self).__init__()

@@ -495,6 +549,7 @@
self.softcap = softcap
self.dropout_rate = dropout_rate
self.with_residual = with_residual
+self.with_rope = with_rope

assert dim_embed % num_heads == 0
self.dim_head_proj = dim_embed // num_heads if dim_head_proj is None else dim_head_proj
@@ -526,8 +581,16 @@
else:
self.att = self.attention
self.softmax = torch.nn.Softmax(dim=-1)
+if self.with_rope:
+self.register_buffer(
+"rope_freqs",
+_maybe_init_rope(self.dim_head_proj, self.num_heads, theta=rope_theta),
+persistent=False,
+)
+else:
+self.rope_freqs = None

-def forward(self, x, ada_ln_aux=None):
+def forward(self, x, ada_ln_aux=None, rope_coords=None):
if self.with_residual:
x_in = x
x = self.lnorm(x) if ada_ln_aux is None else self.lnorm(x, ada_ln_aux)
@@ -539,6 +602,10 @@ def forward(self, x, ada_ln_aux=None):
ks = self.lnorm_k(self.proj_heads_k(x).reshape(s)).to(self.dtype)
vs = self.proj_heads_v(x).reshape(s).to(self.dtype)

+if self.with_rope and rope_coords is not None:
+freqs = _compute_rope(self.rope_freqs, rope_coords, self.num_heads)
+qs, ks = apply_rotary_emb(qs, ks, freqs)

# set dropout rate according to training/eval mode as required by flash_attn
dropout_rate = self.dropout_rate if self.training else 0.0

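A side note on the buffers added above: rope_freqs is registered with persistent=False, so it follows the module across devices like any buffer but is neither written to nor expected in the state_dict. The snippet below is a self-contained illustration of that behaviour; reading the choice as checkpoint compatibility for models built with with_rope=True is an interpretation, not a motivation stated in the PR.

```python
# Demonstrates persistent=False buffer semantics with plain PyTorch; WithFreqs is a
# stand-in module, not a weathergen class.
import torch


class WithFreqs(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # analogous to the rope_freqs registration in the diff above
        self.register_buffer("rope_freqs", torch.randn(4, 8), persistent=False)


m = WithFreqs()
print("rope_freqs" in m.state_dict())  # False: non-persistent buffers are not serialized
m.load_state_dict({}, strict=True)     # an "old" state_dict without rope_freqs still loads
print(m.rope_freqs.shape)              # the buffer itself exists and moves with .to()/.cuda()
```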
42 changes: 37 additions & 5 deletions src/weathergen/model/engines.py
@@ -225,7 +225,9 @@ def __init__(self, cf: Config) -> None:
)
)

-def forward(self, tokens_c, tokens_global_c, q_cells_lens_c, cell_lens_c, use_reentrant):
+def forward(
+self, tokens_c, tokens_global_c, q_cells_lens_c, cell_lens_c, use_reentrant
+):
for block in self.ae_adapter:
tokens_global_c = checkpoint(
block,
@@ -273,6 +275,7 @@ def __init__(self, cf: Config, num_healpix_cells: int) -> None:
norm_type=self.cf.norm_type,
norm_eps=self.cf.norm_eps,
attention_dtype=get_dtype(self.cf.attention_dtype),
+with_rope=True,
)
)
else:
@@ -288,6 +291,7 @@ def __init__(self, cf: Config, num_healpix_cells: int) -> None:
norm_type=self.cf.norm_type,
norm_eps=self.cf.norm_eps,
attention_dtype=get_dtype(self.cf.attention_dtype),
+with_rope=True,
)
)
# MLP block
@@ -303,9 +307,22 @@ def __init__(self, cf: Config, num_healpix_cells: int) -> None:
)
)

-def forward(self, tokens, use_reentrant):
+def forward(self, tokens, coords, use_reentrant):
for block in self.ae_aggregation_blocks:
-tokens = checkpoint(block, tokens, use_reentrant=use_reentrant)
+if isinstance(block, MultiSelfAttentionHead):
+tokens = checkpoint(
+lambda x, blk=block, c=coords: blk(x, rope_coords=c),
+tokens,
+use_reentrant=use_reentrant,
+)
+elif isinstance(block, MultiSelfAttentionHeadLocal):
+tokens = checkpoint(
+lambda x, blk=block, c=coords: blk(x, rope_coords=c),
+tokens,
+use_reentrant=use_reentrant,
+)
+else:
+tokens = checkpoint(block, tokens, use_reentrant=use_reentrant)
return tokens


@@ -341,6 +358,7 @@ def __init__(self, cf: Config, num_healpix_cells: int) -> None:
norm_type=self.cf.norm_type,
norm_eps=self.cf.norm_eps,
attention_dtype=get_dtype(self.cf.attention_dtype),
+with_rope=True,
)
)
else:
@@ -356,6 +374,7 @@ def __init__(self, cf: Config, num_healpix_cells: int) -> None:
norm_type=self.cf.norm_type,
norm_eps=self.cf.norm_eps,
attention_dtype=get_dtype(self.cf.attention_dtype),
+with_rope=True,
)
)
# MLP block
@@ -371,9 +390,22 @@ def __init__(self, cf: Config, num_healpix_cells: int) -> None:
)
)

-def forward(self, tokens, use_reentrant):
+def forward(self, tokens, coords, use_reentrant):
for block in self.ae_global_blocks:
-tokens = checkpoint(block, tokens, use_reentrant=use_reentrant)
+if isinstance(block, MultiSelfAttentionHead):
+tokens = checkpoint(
+lambda x, blk=block, c=coords: blk(x, rope_coords=c),
+tokens,
+use_reentrant=use_reentrant,
+)
+elif isinstance(block, MultiSelfAttentionHeadLocal):
+tokens = checkpoint(
+lambda x, blk=block, c=coords: blk(x, rope_coords=c),
+tokens,
+use_reentrant=use_reentrant,
+)
+else:
+tokens = checkpoint(block, tokens, use_reentrant=use_reentrant)
return tokens


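One detail of the forward() changes above that is easy to misread: the checkpointed callables bind the loop variables through default arguments (blk=block, c=coords) rather than closing over them. Because torch.utils.checkpoint re-invokes the callable during the backward pass to recompute activations, a plain closure over block would late-bind to whatever the loop variable holds at that time. The rationale is inferred, not stated in the PR; below is a minimal, framework-free demonstration of the late-binding pitfall the default arguments avoid.

```python
# Late binding vs. default-argument binding in a loop of callables.
adders = (lambda x: x + 1, lambda x: x * 10)

fns_late = [lambda x: f(x) for f in adders]        # closure: f is resolved at call time
fns_bound = [lambda x, f=f: f(x) for f in adders]  # default arg: f is frozen per iteration

print([fn(1) for fn in fns_late])   # [10, 10] -> every lambda sees the last f
print([fn(1) for fn in fns_bound])  # [2, 10]  -> each lambda keeps its own f
```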