
Commit 1563b9e

Merge pull request #390 from Modalities/tp_swiglu_hidden_dim_fix
Fix of SwiGLU hidden not being multiple of world size
2 parents 491dac5 + 9bfa264 commit 1563b9e

3 files changed: +43 additions, -9 deletions


src/modalities/models/gpt2/gpt2_model.py

Lines changed: 21 additions & 2 deletions
@@ -319,7 +319,10 @@ class GPT2LLMConfig(BaseModel):
         ffn_norm_config (LayerNormWrapperConfig): Config for normalization of the feed-forward network.
         lm_head_norm_config (LayerNormWrapperConfig): Config for normalization of the language model head.
         use_weight_tying (bool): Whether to use weight tying.
-
+        seed: int = None: The random seed for reproducibility.
+        enforce_swiglu_hidden_dim_multiple_of (Optional[int]): If specified, enforces the hidden dimension
+            in the SwiGLU layer to be a multiple of this value. Note that this is only relevant if the
+            activation_type is SwiGLU. Defaults to None.
     """

     sample_key: str
@@ -344,6 +347,8 @@ class GPT2LLMConfig(BaseModel):
     ffn_norm_config: LayerNormWrapperConfig
     lm_head_norm_config: LayerNormWrapperConfig
     use_weight_tying: bool
+    seed: Optional[int] = None
+    enforce_swiglu_hidden_dim_multiple_of: Optional[int] = None

     @model_validator(mode="after")
     def check_divisibility(self) -> "GPT2LLMConfig":
@@ -695,6 +700,7 @@ def __init__(
         ffn_hidden: int,
         attention_norm: nn.Module,
         ffn_norm: nn.Module,
+        enforce_swiglu_hidden_dim_multiple_of: Optional[int] = None,
     ):
         """
         Initializes the GPT2Block.
@@ -711,6 +717,9 @@ def __init__(
             ffn_hidden (int): The size of the hidden layer in the feed-forward network.
             attention_norm (nn.Module): The normalization layer for attention.
             ffn_norm (nn.Module): The normalization layer for feed-forward network.
+            enforce_swiglu_hidden_dim_multiple_of (Optional[int]): If specified, enforces the
+                hidden dimension in the SwiGLU layer to be a multiple of this value. Note that this
+                is only relevant if the activation_type is SwiGLU. Defaults to None.
         """
         super().__init__()
         self.attention_norm = attention_norm
@@ -728,7 +737,12 @@ def __init__(
         if activation_type == ActivationType.GELU:
             self.mlp = TransformerMLP(n_embd=n_embd, ffn_hidden=ffn_hidden, bias=bias, dropout=dropout)
         elif activation_type == ActivationType.SWIGLU:
-            self.mlp = SwiGLU(n_embd=n_embd, ffn_hidden=ffn_hidden, bias=bias)
+            self.mlp = SwiGLU(
+                n_embd=n_embd,
+                ffn_hidden=ffn_hidden,
+                bias=bias,
+                enforce_swiglu_hidden_dim_multiple_of=enforce_swiglu_hidden_dim_multiple_of,
+            )
         else:
             raise NotImplementedError("unimplemented activation")

@@ -781,6 +795,7 @@ def __init__(
         lm_head_norm_config: LayerNormWrapperConfig,
         use_weight_tying: bool,
         seed: int = None,
+        enforce_swiglu_hidden_dim_multiple_of: Optional[int] = None,
     ):
         """
         Initializes the GPT2LLM object.
@@ -806,6 +821,9 @@ def __init__(
             lm_head_norm_config (LayerNormWrapperConfig): Config for the language model head normalization module.
             seed (int, optional): The random seed. Defaults to None.
             use_weight_tying (bool): Whether to use weight tying.
+            enforce_swiglu_hidden_dim_multiple_of (Optional[int]): If specified, enforces
+                the hidden dimension in the SwiGLU layer to be a multiple of this value.
+                Note that this is only relevant if the activation_type is SwiGLU. Defaults to None.
         """
         weight_decay_groups = {
             "linear": [".attn", ".mlp", ".lm_head.weight"],
@@ -861,6 +879,7 @@ def __init__(
                             # a meta device!
                             attention_norm=attention_norm_config.norm_type.value(**dict(attention_norm_config.config)),
                             ffn_norm=ffn_norm_config.norm_type.value(**dict(ffn_norm_config.config)),
+                            enforce_swiglu_hidden_dim_multiple_of=enforce_swiglu_hidden_dim_multiple_of,
                         )
                         for _ in range(n_layer)
                     ]
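
Taken together, the changes in this file thread enforce_swiglu_hidden_dim_multiple_of from GPT2LLMConfig through GPT2Block down to the SwiGLU module. A minimal sketch of the call the block now makes; the dimensions and the value 768 are illustrative assumptions, not taken from the diff:

from modalities.models.model import SwiGLU

# Illustrative GPT-2-small-like sizes (assumptions, not from the diff).
n_embd, ffn_hidden = 768, 3072

# Default behaviour (None): the hidden dim is rounded up to the next multiple of 256,
# exactly as before this change.
mlp_default = SwiGLU(n_embd=n_embd, ffn_hidden=ffn_hidden, bias=False)

# Under FSDP + TP, pass a value that is a multiple of the world size so the hidden
# dim shards evenly, e.g. 768 = lcm(256, 6) for a hypothetical 6-rank setup.
mlp_sharded = SwiGLU(
    n_embd=n_embd,
    ffn_hidden=ffn_hidden,
    bias=False,
    enforce_swiglu_hidden_dim_multiple_of=768,
)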

src/modalities/models/model.py

Lines changed: 19 additions & 6 deletions
@@ -75,7 +75,9 @@ def get_parameters(self) -> dict[str, torch.Tensor]:
 class SwiGLU(nn.Module):
     """SwiGLU class to define the SwiGLU activation function."""

-    def __init__(self, n_embd: int, ffn_hidden: int, bias: bool):
+    def __init__(
+        self, n_embd: int, ffn_hidden: int, bias: bool, enforce_swiglu_hidden_dim_multiple_of: Optional[int] = None
+    ):
         """
         Initializes the SwiGLU object.

@@ -84,11 +86,17 @@ def __init__(self, n_embd: int, ffn_hidden: int, bias: bool):
             ffn_hidden (int): The number of hidden dimensions in the feed-forward network.
                 Best practice: 4 * n_embd (https://arxiv.org/pdf/1706.03762)
             bias (bool): Whether to include bias terms in the linear layers.
+            enforce_swiglu_hidden_dim_multiple_of (int): The multiple of which the hidden dimension should be enforced.
+                This is required for FSDP + TP as the combincation does not support uneven sharding (yet).
+                Defaults to 256 if not provided.
         """

         super().__init__()
-
-        hidden_dim = SwiGLU._get_hidden_dim(ffn_hidden=ffn_hidden)
+        if enforce_swiglu_hidden_dim_multiple_of is None:
+            enforce_swiglu_hidden_dim_multiple_of = 256
+        hidden_dim = SwiGLU._get_hidden_dim(
+            ffn_hidden=ffn_hidden, enforce_swiglu_hidden_dim_multiple_of=enforce_swiglu_hidden_dim_multiple_of
+        )

         self.W = nn.Linear(
             in_features=n_embd,
@@ -108,16 +116,21 @@ def __init__(self, n_embd: int, ffn_hidden: int, bias: bool):
         )

     @staticmethod
-    def _get_hidden_dim(ffn_hidden: int) -> int:
+    def _get_hidden_dim(ffn_hidden: int, enforce_swiglu_hidden_dim_multiple_of: int) -> int:
         # Calculate the hidden dimension for the SwiGLU module based on the provided embedding dimension.

         # Best practice: 4 * n_embd (https://arxiv.org/pdf/1706.03762)
         # To ensure that the number of parameters in the SwiGLU module with its additional
         # linear layer are equivalent to the TransformerMLP, we need to adapt the SwiGLU hidden dimension as follows:
         # 2 * (n_embd * hidden_dim) == 3 * (n_embd * 2/3 * hidden_dim)
         # Besides, we ensure that hidden_dim is the smallest multiple of
-        # 256 that is greater than or equal the provided hidden_dim
-        return 256 * ((int(2 * ffn_hidden / 3) + 256 - 1) // 256)
+        # `enforce_swiglu_hidden_dim_multiple_of` that is greater than or equal the provided hidden_dim.
+        # In case of TP we must set this to be at least of world size as FSDP + TP does not uneven sharding.
+        # FSDP itself without TP support it already however.
+        return enforce_swiglu_hidden_dim_multiple_of * (
+            (int(2 * ffn_hidden / 3) + enforce_swiglu_hidden_dim_multiple_of - 1)
+            // enforce_swiglu_hidden_dim_multiple_of
+        )

     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """

src/modalities/models/model_factory.py

Lines changed: 3 additions & 1 deletion
@@ -567,7 +567,8 @@ def get_gpt2_model(
         lm_head_norm_config: LayerNormWrapperConfig,
         use_weight_tying: bool,
         use_meta_device: Optional[bool] = False,
-        seed: int = None,
+        seed: Optional[int] = None,
+        enforce_swiglu_hidden_dim_multiple_of: Optional[int] = None,
     ) -> GPT2LLM:
         config = dict(
             sample_key=sample_key,
@@ -590,6 +591,7 @@ def get_gpt2_model(
             lm_head_norm_config=lm_head_norm_config,
             seed=seed,
             use_weight_tying=use_weight_tying,
+            enforce_swiglu_hidden_dim_multiple_of=enforce_swiglu_hidden_dim_multiple_of,
         )
         if use_meta_device and use_weight_tying:
             raise ValueError(
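
The factory change is plumbing: seed gets the Optional[int] annotation it already behaved as, and the new keyword is accepted and forwarded into the config dict. How a caller picks the value is left open; one possible choice, sketched here under the assumption that torch.distributed is initialized, is the least common multiple of the old 256 alignment and the world size:

import math

import torch.distributed as dist

# Keep the original 256 alignment and additionally make the SwiGLU hidden dim
# divisible by the number of ranks, so FSDP + TP can shard it without remainder.
world_size = dist.get_world_size() if dist.is_initialized() else 1
enforce_swiglu_hidden_dim_multiple_of = math.lcm(256, world_size)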
