Skip to content

Commit d913b39

Browse files
fix: HWIO to OIHW (#39200)
* fix: HWIO to OIHW
* Bug in attention type
* Conversion script docstring
* style

---------

Co-authored-by: Arthur <[email protected]>
Co-authored-by: Arthur <[email protected]>
1 parent a26f0fa commit d913b39

File tree

3 files changed

+17
-13
lines changed

3 files changed

+17
-13
lines changed

src/transformers/models/gemma3n/configuration_gemma3n.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -271,7 +271,7 @@ def __init__(
271271

272272
if layer_types is None:
273273
self.layer_types = [
274-
"full_attention" if i % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers)
274+
"full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers)
275275
]
276276
else:
277277
self.layer_types = layer_types

src/transformers/models/gemma3n/convert_gemma3n_weights.py

Lines changed: 15 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818
1919
python src/transformers/models/gemma3n/convert_gemma3n_weights.py \
2020
--variant='gemma3n_e4b' \
21-
--tokenizer_path="$HOME/nano3/checkpoints/tokenizer/gemma-3n-tokenizer.model" \
22-
--checkpoint_path="$HOME/nano3/checkpoints/g251_orbax/" \
23-
--output_path="$HOME/nano3/checkpoints/g251_vision_encoder/"
21+
--tokenizer_path="$HOME/tokenizers/gemma-3n-tokenizer.model" \
22+
--checkpoint_path="$HOME/checkpoints/gemma-3n-orbax/" \
23+
--output_path="$HOME/checkpoints/gemma-3n-safetensors/"
2424
"""
2525

2626
import json
@@ -552,8 +552,9 @@ def generate_base_path(path: str, block_type: str) -> tuple[str, tuple[int, int]
552552
converted_weight = weights
553553
elif _MOBILE_NET_CONV in path:
554554
if "Conv_0" in path:
555-
converted_path = "conv_stem.conv.weight"
556-
converted_weight = weights.transpose(3, 2, 1, 0)
555+
converted_path = ("conv_stem.conv.weight", "conv_stem.conv.bias")
556+
converted_weight = weights.transpose(3, 2, 0, 1)
557+
converted_weight = (converted_weight, np.zeros(converted_weight.shape[0]))
557558
elif "Normalize_0" in path:
558559
converted_path = "conv_stem.bn.weight"
559560
converted_weight = weights
@@ -567,7 +568,7 @@ def generate_base_path(path: str, block_type: str) -> tuple[str, tuple[int, int]
567568
converted_weight = weights
568569
elif "expand_conv" in path:
569570
converted_path += ".conv_exp.weight"
570-
converted_weight = weights.transpose(3, 2, 1, 0)
571+
converted_weight = weights.transpose(3, 2, 0, 1)
571572
else:
572573
converted_path += ".conv_pwl.weight"
573574
converted_weight = weights.transpose()[:, :, None, None]
@@ -588,7 +589,7 @@ def generate_base_path(path: str, block_type: str) -> tuple[str, tuple[int, int]
588589
converted_weight = weights
589590
elif "key_dwconv" in path:
590591
converted_path += ".attn.key.down_conv.weight"
591-
converted_weight = weights.transpose()
592+
converted_weight = weights.transpose(3, 2, 0, 1)
592593
elif "key_proj" in path:
593594
converted_path += ".attn.key.proj.weight"
594595
converted_weight = weights.transpose()[:, :, None, None]
@@ -600,7 +601,7 @@ def generate_base_path(path: str, block_type: str) -> tuple[str, tuple[int, int]
600601
converted_weight = weights.transpose()[:, :, None, None]
601602
elif "value_dwconv" in path:
602603
converted_path += ".attn.value.down_conv.weight"
603-
converted_weight = weights.transpose()
604+
converted_weight = weights.transpose(3, 2, 0, 1)
604605
elif "value_proj" in path:
605606
converted_path += ".attn.value.proj.weight"
606607
converted_weight = weights.transpose()[:, :, None, None]
@@ -630,15 +631,18 @@ def generate_base_path(path: str, block_type: str) -> tuple[str, tuple[int, int]
630631
converted_weight = weights.transpose()[:, :, None, None]
631632
elif "middle_dwconv" in path:
632633
converted_path += ".dw_mid.conv.weight"
633-
converted_weight = weights.transpose(3, 2, 1, 0)
634+
converted_weight = weights.transpose(3, 2, 0, 1)
634635
elif "project" in path:
635636
converted_path += ".pw_proj.conv.weight"
636637
converted_weight = weights.transpose()[:, :, None, None]
637638
elif "start_dwconv" in path:
638639
converted_path += ".dw_start.conv.weight"
639-
converted_weight = weights.transpose(3, 2, 1, 0)
640+
converted_weight = weights.transpose(3, 2, 0, 1)
640641

641-
return [(converted_path, converted_weight)]
642+
if isinstance(converted_path, (tuple, list)):
643+
return zip(converted_path, converted_weight)
644+
else:
645+
return [(converted_path, converted_weight)]
642646

643647

644648
def convert(checkpoint_path: str, config: Gemma3nConfig) -> dict[str, torch.Tensor]:

src/transformers/models/gemma3n/modular_gemma3n.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -283,7 +283,7 @@ def __init__(
283283

284284
if layer_types is None:
285285
self.layer_types = [
286-
"full_attention" if i % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers)
286+
"full_attention" if (i + 1) % 5 == 0 else "sliding_attention" for i in range(self.num_hidden_layers)
287287
]
288288
else:
289289
self.layer_types = layer_types

0 commit comments

Comments (0)