Safetensors conversion #2290

Merged
merged 24 commits into from
Jul 16, 2025
145 changes: 145 additions & 0 deletions keras_hub/src/utils/transformers/export_gemma_to_safetensor.py
@@ -0,0 +1,145 @@
import json
Collaborator

Let's make this a model-agnostic export utility. Rename the file to safetensor_exporter.py and add a dict to maintain the mapping:

MODEL_EXPORTERS = {
    "GemmaBackbone": gemma_exporter.get_gemma_weights_map,
    "LlamaBackbone": llama_exporter.get_llama_weights_map,  # Future
}

and a user-facing API function for the export:

def export_to_safetensors(keras_model):
    ...
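
A minimal sketch of how that dispatch could look; the error handling and the get_gemma_weights_map helper are illustrative assumptions, not existing KerasHub APIs:

def export_to_safetensors(keras_model, path):
    # Dispatch on the backbone class name; only Gemma is wired up in this PR.
    backbone = keras_model.backbone
    name = backbone.__class__.__name__
    if name not in MODEL_EXPORTERS:
        raise ValueError(f"Safetensors export is not supported for {name}.")
    weights_dict = MODEL_EXPORTERS[name](backbone)
    # Backend-specific saving (torch/tensorflow/jax) would follow here,
    # as in export_to_hf below.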

Collaborator

Implement an exporter mapping for each model; for this PR's scope, just the Gemma model, which can serve as a prototype for other models.

Collaborator

Yeah, let's land this PR first and do this in a separate PR: #2290 (comment)

import os
import shutil
import warnings

import jax.numpy as jnp
import keras
import keras.ops as ops
from safetensors.flax import save_file as flax_save_file
from safetensors.tensorflow import save_file as tf_save_file
from safetensors.torch import save_file as torch_save_file


def convert_to_hf_config(keras_config):
    hf_config = {
        "vocab_size": keras_config.vocabulary_size,
        "num_hidden_layers": keras_config.num_layers,
        "num_attention_heads": keras_config.num_query_heads,
        "num_key_value_heads": keras_config.num_key_value_heads,
        "hidden_size": keras_config.hidden_dim,
        "intermediate_size": keras_config.intermediate_dim // 2,
        "head_dim": keras_config.head_dim,
        "max_position_embeddings": 8192,
    }
    return hf_config
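
# For illustration (assumed values from the gemma_2b_en preset):
# vocabulary_size=256000, num_layers=18, num_query_heads=8,
# num_key_value_heads=1, hidden_dim=2048, intermediate_dim=32768 and
# head_dim=256 map to intermediate_size=16384 on the Hugging Face side;
# the `// 2` reflects that the Keras Gemma backbone's intermediate_dim
# covers the combined gate/up feedforward width.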


def export_to_hf(keras_model, path):
Collaborator @abheesht17, Jun 19, 2025

Also, do you think we should refactor some of the common code across models into a separate file? We can then expose that as the API.

This is how the directory keras_hub/src/utils/transformers/convert_to_safetensor/ would look:

  • export.py: this will hold the common code, which we will expose as the API. It will also check whether we support safetensors conversion for the passed model yet.
  • gemma.py: this will just have a way to create the weight dictionary for Gemma. Inside export.py, we will call the weight-conversion function specific to the given model.

Pinging @mattdangerw to confirm whether we should do this now or at a later point.
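
A rough sketch of what the gemma.py side could look like; the get_gemma_weights_map name and signature are illustrative and mirror the mapping done in export_to_hf below:

def get_gemma_weights_map(backbone):
    # Build the Hugging Face-style weight dictionary for a Gemma backbone.
    weights_dict = {}
    token_embedding = backbone.get_layer("token_embedding")
    weights_dict["model.embed_tokens.weight"] = token_embedding.weights[0]
    # ... per-layer attention, MLP and norm mappings, as in export_to_hf ...
    weights_dict["lm_head.weight"] = token_embedding.weights[0]
    return weights_dict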

Member

I think we could land this and do the API bit at a later point, though I agree it's an important concern. I'm not sure whether we want a method like model.save_to_preset() or a function like some_export(model). Any thoughts?

Contributor Author

I think structuring the export logic with a utility function (export_to_hf) and model-specific mappings (gemma.py) will enhance scalability and maintainability. New models can be added by creating a new file, while existing tests only need an import update.

Collaborator

+1 to Abheesht's comment that we need an API instead of a script. For Gemma, we already have a script:
https://github.com/keras-team/keras-hub/blob/master/tools/gemma/export_gemma_to_hf.py

"""This function converts a Keras Gemma model to Hugging Face format by:
- Extracting and mapping weights from the Keras backbone to safetensors.
- Saving the configuration as 'config.json'.
- Saving weights in 'model.safetensors'.
- Saving tokenizer assets.
Args:
keras_model: The Keras Gemma model (e.g., GemmaCausalLM) to convert.
path: str. Path of the directory to which the safetensors file,
config and tokenizer will be saved.
"""
backend = keras.config.backend()
backbone = keras_model.backbone
hf_config = convert_to_hf_config(backbone)

weights_dict = {}

# Map token embedding
token_embedding_layer = backbone.get_layer("token_embedding")
weights_dict["model.embed_tokens.weight"] = token_embedding_layer.weights[0]

for i in range(backbone.num_layers):
decoder_layer = backbone.get_layer(f"decoder_block_{i}")

# Pre-attention normalization
weights_dict[f"model.layers.{i}.input_layernorm.weight"] = (
decoder_layer.pre_attention_norm.weights[0]
)

# Attention query projection
query_kernel = decoder_layer.attention.query_dense.weights[0]
query_kernel = ops.transpose(query_kernel, axes=(1, 0, 2))
query_kernel = ops.reshape(query_kernel, (-1, backbone.hidden_dim))
query_kernel = ops.transpose(query_kernel)
weights_dict[f"model.layers.{i}.self_attn.q_proj.weight"] = query_kernel

# Attention key projection
key_kernel = decoder_layer.attention.key_dense.weights[0][0]
weights_dict[f"model.layers.{i}.self_attn.k_proj.weight"] = (
ops.transpose(key_kernel)
)

# Attention value projection
value_kernel = decoder_layer.attention.value_dense.weights[0][0]
weights_dict[f"model.layers.{i}.self_attn.v_proj.weight"] = (
ops.transpose(value_kernel)
)

# Attention output projection
out_kernel = decoder_layer.attention.output_dense.weights[0]
out_kernel = ops.transpose(out_kernel, axes=(2, 0, 1))
out_kernel = ops.reshape(out_kernel, (backbone.hidden_dim, -1))
weights_dict[f"model.layers.{i}.self_attn.o_proj.weight"] = out_kernel

# Post-attention normalization
weights_dict[f"model.layers.{i}.post_attention_layernorm.weight"] = (
decoder_layer.pre_ffw_norm.weights[0]
)

# MLP gate projection
gate_kernel = decoder_layer.gating_ffw.weights[0]
weights_dict[f"model.layers.{i}.mlp.gate_proj.weight"] = ops.transpose(
gate_kernel
)

# MLP up projection
up_kernel = decoder_layer.gating_ffw_2.weights[0]
weights_dict[f"model.layers.{i}.mlp.up_proj.weight"] = ops.transpose(
up_kernel
)

# MLP down projection
down_kernel = decoder_layer.ffw_linear.weights[0]
weights_dict[f"model.layers.{i}.mlp.down_proj.weight"] = ops.transpose(
down_kernel
)

# Map final normalization
weights_dict["model.norm.weight"] = backbone.get_layer(
"final_normalization"
).weights[0]

# Tie lm_head.weight to embedding weights
weights_dict["lm_head.weight"] = token_embedding_layer.weights[0]

# Save config
os.makedirs(path, exist_ok=True)
config_path = os.path.join(path, "config.json")
with open(config_path, "w") as f:
json.dump(hf_config, f)

# Save weights based on backend
weights_path = os.path.join(path, "model.safetensors")
if backend == "torch":
weights_dict_contiguous = {
k: v.contiguous() for k, v in weights_dict.items()
}
torch_save_file(weights_dict_contiguous, weights_path)
elif backend == "tensorflow":
tf_save_file(weights_dict, weights_path)
elif backend == "jax":
weights_dict_contiguous = {
k: jnp.ascontiguousarray(v) for k, v in weights_dict.items()
}
flax_save_file(weights_dict_contiguous, weights_path)

# Save tokenizer assets
keras_model.preprocessor.tokenizer.save_assets(path)

# Rename vocabulary file
vocab_spm_path = os.path.join(path, "vocabulary.spm")
tokenizer_model_path = os.path.join(path, "tokenizer.model")
if os.path.exists(vocab_spm_path):
shutil.move(vocab_spm_path, tokenizer_model_path)
else:
warnings.warn(
f"{vocab_spm_path} not found. Tokenizer may not load correctly."
)
@@ -0,0 +1,44 @@
import os

import pytest
import torch
from transformers import GemmaForCausalLM
from transformers import GemmaTokenizer

from keras_hub.src.models.gemma.gemma_causal_lm import GemmaCausalLM
from keras_hub.src.tests.test_case import TestCase
from keras_hub.src.utils.transformers.export_gemma_to_safetensor import (
    export_to_hf,
)


class TestGemmaExport(TestCase):
    @pytest.mark.large
    def test_export_to_hf(self):
        # Load Keras model
        keras_model = GemmaCausalLM.from_preset("gemma_2b_en")
        input_text = "All hail RCB"
        max_length = 25

        # Export to Hugging Face format in a temporary directory
        export_path = os.path.join(self.get_temp_dir(), "export_to_hf")
        export_to_hf(keras_model, export_path)

        # Load Hugging Face model and tokenizer
        hf_model = GemmaForCausalLM.from_pretrained(export_path)
        hf_tokenizer = GemmaTokenizer.from_pretrained(export_path)

        # Generate text with Keras model
        keras_output = keras_model.generate(input_text, max_length=max_length)

        # Generate text with Hugging Face model
        hf_inputs = hf_tokenizer(input_text, return_tensors="pt")
        with torch.no_grad():
            hf_outputs = hf_model.generate(
                **hf_inputs, max_length=max_length, do_sample=False
            )
        hf_output_text = hf_tokenizer.decode(
            hf_outputs[0], skip_special_tokens=True
        )

        self.assertEqual(keras_output, hf_output_text)
1 change: 1 addition & 0 deletions requirements-common.txt
@@ -18,3 +18,4 @@ sentencepiece
tensorflow-datasets
safetensors
pillow
transformers