Added Falcon model converter and Falcon 7B support #2040
base: master
Changes from all commits
`keras_hub/src/models/falcon/falcon_backbone.py`

```diff
@@ -44,8 +44,10 @@ class FalconBackbone(Backbone):
     }

     # Pretrained Falcon decoder.
-    # TODO: Update the preset.
-    model = keras_hub.models.FalconBackbone.from_preset("falcon_preset")
+    model = keras_hub.models.FalconBackbone.from_preset("falcon-7b-instruct")
     model(input_data)
+
+    model = keras_hub.models.FalconBackbone.from_preset("falcon-rw-1b")
+    model(input_data)

     # Randomly initialized Falcon decoder with a custom config.
```
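The docstring example above leaves `input_data` implicit. A minimal sketch of what it would look like, assuming the usual KerasHub backbone input format of `token_ids` plus `padding_mask`, and assuming the preset names in the updated docstring are actually registered:

```python
import numpy as np
import keras_hub

# Illustrative batch; the dict keys follow the common KerasHub backbone
# convention ({"token_ids", "padding_mask"}), not something stated in this diff.
input_data = {
    "token_ids": np.ones((1, 12), dtype="int32"),
    "padding_mask": np.ones((1, 12), dtype="int32"),
}

# Uses a preset name from the updated docstring; swap in whatever preset
# name actually ships with this PR.
model = keras_hub.models.FalconBackbone.from_preset("falcon-rw-1b")
outputs = model(input_data)  # (batch, sequence_length, hidden_dim)
```

The constructor changes that make the multi-query checkpoints loadable follow.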
```diff
@@ -70,13 +72,16 @@ def __init__(
         num_layers,
         num_attention_heads,
         hidden_dim,
+        num_kv_heads,
         intermediate_dim,
         layer_norm_epsilon=1e-5,
         attention_dropout_rate=0,
         feedforward_dropout_rate=0,
         dtype=None,
         **kwargs,
     ):
+        use_bias = True if hidden_dim == 2048 else False
```
Collaborator: If …
```diff
+
         # === Layers ===
         self.token_embedding = ReversibleEmbedding(
             input_dim=vocabulary_size,
@@ -92,7 +97,9 @@ def __init__(
                 intermediate_dim=intermediate_dim,
                 attention_dropout_rate=attention_dropout_rate,
                 feedforward_dropout_rate=feedforward_dropout_rate,
+                num_kv_heads=num_kv_heads,
                 dtype=dtype,
+                use_bias=use_bias,
                 name=f"transformer_layer_{i}",
             )
             self.transformer_layers.append(layer)
@@ -134,6 +141,7 @@ def __init__(
         self.intermediate_dim = intermediate_dim
         self.attention_dropout_rate = attention_dropout_rate
         self.feedforward_dropout_rate = feedforward_dropout_rate
+        self.num_kv_heads = num_kv_heads
         self.layer_norm_epsilon = layer_norm_epsilon

     def get_config(self):
@@ -146,6 +154,7 @@ def get_config(self):
                 "hidden_dim": self.hidden_dim,
                 "intermediate_dim": self.intermediate_dim,
                 "attention_dropout_rate": self.attention_dropout_rate,
+                "num_kv_heads": self.num_kv_heads,
                 "feedforward_dropout_rate": self.feedforward_dropout_rate,
                 "layer_norm_epsilon": self.layer_norm_epsilon,
             }
```
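Taken together, the backbone now accepts `num_kv_heads` (1 for Falcon's multi-query attention, equal to `num_attention_heads` for plain multi-head attention) and derives `use_bias` from `hidden_dim`. A sketch of a randomly initialized backbone using the new argument; the numbers are illustrative, not taken from the diff:

```python
import keras_hub

# Small random-weight config. Only num_kv_heads is new in this PR; with
# hidden_dim != 2048 the constructor sets use_bias=False internally.
model = keras_hub.models.FalconBackbone(
    vocabulary_size=100,
    num_layers=2,
    num_attention_heads=8,
    num_kv_heads=1,  # multi-query attention, as in falcon-7b
    hidden_dim=32,
    intermediate_dim=128,
)
```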
`keras_hub/src/utils/preset_utils.py`

```diff
@@ -60,6 +60,7 @@
 HF_TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
 SAFETENSOR_CONFIG_FILE = "model.safetensors.index.json"
 SAFETENSOR_FILE = "model.safetensors"
+PYTORCH_BIN_FILE = "pytorch_model.bin"

 # Global state for preset registry.
 BUILTIN_PRESETS = {}
```

Collaborator: Does this file name hold good for all the different models with .bin files?

Author: …
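The new constant presumably exists so that checkpoints published only as a PyTorch `pytorch_model.bin` (with no safetensors file) can still be loaded. A hypothetical helper, not part of the keras-hub API, showing the kind of fallback the constant enables:

```python
import os

# Hypothetical helper (not keras-hub API): pick which checkpoint file to
# read from a downloaded Hugging Face snapshot, preferring safetensors and
# falling back to the PyTorch .bin file named by the new constant.
SAFETENSOR_FILE = "model.safetensors"
PYTORCH_BIN_FILE = "pytorch_model.bin"


def pick_checkpoint_file(snapshot_dir):
    for name in (SAFETENSOR_FILE, PYTORCH_BIN_FILE):
        path = os.path.join(snapshot_dir, name)
        if os.path.exists(path):
            return path
    raise FileNotFoundError(
        f"Neither {SAFETENSOR_FILE} nor {PYTORCH_BIN_FILE} found in {snapshot_dir}"
    )
```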
New file — the Hugging Face → KerasHub Falcon checkpoint converter:

```python
import numpy as np

from keras_hub.src.models.falcon import FalconBackbone
from keras_hub.src.utils.preset_utils import load_json

backbone_cls = FalconBackbone


def convert_backbone_config(transformers_config):
    if transformers_config.get("multi_query", False):
        num_kv_heads = 1
    else:
        num_kv_heads = transformers_config.get(
            "num_kv_heads", transformers_config["num_attention_heads"]
        )
    return {
        "vocabulary_size": transformers_config["vocab_size"],
        "num_layers": transformers_config["num_hidden_layers"],
        "hidden_dim": transformers_config["hidden_size"],
        "num_attention_heads": transformers_config["num_attention_heads"],
        "head_dim": transformers_config["hidden_size"]
        // transformers_config["num_attention_heads"],
        "intermediate_dim": transformers_config.get(
            "ffn_hidden_size", 4 * transformers_config["hidden_size"]
        ),
        "num_kv_heads": num_kv_heads,
        "use_bias": transformers_config.get("use_bias", True),
    }
```
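As a quick sanity check of the mapping above, here is how it plays out on a falcon-7b-style config; the numbers mirror `tiiuae/falcon-7b`'s `config.json` as best I recall and should be treated as illustrative:

```python
# Falcon-7B-like config: multi_query=True collapses the KV heads to 1.
falcon_7b_like = {
    "vocab_size": 65024,
    "num_hidden_layers": 32,
    "num_attention_heads": 71,
    "hidden_size": 4544,
    "multi_query": True,
}
config = convert_backbone_config(falcon_7b_like)
assert config["num_kv_heads"] == 1              # multi-query attention
assert config["head_dim"] == 4544 // 71         # 64
assert config["intermediate_dim"] == 4 * 4544   # ffn_hidden_size default
assert config["use_bias"] is True               # "use_bias" absent => default True
```

`convert_weights` below then ports the checkpoint tensor by tensor, splitting the fused QKV projection: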
```python
def convert_weights(backbone, loader, transformers_config):
    hidden_dim = transformers_config["hidden_size"]
    num_attention_heads = transformers_config["num_attention_heads"]
    head_dim = hidden_dim // num_attention_heads
    if transformers_config.get("multi_query", False):
        num_kv_heads = 1
    else:
        num_kv_heads = transformers_config.get(
            "num_kv_heads", num_attention_heads
        )

    # Embeddings
    loader.port_weight(
        keras_variable=backbone.get_layer("token_embedding").embeddings,
        hf_weight_key="word_embeddings.weight",
    )

    for i in range(backbone.num_layers):
        decoder_layer = backbone.get_layer(f"transformer_layer_{i}")

        # Norm layer
        loader.port_weight(
            keras_variable=decoder_layer.input_layernorm.gamma,
            hf_weight_key=f"h.{i}.input_layernorm.weight",
        )
        if decoder_layer.input_layernorm.beta is not None:
            loader.port_weight(
                keras_variable=decoder_layer.input_layernorm.beta,
                hf_weight_key=f"h.{i}.input_layernorm.bias",
            )

        # Attention layers
        loader.port_weight(
            keras_variable=decoder_layer.attention_layer.output_dense.kernel,
            hf_weight_key=f"h.{i}.self_attention.dense.weight",
        )

        # Load the combined QKV weight
        hf_qkv_tensor = loader.get_tensor(
            f"h.{i}.self_attention.query_key_value.weight"
        )
        if hf_qkv_tensor.shape[0] != hidden_dim:
            hf_qkv_tensor = np.transpose(hf_qkv_tensor)

        query_output_dim = num_attention_heads * head_dim
        kv_output_dim = num_kv_heads * head_dim
        query_kernel = hf_qkv_tensor[:, :query_output_dim]
        key_kernel = hf_qkv_tensor[
            :, query_output_dim : query_output_dim + kv_output_dim
        ]
        value_kernel = hf_qkv_tensor[:, query_output_dim + kv_output_dim :]
        query_kernel = query_kernel.reshape(
            hidden_dim, num_attention_heads, head_dim
        )
        key_kernel = key_kernel.reshape(hidden_dim, num_kv_heads, head_dim)
        value_kernel = value_kernel.reshape(hidden_dim, num_kv_heads, head_dim)
        decoder_layer.attention_layer.query_dense.kernel.assign(query_kernel)
        decoder_layer.attention_layer.key_dense.kernel.assign(key_kernel)
        decoder_layer.attention_layer.value_dense.kernel.assign(value_kernel)

        # MLP dense layers
        loader.port_weight(
            keras_variable=decoder_layer.dense_h_to_4h.kernel,
            hf_weight_key=f"h.{i}.mlp.dense_h_to_4h.weight",
            hook_fn=lambda x, y: np.transpose(x),
        )
        loader.port_weight(
            keras_variable=decoder_layer.dense_4h_to_h.kernel,
            hf_weight_key=f"h.{i}.mlp.dense_4h_to_h.weight",
            hook_fn=lambda x, y: np.transpose(x),
        )

    if hasattr(backbone, "final_layernorm"):
        loader.port_weight(
            keras_variable=backbone.final_layernorm.gamma,
            hf_weight_key="ln_f.weight",
        )
        loader.port_weight(
            keras_variable=backbone.final_layernorm.beta,
            hf_weight_key="ln_f.bias",
        )
```
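The slicing above assumes the fused QKV weight is laid out as `[all query heads | key heads | value heads]` along the output axis. A toy numpy illustration of that split for multi-query attention (`num_kv_heads=1`), with made-up dimensions:

```python
import numpy as np

hidden_dim, num_heads, num_kv_heads = 8, 4, 1
head_dim = hidden_dim // num_heads                      # 2
fused_out = (num_heads + 2 * num_kv_heads) * head_dim   # 12

# HF stores Linear weights as (out_features, in_features); like the
# converter, transpose to (hidden_dim, fused_out) before slicing.
hf_qkv = np.arange(fused_out * hidden_dim, dtype="float32").reshape(
    fused_out, hidden_dim
)
qkv = np.transpose(hf_qkv)  # (8, 12)

q_dim = num_heads * head_dim        # 8
kv_dim = num_kv_heads * head_dim    # 2
q = qkv[:, :q_dim].reshape(hidden_dim, num_heads, head_dim)
k = qkv[:, q_dim : q_dim + kv_dim].reshape(hidden_dim, num_kv_heads, head_dim)
v = qkv[:, q_dim + kv_dim :].reshape(hidden_dim, num_kv_heads, head_dim)
print(q.shape, k.shape, v.shape)  # (8, 4, 2) (8, 1, 2) (8, 1, 2)
```

The last piece, `convert_tokenizer`, builds the tokenizer from the repo's `tokenizer.json`: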
```python
def convert_tokenizer(cls, preset, **kwargs):
    tokenizer_data = load_json(preset, "tokenizer.json")
    vocab = tokenizer_data["model"]["vocab"]
    merges = tokenizer_data["model"].get("merges", None)
    tokenizer_kwargs = {"vocabulary": vocab}
    if merges is not None:
        tokenizer_kwargs["merges"] = merges
    tokenizer_kwargs.update(kwargs)
    return cls(**tokenizer_kwargs)
```
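In normal use, `convert_tokenizer` is invoked by the preset loader when `from_preset()` sees an `hf://` handle, but a hypothetical direct call would look roughly like this (the `FalconTokenizer` import path is an assumption based on the other imports in this PR, not something the diff shows):

```python
from keras_hub.src.models.falcon.falcon_tokenizer import FalconTokenizer

# Hypothetical direct use: read tokenizer.json from the HF repo and build
# a BPE tokenizer from its vocab/merges.
tokenizer = convert_tokenizer(FalconTokenizer, "hf://tiiuae/falcon-rw-1b")
token_ids = tokenizer("What is your favorite condiment?")
```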
New file — converter smoke tests:

```python
import pytest

from keras_hub.src.models.falcon.falcon_backbone import FalconBackbone
from keras_hub.src.models.falcon.falcon_causal_lm import FalconCausalLM
from keras_hub.src.tests.test_case import TestCase


class TestTask(TestCase):
    @pytest.mark.large
    def test_convert_tiny_preset(self):
        model = FalconCausalLM.from_preset("hf://tiiuae/falcon-rw-1b")
        prompt = "What is your favorite condiment?"
        model.generate([prompt], max_length=15)

    @pytest.mark.large
    def test_class_detection(self):
        model = FalconCausalLM.from_preset("hf://tiiuae/falcon-rw-1b")
        self.assertIsInstance(model, FalconCausalLM)
        model = FalconBackbone.from_preset(
            "hf://tiiuae/falcon-rw-1b",
            load_weights=False,
        )
        self.assertIsInstance(model, FalconBackbone)
```
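For context, the path these tests exercise can be used directly from user code; a minimal sketch, assuming network access to the Hugging Face Hub:

```python
import keras_hub

# The hf:// handle routes through the converter added in this PR rather
# than a built-in KerasHub preset.
causal_lm = keras_hub.models.FalconCausalLM.from_preset("hf://tiiuae/falcon-rw-1b")
print(causal_lm.generate("What is your favorite condiment?", max_length=15))
```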