
Commit 86e7885 (1 parent: bfdb7ed)

Checkpoint conversion utility: gpt-oss orbax scan to hf, inhomogeneous scan block, many-to-one transform

File tree: 6 files changed, +530 -90 lines

src/MaxText/utils/ckpt_conversion/to_huggingface.py

Lines changed: 68 additions & 13 deletions
@@ -56,6 +56,7 @@
 from typing import Sequence, Any
 import time
 from tqdm import tqdm
+import numpy as np

 from transformers import AutoTokenizer, AutoProcessor

@@ -71,20 +72,22 @@
 )
 from MaxText.utils.ckpt_conversion.utils.hf_shape import HF_SHAPE
 from MaxText.utils.ckpt_conversion.utils.hf_model_configs import HF_MODEL_CONFIGS
-from MaxText.utils.ckpt_conversion.utils.utils import (process_leaf_param, save_model_files, HF_IDS)
+from MaxText.utils.ckpt_conversion.utils.utils import process_maxtext_param, save_model_files, HF_IDS


 os.environ["JAX_PLATFORMS"] = "cpu"
 os.environ["XLA_FLAGS"] = "--xla_force_host_platform_device_count=16"


-def _get_model_mappings(model_name: str, scan_layers: bool, config_dict: dict):
+def _get_model_mappings(model_name: str, scan_layers: bool, hf_config_dict: dict, inhomogeneous_layer_cycle_interval: int):
   """Retrieves parameter, shape, and hook function mappings for the model.

   Args:
     model_name: The name of the model (e.g., "gemma2-2b").
     scan_layers: Boolean indicating if the model was trained with scanned layers.
-    config_dict: The Hugging Face model configuration dictionary.
+    hf_config_dict: The Hugging Face model configuration dictionary.
+    inhomogeneous_layer_cycle_interval: For complex architectures (llama4, gpt-oss), there are repeated sets of
+      n inhomogeneous layers. The scan structure has n blocks.

   Returns:
     A dictionary containing the parameter mapping, shape mapping, and hook
@@ -97,12 +100,42 @@ def _get_model_mappings(model_name: str, scan_layers: bool, config_dict: dict):
     raise ValueError(f"Mappings not found for model: {model_name}. Available PARAM_MAPPING keys: {PARAM_MAPPING.keys()}")

   return {
-      "param_mapping": PARAM_MAPPING[model_name](config_dict, scan_layers),
-      "shape_mapping": HF_SHAPE[model_name](config_dict),
-      "hook_fn_mapping": HOOK_FNS[model_name](config_dict, scan_layers, saving_to_hf=True),
+      "param_mapping": PARAM_MAPPING[model_name](hf_config_dict, scan_layers, inhomogeneous_layer_cycle_interval),
+      "shape_mapping": HF_SHAPE[model_name](hf_config_dict),
+      "hook_fn_mapping": HOOK_FNS[model_name](
+          hf_config_dict, scan_layers, inhomogeneous_layer_cycle_interval, saving_to_hf=True
+      ),
   }


+def _check_param_map_keys(param_map_keys, maxtext_state_keys):
+  """Verifies that the keys in the parameter map match the keys in the MaxText state.
+
+  This function handles cases where the parameter map contains tuples as keys,
+  which represent N-to-1 mappings (multiple MaxText parameters combined into one
+  Hugging Face parameter). It flattens these tuples to ensure every individual
+  MaxText parameter is accounted for.
+
+  Args:
+    param_map_keys: The keys from the parameter mapping dictionary.
+    maxtext_state_keys: A set of all parameter keys from the loaded MaxText
+      checkpoint.
+  """
+  flattened_map_keys = set()
+  for key in param_map_keys:
+    if isinstance(key, tuple):
+      flattened_map_keys.update(key)
+    else:
+      flattened_map_keys.add(key)
+
+  if flattened_map_keys != maxtext_state_keys:
+    raise ValueError(
+        f"param_map and maxtext_state_dict have different keys."
+        + f"\nparam map\n{param_map_keys}"
+        + f"\nmaxtext:\n{maxtext_state_keys}"
+    )
+
+
 def main(argv: Sequence[str]) -> None:
   """Main function to convert a MaxText checkpoint to HuggingFace format.

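Note on the new key check: a tuple key in param_map groups several MaxText parameters that are merged into a single Hugging Face tensor (the many-to-one transform from the commit title), while a plain string key is an ordinary one-to-one mapping. A minimal sketch of how _check_param_map_keys behaves, using made-up parameter names rather than the real gpt-oss mapping, with the function defined above in scope:

# Illustrative only: hypothetical parameter names, not the actual gpt-oss mapping.
param_map = {
    # Two MaxText tensors fused into one HF tensor -> tuple key (many-to-one).
    ("params-decoder-layers-moe-wi_0", "params-decoder-layers-moe-wi_1"): "model.layers.mlp.experts.gate_up_proj",
    # Ordinary one-to-one mapping -> plain string key.
    "params-decoder-decoder_norm-scale": "model.norm.weight",
}
checkpoint_keys = {
    "params-decoder-layers-moe-wi_0",
    "params-decoder-layers-moe-wi_1",
    "params-decoder-decoder_norm-scale",
}

_check_param_map_keys(param_map.keys(), checkpoint_keys)  # passes: every tuple member is present

try:
  _check_param_map_keys(param_map.keys(), checkpoint_keys - {"params-decoder-layers-moe-wi_1"})
except ValueError as e:
  print("mismatch detected:", e)  # one tuple member is missing from the checkpoint

The check passes only when the flattened tuple members cover exactly the keys present in the loaded checkpoint; any missing or extra key raises a ValueError listing both sides.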
@@ -156,29 +189,51 @@ def main(argv: Sequence[str]) -> None:
   processor = AutoProcessor.from_pretrained(hf_tokenizer_id, token=hf_token) if config.use_multimodal else None

   # 3. Get parameter mappings
-  mappings = _get_model_mappings(model_key, config.scan_layers, hf_config_obj.to_dict())
+  mappings = _get_model_mappings(
+      model_key, config.scan_layers, hf_config_obj.to_dict(), config.inhomogeneous_layer_cycle_interval
+  )
   param_map = mappings["param_mapping"]
   shape_map = mappings["shape_mapping"]  # HF target shapes
   hook_fn_map = mappings["hook_fn_mapping"]

   # 4. Transform Weights
-  transformed_hf_weights: dict[str, Any] = {}
-
   # MaxText `engine.load_params()` returns `state.params` (a FrozenDict).
   # The actual weights are typically under `state.params['params']`.
   actual_weights_dict = loaded_params_from_engine.get("params")
   if actual_weights_dict is None:
     raise ValueError("Loaded parameters from engine do not contain a 'params' key. Structure might be unexpected.")
-
   leaves_with_paths = jax.tree_util.tree_leaves_with_path(actual_weights_dict)

-  # traverse leavse to build: mt_param_key:mt_weights
+  # Construct maxtext_state_dict: {parameter name: parameter weight}
+  maxtext_state_dict = {}
+  for path_tuple, leaf_value in leaves_with_paths:
+    # Construct maxtext_param_key from path_tuple
+    maxtext_param_key = "params-" + "-".join(k.key for k in path_tuple)
+    # Check leaf value is an array
+    if not isinstance(leaf_value, (jax.Array, np.ndarray)):
+      raise ValueError(f"Leaf value for {maxtext_param_key} is not an array. Type: {type(leaf_value)}.")
+    maxtext_state_dict[maxtext_param_key] = leaf_value
+
+  # The param_map may contain tuples as keys, which represent N-to-1 mappings from maxtext to huggingface
+  # Check param_map after flattening has the same keys as maxtext_state_dict
+  _check_param_map_keys(param_map.keys(), maxtext_state_dict.keys())
+
+  # Iterate over param_map to build: mt_param_key:mt_weights
   max_logging.log("\nProccessing weight...")
   start = time.time()
   processed_params_list = []
-  for path_tuple_iter, leaf_value_iter in tqdm(leaves_with_paths, total=len(leaves_with_paths)):
-    processed_params = process_leaf_param(path_tuple_iter, leaf_value_iter, param_map, shape_map, hook_fn_map, config)
+
+  for key in tqdm(param_map, total=len(param_map)):
+    if isinstance(key, tuple):
+      # key is tuple of param names, weight is list of param weights
+      weight = [maxtext_state_dict[subkey] for subkey in key]
+    else:
+      # key is single param name, weight is single param weight
+      weight = maxtext_state_dict[key]
+
+    processed_params = process_maxtext_param(key, weight, param_map, hook_fn_map, shape_map, config)
     processed_params_list.extend(processed_params)
+
   transformed_hf_weights = dict(processed_params_list)
   max_logging.log(f"Elapse: {(time.time() - start) / 60:.2f} min")

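The loop above only gathers the raw arrays; the actual combine/reshape work happens inside process_maxtext_param (in ckpt_conversion/utils/utils.py, not shown in this diff), driven by hook_fn_map and shape_map. A simplified stand-in, with made-up names and a plain stack as the combine op, only to illustrate the shape of the many-to-one step:

import numpy as np

# Simplified stand-in for the many-to-one step -- NOT the real process_maxtext_param,
# whose combine/reshape logic comes from hook_fn_map and shape_map. Names are made up.
def combine_many_to_one(key, weight, hf_name, hook_fn=None):
  """Mirrors the loop above: tuple key -> list of arrays, string key -> single array."""
  if isinstance(key, tuple):
    combined = np.stack([np.asarray(w) for w in weight], axis=0)  # e.g. stack fused pieces
  else:
    combined = np.asarray(weight)
  if hook_fn is not None:
    combined = hook_fn(combined)
  return [(hf_name, combined)]  # a list, since one entry can yield several HF tensors

w0, w1 = np.ones((2, 2)), np.zeros((2, 2))
out = combine_many_to_one(
    ("params-decoder-layers-moe-wi_0", "params-decoder-layers-moe-wi_1"),
    [w0, w1],
    "model.layers.mlp.experts.gate_up_proj",
)
print(out[0][0], out[0][1].shape)  # -> model.layers.mlp.experts.gate_up_proj (2, 2, 2)

In the real utility the combine op comes from hook_fn_map and the target name(s) from param_map/shape_map; the sketch hard-codes both for brevity.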

src/MaxText/utils/ckpt_conversion/to_maxtext.py

Lines changed: 6 additions & 2 deletions
@@ -273,11 +273,15 @@ def main(argv: Sequence[str]) -> None:
   # f"model.layers.{global_layer_idx}.input_layernorm.weight",

   model_key = config.model_name
-  param_map_mt_to_hf = PARAM_MAPPING[model_key](hf_config_obj.to_dict(), config.scan_layers)
+  param_map_mt_to_hf = PARAM_MAPPING[model_key](
+      hf_config_obj.to_dict(), config.scan_layers, config.inhomogeneous_layer_cycle_interval
+  )

   # Example of Hook FN mapping, to perform reshape:
   # f"params-decoder-layers_{maxtext_layer_idx}-self_attention_global-key-kernel": reshape_kernel,
-  hook_fn_map_mt = HOOK_FNS[model_key](hf_config_obj.to_dict(), config.scan_layers, saving_to_hf=False)
+  hook_fn_map_mt = HOOK_FNS[model_key](
+      hf_config_obj.to_dict(), config.scan_layers, config.inhomogeneous_layer_cycle_interval, saving_to_hf=False
+  )
   max_logging.log("Parameter mappings and hooks obtained.")

   # Transform weights
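Both conversion directions now thread config.inhomogeneous_layer_cycle_interval into PARAM_MAPPING and HOOK_FNS. Per the docstring added above: architectures like gpt-oss repeat a cycle of n unlike layers (e.g. sliding vs. full attention), so a scanned checkpoint carries n separately stacked blocks instead of one. A rough sketch of that bookkeeping, under the assumption (for illustration only) that layer i within each cycle lands in scan block i; the real index arithmetic lives in the mapping utilities and may differ:

# Rough sketch of scan-block bookkeeping for an inhomogeneous layer cycle.
# Assumption (illustration only): layer i within each cycle of length n is stored
# in scan block i, so block b holds global layers b, b + n, b + 2n, ...
def layers_per_scan_block(num_hidden_layers, cycle_interval):
  blocks = {b: [] for b in range(cycle_interval)}
  for global_layer_idx in range(num_hidden_layers):
    blocks[global_layer_idx % cycle_interval].append(global_layer_idx)
  return blocks

# gpt-oss-20b: 24 decoder layers alternating sliding/full attention -> cycle of 2.
blocks = layers_per_scan_block(24, 2)
assert blocks[0] == list(range(0, 24, 2))  # block 0: the sliding_attention layers
assert blocks[1] == list(range(1, 24, 2))  # block 1: the full_attention layers

Under that reading, gpt-oss-20b has a cycle interval of 2, so each of the two scan blocks stacks 12 of the 24 decoder layers.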

src/MaxText/utils/ckpt_conversion/utils/hf_model_configs.py

Lines changed: 156 additions & 1 deletion
@@ -469,6 +469,8 @@
     vocab_size=151936,
 )

+# copy from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/config.json
+# remove fp8 quantization_config, since we are using bf16
 deepseek3_671b_dict = {
     "architectures": ["DeepseekV3ForCausalLM"],
     "attention_bias": False,
@@ -527,7 +529,157 @@
 }
 deepseek3_671b_config = transformers.DeepseekV3Config(**deepseek3_671b_dict)

-# {maxtext model name: hf model config}
+# copy from https://huggingface.co/openai/gpt-oss-20b/blob/main/config.json
+# remove mxfp4 quantization_config, since we are using bf16
+gpt_oss_20b_dict = {
+    "architectures": ["GptOssForCausalLM"],
+    "attention_bias": True,
+    "attention_dropout": 0.0,
+    "eos_token_id": 200002,
+    "experts_per_token": 4,
+    "head_dim": 64,
+    "hidden_act": "silu",
+    "hidden_size": 2880,
+    "initial_context_length": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 2880,
+    "layer_types": [
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+    ],
+    "max_position_embeddings": 131072,
+    "model_type": "gpt_oss",
+    "num_attention_heads": 64,
+    "num_experts_per_tok": 4,
+    "num_hidden_layers": 24,
+    "num_key_value_heads": 8,
+    "num_local_experts": 32,
+    "output_router_logits": False,
+    "pad_token_id": 199999,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": {
+        "beta_fast": 32.0,
+        "beta_slow": 1.0,
+        "factor": 32.0,
+        "original_max_position_embeddings": 4096,
+        "rope_type": "yarn",
+        "truncate": False,
+    },
+    "rope_theta": 150000,
+    "router_aux_loss_coef": 0.9,
+    "sliding_window": 128,
+    "swiglu_limit": 7.0,
+    "tie_word_embeddings": False,
+    "transformers_version": "4.55.0.dev0",
+    "use_cache": True,
+    "vocab_size": 201088,
+}
+gpt_oss_20b_config = transformers.GptOssConfig(**gpt_oss_20b_dict)
+
+# copy from https://huggingface.co/openai/gpt-oss-120b/blob/main/config.json
+# remove mxfp4 quantization_config, since we are using bf16
+gpt_oss_120b_dict = {
+    "architectures": ["GptOssForCausalLM"],
+    "attention_bias": True,
+    "attention_dropout": 0.0,
+    "eos_token_id": 200002,
+    "experts_per_token": 4,
+    "head_dim": 64,
+    "hidden_act": "silu",
+    "hidden_size": 2880,
+    "initial_context_length": 4096,
+    "initializer_range": 0.02,
+    "intermediate_size": 2880,
+    "layer_types": [
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+        "sliding_attention",
+        "full_attention",
+    ],
+    "max_position_embeddings": 131072,
+    "model_type": "gpt_oss",
+    "num_attention_heads": 64,
+    "num_experts_per_tok": 4,
+    "num_hidden_layers": 36,
+    "num_key_value_heads": 8,
+    "num_local_experts": 128,
+    "output_router_logits": False,
+    "pad_token_id": 199999,
+    "rms_norm_eps": 1e-05,
+    "rope_scaling": {
+        "beta_fast": 32.0,
+        "beta_slow": 1.0,
+        "factor": 32.0,
+        "original_max_position_embeddings": 4096,
+        "rope_type": "yarn",
+        "truncate": False,
+    },
+    "rope_theta": 150000,
+    "router_aux_loss_coef": 0.9,
+    "sliding_window": 128,
+    "swiglu_limit": 7.0,
+    "tie_word_embeddings": False,
+    "transformers_version": "4.55.0.dev0",
+    "use_cache": True,
+    "vocab_size": 201088,
+}
+gpt_oss_120b_config = transformers.GptOssConfig(**gpt_oss_120b_dict)
+
+
 qwen3_omni_30b_a3b_config = transformers.Qwen3OmniMoeConfig(
     # TODO(hengtaoguo): Pure-text Omni model, need to fill in visual/audio/code2wav parts
     architectures=["Qwen3OmniMoeForConditionalGeneration"],
@@ -539,6 +691,7 @@
     },
 )

+# {maxtext model name: hf model config}
 HF_MODEL_CONFIGS = {
     "gemma2-2b": gemma2_2b_config,
     "gemma2-9b": gemma2_9b_config,
@@ -560,5 +713,7 @@
     "qwen3-235b-a22b": qwen3_235b_a22b_thinking_2507_config,
     "qwen3-480b-a35b": qwen3_coder_480b_a35b_config,
     "deepseek3-671b": deepseek3_671b_config,
+    "gpt-oss-20b": gpt_oss_20b_config,
+    "gpt-oss-120b": gpt_oss_120b_config,
     "qwen3-omni-30b-a3b": qwen3_omni_30b_a3b_config,
 }
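With the registry entries in place, the converter can look up a ready-made GptOssConfig by MaxText model name. A small usage sketch (the output path is illustrative, and transformers >= 4.55 is assumed for GptOssConfig):

# Usage sketch: look up the registered config by MaxText model name and write it out.
from MaxText.utils.ckpt_conversion.utils.hf_model_configs import HF_MODEL_CONFIGS

hf_config = HF_MODEL_CONFIGS["gpt-oss-20b"]
assert hf_config.model_type == "gpt_oss"
assert hf_config.num_hidden_layers == 24 and hf_config.num_local_experts == 32

hf_config.save_pretrained("/tmp/gpt-oss-20b-hf")  # writes config.json; path is illustrative

Writing the config out like this mirrors what the conversion utility's save step needs to place next to the converted weights.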
