From 7cf5c4c0a1c9721a5770e4282322bac05b2e869c Mon Sep 17 00:00:00 2001
From: RunningLeon
Date: Wed, 16 Jul 2025 20:55:57 +0800
Subject: [PATCH 1/6] support internvl

---
 convert_hf_to_gguf.py | 24 ++++++++++++++++++------
 1 file changed, 18 insertions(+), 6 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index c201883509ceb..7fa093d7acc59 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -420,6 +420,9 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
         for filename in os.listdir(dir_model):
             if filename.startswith(prefix) and filename.endswith(suffix):
                 part_names.append(filename)
+            # TODO remove later
+            elif filename.endswith(suffix):
+                part_names.append(filename)
 
         part_names.sort()
 
@@ -607,13 +610,14 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
-        assert max(tokenizer.vocab.values()) < vocab_size
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
 
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
         added_tokens_decoder = tokenizer.added_tokens_decoder
 
@@ -1218,8 +1222,12 @@ def __init__(self, *args, **kwargs):
         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
 
         # load preprocessor config
-        with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
-            self.preprocessor_config = json.load(f)
+        preprocess_config_file = self.dir_model / "preprocessor_config.json"
+        if preprocess_config_file.exists():
+            with open(preprocess_config_file, "r", encoding="utf-8") as f:
+                self.preprocessor_config = json.load(f)
+        else:
+            self.preprocessor_config = dict(image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225])
 
     def get_vision_config(self) -> dict[str, Any] | None:
         return self.global_config.get("vision_config")
@@ -3115,6 +3123,10 @@ def set_gguf_parameters(self):
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # process the experts separately
+        name = name.replace("language_model.", "") # InternVL
+        if name.startswith("mlp") or name.startswith("vision_model"):
+            # skip visual tensors
+            return []
         if name.find("experts") != -1:
             n_experts = self.hparams["num_experts"]
             assert bid is not None

From 859796e207dfb565d84fe5846c99177be20b399a Mon Sep 17 00:00:00 2001
From: RunningLeon
Date: Fri, 25 Jul 2025 19:29:59 +0800
Subject: [PATCH 2/6] support interns1

---
 convert_hf_to_gguf.py | 82 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 77 insertions(+), 5 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 7fa093d7acc59..9b70cfe3639fc 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -420,9 +420,6 @@ def get_model_part_names(dir_model: Path, prefix: str, suffix: str) -> list[str]
         for filename in os.listdir(dir_model):
             if filename.startswith(prefix) and filename.endswith(suffix):
                 part_names.append(filename)
-            # TODO remove later
-            elif filename.endswith(suffix):
-                part_names.append(filename)
 
         part_names.sort()
 
@@ -3006,7 +3003,12 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 @ModelBase.register("InternVisionModel")
 class InternVisionModel(MmprojModel):
     def set_gguf_parameters(self):
+        if isinstance(self.hparams_vision['image_size'], list):
+            self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0]
+        if isinstance(self.hparams_vision['patch_size'], list):
+            self.hparams_vision['patch_size'] = self.hparams_vision['patch_size'][0]
         super().set_gguf_parameters()
+
         hparams = self.hparams
         self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.INTERNVL)
         self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
@@ -3030,8 +3032,43 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
             return gguf.GGMLQuantizationType.F32
         return False
 
+    def _mapping_name_interns1(self, name):
+        names_map = {
+            "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias",
+            "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
+            "model.multi_modal_projector.linear_1.bias": "mlp1.1.bias",
+            "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight",
+            "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias",
+            "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight",
+            "model.vision_tower.embeddings.cls_token": "vision_model.embeddings.class_embedding",
+            "model.vision_tower.embeddings.patch_embeddings.projection.bias": "vision_model.embeddings.patch_embedding.bias",
+            "model.vision_tower.embeddings.patch_embeddings.projection.weight": "vision_model.embeddings.patch_embedding.weight",
+            "model.vision_tower.embeddings.position_embeddings": "vision_model.embeddings.position_embedding",
+        }
+        if name in names_map:
+            name = names_map[name]
+        elif name.startswith("model.language_model."):
+            name = "language_model.model." + name[len("model.language_model.") :]
+        elif name.startswith("model.vision_tower."):
+            name = "vision_model." + name[len("model.vision_tower.") :]
+
+        if name.startswith("vision_model.encoder.layer"):
+            name = name.replace(r".layer.", r".layers.")
+            name = name.replace(r".attention.", r".attn.")
+            name = name.replace(r".attn.q_proj", r".self_attn.q_proj")
+            name = name.replace(r".attn.k_proj", r".self_attn.k_proj")
+            name = name.replace(r".attn.v_proj", r".self_attn.v_proj")
+            name = name.replace(r".projection_layer.", r".proj.")
+            name = name.replace(r".lambda_1", r".ls1")
+            name = name.replace(r".lambda_2", r".ls2")
+            name = name.replace(r".layernorm_before.", r".norm1.")
+            name = name.replace(r".layernorm_after.", r".norm2.")
+        return name
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
+        name = self._mapping_name_interns1(name)
+        # support interns1
         if name.startswith("vision_model") or name.startswith("mlp"):
             # process visual tensors
             # correct name
@@ -3123,8 +3160,8 @@ def set_gguf_parameters(self):
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # process the experts separately
-        name = name.replace("language_model.", "") # InternVL
-        if name.startswith("mlp") or name.startswith("vision_model"):
+        name = name.replace(r"language_model.", r"") # InternVL
+        if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
            # skip visual tensors
             return []
         if name.find("experts") != -1:
@@ -3180,6 +3217,41 @@ class Qwen3Model(Qwen2Model):
 class Qwen3MoeModel(Qwen2MoeModel):
     model_arch = gguf.MODEL_ARCH.QWEN3MOE
 
+    def set_vocab(self):
+        # deal with interns1
+        if 'interns1' in f'{self.dir_model}'.lower():
+            self._set_vocab_interns1()
+            return
+
+        try:
+            self._set_vocab_sentencepiece()
+        except FileNotFoundError:
+            self._set_vocab_gpt2()
+
+    def _set_vocab_interns1(self):
+        tokens, toktypes, tokpre = self.get_vocab_base()
+        self.gguf_writer.add_tokenizer_model("gpt2")
+        self.gguf_writer.add_tokenizer_pre(tokpre)
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_tokens_map_file = self.dir_model / 'special_tokens_map.json'
+        additional_special_tokens = []
+        if special_tokens_map_file.is_file():
+            with open(special_tokens_map_file, encoding = 'utf-8') as f:
+            additional_special_tokens = json.load(f).get('additional_special_tokens', [])
+        tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
+        if tokenizer_cfg_file.is_file():
+            with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
+            added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
+        token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
+        for token in additional_special_tokens:
+            if token in token2ids_map:
+                special_vocab._set_special_token(token, token2ids_map[token])
+        special_vocab._set_special_token('eos', 151645)
+        special_vocab._set_special_token("bos", 151643)
+        special_vocab.add_to_gguf(self.gguf_writer)
 
 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):

From 483ffef516fcc706198c61757af8d9aebfc7bf14 Mon Sep 17 00:00:00 2001
From: RunningLeon
Date: Mon, 28 Jul 2025 18:10:12 +0800
Subject: [PATCH 3/6] resolve comments

---
 convert_hf_to_gguf.py | 34 ++++++++++++++++++----------------
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 9b70cfe3639fc..76e55dea623a6 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -607,7 +607,7 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
         toktypes: list[int] = []
 
         from transformers import AutoTokenizer
-        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
         vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
         vocab_size = self.hparams.get("vocab_size", len(vocab))
         assert max(vocab.values()) < vocab_size
@@ -1219,12 +1219,8 @@ def __init__(self, *args, **kwargs):
         self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
 
         # load preprocessor config
-        preprocess_config_file = self.dir_model / "preprocessor_config.json"
-        if preprocess_config_file.exists():
-            with open(preprocess_config_file, "r", encoding="utf-8") as f:
-                self.preprocessor_config = json.load(f)
-        else:
-            self.preprocessor_config = dict(image_mean=[0.485, 0.456, 0.406], image_std=[0.229, 0.224, 0.225])
+        with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
+            self.preprocessor_config = json.load(f)
 
     def get_vision_config(self) -> dict[str, Any] | None:
         return self.global_config.get("vision_config")
@@ -3160,7 +3156,7 @@ def set_gguf_parameters(self):
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # process the experts separately
-        name = name.replace(r"language_model.", r"") # InternVL
+        name = name.replace("language_model.", "") # InternVL
         if name.startswith("mlp") or name.startswith("vision_model") or name.startswith("model.vision_tower") or name.startswith("model.multi_modal_projector"):
             # skip visual tensors
             return []
@@ -3213,9 +3213,14 @@ class Qwen3Model(Qwen2Model):
 class Qwen3MoeModel(Qwen2MoeModel):
     model_arch = gguf.MODEL_ARCH.QWEN3MOE
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        hparams = ModelBase.load_hparams(self.dir_model)
+        self.origin_hf_arch = hparams.get('architectures', [None])[0]
+
     def set_vocab(self):
-        # deal with interns1
-        if 'interns1' in f'{self.dir_model}'.lower():
+        # deal with intern-s1
+        if self.origin_hf_arch == 'InternS1ForConditionalGeneration':
             self._set_vocab_interns1()
             return
@@ -3240,19 +3241,20 @@ def _set_vocab_interns1(self):
         additional_special_tokens = []
         if special_tokens_map_file.is_file():
             with open(special_tokens_map_file, encoding = 'utf-8') as f:
-            additional_special_tokens = json.load(f).get('additional_special_tokens', [])
+                additional_special_tokens = json.load(f).get('additional_special_tokens', [])
         tokenizer_cfg_file = self.dir_model / 'special_tokens_map.json'
         if tokenizer_cfg_file.is_file():
             with open(tokenizer_cfg_file, encoding = 'utf-8') as f:
-            added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
-        token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
-        for token in additional_special_tokens:
-            if token in token2ids_map:
-                special_vocab._set_special_token(token, token2ids_map[token])
+                added_tokens_decoder = json.load(f).get('added_tokens_decoder', {})
+                token2ids_map = {data['content'] : int(token) for token, data in added_tokens_decoder.items() if data['special']}
+                for token in additional_special_tokens:
+                    if token in token2ids_map:
+                        special_vocab._set_special_token(token, token2ids_map[token])
         special_vocab._set_special_token('eos', 151645)
         special_vocab._set_special_token("bos", 151643)
         special_vocab.add_to_gguf(self.gguf_writer)
+
 
 @ModelBase.register("GPT2LMHeadModel")
 class GPT2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.GPT2

From c71543c832addd37ed75b59b0198e41e2b3c5e5f Mon Sep 17 00:00:00 2001
From: RunningLeon
Date: Wed, 30 Jul 2025 10:37:42 +0800
Subject: [PATCH 4/6] put interns1 in tensor mapping

---
 convert_hf_to_gguf.py          | 31 ++++++-------------------------
 gguf-py/gguf/tensor_mapping.py | 16 ++++++++++++++++
 2 files changed, 22 insertions(+), 25 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 76e55dea623a6..9d0c17f5d8c6a 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -3028,7 +3028,7 @@ def tensor_force_quant(self, name, new_name, bid, n_dims):
             return gguf.GGMLQuantizationType.F32
         return False
 
-    def _mapping_name_interns1(self, name):
+    def _mapping_interns1_name(self, name):
         names_map = {
             "model.multi_modal_projector.layer_norm.bias": "mlp1.0.bias",
             "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
@@ -3036,41 +3036,22 @@ def _mapping_interns1_name(self, name):
             "model.multi_modal_projector.linear_1.weight": "mlp1.1.weight",
             "model.multi_modal_projector.linear_2.bias": "mlp1.3.bias",
             "model.multi_modal_projector.linear_2.weight": "mlp1.3.weight",
-            "model.vision_tower.embeddings.cls_token": "vision_model.embeddings.class_embedding",
-            "model.vision_tower.embeddings.patch_embeddings.projection.bias": "vision_model.embeddings.patch_embedding.bias",
-            "model.vision_tower.embeddings.patch_embeddings.projection.weight": "vision_model.embeddings.patch_embedding.weight",
-            "model.vision_tower.embeddings.position_embeddings": "vision_model.embeddings.position_embedding",
         }
         if name in names_map:
             name = names_map[name]
-        elif name.startswith("model.language_model."):
-            name = "language_model.model." + name[len("model.language_model.") :]
-        elif name.startswith("model.vision_tower."):
-            name = "vision_model." + name[len("model.vision_tower.") :]
-
-        if name.startswith("vision_model.encoder.layer"):
-            name = name.replace(r".layer.", r".layers.")
-            name = name.replace(r".attention.", r".attn.")
-            name = name.replace(r".attn.q_proj", r".self_attn.q_proj")
-            name = name.replace(r".attn.k_proj", r".self_attn.k_proj")
-            name = name.replace(r".attn.v_proj", r".self_attn.v_proj")
-            name = name.replace(r".projection_layer.", r".proj.")
-            name = name.replace(r".lambda_1", r".ls1")
-            name = name.replace(r".lambda_2", r".ls2")
-            name = name.replace(r".layernorm_before.", r".norm1.")
-            name = name.replace(r".layernorm_after.", r".norm2.")
         return name
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid # unused
-        name = self._mapping_name_interns1(name)
-        # support interns1
-        if name.startswith("vision_model") or name.startswith("mlp"):
+        vision_prefix = ['vision_model', 'mlp', 'model.vision_tower', 'model.multi_modal_projector']
+        # deal with intern-s1 special case
+        name = self._mapping_interns1_name(name)
+        if any([name.startswith(prefix) for prefix in vision_prefix]):
             # process visual tensors
             # correct name
             if name.startswith("vision_model"):
                 name = "vision_tower." + name
-            if (".ls" in name or "position_embedding" in name) and not name.endswith(".weight"):
+            if (".ls" in name or ".lambda_" in name or "position_embedding" in name) and not name.endswith(".weight"):
                 name += ".weight"
             # split QKV tensors if needed
             if ".qkv." in name:
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 75855eba52c3c..436987bf27748 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1054,11 +1054,13 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_ENC_EMBD_CLS: (
             "vision_tower.vision_model.embeddings.class_embedding",
+            "model.vision_tower.embeddings.cls_token", # Intern-S1
             "vision_model.class_embedding", # llama 4
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_PATCH: (
             "vision_tower.vision_model.embeddings.patch_embedding",
+            "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
             "vpm.embeddings.patch_embedding",
             "model.vision_model.embeddings.patch_embedding", # SmolVLM
             "vision_tower.patch_conv", # pixtral
@@ -1068,6 +1070,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_ENC_EMBD_POS: (
             "vision_tower.vision_model.embeddings.position_embedding",
+            "model.vision_tower.embeddings.position_embeddings", # Intern-S1
             "vpm.embeddings.position_embedding",
             "model.vision_model.embeddings.position_embedding", # SmolVLM
             "vision_model.positional_embedding_vlm", # llama 4
@@ -1075,6 +1078,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_ENC_ATTN_Q: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.q_proj",
+            "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.q_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
             "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
@@ -1084,10 +1088,12 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_ENC_ATTN_Q_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.attn.q_norm", # InternVL
+            "model.vision_tower.encoder.layer.{bid}.attention.q_norm", # Intern-S1
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_K: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.k_proj",
+            "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
             "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
@@ -1097,10 +1103,12 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_ENC_ATTN_K_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.attn.k_norm", # InternVL
+            "model.vision_tower.encoder.layer.{bid}.attention.k_norm", # Intern-S1
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_V: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.v_proj",
+            "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
             "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
@@ -1111,6 +1119,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_ENC_INPUT_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.layer_norm1",
             "vision_tower.vision_model.encoder.layers.{bid}.norm1", # InternVL
+            "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
             "vpm.encoder.layers.{bid}.layer_norm1",
             "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
             "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
@@ -1121,6 +1130,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_ENC_ATTN_O: (
             "vision_tower.vision_model.encoder.layers.{bid}.self_attn.out_proj",
             "vision_tower.vision_model.encoder.layers.{bid}.attn.proj", # InternVL
+            "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
             "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
@@ -1131,6 +1141,7 @@ class TensorNameMap:
         MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
             "vision_tower.vision_model.encoder.layers.{bid}.layer_norm2",
             "vision_tower.vision_model.encoder.layers.{bid}.norm2", # InternVL
+            "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
             "vpm.encoder.layers.{bid}.layer_norm2",
             "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
             "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
@@ -1140,6 +1151,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_ENC_FFN_UP: (
             "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
+            "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
             "vpm.encoder.layers.{bid}.mlp.fc1",
             "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
             "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
@@ -1155,6 +1167,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_ENC_FFN_DOWN: (
             "vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
+            "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
             "vpm.encoder.layers.{bid}.mlp.fc2",
             "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
             "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
@@ -1165,10 +1178,12 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_LAYER_SCALE_1: (
             "vision_tower.vision_model.encoder.layers.{bid}.ls1", # InternVL
+            "model.vision_tower.encoder.layer.{bid}.lambda_1", # Intern-S1
         ),
 
         MODEL_TENSOR.V_LAYER_SCALE_2: (
             "vision_tower.vision_model.encoder.layers.{bid}.ls2", # InternVL
+            "model.vision_tower.encoder.layer.{bid}.lambda_2", # Intern-S1
         ),
 
         MODEL_TENSOR.V_PRE_NORM: (
@@ -1190,6 +1205,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MM_INP_NORM: (
             "multi_modal_projector.norm",
+            "model.multi_modal_projector.layer_norm", # Intern-S1
         ),
 
         MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (

From 5eba3e37ee4d6c3ea91469d493d495d15089cb9f Mon Sep 17 00:00:00 2001
From: RunningLeon
Date: Wed, 30 Jul 2025 20:27:55 +0800
Subject: [PATCH 5/6] resolve comment

---
 convert_hf_to_gguf.py          | 1 +
 gguf-py/gguf/tensor_mapping.py | 1 -
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 9d0c17f5d8c6a..39c87fe8d9895 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2999,6 +2999,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 @ModelBase.register("InternVisionModel")
 class InternVisionModel(MmprojModel):
     def set_gguf_parameters(self):
+        assert self.hparams_vision is not None
         if isinstance(self.hparams_vision['image_size'], list):
             self.hparams_vision['image_size'] = self.hparams_vision['image_size'][0]
         if isinstance(self.hparams_vision['patch_size'], list):
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 436987bf27748..ed622b746e494 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1205,7 +1205,6 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MM_INP_NORM: (
             "multi_modal_projector.norm",
-            "model.multi_modal_projector.layer_norm", # Intern-S1
         ),
 
         MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (

From 490a13f9665e1e6879b0291c42475211a1cc2610 Mon Sep 17 00:00:00 2001
From: RunningLeon
Date: Mon, 4 Aug 2025 18:52:40 +0800
Subject: [PATCH 6/6] move tokenizer changes to sub class

---
 convert_hf_to_gguf.py | 47 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 42 insertions(+), 5 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 39c87fe8d9895..f28ce00f08b46 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -608,13 +608,12 @@ def get_vocab_base(self) -> tuple[list[str], list[int], str]:
 
         from transformers import AutoTokenizer
         tokenizer = AutoTokenizer.from_pretrained(self.dir_model)
-        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
-        vocab_size = self.hparams.get("vocab_size", len(vocab))
-        assert max(vocab.values()) < vocab_size
+        vocab_size = self.hparams.get("vocab_size", len(tokenizer.vocab))
+        assert max(tokenizer.vocab.values()) < vocab_size
 
         tokpre = self.get_vocab_base_pre(tokenizer)
 
-        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
         added_tokens_decoder = tokenizer.added_tokens_decoder
 
@@ -3212,7 +3211,45 @@ def set_vocab(self):
             self._set_vocab_gpt2()
 
     def _set_vocab_interns1(self):
-        tokens, toktypes, tokpre = self.get_vocab_base()
+        tokens: list[str] = []
+        toktypes: list[int] = []
+
+        from transformers import AutoTokenizer
+        tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
+        vocab = getattr(tokenizer, 'vocab', tokenizer.get_vocab())
+        vocab_size = self.hparams.get("vocab_size", len(vocab))
+        assert max(vocab.values()) < vocab_size
+
+        tokpre = self.get_vocab_base_pre(tokenizer)
+
+        reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in vocab.items()}
+        added_vocab = tokenizer.get_added_vocab()
+
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
+        for i in range(vocab_size):
+            if i not in reverse_vocab:
+                tokens.append(f"[PAD{i}]")
+                toktypes.append(gguf.TokenType.UNUSED)
+            else:
+                token: str = reverse_vocab[i]
+                if token in added_vocab:
+                    # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
+                    # To avoid unexpected issues - we make sure to normalize non-normalized tokens
+                    if not added_tokens_decoder[i].normalized:
+                        previous_token = token
+                        token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
+                        if previous_token != token:
+                            logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
+
+                    if added_tokens_decoder[i].special or self.does_token_look_special(token):
+                        toktypes.append(gguf.TokenType.CONTROL)
+                    else:
+                        toktypes.append(gguf.TokenType.USER_DEFINED)
+                else:
+                    toktypes.append(gguf.TokenType.NORMAL)
+                tokens.append(token)
+
         self.gguf_writer.add_tokenizer_model("gpt2")
         self.gguf_writer.add_tokenizer_pre(tokpre)
         self.gguf_writer.add_token_list(tokens)
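
A note for reviewers: the sketch below condenses the Intern-S1 -> InternVL
tensor-name remapping that PATCH 2 adds as _mapping_name_interns1 (renamed
_mapping_interns1_name in PATCH 4, which also moves the per-layer renames into
gguf-py/gguf/tensor_mapping.py). The helper name remap_interns1_name, the
truncated names_map, and the sample tensor names in the demo are illustrative
assumptions, not code taken from the patches:

    # Minimal sketch of the remapping behavior, assuming Intern-S1-style HF
    # tensor names; only a subset of the patch's names_map is reproduced here.
    def remap_interns1_name(name: str) -> str:
        # direct renames for projector / embedding tensors
        names_map = {
            "model.multi_modal_projector.layer_norm.weight": "mlp1.0.weight",
            "model.vision_tower.embeddings.cls_token": "vision_model.embeddings.class_embedding",
        }
        if name in names_map:
            return names_map[name]
        if name.startswith("model.language_model."):
            return "language_model.model." + name[len("model.language_model."):]
        if name.startswith("model.vision_tower."):
            name = "vision_model." + name[len("model.vision_tower."):]
        if name.startswith("vision_model.encoder.layer"):
            # order matters: ".attention." must become ".attn." before the
            # q/k/v projections are rewritten to ".self_attn.*"
            for old, new in [
                (".layer.", ".layers."),
                (".attention.", ".attn."),
                (".attn.q_proj", ".self_attn.q_proj"),
                (".attn.k_proj", ".self_attn.k_proj"),
                (".attn.v_proj", ".self_attn.v_proj"),
                (".projection_layer.", ".proj."),
                (".lambda_1", ".ls1"),
                (".lambda_2", ".ls2"),
                (".layernorm_before.", ".norm1."),
                (".layernorm_after.", ".norm2."),
            ]:
                name = name.replace(old, new)
        return name

    if __name__ == "__main__":
        # hypothetical tensor names, shaped like an Intern-S1 checkpoint
        assert remap_interns1_name("model.vision_tower.encoder.layer.0.attention.q_proj.weight") \
            == "vision_model.encoder.layers.0.self_attn.q_proj.weight"
        assert remap_interns1_name("model.vision_tower.encoder.layer.5.lambda_1") \
            == "vision_model.encoder.layers.5.ls1"
        assert remap_interns1_name("model.language_model.layers.0.mlp.gate_proj.weight") \
            == "language_model.model.layers.0.mlp.gate_proj.weight"
        print("name remapping OK")

After this remapping, names flow through the converter's existing InternVL
paths (the "vision_tower." prefixing and the tensor_map lookup), which is why
PATCH 4 can drop most of the helper in favor of tensor_mapping.py entries.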