Skip to content

Commit 272c1f0

Browse files

feat: support GLM 4.5 family of models

1 parent: fdf3da0 · commit: 272c1f0

File tree

2 files changed: +11 additions, −9 deletions

convert_hf_to_gguf.py

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6645,10 +6645,11 @@ def set_gguf_parameters(self):
66456645
def modify_tensors(
66466646
self, data_torch: Tensor, name: str, bid: int | None
66476647
) -> Iterable[tuple[str, Tensor]]:
6648-
# Handle special GLM4_MOE layer 46 tensors (nextn prediction layer)
6648+
# Handle layer 46 tensors - preserve all for future MTP support
66496649
if bid is not None and bid == 46:
6650-
# Layer 46 is the nextn prediction layer - skip all tensors
6651-
return []
6650+
# Convert layer 46 tensors to GGUF naming but don't try to map them
6651+
new_name = name.replace("model.layers.", "blk.")
6652+
return [(new_name, data_torch)]
66526653

66536654
if name.startswith("model.visual."): # ignore visual part
66546655
return []
@@ -6659,8 +6660,8 @@ def modify_tensors(
66596660
if name == "model.embed_tokens.weight":
66606661
return [(self.map_tensor_name("token_embd.weight"), data_torch)]
66616662

6662-
# Handle routed experts
6663-
if name.find("mlp.experts") != -1 and "shared_experts" not in name:
6663+
# Handle routed experts (skip for NextN layer 46)
6664+
if name.find("mlp.experts") != -1 and "shared_experts" not in name and bid != 46:
66646665
n_experts = self.hparams["n_routed_experts"]
66656666
assert bid is not None
66666667

@@ -6727,16 +6728,17 @@ def modify_tensors(
67276728
new_name = name
67286729
return [(self.map_tensor_name(new_name), data_torch)]
67296730

6730-
# Handle other special GLM4_MOE tensors (nextn prediction)
6731+
# Handle special NextN tensors - preserve for future MTP support
67316732
if (
67326733
".embed_tokens." in name
67336734
or ".shared_head." in name
67346735
or ".eh_proj." in name
67356736
or ".enorm." in name
67366737
or ".hnorm." in name
67376738
):
6738-
# Skip these special tensors - they are for nextn prediction
6739-
return []
6739+
# For NextN tensors, convert to GGUF naming convention
6740+
new_name = name.replace("model.layers.", "blk.").replace("model.", "")
6741+
return [(new_name, data_torch)]
67406742

67416743
return super().modify_tensors(data_torch, name, bid)
67426744

src/llama-model.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4399,7 +4399,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
43994399
layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K, "bias", i), { n_embd_k_gqa }, TENSOR_NOT_REQUIRED);
44004400
layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V, "bias", i), { n_embd_v_gqa }, TENSOR_NOT_REQUIRED);
44014401

4402-
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd, n_embd }, 0);
4402+
layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), { n_embd_head_k * n_head, n_embd }, 0);
44034403
layer.attn_post_norm = create_tensor(tn(LLM_TENSOR_ATTN_POST_NORM, "weight", i), { n_embd }, 0);
44044404

44054405
// K/Q norm tensors (optional for GLM-4.5 355B variant)

Comments (0)