
Commit 3fc1091

csabakecskemeti authored and Nexesenex committed
vocab : JetBrains Mellum pre-tokenizer (ggml-org#15045)
1 parent: 874f67a

File tree

3 files changed: 6 additions, 1 deletion


convert_hf_to_gguf.py

Lines changed: 3 additions & 0 deletions
@@ -1182,6 +1182,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
             res = "exaone4"
+        if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
+            # ref: https://huggingface.co/JetBrains/Mellum-4b-base
+            res = "mellum"
 
         if res is None:
             logger.warning("\n")
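
For context, get_vocab_base_pre fingerprints a tokenizer by encoding a fixed test string and hashing the resulting token IDs; the new branch simply maps Mellum's fingerprint to res = "mellum". Below is a minimal sketch of how that checksum could be reproduced locally. It assumes the transformers package is installed, and chktxt is only a placeholder; the actual test string is the one defined in the convert scripts and is not reproduced here.

from hashlib import sha256
from transformers import AutoTokenizer  # assumed dependency, same library the convert scripts use

# Placeholder only: substitute the test string defined in the convert scripts.
chktxt = "..."

tokenizer = AutoTokenizer.from_pretrained("JetBrains/Mellum-4b-base")

# Hash the token IDs produced for the test string; this is the "chkhsh"
# value that get_vocab_base_pre compares against its known entries.
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)

# With the real test string this is expected to print the hash added above:
# a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756
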

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -130,6 +130,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
     {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
     {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
+    {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
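
If I read the update workflow correctly, registering Mellum-4b-base here means a subsequent run of convert_hf_to_gguf_update.py will download that tokenizer, recompute its checksum, and regenerate the get_vocab_base_pre branch shown above, keeping the hardcoded hash in convert_hf_to_gguf.py in sync with this list.
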

src/llama-vocab.cpp

Lines changed: 2 additions & 1 deletion
@@ -2092,7 +2092,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "gigachat"   ||
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
-                tokenizer_pre == "a.x-4.0") {
+                tokenizer_pre == "a.x-4.0" ||
+                tokenizer_pre == "mellum") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
             } else if (
                 tokenizer_pre == "jina-v1-en" ||
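
Net effect of the vocab change: a GGUF whose pre-tokenizer name is "mellum" now falls into the same branch as the GPT-2 style tokenizers, so llama.cpp applies LLAMA_VOCAB_PRE_TYPE_GPT2 (the GPT-2 splitting behavior) at load time instead of treating the name as unknown.
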

0 commit comments
