
Commit 3fc1091

csabakecskemeti authored and Nexesenex committed
vocab : JetBrains Mellum pre-tokenizer (ggml-org#15045)
1 parent: 874f67a

File tree

3 files changed: 6 additions, 1 deletion


convert_hf_to_gguf.py

Lines changed: 3 additions & 0 deletions
@@ -1182,6 +1182,9 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
             res = "exaone4"
+        if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
+            # ref: https://huggingface.co/JetBrains/Mellum-4b-base
+            res = "mellum"
 
         if res is None:
             logger.warning("\n")
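
For context, get_vocab_base_pre fingerprints a tokenizer by encoding a fixed test string and hashing the resulting token IDs; the new branch simply maps Mellum's fingerprint to res = "mellum". Below is a minimal sketch of how that checksum could be reproduced locally. It assumes the transformers package is installed, and chktxt is only a placeholder; the actual test string is the one defined in the convert scripts and is not reproduced here.

from hashlib import sha256
from transformers import AutoTokenizer  # assumed dependency, same library the convert scripts use

# Placeholder only: substitute the test string defined in the convert scripts.
chktxt = "..."

tokenizer = AutoTokenizer.from_pretrained("JetBrains/Mellum-4b-base")

# Hash the token IDs produced for the test string; this is the "chkhsh"
# value that get_vocab_base_pre compares against its known entries.
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(chkhsh)

# With the real test string this is expected to print the hash added above:
# a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756
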

convert_hf_to_gguf_update.py

Lines changed: 1 addition & 0 deletions
@@ -130,6 +130,7 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "midm-2.0", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
     {"name": "lfm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
     {"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
+    {"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
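
If I read the update workflow correctly, registering Mellum-4b-base here means a subsequent run of convert_hf_to_gguf_update.py will download that tokenizer, recompute its checksum, and regenerate the get_vocab_base_pre branch shown above, keeping the hardcoded hash in convert_hf_to_gguf.py in sync with this list.
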

src/llama-vocab.cpp

Lines changed: 2 additions & 1 deletion
@@ -2092,7 +2092,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "gigachat"   ||
                 tokenizer_pre == "jina-v2-es" ||
                 tokenizer_pre == "jina-v2-de" ||
-                tokenizer_pre == "a.x-4.0") {
+                tokenizer_pre == "a.x-4.0" ||
+                tokenizer_pre == "mellum") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
             } else if (
                 tokenizer_pre == "jina-v1-en" ||
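
Net effect of the vocab change: a GGUF whose pre-tokenizer name is "mellum" now falls into the same branch as the GPT-2 style tokenizers, so llama.cpp applies LLAMA_VOCAB_PRE_TYPE_GPT2 (the GPT-2 splitting behavior) at load time instead of treating the name as unknown.
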

0 commit comments
