Commit 414bb08

[model] support glyph (#6324)
1 parent 2361909 commit 414bb08

5 files changed: +31 −3 lines changed

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 1 addition & 0 deletions
@@ -756,6 +756,7 @@
 |[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|glm4v|glm4v|transformers>=4.42|✘|-|[zai-org/cogagent-9b-20241220](https://huggingface.co/zai-org/cogagent-9b-20241220)|
 |[ZhipuAI/GLM-4.1V-9B-Base](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Base)|glm4_1v|glm4_1v|transformers>=4.53|✘|-|[zai-org/GLM-4.1V-9B-Base](https://huggingface.co/zai-org/GLM-4.1V-9B-Base)|
 |[ZhipuAI/GLM-4.1V-9B-Thinking](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Thinking)|glm4_1v|glm4_1v|transformers>=4.53|✘|-|[zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking)|
+|[ZhipuAI/Glyph](https://modelscope.cn/models/ZhipuAI/Glyph)|glm4_1v|glm4_1v|transformers>=4.57|✘|-|[zai-org/Glyph](https://huggingface.co/zai-org/Glyph)|
 |[ZhipuAI/GLM-4.5V](https://modelscope.cn/models/ZhipuAI/GLM-4.5V)|glm4_5v|glm4_5v|transformers>=4.56|✔|-|[zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)|
 |[ZhipuAI/GLM-4.5V-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.5V-FP8)|glm4_5v|glm4_5v|transformers>=4.56|✘|-|[zai-org/GLM-4.5V-FP8](https://huggingface.co/zai-org/GLM-4.5V-FP8)|
 |[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|✘|vision|[zai-org/glm-edge-v-2b](https://huggingface.co/zai-org/glm-edge-v-2b)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 1 addition & 0 deletions
@@ -756,6 +756,7 @@ The table below introduces the models integrated with ms-swift:
 |[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|glm4v|glm4v|transformers>=4.42|✘|-|[zai-org/cogagent-9b-20241220](https://huggingface.co/zai-org/cogagent-9b-20241220)|
 |[ZhipuAI/GLM-4.1V-9B-Base](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Base)|glm4_1v|glm4_1v|transformers>=4.53|✘|-|[zai-org/GLM-4.1V-9B-Base](https://huggingface.co/zai-org/GLM-4.1V-9B-Base)|
 |[ZhipuAI/GLM-4.1V-9B-Thinking](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Thinking)|glm4_1v|glm4_1v|transformers>=4.53|✘|-|[zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking)|
+|[ZhipuAI/Glyph](https://modelscope.cn/models/ZhipuAI/Glyph)|glm4_1v|glm4_1v|transformers>=4.57|✘|-|[zai-org/Glyph](https://huggingface.co/zai-org/Glyph)|
 |[ZhipuAI/GLM-4.5V](https://modelscope.cn/models/ZhipuAI/GLM-4.5V)|glm4_5v|glm4_5v|transformers>=4.56|✔|-|[zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)|
 |[ZhipuAI/GLM-4.5V-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.5V-FP8)|glm4_5v|glm4_5v|transformers>=4.56|✘|-|[zai-org/GLM-4.5V-FP8](https://huggingface.co/zai-org/GLM-4.5V-FP8)|
 |[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|✘|vision|[zai-org/glm-edge-v-2b](https://huggingface.co/zai-org/glm-edge-v-2b)|

swift/llm/model/model/glm.py

Lines changed: 6 additions & 0 deletions
@@ -273,6 +273,12 @@ def get_model_tokenizer_glm4_1v(*args, **kwargs):
                 ],
                 requires=['transformers>=4.53'],
             ),
+            ModelGroup(
+                [
+                    Model('ZhipuAI/Glyph', 'zai-org/Glyph'),
+                ],
+                requires=['transformers>=4.57'],
+            ),
         ],
         TemplateType.glm4_1v,
         get_model_tokenizer_glm4_1v,
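
This registration is all that routes `ZhipuAI/Glyph` through the existing glm4_1v loader and template. A minimal inference sketch against the new entry, using ms-swift's Python API; the image URL, prompt, and generation settings here are illustrative assumptions, not part of this commit:

# Assumes ms-swift with this commit and transformers>=4.57 installed.
from swift.llm import InferRequest, PtEngine, RequestConfig

# 'ZhipuAI/Glyph' now resolves via the ModelGroup registered above.
engine = PtEngine('ZhipuAI/Glyph')
request = InferRequest(
    messages=[{'role': 'user', 'content': '<image>Describe the image.'}],
    images=['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png'],
)
resp = engine.infer([request], RequestConfig(max_tokens=128, temperature=0))
print(resp[0].choices[0].message.content)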

swift/llm/template/template/glm.py

Lines changed: 8 additions & 2 deletions
@@ -117,10 +117,16 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
 class GLM4_1VTemplate(Template):
     begin_of_image_token = 151339
     end_of_image_token = 151340
-    image_token = 151343
     begin_of_video_token = 151341
     end_of_video_token = 151342
-    video_token = 151344
+    placeholder_tokens = ['<|image|>', '<|video|>']
+
+    def init_processor(self, processor) -> None:
+        if processor is None:
+            return
+        super().init_processor(processor)
+        self.image_token = self._tokenize('<|image|>')[0]
+        self.video_token = self._tokenize('<|video|>')[0]
 
     def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                     inputs: StdTemplateInputs) -> List[Context]:
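
The template change removes the hardcoded image/video token IDs (151343 and 151344, which are correct for GLM-4.1V) and instead resolves them from the checkpoint's own tokenizer when the processor is initialized, so Glyph can share the glm4_1v template even if its vocabulary maps the placeholders to different IDs. A standalone sketch of the same lookup with a plain transformers tokenizer, assuming the checkpoint defines both special tokens:

# Resolve placeholder token IDs from the tokenizer instead of hardcoding them.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('zai-org/Glyph')
image_token_id = tokenizer.convert_tokens_to_ids('<|image|>')
video_token_id = tokenizer.convert_tokens_to_ids('<|video|>')
print(image_token_id, video_token_id)  # IDs depend on the checkpoint's vocabulary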

tests/test_align/test_template/test_vision.py

Lines changed: 15 additions & 1 deletion
@@ -710,6 +710,19 @@ def test_glm4_1v():
     assert response == response2
 
 
+def test_glyph():
+    messages = [{'role': 'user', 'content': '<image><image>What is the difference between the two images?'}]
+    images = [
+        'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png',
+        'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png'
+    ]
+    pt_engine = PtEngine('ZhipuAI/Glyph')
+    response = _infer_model(pt_engine, messages=messages, images=images)
+    pt_engine.default_template.template_backend = 'jinja'
+    response2 = _infer_model(pt_engine, messages=messages, images=images)
+    assert response == response2
+
+
 def test_gemma3n():
     pt_engine = PtEngine('google/gemma-3n-E2B-it')
     messages = [{

@@ -1066,6 +1079,7 @@ def test_paddle_ocr():
     # test_kimi_vl()
     # test_kimi_vl_thinking()
     # test_glm4_1v()
+    test_glyph()
     # test_gemma3n()
     # test_keye_vl()
     # test_dots_ocr()

@@ -1081,4 +1095,4 @@ def test_paddle_ocr():
     # test_sailvl2()
     # test_deepseek_ocr()
     # test_llava_onevision1_5()
-    test_paddle_ocr()
+    # test_paddle_ocr()
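
With the commit applied, the new test can be run on its own (it downloads the checkpoint, so it needs network access and a suitable GPU), e.g. via `pytest tests/test_align/test_template/test_vision.py -k test_glyph`, or by executing the file directly, since the commit enables the `test_glyph()` call at the bottom of the module. The assertion checks that ms-swift's glm4_1v template and the checkpoint's jinja chat template produce identical responses.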
