Commit 414bb08

[model] support glyph (#6324)
1 parent 2361909 commit 414bb08

5 files changed: +31 −3 lines changed

docs/source/Instruction/支持的模型和数据集.md

Lines changed: 1 addition & 0 deletions
@@ -756,6 +756,7 @@
 |[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|glm4v|glm4v|transformers>=4.42|✘|-|[zai-org/cogagent-9b-20241220](https://huggingface.co/zai-org/cogagent-9b-20241220)|
 |[ZhipuAI/GLM-4.1V-9B-Base](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Base)|glm4_1v|glm4_1v|transformers>=4.53|✘|-|[zai-org/GLM-4.1V-9B-Base](https://huggingface.co/zai-org/GLM-4.1V-9B-Base)|
 |[ZhipuAI/GLM-4.1V-9B-Thinking](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Thinking)|glm4_1v|glm4_1v|transformers>=4.53|✘|-|[zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking)|
+|[ZhipuAI/Glyph](https://modelscope.cn/models/ZhipuAI/Glyph)|glm4_1v|glm4_1v|transformers>=4.57|✘|-|[zai-org/Glyph](https://huggingface.co/zai-org/Glyph)|
 |[ZhipuAI/GLM-4.5V](https://modelscope.cn/models/ZhipuAI/GLM-4.5V)|glm4_5v|glm4_5v|transformers>=4.56|✔|-|[zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)|
 |[ZhipuAI/GLM-4.5V-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.5V-FP8)|glm4_5v|glm4_5v|transformers>=4.56|✘|-|[zai-org/GLM-4.5V-FP8](https://huggingface.co/zai-org/GLM-4.5V-FP8)|
 |[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|✘|vision|[zai-org/glm-edge-v-2b](https://huggingface.co/zai-org/glm-edge-v-2b)|

docs/source_en/Instruction/Supported-models-and-datasets.md

Lines changed: 1 addition & 0 deletions
@@ -756,6 +756,7 @@ The table below introduces the models integrated with ms-swift:
 |[ZhipuAI/cogagent-9b-20241220](https://modelscope.cn/models/ZhipuAI/cogagent-9b-20241220)|glm4v|glm4v|transformers>=4.42|✘|-|[zai-org/cogagent-9b-20241220](https://huggingface.co/zai-org/cogagent-9b-20241220)|
 |[ZhipuAI/GLM-4.1V-9B-Base](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Base)|glm4_1v|glm4_1v|transformers>=4.53|✘|-|[zai-org/GLM-4.1V-9B-Base](https://huggingface.co/zai-org/GLM-4.1V-9B-Base)|
 |[ZhipuAI/GLM-4.1V-9B-Thinking](https://modelscope.cn/models/ZhipuAI/GLM-4.1V-9B-Thinking)|glm4_1v|glm4_1v|transformers>=4.53|✘|-|[zai-org/GLM-4.1V-9B-Thinking](https://huggingface.co/zai-org/GLM-4.1V-9B-Thinking)|
+|[ZhipuAI/Glyph](https://modelscope.cn/models/ZhipuAI/Glyph)|glm4_1v|glm4_1v|transformers>=4.57|✘|-|[zai-org/Glyph](https://huggingface.co/zai-org/Glyph)|
 |[ZhipuAI/GLM-4.5V](https://modelscope.cn/models/ZhipuAI/GLM-4.5V)|glm4_5v|glm4_5v|transformers>=4.56|✔|-|[zai-org/GLM-4.5V](https://huggingface.co/zai-org/GLM-4.5V)|
 |[ZhipuAI/GLM-4.5V-FP8](https://modelscope.cn/models/ZhipuAI/GLM-4.5V-FP8)|glm4_5v|glm4_5v|transformers>=4.56|✘|-|[zai-org/GLM-4.5V-FP8](https://huggingface.co/zai-org/GLM-4.5V-FP8)|
 |[ZhipuAI/glm-edge-v-2b](https://modelscope.cn/models/ZhipuAI/glm-edge-v-2b)|glm_edge_v|glm_edge_v|transformers>=4.46|✘|vision|[zai-org/glm-edge-v-2b](https://huggingface.co/zai-org/glm-edge-v-2b)|

swift/llm/model/model/glm.py

Lines changed: 6 additions & 0 deletions
@@ -273,6 +273,12 @@ def get_model_tokenizer_glm4_1v(*args, **kwargs):
                 ],
                 requires=['transformers>=4.53'],
             ),
+            ModelGroup(
+                [
+                    Model('ZhipuAI/Glyph', 'zai-org/Glyph'),
+                ],
+                requires=['transformers>=4.57'],
+            ),
         ],
         TemplateType.glm4_1v,
         get_model_tokenizer_glm4_1v,
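
This registration is all that routes `ZhipuAI/Glyph` through the existing glm4_1v loader and template. A minimal inference sketch against the new entry, using ms-swift's Python API; the image URL, prompt, and generation settings here are illustrative assumptions, not part of this commit:

# Assumes ms-swift with this commit and transformers>=4.57 installed.
from swift.llm import InferRequest, PtEngine, RequestConfig

# 'ZhipuAI/Glyph' now resolves via the ModelGroup registered above.
engine = PtEngine('ZhipuAI/Glyph')
request = InferRequest(
    messages=[{'role': 'user', 'content': '<image>Describe the image.'}],
    images=['http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png'],
)
resp = engine.infer([request], RequestConfig(max_tokens=128, temperature=0))
print(resp[0].choices[0].message.content)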

swift/llm/template/template/glm.py

Lines changed: 8 additions & 2 deletions
@@ -117,10 +117,16 @@ def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[in
 class GLM4_1VTemplate(Template):
     begin_of_image_token = 151339
     end_of_image_token = 151340
-    image_token = 151343
     begin_of_video_token = 151341
     end_of_video_token = 151342
-    video_token = 151344
+    placeholder_tokens = ['<|image|>', '<|video|>']
+
+    def init_processor(self, processor) -> None:
+        if processor is None:
+            return
+        super().init_processor(processor)
+        self.image_token = self._tokenize('<|image|>')[0]
+        self.video_token = self._tokenize('<|video|>')[0]
 
     def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                     inputs: StdTemplateInputs) -> List[Context]:
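
The template change removes the hardcoded image/video token IDs (151343 and 151344, which are correct for GLM-4.1V) and instead resolves them from the checkpoint's own tokenizer when the processor is initialized, so Glyph can share the glm4_1v template even if its vocabulary maps the placeholders to different IDs. A standalone sketch of the same lookup with a plain transformers tokenizer, assuming the checkpoint defines both special tokens:

# Resolve placeholder token IDs from the tokenizer instead of hardcoding them.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('zai-org/Glyph')
image_token_id = tokenizer.convert_tokens_to_ids('<|image|>')
video_token_id = tokenizer.convert_tokens_to_ids('<|video|>')
print(image_token_id, video_token_id)  # IDs depend on the checkpoint's vocabulary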

tests/test_align/test_template/test_vision.py

Lines changed: 15 additions & 1 deletion
@@ -710,6 +710,19 @@ def test_glm4_1v():
     assert response == response2
 
 
+def test_glyph():
+    messages = [{'role': 'user', 'content': '<image><image>What is the difference between the two images?'}]
+    images = [
+        'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/cat.png',
+        'http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/animal.png'
+    ]
+    pt_engine = PtEngine('ZhipuAI/Glyph')
+    response = _infer_model(pt_engine, messages=messages, images=images)
+    pt_engine.default_template.template_backend = 'jinja'
+    response2 = _infer_model(pt_engine, messages=messages, images=images)
+    assert response == response2
+
+
 def test_gemma3n():
     pt_engine = PtEngine('google/gemma-3n-E2B-it')
     messages = [{

@@ -1066,6 +1079,7 @@ def test_paddle_ocr():
     # test_kimi_vl()
     # test_kimi_vl_thinking()
     # test_glm4_1v()
+    test_glyph()
     # test_gemma3n()
     # test_keye_vl()
     # test_dots_ocr()

@@ -1081,4 +1095,4 @@ def test_paddle_ocr():
     # test_sailvl2()
     # test_deepseek_ocr()
     # test_llava_onevision1_5()
-    test_paddle_ocr()
+    # test_paddle_ocr()
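
With the commit applied, the new test can be run on its own (it downloads the checkpoint, so it needs network access and a suitable GPU), e.g. via `pytest tests/test_align/test_template/test_vision.py -k test_glyph`, or by executing the file directly, since the commit enables the `test_glyph()` call at the bottom of the module. The assertion checks that ms-swift's glm4_1v template and the checkpoint's jinja chat template produce identical responses.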
