diff --git a/docs/source/openvino/models.mdx b/docs/source/openvino/models.mdx
index 705eca4b7d..4e4d34a5b4 100644
--- a/docs/source/openvino/models.mdx
+++ b/docs/source/openvino/models.mdx
@@ -93,6 +93,7 @@ Here is the list of the supported architectures :
 - Marian
 - MiniCPM
 - MiniCPM3
+- MiniCPM-o
 - MiniCPMV
 - Mistral
 - Mixtral
diff --git a/optimum/exporters/openvino/convert.py b/optimum/exporters/openvino/convert.py
index 81ff4b0aeb..104b3143b6 100644
--- a/optimum/exporters/openvino/convert.py
+++ b/optimum/exporters/openvino/convert.py
@@ -670,7 +670,7 @@ def export_from_model(
     # some model configs may have issues with loading without parameters initialization
     try:
         misplaced_generation_parameters = model.config._get_non_default_generation_parameters()
-    except (KeyError, TypeError):
+    except (AttributeError, KeyError, TypeError):
         misplaced_generation_parameters = {}
     if isinstance(model, GenerationMixin) and len(misplaced_generation_parameters) > 0:
         logger.warning(
diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py
index 634e2c1084..bc4425b3f4 100644
--- a/optimum/exporters/openvino/model_configs.py
+++ b/optimum/exporters/openvino/model_configs.py
@@ -2682,6 +2682,12 @@ def patch_model_for_export(self, model: PreTrainedModel, model_kwargs: Optional[
         return super().patch_model_for_export(model, model_kwargs)
 
 
+@register_in_tasks_manager("minicpmo", *["image-text-to-text"], library_name="transformers")
+class MiniCPMOOpenVINOConfig(MiniCPMVOpenVINOConfig):
+    MIN_TRANSFORMERS_VERSION = "4.43.0"
+    MAX_TRANSFORMERS_VERSION = "4.51.99"
+
+
 class Phi3VisionConfigBehavior(str, enum.Enum):
     LANGUAGE = "language"
     VISION_PROJECTION = "vision_projection"
diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py
index 19e7969084..d99fb0fae8 100644
--- a/optimum/exporters/openvino/utils.py
+++ b/optimum/exporters/openvino/utils.py
@@ -234,6 +234,7 @@ def get_submodels(model):
     "phi4mm",
     "phi4_multimodal",
     "llama4",
+    "minicpmo",
 ]
 
 SSM_MODELS = ["mamba", "falcon_mamba"]
diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py
index e8ec4baaf3..6f991ea457 100644
--- a/optimum/intel/openvino/modeling_visual_language.py
+++ b/optimum/intel/openvino/modeling_visual_language.py
@@ -2114,6 +2114,36 @@ def preprocess_inputs(
         return inputs
 
 
+class _OVMiniCPMOForCausalLM(_OVMiniCPMVForCausalLM):
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        image_sizes=None,
+        attention_mask=None,
+        audio_bounds=None,
+        spk_bounds=None,
+        audio_features=None,
+        audio_feature_lens=None,
+        **kwargs,
+    ):
+        # Audio modality is not supported for MiniCPMO
+        if audio_features is not None and len(audio_features) > 0:
+            raise ValueError("Audio input is not supported for MiniCPMO")
+
+        return super().prepare_inputs_for_generation(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            image_sizes=image_sizes,
+            attention_mask=attention_mask,
+            **kwargs,
+        )
+
+
 class _OVNanoLlavaForCausalLM(OVModelForVisualCausalLM):
     def get_vision_embeddings(self, pixel_values, input_ids=None, **kwargs):
         if input_ids is not None and input_ids.shape[1] == 1:
@@ -4355,4 +4385,5 @@ def preprocess_inputs(
     "phi4mm": _OVPhi4MMForCausalLM,
     "phi4_multimodal": _OVPhi4MMForCausalLM,
     "llama4": _OVLlama4ForCausalLM,
+    "minicpmo": _OVMiniCPMOForCausalLM,
 }
diff --git a/setup.py b/setup.py
index 89342c67ea..84e2d4878d 100644
--- a/setup.py
+++ b/setup.py
@@ -60,6 +60,8 @@
     "langchain-huggingface",
     "hf_xet",
     "num2words",
+    "vocos",
+    "vector_quantize_pytorch",
 ]
 
 QUALITY_REQUIRE = ["black~=23.1", "ruff==0.4.4"]
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index c7467ea70a..3cf5ead2d2 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -733,6 +733,18 @@ class OVCLIExportTestCase(unittest.TestCase):
                 "vision_embeddings_model": {"int8": 16},
             },
         ),
+        (
+            "image-text-to-text",
+            "minicpmo",
+            'int4 --group-size 4 --ratio 0.8 --sensitivity-metric "mean_activation_magnitude" '
+            "--dataset contextual --num-samples 1 --trust-remote-code",
+            {
+                "lm_model": {"int8": 6, "int4": 10},
+                "text_embeddings_model": {"int8": 1},
+                "vision_embeddings_model": {"int8": 8},
+                "resampler_model": {"int8": 6},
+            },
+        ),
     ]
 
     # filter models type depending on min max transformers version
@@ -754,7 +766,7 @@ def test_filtered_architectures(cls):
         elif is_transformers_version("<", "4.52"):
             expected = set()
         else:
-            expected = {"llava-qwen2", "phi3_v", "phi4mm"}
+            expected = {"llava-qwen2", "phi3_v", "phi4mm", "minicpmo"}
 
         all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS}
         filtered_model_type = {config[1] for config in cls.SUPPORTED_4BIT_CONFIGURATIONS}
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index d4bd60ddb3..3f4795cf0a 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -1006,6 +1006,27 @@ class OVWeightCompressionTest(unittest.TestCase):
                 "vision_embeddings_model": {"int8": 16},
             },
         ),
+        (
+            OVModelForVisualCausalLM,
+            "minicpmo",
+            True,
+            dict(
+                bits=4,
+                group_size=4,
+                dataset="contextual",
+                ratio=0.8,
+                sensitivity_metric="mean_activation_magnitude",
+                num_samples=1,
+                processor=MODEL_NAMES["minicpmo"],
+                trust_remote_code=True,
+            ),
+            {
+                "lm_model": {"int8": 6, "int4": 10},
+                "text_embeddings_model": {"int8": 1},
+                "vision_embeddings_model": {"int8": 8},
+                "resampler_model": {"int8": 6},
+            },
+        ),
     ]
 
     # filter models type depending on min max transformers version
@@ -1037,6 +1058,9 @@ class OVWeightCompressionTest(unittest.TestCase):
     if is_transformers_version("<", "4.54.0"):
         SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "llava-qwen2", True))
 
+    if is_transformers_version("<", "4.52.0"):
+        SUPPORTED_ARCHITECTURES_WITH_AUTO_COMPRESSION.append((OVModelForVisualCausalLM, "minicpmo", True))
+
     SUPPORTED_ARCHITECTURES_WITH_HYBRID_QUANTIZATION = [
         (OVStableDiffusionPipeline, "stable-diffusion", 72, 195),
         (OVStableDiffusionXLPipeline, "stable-diffusion-xl", 84, 331),
@@ -1058,7 +1082,7 @@ def test_filtered_architectures(cls):
         elif is_transformers_version("<", "4.52"):
             expected = set()
         else:
-            expected = {"llava-qwen2", "phi3_v"}
+            expected = {"llava-qwen2", "phi3_v", "minicpmo"}
 
         all_model_type = {config[1] for config in cls.TRANSFORMERS_4BIT_CONFIGURATIONS}
         filtered_model_type = {config[1] for config in cls.LOAD_IN_4_BITS_SCOPE}
diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py
index ac3aa7e8bf..0a474bc31c 100644
--- a/tests/openvino/test_seq2seq.py
+++ b/tests/openvino/test_seq2seq.py
@@ -498,13 +498,15 @@ class OVModelForVisualCausalLMIntegrationTest(unittest.TestCase):
         SUPPORTED_ARCHITECTURES += ["gemma3", "smolvlm"]
     if is_transformers_version(">=", "4.51"):
         SUPPORTED_ARCHITECTURES += ["llama4"]
+    if is_transformers_version("<", "4.52"):
+        SUPPORTED_ARCHITECTURES += ["minicpmo"]
     if is_transformers_version(">=", "4.54.0"):
         # remote code models differs after transformers v4.54
         SUPPORTED_ARCHITECTURES = set(SUPPORTED_ARCHITECTURES) - {"llava-qwen2", "phi3_v", "phi4mm"}
 
     TASK = "image-text-to-text"
 
-    REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "llava-qwen2", "phi3_v", "maira2", "phi4mm"]
+    REMOTE_CODE_MODELS = ["internvl_chat", "minicpmv", "minicpmo", "llava-qwen2", "phi3_v", "maira2", "phi4mm"]
 
     IMAGE = Image.open(
         requests.get(
@@ -609,7 +611,7 @@ def test_compare_to_transformers(self, model_arch):
         self._check_device_and_request(ov_model, test_device, False)
 
         # pytorch minicpmv and internvl_chat are not designed to be used via forward
-        if model_arch not in ["minicpmv", "internvl_chat"]:
+        if model_arch not in ["minicpmv", "minicpmo", "internvl_chat"]:
             set_seed(SEED)
             ov_outputs = ov_model(**inputs)
             set_seed(SEED)
@@ -654,12 +656,21 @@ def test_compare_to_transformers(self, model_arch):
             transformers_inputs["past_key_values"] = DynamicCache()
 
         with torch.no_grad():
+            if model_arch in ["minicpmo"]:
+                # `generate` method for minicpmo requires tokenizer
+                tokenizer = AutoTokenizer.from_pretrained(
+                    model_id, trust_remote_code=model_arch in self.REMOTE_CODE_MODELS
+                )
+                additional_inputs["tokenizer"] = tokenizer
             transformers_outputs = transformers_model.generate(
                 **transformers_inputs, generation_config=gen_config, **additional_inputs
             )
+            if model_arch in ["minicpmo"]:
+                # retrieve decoded tokens for comparison
+                transformers_outputs = transformers_outputs[1].sequences
 
         # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
-        if model_arch in ["minicpmv", "internvl_chat"]:
+        if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         self.assertTrue(
             torch.equal(ov_outputs, transformers_outputs),
@@ -685,7 +696,7 @@ def test_compare_to_transformers(self, model_arch):
         transformers_inputs = copy.deepcopy(inputs)
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
-        if model_arch in ["minicpmv", "internvl_chat"]:
+        if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         with torch.no_grad():
             transformers_outputs = transformers_model.generate(
@@ -703,7 +714,7 @@ def test_compare_to_transformers(self, model_arch):
         transformers_inputs = copy.deepcopy(inputs)
         ov_outputs = ov_model.generate(**inputs, generation_config=gen_config)
         # original minicpmv, internvl always skip input tokens in generation results, while transformers based approach provide them
-        if model_arch in ["minicpmv", "internvl_chat"]:
+        if model_arch in ["minicpmv", "minicpmo", "internvl_chat"]:
             ov_outputs = ov_outputs[:, inputs["input_ids"].shape[1] :]
         with torch.no_grad():
             transformers_outputs = transformers_model.generate(
diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py
index ca4275c466..bc9432894c 100644
--- a/tests/openvino/utils_tests.py
+++ b/tests/openvino/utils_tests.py
@@ -116,6 +116,7 @@
     "minicpm": "katuni4ka/tiny-random-minicpm",
     "minicpm3": "katuni4ka/tiny-random-minicpm3",
     "minicpmv": "katuni4ka/tiny-random-minicpmv-2_6",
+    "minicpmo": "rkazants/tiny-random-MiniCPM-o-2_6",
     "mistral": "echarlaix/tiny-random-mistral",
     "mistral-nemo": "katuni4ka/tiny-random-mistral-nemo",
     "mixtral": "TitanML/tiny-mixtral",
@@ -327,6 +328,12 @@
     "clip": {"model": 130},
     "mamba": {"model": 386},
     "falcon-mamba": {"model": 194},
+    "minicpmo": {
+        "lm_model": 16,
+        "text_embeddings_model": 1,
+        "vision_embeddings_model": 8,
+        "resampler_model": 6,
+    },
 }
 
 TEST_IMAGE_URL = "http://images.cocodataset.org/val2017/000000039769.jpg"