Skip to content

Commit fe1c5fc

Browse files
yechank-nvidia and lancelly
authored and committed
chore: set default device to cpu on Multimodal models (NVIDIA#5994)
Signed-off-by: yechank <[email protected]> Signed-off-by: Lanyu Liao <[email protected]>
1 parent fc3ca26 commit fe1c5fc

File tree

5 files changed

+23
-36
lines changed

5 files changed

+23
-36
lines changed

examples/llm-api/quickstart_multimodal.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ def main():
138138
open(os.path.join(llm._hf_model_dir, 'config.json')))['model_type']
139139
assert model_type in ALL_SUPPORTED_MULTIMODAL_MODELS, f"Unsupported model_type: {model_type}"
140140

141-
device = "cuda"
141+
device = "cpu"
142142
inputs = default_multimodal_input_loader(tokenizer=llm.tokenizer,
143143
model_dir=llm._hf_model_dir,
144144
model_type=model_type,

tensorrt_llm/_torch/models/modeling_mistral.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,6 @@ def __init__(
227227
self.model_config = model_config
228228
self.tokenizer = tokenizer
229229

230-
self._device = "cuda"
231230
self._processor = AutoProcessor.from_pretrained(model_path,
232231
use_fast=False)
233232

@@ -257,7 +256,6 @@ def __call__(
257256
if pixel_values is not None:
258257
# We have no use for the `attention_mask`.
259258
processed.pop("attention_mask")
260-
processed = processed.to(self._device)
261259
# NOTE: `processed` is a dict-like object, but not actually a dict.
262260
extra_processed_inputs = {
263261
"multimodal_data": {

tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,7 @@ def __init__(self,
3434
trust_remote_code: bool = True):
3535
self.model_config = model_config
3636
self.tokenizer = tokenizer
37-
# TODO: change to True and also change the according test result
38-
self.use_fast = False
39-
self.device = 'cuda'
37+
self.use_fast = True
4038
self.processor = AutoProcessor.from_pretrained(
4139
model_path,
4240
use_fast=self.use_fast,
@@ -226,7 +224,7 @@ def _post_init_(self):
226224
self.model_config.num_attention_heads),
227225
theta=float(self.model_config.rope_theta),
228226
scale_type=RotaryScalingType.mrope)
229-
self.rotary_cos_sin = torch.from_numpy(rotary_cos_sin).to(self.device)
227+
self.rotary_cos_sin = torch.from_numpy(rotary_cos_sin)
230228
self.rotary_cos_sin = self.rotary_cos_sin.reshape(
231229
self.model_config.max_position_embeddings,
232230
int(self.model_config.hidden_size /
@@ -344,7 +342,7 @@ def __call__(
344342
inputs.get("multi_modal_data", {}), inputs.get("mm_processor_kwargs", {})
345343

346344
processed_inputs = self._preprocess(text_prompt, mm_data,
347-
mm_processor_kwargs).to(self.device)
345+
mm_processor_kwargs)
348346

349347
if not mm_data:
350348
fused_input_ids = processed_inputs['input_ids']

tensorrt_llm/inputs/utils.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def load_base64_image(parsed_url: str) -> Image.Image:
4545

4646
def load_image(image: str,
4747
format: str = "pt",
48-
device: str = "cuda") -> Union[Image.Image, torch.Tensor]:
48+
device: str = "cpu") -> Union[Image.Image, torch.Tensor]:
4949
assert format in ["pt", "pil"], "format must be either Pytorch or PIL"
5050

5151
parsed_url = urlparse(image)
@@ -67,7 +67,7 @@ def load_image(image: str,
6767
async def async_load_image(
6868
image: str,
6969
format: str = "pt",
70-
device: str = "cuda") -> Union[Image.Image, torch.Tensor]:
70+
device: str = "cpu") -> Union[Image.Image, torch.Tensor]:
7171
assert format in ["pt", "pil"], "format must be either Pytorch or PIL"
7272

7373
parsed_url = urlparse(image)
@@ -92,7 +92,7 @@ def load_video(
9292
video: str,
9393
num_frames: int = 10,
9494
format: str = "pt",
95-
device: str = "cuda") -> Union[List[Image.Image], List[torch.Tensor]]:
95+
device: str = "cpu") -> Union[List[Image.Image], List[torch.Tensor]]:
9696

9797
# Keep this import local to avoid importing cv2 if not needed
9898
import cv2
@@ -141,7 +141,7 @@ async def async_load_video(
141141
video: str,
142142
num_frames: int = 10,
143143
format: str = "pt",
144-
device: str = "cuda") -> Union[List[Image.Image], List[torch.Tensor]]:
144+
device: str = "cpu") -> Union[List[Image.Image], List[torch.Tensor]]:
145145
assert format in ["pt", "pil"], "format must be either Pytorch or PIL"
146146

147147
parsed_url = urlparse(video)
@@ -480,7 +480,7 @@ def default_multimodal_input_loader(
480480
media: Union[List[str], List[List[str]]],
481481
image_data_format: str = "pt",
482482
num_frames: int = 8,
483-
device: str = "cuda") -> List[dict[str, Union[str, torch.Tensor]]]:
483+
device: str = "cpu") -> List[dict[str, Union[str, torch.Tensor]]]:
484484

485485
def convert_to_conversation_message(prompt: str, media: Union[str,
486486
List[str]],

tests/integration/defs/test_e2e.py

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1994,22 +1994,19 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
19941994
},
19951995
"llava-v1.6-mistral-7b": {
19961996
"image": [
1997+
["ocean", "sky", "large", "waves", "shore", "blue"],
19971998
[
1998-
"ocean", "cloud", "waves", "white", "shore", "large",
1999-
"dramatic", "breaking"
1999+
"landscape", "rock", "landmark", "formation", "smooth",
2000+
"mountain"
20002001
],
2001-
["mountain", "butte", "flat", "top", "sky"],
2002-
["highway", "vehicles", "traffic", "divider", "suburban"],
2002+
["highway", "vehicles", "traffic", "bus", "suburban"],
20032003
],
20042004
},
20052005
"qwen2-vl-7b-instruct": {
20062006
"image": [
2007-
["ocean", "waves", "shore", "natural", "clouds", "turbulent"],
2008-
[
2009-
"mountainous", "landscape", "rock", "peak", "weather",
2010-
"steep"
2011-
],
2012-
["traffic", "vehicles", "moderate", "lanes", "road"],
2007+
["ocean", "waves", "atmosphere", "stormy", "clouds", "intense"],
2008+
["trees", "rocks", "road", "sunny", "natural", "greenery"],
2009+
["traffic", "vehicles", "moderate", "lanes", "road", "cars"],
20132010
],
20142011
"video": [
20152012
["city", "night", "lights", "jacket", "wet"],
@@ -2018,33 +2015,27 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
20182015
},
20192016
"qwen2.5-vl-7b-instruct": {
20202017
"image": [
2021-
["dramatic", "moody", "stormy", "turbulent", "wave"],
2022-
[
2023-
"large", "dome", "yosemite", "landmark", "rock", "road",
2024-
"formation"
2025-
],
2026-
["highway", "traffic", "vehicles", "bus", "police"],
2018+
["dramatic", "moody", "ocean", "stormy", "sky", "clouds"],
2019+
["large", "dome", "yosemite", "landmark", "rock", "road"],
2020+
["highway", "traffic", "vehicles", "bus", "police", "traffic"],
20272021
],
20282022
"video": [
20292023
["woman", "neon", "night", "jacket", "wet"],
2030-
["earth", "rotating", "night", "lights", "cities"],
2024+
["earth", "world", "night", "lights", "cities"],
20312025
],
20322026
},
20332027
"mistral-small-3.1-24b-instruct": {
20342028
"image": [
2035-
[
2036-
"dramatic", "seascape", "cloudy", "turbulent", "waves",
2037-
"water"
2038-
],
2039-
["scenic", "rock", "landscape", "snow", "formation"],
2029+
["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
2030+
["scenic", "rock", "landscape", "snow", "altitude"],
20402031
["highway", "traffic", "directions", "lanes", "Jurong"],
20412032
],
20422033
},
20432034
"gemma-3-27b-it": {
20442035
"image": [
20452036
["dramatic", "turbulent", "waves", "ocean", "overcast"],
20462037
["half", "dome", "yosemite", "landmark", "rounded"],
2047-
["flowing", "standstill", "vehicles", "road", "Changi"],
2038+
["flowing", "traffic", "vehicles", "road", "Changi"],
20482039
],
20492040
},
20502041
}

0 commit comments

Comments
 (0)