Skip to content

Commit fe1c5fc

Browse files
yechank-nvidia and lancelly
authored and committed
chore: set default device to cpu on Multimodal models (NVIDIA#5994)
Signed-off-by: yechank <[email protected]> Signed-off-by: Lanyu Liao <[email protected]>
1 parent fc3ca26 commit fe1c5fc

File tree

5 files changed

+23
-36
lines changed

5 files changed

+23
-36
lines changed

examples/llm-api/quickstart_multimodal.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -138,7 +138,7 @@ def main():
138138
open(os.path.join(llm._hf_model_dir, 'config.json')))['model_type']
139139
assert model_type in ALL_SUPPORTED_MULTIMODAL_MODELS, f"Unsupported model_type: {model_type}"
140140

141-
device = "cuda"
141+
device = "cpu"
142142
inputs = default_multimodal_input_loader(tokenizer=llm.tokenizer,
143143
model_dir=llm._hf_model_dir,
144144
model_type=model_type,

tensorrt_llm/_torch/models/modeling_mistral.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -227,7 +227,6 @@ def __init__(
227227
self.model_config = model_config
228228
self.tokenizer = tokenizer
229229

230-
self._device = "cuda"
231230
self._processor = AutoProcessor.from_pretrained(model_path,
232231
use_fast=False)
233232

@@ -257,7 +256,6 @@ def __call__(
257256
if pixel_values is not None:
258257
# We have no use for the `attention_mask`.
259258
processed.pop("attention_mask")
260-
processed = processed.to(self._device)
261259
# NOTE: `processed` is a dict-like object, but not actually a dict.
262260
extra_processed_inputs = {
263261
"multimodal_data": {

tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,7 @@ def __init__(self,
3434
trust_remote_code: bool = True):
3535
self.model_config = model_config
3636
self.tokenizer = tokenizer
37-
# TODO: change to True and also change the according test result
38-
self.use_fast = False
39-
self.device = 'cuda'
37+
self.use_fast = True
4038
self.processor = AutoProcessor.from_pretrained(
4139
model_path,
4240
use_fast=self.use_fast,
@@ -226,7 +224,7 @@ def _post_init_(self):
226224
self.model_config.num_attention_heads),
227225
theta=float(self.model_config.rope_theta),
228226
scale_type=RotaryScalingType.mrope)
229-
self.rotary_cos_sin = torch.from_numpy(rotary_cos_sin).to(self.device)
227+
self.rotary_cos_sin = torch.from_numpy(rotary_cos_sin)
230228
self.rotary_cos_sin = self.rotary_cos_sin.reshape(
231229
self.model_config.max_position_embeddings,
232230
int(self.model_config.hidden_size /
@@ -344,7 +342,7 @@ def __call__(
344342
inputs.get("multi_modal_data", {}), inputs.get("mm_processor_kwargs", {})
345343

346344
processed_inputs = self._preprocess(text_prompt, mm_data,
347-
mm_processor_kwargs).to(self.device)
345+
mm_processor_kwargs)
348346

349347
if not mm_data:
350348
fused_input_ids = processed_inputs['input_ids']

tensorrt_llm/inputs/utils.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def load_base64_image(parsed_url: str) -> Image.Image:
4545

4646
def load_image(image: str,
4747
format: str = "pt",
48-
device: str = "cuda") -> Union[Image.Image, torch.Tensor]:
48+
device: str = "cpu") -> Union[Image.Image, torch.Tensor]:
4949
assert format in ["pt", "pil"], "format must be either Pytorch or PIL"
5050

5151
parsed_url = urlparse(image)
@@ -67,7 +67,7 @@ def load_image(image: str,
6767
async def async_load_image(
6868
image: str,
6969
format: str = "pt",
70-
device: str = "cuda") -> Union[Image.Image, torch.Tensor]:
70+
device: str = "cpu") -> Union[Image.Image, torch.Tensor]:
7171
assert format in ["pt", "pil"], "format must be either Pytorch or PIL"
7272

7373
parsed_url = urlparse(image)
@@ -92,7 +92,7 @@ def load_video(
9292
video: str,
9393
num_frames: int = 10,
9494
format: str = "pt",
95-
device: str = "cuda") -> Union[List[Image.Image], List[torch.Tensor]]:
95+
device: str = "cpu") -> Union[List[Image.Image], List[torch.Tensor]]:
9696

9797
# Keep this import local to avoid importing cv2 if not needed
9898
import cv2
@@ -141,7 +141,7 @@ async def async_load_video(
141141
video: str,
142142
num_frames: int = 10,
143143
format: str = "pt",
144-
device: str = "cuda") -> Union[List[Image.Image], List[torch.Tensor]]:
144+
device: str = "cpu") -> Union[List[Image.Image], List[torch.Tensor]]:
145145
assert format in ["pt", "pil"], "format must be either Pytorch or PIL"
146146

147147
parsed_url = urlparse(video)
@@ -480,7 +480,7 @@ def default_multimodal_input_loader(
480480
media: Union[List[str], List[List[str]]],
481481
image_data_format: str = "pt",
482482
num_frames: int = 8,
483-
device: str = "cuda") -> List[dict[str, Union[str, torch.Tensor]]]:
483+
device: str = "cpu") -> List[dict[str, Union[str, torch.Tensor]]]:
484484

485485
def convert_to_conversation_message(prompt: str, media: Union[str,
486486
List[str]],

tests/integration/defs/test_e2e.py

Lines changed: 14 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1994,22 +1994,19 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
19941994
},
19951995
"llava-v1.6-mistral-7b": {
19961996
"image": [
1997+
["ocean", "sky", "large", "waves", "shore", "blue"],
19971998
[
1998-
"ocean", "cloud", "waves", "white", "shore", "large",
1999-
"dramatic", "breaking"
1999+
"landscape", "rock", "landmark", "formation", "smooth",
2000+
"mountain"
20002001
],
2001-
["mountain", "butte", "flat", "top", "sky"],
2002-
["highway", "vehicles", "traffic", "divider", "suburban"],
2002+
["highway", "vehicles", "traffic", "bus", "suburban"],
20032003
],
20042004
},
20052005
"qwen2-vl-7b-instruct": {
20062006
"image": [
2007-
["ocean", "waves", "shore", "natural", "clouds", "turbulent"],
2008-
[
2009-
"mountainous", "landscape", "rock", "peak", "weather",
2010-
"steep"
2011-
],
2012-
["traffic", "vehicles", "moderate", "lanes", "road"],
2007+
["ocean", "waves", "atmosphere", "stormy", "clouds", "intense"],
2008+
["trees", "rocks", "road", "sunny", "natural", "greenery"],
2009+
["traffic", "vehicles", "moderate", "lanes", "road", "cars"],
20132010
],
20142011
"video": [
20152012
["city", "night", "lights", "jacket", "wet"],
@@ -2018,33 +2015,27 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
20182015
},
20192016
"qwen2.5-vl-7b-instruct": {
20202017
"image": [
2021-
["dramatic", "moody", "stormy", "turbulent", "wave"],
2022-
[
2023-
"large", "dome", "yosemite", "landmark", "rock", "road",
2024-
"formation"
2025-
],
2026-
["highway", "traffic", "vehicles", "bus", "police"],
2018+
["dramatic", "moody", "ocean", "stormy", "sky", "clouds"],
2019+
["large", "dome", "yosemite", "landmark", "rock", "road"],
2020+
["highway", "traffic", "vehicles", "bus", "police", "traffic"],
20272021
],
20282022
"video": [
20292023
["woman", "neon", "night", "jacket", "wet"],
2030-
["earth", "rotating", "night", "lights", "cities"],
2024+
["earth", "world", "night", "lights", "cities"],
20312025
],
20322026
},
20332027
"mistral-small-3.1-24b-instruct": {
20342028
"image": [
2035-
[
2036-
"dramatic", "seascape", "cloudy", "turbulent", "waves",
2037-
"water"
2038-
],
2039-
["scenic", "rock", "landscape", "snow", "formation"],
2029+
["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
2030+
["scenic", "rock", "landscape", "snow", "altitude"],
20402031
["highway", "traffic", "directions", "lanes", "Jurong"],
20412032
],
20422033
},
20432034
"gemma-3-27b-it": {
20442035
"image": [
20452036
["dramatic", "turbulent", "waves", "ocean", "overcast"],
20462037
["half", "dome", "yosemite", "landmark", "rounded"],
2047-
["flowing", "standstill", "vehicles", "road", "Changi"],
2038+
["flowing", "traffic", "vehicles", "road", "Changi"],
20482039
],
20492040
},
20502041
}

0 commit comments

Comments
 (0)