Commit 6f245ec

dominicshanshan, StanleySun639, ruodil, yiqingy0, chzblych authored
[None][chore] Mass integration of release/1.0 (#6864)
Signed-off-by: Stanley Sun <[email protected]>
Signed-off-by: Wangshanshan <[email protected]>
Signed-off-by: ruodil <[email protected]>
Signed-off-by: Yiqing Yan <[email protected]>
Signed-off-by: Yanchao Lu <[email protected]>
Signed-off-by: Balaram Buddharaju <[email protected]>
Signed-off-by: Ivy Zhang <[email protected]>
Signed-off-by: Bo Deng <[email protected]>
Signed-off-by: Chang Liu <[email protected]>
Signed-off-by: Stefan Niebler <[email protected]>
Signed-off-by: Yuxian Qiu <[email protected]>
Signed-off-by: Superjomn <[email protected]>
Signed-off-by: qqiao <[email protected]>
Signed-off-by: yechank <[email protected]>
Signed-off-by: William Zhang <[email protected]>
Signed-off-by: raayandhar <[email protected]>
Co-authored-by: Stanley Sun <[email protected]>
Co-authored-by: ruodil <[email protected]>
Co-authored-by: Yiqing Yan <[email protected]>
Co-authored-by: Yanchao Lu <[email protected]>
Co-authored-by: brb-nv <[email protected]>
Co-authored-by: Ivy Zhang <[email protected]>
Co-authored-by: Larry <[email protected]>
Co-authored-by: Bo Deng <[email protected]>
Co-authored-by: Guoming Zhang <[email protected]>
Co-authored-by: Stefan Niebler <[email protected]>
Co-authored-by: Yuxian Qiu <[email protected]>
Co-authored-by: Yan Chunwei <[email protected]>
Co-authored-by: Emma Qiao <[email protected]>
Co-authored-by: Yechan Kim <[email protected]>
Co-authored-by: 2ez4bz <[email protected]>
Co-authored-by: Raayan Dhar <[email protected]>
Co-authored-by: Zhanrui Sun <[email protected]>
1 parent f7c597e commit 6f245ec

File tree

24 files changed: +417 additions, -76 deletions


cpp/tensorrt_llm/kernels/topkLastDim.cu

Lines changed: 2 additions & 2 deletions
@@ -1356,8 +1356,8 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx
             sort_in = static_cast<decltype(sort_in)>(aligned_pointers[9]);
             sort_in_idx = static_cast<decltype(sort_in_idx)>(aligned_pointers[10]);
         }
-        cudaMemsetAsync(
-            buf, 0, static_cast<char*>(aligned_pointers[2]) - static_cast<char*>(aligned_pointers[0]), stream);
+        cudaMemsetAsync(aligned_pointers[0], 0,
+            static_cast<char*>(aligned_pointers[2]) - static_cast<char*>(aligned_pointers[0]), stream);
     }

     T const* in_buf = nullptr;
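
The fix above changes the memset base from the raw workspace pointer buf to aligned_pointers[0] while keeping the same byte count, so exactly the span [aligned_pointers[0], aligned_pointers[2]) gets cleared. A toy Python stand-in for the pointer arithmetic (assuming, purely for illustration, that the raw workspace start is not itself aligned):

ALIGN = 16

def align_up(addr, align=ALIGN):
    return (addr + align - 1) // align * align

buf = 12                      # raw workspace address (illustrative, not 16-byte aligned)
p0 = align_up(buf)            # aligned_pointers[0] -> 16
p2 = p0 + 64                  # aligned_pointers[2] -> 80, end of the region to zero
length = p2 - p0              # 64 bytes that must be cleared

old_cleared = (buf, buf + length)   # memset(buf, ...): clears [12, 76) and misses [76, 80)
new_cleared = (p0, p2)              # memset(aligned_pointers[0], ...): clears exactly [16, 80)
assert old_cleared == (12, 76) and new_cleared == (16, 80)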

examples/llm-api/star_attention.py

Lines changed: 3 additions & 6 deletions
@@ -7,8 +7,8 @@
 import torch

 from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm.llmapi.llm_args import KvCacheConfig
 from tensorrt_llm.mapping import CpType
-from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig


 def dump_jsonl(data, fname):
@@ -54,11 +54,8 @@ def similarity_score(a, b):
     return SequenceMatcher(None, a, b).ratio()


-# Generate the outputs using either TRT or PyTorch (based on the use_pytorch argument). It’s the same function for both workflows.
 def generate_llm_outputs(args, data, fp8=False, fp8_kv_cache=False):
-    quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
-                               kv_cache_quant_algo=QuantAlgo.FP8 if fp8_kv_cache
-                               else None) if fp8 else QuantConfig()
+    kv_cache_config = KvCacheConfig(dtype="fp8" if fp8_kv_cache else "auto")
     cp_config = {
         "cp_type": CpType.STAR,
         "cp_anchor_size": args.sa_anchor_size,
@@ -70,7 +67,7 @@ def generate_llm_outputs(args, data, fp8=False, fp8_kv_cache=False):
         max_input_len=args.max_input_len,
         max_seq_len=args.max_seq_len,
         max_num_tokens=args.max_num_tokens,
-        quant_config=quant_config,
+        kv_cache_config=kv_cache_config,
         tensor_parallel_size=1,
         context_parallel_size=args.num_procs,
         cp_config=cp_config,
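
In other words, the example now selects an FP8 KV cache through KvCacheConfig instead of building a full QuantConfig. A minimal sketch of the new call shape (the model path and sizes below are placeholders, not values from the example):

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi.llm_args import KvCacheConfig

# "fp8" quantizes only the KV cache; "auto" keeps the checkpoint's dtype.
kv_cache_config = KvCacheConfig(dtype="fp8")

llm = LLM(
    model="/path/to/model",          # placeholder checkpoint path
    kv_cache_config=kv_cache_config,
    tensor_parallel_size=1,
)
outputs = llm.generate(["Hello, world"], SamplingParams(max_tokens=16))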

tensorrt_llm/_torch/models/modeling_gemma3vl.py

Lines changed: 6 additions & 1 deletion
@@ -194,11 +194,16 @@ def get_sub_model_config(
         "text_config", "vision_config"
     ], f"Expected subconfig name to be either 'text_config' or 'vision_config'. Got {name} instead."
     pretrained_config = getattr(model_config.pretrained_config, name)
+    # ModelOpt currently doesn't quantize the vision part. Without setting quant config to None,
+    # weight loading fails for vision.
+    quant_config = model_config.quant_config if name == "text_config" else None
+    # FlashInfer backend supports custom mask which is needed for bidirectional mask in decoder.
     preferred_backend = "FLASHINFER" if name == "text_config" else "TRTLLM"
     sub_model_config: ModelConfig[Gemma3Config] = dataclasses.replace(
         model_config,
         pretrained_config=pretrained_config,
-        attn_backend=preferred_backend)
+        attn_backend=preferred_backend,
+        quant_config=quant_config)
     # Make sure some fields that are not explicitly included in the sub config, but present
     # in the top-level config, are replicated.
     if (hasattr(sub_model_config.pretrained_config, "torch_dtype")
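
The dataclasses.replace call above clones the shared model config and overrides only the per-submodule fields, which is how the vision branch ends up with quant_config=None. A self-contained sketch of the same pattern with stand-in classes (not the real TensorRT-LLM types):

import dataclasses
from typing import Any, Optional

@dataclasses.dataclass
class DemoModelConfig:               # stand-in for ModelConfig; field names mirror the diff
    pretrained_config: Any
    quant_config: Optional[str] = "FP8"
    attn_backend: str = "TRTLLM"

top = DemoModelConfig(pretrained_config={"text_config": "txt", "vision_config": "vis"})

# Text keeps the quant config and prefers FlashInfer (custom-mask support);
# vision drops quantization so its unquantized weights still load.
text_cfg = dataclasses.replace(top, pretrained_config=top.pretrained_config["text_config"],
                               attn_backend="FLASHINFER", quant_config=top.quant_config)
vision_cfg = dataclasses.replace(top, pretrained_config=top.pretrained_config["vision_config"],
                                 attn_backend="TRTLLM", quant_config=None)
assert vision_cfg.quant_config is None and text_cfg.attn_backend == "FLASHINFER"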

tensorrt_llm/executor/proxy.py

Lines changed: 2 additions & 1 deletion
@@ -317,14 +317,15 @@ def mpi_done_callback(future: concurrent.futures.Future):

         while True:
             if self.worker_init_status_queue.poll(1):
-                ready_signal = self.worker_init_status_queue.get()
+                ready_signal, error_trace = self.worker_init_status_queue.get()
                 break
             if any(fut.done() for fut in self.mpi_futures):
                 logger.error("Executor worker died during initialization.")
                 raise RuntimeError("Executor worker died during initialization")
             self._handle_background_error()

         if ready_signal != GenerationExecutorProxy.READY_SIGNAL:
+            logger.error(f"Executor worker initialization error: {error_trace}")
             self.mpi_session.shutdown_abort(reason=ready_signal)
             raise RuntimeError(
                 "Executor worker returned error") from ready_signal

tensorrt_llm/executor/worker.py

Lines changed: 2 additions & 2 deletions
@@ -774,7 +774,7 @@ def notify_proxy_threads_to_quit():
774774
logger.error(traceback.format_exc())
775775
print_colored_debug(f"error: {traceback.format_exc()}", "red")
776776
if is_leader:
777-
worker_init_status_queue.put(e)
777+
worker_init_status_queue.put((e, traceback.format_exc()))
778778
return
779779

780780
with worker:
@@ -792,7 +792,7 @@ def notify_proxy_threads_to_quit():
                                            mp_stats_queue)
         worker._set_iteration_result_queue(worker.kv_events_queues,
                                            kv_cache_events_queue)
-        worker_init_status_queue.put(ready_signal)
+        worker_init_status_queue.put((ready_signal, None))
         while (req := request_queue.get()) is not None:
             if isinstance(req, CancellingRequest):
                 worker.abort_request(req.id)
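
Together with the proxy.py change above, the init-status handshake now always carries a (signal, traceback) pair, so the consumer can unpack it unconditionally. A minimal sketch of the protocol using a plain in-process queue and placeholder names (the real code uses an IPC queue between the proxy and the MPI worker):

import queue
import traceback

READY_SIGNAL = "READY"                    # stand-in for GenerationExecutorProxy.READY_SIGNAL
worker_init_status_queue = queue.Queue()  # stand-in for the real IPC queue

def worker_init(fail=False):
    try:
        if fail:
            raise RuntimeError("engine init failed")
        worker_init_status_queue.put((READY_SIGNAL, None))          # success: no traceback
    except Exception as e:
        worker_init_status_queue.put((e, traceback.format_exc()))   # failure: error + trace

def proxy_wait():
    ready_signal, error_trace = worker_init_status_queue.get()
    if ready_signal != READY_SIGNAL:
        print(f"Executor worker initialization error: {error_trace}")
        raise RuntimeError("Executor worker returned error") from ready_signal

worker_init()
proxy_wait()   # returns normally; with worker_init(fail=True) it would re-raise with the trace logged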

tensorrt_llm/llmapi/llm.py

Lines changed: 10 additions & 1 deletion
@@ -124,15 +124,21 @@ def __init__(self,
         self._executor_cls = kwargs.pop("executor_cls", GenerationExecutor)
         self._llm_id = None

+        log_level = logger.level
+        logger.set_level("info") # force display the backend
+
         try:
             backend = kwargs.get('backend', None)
-            if backend == 'pytorch':
+            if backend == "pytorch":
+                logger.info("Using LLM with PyTorch backend")
                 llm_args_cls = TorchLlmArgs
             elif backend == '_autodeploy':
+                logger.info("Using LLM with AutoDeploy backend")
                 from .._torch.auto_deploy.llm_args import \
                     LlmArgs as AutoDeployLlmArgs
                 llm_args_cls = AutoDeployLlmArgs
             else:
+                logger.info("Using LLM with TensorRT backend")
                 llm_args_cls = TrtLlmArgs

             # check the kwargs and raise ValueError directly
@@ -162,6 +168,9 @@ def __init__(self,
                 f"Failed to parse the arguments for the LLM constructor: {e}")
             raise e

+        finally:
+            logger.set_level(log_level) # restore the log level
+
         print_colored_debug(f"LLM.args.mpi_session: {self.args.mpi_session}\n",
                             "yellow")
         self.mpi_session = self.args.mpi_session
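
The constructor change is the usual save/restore idiom around a try/finally: raise verbosity just long enough to announce which backend was picked, then restore the caller's level even if argument parsing fails. A standalone sketch using the stdlib logging module (the real code calls tensorrt_llm's logger.level / logger.set_level as shown above):

import logging

logging.basicConfig()
logger = logging.getLogger("llm_init_demo")   # illustrative logger name

def announce_backend(backend=None):
    saved_level = logger.level
    logger.setLevel(logging.INFO)             # force the backend message to be visible
    try:
        if backend == "pytorch":
            logger.info("Using LLM with PyTorch backend")
        elif backend == "_autodeploy":
            logger.info("Using LLM with AutoDeploy backend")
        else:
            logger.info("Using LLM with TensorRT backend")
        # ... argument parsing that may raise would go here ...
    finally:
        logger.setLevel(saved_level)          # restore, even on error

announce_backend("pytorch")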

tensorrt_llm/tools/multimodal_builder.py

Lines changed: 12 additions & 2 deletions
@@ -1190,8 +1190,18 @@ def forward(self, pixel_values, aspect_ratio_ids, aspect_ratio_mask):
     model = MllamaForConditionalGeneration.from_pretrained(args.model_path,
                                                             torch_dtype='auto',
                                                             device_map='auto')
-    wrapper = MLLaMAVisionWrapper(model.vision_model,
-                                  model.multi_modal_projector)
+
+    # Check if the model structure is updated to transformers >= 4.52.0
+    if hasattr(model, 'model') and hasattr(model.model, 'vision_model'):
+        vision_model = model.model.vision_model
+        multi_modal_projector = model.model.multi_modal_projector
+    else:
+        # transformers < 4.52.0
+        vision_model = model.vision_model
+        multi_modal_projector = model.multi_modal_projector
+
+    wrapper = MLLaMAVisionWrapper(vision_model, multi_modal_projector)
+
     model_dtype = model.dtype
     image = Image.new('RGB', [2048, 2688]) # dummy image
     inputs = processor(images=image,
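
The branch above can be factored into a tiny helper and exercised without loading real weights; a sketch assuming only that transformers >= 4.52.0 nests the vision tower under model.model while older releases expose it on the top-level wrapper:

from types import SimpleNamespace

def get_vision_parts(model):
    """Return (vision_model, multi_modal_projector) across transformers versions."""
    if hasattr(model, 'model') and hasattr(model.model, 'vision_model'):
        # transformers >= 4.52.0: submodules live under the inner `model`.
        return model.model.vision_model, model.model.multi_modal_projector
    # transformers < 4.52.0: submodules live on the wrapper itself.
    return model.vision_model, model.multi_modal_projector

old_style = SimpleNamespace(vision_model="vit", multi_modal_projector="proj")
new_style = SimpleNamespace(model=SimpleNamespace(vision_model="vit", multi_modal_projector="proj"))
assert get_vision_parts(old_style) == get_vision_parts(new_style) == ("vit", "proj")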

tests/integration/defs/accuracy/references/cnn_dailymail.yaml

Lines changed: 3 additions & 0 deletions
@@ -5,6 +5,9 @@ google/gemma-3-1b-it:
     accuracy: 20.699
 google/gemma-3-27b-it:
   - accuracy: 28.90
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 27.90
 gpt2:
   - accuracy: 18.408
   - quant_algo: W8A16

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 6 additions & 0 deletions
@@ -150,8 +150,14 @@ speakleash/Bielik-11B-v2.2-Instruct:
     accuracy: 40.41
 google/gemma-3-1b-it:
   - accuracy: 25.52 # score getting from lm-eval with HF implementation
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 23.96
 google/gemma-3-27b-it:
   - accuracy: 91.66
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 90.66
 mistralai/Ministral-8B-Instruct-2410:
   - accuracy: 79.25
   - quant_algo: FP8

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 3 additions & 0 deletions
@@ -119,6 +119,9 @@ google/gemma-3-1b-it:
     accuracy: 37.5
 google/gemma-3-27b-it:
   - accuracy: 77.80
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 76.80
 Qwen/Qwen2-0.5B-Instruct:
   - accuracy: 45.30
   - quant_algo: FP8
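
All three reference files follow the same shape: each model name maps to a list of entries, where an entry without quant_algo is the unquantized baseline and an entry with quant_algo / kv_cache_quant_algo records the expected score for that quantization. A hypothetical lookup helper over that structure (the actual accuracy harness in tests/integration/defs/accuracy may do this differently):

import yaml  # PyYAML

def find_reference(references, model, quant_algo=None, kv_cache_quant_algo=None):
    """Pick the reference entry matching the requested quantization settings."""
    for entry in references.get(model, []):
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry
    return None

refs = yaml.safe_load("""
google/gemma-3-27b-it:
  - accuracy: 77.80
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 76.80
""")
assert find_reference(refs, "google/gemma-3-27b-it")["accuracy"] == 77.80
assert find_reference(refs, "google/gemma-3-27b-it", "FP8", "FP8")["accuracy"] == 76.80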
