NVIDIA
diff --git a/cpp/tensorrt_llm/kernels/topkLastDim.cu b/cpp/tensorrt_llm/kernels/topkLastDim.cu
Lines changed: 2 additions & 2 deletions
diff --git a/examples/llm-api/star_attention.py b/examples/llm-api/star_attention.py
Lines changed: 3 additions & 6 deletions
diff --git a/tensorrt_llm/_torch/models/modeling_gemma3vl.py b/tensorrt_llm/_torch/models/modeling_gemma3vl.py
Lines changed: 6 additions & 1 deletion
diff --git a/tensorrt_llm/executor/proxy.py b/tensorrt_llm/executor/proxy.py
Lines changed: 2 additions & 1 deletion
diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
Lines changed: 2 additions & 2 deletions
diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
Lines changed: 10 additions & 1 deletion
diff --git a/tensorrt_llm/tools/multimodal_builder.py b/tensorrt_llm/tools/multimodal_builder.py
Lines changed: 12 additions & 2 deletions
diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
Lines changed: 3 additions & 0 deletions
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
Lines changed: 6 additions & 0 deletions
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
Lines changed: 3 additions & 0 deletions
@@ -1356,8 +1356,8 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx
             sort_in = static_cast<decltype(sort_in)>(aligned_pointers[9]);
             sort_in_idx = static_cast<decltype(sort_in_idx)>(aligned_pointers[10]);
         }
-        cudaMemsetAsync(
-            buf, 0, static_cast<char*>(aligned_pointers[2]) - static_cast<char*>(aligned_pointers[0]), stream);
+        cudaMemsetAsync(aligned_pointers[0], 0,
+            static_cast<char*>(aligned_pointers[2]) - static_cast<char*>(aligned_pointers[0]), stream);
     }
 
     T const* in_buf = nullptr;
 
@@ -7,8 +7,8 @@
 import torch
 
 from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm.llmapi.llm_args import KvCacheConfig
 from tensorrt_llm.mapping import CpType
-from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig
 
 
 def dump_jsonl(data, fname):
@@ -54,11 +54,8 @@ def similarity_score(a, b):
     return SequenceMatcher(None, a, b).ratio()
 
 
-# Generate the outputs using either TRT or PyTorch (based on the use_pytorch argument). It’s the same function for both workflows.
 def generate_llm_outputs(args, data, fp8=False, fp8_kv_cache=False):
-    quant_config = QuantConfig(quant_algo=QuantAlgo.FP8,
-                               kv_cache_quant_algo=QuantAlgo.FP8 if fp8_kv_cache
-                               else None) if fp8 else QuantConfig()
+    kv_cache_config = KvCacheConfig(dtype="fp8" if fp8_kv_cache else "auto")
     cp_config = {
         "cp_type": CpType.STAR,
         "cp_anchor_size": args.sa_anchor_size,
@@ -70,7 +67,7 @@ def generate_llm_outputs(args, data, fp8=False, fp8_kv_cache=False):
               max_input_len=args.max_input_len,
               max_seq_len=args.max_seq_len,
               max_num_tokens=args.max_num_tokens,
-              quant_config=quant_config,
+              kv_cache_config=kv_cache_config,
               tensor_parallel_size=1,
               context_parallel_size=args.num_procs,
               cp_config=cp_config,
 
@@ -194,11 +194,16 @@ def get_sub_model_config(
             "text_config", "vision_config"
         ], f"Expected subconfig name to be either 'text_config' or 'vision_config'. Got {name} instead."
         pretrained_config = getattr(model_config.pretrained_config, name)
+        # ModelOpt currently doesn't quantize the vision part. Without setting quant config to None,
+        # weight loading fails for vision.
+        quant_config = model_config.quant_config if name == "text_config" else None
+        # FlashInfer backend supports custom mask which is needed for bidirectional mask in decoder.
         preferred_backend = "FLASHINFER" if name == "text_config" else "TRTLLM"
         sub_model_config: ModelConfig[Gemma3Config] = dataclasses.replace(
             model_config,
             pretrained_config=pretrained_config,
-            attn_backend=preferred_backend)
+            attn_backend=preferred_backend,
+            quant_config=quant_config)
         # Make sure some fields that are not explicitly included in the sub config, but present
         # in the top-level config, are replicated.
         if (hasattr(sub_model_config.pretrained_config, "torch_dtype")
 
@@ -317,14 +317,15 @@ def mpi_done_callback(future: concurrent.futures.Future):
 
         while True:
             if self.worker_init_status_queue.poll(1):
-                ready_signal = self.worker_init_status_queue.get()
+                ready_signal, error_trace = self.worker_init_status_queue.get()
                 break
             if any(fut.done() for fut in self.mpi_futures):
                 logger.error("Executor worker died during initialization.")
                 raise RuntimeError("Executor worker died during initialization")
             self._handle_background_error()
 
         if ready_signal != GenerationExecutorProxy.READY_SIGNAL:
+            logger.error(f"Executor worker initialization error: {error_trace}")
             self.mpi_session.shutdown_abort(reason=ready_signal)
             raise RuntimeError(
                 "Executor worker returned error") from ready_signal
 
@@ -774,7 +774,7 @@ def notify_proxy_threads_to_quit():
         logger.error(traceback.format_exc())
         print_colored_debug(f"error: {traceback.format_exc()}", "red")
         if is_leader:
-            worker_init_status_queue.put(e)
+            worker_init_status_queue.put((e, traceback.format_exc()))
         return
 
     with worker:
@@ -792,7 +792,7 @@ def notify_proxy_threads_to_quit():
                                                    mp_stats_queue)
                 worker._set_iteration_result_queue(worker.kv_events_queues,
                                                    kv_cache_events_queue)
-                worker_init_status_queue.put(ready_signal)
+                worker_init_status_queue.put((ready_signal, None))
                 while (req := request_queue.get()) is not None:
                     if isinstance(req, CancellingRequest):
                         worker.abort_request(req.id)
 
@@ -124,15 +124,21 @@ def __init__(self,
         self._executor_cls = kwargs.pop("executor_cls", GenerationExecutor)
         self._llm_id = None
 
+        log_level = logger.level
+        logger.set_level("info")  # force display the backend
+
         try:
             backend = kwargs.get('backend', None)
-            if backend == 'pytorch':
+            if backend == "pytorch":
+                logger.info("Using LLM with PyTorch backend")
                 llm_args_cls = TorchLlmArgs
             elif backend == '_autodeploy':
+                logger.info("Using LLM with AutoDeploy backend")
                 from .._torch.auto_deploy.llm_args import \
                     LlmArgs as AutoDeployLlmArgs
                 llm_args_cls = AutoDeployLlmArgs
             else:
+                logger.info("Using LLM with TensorRT backend")
                 llm_args_cls = TrtLlmArgs
 
             # check the kwargs and raise ValueError directly
@@ -162,6 +168,9 @@ def __init__(self,
                 f"Failed to parse the arguments for the LLM constructor: {e}")
             raise e
 
+        finally:
+            logger.set_level(log_level)  # restore the log level
+
         print_colored_debug(f"LLM.args.mpi_session: {self.args.mpi_session}\n",
                             "yellow")
         self.mpi_session = self.args.mpi_session
 
@@ -1190,8 +1190,18 @@ def forward(self, pixel_values, aspect_ratio_ids, aspect_ratio_mask):
     model = MllamaForConditionalGeneration.from_pretrained(args.model_path,
                                                            torch_dtype='auto',
                                                            device_map='auto')
-    wrapper = MLLaMAVisionWrapper(model.vision_model,
-                                  model.multi_modal_projector)
+
+    # Check if the model structure is updated to transformers >= 4.52.0
+    if hasattr(model, 'model') and hasattr(model.model, 'vision_model'):
+        vision_model = model.model.vision_model
+        multi_modal_projector = model.model.multi_modal_projector
+    else:
+        # transformers < 4.52.0
+        vision_model = model.vision_model
+        multi_modal_projector = model.multi_modal_projector
+
+    wrapper = MLLaMAVisionWrapper(vision_model, multi_modal_projector)
+
     model_dtype = model.dtype
     image = Image.new('RGB', [2048, 2688])  # dummy image
     inputs = processor(images=image,
 
@@ -5,6 +5,9 @@ google/gemma-3-1b-it:
     accuracy: 20.699
 google/gemma-3-27b-it:
   - accuracy: 28.90
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 27.90
 gpt2:
   - accuracy: 18.408
   - quant_algo: W8A16
 
@@ -150,8 +150,14 @@ speakleash/Bielik-11B-v2.2-Instruct:
     accuracy: 40.41
 google/gemma-3-1b-it:
   - accuracy: 25.52 # score getting from lm-eval with HF implementation
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 23.96
 google/gemma-3-27b-it:
   - accuracy: 91.66
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 90.66
 mistralai/Ministral-8B-Instruct-2410:
   - accuracy: 79.25
   - quant_algo: FP8
 
@@ -119,6 +119,9 @@ google/gemma-3-1b-it:
     accuracy: 37.5
 google/gemma-3-27b-it:
   - accuracy: 77.80
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 76.80
 Qwen/Qwen2-0.5B-Instruct:
   - accuracy: 45.30
   - quant_algo: FP8
Original file line number	Diff line number	Diff line change
`@@ -1356,8 +1356,8 @@ void standalone_stable_radix_topk_(void* buf, size_t& buf_size, T const* in, Idx`
`1356`	`1356`	`sort_in = static_cast<decltype(sort_in)>(aligned_pointers[9]);`
`1357`	`1357`	`sort_in_idx = static_cast<decltype(sort_in_idx)>(aligned_pointers[10]);`
`1358`	`1358`	`}`
`1359`		`- cudaMemsetAsync(`
`1360`		`- buf, 0, static_cast<char*>(aligned_pointers[2]) - static_cast<char*>(aligned_pointers[0]), stream);`
	`1359`	`+ cudaMemsetAsync(aligned_pointers[0], 0,`
	`1360`	`+ static_cast<char*>(aligned_pointers[2]) - static_cast<char*>(aligned_pointers[0]), stream);`
`1361`	`1361`	`}`
`1362`	`1362`
`1363`	`1363`	`T const* in_buf = nullptr;`