Commit d16af87

[TRTLLM-7158][feat] Introduce sampler options in trtllm bench (#6855)
Signed-off-by: Daniel Campora <[email protected]>
1 parent d1d17db commit d16af87
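
This commit adds a --sampler_options flag to both the latency and throughput subcommands of trtllm-bench. The flag takes a path to a YAML file; its keys are merged over the CLI-derived sampler arguments and the result is passed to SamplingParams. A minimal sketch of the intended use, where the flag name and subcommands come from this diff, the YAML keys (temperature, top_p, top_k) are assumed to be valid SamplingParams fields, and the model and dataset arguments are placeholders:

    # sampler_options.yaml (illustrative; keys must be valid SamplingParams arguments)
    temperature: 0.8
    top_p: 0.95
    top_k: 40

    # Hypothetical invocations (exact CLI shape may differ):
    trtllm-bench --model <model> latency --dataset <dataset> --sampler_options sampler_options.yaml
    trtllm-bench --model <model> throughput --dataset <dataset> --sampler_options sampler_options.yaml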

File tree

3 files changed: +73 -12 lines changed

tensorrt_llm/bench/benchmark/low_latency.py
tensorrt_llm/bench/benchmark/throughput.py
tensorrt_llm/bench/benchmark/utils/general.py

tensorrt_llm/bench/benchmark/low_latency.py

Lines changed: 18 additions & 7 deletions
@@ -25,7 +25,7 @@
 from tensorrt_llm.models.modeling_utils import SpeculativeDecodingMode
 
 # isort: off
-from tensorrt_llm.bench.benchmark.utils.general import get_settings_from_engine, get_settings, ALL_SUPPORTED_BACKENDS
+from tensorrt_llm.bench.benchmark.utils.general import get_settings_from_engine, get_settings, update_sampler_args_with_extra_options, ALL_SUPPORTED_BACKENDS
 # isort: on
 from tensorrt_llm.bench.utils.data import (create_dataset_from_stream,
                                            initialize_tokenizer,
@@ -135,6 +135,13 @@
     default=1,
     help="Number of search beams.",
 )
+@optgroup.option("--sampler_options",
+                 type=click.Path(exists=True,
+                                 readable=True,
+                                 path_type=Path,
+                                 resolve_path=True),
+                 default=None,
+                 help="Path to a YAML file that sets sampler options.")
 @optgroup.option(
     "--concurrency",
     type=int,
@@ -326,12 +333,16 @@ def latency_command(
     eos_id = tokenizer.eos_token_id if not ignore_eos else -1
     pad_id = tokenizer.pad_token_id if not ignore_eos else -1
 
-    sampling_params = SamplingParams(
-        end_id=eos_id,
-        pad_id=pad_id,
-        n=beam_width,
-        use_beam_search=beam_width > 1,
-    )
+    sampler_args = {
+        "end_id": eos_id,
+        "pad_id": pad_id,
+        "n": beam_width,
+        "use_beam_search": beam_width > 1
+    }
+    sampler_args = update_sampler_args_with_extra_options(
+        sampler_args, params.pop("sampler_options"))
+    sampling_params = SamplingParams(**sampler_args)
+
     post_proc_params = None  # No detokenization
 
     # Perform warmup if requested.
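
Merge order in the block above matters: the dictionary built from CLI values is the left operand of the union inside update_sampler_args_with_extra_options, so a key present in both places takes its value from the YAML file. A minimal pure-Python sketch of that behaviour (values illustrative, not taken from the diff):

    # dict union: the right-hand operand wins on key conflicts (Python 3.9+)
    sampler_args = {"end_id": 2, "pad_id": 0, "n": 1, "use_beam_search": False}
    yaml_options = {"temperature": 0.8, "end_id": -1}  # as if parsed from --sampler_options

    merged = sampler_args | yaml_options
    assert merged["end_id"] == -1        # YAML value overrides the CLI-derived one
    assert merged["temperature"] == 0.8  # new keys are simply added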

tensorrt_llm/bench/benchmark/throughput.py

Lines changed: 19 additions & 5 deletions
@@ -22,7 +22,8 @@
 from tensorrt_llm import LLM as PyTorchLLM
 from tensorrt_llm._tensorrt_engine import LLM
 from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
-from tensorrt_llm.bench.benchmark.utils.general import generate_warmup_dataset
+from tensorrt_llm.bench.benchmark.utils.general import (
+    generate_warmup_dataset, update_sampler_args_with_extra_options)
 from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
 from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
 from tensorrt_llm.bench.dataclasses.reporting import ReportUtility
@@ -67,6 +68,13 @@
     help=
     "Path to a YAML file that overwrites the parameters specified by trtllm-bench."
 )
+@optgroup.option("--sampler_options",
+                 type=click.Path(exists=True,
+                                 readable=True,
+                                 path_type=Path,
+                                 resolve_path=True),
+                 default=None,
+                 help="Path to a YAML file that sets sampler options.")
 @optgroup.option(
     "--max_batch_size",
     type=int,
@@ -455,10 +463,16 @@ def ignore_trt_only_args(kwargs: dict):
     else:
         llm = LLM(**kwargs)
 
-    sampling_params = SamplingParams(end_id=eos_id,
-                                     pad_id=eos_id,
-                                     n=beam_width,
-                                     use_beam_search=beam_width > 1)
+    sampler_args = {
+        "end_id": eos_id,
+        "pad_id": eos_id,
+        "n": beam_width,
+        "use_beam_search": beam_width > 1
+    }
+    sampler_args = update_sampler_args_with_extra_options(
+        sampler_args, params.pop("sampler_options"))
+    sampling_params = SamplingParams(**sampler_args)
+
     post_proc_params = None  # No detokenization
 
     # Perform warmup if requested.
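
Two details worth noting here: throughput.py seeds pad_id with eos_id (low_latency.py uses the tokenizer's pad token), and values arrive from the YAML file already typed, which matters for fields such as use_beam_search that expect real booleans. A short sketch of how yaml.safe_load maps the scalars (keys are illustrative):

    import yaml  # PyYAML, the same parser the new helper uses

    text = """
    temperature: 0.9
    top_k: 50
    use_beam_search: false
    """
    print(yaml.safe_load(text))
    # -> {'temperature': 0.9, 'top_k': 50, 'use_beam_search': False}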

tensorrt_llm/bench/benchmark/utils/general.py

Lines changed: 36 additions & 0 deletions
@@ -199,3 +199,39 @@ def generate_warmup_dataset(requests, steps) -> List[InferenceRequest]:
     warm_up_dataset = choices(requests, k=steps)
     shuffle(warm_up_dataset)
     return warm_up_dataset
+
+
+def update_sampler_args_with_extra_options(sampler_args: Dict,
+                                           sampler_options: str) -> Dict:
+    """Update sampler arguments with options from a YAML file.
+
+    Args:
+        sampler_args: Base sampler arguments dictionary.
+        sampler_options: Path to YAML file containing additional options.
+
+    Returns:
+        Dict: Merged sampler arguments.
+
+    Raises:
+        FileNotFoundError: If the YAML file doesn't exist.
+        yaml.YAMLError: If the YAML file is malformed.
+        TypeError: If the YAML content is not a dictionary.
+    """
+    if sampler_options is not None:
+        try:
+            with open(sampler_options, 'r') as f:
+                sampler_options_dict = yaml.safe_load(f)
+        except FileNotFoundError:
+            raise FileNotFoundError(
+                f"Sampler options file not found: {sampler_options}")
+        except yaml.YAMLError as e:
+            raise yaml.YAMLError(
+                f"Invalid YAML in sampler options file {sampler_options}: {e}")
+
+        if not isinstance(sampler_options_dict, dict):
+            raise TypeError(
+                f"Sampler options file {sampler_options} must contain a dictionary, "
+                f"got {type(sampler_options_dict)}")
+
+        sampler_args = sampler_args | sampler_options_dict
+    return sampler_args
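
Because of the dict union operator, this helper requires Python 3.9 or newer; when sampler_options is None it returns the base arguments unchanged. A quick exercise of the helper, assuming tensorrt_llm is importable (otherwise the behaviour reduces to yaml.safe_load plus the union shown earlier):

    import tempfile

    from tensorrt_llm.bench.benchmark.utils.general import (
        update_sampler_args_with_extra_options)

    # Write a small options file and merge it over a base dictionary.
    with tempfile.NamedTemporaryFile("w", suffix=".yaml", delete=False) as f:
        f.write("temperature: 0.8\nend_id: -1\n")
        path = f.name

    merged = update_sampler_args_with_extra_options({"end_id": 2, "n": 1}, path)
    assert merged == {"end_id": -1, "n": 1, "temperature": 0.8}

    # A file whose top level is not a mapping (e.g. "- 1") raises TypeError;
    # passing None for the path is a no-op.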
