Commit f68e03e

[https://nvbugs/5452167][fix] Fix ngram padding issue (NVIDIA#6837)
Signed-off-by: Mike Iovine <[email protected]>
Parent commit: 12102e2

File tree

2 files changed: 8 additions & 7 deletions
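In short: `get_draft_tokens` now takes a `padding_id` parameter in place of `end_id`, and both its fallback return value and the padding applied in `prepare_draft_tokens` use 0 rather than the request's `py_end_id`. A plausible motivation, inferred from the diff rather than stated in the commit message, is that padding draft tokens with the end-of-sequence id risks them being treated as real EOS tokens; the test is updated with `ignore_eos=True` accordingly.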

tensorrt_llm/_torch/speculative/ngram.py

Lines changed: 7 additions & 6 deletions
@@ -87,13 +87,13 @@ def get_draft_tokens(
         self,
         prefix: list[int],
         request_id: int,
-        end_id: int,
+        padding_id: int,
         max_sequence_length: int,
     ):
         prefix_len = len(prefix)
         max_draft_token_length_this_step = max_sequence_length - 1 - prefix_len
         if max_draft_token_length_this_step <= 0:  # No draft token is needed if the prefix is long enough
-            return [end_id]
+            return [padding_id]
         if request_id not in self.start_index:  # Extend start_index and pool for a new request
             self.start_index[request_id] = 0
             if not self.is_public_pool:
@@ -126,7 +126,7 @@ def get_draft_tokens(
                 pool[pattern].add(new_match)
 
         # Find match
-        draft_tokens = [end_id]  # fallback value
+        draft_tokens = [padding_id]  # fallback value
         for size in range(min(self.max_matching_ngram_size, prefix_len - 1), 0,
                           -1):
             pattern = tuple(prefix[-size:])
@@ -194,11 +194,12 @@ def prepare_draft_tokens(
             draft_tokens = self.spec_resource_manager.get_draft_tokens(
                 prefix,
                 request.request_id,
-                request.py_end_id,
-                request.py_orig_prompt_len + request.py_max_new_tokens,
+                padding_id=0,
+                max_sequence_length=request.py_orig_prompt_len +
+                request.py_max_new_tokens,
             )
             # Pad length to `self.max_draft_len`
             if len(draft_tokens) > 0:
                 pad_length = self.max_draft_len - len(draft_tokens)
-                draft_tokens.extend([request.py_end_id] * pad_length)
+                draft_tokens.extend([0] * pad_length)
             request.py_draft_tokens = draft_tokens
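To make the new flow concrete, here is a minimal, self-contained sketch of the drafting-and-padding path after this change. The standalone helpers, the plain-dict ngram pool, and the toy values are illustrative assumptions, not the real TensorRT-LLM API; only the fallback-to-`padding_id` and pad-with-0 behavior mirrors the diff.

# Illustrative sketch only: the real implementation lives in
# tensorrt_llm/_torch/speculative/ngram.py and manages per-request pools.

PADDING_ID = 0  # the commit pads with 0 instead of the request's end_id


def get_draft_tokens(pool, prefix, padding_id, max_sequence_length,
                     max_matching_ngram_size=4):
    """Look up draft tokens for `prefix`; fall back to [padding_id]."""
    prefix_len = len(prefix)
    # No draft tokens are needed if the prefix already fills the budget.
    if max_sequence_length - 1 - prefix_len <= 0:
        return [padding_id]
    draft_tokens = [padding_id]  # fallback value, as in the diff
    # Try the longest ngram suffix of the prefix first, then shorter ones.
    for size in range(min(max_matching_ngram_size, prefix_len - 1), 0, -1):
        pattern = tuple(prefix[-size:])
        if pattern in pool:
            draft_tokens = list(pool[pattern])
            break
    return draft_tokens


def pad_draft_tokens(draft_tokens, max_draft_len):
    """Pad to a fixed length with PADDING_ID, never with end_id."""
    pad_length = max_draft_len - len(draft_tokens)
    return draft_tokens + [PADDING_ID] * pad_length


pool = {(3, 5): [7, 11]}  # toy pool: ngram suffix -> continuation tokens
drafts = get_draft_tokens(pool, prefix=[1, 3, 5], padding_id=PADDING_ID,
                          max_sequence_length=32)
print(pad_draft_tokens(drafts, max_draft_len=4))  # -> [7, 11, 0, 0]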

tests/unittest/_torch/speculative/test_ngram.py

Lines changed: 1 addition & 1 deletion
@@ -54,7 +54,7 @@ def test_llama_ngram(disable_overlap_scheduler: bool, use_cuda_graph: bool,
         "The capital of France is",
         "The president of the United States is",
     ]
-    sampling_params = SamplingParams(max_tokens=32)
+    sampling_params = SamplingParams(max_tokens=32, ignore_eos=True)
 
     llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
     results_spec = llm_spec.generate(prompts, sampling_params)
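A plausible reading of this one-line test change (inferred from the diff, not stated in the commit): `ignore_eos=True` makes both the speculative and reference runs generate the full 32 tokens, so an early end-of-sequence token cannot cut one run short and skew the comparison between them.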
