Commit c2bc39a

[TRTLLM-1302][feat] Topk logprobs for TRT backend and top1 logprob for PyT backend (#6097)
Signed-off-by: Pengyun Lin <[email protected]>
1 parent ef676fc commit c2bc39a

11 files changed: +222 -125 lines changed
tensorrt_llm/executor/postproc_worker.py
Lines changed: 0 additions & 1 deletion

@@ -6,7 +6,6 @@
                     Optional, Union)
 
 import zmq
-import zmq.asyncio
 
 from .._utils import nvtx_range_debug
 from ..bindings import executor as tllm

tensorrt_llm/executor/result.py
Lines changed: 10 additions & 7 deletions

@@ -91,7 +91,7 @@ class CompletionOutput:
         text (str): The generated output text. Defaults to "".
         token_ids (List[int], optional): The token ids of the generated output text. Defaults to [].
         cumulative_logprob (float, optional): The cumulative log probability of the generated output text. Defaults to None.
-        logprobs (TokenLogprobs, optional): The log probabilities of the top probability words at each position if the logprobs are requested. Defaults to None.
+        logprobs (TokenLogprobs | List[float], optional): The log probabilities of the top probability words at each position if the logprobs are requested. Defaults to None.
         prompt_logprobs (TokenLogprobs, optional): The log probabilities per prompt token. Defaults to None.
         finish_reason (Literal['stop', 'length', 'timeout', 'cancelled'], optional): The reason why the sequence is finished. Defaults to None.
         stop_reason (int, str, optional): The stop string or token id that caused the completion to stop, None if the completion finished for some other reason. Defaults to None.
@@ -102,14 +102,15 @@ class CompletionOutput:
     Attributes:
         length (int): The number of generated tokens.
         token_ids_diff (List[int]): Newly generated token ids.
-        logprobs_diff (List[float]): Logprobs of newly generated tokens.
+        logprobs_diff (TokenLogprobs | List[float]): Logprobs of newly generated tokens.
         text_diff (str): Newly generated tokens.
     """
     index: int
     text: str = ""
     token_ids: Optional[List[int]] = field(default_factory=list)
     cumulative_logprob: Optional[float] = None
-    logprobs: Optional[TokenLogprobs] = field(default_factory=list)
+    logprobs: Optional[TokenLogprobs
+                       | List[float]] = field(default_factory=list)
     prompt_logprobs: Optional[TokenLogprobs] = field(default_factory=list)
     finish_reason: Optional[Literal['stop', 'length', 'timeout',
                                     'cancelled']] = None
@@ -141,7 +142,7 @@ def token_ids_diff(self) -> List[int]:
         return self.token_ids[self._last_token_ids_len:]
 
     @property
-    def logprobs_diff(self) -> List[float]:
+    def logprobs_diff(self) -> TokenLogprobs | List[float]:
         return self.logprobs[self._last_logprobs_len:]
 
 
@@ -244,10 +245,12 @@ def _handle_sequence(self,
             output.cumulative_logprob = response_tensors.cum_log_probs[src_idx]
 
         if logprobs_result:
+            # update logprobs from ResponseWrapper (TRT top logprobs WAR)
+            output._last_logprobs_len = len(output.logprobs)
             output.prompt_logprobs = logprobs_result.prompt
-            output.logprobs = logprobs_result.generation
-
-        if response_tensors.log_probs is not None:
+            output.logprobs += logprobs_result.generation
+        elif response_tensors.log_probs is not None:
+            # handle logprobs directly from response tensors
             output._last_logprobs_len = len(output.logprobs)
             output.logprobs = response_tensors.log_probs[src_idx]
             # overcome some WAR in the cpp executor
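
For context on how the streaming deltas stay consistent: `_handle_sequence` now records `_last_logprobs_len` before extending `output.logprobs`, so `logprobs_diff` only exposes the entries added by the current chunk, whether each entry is a plain float or a top-k `dict[int, Logprob]`. A minimal self-contained sketch of that bookkeeping (stand-in classes, not the real `CompletionOutput`/`Logprob`; field names are assumptions):

```python
from dataclasses import dataclass, field
from typing import Dict, List, Union


@dataclass
class FakeLogprob:
    # stand-in for tensorrt_llm.executor.result.Logprob (fields assumed)
    logprob: float
    rank: int


# each generation step is either a plain float (legacy single-logprob path)
# or a dict mapping token id -> FakeLogprob (top-k path)
StepLogprobs = Union[float, Dict[int, FakeLogprob]]


@dataclass
class FakeOutput:
    logprobs: List[StepLogprobs] = field(default_factory=list)
    _last_logprobs_len: int = 0

    @property
    def logprobs_diff(self) -> List[StepLogprobs]:
        # entries appended since the previous streamed chunk
        return self.logprobs[self._last_logprobs_len:]


out = FakeOutput()
# mimic the handler: remember the old length, then extend with the
# per-step top-k maps coming from the wrapped response
out._last_logprobs_len = len(out.logprobs)
out.logprobs += [{11: FakeLogprob(-0.1, 1), 42: FakeLogprob(-2.3, 2)},
                 {7: FakeLogprob(-0.5, 1), 13: FakeLogprob(-1.9, 2)}]
assert len(out.logprobs_diff) == 2  # both steps are new in this chunk
```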

tensorrt_llm/serve/openai_protocol.py
Lines changed: 24 additions & 9 deletions

@@ -498,7 +498,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
     model: str
     frequency_penalty: Optional[float] = 0.0
     logit_bias: Optional[Dict[str, float]] = None
-    logprobs: Optional[int] = None
+    logprobs: Optional[bool] = False
     top_logprobs: Optional[int] = 0
     max_completion_tokens: Optional[int] = Field(default=None,
                                                  validation_alias='max_tokens')
@@ -602,8 +602,10 @@ class ChatCompletionRequest(OpenAIBaseModel):
 
     # doc: end-chat-completion-extra-params
 
-    def to_sampling_params(self, vocab_size: int = 32000) -> SamplingParams:
-
+    def to_sampling_params(self,
+                           vocab_size: int = 32000,
+                           gather_generation_logits: bool = False,
+                           backend: Optional[str] = None) -> SamplingParams:
         sampling_params = SamplingParams(
             frequency_penalty=self.frequency_penalty,
             max_tokens=self.max_completion_tokens,
@@ -639,10 +641,20 @@ def to_sampling_params(self, vocab_size: int = 32000) -> SamplingParams:
 
             # chat-completion-extra-params
             add_special_tokens=self.add_special_tokens,
-
-            # TODO: migrate to use logprobs and prompt_logprobs
-            _return_log_probs=bool(self.logprobs),
         )
+        if self.logprobs:
+            logprobs = 1 if not self.top_logprobs else self.top_logprobs
+            if backend == "pytorch":
+                sampling_params.logprobs = logprobs
+            else:
+                if gather_generation_logits:
+                    sampling_params.logprobs = logprobs
+                elif self.top_logprobs:
+                    raise ValueError(
+                        "`gather_generation_logits` must be `True` to use `top_logprobs`"
+                    )
+                else:
+                    sampling_params._return_log_probs = True
         return sampling_params
 
     @model_validator(mode='before')
@@ -667,9 +679,12 @@ def check_tool_choice(cls, data):
     @model_validator(mode="before")
     @classmethod
     def check_logprobs(cls, data):
-        top_logprobs = data.get("top_logprobs")
-        if top_logprobs is not None and top_logprobs > 0:
-            raise ValueError("top_logprobs is not supported")
+        if (top_logprobs := data.get("top_logprobs")) is not None:
+            if top_logprobs < 0:
+                raise ValueError("top_logprobs must be positive or zero")
+            if not data.get("logprobs"):
+                raise ValueError(
+                    "logprobs must be true when using top_logprobs")
        return data
 
     @model_validator(mode="before")
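
The new branch in `to_sampling_params` is essentially a small decision table: the PyTorch backend takes the requested top-k directly, the TRT backend takes it only when generation logits are gathered, and otherwise the request falls back to the legacy single-logprob flag (or is rejected if `top_logprobs` was asked for). A standalone sketch of that table, returning a plain dict instead of a real `SamplingParams` (the function name and dict keys are illustrative, not part of the API):

```python
from typing import Optional


def resolve_logprobs_params(logprobs: bool,
                            top_logprobs: int,
                            backend: Optional[str],
                            gather_generation_logits: bool) -> dict:
    """Mirror of the branch added to to_sampling_params (illustrative only)."""
    params = {"logprobs": None, "_return_log_probs": False}
    if not logprobs:
        return params
    k = top_logprobs if top_logprobs else 1  # default to top-1
    if backend == "pytorch":
        params["logprobs"] = k
    elif gather_generation_logits:
        params["logprobs"] = k
    elif top_logprobs:
        raise ValueError(
            "`gather_generation_logits` must be `True` to use `top_logprobs`")
    else:
        params["_return_log_probs"] = True  # legacy TRT single-logprob flag
    return params


print(resolve_logprobs_params(True, 2, "pytorch", False))  # top-2 via PyT sampler
print(resolve_logprobs_params(True, 0, None, False))       # legacy flag path on TRT
print(resolve_logprobs_params(True, 2, None, True))        # TRT top-2 via gathered logits
```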

tensorrt_llm/serve/openai_server.py
Lines changed: 3 additions & 1 deletion

@@ -424,7 +424,9 @@ async def create_chat_response(
         # Pass the tokenizer vocabulary size so ``logit_bias`` can be
         # expanded into an embedding bias tensor in the sampler.
         sampling_params = request.to_sampling_params(
-            vocab_size=self.tokenizer.tokenizer.vocab_size)
+            vocab_size=self.tokenizer.tokenizer.vocab_size,
+            gather_generation_logits=self.llm.args.gather_generation_logits,
+            backend=self.llm.args.backend)
         # TODO: better way to enable metrics
         if len(os.getenv("TRTLLM_KVCACHE_TIME_OUTPUT_PATH", "")) > 0:
             sampling_params.return_perf_metrics = True

tensorrt_llm/serve/postprocess_handlers.py
Lines changed: 27 additions & 9 deletions

@@ -5,6 +5,7 @@
 from ..executor import (DetokenizedGenerationResultBase, GenerationResult,
                         GenerationResultBase)
 from ..executor.postproc_worker import PostprocArgs
+from ..executor.result import Logprob, TokenLogprobs
 from ..llmapi.reasoning_parser import (BaseReasoningParser,
                                        ReasoningParserFactory)
 from ..llmapi.tokenizer import TransformersTokenizer
@@ -39,6 +40,7 @@ class ChatPostprocArgs(PostprocArgs):
     tool_choice: Optional[Union[Literal["none"],
                                 ChatCompletionNamedToolChoiceParam]] = "none"
     return_logprobs: bool = False
+    top_logprobs: bool = False
     stream_options: Optional[StreamOptions] = None
     last_message_content: Optional[str] = None
     reasoning_parser: Optional[str] = None
@@ -56,23 +58,38 @@ def from_request(cls, request: ChatCompletionRequest):
             tools=request.tools,
             tool_choice=request.tool_choice,
             stream_options=request.stream_options,
-            return_logprobs=request.logprobs,
+            return_logprobs=bool(request.logprobs),
+            top_logprobs=bool(request.top_logprobs),
         )
 
 
 def create_logprobs(token_ids: List[int], tokenizer: TransformersTokenizer,
-                    logprobs: List[float]) -> ChatCompletionLogProbs:
+                    logprobs: List[float] | TokenLogprobs,
+                    top_logprobs: bool) -> ChatCompletionLogProbs:
     assert len(token_ids) == len(logprobs), \
         "token_ids and logprobs have different lengths"
     content: List[ChatCompletionLogProbsContent] = []
     for token_id, logprob in zip(token_ids, logprobs):
+        logprob: float | dict[int, Logprob]
         token = tokenizer.decode(token_id)
-        # returning multiple logprobs is not supported
-        first_logprob = ChatCompletionLogProbsContent(
+        chat_logprob = ChatCompletionLogProbsContent(
             token=token,
-            logprob=max(logprob, -9999.0),
-            bytes=list(token.encode("utf-8", errors="replace")))
-        content.append(first_logprob)
+            bytes=list(token.encode("utf-8", errors="replace")),
+        )
+        if isinstance(logprob, dict):
+            if token_id in logprob:
+                chat_logprob.logprob = max(logprob[token_id].logprob, -9999.0)
+            if top_logprobs:
+                chat_logprob.top_logprobs = [
+                    ChatCompletionLogProbsContent(
+                        token=(tk := tokenizer.decode(tid)),
+                        logprob=max(logprob.logprob, -9999.0),
+                        bytes=list(tk.encode("utf-8", errors="replace")))
+                    for tid, logprob in logprob.items()
+                ]
+        else:
+            chat_logprob.logprob = max(logprob, -9999.0)
+        content.append(chat_logprob)
     chat_logprobs = ChatCompletionLogProbs(content=content)
     return chat_logprobs
 
@@ -178,7 +195,7 @@ def yield_first_chat(num_tokens: int,
                 logprobs = output.logprobs_diff
                 token_ids = output.token_ids_diff
                 choice.logprobs = create_logprobs(token_ids, args.tokenizer,
-                                                  logprobs)
+                                                  logprobs, args.top_logprobs)
             if output.finish_reason is not None:
                 choice.finish_reason = output.finish_reason
                 choice.stop_reason = output.stop_reason
@@ -247,7 +264,8 @@ def chat_response_post_processor(
 
         if args.return_logprobs:
            choice.logprobs = create_logprobs(output.token_ids, args.tokenizer,
-                                              output.logprobs)
+                                              output.logprobs,
+                                              args.top_logprobs)
         choices.append(choice)
 
     if args.echo and args.last_message_content:
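
The reworked `create_logprobs` accepts either the legacy per-token float list or a per-token `dict[int, Logprob]` map and, when `top_logprobs` is requested, expands each map into an OpenAI-style `top_logprobs` list. A self-contained sketch of that conversion with stand-in types (not the real protocol or tokenizer classes; names prefixed `Fake` are assumptions):

```python
from dataclasses import dataclass, field
from typing import Callable, Dict, List, Union


@dataclass
class FakeLogprob:            # stand-in for tensorrt_llm.executor.result.Logprob
    logprob: float
    rank: int


@dataclass
class FakeLogProbsContent:    # stand-in for ChatCompletionLogProbsContent
    token: str
    logprob: float = -9999.0
    top_logprobs: List["FakeLogProbsContent"] = field(default_factory=list)


def to_chat_logprobs(token_ids: List[int],
                     decode: Callable[[int], str],
                     logprobs: List[Union[float, Dict[int, FakeLogprob]]],
                     top_logprobs: bool) -> List[FakeLogProbsContent]:
    content = []
    for token_id, lp in zip(token_ids, logprobs):
        item = FakeLogProbsContent(token=decode(token_id))
        if isinstance(lp, dict):
            # top-k path: the chosen token's logprob plus, optionally,
            # the full set of top-k alternatives for this position
            if token_id in lp:
                item.logprob = max(lp[token_id].logprob, -9999.0)
            if top_logprobs:
                item.top_logprobs = [
                    FakeLogProbsContent(token=decode(tid),
                                        logprob=max(entry.logprob, -9999.0))
                    for tid, entry in lp.items()
                ]
        else:
            # legacy path: one float per generated token
            item.logprob = max(lp, -9999.0)
        content.append(item)
    return content


steps = [{5: FakeLogprob(-0.2, 1), 9: FakeLogprob(-1.7, 2)}, -0.4]
print(to_chat_logprobs([5, 9], str, steps, top_logprobs=True))
```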

tests/integration/defs/test_e2e.py
Lines changed: 14 additions & 0 deletions

@@ -1548,6 +1548,20 @@ def test_trtllm_serve_lora_example(llm_root, llm_venv):
                      str(test_root / "_test_trtllm_serve_lora.py")])
 
 
[email protected]("backend", ["pytorch", "trt"])
+def test_trtllm_serve_top_logprobs(llm_root, llm_venv, backend: str):
+    example_root = Path(os.path.join(llm_root, "examples", "serve"))
+    test_root = unittest_path() / "llmapi" / "apps"
+    llm_venv.run_cmd([
+        "-m", "pip", "install", "-r",
+        os.path.join(example_root, "requirements.txt")
+    ])
+    llm_venv.run_cmd([
+        "-m", "pytest",
+        str(test_root / "_test_trtllm_serve_top_logprobs.py"), "-k", backend
+    ])
+
+
 @pytest.mark.parametrize("backend", ["pytorch", "trt"])
 def test_openai_misc_example(llm_root, llm_venv, backend: str):
     test_root = unittest_path() / "llmapi" / "apps"

tests/integration/test_lists/test-db/l0_a10.yml
Lines changed: 2 additions & 0 deletions

@@ -33,6 +33,7 @@ l0_a10:
   - test_e2e.py::test_openai_lora
   - test_e2e.py::test_trtllm_serve_multimodal_example
   - test_e2e.py::test_trtllm_serve_lora_example
+  - test_e2e.py::test_trtllm_serve_top_logprobs[pytorch]
   - test_e2e.py::test_openai_misc_example[pytorch]
   - test_e2e.py::test_openai_reasoning[pytorch]
   - test_e2e.py::test_openai_completions_example[pytorch]
@@ -106,6 +107,7 @@ l0_a10:
   - llmapi/test_llm_examples.py::test_llmapi_server_example
   - llmapi/test_llm_examples.py::test_llmapi_kv_cache_connector[Qwen2-0.5B]
   - test_e2e.py::test_trtllm_serve_example
+  - test_e2e.py::test_trtllm_serve_top_logprobs[trt]
   - test_e2e.py::test_openai_misc_example[trt]
   - test_e2e.py::test_openai_completions_example[trt]
   - test_e2e.py::test_openai_chat_example[trt]

tests/unittest/api_stability/references/completion_output.yaml
Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ properties:
     annotation: int
     default: inspect._empty
   logprobs_diff:
-    annotation: List[float]
+    annotation: list[dict[int, tensorrt_llm.executor.result.Logprob]] | List[float]
     default: inspect._empty
   text_diff:
     annotation: str

tests/unittest/api_stability/references_committed/completion_output.yaml
Lines changed: 1 addition & 1 deletion

@@ -20,7 +20,7 @@ methods:
       annotation: Optional[torch.Tensor]
       default: null
     logprobs:
-      annotation: Optional[list[dict[int, tensorrt_llm.executor.result.Logprob]]]
+      annotation: Optional[list[dict[int, tensorrt_llm.executor.result.Logprob]] | List[float]]
       default: null
     prompt_logprobs:
      annotation: Optional[list[dict[int, tensorrt_llm.executor.result.Logprob]]]

tests/unittest/llmapi/apps/_test_openai_chat.py
Lines changed: 5 additions & 96 deletions

@@ -141,42 +141,14 @@ def test_single_chat_session(client: openai.OpenAI, model_name: str):
     message = chat_completion.choices[0].message
     assert message.content is not None
     assert message.role == "assistant"
-
-
-def test_single_chat_session_with_logprobs(client: openai.OpenAI,
-                                           model_name: str, backend: str):
-    if backend == "pytorch":
-        pytest.skip("Logprobs are not supported in PyTorch backend yet")
-
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-
+    # test logprobs
     chat_completion = client.chat.completions.create(
         model=model_name,
         messages=messages,
         max_completion_tokens=10,
         logprobs=True,
     )
-    assert chat_completion.id is not None
-    assert len(chat_completion.choices) == 1
-    message = chat_completion.choices[0].message
-    assert message.content is not None
-    assert message.role == "assistant"
-    # test logprobs
     logprobs = chat_completion.choices[0].logprobs.content
-    finish_reason = chat_completion.choices[0].finish_reason
-    if finish_reason == "length":
-        assert len(logprobs) == 10
-    elif finish_reason == "stop":
-        assert len(logprobs) <= 10
-    else:
-        raise RuntimeError(
-            f"finish_reason {finish_reason} not in [length, stop]")
     for logprob in logprobs:
         assert logprob.token is not None
         assert logprob.logprob is not None
@@ -204,10 +176,11 @@ def test_multi_turn_dialogue(client: openai.OpenAI, model_name: str):
     assert message.content is not None and len(message.content) >= 0
 
 
-def test_multiple_response(client: openai.OpenAI, model_name: str,
-                           backend: str):
+def test_multiple_responses(client: openai.OpenAI, model_name: str,
+                            backend: str):
     if backend == "pytorch":
-        pytest.skip("Beam search is not supported in PyTorch backend yet")
+        pytest.skip(
+            "Multiple responses are not supported in PyTorch backend yet")
 
     messages = [{
         "role": "system",
@@ -252,70 +225,6 @@ async def test_chat_streaming(async_client: openai.AsyncOpenAI,
         "content": "what is 1+1?"
     }]
 
-    chat_completion = await async_client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_completion_tokens=10,
-        temperature=0.0,
-        logprobs=False,
-    )
-    output = chat_completion.choices[0].message.content
-    _finish_reason = chat_completion.choices[0].finish_reason
-
-    # test streaming
-    stream = await async_client.chat.completions.create(
-        model=model_name,
-        messages=messages,
-        max_completion_tokens=10,
-        temperature=0.0,
-        logprobs=False,
-        stream=True,
-    )
-    str_chunks: List[str] = []
-
-    finish_reason_counter = 0
-    finish_reason: str = None
-    async for chunk in stream:
-        choice = chunk.choices[0]
-        delta = choice.delta
-        if choice.finish_reason is not None:
-            finish_reason_counter += 1
-            finish_reason = choice.finish_reason
-        if delta.role:
-            assert delta.role == "assistant"
-        if delta.content:
-            str_chunks.append(delta.content)
-        # test finish_reason
-        if delta.content == "":
-            assert finish_reason == "stop"
-    assert finish_reason_counter == 1
-    assert finish_reason == _finish_reason
-    num_tokens = len(str_chunks)
-    if finish_reason == "length":
-        assert num_tokens == 10
-    elif finish_reason == "stop":
-        assert num_tokens <= 10
-    else:
-        raise RuntimeError(
-            f"finish_reason {finish_reason} not in [length, stop]")
-    # test generated tokens
-    assert "".join(str_chunks) == output
-
-
[email protected](loop_scope="module")
-async def test_chat_streaming_with_logprobs(async_client: openai.AsyncOpenAI,
-                                            model_name: str, backend: str):
-    if backend == "pytorch":
-        pytest.skip("Logprobs are not supported in PyTorch backend yet")
-
-    messages = [{
-        "role": "system",
-        "content": "you are a helpful assistant"
-    }, {
-        "role": "user",
-        "content": "what is 1+1?"
-    }]
-
     chat_completion = await async_client.chat.completions.create(
         model=model_name,
         messages=messages,
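
With the pieces above in place, a client can request per-token top logprobs through the standard OpenAI chat API. A hedged usage sketch (the base URL, API key, and model name are placeholders for a locally running trtllm-serve instance; on the TRT backend the server must have gather_generation_logits enabled for top_logprobs to be honored):

```python
import openai

# placeholders: adjust to however the local trtllm-serve instance was launched
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")

chat = client.chat.completions.create(
    model="my-served-model",  # placeholder model name
    messages=[
        {"role": "system", "content": "you are a helpful assistant"},
        {"role": "user", "content": "what is 1+1?"},
    ],
    max_completion_tokens=10,
    logprobs=True,    # must be true whenever top_logprobs is set
    top_logprobs=2,   # top-2 alternatives per generated token
)

for entry in chat.choices[0].logprobs.content:
    # entry.logprob is the sampled token's logprob; entry.top_logprobs holds
    # the alternatives filled in by create_logprobs on the server side
    print(entry.token, entry.logprob, [t.token for t in entry.top_logprobs])
```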
