
Commit 7f31555

code style cleanup and minor tweaks (no logic change)
Signed-off-by: Xuanyu Chen <[email protected]>
1 parent 1d243b9 commit 7f31555

File tree

3 files changed: +24 -25 lines changed


tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 14 additions & 15 deletions
```diff
@@ -380,22 +380,21 @@ def __init__(
             self.max_draft_len = spec_config.max_draft_len
 
             if self.is_advanced_mtp_sampler:
+                mtp_total_sampling_size = self.batch_size * (
+                    self.max_draft_len + 1)
                 self.temperatures_cuda = torch.empty(
-                    (self.batch_size * (self.max_draft_len + 1), ),
-                    dtype=torch.float,
-                    device='cuda')
-                self.top_k_cuda = torch.empty(
-                    (self.batch_size * (self.max_draft_len + 1), ),
-                    dtype=torch.int,
-                    device='cuda')
-                self.top_p_cuda = torch.empty(
-                    (self.batch_size * (self.max_draft_len + 1), ),
-                    dtype=torch.float,
-                    device='cuda')
-                self.min_p_cuda = torch.empty(
-                    (self.batch_size * (self.max_draft_len + 1), ),
+                    (mtp_total_sampling_size, ),
                     dtype=torch.float,
                     device='cuda')
+                self.top_k_cuda = torch.empty((mtp_total_sampling_size, ),
+                                              dtype=torch.int,
+                                              device='cuda')
+                self.top_p_cuda = torch.empty((mtp_total_sampling_size, ),
+                                              dtype=torch.float,
+                                              device='cuda')
+                self.min_p_cuda = torch.empty((mtp_total_sampling_size, ),
+                                              dtype=torch.float,
+                                              device='cuda')
         else:
             self.without_logits = False
             self.max_draft_len = 0
```
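
The only change in this hunk is hoisting the repeated `self.batch_size * (self.max_draft_len + 1)` size into a single `mtp_total_sampling_size` local; the buffers themselves are identical. For readers outside the codebase, here is a minimal, self-contained sketch of the same preallocation pattern, with `batch_size` and `max_draft_len` as stand-in values rather than the engine's real attributes:

```python
import torch

# Stand-in sizes; the real values come from the engine's batch and speculative-decoding config.
batch_size = 8
max_draft_len = 3

# Flat size: one sampling-parameter slot per request for each of the (max_draft_len + 1) sampled positions.
mtp_total_sampling_size = batch_size * (max_draft_len + 1)

# Preallocate flat buffers once; later steps only write into them.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
temperatures = torch.empty((mtp_total_sampling_size, ), dtype=torch.float, device=device)
top_k = torch.empty((mtp_total_sampling_size, ), dtype=torch.int, device=device)
top_p = torch.empty((mtp_total_sampling_size, ), dtype=torch.float, device=device)
min_p = torch.empty((mtp_total_sampling_size, ), dtype=torch.float, device=device)

print(temperatures.shape)  # torch.Size([32]) for 8 requests * (3 drafts + 1)
```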
```diff
@@ -1162,7 +1161,7 @@ def _prepare_tp_inputs(
 
         def get_request_temperature(request: LlmRequest) -> float:
             if not request.sampling_config.temperature:
-                return 0.7
+                return 1.0
             temperature = request.sampling_config.temperature[0]
             if 0 < temperature < 1e-2:
                 # temperature less than 0.01 may cause numerical errors
@@ -1174,7 +1173,7 @@ def get_request_top_k(request: LlmRequest) -> int:
                 top_k = 0
             else:
                 top_k = request.sampling_config.top_k[0]
-            # flashinfer expects k > d for no top_k filter
+
             if top_k <= 0:
                 top_k = 2147483647
             return top_k
```
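
Both helpers above normalize per-request sampling parameters before they are copied into those preallocated buffers: a request without an explicit temperature now falls back to the neutral 1.0 instead of 0.7, and a non-positive top_k is mapped to 2147483647 (INT32_MAX), which downstream kernels treat as "no top-k filtering". Below is a standalone sketch of that logic; the `SamplingConfig` dataclass is a toy stand-in (the real `LlmRequest.sampling_config` carries more fields), and the clamp value for tiny temperatures is an assumption, since the body of that branch is not shown in the hunk above.

```python
from dataclasses import dataclass, field
from typing import List

# Toy stand-in for request.sampling_config.
@dataclass
class SamplingConfig:
    temperature: List[float] = field(default_factory=list)
    top_k: List[int] = field(default_factory=list)

TOP_K_DISABLED = 2147483647  # INT32_MAX sentinel: effectively no top-k filtering

def get_request_temperature(cfg: SamplingConfig) -> float:
    if not cfg.temperature:
        return 1.0  # neutral default when the request sets no temperature
    temperature = cfg.temperature[0]
    if 0 < temperature < 1e-2:
        # temperature below 0.01 may cause numerical errors; clamp it
        # (assumed handling; the exact branch body is outside the hunk above)
        temperature = 1e-2
    return temperature

def get_request_top_k(cfg: SamplingConfig) -> int:
    top_k = cfg.top_k[0] if cfg.top_k else 0
    if top_k <= 0:
        top_k = TOP_K_DISABLED
    return top_k

print(get_request_temperature(SamplingConfig()))      # 1.0
print(get_request_top_k(SamplingConfig(top_k=[0])))   # 2147483647
```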

tensorrt_llm/llmapi/llm_args.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -414,11 +414,11 @@ def supports_backend(self, backend: str) -> bool:
 
 
 class MTPDecodingConfig(DecodingBaseConfig):
-    num_nextn_predict_layers: Optional[int] = 1
-    use_relaxed_acceptance_for_thinking: Optional[bool] = False
-    relaxed_topk: Optional[int] = 1
-    relaxed_delta: Optional[float] = 0.
-    use_mtp_vanilla: Optional[bool] = False
+    num_nextn_predict_layers: int = 1
+    use_relaxed_acceptance_for_thinking: bool = False
+    relaxed_topk: int = 1
+    relaxed_delta: float = 0.
+    use_mtp_vanilla: bool = False
     use_advanced_mtp_sampler: Optional[bool] = False
 
     # TODO: remove this after distinguishing `max_draft_len` and `num_nextn_predict_layers`
```
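
Dropping `Optional[...]` from these annotations does not change the defaults; assuming `DecodingBaseConfig` is a pydantic model (as the annotated-field-with-default style suggests), it means `None` is no longer an accepted value for these fields. A toy illustration of that difference, not the real `MTPDecodingConfig`:

```python
from typing import Optional
from pydantic import BaseModel, ValidationError

class LooseConfig(BaseModel):
    relaxed_topk: Optional[int] = 1   # old style: None passes validation

class StrictConfig(BaseModel):
    relaxed_topk: int = 1             # new style: must be an int (default still 1)

print(LooseConfig(relaxed_topk=None).relaxed_topk)   # None slips through
print(StrictConfig().relaxed_topk)                   # 1
try:
    StrictConfig(relaxed_topk=None)
except ValidationError as exc:
    print(f"None rejected with {len(exc.errors())} validation error")
```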

tensorrt_llm/llmapi/tokenizer.py

Lines changed: 5 additions & 5 deletions
```diff
@@ -96,11 +96,11 @@ def convert_ids_to_tokens(
             skip_special_tokens: bool = False) -> Union[str, List[str]]:
         # DeepSeek vocabulary has token ids not mapped to any tokens, these will get converted to None
         # by the tokenizer. We need to filter them out.
-        return [
-            token for token in self.tokenizer.convert_ids_to_tokens(
-                ids, skip_special_tokens=skip_special_tokens)
-            if token is not None
-        ]
+        tokens = self.tokenizer.convert_ids_to_tokens(
+            ids, skip_special_tokens=skip_special_tokens)
+        if isinstance(ids, int):
+            return tokens  # Single token, return as-is (could be None)
+        return [token for token in tokens if token is not None]
 
     def convert_tokens_to_string(
         self,
```
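
The rewritten method distinguishes the two input shapes the underlying tokenizer supports: a list of ids still comes back as a list with the unmapped (`None`) entries filtered out, while a single int id is returned as-is, which may legitimately be `None`. A small behavioral sketch, using a toy class standing in for the real Hugging Face tokenizer:

```python
from typing import List, Optional, Union

# Toy stand-in for the wrapped tokenizer: id 3 plays the role of an unmapped DeepSeek-style id.
class FakeTokenizer:
    _vocab = {0: '<s>', 1: 'Hello', 2: 'world'}

    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        if isinstance(ids, int):
            return self._vocab.get(ids)           # single id -> token or None
        return [self._vocab.get(i) for i in ids]  # list of ids -> list (may contain None)

def convert_ids_to_tokens(tokenizer, ids) -> Union[Optional[str], List[str]]:
    tokens = tokenizer.convert_ids_to_tokens(ids)
    if isinstance(ids, int):
        return tokens                               # single token, possibly None
    return [t for t in tokens if t is not None]     # drop unmapped ids from lists

tok = FakeTokenizer()
print(convert_ids_to_tokens(tok, [1, 3, 2]))  # ['Hello', 'world']: None filtered out
print(convert_ids_to_tokens(tok, 3))          # None: passed through for a single id
```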
