
Commit 8861b56

[Perf] Improve Llama4 performance for small max_seqlen cases
Signed-off-by: Yilin Fan <[email protected]>
1 parent 8df7a26 commit 8861b56

File tree

1 file changed: +1 −1 lines changed

tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ def __init__(
         else:
             # Disable chunked attention when max_seq_len is smaller than attention_chunk_size
             # TODO: Remove this after all attention kernels in TRTLLM backend support chunked attention
-            if attention_chunk_size and model_config.max_seq_len and model_confg.max_seq_len < attention_chunk_size:
+            if attention_chunk_size and model_config.max_seq_len and model_config.max_seq_len < attention_chunk_size:
                 attention_chunk_size = None
 
         super().__init__(

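For reference, below is a minimal runnable sketch of what the corrected guard does, not the TensorRT-LLM implementation itself; DummyModelConfig and resolve_attention_chunk_size are hypothetical names used only for illustration.

from dataclasses import dataclass
from typing import Optional


@dataclass
class DummyModelConfig:
    # Hypothetical stand-in for the model config; only the field the guard
    # reads is modeled here.
    max_seq_len: Optional[int] = None


def resolve_attention_chunk_size(attention_chunk_size: Optional[int],
                                 model_config: DummyModelConfig) -> Optional[int]:
    # Same condition as the corrected line: if the model can never see a
    # sequence longer than one chunk, chunked attention buys nothing, so it
    # is disabled by returning None.
    if attention_chunk_size and model_config.max_seq_len \
            and model_config.max_seq_len < attention_chunk_size:
        return None
    return attention_chunk_size


# A max_seq_len shorter than the chunk size falls back to regular attention.
print(resolve_attention_chunk_size(8192, DummyModelConfig(max_seq_len=2048)))   # None
# A max_seq_len at or above the chunk size keeps chunked attention enabled.
print(resolve_attention_chunk_size(8192, DummyModelConfig(max_seq_len=16384)))  # 8192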