
Commit 8861b56

[Perf] Improve Llama4 performance for small max_seqlen cases
Signed-off-by: Yilin Fan <[email protected]>
1 parent 8df7a26 commit 8861b56

File tree

1 file changed: +1 −1 lines changed

tensorrt_llm/_torch/models/modeling_llama.py

Lines changed: 1 addition & 1 deletion
@@ -77,7 +77,7 @@ def __init__(
         else:
             # Disable chunked attention when max_seq_len is smaller than attention_chunk_size
             # TODO: Remove this after all attention kernels in TRTLLM backend support chunked attention
-            if attention_chunk_size and model_config.max_seq_len and model_confg.max_seq_len < attention_chunk_size:
+            if attention_chunk_size and model_config.max_seq_len and model_config.max_seq_len < attention_chunk_size:
                 attention_chunk_size = None
 
         super().__init__(

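For reference, below is a minimal runnable sketch of what the corrected guard does, not the TensorRT-LLM implementation itself; DummyModelConfig and resolve_attention_chunk_size are hypothetical names used only for illustration.

from dataclasses import dataclass
from typing import Optional


@dataclass
class DummyModelConfig:
    # Hypothetical stand-in for the model config; only the field the guard
    # reads is modeled here.
    max_seq_len: Optional[int] = None


def resolve_attention_chunk_size(attention_chunk_size: Optional[int],
                                 model_config: DummyModelConfig) -> Optional[int]:
    # Same condition as the corrected line: if the model can never see a
    # sequence longer than one chunk, chunked attention buys nothing, so it
    # is disabled by returning None.
    if attention_chunk_size and model_config.max_seq_len \
            and model_config.max_seq_len < attention_chunk_size:
        return None
    return attention_chunk_size


# A max_seq_len shorter than the chunk size falls back to regular attention.
print(resolve_attention_chunk_size(8192, DummyModelConfig(max_seq_len=2048)))   # None
# A max_seq_len at or above the chunk size keeps chunked attention enabled.
print(resolve_attention_chunk_size(8192, DummyModelConfig(max_seq_len=16384)))  # 8192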