Commit ecd621f

feat: Add head size 72 support for QKV Preprocessing kernel (#3743)
* refactor: Fix head size 72 attention error for the TRTLLM attn backend in the PyTorch workflow
  - Removed the head size pre-check logic in AttentionOp, because head size 72 can be supported with fmha kernels.
  - Added support for head size 72 in the unfused attention kernels (QKVPreprocessing).
  - Enhanced unit tests by introducing a scenario generation function for better coverage of attention configurations (including head size 72).

  Signed-off-by: qixiang-99 <[email protected]>

* update: Waive head_dim=72 test cases and enhance test representation
  - Added a waiver for head_dim=72 cases on post-SM100 GPUs in the test suite to address known issues.
  - Introduced a custom __repr__ method in the Scenario class for pytest substring matching.

  Signed-off-by: qixiang-99 <[email protected]>

---------

Signed-off-by: qixiang-99 <[email protected]>
1 parent 5b9897a commit ecd621f

File tree: 3 files changed, +40 -20 lines changed


cpp/tensorrt_llm/common/attentionOp.cpp

Lines changed: 2 additions & 2 deletions
@@ -2297,8 +2297,8 @@ int AttentionOp::initialize() noexcept
         "Unsupported data type, pre SM 80 GPUs do not support bfloat16");
 
     // Pre-check whether the head size is supported by MMHA.
-    // Support head size == 72 only for fmha kernels (in Cross Attention), so skip pre-check here.
-    if (getHeadSize() == 72 && mCrossAttention)
+    // Support head size == 72 only for fmha kernels, so skip pre-check here.
+    if (getHeadSize() == 72)
     {
         ;
     }
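
To summarize the change above: head size 72 previously skipped the MMHA pre-check only when cross attention was enabled, and it now always skips it because the fmha kernels can handle it. A minimal Python sketch of that condition, for illustration only (the helper name skips_mmha_precheck is hypothetical, not part of AttentionOp):

# Hypothetical sketch, not AttentionOp code: restates the condition changed above.
def skips_mmha_precheck(head_size: int, is_cross_attention: bool,
                        after_this_commit: bool = True) -> bool:
    if after_this_commit:
        return head_size == 72                        # fmha kernels handle 72 everywhere
    return head_size == 72 and is_cross_attention     # old behavior: cross attention only


assert skips_mmha_precheck(72, is_cross_attention=False)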

cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h

Lines changed: 1 addition & 0 deletions
@@ -1603,6 +1603,7 @@ void invokeApplyBiasRopeUpdateKVCacheDispatch(QKVPreprocessingParams<T, KVCacheB
     case 32: kernelV2DispatchHeadSize<256, 32, T, TCache, KVCacheBuffer>(params, stream); break;
     case 48: kernelV2DispatchHeadSize<192, 48, T, TCache, KVCacheBuffer>(params, stream); break;
     case 64: kernelV2DispatchHeadSize<256, 64, T, TCache, KVCacheBuffer>(params, stream); break;
+    case 72: kernelV2DispatchHeadSize<288, 72, T, TCache, KVCacheBuffer>(params, stream); break;
     case 80: kernelV2DispatchHeadSize<160, 80, T, TCache, KVCacheBuffer>(params, stream); break;
     case 96: kernelV2DispatchHeadSize<192, 96, T, TCache, KVCacheBuffer>(params, stream); break;
     case 104: kernelV2DispatchHeadSize<416, 104, T, TCache, KVCacheBuffer>(params, stream); break;
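
The switch above selects a kernelV2DispatchHeadSize specialization keyed on the head size, and this commit adds the 72 entry. Below is a rough, non-authoritative Python sketch of that table-driven dispatch, using only the pairs visible in this hunk; the meaning of the first template argument follows the kernel's own convention and is not documented here, and the function name dispatch_qkv_preprocessing is hypothetical.

# Hypothetical sketch, not TensorRT-LLM code: mirrors the head-size dispatch
# shown in invokeApplyBiasRopeUpdateKVCacheDispatch for the cases in this hunk.
_HEAD_SIZE_TABLE = {
    32: (256, 32),
    48: (192, 48),
    64: (256, 64),
    72: (288, 72),   # entry added by this commit
    80: (160, 80),
    96: (192, 96),
    104: (416, 104),
}


def dispatch_qkv_preprocessing(head_size: int):
    """Return the template arguments a matching case would instantiate,
    or raise if the head size has no entry in this (partial) table."""
    if head_size not in _HEAD_SIZE_TABLE:
        raise ValueError(f"head size {head_size} is not covered by this hunk")
    return _HEAD_SIZE_TABLE[head_size]


print(dispatch_qkv_preprocessing(72))  # (288, 72)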

tests/unittest/_torch/test_attention_no_cache.py

Lines changed: 37 additions & 18 deletions
@@ -1,16 +1,35 @@
+import itertools
 import math
 import random
 from dataclasses import dataclass
-from typing import List
+from typing import List, Tuple
 
 import pytest
 import torch
+from utils.util import skip_blackwell
 
 from tensorrt_llm._torch.attention_backend.interface import \
     PredefinedAttentionMask
 from tensorrt_llm._torch.attention_backend.utils import get_attention_backend
 
 
+def generate_attn_scenarios(num_q_heads_kv_heads: List[Tuple[int, int]],
+                            head_dim: List[int], num_layers: List[int],
+                            dtype: List[torch.dtype]):
+    scenarios = []
+    product_iter = itertools.product(num_q_heads_kv_heads, head_dim, num_layers,
+                                     dtype)
+    for num_q_heads_kv_head, head_dim, num_layers, dtype in product_iter:
+        num_q_heads, num_kv_heads = num_q_heads_kv_head
+        scenarios.append(
+            Scenario(num_heads=num_q_heads,
+                     num_kv_heads=num_kv_heads,
+                     head_dim=head_dim,
+                     num_layers=num_layers,
+                     dtype=dtype))
+    return scenarios
+
+
 def calculate_ref_result(q: torch.Tensor,
                          k: torch.Tensor,
                          v: torch.Tensor,

@@ -110,6 +129,10 @@ class Scenario:
     def num_kv_groups(self) -> int:
         return self.num_heads // self.num_kv_heads
 
+    # self-defined repr for pytest substring match
+    def __repr__(self) -> str:
+        return f"Scenario(num_heads_{self.num_heads}, num_kv_heads_{self.num_kv_heads}, head_dim_{self.head_dim}, num_layers_{self.num_layers}, dtype_{self.dtype})"
+
 
 def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     """

@@ -144,26 +167,21 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
     random_context_sequence_lengths,
 ]
 
-scenarios = [
-    # num_heads == num_kv_heads, single layer
-    Scenario(
-        num_layers=1,
-        num_heads=32,
-        num_kv_heads=32,
-        head_dim=128,
-        dtype=torch.float16,
-    ),
-    # num_heads > num_kv_heads, multi-layer
-    Scenario(
-        num_layers=2,
-        num_heads=32,
-        num_kv_heads=8,
-        head_dim=128,
-        dtype=torch.float16,
-    ),
+num_q_heads_kv_heads = [
+    (32, 32),
+    (32, 8),
+    (16, 16),
 ]
+num_layers = [1, 2, 16]
+head_dim = [64, 72, 128]
+dtype = [torch.float16]
+
+scenarios = generate_attn_scenarios(num_q_heads_kv_heads, head_dim, num_layers,
+                                    dtype)
 
 
+# skip for blackwell
+@skip_blackwell
 # Convert parameterized tests to pytest parametrize
 @pytest.mark.parametrize("accuracy", [(1e-2, 1e-3)],
                          ids=lambda x: f"atol={x[0]} rtol={x[1]}")

@@ -178,6 +196,7 @@ def test_attention_no_cache(scenario: Scenario,
                             context_sequence_lengths: List[int], mask_type,
                             accuracy):
     """Test attention computation without using cache for both FULL and CAUSAL masks"""
+
     num_heads = scenario.num_heads
     num_kv_heads = scenario.num_kv_heads
     head_dim = scenario.head_dim

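To make the new test plumbing concrete: the helper builds the cross product of all configuration axes, and the custom __repr__ gives each Scenario a stable, substring-matchable id. The sketch below is self-contained (it re-declares a minimal Scenario with a plain-string dtype instead of importing the test module or torch), so treat it as an illustration of the pattern rather than the test file itself.

import itertools
from dataclasses import dataclass
from typing import List, Tuple


@dataclass(frozen=True)
class Scenario:
    # Minimal stand-in for the Scenario dataclass in test_attention_no_cache.py.
    num_heads: int
    num_kv_heads: int
    head_dim: int
    num_layers: int
    dtype: str  # the real test uses torch.dtype; a string keeps this sketch dependency-free

    def __repr__(self) -> str:
        # Same shape as the repr added in the diff, so pytest -k can match
        # substrings such as "head_dim_72".
        return (f"Scenario(num_heads_{self.num_heads}, num_kv_heads_{self.num_kv_heads}, "
                f"head_dim_{self.head_dim}, num_layers_{self.num_layers}, dtype_{self.dtype})")


def generate_attn_scenarios(num_q_heads_kv_heads: List[Tuple[int, int]],
                            head_dim: List[int], num_layers: List[int],
                            dtype: List[str]) -> List[Scenario]:
    # Cross product of every configuration axis, as in the new test helper.
    return [
        Scenario(num_heads=q, num_kv_heads=kv, head_dim=hd, num_layers=nl, dtype=dt)
        for (q, kv), hd, nl, dt in itertools.product(num_q_heads_kv_heads, head_dim,
                                                     num_layers, dtype)
    ]


scenarios = generate_attn_scenarios([(32, 32), (32, 8), (16, 16)],
                                    head_dim=[64, 72, 128],
                                    num_layers=[1, 2, 16],
                                    dtype=["float16"])
print(len(scenarios))                                      # 27 scenarios
print([s for s in scenarios if "head_dim_72" in repr(s)][0])

With ids of this shape, a selection such as pytest -k "head_dim_72" (or a waiver keyed on the same substring) can presumably target exactly the head size 72 cases added by this commit.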