@@ -2493,6 +2493,33 @@ def test_cudnn_attention_gqa(self, device):
 
         self.assertEqual(output_math, output_cudnn)
 
+    @skipIfRocm  # No cuDNN Attention
+    @unittest.skipIf(not PLATFORM_SUPPORTS_CUDNN_ATTENTION, "cuDNN Attention is not supported on this system")
+    def test_cudnn_attention_d256_heuristic(self, device):
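+        # head_dim_k=256 (with head_dim_v=64) exercises the cuDNN d=256 heuristic; MATH is listed as a fallback backend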
+        dtype = torch.bfloat16
+        make_tensor = partial(torch.rand, device=device, dtype=dtype, requires_grad=True)
+        batch, num_heads, head_dim_k, head_dim_v = 32, 16, 256, 64
+        seq_len = 640
+        q_shape = SdpaShape(batch, num_heads, seq_len, head_dim_k)
+        k_shape = SdpaShape(batch, num_heads, seq_len, head_dim_k)
+        v_shape = SdpaShape(batch, num_heads, seq_len, head_dim_v)
+        query, key, value = make_tensor(q_shape), make_tensor(k_shape), make_tensor(v_shape)
+
+        with sdpa_kernel(backends=[SDPBackend.CUDNN_ATTENTION, SDPBackend.MATH], set_priority=True):
+            actual = torch.nn.functional.scaled_dot_product_attention(
+                query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False)
+            actual.backward(torch.randn_like(actual))
+        with sdpa_kernel(backends=[SDPBackend.MATH]):
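+            # Reference path: math backend in float32, compared below to the bf16 result with relaxed tolerances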
+            math_ref = torch.nn.functional.scaled_dot_product_attention(
+                query.contiguous().to(torch.float32),
+                key.contiguous().to(torch.float32),
+                value.contiguous().to(torch.float32),
+                attn_mask=None, dropout_p=0.0, is_causal=False)
+
+        self.assertEqual(actual.contiguous(), math_ref.contiguous().to(dtype), atol=1e-3, rtol=1e-2)
+
     @skipIfRocm(msg="No cuDNN on ROCm")
     @unittest.skipIf(not PLATFORM_SUPPORTS_CUDNN_ATTENTION, "cuDNN Attention is not supported on this system")
     def test_fused_attention_different_dk_dv(self, device):