@@ -28,7 +28,23 @@ def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0):
         self.softmax_scale = softmax_scale
         self.drop = nn.Dropout(attention_dropout)

-    def forward(self, qkv, causal=None, key_padding_mask=None):
+    def forward(
+        self,
+        qkv=None,
+        q=None,
+        k=None,
+        v=None,
+        kv=None,
+        causal=None,
+        cu_seqlens=None,
+        max_seqlen=None,
+        cu_seqlens_q=None,
+        cu_seqlens_k=None,
+        max_seqlen_q=None,
+        max_seqlen_k=None,
+        softmax_scale=None,
+        dropout_p=0.0,
+    ):
         """Only supports the padded mode"""
         """Implements the multihead softmax attention.
         Arguments
@@ -38,29 +54,48 @@ def forward(self, qkv, causal=None, key_padding_mask=None):
             key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
                 False means to mask out. (B, S)
         """
-        batch_size, seqlen = qkv.shape[0], qkv.shape[1]
+        if qkv is not None:
+            query, key, value = qkv[:, :, 0], qkv[:, :, 1], qkv[:, :, 2]
+            device = query.device
+        elif kv is not None:
+            assert q is not None, "q should not be None when kv is not None"
+            assert q.device == kv.device, "the devices of q and kv should be the same"
+            query = q
+            key, value = kv[:, :, 0], kv[:, :, 1]
+            device = query.device
+        else:
+            assert (
+                q is not None and k is not None and v is not None
+            ), "q, k, v should not be None"
+            assert (
+                q.device == k.device and k.device == v.device
+            ), "the devices of q, k and v should be the same"
+            query = q
+            key, value = k, v
+            device = query.device
+
+        batch_size, seqlen = query.shape[0], query.shape[1]
         causal = self.causal if causal is None else causal
-        q, k, v = qkv.unbind(dim=2)
-        softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
-        scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
-        if key_padding_mask is not None:
-            padding_mask = torch.full(
-                (batch_size, seqlen), -10000.0, dtype=scores.dtype, device=scores.device
-            )
-            padding_mask.masked_fill_(key_padding_mask, 0.0)
-            # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
-            scores = scores + rearrange(padding_mask, "b s -> b 1 1 s")
+        softmax_scale = self.softmax_scale or 1.0 / math.sqrt(query.shape[-1])
+        scores = torch.einsum("bthd,bshd->bhts", query, key * softmax_scale)
+        # if key_padding_mask is not None:
+        #     padding_mask = torch.full(
+        #         (batch_size, seqlen), -10000.0, dtype=scores.dtype, device=scores.device
+        #     )
+        #     padding_mask.masked_fill_(key_padding_mask, 0.0)
+        #     # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
+        #     scores = scores + rearrange(padding_mask, "b s -> b 1 1 s")
         if causal:
             # "triu_tril_cuda_template" not implemented for 'BFloat16'
             # So we have to construct the mask in float
             causal_mask = torch.triu(
-                torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1
+                torch.full((seqlen, seqlen), -10000.0, device=device), 1
             )
             # TD [2022-09-30]: Adding is faster than masked_fill_ (idk why, just better kernel I guess)
             scores = scores + causal_mask.to(dtype=scores.dtype)
-        attention = torch.softmax(scores, dim=-1, dtype=v.dtype)
+        attention = torch.softmax(scores, dim=-1, dtype=value.dtype)
         attention_drop = self.drop(attention)
-        output = torch.einsum("bhts,bshd->bthd", attention_drop, v)
+        output = torch.einsum("bhts,bshd->bthd", attention_drop, value)
         return output
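
For context, here is a minimal usage sketch of the refactored forward, covering its three input paths (packed qkv, q plus packed kv, and fully separate q/k/v). The enclosing class is not shown in this hunk, so the name SelfAttention and the (B, S, 3, H, D) packed layout implied by qkv[:, :, 0] are assumptions.

import torch

# Assumed: SelfAttention is the nn.Module this diff modifies, importable from the model code.
attn = SelfAttention(causal=True, attention_dropout=0.1)

# Packed self-attention path: qkv of shape (B, S, 3, H, D).
qkv = torch.randn(2, 16, 3, 8, 64)
out = attn(qkv=qkv)            # -> (B, S, H, D)

# q with packed kv of shape (B, S, 2, H, D).
q = torch.randn(2, 16, 8, 64)
kv = torch.randn(2, 16, 2, 8, 64)
out = attn(q=q, kv=kv)

# Fully separate q, k, v tensors, each (B, S, H, D).
k = torch.randn(2, 16, 8, 64)
v = torch.randn(2, 16, 8, 64)
out = attn(q=q, k=k, v=v)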