@@ -1,4 +1,5 @@
 import torch
+from tests.core import copy_to_cpu, allclose, call_module, call_func
 
 from deeplink_ext.internevo_ops.flash_attention import (
     FlashSelfAttention,
@@ -11,64 +12,59 @@
 
 
 def test_self_attention():
-    batch = 8
-    seqlen = 32
-    nheads = 16
-    headdim = 64
+    batch, seqlen, nheads, headdim = [8, 32, 16, 64]
 
-    q_ref = torch.rand([batch, seqlen, nheads, headdim], requires_grad=True)
-    k_ref = torch.rand([batch, seqlen, nheads, headdim], requires_grad=True)
-    v_ref = torch.rand([batch, seqlen, nheads, headdim], requires_grad=True)
-    qkv_ref = torch.stack([q_ref, k_ref, v_ref], 2)
-    q_ext = q_ref.clone().detach().to(torch.float16).cuda().requires_grad_()
-    k_ext = k_ref.clone().detach().to(torch.float16).cuda().requires_grad_()
-    v_ext = v_ref.clone().detach().to(torch.float16).cuda().requires_grad_()
-
-    model_ref = SelfAttention()
-    model_ext = FlashSelfAttention()
-    out_ref = model_ref(None, q_ref, k_ref, v_ref, None)
-    out_ext = model_ext(None, q_ext, k_ext, v_ext, None)
-    out_ref.backward(torch.ones_like(out_ref))
-    out_ext.backward(torch.ones_like(out_ext))
-
-    assert torch.allclose(
-        out_ext.cpu(), out_ref.to(torch.float16), rtol=1e-3, atol=1e-3
+    q_gpu = torch.rand(
+        [batch, seqlen, nheads, headdim],
+        dtype=torch.float16,
+        requires_grad=True,
+        device="cuda",
     )
-    assert torch.allclose(
-        q_ext.grad.cpu(), q_ref.grad.to(torch.float16), rtol=1e-3, atol=1e-3
+    k_gpu = torch.rand(
+        [batch, seqlen, nheads, headdim],
+        dtype=torch.float16,
+        requires_grad=True,
+        device="cuda",
     )
-    assert torch.allclose(
-        k_ext.grad.cpu(), k_ref.grad.to(torch.float16), rtol=1e-3, atol=1e-3
+    v_gpu = torch.rand(
+        [batch, seqlen, nheads, headdim],
+        dtype=torch.float16,
+        requires_grad=True,
+        device="cuda",
+    )
+
+    q_cpu, k_cpu, v_cpu = copy_to_cpu([q_gpu, k_gpu, v_gpu])
+    ouput_forward_cpu, grads_cpu = call_module(
+        SelfAttention(), None, q_cpu, k_cpu, v_cpu, None
     )
-    assert torch.allclose(
-        v_ext.grad.cpu(), v_ref.grad.to(torch.float16), rtol=1e-3, atol=1e-3
+    ouput_forward_gpu, grads_gpu = call_module(
+        FlashSelfAttention().cuda(), None, q_gpu, k_gpu, v_gpu, None
     )
+    assert allclose(ouput_forward_cpu, ouput_forward_gpu, rtol=1e-3, atol=1e-3)
+    assert allclose(grads_cpu, grads_gpu, rtol=1e-3, atol=1e-3)
 
 
 def test_cross_attention():
-    batch = 8
-    seqlen = 32
-    nheads = 16
-    headdim = 64
+    batch, seqlen, nheads, headdim = [8, 32, 16, 64]
 
-    q_ref = torch.rand([batch, seqlen, nheads, headdim], requires_grad=True)
-    kv_ref = torch.rand([batch, seqlen, 2, nheads, headdim], requires_grad=True)
-    q_ext = q_ref.clone().detach().to(torch.float16).cuda().requires_grad_()
-    kv_ext = kv_ref.clone().detach().to(torch.float16).cuda().requires_grad_()
-
-    model_ref = CrossAttention()
-    model_ext = FlashCrossAttention()
-    out_ref = model_ref(q_ref, kv_ref)
-    out_ext = model_ext(q_ext, kv_ext)
-    out_ref.backward(torch.ones_like(out_ref))
-    out_ext.backward(torch.ones_like(out_ext))
-
-    assert torch.allclose(
-        out_ext.cpu(), out_ref.to(torch.float16), rtol=1e-3, atol=1e-3
+    q_gpu = torch.rand(
+        [batch, seqlen, nheads, headdim],
+        dtype=torch.float16,
+        requires_grad=True,
+        device="cuda",
     )
-    assert torch.allclose(
-        q_ext.grad.cpu(), q_ref.grad.to(torch.float16), rtol=1e-3, atol=1e-3
+    kv_gpu = torch.rand(
+        [batch, seqlen, 2, nheads, headdim],
+        dtype=torch.float16,
+        requires_grad=True,
+        device="cuda",
     )
-    assert torch.allclose(
-        kv_ext.grad.cpu(), kv_ref.grad.to(torch.float16), rtol=1e-3, atol=1e-3
+
+    q_cpu, kv_cpu = copy_to_cpu([q_gpu, kv_gpu])
+    ouput_forward_cpu, grads_cpu = call_module(CrossAttention(), q_cpu, kv_cpu)
+    ouput_forward_gpu, grads_gpu = call_module(
+        FlashCrossAttention().cuda(), q_gpu, kv_gpu
     )
+
+    assert allclose(ouput_forward_cpu, ouput_forward_gpu, rtol=1e-3, atol=1e-3)
+    assert allclose(grads_cpu, grads_gpu, rtol=1e-3, atol=1e-3)
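
Note on the new helpers: copy_to_cpu, call_module, and allclose are imported from tests.core, whose implementation is not part of this diff. Judging from the hand-written code they replace, copy_to_cpu and call_module plausibly behave like the sketch below; the exact signatures, dtype handling, and gradient collection are assumptions, not the actual tests.core code.

# Hypothetical sketch of the tests.core helpers (not the real implementation).
import torch


def copy_to_cpu(tensors):
    # Make detached CPU float32 copies of the GPU float16 inputs and turn
    # gradient tracking back on, so they can feed the reference module.
    return [t.detach().clone().cpu().float().requires_grad_() for t in tensors]


def call_module(module, *args):
    # Run the forward pass, backpropagate a gradient of ones (mirroring the
    # removed out.backward(torch.ones_like(out)) calls), and gather the
    # gradients of every tensor argument that tracks gradients.
    output = module(*args)
    output.backward(torch.ones_like(output))
    grads = [
        arg.grad
        for arg in args
        if isinstance(arg, torch.Tensor) and arg.requires_grad
    ]
    return output, grads

With helpers of this shape, each test only builds its inputs; the forward/backward bookkeeping and gradient collection live in one place instead of being repeated per tensor.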
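
The allclose helper, as called above, has to accept both the single output tensors and the lists of gradients returned by call_module, and compare a CPU reference against a float16 CUDA result. A minimal sketch, again an assumption rather than the real tests.core code:

# Hypothetical sketch of the allclose helper (assumed behaviour only).
import torch


def allclose(reference, actual, rtol=1e-5, atol=1e-8):
    # Recurse over lists/tuples so grads_cpu and grads_gpu can be compared
    # element by element.
    if isinstance(reference, (list, tuple)):
        return all(
            allclose(r, a, rtol=rtol, atol=atol) for r, a in zip(reference, actual)
        )
    # Bring both tensors to the CPU and the reference dtype before comparing.
    return torch.allclose(
        actual.detach().cpu().to(reference.dtype),
        reference.detach().cpu(),
        rtol=rtol,
        atol=atol,
    )

The 1e-3 tolerances carried over from the old torch.allclose asserts keep the float16-vs-reference comparison workable; the refactored tests also draw the random inputs directly in float16 on the GPU, so, assuming copy_to_cpu only copies (or upcasts) them, the CPU reference path starts from exactly the same values instead of values rounded down from float32.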