Skip to content

Commit 7450216

Browse files
authored
AMDGPU: Convert some mfma tests to generated checks (#150607)
1 parent 965bb5d commit 7450216

File tree

7 files changed

+7562
-620
lines changed

7 files changed

+7562
-620
lines changed

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.bf16.ll

Lines changed: 497 additions & 90 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll

Lines changed: 860 additions & 153 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx942.ll

Lines changed: 4506 additions & 241 deletions
Large diffs are not rendered by default.
Lines changed: 158 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,172 @@
1-
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
2-
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX908 %s
3-
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX90A %s
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 < %s | FileCheck --check-prefixes=GCN,GFX908 %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -mattr=-mfma-inline-literal-bug < %s | FileCheck --check-prefixes=GCN,GFX908 %s
4+
; RUN: llc -mtriple=amdgcn -mcpu=gfx90a < %s | FileCheck --check-prefixes=GCN,GFX90A %s
45

56
declare <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32, i32, <16 x i32>, i32, i32, i32)
67
declare <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32, i32, <4 x i32>, i32, i32, i32)
78

8-
; GCN-LABEL: {{^}}test_mfma_i32_32x32x8i8:
9-
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
10-
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
11-
; GCN-DAG: s_load_dwordx16
12-
; GFX908-DAG-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
13-
; GFX90A-COUNT-16: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
14-
; GCN: v_mfma_i32_32x32x8i8 a[{{[0-9]+:[0-9]+}}], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
15-
; GFX908-COUNT-16: v_accvgpr_read_b32
16-
; GFX908: global_store_dwordx4
17-
; GFX90A-NOT: v_accvgpr_read_b32
18-
; GFX90A-COUNT-4: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
199
; Codegen test for @llvm.amdgcn.mfma.i32.32x32x8i8: loads a <16 x i32>
; accumulator from %arg, calls the intrinsic with constant operands 1 and 2
; and trailing immediates 1, 2, 3 (printed as cbsz:1 abid:2 blgp:3 in the
; checks below), then stores the <16 x i32> result back to %arg.
; The gfx908 checks copy the loaded SGPRs through VGPRs before each
; v_accvgpr_write_b32 and read the result back with v_accvgpr_read_b32 before
; storing; the gfx90a checks write the AGPRs directly from SGPRs and store
; directly from AGPRs.
define amdgpu_kernel void @test_mfma_i32_32x32x8i8(ptr addrspace(1) %arg) #0 {
10+
; GFX908-LABEL: test_mfma_i32_32x32x8i8:
11+
; GFX908: ; %bb.0: ; %bb
12+
; GFX908-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
13+
; GFX908-NEXT: v_mov_b32_e32 v0, 1
14+
; GFX908-NEXT: v_mov_b32_e32 v16, 0
15+
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
16+
; GFX908-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
17+
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
18+
; GFX908-NEXT: v_mov_b32_e32 v17, s0
19+
; GFX908-NEXT: v_mov_b32_e32 v1, s1
20+
; GFX908-NEXT: v_mov_b32_e32 v2, s2
21+
; GFX908-NEXT: v_accvgpr_write_b32 a0, v17
22+
; GFX908-NEXT: v_mov_b32_e32 v17, s3
23+
; GFX908-NEXT: v_accvgpr_write_b32 a1, v1
24+
; GFX908-NEXT: v_accvgpr_write_b32 a2, v2
25+
; GFX908-NEXT: v_accvgpr_write_b32 a3, v17
26+
; GFX908-NEXT: v_mov_b32_e32 v1, s4
27+
; GFX908-NEXT: v_mov_b32_e32 v2, s5
28+
; GFX908-NEXT: v_mov_b32_e32 v17, s6
29+
; GFX908-NEXT: v_accvgpr_write_b32 a4, v1
30+
; GFX908-NEXT: v_accvgpr_write_b32 a5, v2
31+
; GFX908-NEXT: v_accvgpr_write_b32 a6, v17
32+
; GFX908-NEXT: v_mov_b32_e32 v1, s7
33+
; GFX908-NEXT: v_mov_b32_e32 v2, s8
34+
; GFX908-NEXT: v_mov_b32_e32 v17, s9
35+
; GFX908-NEXT: v_accvgpr_write_b32 a7, v1
36+
; GFX908-NEXT: v_accvgpr_write_b32 a8, v2
37+
; GFX908-NEXT: v_accvgpr_write_b32 a9, v17
38+
; GFX908-NEXT: v_mov_b32_e32 v1, s10
39+
; GFX908-NEXT: v_mov_b32_e32 v2, s11
40+
; GFX908-NEXT: v_mov_b32_e32 v17, s12
41+
; GFX908-NEXT: v_accvgpr_write_b32 a10, v1
42+
; GFX908-NEXT: v_accvgpr_write_b32 a11, v2
43+
; GFX908-NEXT: v_accvgpr_write_b32 a12, v17
44+
; GFX908-NEXT: v_mov_b32_e32 v1, s13
45+
; GFX908-NEXT: v_mov_b32_e32 v2, s14
46+
; GFX908-NEXT: v_mov_b32_e32 v17, s15
47+
; GFX908-NEXT: v_accvgpr_write_b32 a13, v1
48+
; GFX908-NEXT: v_accvgpr_write_b32 a14, v2
49+
; GFX908-NEXT: v_accvgpr_write_b32 a15, v17
50+
; GFX908-NEXT: v_mov_b32_e32 v1, 2
51+
; GFX908-NEXT: s_nop 1
52+
; GFX908-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
53+
; GFX908-NEXT: s_nop 7
54+
; GFX908-NEXT: s_nop 7
55+
; GFX908-NEXT: s_nop 1
56+
; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
57+
; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
58+
; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
59+
; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
60+
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
61+
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
62+
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
63+
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
64+
; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
65+
; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
66+
; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
67+
; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
68+
; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
69+
; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
70+
; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
71+
; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
72+
; GFX908-NEXT: global_store_dwordx4 v16, v[12:15], s[16:17] offset:48
73+
; GFX908-NEXT: s_nop 0
74+
; GFX908-NEXT: global_store_dwordx4 v16, v[8:11], s[16:17] offset:32
75+
; GFX908-NEXT: global_store_dwordx4 v16, v[4:7], s[16:17] offset:16
76+
; GFX908-NEXT: global_store_dwordx4 v16, v[0:3], s[16:17]
77+
; GFX908-NEXT: s_endpgm
78+
;
79+
; GFX90A-LABEL: test_mfma_i32_32x32x8i8:
80+
; GFX90A: ; %bb.0: ; %bb
81+
; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
82+
; GFX90A-NEXT: v_mov_b32_e32 v0, 1
83+
; GFX90A-NEXT: v_mov_b32_e32 v1, 2
84+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
85+
; GFX90A-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
86+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
87+
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
88+
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
89+
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
90+
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
91+
; GFX90A-NEXT: v_accvgpr_write_b32 a4, s4
92+
; GFX90A-NEXT: v_accvgpr_write_b32 a5, s5
93+
; GFX90A-NEXT: v_accvgpr_write_b32 a6, s6
94+
; GFX90A-NEXT: v_accvgpr_write_b32 a7, s7
95+
; GFX90A-NEXT: v_accvgpr_write_b32 a8, s8
96+
; GFX90A-NEXT: v_accvgpr_write_b32 a9, s9
97+
; GFX90A-NEXT: v_accvgpr_write_b32 a10, s10
98+
; GFX90A-NEXT: v_accvgpr_write_b32 a11, s11
99+
; GFX90A-NEXT: v_accvgpr_write_b32 a12, s12
100+
; GFX90A-NEXT: v_accvgpr_write_b32 a13, s13
101+
; GFX90A-NEXT: v_accvgpr_write_b32 a14, s14
102+
; GFX90A-NEXT: v_accvgpr_write_b32 a15, s15
103+
; GFX90A-NEXT: s_nop 1
104+
; GFX90A-NEXT: v_mfma_i32_32x32x8i8 a[0:15], v0, v1, a[0:15] cbsz:1 abid:2 blgp:3
105+
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
106+
; GFX90A-NEXT: s_nop 7
107+
; GFX90A-NEXT: s_nop 7
108+
; GFX90A-NEXT: s_nop 1
109+
; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
110+
; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
111+
; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
112+
; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
113+
; GFX90A-NEXT: s_endpgm
20114
bb:
21115
%in.1 = load <16 x i32>, ptr addrspace(1) %arg
22116
%mai.1 = tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 1, i32 2, <16 x i32> %in.1, i32 1, i32 2, i32 3)
23117
store <16 x i32> %mai.1, ptr addrspace(1) %arg
24118
ret void
25119
}
26120

27-
; GCN-LABEL: {{^}}test_mfma_i32_16x16x16i8:
28-
; GCN-DAG: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
29-
; GCN-DAG: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
30-
; GCN: s_load_dwordx4
31-
; GFX908-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
32-
; GFX90A-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
33-
; GCN: v_mfma_i32_16x16x16i8 [[RES:a\[[0-9]+:[0-9]+\]]], [[ONE]], [[TWO]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
34-
; GFX908-COUNT-4: v_accvgpr_read_b32
35-
; GFX908: global_store_dwordx4
36-
; GFX90A-NOT: v_accvgpr_read_b32
37-
; GFX90A: global_store_dwordx4 v{{[0-9]+}}, [[RES]]
38121
define amdgpu_kernel void @test_mfma_i32_16x16x16i8(ptr addrspace(1) %arg) #0 {
122+
; GFX908-LABEL: test_mfma_i32_16x16x16i8:
123+
; GFX908: ; %bb.0: ; %bb
124+
; GFX908-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
125+
; GFX908-NEXT: v_mov_b32_e32 v0, 1
126+
; GFX908-NEXT: v_mov_b32_e32 v1, 2
127+
; GFX908-NEXT: v_mov_b32_e32 v4, 0
128+
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
129+
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
130+
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
131+
; GFX908-NEXT: v_mov_b32_e32 v5, s0
132+
; GFX908-NEXT: v_mov_b32_e32 v2, s1
133+
; GFX908-NEXT: v_mov_b32_e32 v3, s2
134+
; GFX908-NEXT: v_accvgpr_write_b32 a0, v5
135+
; GFX908-NEXT: v_mov_b32_e32 v5, s3
136+
; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
137+
; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
138+
; GFX908-NEXT: v_accvgpr_write_b32 a3, v5
139+
; GFX908-NEXT: s_nop 0
140+
; GFX908-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v1, a[0:3] cbsz:1 abid:2 blgp:3
141+
; GFX908-NEXT: s_nop 7
142+
; GFX908-NEXT: s_nop 1
143+
; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
144+
; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
145+
; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
146+
; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
147+
; GFX908-NEXT: s_nop 1
148+
; GFX908-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
149+
; GFX908-NEXT: s_endpgm
150+
;
151+
; GFX90A-LABEL: test_mfma_i32_16x16x16i8:
152+
; GFX90A: ; %bb.0: ; %bb
153+
; GFX90A-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
154+
; GFX90A-NEXT: v_mov_b32_e32 v0, 1
155+
; GFX90A-NEXT: v_mov_b32_e32 v2, 2
156+
; GFX90A-NEXT: v_mov_b32_e32 v1, 0
157+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
158+
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
159+
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
160+
; GFX90A-NEXT: v_accvgpr_write_b32 a0, s0
161+
; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
162+
; GFX90A-NEXT: v_accvgpr_write_b32 a2, s2
163+
; GFX90A-NEXT: v_accvgpr_write_b32 a3, s3
164+
; GFX90A-NEXT: s_nop 1
165+
; GFX90A-NEXT: v_mfma_i32_16x16x16i8 a[0:3], v0, v2, a[0:3] cbsz:1 abid:2 blgp:3
166+
; GFX90A-NEXT: s_nop 7
167+
; GFX90A-NEXT: s_nop 2
168+
; GFX90A-NEXT: global_store_dwordx4 v1, a[0:3], s[6:7]
169+
; GFX90A-NEXT: s_endpgm
39170
bb:
40171
%in.1 = load <4 x i32>, ptr addrspace(1) %arg
41172
%mai.1 = tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 1, i32 2, <4 x i32> %in.1, i32 1, i32 2, i32 3)
@@ -44,3 +175,5 @@ bb:
44175
}
45176

46177
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
178+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
179+
; GCN: {{.*}}
Lines changed: 64 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,40 +1,77 @@
1-
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
2-
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
3-
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GFX942 %s
4-
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel < %s | FileCheck -enable-var-scope --check-prefixes=GCN,GISEL %s
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -global-isel < %s | FileCheck --check-prefixes=GCN,GISEL %s
4+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 < %s | FileCheck --check-prefixes=GCN,GFX942 %s
5+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -stress-regalloc=10 -global-isel < %s | FileCheck --check-prefixes=GCN,GISEL %s
; NOTE(review): in the visible hunks no generated check uses the GISEL prefix,
; and GISEL is listed among the unused prefixes at the bottom of the file, so
; the two -global-isel RUN lines above appear to match only a catch-all
; pattern. Presumably the plain and -stress-regalloc=10 GlobalISel outputs
; differ, which would make the update script drop the shared prefix - TODO
; confirm, and consider giving those two runs distinct prefixes and
; regenerating the assertions.
56

67
declare <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float>, <2 x float>, <4 x float>, i32, i32, i32)
78
declare <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float>, <2 x float>, <16 x float>, i32, i32, i32)
89

9-
; GCN-LABEL: {{^}}test_mfma_f32_16x16x8xf32:
10-
; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
11-
; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
12-
; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
13-
; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
14-
; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
15-
; GFX942: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:[[TWO]]], v[[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
16-
; GISEL: v_mfma_f32_16x16x8_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
17-
; GCN-NOT: v_accvgpr_read_b32
18-
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
1910
; Codegen test for @llvm.amdgcn.mfma.f32.16x16x8.xf32: loads a <4 x float>
; accumulator from %arg, calls the intrinsic with the constant vectors
; <1.0, 2.0> and <3.0, 4.0> and trailing immediates 1, 2, 3 (printed as
; cbsz:1 abid:2 blgp:3 in the checks below), then stores the result to %arg.
; Only the GFX942 prefix has generated checks for this function; they expect
; the accumulator to be written to AGPRs straight from SGPRs and the result to
; be stored straight from a[0:3], with no v_accvgpr_read_b32 in between.
define amdgpu_kernel void @test_mfma_f32_16x16x8xf32(ptr addrspace(1) %arg) #0 {
11+
; GFX942-LABEL: test_mfma_f32_16x16x8xf32:
12+
; GFX942: ; %bb.0: ; %bb
13+
; GFX942-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x24
14+
; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
15+
; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
16+
; GFX942-NEXT: v_mov_b32_e32 v2, 0x40400000
17+
; GFX942-NEXT: v_mov_b32_e32 v3, 4.0
18+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
19+
; GFX942-NEXT: s_load_dwordx4 s[0:3], s[6:7], 0x0
20+
; GFX942-NEXT: v_mov_b32_e32 v4, 0
21+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
22+
; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
23+
; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
24+
; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
25+
; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
26+
; GFX942-NEXT: s_nop 1
27+
; GFX942-NEXT: v_mfma_f32_16x16x8_xf32 a[0:3], v[0:1], v[2:3], a[0:3] cbsz:1 abid:2 blgp:3
28+
; GFX942-NEXT: s_nop 6
29+
; GFX942-NEXT: global_store_dwordx4 v4, a[0:3], s[6:7]
30+
; GFX942-NEXT: s_endpgm
2031
bb:
2132
%in.1 = load <4 x float>, ptr addrspace(1) %arg
2233
%mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <4 x float> %in.1, i32 1, i32 2, i32 3)
2334
store <4 x float> %mai.1, ptr addrspace(1) %arg
2435
ret void
2536
}
2637

27-
; GCN-LABEL: {{^}}test_mfma_f32_32x32x4xf32:
28-
; GFX942-DAG: v_mov_b32_e32 v[[ONE:[0-9]+]], 1.0
29-
; GFX942-DAG: v_mov_b32_e32 v[[TWO:[0-9]+]], 2.0
30-
; GFX942-DAG: v_mov_b32_e32 v[[THREE:[0-9]+]], 0x40400000
31-
; GFX942-DAG: v_mov_b32_e32 v[[FOUR:[0-9]+]], 4.0
32-
; GCN-COUNT-4: v_accvgpr_write_b32 a{{[0-9]+}}, s{{[0-9]+}}
33-
; GFX942: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[[[ONE]]:[[TWO]]], v[[[THREE]]:[[FOUR]]], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
34-
; GISEL: v_mfma_f32_32x32x4_xf32 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}] cbsz:1 abid:2 blgp:3
35-
; GCN-NOT: v_accvgpr_read_b32
36-
; GCN: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
3738
define amdgpu_kernel void @test_mfma_f32_32x32x4xf32(ptr addrspace(1) %arg) #0 {
39+
; GFX942-LABEL: test_mfma_f32_32x32x4xf32:
40+
; GFX942: ; %bb.0: ; %bb
41+
; GFX942-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x24
42+
; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
43+
; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
44+
; GFX942-NEXT: v_mov_b32_e32 v2, 0x40400000
45+
; GFX942-NEXT: v_mov_b32_e32 v3, 4.0
46+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
47+
; GFX942-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
48+
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
49+
; GFX942-NEXT: v_accvgpr_write_b32 a0, s0
50+
; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
51+
; GFX942-NEXT: v_accvgpr_write_b32 a2, s2
52+
; GFX942-NEXT: v_accvgpr_write_b32 a3, s3
53+
; GFX942-NEXT: v_accvgpr_write_b32 a4, s4
54+
; GFX942-NEXT: v_accvgpr_write_b32 a5, s5
55+
; GFX942-NEXT: v_accvgpr_write_b32 a6, s6
56+
; GFX942-NEXT: v_accvgpr_write_b32 a7, s7
57+
; GFX942-NEXT: v_accvgpr_write_b32 a8, s8
58+
; GFX942-NEXT: v_accvgpr_write_b32 a9, s9
59+
; GFX942-NEXT: v_accvgpr_write_b32 a10, s10
60+
; GFX942-NEXT: v_accvgpr_write_b32 a11, s11
61+
; GFX942-NEXT: v_accvgpr_write_b32 a12, s12
62+
; GFX942-NEXT: v_accvgpr_write_b32 a13, s13
63+
; GFX942-NEXT: v_accvgpr_write_b32 a14, s14
64+
; GFX942-NEXT: v_accvgpr_write_b32 a15, s15
65+
; GFX942-NEXT: s_nop 1
66+
; GFX942-NEXT: v_mfma_f32_32x32x4_xf32 a[0:15], v[0:1], v[2:3], a[0:15] cbsz:1 abid:2 blgp:3
67+
; GFX942-NEXT: v_mov_b32_e32 v0, 0
68+
; GFX942-NEXT: s_nop 7
69+
; GFX942-NEXT: s_nop 1
70+
; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[16:17] offset:48
71+
; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[16:17] offset:32
72+
; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[16:17] offset:16
73+
; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[16:17]
74+
; GFX942-NEXT: s_endpgm
3875
bb:
3976
%in.1 = load <16 x float>, ptr addrspace(1) %arg
4077
%mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> <float 1.0, float 2.0>, <2 x float> <float 3.0, float 4.0>, <16 x float> %in.1, i32 1, i32 2, i32 3)
@@ -43,3 +80,6 @@ bb:
4380
}
4481

4582
attributes #0 = { "amdgpu-flat-work-group-size"="1,256" }
83+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
84+
; GCN: {{.*}}
85+
; GISEL: {{.*}}

0 commit comments

Comments
 (0)