Skip to content

Commit 67e2faa

Browse files
authored
[AMDGPU] MC support for async load and store on gfx1250 (#151030)
1 parent 379949d commit 67e2faa

File tree

5 files changed

+520
-10
lines changed

5 files changed

+520
-10
lines changed

llvm/lib/Target/AMDGPU/FLATInstructions.td

Lines changed: 64 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -369,31 +369,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> {
369369
}
370370
}
371371

372-
class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
372+
// Async loads, introduced in gfx1250, will store directly
373+
// to a DS address in vdst (they will not use M0 for DS address).
374+
class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo<
373375
opName,
374376
(outs ),
375377
!con(
376-
!if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
377-
(ins flat_offset:$offset, CPol_0:$cpol)),
378-
" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
379-
let LGKM_CNT = 1;
378+
!if(IsAsync, (ins VGPR_32:$vdst), (ins)),
379+
!if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
380+
(ins flat_offset:$offset, CPol_0:$cpol)),
381+
!if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
382+
let LGKM_CNT = !not(IsAsync);
383+
let VM_CNT = !not(IsAsync);
384+
let ASYNC_CNT = IsAsync;
380385
let is_flat_global = 1;
381386
let lds = 1;
382387
let has_data = 0;
388+
let has_vdst = IsAsync; // vdst for ds address with IsAsync
389+
let mayLoad = 1;
390+
let mayStore = 1;
391+
let has_saddr = 1;
392+
let enabled_saddr = EnableSaddr;
393+
let VALU = 1;
394+
let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
395+
let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]);
396+
let Defs = !if(IsAsync, [ASYNCcnt], []);
397+
let SchedRW = [WriteVMEM, WriteLDS];
398+
}
399+
400+
multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> {
401+
def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>,
402+
GlobalSaddrTable<0, opName>;
403+
def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>,
404+
GlobalSaddrTable<1, opName>;
405+
}
406+
407+
class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
408+
opName,
409+
(outs ),
410+
!con(
411+
!if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata),
412+
(ins flat_offset:$offset, CPol_0:$cpol)),
413+
" $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
414+
let VM_CNT = 0;
415+
let ASYNC_CNT = 1;
416+
let is_flat_global = 1;
417+
let lds = 1;
418+
let has_data = 1; // vdata for ds address
383419
let has_vdst = 0;
384420
let mayLoad = 1;
385421
let mayStore = 1;
386422
let has_saddr = 1;
387423
let enabled_saddr = EnableSaddr;
388424
let VALU = 1;
389-
let Uses = [M0, EXEC];
425+
let Uses = [EXEC, ASYNCcnt];
426+
let Defs = [ASYNCcnt];
390427
let SchedRW = [WriteVMEM, WriteLDS];
391428
}
392429

393-
multiclass FLAT_Global_Load_LDS_Pseudo<string opName> {
394-
def "" : FLAT_Global_Load_LDS_Pseudo<opName>,
430+
multiclass FLAT_Global_STORE_LDS_Pseudo<string opName> {
431+
def "" : FLAT_Global_STORE_LDS_Pseudo<opName>,
395432
GlobalSaddrTable<0, opName>;
396-
def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1>,
433+
def _SADDR : FLAT_Global_STORE_LDS_Pseudo<opName, 1>,
397434
GlobalSaddrTable<1, opName>;
398435
}
399436

@@ -1156,6 +1193,15 @@ let SubtargetPredicate = isGFX12Plus in {
11561193

11571194
let SubtargetPredicate = isGFX1250Plus in {
11581195

1196+
defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>;
1197+
defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>;
1198+
defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>;
1199+
defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>;
1200+
defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">;
1201+
defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">;
1202+
defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">;
1203+
defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">;
1204+
11591205
def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>;
11601206
def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">;
11611207
} // End SubtargetPredicate = isGFX1250Plus
@@ -3374,6 +3420,15 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
33743420
defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
33753421
defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
33763422

3423+
defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>;
3424+
defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>;
3425+
defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>;
3426+
defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>;
3427+
defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>;
3428+
defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>;
3429+
defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>;
3430+
defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>;
3431+
33773432
defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
33783433
defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;
33793434

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -732,7 +732,14 @@ bool isGenericAtomic(unsigned Opc) {
732732
}
733733

734734
bool isAsyncStore(unsigned Opc) {
735-
return false; // placeholder before async store implementation.
735+
return Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_gfx1250 ||
736+
Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_gfx1250 ||
737+
Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_gfx1250 ||
738+
Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_gfx1250 ||
739+
Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B8_SADDR_gfx1250 ||
740+
Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B32_SADDR_gfx1250 ||
741+
Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B64_SADDR_gfx1250 ||
742+
Opc == GLOBAL_STORE_ASYNC_FROM_LDS_B128_SADDR_gfx1250;
736743
}
737744

738745
bool isTensorStore(unsigned Opc) {

0 commit comments

Comments
 (0)