@@ -369,31 +369,68 @@ multiclass FLAT_Global_Store_Pseudo_t16<string opName> {
369
369
}
370
370
}
371
371
372
- class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
372
+ // Async loads, introduced in gfx1250, will store directly
373
+ // to a DS address in vdst (they will not use M0 for DS address).
374
+ class FLAT_Global_Load_LDS_Pseudo <string opName, bit EnableSaddr = 0, bit IsAsync = 0> : FLAT_Pseudo<
373
375
opName,
374
376
(outs ),
375
377
!con(
376
- !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
377
- (ins flat_offset:$offset, CPol_0:$cpol)),
378
- " $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
379
- let LGKM_CNT = 1;
378
+ !if(IsAsync, (ins VGPR_32:$vdst), (ins)),
379
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)),
380
+ (ins flat_offset:$offset, CPol_0:$cpol)),
381
+ !if(IsAsync, " $vdst,", "")#" $vaddr"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
382
+ let LGKM_CNT = !not(IsAsync);
383
+ let VM_CNT = !not(IsAsync);
384
+ let ASYNC_CNT = IsAsync;
380
385
let is_flat_global = 1;
381
386
let lds = 1;
382
387
let has_data = 0;
388
+ let has_vdst = IsAsync; // vdst for ds address with IsAsync
389
+ let mayLoad = 1;
390
+ let mayStore = 1;
391
+ let has_saddr = 1;
392
+ let enabled_saddr = EnableSaddr;
393
+ let VALU = 1;
394
+ let PseudoInstr = opName#!if(EnableSaddr, "_SADDR", "");
395
+ let Uses = !if(IsAsync, [EXEC, ASYNCcnt], [M0, EXEC]);
396
+ let Defs = !if(IsAsync, [ASYNCcnt], []);
397
+ let SchedRW = [WriteVMEM, WriteLDS];
398
+ }
399
+
400
+ multiclass FLAT_Global_Load_LDS_Pseudo<string opName, bit IsAsync = 0> {
401
+ def "" : FLAT_Global_Load_LDS_Pseudo<opName, 0, IsAsync>,
402
+ GlobalSaddrTable<0, opName>;
403
+ def _SADDR : FLAT_Global_Load_LDS_Pseudo<opName, 1, IsAsync>,
404
+ GlobalSaddrTable<1, opName>;
405
+ }
406
+
407
+ class FLAT_Global_STORE_LDS_Pseudo <string opName, bit EnableSaddr = 0> : FLAT_Pseudo<
408
+ opName,
409
+ (outs ),
410
+ !con(
411
+ !if(EnableSaddr, (ins SReg_64:$saddr, VGPR_32:$vaddr), (ins VReg_64:$vaddr)), (ins VGPR_32:$vdata),
412
+ (ins flat_offset:$offset, CPol_0:$cpol)),
413
+ " $vaddr, $vdata"#!if(EnableSaddr, ", $saddr", ", off")#"$offset$cpol"> {
414
+ let VM_CNT = 0;
415
+ let ASYNC_CNT = 1;
416
+ let is_flat_global = 1;
417
+ let lds = 1;
418
+ let has_data = 1; // vdata for ds address
383
419
let has_vdst = 0;
384
420
let mayLoad = 1;
385
421
let mayStore = 1;
386
422
let has_saddr = 1;
387
423
let enabled_saddr = EnableSaddr;
388
424
let VALU = 1;
389
- let Uses = [M0, EXEC];
425
+ let Uses = [EXEC, ASYNCcnt];
426
+ let Defs = [ASYNCcnt];
390
427
let SchedRW = [WriteVMEM, WriteLDS];
391
428
}
392
429
393
- multiclass FLAT_Global_Load_LDS_Pseudo <string opName> {
394
- def "" : FLAT_Global_Load_LDS_Pseudo <opName>,
430
+ multiclass FLAT_Global_STORE_LDS_Pseudo <string opName> {
431
+ def "" : FLAT_Global_STORE_LDS_Pseudo <opName>,
395
432
GlobalSaddrTable<0, opName>;
396
- def _SADDR : FLAT_Global_Load_LDS_Pseudo <opName, 1>,
433
+ def _SADDR : FLAT_Global_STORE_LDS_Pseudo <opName, 1>,
397
434
GlobalSaddrTable<1, opName>;
398
435
}
399
436
@@ -1156,6 +1193,15 @@ let SubtargetPredicate = isGFX12Plus in {
1156
1193
1157
1194
let SubtargetPredicate = isGFX1250Plus in {
1158
1195
1196
+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b8", 1>;
1197
+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b32", 1>;
1198
+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b64", 1>;
1199
+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : FLAT_Global_Load_LDS_Pseudo<"global_load_async_to_lds_b128", 1>;
1200
+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b8">;
1201
+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b32">;
1202
+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b64">;
1203
+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : FLAT_Global_STORE_LDS_Pseudo<"global_store_async_from_lds_b128">;
1204
+
1159
1205
def TENSOR_SAVE : FLAT_Global_Tensor_Pseudo<"tensor_save", 1>;
1160
1206
def TENSOR_STOP : FLAT_Global_Tensor_Pseudo<"tensor_stop">;
1161
1207
} // End SubtargetPredicate = isGFX1250Plus
@@ -3374,6 +3420,15 @@ defm GLOBAL_LOAD_MONITOR_B32 : VFLAT_Real_AllAddr_gfx1250<0x070>;
3374
3420
defm GLOBAL_LOAD_MONITOR_B64 : VFLAT_Real_AllAddr_gfx1250<0x071>;
3375
3421
defm GLOBAL_LOAD_MONITOR_B128 : VFLAT_Real_AllAddr_gfx1250<0x072>;
3376
3422
3423
+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x5f>;
3424
+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x60>;
3425
+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x61>;
3426
+ defm GLOBAL_LOAD_ASYNC_TO_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x62>;
3427
+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B8 : VFLAT_Real_AllAddr_gfx1250<0x63>;
3428
+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B32 : VFLAT_Real_AllAddr_gfx1250<0x64>;
3429
+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B64 : VFLAT_Real_AllAddr_gfx1250<0x65>;
3430
+ defm GLOBAL_STORE_ASYNC_FROM_LDS_B128 : VFLAT_Real_AllAddr_gfx1250<0x66>;
3431
+
3377
3432
defm GLOBAL_LOAD_TR_B128_w32 : VFLAT_Real_AllAddr_gfx1250<0x057, "global_load_tr16_b128">;
3378
3433
defm GLOBAL_LOAD_TR_B64_w32 : VFLAT_Real_AllAddr_gfx1250<0x058, "global_load_tr8_b64">;
3379
3434
0 commit comments