From f3bc55c8f3a1bb0aabbed5e110831278f2d20ec5 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Sat, 19 Jul 2025 20:11:13 +0200 Subject: [PATCH 1/7] [mlir][amdgpu] Add `amdgpu.waitcnt` wrapper The main motivation is to pass vmcnt/expcnt/lgkmcnt values directly and delegate architecture-dependent bitpacking to the amdgpu->rocdl lowering. Only gfx9 bitpacking support is added as part of this commit. --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 20 +++++++ .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 52 +++++++++++++++++-- .../Conversion/AMDGPUToROCDL/waitcnt.mlir | 20 +++++++ mlir/test/Dialect/AMDGPU/ops.mlir | 13 +++++ 4 files changed, 102 insertions(+), 3 deletions(-) create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 5a53b15a9c679..7fe1ef37e1f9b 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -719,6 +719,26 @@ def AMDGPU_SchedBarrierOp : }]; } +def AMDGPU_WaitcntOp : + AMDGPU_Op<"waitcnt">, + Arguments<(ins + OptionalAttr:$vmcnt, + OptionalAttr:$expcnt, + OptionalAttr:$lgkmcnt + )> + { + let summary = "Wrapper on ROCDL SWaitcntOp"; + let description = [{ + Covenience wrapper on `rocdl.s.waitcnt`. Hides the architecture specific + bitpacking from user. Missing values will be assumed maximum values supported + by the architecture. Large values will also be clamped to the maximum + supported values. + }]; + let assemblyFormat = [{ + (`vmcnt` `(` $vmcnt^ `)` )? (`expcnt` `(` $expcnt^ `)` )? (`lgkmcnt` `(` $lgkmcnt^ `)`)? 
attr-dict + }]; +} + def AMDGPU_MFMAPermB : I32EnumAttr<"MFMAPermB", "The possible permutations of the lanes storing B available in an MFMA", [ diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index ef35ee208f002..af588d5b70a45 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -419,6 +419,52 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern { } }; +// TODO: AMDGPU backend already have all this bitpacking logic, we should move +// it to some common place. +static FailureOr encodeWaitcnt(Chipset chipset, unsigned vmcnt, + unsigned expcnt, unsigned lgkmcnt) { + if (chipset.majorVersion == 9) { + vmcnt = std::min(63u, vmcnt); + expcnt = std::min(7u, expcnt); + lgkmcnt = std::min(15u, lgkmcnt); + unsigned lowBits = vmcnt & 0xF; + unsigned highBits = (vmcnt >> 4) << 14; + unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8); + return lowBits | highBits | otherCnts; + } + return failure(); +} + +struct WaitcntOpLowering : public ConvertOpToLLVMPattern { + WaitcntOpLowering(const LLVMTypeConverter &converter, Chipset chipset) + : ConvertOpToLLVMPattern(converter), chipset(chipset) {} + + Chipset chipset; + + LogicalResult + matchAndRewrite(WaitcntOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto getVal = [](Attribute attr) -> unsigned { + if (attr) + return cast(attr).getInt(); + + // This value will be clamped to the maximum value for the chipset. 
+ return 1024 * 1024; + }; + unsigned vmcnt = getVal(adaptor.getVmcntAttr()); + unsigned expcnt = getVal(adaptor.getExpcntAttr()); + unsigned lgkmcnt = getVal(adaptor.getLgkmcntAttr()); + + FailureOr waitcnt = + encodeWaitcnt(chipset, vmcnt, expcnt, lgkmcnt); + if (failed(waitcnt)) + return op.emitOpError("unsupported chipset"); + + rewriter.replaceOpWithNewOp(op, *waitcnt); + return success(); + } +}; + struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern { LDSBarrierOpLowering(const LLVMTypeConverter &converter, Chipset chipset) : ConvertOpToLLVMPattern(converter), chipset(chipset) {} @@ -1825,9 +1871,9 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, ROCDL::RawPtrBufferAtomicUminOp>, RawBufferOpLowering, - AMDGPUDPPLowering, LDSBarrierOpLowering, SchedBarrierOpLowering, - MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering, - ExtPackedFp8OpLowering, ScaledExtPackedOpLowering, + AMDGPUDPPLowering, WaitcntOpLowering, LDSBarrierOpLowering, + SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering, + WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering, PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering, PackedStochRoundFp8OpLowering, GatherToLDSOpLowering, TransposeLoadOpLowering>(converter, chipset); diff --git a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir new file mode 100644 index 0000000000000..9c785670198ae --- /dev/null +++ b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir @@ -0,0 +1,20 @@ +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9 +// TODO: Add more chipsets support + + +// CHECK-LABEL: func @waitcnt +func.func @waitcnt() { + // GFX9: rocdl.s.waitcnt 53119 + amdgpu.waitcnt + + // GFX9: rocdl.s.waitcnt 3952 + amdgpu.waitcnt vmcnt(0) + + // GFX9: rocdl.s.waitcnt 53007 + amdgpu.waitcnt expcnt(0) + + // GFX9: rocdl.s.waitcnt 49279 + amdgpu.waitcnt lgkmcnt(0) + + return +} diff --git 
a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index fe2b32be04de4..086b5884be5c7 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -548,3 +548,16 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, % amdgpu.gather_to_lds %mem1[%idx1], %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>, memref<32x32xf16, #gpu.address_space> func.return } + +// CHECK-LABEL: func @waitcnt +func.func @waitcnt() { + // CHECK: amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3) + // CHECK: amdgpu.waitcnt vmcnt(1) + // CHECK: amdgpu.waitcnt expcnt(2) + // CHECK: amdgpu.waitcnt lgkmcnt(3) + amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3) + amdgpu.waitcnt vmcnt(1) + amdgpu.waitcnt expcnt(2) + amdgpu.waitcnt lgkmcnt(3) + func.return +} From 5320853675ede402b99d9fbcc4446134e80ad12b Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Sun, 20 Jul 2025 10:29:25 +0200 Subject: [PATCH 2/7] more chipsets Signed-off-by: Ivan Butygin --- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 30 +++++++++++++++++++ .../Conversion/AMDGPUToROCDL/waitcnt.mlir | 11 ++++++- 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index af588d5b70a45..1940ef8775688 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -421,8 +421,23 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern { // TODO: AMDGPU backend already have all this bitpacking logic, we should move // it to some common place. 
+/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows: +/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9) +/// \p Vmcnt = \p Waitcnt[15:14,3:0] (gfx9,10) +/// \p Vmcnt = \p Waitcnt[15:10] (gfx11) +/// \p Expcnt = \p Waitcnt[6:4] (pre-gfx11) +/// \p Expcnt = \p Waitcnt[2:0] (gfx11) +/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10) +/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10) +/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11) static FailureOr encodeWaitcnt(Chipset chipset, unsigned vmcnt, unsigned expcnt, unsigned lgkmcnt) { + if (chipset.majorVersion < 9) { + vmcnt = std::min(15u, vmcnt); + expcnt = std::min(7u, expcnt); + lgkmcnt = std::min(15u, lgkmcnt); + return vmcnt | (expcnt << 4) | (lgkmcnt << 8); + } if (chipset.majorVersion == 9) { vmcnt = std::min(63u, vmcnt); expcnt = std::min(7u, expcnt); @@ -432,6 +447,21 @@ static FailureOr encodeWaitcnt(Chipset chipset, unsigned vmcnt, unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8); return lowBits | highBits | otherCnts; } + if (chipset.majorVersion == 10) { + vmcnt = std::min(63u, vmcnt); + expcnt = std::min(7u, expcnt); + lgkmcnt = std::min(63u, lgkmcnt); + unsigned lowBits = vmcnt & 0xF; + unsigned highBits = (vmcnt >> 4) << 14; + unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8); + return lowBits | highBits | otherCnts; + } + if (chipset.majorVersion == 11) { + vmcnt = std::min(63u, vmcnt); + expcnt = std::min(7u, expcnt); + lgkmcnt = std::min(63u, lgkmcnt); + return (vmcnt << 10) | expcnt | (lgkmcnt << 4); + } return failure(); } diff --git a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir index 9c785670198ae..71617df05eb60 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir @@ -1,19 +1,28 @@ // RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9 -// TODO: Add more chipsets support +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | 
FileCheck %s --check-prefixes=CHECK,GFX10 +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11 // CHECK-LABEL: func @waitcnt func.func @waitcnt() { // GFX9: rocdl.s.waitcnt 53119 + // GFX10: rocdl.s.waitcnt 65407 + // GFX11: rocdl.s.waitcnt 65527 amdgpu.waitcnt // GFX9: rocdl.s.waitcnt 3952 + // GFX10: rocdl.s.waitcnt 16240 + // GFX11: rocdl.s.waitcnt 1015 amdgpu.waitcnt vmcnt(0) // GFX9: rocdl.s.waitcnt 53007 + // GFX10: rocdl.s.waitcnt 65295 + // GFX11: rocdl.s.waitcnt 65520 amdgpu.waitcnt expcnt(0) // GFX9: rocdl.s.waitcnt 49279 + // GFX10: rocdl.s.waitcnt 49279 + // GFX11: rocdl.s.waitcnt 64519 amdgpu.waitcnt lgkmcnt(0) return From a8569157e2995cad79515eb2206ea6756f7bc5d6 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Mon, 21 Jul 2025 22:22:12 +0200 Subject: [PATCH 3/7] oilist Signed-off-by: Ivan Butygin --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +- mlir/test/Dialect/AMDGPU/ops.mlir | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 7fe1ef37e1f9b..481cebdf30852 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -735,7 +735,7 @@ def AMDGPU_WaitcntOp : supported values. }]; let assemblyFormat = [{ - (`vmcnt` `(` $vmcnt^ `)` )? (`expcnt` `(` $expcnt^ `)` )? (`lgkmcnt` `(` $lgkmcnt^ `)`)? 
attr-dict + oilist( `vmcnt` `(` $vmcnt `)` | `expcnt` `(` $expcnt `)` | `lgkmcnt` `(` $lgkmcnt `)` ) attr-dict }]; } diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 086b5884be5c7..82dd2bec248a7 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -552,10 +552,12 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, % // CHECK-LABEL: func @waitcnt func.func @waitcnt() { // CHECK: amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3) + // CHECK: amdgpu.waitcnt vmcnt(3) expcnt(2) lgkmcnt(1) // CHECK: amdgpu.waitcnt vmcnt(1) // CHECK: amdgpu.waitcnt expcnt(2) // CHECK: amdgpu.waitcnt lgkmcnt(3) amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3) + amdgpu.waitcnt lgkmcnt(1) expcnt(2) vmcnt(3) amdgpu.waitcnt vmcnt(1) amdgpu.waitcnt expcnt(2) amdgpu.waitcnt lgkmcnt(3) From 811633e21a309c2b97ba342fa7ec1a9a2f955884 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Tue, 22 Jul 2025 19:37:52 +0200 Subject: [PATCH 4/7] switch to new api --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 25 ++++++----- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 42 ++++++++++++++----- .../AMDGPUToROCDL/memory_counter_wait.mlir | 42 +++++++++++++++++++ .../Conversion/AMDGPUToROCDL/waitcnt.mlir | 29 ------------- mlir/test/Dialect/AMDGPU/ops.mlir | 26 ++++++------ 5 files changed, 102 insertions(+), 62 deletions(-) create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir delete mode 100644 mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 481cebdf30852..b237f7b5749e7 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -719,23 +719,26 @@ def AMDGPU_SchedBarrierOp : }]; } -def AMDGPU_WaitcntOp : - AMDGPU_Op<"waitcnt">, +def AMDGPU_MemoryCounterWaitOp : + AMDGPU_Op<"memory_counter_wait">, Arguments<(ins - 
OptionalAttr:$vmcnt, - OptionalAttr:$expcnt, - OptionalAttr:$lgkmcnt + OptionalAttr:$load, + OptionalAttr:$store, + OptionalAttr:$ds, + OptionalAttr:$exp )> { - let summary = "Wrapper on ROCDL SWaitcntOp"; + let summary = "Wait for specified hardware counters"; let description = [{ - Covenience wrapper on `rocdl.s.waitcnt`. Hides the architecture specific - bitpacking from user. Missing values will be assumed maximum values supported - by the architecture. Large values will also be clamped to the maximum - supported values. + Wait for the specified counters to be less-than or equal-to the provided + values before continuing. + + Counters can lower to different instructions on different architectures, + including clamping to some HW-supported max value or combining multiple + counters into one. }]; let assemblyFormat = [{ - oilist( `vmcnt` `(` $vmcnt `)` | `expcnt` `(` $expcnt `)` | `lgkmcnt` `(` $lgkmcnt `)` ) attr-dict + oilist( `load` `(` $load `)` | `store` `(` $store `)` | `ds` `(` $ds `)` | `exp` `(` $exp `)` ) attr-dict }]; } diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 1940ef8775688..057dfced09087 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -465,15 +465,35 @@ static FailureOr encodeWaitcnt(Chipset chipset, unsigned vmcnt, return failure(); } -struct WaitcntOpLowering : public ConvertOpToLLVMPattern { - WaitcntOpLowering(const LLVMTypeConverter &converter, Chipset chipset) - : ConvertOpToLLVMPattern(converter), chipset(chipset) {} +struct MemoryCounterWaitOpLowering + : public ConvertOpToLLVMPattern { + MemoryCounterWaitOpLowering(const LLVMTypeConverter &converter, + Chipset chipset) + : ConvertOpToLLVMPattern(converter), + chipset(chipset) {} Chipset chipset; LogicalResult - matchAndRewrite(WaitcntOp op, OpAdaptor adaptor, 
ConversionPatternRewriter &rewriter) const override { + if (chipset.majorVersion >= 12) { + Location loc = op.getLoc(); + if (auto ds = adaptor.getDs()) + rewriter.create(loc, *ds); + + if (auto load = adaptor.getLoad()) + rewriter.create(loc, *load); + + if (auto store = adaptor.getStore()) + rewriter.create(loc, *store); + + if (auto exp = adaptor.getExp()) + rewriter.create(loc, *exp); + + return success(); + } + auto getVal = [](Attribute attr) -> unsigned { if (attr) return cast(attr).getInt(); @@ -481,12 +501,14 @@ struct WaitcntOpLowering : public ConvertOpToLLVMPattern { // This value will be clamped to the maximum value for the chipset. return 1024 * 1024; }; - unsigned vmcnt = getVal(adaptor.getVmcntAttr()); - unsigned expcnt = getVal(adaptor.getExpcntAttr()); - unsigned lgkmcnt = getVal(adaptor.getLgkmcntAttr()); + unsigned ds = getVal(adaptor.getDsAttr()); + unsigned load = getVal(adaptor.getLoadAttr()); + unsigned store = getVal(adaptor.getStoreAttr()); + unsigned exp = getVal(adaptor.getExpAttr()); + + unsigned vmcnt = std::min(load, store); - FailureOr waitcnt = - encodeWaitcnt(chipset, vmcnt, expcnt, lgkmcnt); + FailureOr waitcnt = encodeWaitcnt(chipset, vmcnt, exp, ds); if (failed(waitcnt)) return op.emitOpError("unsupported chipset"); @@ -1901,7 +1923,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter, ROCDL::RawPtrBufferAtomicUminOp>, RawBufferOpLowering, - AMDGPUDPPLowering, WaitcntOpLowering, LDSBarrierOpLowering, + AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering, SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering, PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering, diff --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir new file mode 100644 index 0000000000000..1016ee859e462 --- /dev/null +++ 
b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir @@ -0,0 +1,42 @@ +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9 +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10 +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11 +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12 + +// CHECK-LABEL: func @memory_counter_wait +func.func @memory_counter_wait() { + // GFX9: rocdl.s.waitcnt 53119 + // GFX10: rocdl.s.waitcnt 65407 + // GFX11: rocdl.s.waitcnt 65527 + // GFX12-NOT: rocdl.s.wait.loadcnt + // GFX12-NOT: rocdl.s.wait.storecnt + // GFX12-NOT: rocdl.s.wait.expcnt + // GFX12-NOT: rocdl.s.wait.dscnt + amdgpu.memory_counter_wait + + // GFX9: rocdl.s.waitcnt 3952 + // GFX10: rocdl.s.waitcnt 16240 + // GFX11: rocdl.s.waitcnt 1015 + // GFX12: rocdl.s.wait.loadcnt 0 + amdgpu.memory_counter_wait load(0) + + // GFX9: rocdl.s.waitcnt 3952 + // GFX10: rocdl.s.waitcnt 16240 + // GFX11: rocdl.s.waitcnt 1015 + // GFX12: rocdl.s.wait.storecnt 0 + amdgpu.memory_counter_wait store(0) + + // GFX9: rocdl.s.waitcnt 53007 + // GFX10: rocdl.s.waitcnt 65295 + // GFX11: rocdl.s.waitcnt 65520 + // GFX12: rocdl.s.wait.expcnt 0 + amdgpu.memory_counter_wait exp(0) + + // GFX9: rocdl.s.waitcnt 49279 + // GFX10: rocdl.s.waitcnt 49279 + // GFX11: rocdl.s.waitcnt 64519 + // GFX12: rocdl.s.wait.dscnt 0 + amdgpu.memory_counter_wait ds(0) + + return +} diff --git a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir deleted file mode 100644 index 71617df05eb60..0000000000000 --- a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir +++ /dev/null @@ -1,29 +0,0 @@ -// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9 -// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck 
%s --check-prefixes=CHECK,GFX10 -// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11 - - -// CHECK-LABEL: func @waitcnt -func.func @waitcnt() { - // GFX9: rocdl.s.waitcnt 53119 - // GFX10: rocdl.s.waitcnt 65407 - // GFX11: rocdl.s.waitcnt 65527 - amdgpu.waitcnt - - // GFX9: rocdl.s.waitcnt 3952 - // GFX10: rocdl.s.waitcnt 16240 - // GFX11: rocdl.s.waitcnt 1015 - amdgpu.waitcnt vmcnt(0) - - // GFX9: rocdl.s.waitcnt 53007 - // GFX10: rocdl.s.waitcnt 65295 - // GFX11: rocdl.s.waitcnt 65520 - amdgpu.waitcnt expcnt(0) - - // GFX9: rocdl.s.waitcnt 49279 - // GFX10: rocdl.s.waitcnt 49279 - // GFX11: rocdl.s.waitcnt 64519 - amdgpu.waitcnt lgkmcnt(0) - - return -} diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir index 82dd2bec248a7..fe78b5365745a 100644 --- a/mlir/test/Dialect/AMDGPU/ops.mlir +++ b/mlir/test/Dialect/AMDGPU/ops.mlir @@ -549,17 +549,19 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, % func.return } -// CHECK-LABEL: func @waitcnt -func.func @waitcnt() { - // CHECK: amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3) - // CHECK: amdgpu.waitcnt vmcnt(3) expcnt(2) lgkmcnt(1) - // CHECK: amdgpu.waitcnt vmcnt(1) - // CHECK: amdgpu.waitcnt expcnt(2) - // CHECK: amdgpu.waitcnt lgkmcnt(3) - amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3) - amdgpu.waitcnt lgkmcnt(1) expcnt(2) vmcnt(3) - amdgpu.waitcnt vmcnt(1) - amdgpu.waitcnt expcnt(2) - amdgpu.waitcnt lgkmcnt(3) +// CHECK-LABEL: func @memory_counter_wait +func.func @memory_counter_wait() { + // CHECK: amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4) + // CHECK: amdgpu.memory_counter_wait load(4) store(2) ds(3) exp(1) + // CHECK: amdgpu.memory_counter_wait load(1) + // CHECK: amdgpu.memory_counter_wait store(2) + // CHECK: amdgpu.memory_counter_wait ds(3) + // CHECK: amdgpu.memory_counter_wait exp(4) + amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4) + amdgpu.memory_counter_wait exp(1) 
store(2) ds(3) load(4) + amdgpu.memory_counter_wait load(1) + amdgpu.memory_counter_wait store(2) + amdgpu.memory_counter_wait ds(3) + amdgpu.memory_counter_wait exp(4) func.return } From b02a25cb8a0746340d1b1bf826f6f42a3f880162 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Tue, 22 Jul 2025 19:41:42 +0200 Subject: [PATCH 5/7] erase op --- mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 057dfced09087..0501aa968347f 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -491,6 +491,7 @@ struct MemoryCounterWaitOpLowering if (auto exp = adaptor.getExp()) rewriter.create(loc, *exp); + rewriter.eraseOp(op); return success(); } From 0660da5c64bcba3b6f3e52d8ed8553607566b39f Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Tue, 22 Jul 2025 20:46:08 +0200 Subject: [PATCH 6/7] comments --- .../AMDGPUToROCDL/AMDGPUToROCDL.cpp | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 0501aa968347f..93d220f56026b 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -421,15 +421,15 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern { // TODO: AMDGPU backend already have all this bitpacking logic, we should move // it to some common place. 
-/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows: -/// \p Vmcnt = \p Waitcnt[3:0] (pre-gfx9) -/// \p Vmcnt = \p Waitcnt[15:14,3:0] (gfx9,10) -/// \p Vmcnt = \p Waitcnt[15:10] (gfx11) -/// \p Expcnt = \p Waitcnt[6:4] (pre-gfx11) -/// \p Expcnt = \p Waitcnt[2:0] (gfx11) -/// \p Lgkmcnt = \p Waitcnt[11:8] (pre-gfx10) -/// \p Lgkmcnt = \p Waitcnt[13:8] (gfx10) -/// \p Lgkmcnt = \p Waitcnt[9:4] (gfx11) +/// Vmcnt, Expcnt and Lgkmcnt are decoded as follows: +/// Vmcnt = Waitcnt[3:0] (pre-gfx9) +/// Vmcnt = Waitcnt[15:14,3:0] (gfx9,10) +/// Vmcnt = Waitcnt[15:10] (gfx11) +/// Expcnt = Waitcnt[6:4] (pre-gfx11) +/// Expcnt = Waitcnt[2:0] (gfx11) +/// Lgkmcnt = Waitcnt[11:8] (pre-gfx10) +/// Lgkmcnt = Waitcnt[13:8] (gfx10) +/// Lgkmcnt = Waitcnt[9:4] (gfx11) static FailureOr encodeWaitcnt(Chipset chipset, unsigned vmcnt, unsigned expcnt, unsigned lgkmcnt) { if (chipset.majorVersion < 9) { @@ -479,16 +479,16 @@ struct MemoryCounterWaitOpLowering ConversionPatternRewriter &rewriter) const override { if (chipset.majorVersion >= 12) { Location loc = op.getLoc(); - if (auto ds = adaptor.getDs()) + if (std::optional ds = adaptor.getDs()) rewriter.create(loc, *ds); - if (auto load = adaptor.getLoad()) + if (std::optional load = adaptor.getLoad()) rewriter.create(loc, *load); - if (auto store = adaptor.getStore()) + if (std::optional store = adaptor.getStore()) rewriter.create(loc, *store); - if (auto exp = adaptor.getExp()) + if (std::optional exp = adaptor.getExp()) rewriter.create(loc, *exp); rewriter.eraseOp(op); From 611f679f10dd2c922eb3fdfdea840e46ebaadd26 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Tue, 22 Jul 2025 21:23:41 +0200 Subject: [PATCH 7/7] add load and store --- .../Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index 93d220f56026b..309476ca7136a 100644 
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -500,14 +500,21 @@ struct MemoryCounterWaitOpLowering return cast(attr).getInt(); // This value will be clamped to the maximum value for the chipset. - return 1024 * 1024; + return 1024; }; unsigned ds = getVal(adaptor.getDsAttr()); - unsigned load = getVal(adaptor.getLoadAttr()); - unsigned store = getVal(adaptor.getStoreAttr()); unsigned exp = getVal(adaptor.getExpAttr()); - unsigned vmcnt = std::min(load, store); + unsigned vmcnt = 1024; + Attribute load = adaptor.getLoadAttr(); + Attribute store = adaptor.getStoreAttr(); + if (load && store) { + vmcnt = getVal(load) + getVal(store); + } else if (load) { + vmcnt = getVal(load); + } else if (store) { + vmcnt = getVal(store); + } FailureOr waitcnt = encodeWaitcnt(chipset, vmcnt, exp, ds); if (failed(waitcnt))