From f3bc55c8f3a1bb0aabbed5e110831278f2d20ec5 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin@gmail.com>
Date: Sat, 19 Jul 2025 20:11:13 +0200
Subject: [PATCH 1/7] [mlir][amdgpu] Add `amdgpu.waitcnt` wrapper

The main motivation is to pass vmcnt/expcnt/lgkmcnt values directly and delegate architecture-dependent bitpacking to the amdgpu->rocdl lowering.
Only gfx9 bitpacking support is added as part of this commit.
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 20 +++++++
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 52 +++++++++++++++++--
 .../Conversion/AMDGPUToROCDL/waitcnt.mlir     | 20 +++++++
 mlir/test/Dialect/AMDGPU/ops.mlir             | 13 +++++
 4 files changed, 102 insertions(+), 3 deletions(-)
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 5a53b15a9c679..7fe1ef37e1f9b 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -719,6 +719,26 @@ def AMDGPU_SchedBarrierOp :
   }];
 }
 
+def AMDGPU_WaitcntOp :
+  AMDGPU_Op<"waitcnt">,
+  Arguments<(ins
+      OptionalAttr<I32Attr>:$vmcnt,
+      OptionalAttr<I32Attr>:$expcnt,
+      OptionalAttr<I32Attr>:$lgkmcnt
+    )>
+  {
+  let summary = "Wrapper on ROCDL SWaitcntOp";
+  let description = [{
+    Covenience wrapper on `rocdl.s.waitcnt`. Hides the architecture specific
+    bitpacking from user. Missing values will be assumed maximum values supported
+    by the architecture. Large values will also be clamped to the maximum
+    supported values.
+  }];
+  let assemblyFormat = [{
+    (`vmcnt` `(` $vmcnt^ `)` )? (`expcnt` `(` $expcnt^ `)` )? (`lgkmcnt` `(` $lgkmcnt^ `)`)? attr-dict
+  }];
+}
+
 def AMDGPU_MFMAPermB : I32EnumAttr<"MFMAPermB",
     "The possible permutations of the lanes storing B available in an MFMA",
     [
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index ef35ee208f002..af588d5b70a45 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -419,6 +419,52 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
   }
 };
 
+// TODO: AMDGPU backend already have all this bitpacking logic, we should move
+// it to some common place.
+static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
+                                         unsigned expcnt, unsigned lgkmcnt) {
+  if (chipset.majorVersion == 9) {
+    vmcnt = std::min(63u, vmcnt);
+    expcnt = std::min(7u, expcnt);
+    lgkmcnt = std::min(15u, lgkmcnt);
+    unsigned lowBits = vmcnt & 0xF;
+    unsigned highBits = (vmcnt >> 4) << 14;
+    unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
+    return lowBits | highBits | otherCnts;
+  }
+  return failure();
+}
+
+struct WaitcntOpLowering : public ConvertOpToLLVMPattern<WaitcntOp> {
+  WaitcntOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
+      : ConvertOpToLLVMPattern<WaitcntOp>(converter), chipset(chipset) {}
+
+  Chipset chipset;
+
+  LogicalResult
+  matchAndRewrite(WaitcntOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto getVal = [](Attribute attr) -> unsigned {
+      if (attr)
+        return cast<IntegerAttr>(attr).getInt();
+
+      // This value will be clamped to the maximum value for the chipset.
+      return 1024 * 1024;
+    };
+    unsigned vmcnt = getVal(adaptor.getVmcntAttr());
+    unsigned expcnt = getVal(adaptor.getExpcntAttr());
+    unsigned lgkmcnt = getVal(adaptor.getLgkmcntAttr());
+
+    FailureOr<unsigned> waitcnt =
+        encodeWaitcnt(chipset, vmcnt, expcnt, lgkmcnt);
+    if (failed(waitcnt))
+      return op.emitOpError("unsupported chipset");
+
+    rewriter.replaceOpWithNewOp<ROCDL::SWaitcntOp>(op, *waitcnt);
+    return success();
+  }
+};
+
 struct LDSBarrierOpLowering : public ConvertOpToLLVMPattern<LDSBarrierOp> {
   LDSBarrierOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
       : ConvertOpToLLVMPattern<LDSBarrierOp>(converter), chipset(chipset) {}
@@ -1825,9 +1871,9 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                                ROCDL::RawPtrBufferAtomicUminOp>,
            RawBufferOpLowering<RawBufferAtomicCmpswapOp,
                                ROCDL::RawPtrBufferAtomicCmpSwap>,
-           AMDGPUDPPLowering, LDSBarrierOpLowering, SchedBarrierOpLowering,
-           MFMAOpLowering, ScaledMFMAOpLowering, WMMAOpLowering,
-           ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
+           AMDGPUDPPLowering, WaitcntOpLowering, LDSBarrierOpLowering,
+           SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
+           WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
            PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
            PackedStochRoundFp8OpLowering, GatherToLDSOpLowering,
            TransposeLoadOpLowering>(converter, chipset);
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
new file mode 100644
index 0000000000000..9c785670198ae
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
@@ -0,0 +1,20 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
+// TODO: Add more chipsets support
+
+
+// CHECK-LABEL: func @waitcnt
+func.func @waitcnt() {
+  // GFX9: rocdl.s.waitcnt 53119
+  amdgpu.waitcnt
+
+  // GFX9: rocdl.s.waitcnt 3952
+  amdgpu.waitcnt vmcnt(0)
+
+  // GFX9: rocdl.s.waitcnt 53007
+  amdgpu.waitcnt expcnt(0)
+
+  // GFX9: rocdl.s.waitcnt 49279
+  amdgpu.waitcnt lgkmcnt(0)
+
+  return
+}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index fe2b32be04de4..086b5884be5c7 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -548,3 +548,16 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %
   amdgpu.gather_to_lds %mem1[%idx1],        %smem2[%idx1, %idx2] : vector<2xf16>, memref<32xf16>,    memref<32x32xf16, #gpu.address_space<workgroup>>
   func.return
 }
+
+// CHECK-LABEL: func @waitcnt
+func.func @waitcnt() {
+  // CHECK: amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+  // CHECK: amdgpu.waitcnt vmcnt(1)
+  // CHECK: amdgpu.waitcnt expcnt(2)
+  // CHECK: amdgpu.waitcnt lgkmcnt(3)
+  amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+  amdgpu.waitcnt vmcnt(1)
+  amdgpu.waitcnt expcnt(2)
+  amdgpu.waitcnt lgkmcnt(3)
+  func.return
+}

From 5320853675ede402b99d9fbcc4446134e80ad12b Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin@gmail.com>
Date: Sun, 20 Jul 2025 10:29:25 +0200
Subject: [PATCH 2/7] more chipsets

Signed-off-by: Ivan Butygin <ivan.butygin@gmail.com>
---
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 30 +++++++++++++++++++
 .../Conversion/AMDGPUToROCDL/waitcnt.mlir     | 11 ++++++-
 2 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index af588d5b70a45..1940ef8775688 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -421,8 +421,23 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
 
 // TODO: AMDGPU backend already have all this bitpacking logic, we should move
 // it to some common place.
+/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
+///     \p Vmcnt = \p Waitcnt[3:0]        (pre-gfx9)
+///     \p Vmcnt = \p Waitcnt[15:14,3:0]  (gfx9,10)
+///     \p Vmcnt = \p Waitcnt[15:10]      (gfx11)
+///     \p Expcnt = \p Waitcnt[6:4]       (pre-gfx11)
+///     \p Expcnt = \p Waitcnt[2:0]       (gfx11)
+///     \p Lgkmcnt = \p Waitcnt[11:8]     (pre-gfx10)
+///     \p Lgkmcnt = \p Waitcnt[13:8]     (gfx10)
+///     \p Lgkmcnt = \p Waitcnt[9:4]      (gfx11)
 static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
                                          unsigned expcnt, unsigned lgkmcnt) {
+  if (chipset.majorVersion < 9) {
+    vmcnt = std::min(15u, vmcnt);
+    expcnt = std::min(7u, expcnt);
+    lgkmcnt = std::min(15u, lgkmcnt);
+    return vmcnt | (expcnt << 4) | (lgkmcnt << 8);
+  }
   if (chipset.majorVersion == 9) {
     vmcnt = std::min(63u, vmcnt);
     expcnt = std::min(7u, expcnt);
@@ -432,6 +447,21 @@ static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
     unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
     return lowBits | highBits | otherCnts;
   }
+  if (chipset.majorVersion == 10) {
+    vmcnt = std::min(63u, vmcnt);
+    expcnt = std::min(7u, expcnt);
+    lgkmcnt = std::min(63u, lgkmcnt);
+    unsigned lowBits = vmcnt & 0xF;
+    unsigned highBits = (vmcnt >> 4) << 14;
+    unsigned otherCnts = (expcnt << 4) | (lgkmcnt << 8);
+    return lowBits | highBits | otherCnts;
+  }
+  if (chipset.majorVersion == 11) {
+    vmcnt = std::min(63u, vmcnt);
+    expcnt = std::min(7u, expcnt);
+    lgkmcnt = std::min(63u, lgkmcnt);
+    return (vmcnt << 10) | expcnt | (lgkmcnt << 4);
+  }
   return failure();
 }
 
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
index 9c785670198ae..71617df05eb60 100644
--- a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
+++ b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
@@ -1,19 +1,28 @@
 // RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
-// TODO: Add more chipsets support
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
 
 
 // CHECK-LABEL: func @waitcnt
 func.func @waitcnt() {
   // GFX9: rocdl.s.waitcnt 53119
+  // GFX10: rocdl.s.waitcnt 65407
+  // GFX11: rocdl.s.waitcnt 65527
   amdgpu.waitcnt
 
   // GFX9: rocdl.s.waitcnt 3952
+  // GFX10: rocdl.s.waitcnt 16240
+  // GFX11: rocdl.s.waitcnt 1015
   amdgpu.waitcnt vmcnt(0)
 
   // GFX9: rocdl.s.waitcnt 53007
+  // GFX10: rocdl.s.waitcnt 65295
+  // GFX11: rocdl.s.waitcnt 65520
   amdgpu.waitcnt expcnt(0)
 
   // GFX9: rocdl.s.waitcnt 49279
+  // GFX10: rocdl.s.waitcnt 49279
+  // GFX11: rocdl.s.waitcnt 64519
   amdgpu.waitcnt lgkmcnt(0)
 
   return

From a8569157e2995cad79515eb2206ea6756f7bc5d6 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin@gmail.com>
Date: Mon, 21 Jul 2025 22:22:12 +0200
Subject: [PATCH 3/7] oilist

Signed-off-by: Ivan Butygin <ivan.butygin@gmail.com>
---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 2 +-
 mlir/test/Dialect/AMDGPU/ops.mlir             | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 7fe1ef37e1f9b..481cebdf30852 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -735,7 +735,7 @@ def AMDGPU_WaitcntOp :
     supported values.
   }];
   let assemblyFormat = [{
-    (`vmcnt` `(` $vmcnt^ `)` )? (`expcnt` `(` $expcnt^ `)` )? (`lgkmcnt` `(` $lgkmcnt^ `)`)? attr-dict
+    oilist( `vmcnt` `(` $vmcnt `)` | `expcnt` `(` $expcnt `)` | `lgkmcnt` `(` $lgkmcnt `)` ) attr-dict
   }];
 }
 
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 086b5884be5c7..82dd2bec248a7 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -552,10 +552,12 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %
 // CHECK-LABEL: func @waitcnt
 func.func @waitcnt() {
   // CHECK: amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+  // CHECK: amdgpu.waitcnt vmcnt(3) expcnt(2) lgkmcnt(1)
   // CHECK: amdgpu.waitcnt vmcnt(1)
   // CHECK: amdgpu.waitcnt expcnt(2)
   // CHECK: amdgpu.waitcnt lgkmcnt(3)
   amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
+  amdgpu.waitcnt lgkmcnt(1) expcnt(2) vmcnt(3)
   amdgpu.waitcnt vmcnt(1)
   amdgpu.waitcnt expcnt(2)
   amdgpu.waitcnt lgkmcnt(3)

From 811633e21a309c2b97ba342fa7ec1a9a2f955884 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin@gmail.com>
Date: Tue, 22 Jul 2025 19:37:52 +0200
Subject: [PATCH 4/7] switch to new api

---
 mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 25 ++++++-----
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 42 ++++++++++++++-----
 .../AMDGPUToROCDL/memory_counter_wait.mlir    | 42 +++++++++++++++++++
 .../Conversion/AMDGPUToROCDL/waitcnt.mlir     | 29 -------------
 mlir/test/Dialect/AMDGPU/ops.mlir             | 26 ++++++------
 5 files changed, 102 insertions(+), 62 deletions(-)
 create mode 100644 mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
 delete mode 100644 mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir

diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
index 481cebdf30852..b237f7b5749e7 100644
--- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
+++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td
@@ -719,23 +719,26 @@ def AMDGPU_SchedBarrierOp :
   }];
 }
 
-def AMDGPU_WaitcntOp :
-  AMDGPU_Op<"waitcnt">,
+def AMDGPU_MemoryCounterWaitOp :
+  AMDGPU_Op<"memory_counter_wait">,
   Arguments<(ins
-      OptionalAttr<I32Attr>:$vmcnt,
-      OptionalAttr<I32Attr>:$expcnt,
-      OptionalAttr<I32Attr>:$lgkmcnt
+      OptionalAttr<I32Attr>:$load,
+      OptionalAttr<I32Attr>:$store,
+      OptionalAttr<I32Attr>:$ds,
+      OptionalAttr<I32Attr>:$exp
     )>
   {
-  let summary = "Wrapper on ROCDL SWaitcntOp";
+  let summary = "Wait for specified hardware counters";
   let description = [{
-    Covenience wrapper on `rocdl.s.waitcnt`. Hides the architecture specific
-    bitpacking from user. Missing values will be assumed maximum values supported
-    by the architecture. Large values will also be clamped to the maximum
-    supported values.
+    Wait for the specified counters to be less-than or equal-to the provided
+    values before continuing.
+
+    Counters can lower to different instructions on different architectures,
+    including clamping to some HW supported max value or combining multiple
+    counters into one.
   }];
   let assemblyFormat = [{
-    oilist( `vmcnt` `(` $vmcnt `)` | `expcnt` `(` $expcnt `)` | `lgkmcnt` `(` $lgkmcnt `)` ) attr-dict
+    oilist( `load` `(` $load `)` | `store` `(` $store `)` | `ds` `(` $ds `)` | `exp` `(` $exp `)` ) attr-dict
   }];
 }
 
diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 1940ef8775688..057dfced09087 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -465,15 +465,35 @@ static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
   return failure();
 }
 
-struct WaitcntOpLowering : public ConvertOpToLLVMPattern<WaitcntOp> {
-  WaitcntOpLowering(const LLVMTypeConverter &converter, Chipset chipset)
-      : ConvertOpToLLVMPattern<WaitcntOp>(converter), chipset(chipset) {}
+struct MemoryCounterWaitOpLowering
+    : public ConvertOpToLLVMPattern<MemoryCounterWaitOp> {
+  MemoryCounterWaitOpLowering(const LLVMTypeConverter &converter,
+                              Chipset chipset)
+      : ConvertOpToLLVMPattern<MemoryCounterWaitOp>(converter),
+        chipset(chipset) {}
 
   Chipset chipset;
 
   LogicalResult
-  matchAndRewrite(WaitcntOp op, OpAdaptor adaptor,
+  matchAndRewrite(MemoryCounterWaitOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
+    if (chipset.majorVersion >= 12) {
+      Location loc = op.getLoc();
+      if (auto ds = adaptor.getDs())
+        rewriter.create<ROCDL::WaitDscntOp>(loc, *ds);
+
+      if (auto load = adaptor.getLoad())
+        rewriter.create<ROCDL::WaitLoadcntOp>(loc, *load);
+
+      if (auto store = adaptor.getStore())
+        rewriter.create<ROCDL::WaitStorecntOp>(loc, *store);
+
+      if (auto exp = adaptor.getExp())
+        rewriter.create<ROCDL::WaitExpcntOp>(loc, *exp);
+
+      return success();
+    }
+
     auto getVal = [](Attribute attr) -> unsigned {
       if (attr)
         return cast<IntegerAttr>(attr).getInt();
@@ -481,12 +501,14 @@ struct WaitcntOpLowering : public ConvertOpToLLVMPattern<WaitcntOp> {
       // This value will be clamped to the maximum value for the chipset.
       return 1024 * 1024;
     };
-    unsigned vmcnt = getVal(adaptor.getVmcntAttr());
-    unsigned expcnt = getVal(adaptor.getExpcntAttr());
-    unsigned lgkmcnt = getVal(adaptor.getLgkmcntAttr());
+    unsigned ds = getVal(adaptor.getDsAttr());
+    unsigned load = getVal(adaptor.getLoadAttr());
+    unsigned store = getVal(adaptor.getStoreAttr());
+    unsigned exp = getVal(adaptor.getExpAttr());
+
+    unsigned vmcnt = std::min(load, store);
 
-    FailureOr<unsigned> waitcnt =
-        encodeWaitcnt(chipset, vmcnt, expcnt, lgkmcnt);
+    FailureOr<unsigned> waitcnt = encodeWaitcnt(chipset, vmcnt, exp, ds);
     if (failed(waitcnt))
       return op.emitOpError("unsupported chipset");
 
@@ -1901,7 +1923,7 @@ void mlir::populateAMDGPUToROCDLConversionPatterns(LLVMTypeConverter &converter,
                                ROCDL::RawPtrBufferAtomicUminOp>,
            RawBufferOpLowering<RawBufferAtomicCmpswapOp,
                                ROCDL::RawPtrBufferAtomicCmpSwap>,
-           AMDGPUDPPLowering, WaitcntOpLowering, LDSBarrierOpLowering,
+           AMDGPUDPPLowering, MemoryCounterWaitOpLowering, LDSBarrierOpLowering,
            SchedBarrierOpLowering, MFMAOpLowering, ScaledMFMAOpLowering,
            WMMAOpLowering, ExtPackedFp8OpLowering, ScaledExtPackedOpLowering,
            PackedScaledTruncOpLowering, PackedTrunc2xFp8OpLowering,
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
new file mode 100644
index 0000000000000..1016ee859e462
--- /dev/null
+++ b/mlir/test/Conversion/AMDGPUToROCDL/memory_counter_wait.mlir
@@ -0,0 +1,42 @@
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
+// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1201 | FileCheck %s --check-prefixes=CHECK,GFX12
+
+// CHECK-LABEL: func @memory_counter_wait
+func.func @memory_counter_wait() {
+  // GFX9: rocdl.s.waitcnt 53119
+  // GFX10: rocdl.s.waitcnt 65407
+  // GFX11: rocdl.s.waitcnt 65527
+  // GFX12-NOT: rocdl.s.wait.loadcnt
+  // GFX12-NOT: rocdl.s.wait.storecnt
+  // GFX12-NOT: rocdl.s.wait.expcnt
+  // GFX12-NOT: rocdl.s.wait.dscnt
+  amdgpu.memory_counter_wait
+
+  // GFX9: rocdl.s.waitcnt 3952
+  // GFX10: rocdl.s.waitcnt 16240
+  // GFX11: rocdl.s.waitcnt 1015
+  // GFX12: rocdl.s.wait.loadcnt 0
+  amdgpu.memory_counter_wait load(0)
+
+  // GFX9: rocdl.s.waitcnt 3952
+  // GFX10: rocdl.s.waitcnt 16240
+  // GFX11: rocdl.s.waitcnt 1015
+  // GFX12: rocdl.s.wait.storecnt 0
+  amdgpu.memory_counter_wait store(0)
+
+  // GFX9: rocdl.s.waitcnt 53007
+  // GFX10: rocdl.s.waitcnt 65295
+  // GFX11: rocdl.s.waitcnt 65520
+  // GFX12: rocdl.s.wait.expcnt 0
+  amdgpu.memory_counter_wait exp(0)
+
+  // GFX9: rocdl.s.waitcnt 49279
+  // GFX10: rocdl.s.waitcnt 49279
+  // GFX11: rocdl.s.waitcnt 64519
+  // GFX12: rocdl.s.wait.dscnt 0
+  amdgpu.memory_counter_wait ds(0)
+
+  return
+}
diff --git a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir b/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
deleted file mode 100644
index 71617df05eb60..0000000000000
--- a/mlir/test/Conversion/AMDGPUToROCDL/waitcnt.mlir
+++ /dev/null
@@ -1,29 +0,0 @@
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s --check-prefixes=CHECK,GFX9
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1030 | FileCheck %s --check-prefixes=CHECK,GFX10
-// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx1100 | FileCheck %s --check-prefixes=CHECK,GFX11
-
-
-// CHECK-LABEL: func @waitcnt
-func.func @waitcnt() {
-  // GFX9: rocdl.s.waitcnt 53119
-  // GFX10: rocdl.s.waitcnt 65407
-  // GFX11: rocdl.s.waitcnt 65527
-  amdgpu.waitcnt
-
-  // GFX9: rocdl.s.waitcnt 3952
-  // GFX10: rocdl.s.waitcnt 16240
-  // GFX11: rocdl.s.waitcnt 1015
-  amdgpu.waitcnt vmcnt(0)
-
-  // GFX9: rocdl.s.waitcnt 53007
-  // GFX10: rocdl.s.waitcnt 65295
-  // GFX11: rocdl.s.waitcnt 65520
-  amdgpu.waitcnt expcnt(0)
-
-  // GFX9: rocdl.s.waitcnt 49279
-  // GFX10: rocdl.s.waitcnt 49279
-  // GFX11: rocdl.s.waitcnt 64519
-  amdgpu.waitcnt lgkmcnt(0)
-
-  return
-}
diff --git a/mlir/test/Dialect/AMDGPU/ops.mlir b/mlir/test/Dialect/AMDGPU/ops.mlir
index 82dd2bec248a7..fe78b5365745a 100644
--- a/mlir/test/Dialect/AMDGPU/ops.mlir
+++ b/mlir/test/Dialect/AMDGPU/ops.mlir
@@ -549,17 +549,19 @@ func.func @gather_to_lds(%idx1 : index, %idx2 : index, %mem1 : memref<32xf16>, %
   func.return
 }
 
-// CHECK-LABEL: func @waitcnt
-func.func @waitcnt() {
-  // CHECK: amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
-  // CHECK: amdgpu.waitcnt vmcnt(3) expcnt(2) lgkmcnt(1)
-  // CHECK: amdgpu.waitcnt vmcnt(1)
-  // CHECK: amdgpu.waitcnt expcnt(2)
-  // CHECK: amdgpu.waitcnt lgkmcnt(3)
-  amdgpu.waitcnt vmcnt(1) expcnt(2) lgkmcnt(3)
-  amdgpu.waitcnt lgkmcnt(1) expcnt(2) vmcnt(3)
-  amdgpu.waitcnt vmcnt(1)
-  amdgpu.waitcnt expcnt(2)
-  amdgpu.waitcnt lgkmcnt(3)
+// CHECK-LABEL: func @memory_counter_wait
+func.func @memory_counter_wait() {
+  // CHECK: amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4)
+  // CHECK: amdgpu.memory_counter_wait load(4) store(2) ds(3) exp(1)
+  // CHECK: amdgpu.memory_counter_wait load(1)
+  // CHECK: amdgpu.memory_counter_wait store(2)
+  // CHECK: amdgpu.memory_counter_wait ds(3)
+  // CHECK: amdgpu.memory_counter_wait exp(4)
+  amdgpu.memory_counter_wait load(1) store(2) ds(3) exp(4)
+  amdgpu.memory_counter_wait exp(1) store(2) ds(3) load(4)
+  amdgpu.memory_counter_wait load(1)
+  amdgpu.memory_counter_wait store(2)
+  amdgpu.memory_counter_wait ds(3)
+  amdgpu.memory_counter_wait exp(4)
   func.return
 }

From b02a25cb8a0746340d1b1bf826f6f42a3f880162 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin@gmail.com>
Date: Tue, 22 Jul 2025 19:41:42 +0200
Subject: [PATCH 5/7] erase op

---
 mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 057dfced09087..0501aa968347f 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -491,6 +491,7 @@ struct MemoryCounterWaitOpLowering
       if (auto exp = adaptor.getExp())
         rewriter.create<ROCDL::WaitExpcntOp>(loc, *exp);
 
+      rewriter.eraseOp(op);
       return success();
     }
 

From 0660da5c64bcba3b6f3e52d8ed8553607566b39f Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin@gmail.com>
Date: Tue, 22 Jul 2025 20:46:08 +0200
Subject: [PATCH 6/7] comments

---
 .../AMDGPUToROCDL/AMDGPUToROCDL.cpp           | 26 +++++++++----------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 0501aa968347f..93d220f56026b 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -421,15 +421,15 @@ struct RawBufferOpLowering : public ConvertOpToLLVMPattern<GpuOp> {
 
 // TODO: AMDGPU backend already have all this bitpacking logic, we should move
 // it to some common place.
-/// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
-///     \p Vmcnt = \p Waitcnt[3:0]        (pre-gfx9)
-///     \p Vmcnt = \p Waitcnt[15:14,3:0]  (gfx9,10)
-///     \p Vmcnt = \p Waitcnt[15:10]      (gfx11)
-///     \p Expcnt = \p Waitcnt[6:4]       (pre-gfx11)
-///     \p Expcnt = \p Waitcnt[2:0]       (gfx11)
-///     \p Lgkmcnt = \p Waitcnt[11:8]     (pre-gfx10)
-///     \p Lgkmcnt = \p Waitcnt[13:8]     (gfx10)
-///     \p Lgkmcnt = \p Waitcnt[9:4]      (gfx11)
+///  Vmcnt, Expcnt and Lgkmcnt are decoded as follows:
+///     Vmcnt = Waitcnt[3:0]        (pre-gfx9)
+///     Vmcnt = Waitcnt[15:14,3:0]  (gfx9,10)
+///     Vmcnt = Waitcnt[15:10]      (gfx11)
+///     Expcnt = Waitcnt[6:4]       (pre-gfx11)
+///     Expcnt = Waitcnt[2:0]       (gfx11)
+///     Lgkmcnt = Waitcnt[11:8]     (pre-gfx10)
+///     Lgkmcnt = Waitcnt[13:8]     (gfx10)
+///     Lgkmcnt = Waitcnt[9:4]      (gfx11)
 static FailureOr<unsigned> encodeWaitcnt(Chipset chipset, unsigned vmcnt,
                                          unsigned expcnt, unsigned lgkmcnt) {
   if (chipset.majorVersion < 9) {
@@ -479,16 +479,16 @@ struct MemoryCounterWaitOpLowering
                   ConversionPatternRewriter &rewriter) const override {
     if (chipset.majorVersion >= 12) {
       Location loc = op.getLoc();
-      if (auto ds = adaptor.getDs())
+      if (std::optional<int> ds = adaptor.getDs())
         rewriter.create<ROCDL::WaitDscntOp>(loc, *ds);
 
-      if (auto load = adaptor.getLoad())
+      if (std::optional<int> load = adaptor.getLoad())
         rewriter.create<ROCDL::WaitLoadcntOp>(loc, *load);
 
-      if (auto store = adaptor.getStore())
+      if (std::optional<int> store = adaptor.getStore())
         rewriter.create<ROCDL::WaitStorecntOp>(loc, *store);
 
-      if (auto exp = adaptor.getExp())
+      if (std::optional<int> exp = adaptor.getExp())
         rewriter.create<ROCDL::WaitExpcntOp>(loc, *exp);
 
       rewriter.eraseOp(op);

From 611f679f10dd2c922eb3fdfdea840e46ebaadd26 Mon Sep 17 00:00:00 2001
From: Ivan Butygin <ivan.butygin@gmail.com>
Date: Tue, 22 Jul 2025 21:23:41 +0200
Subject: [PATCH 7/7] add load and store

---
 .../Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp    | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
index 93d220f56026b..309476ca7136a 100644
--- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
+++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp
@@ -500,14 +500,21 @@ struct MemoryCounterWaitOpLowering
         return cast<IntegerAttr>(attr).getInt();
 
       // This value will be clamped to the maximum value for the chipset.
-      return 1024 * 1024;
+      return 1024;
     };
     unsigned ds = getVal(adaptor.getDsAttr());
-    unsigned load = getVal(adaptor.getLoadAttr());
-    unsigned store = getVal(adaptor.getStoreAttr());
     unsigned exp = getVal(adaptor.getExpAttr());
 
-    unsigned vmcnt = std::min(load, store);
+    unsigned vmcnt = 1024;
+    Attribute load = adaptor.getLoadAttr();
+    Attribute store = adaptor.getStoreAttr();
+    if (load && store) {
+      vmcnt = getVal(load) + getVal(store);
+    } else if (load) {
+      vmcnt = getVal(load);
+    } else if (store) {
+      vmcnt = getVal(store);
+    }
 
     FailureOr<unsigned> waitcnt = encodeWaitcnt(chipset, vmcnt, exp, ds);
     if (failed(waitcnt))