Changes from all commits
23 commits
8bb5255
Refactor quant group size to be configurable for M/N/K, not just K
samremes Oct 13, 2025
98365f5
add some asserts for configurations not implemented
samremes Oct 13, 2025
f6b07dc
start setting of group size for N dimension
samremes Oct 13, 2025
22362f2
enable 2d for reference quant gemm
samremes Oct 14, 2025
9988a46
WIP: trying to figure out tile dstr and/or indexing for scale matrix
samremes Oct 16, 2025
36b88c6
WIP
samremes Oct 20, 2025
bb52cd9
Fix handling of n dim blocks in tile windows etc
samremes Oct 21, 2025
f179a8a
remove commented code and enable all tests again
samremes Oct 22, 2025
d100ab6
fix formatting
samremes Oct 22, 2025
37738e4
Add more specialized tile distributions
samremes Oct 27, 2025
98deefa
Enable NWarps replication for bquant tile dstr
samremes Oct 27, 2025
2d86cd0
fix formatting
samremes Oct 27, 2025
470d6e4
Merge remote-tracking branch 'origin/develop' into samremes/bmatrix_2…
samremes Oct 27, 2025
1f13003
fix format
samremes Oct 27, 2025
a449728
Merge remote-tracking branch 'origin/develop' into samremes/bmatrix_2…
samremes Oct 28, 2025
e12ab56
Fix some issues from the merge
samremes Oct 28, 2025
7c93551
fix formatting
samremes Oct 28, 2025
e1475d4
one more fix to tile dstr, and revert debug initialization
samremes Oct 28, 2025
5e0a356
Remove commented code
samremes Oct 29, 2025
1290b1b
simplify conditions that are needed for tile distributions
samremes Oct 29, 2025
306e25a
only enable the working group sizes in tests
samremes Oct 29, 2025
68e41da
fix formatting
samremes Oct 30, 2025
bcccafe
Update tile distribution for 2D bquant
CongMa13 Oct 31, 2025
30 changes: 16 additions & 14 deletions example/ck_tile/38_block_scale_gemm/gemm_quant_basic.cpp
@@ -17,7 +17,7 @@ template <typename GemmConfig,
typename ALayout,
typename BLayout,
typename CLayout,
-uint32_t QuantGroupSize,
+typename QuantGroupSize,
ck_tile::QuantType QuantMode,
typename CDEElementWise>
float gemm_calc_quant(const ck_tile::QuantGemmHostArgs& args, const ck_tile::stream_config& s)
@@ -229,7 +229,7 @@ float gemm_calc_quant(const ck_tile::QuantGemmHostArgs& args, const ck_tile::str

template <typename GemmConfig,
typename TypeConfig,
-uint32_t QuantGroupSize,
+typename QuantGroupSize,
ck_tile::QuantType QuantMode>
int run_gemm_example_prec_type(std::string a_layout, std::string b_layout, int argc, char* argv[])
{
@@ -279,6 +279,8 @@ int run_gemm_example(int argc, char* argv[])

std::string quant_mode = arg_parser.get_str("quant_mode");

+using QuantGroupSize = ck_tile::QuantGroupShape<ck_tile::sequence<1, 1, 128>>;
Contributor:
Could we make the quant group size an interface? Currently we have to specify the quant dimension sizes manually.

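For reference, a minimal editor's sketch of what such a compile-time group-shape wrapper could look like; the kM/kN/kK member names mirror the accesses in the diffs below, while the real ck_tile::QuantGroupShape (which takes a ck_tile::sequence) may be defined differently:

// Editor's sketch, not the ck_tile API: a hypothetical stand-in for
// ck_tile::QuantGroupShape exposing per-dimension group sizes.
template <int M_, int N_, int K_>
struct QuantGroupShapeSketch
{
    static constexpr int kM = M_; // rows of A/C sharing one scale (1 = per-row)
    static constexpr int kN = N_; // columns of B/C sharing one scale
    static constexpr int kK = K_; // contiguous K elements sharing one scale
};
// Mirrors the 1x1x128 choice above: per-row scale groups of 128 along K.
using QuantGroupSizeSketch = QuantGroupShapeSketch<1, 1, 128>;
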
if(data_type == "fp8")
{
using TypeConfig =
@@ -288,31 +290,31 @@ int run_gemm_example(int argc, char* argv[])
{
return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
TypeConfig,
-128,
+QuantGroupSize,
ck_tile::QuantType::AQuantGrouped>(
a_layout, b_layout, argc, argv);
}
else if(quant_mode == "bquant")
{
return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
TypeConfig,
-128,
+QuantGroupSize,
ck_tile::QuantType::BQuantGrouped>(
a_layout, b_layout, argc, argv);
}
else if(quant_mode == "rowcol")
{
return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
TypeConfig,
-128,
+QuantGroupSize,
ck_tile::QuantType::RowColQuant>(
a_layout, b_layout, argc, argv);
}
else if(quant_mode == "tensor")
{
return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
TypeConfig,
-128,
+QuantGroupSize,
ck_tile::QuantType::TensorQuant>(
a_layout, b_layout, argc, argv);
}
@@ -331,31 +333,31 @@ int run_gemm_example(int argc, char* argv[])
{
return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
TypeConfig,
-128,
+QuantGroupSize,
ck_tile::QuantType::AQuantGrouped>(
a_layout, b_layout, argc, argv);
}
else if(quant_mode == "bquant")
{
return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
TypeConfig,
-128,
+QuantGroupSize,
ck_tile::QuantType::BQuantGrouped>(
a_layout, b_layout, argc, argv);
}
else if(quant_mode == "rowcol")
{
return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
TypeConfig,
-128,
+QuantGroupSize,
ck_tile::QuantType::RowColQuant>(
a_layout, b_layout, argc, argv);
}
else if(quant_mode == "tensor")
{
return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
TypeConfig,
-128,
+QuantGroupSize,
ck_tile::QuantType::TensorQuant>(
a_layout, b_layout, argc, argv);
}
@@ -376,7 +378,7 @@ int run_gemm_example(int argc, char* argv[])
{
return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
TypeConfig,
-128,
+QuantGroupSize,
ck_tile::QuantType::AQuantGrouped>(
a_layout, b_layout, argc, argv);
}
@@ -397,7 +399,7 @@ int run_gemm_example(int argc, char* argv[])
{
return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
TypeConfig,
-128,
+QuantGroupSize,
ck_tile::QuantType::AQuantGrouped>(
a_layout, b_layout, argc, argv);
}
@@ -418,7 +420,7 @@ int run_gemm_example(int argc, char* argv[])
{
return run_gemm_example_prec_type<GemmConfig<ck_tile::fp8_t>,
TypeConfig,
-128,
+QuantGroupSize,
ck_tile::QuantType::BQuantGrouped>(
a_layout, b_layout, argc, argv);
}
@@ -439,7 +441,7 @@ int run_gemm_example(int argc, char* argv[])
{
return run_gemm_example_prec_type<GemmConfig<ck_tile::bf8_t>,
TypeConfig,
-128,
+QuantGroupSize,
ck_tile::QuantType::BQuantGrouped>(
a_layout, b_layout, argc, argv);
}
16 changes: 8 additions & 8 deletions example/ck_tile/38_block_scale_gemm/run_gemm_quant_example.inc
@@ -14,7 +14,7 @@ template <typename GemmConfig,
typename BLayout,
typename BQLayout,
typename CLayout,
-uint32_t QuantGroupSize,
+typename QuantGroupSize,
ck_tile::QuantType QuantMode,
typename CDEElementWise = ck_tile::element_wise::PassThrough>
float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,
@@ -113,7 +113,7 @@ float invoke_gemm(ck_tile::DeviceMem& a_m_k_dev_buf,

template <typename GemmConfig,
typename TypeConfig,
-uint32_t QuantGroupSize,
+typename QuantGroupSize,
ck_tile::QuantType QuantMode,
typename ALayout,
typename AQLayout,
@@ -146,7 +146,7 @@ int run_gemm_example_with_layouts(int argc,
if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped ||
QuantMode == ck_tile::QuantType::BQuantGrouped)
{
-if(K % QuantGroupSize != 0)
+if(K % QuantGroupSize::kK != 0)
{
throw std::runtime_error(
"K must be aligned with QuantGroupSize for AQuantGrouped/BQuantGrouped mode");
@@ -155,13 +155,13 @@
ck_tile::index_t AQK, BQK;
if constexpr(QuantMode == ck_tile::QuantType::AQuantGrouped)
{
-AQK = K / QuantGroupSize; // Group quantization: AQK = K / GroupSize
-BQK = 0; // No B quantization
+AQK = K / QuantGroupSize::kK; // Group quantization: AQK = K / GroupSize
+BQK = 0; // No B quantization
}
else if constexpr(QuantMode == ck_tile::QuantType::BQuantGrouped)
{
-AQK = 0; // No A quantization
-BQK = K / QuantGroupSize; // Group quantization: BQK = K / GroupSize
+AQK = 0; // No A quantization
+BQK = K / QuantGroupSize::kK; // Group quantization: BQK = K / GroupSize
}
else if constexpr(QuantMode == ck_tile::QuantType::RowColQuant ||
QuantMode == ck_tile::QuantType::TensorQuant)
@@ -357,7 +357,7 @@ int run_gemm_example_with_layouts(int argc,
if constexpr(GemmConfig::PreshuffleQuant)
{
ck_tile::HostTensor<AQDataType> aq_shuffle_host =
-ck_tile::shuffle_aq(aq_tensor_ptr.get(), GemmConfig::K_Tile / QuantGroupSize);
+ck_tile::shuffle_aq(aq_tensor_ptr.get(), GemmConfig::K_Tile / QuantGroupSize::kK);
aq_dev_buf_ptr->ToDevice(aq_shuffle_host.data());
}
else
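As a quick sanity check of the scale extents computed above, a small editor's sketch with assumed sample sizes (K = 512, group shape {1, 1, 128}; values are illustrative only):

// Editor's sketch with assumed numbers, mirroring the AQK/BQK logic above.
#include <cassert>
int main()
{
    constexpr int K = 512, kK = 128;
    static_assert(K % kK == 0, "mirrors the runtime alignment check above");
    constexpr int AQK = K / kK; // aquant: 4 scale entries per row of A
    constexpr int BQK = K / kK; // bquant: 4 scale entries per column of B
    assert(AQK == 4 && BQK == 4);
}
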
9 changes: 4 additions & 5 deletions include/ck_tile/host/reference/reference_gemm.hpp
@@ -16,7 +16,7 @@ template <typename ADataType,
typename BDataType,
typename AccDataType,
typename CDataType,
-uint32_t QuantGroupSize,
+typename QuantGroupSize,
bool aquant,
typename AElementOp = ck_tile::identity,
typename BElementOp = ck_tile::identity,
@@ -80,12 +80,11 @@ CK_TILE_HOST void reference_gemm_quant(const HostTensor<ADataType>& a_m_k,
v_block_acc += v_a * v_b;

// Apply group dequant scale
-if((k + 1) % QuantGroupSize == 0)
+if((k + 1) % QuantGroupSize::kK == 0)
{
float scale = 0.f;
-index_t outer_dim = (aquant) ? m : k / QuantGroupSize;
-index_t inner_dim = (aquant) ? k / QuantGroupSize : n;
+index_t outer_dim = (aquant) ? (m / QuantGroupSize::kM) : (k / QuantGroupSize::kK);
+index_t inner_dim = (aquant) ? (k / QuantGroupSize::kK) : (n / QuantGroupSize::kN);
if constexpr(std::is_same_v<QDataType, float>)
{
scale = q(outer_dim, inner_dim);
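To make the new 2D indexing concrete, an editor's sketch of the scale lookup for BQuantGrouped; the row-major [K/kK x N/kN] scale layout and float scales are assumptions for illustration, not the ck_tile API:

// Editor's sketch: which scale dequantizes B(k, n) under group shape {1, kN, kK}.
#include <cstdint>
#include <vector>

float bquant_scale(const std::vector<float>& q,   // row-major [K/kK][N/kN]
                   std::int64_t k, std::int64_t n, // element coordinates in B
                   std::int64_t N, std::int64_t kN, std::int64_t kK)
{
    const std::int64_t scale_cols = N / kN;        // one scale column per N-group
    return q[(k / kK) * scale_cols + (n / kN)];    // q(outer_dim, inner_dim)
}
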
@@ -10,7 +10,7 @@ namespace ck_tile {

// A is block window on shared memory
// BQ (scale tensor) is block distributed tensor.
-// Consecutive kQuantGroupSize elements of B are quantized with a separate scale.
+// Consecutive QuantGroupSize elements of B are quantized with a separate scale.
// B is block window on block distributed tensor.
// C is block distributed tensor
template <typename Problem_, typename BlockPolicy_>
@@ -24,6 +24,10 @@ struct BlockGemmWeightPreshuffleBQuantARegBRegCReg
using CDataType = remove_cvref_t<typename Problem::CDataType>;
using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>; // TileFlatmmShape
+using QuantGroupSize = remove_cvref_t<typename Problem::QuantGroupSize>;
+
+static_assert(QuantGroupSize::kM == 1, "only N/K blocks for BQuant preshuffle kernel!");
+static_assert(QuantGroupSize::kN == 1, "no block for N supported yet!");
Copilot AI (Oct 28, 2025):
The second assertion prevents N-axis blocking in the preshuffle kernel, which conflicts with the PR's goal of supporting 2D block scales. It should be relaxed, or the preshuffle kernel should be updated to support N-axis blocking.

Suggested change:
-static_assert(QuantGroupSize::kN == 1, "no block for N supported yet!");
+// static_assert(QuantGroupSize::kN == 1, "no block for N supported yet!");


static constexpr auto I0 = number<0>();
static constexpr auto I1 = number<1>();
@@ -47,8 +51,7 @@ struct BlockGemmWeightPreshuffleBQuantARegBRegCReg
static constexpr index_t MPerBlock = BlockGemmShape::kM;
static constexpr index_t KPerBlock = BlockGemmShape::kK;

-static constexpr index_t kQuantGroupSize = Problem::kQuantGroupSize;
-static constexpr index_t kBlockSize = Problem::kBlockSize;
+static constexpr index_t kBlockSize = Problem::kBlockSize;

static constexpr index_t MIterPerWarp = MPerBlock / (MWarp * WG::kM);
static constexpr index_t NIterPerWarp =
@@ -58,13 +61,12 @@
static constexpr auto MIter_2nd_last =
(MIterPerWarp >= 2) ? MIterPerWarp - 2 : MIterPerWarp - 1;

-static constexpr index_t KPerBlockBQ = KPerBlock / kQuantGroupSize;
+static constexpr index_t KPerBlockBQ = KPerBlock / QuantGroupSize::kK;

static constexpr index_t QScalesPerBlockRow =
-(KPerBlock + kQuantGroupSize - 1) / kQuantGroupSize;
+integer_divide_ceil(KPerBlock, QuantGroupSize::kK);
static constexpr index_t QScalesPerWarpGemmRow =
-(WG::kK + kQuantGroupSize - 1) / kQuantGroupSize;
+integer_divide_ceil(WG::kK, QuantGroupSize::kK);

static constexpr index_t KIterPerQScale = KIterPerWarp / QScalesPerBlockRow;
static constexpr index_t DsReadPreload = 2; // default 2, preload 2 ds read
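The refactor also swaps the open-coded ceiling division for ck_tile's integer_divide_ceil helper; a minimal editor's equivalent, assuming positive operands:

// Editor's sketch of the ceil-division identity behind integer_divide_ceil.
constexpr int ceil_div_sketch(int a, int b) { return (a + b - 1) / b; }
static_assert(ceil_div_sketch(256, 128) == 2); // e.g. QScalesPerBlockRow
static_assert(ceil_div_sketch(32, 128) == 1);  // e.g. QScalesPerWarpGemmRow
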
@@ -46,7 +46,7 @@ struct BlockGemmAQuantBase

// A is block window on shared memory
// AQ (scale tensor) is block distributed tensor.
-// Consecutive kQuantGroupSize elements of A are quantized with a separate scale.
+// Consecutive QuantGroupSize elements of A are quantized with a separate scale.
// B is block window on shared memory
// C is block distributed tensor
template <typename Problem_,
@@ -66,16 +66,16 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
using ComputeDataType = remove_cvref_t<typename Problem::ComputeDataType>;
using CDataType = remove_cvref_t<typename Problem::CDataType>;
using BlockGemmShape = remove_cvref_t<typename Problem::BlockGemmShape>;
+using QuantGroupSize = remove_cvref_t<typename Problem::QuantGroupSize>;

-static constexpr index_t kQuantGroupSize = Problem::kQuantGroupSize;
-static constexpr index_t kBlockSize = Problem::kBlockSize;
-static constexpr auto Scheduler = Problem::Scheduler;
+static constexpr index_t kBlockSize = Problem::kBlockSize;
+static constexpr auto Scheduler = Problem::Scheduler;

// Threadblock GEMM tile size
static constexpr index_t MPerBlock = BlockGemmShape::kM;
static constexpr index_t NPerBlock = BlockGemmShape::kN;
static constexpr index_t KPerBlock = BlockGemmShape::kK;
-static constexpr index_t AQPerBlock = KPerBlock / kQuantGroupSize;
+static constexpr index_t AQPerBlock = KPerBlock / QuantGroupSize::kK;

static constexpr auto config = Policy::template GetWarpGemmMWarpNWarp<Problem>();
using WarpGemm = remove_cvref_t<decltype(config.template at<0>())>;
@@ -101,20 +101,20 @@ struct AQuantBlockUniversalGemmAsBsCr : public BlockGemmAQuantBase<Problem_>
static constexpr index_t KIterPerWarp = KPerBlock / WarpGemm::kK;

static constexpr index_t QScalesPerBlockRow =
-(KPerBlock + kQuantGroupSize - 1) / kQuantGroupSize;
+integer_divide_ceil(KPerBlock, QuantGroupSize::kK);
static constexpr index_t QScalesPerWarpGemmRow =
-(WarpGemm::kK + kQuantGroupSize - 1) / kQuantGroupSize;
+integer_divide_ceil(WarpGemm::kK, QuantGroupSize::kK);

static constexpr index_t KIterPerQScale = KIterPerWarp / QScalesPerBlockRow;

-static_assert(kQuantGroupSize % WarpGemm::kK == 0,
-              "Error! WarpGemm::kK should be a multiple of kQuantGroupSize");
+static_assert(QuantGroupSize::kK % WarpGemm::kK == 0,
+              "Error! QuantGroupSize::kK should be a multiple of WarpGemm::kK");
static_assert(QScalesPerWarpGemmRow == 1,
"Error! kQuantGroupSize shouldn't be smaller than WarpGemm::kK");
"Error! QuantGroupSize shouldn't be smaller than WarpGemm::kK");
static_assert(KIterPerWarp % QScalesPerBlockRow == 0,
"Error! KItersPerWarp should be a multiple of QscalesPerBlockRow");

-static_assert(KPerBlock / kQuantGroupSize > 0,
+static_assert(KPerBlock / QuantGroupSize::kK > 0,
"Error! Each row of blockgemm should have a separate scale");

static_assert(MIterPerWarp * MWarp * WarpGemm::kM == MPerBlock,
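Putting the new constraints together, an editor's sketch of the scale/iteration bookkeeping with assumed tile sizes (KPerBlock = 256, WarpGemm::kK = 32, QuantGroupSize::kK = 128; illustrative only):

// Editor's sketch: the quantities the static_asserts above relate.
constexpr int KPerBlock = 256, WarpGemmK = 32, GroupK = 128;          // assumed
constexpr int KIterPerWarp = KPerBlock / WarpGemmK;                   // 8
constexpr int QScalesPerBlockRow = (KPerBlock + GroupK - 1) / GroupK; // 2
constexpr int KIterPerQScale = KIterPerWarp / QScalesPerBlockRow;     // 4
static_assert(GroupK % WarpGemmK == 0);        // group spans whole K warp-tiles
static_assert(KIterPerWarp % QScalesPerBlockRow == 0);
static_assert(KPerBlock / GroupK > 0);         // at least one scale per block row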