From 2ae9efbb630d002df8f0a1dc1b13e0d58329c30e Mon Sep 17 00:00:00 2001
From: Emily Martins
Date: Tue, 26 Aug 2025 17:41:56 +0000
Subject: [PATCH 1/9] Change splitk_batch_offset parameter to k_size in
 UniversalGemmKernel::MakeGemmTensorViews function

Prior to this change, the splitk_batch_offset parameter of MakeGemmTensorViews
had type SplitKBatchOffset. But the only member variable of the
SplitKBatchOffset class used in the MakeGemmTensorViews function was
splitted_k (an int32_t). The splitted_k value was used as part of defining the
dimensions of the tensor view.

However, for Stream K, we do not need to use the SplitKBatchOffset class since
we are not using Split K. Thus, this commit changes the splitk_batch_offset
parameter to an int32_t called k_size. This avoids requiring a caller of
MakeGemmTensorViews to use the SplitKBatchOffset class while still providing
the same functionality.

Calls to UniversalGemmKernel::MakeGemmTensorViews have been updated
accordingly.
---
 .../ops/gemm/kernel/grouped_gemm_kernel.hpp   |  4 ++--
 .../ops/gemm/kernel/universal_gemm_kernel.hpp | 20 +++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
index 704d0d01ee..dda38bbc47 100644
--- a/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/grouped_gemm_kernel.hpp
@@ -374,7 +374,7 @@ struct GroupedGemmKernel
         // Create Gemm tensor views, pad views and tile windows
         const auto& gemm_tensor_views_tuple =
             Base::template MakeGemmTensorViews(
-                {a_ptr}, {b_ptr}, {/*ds_ptr*/}, c_ptr, kargs, splitk_batch_offset);
+                {a_ptr}, {b_ptr}, {/*ds_ptr*/}, c_ptr, kargs, splitk_batch_offset.splitted_k);

         const auto& gemm_pad_views = Base::MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows =
@@ -436,7 +436,7 @@ struct GroupedGemmKernel
         // Create Gemm tensor views, pad views and tile windows
         const auto& gemm_tensor_views_tuple =
             Base::template MakeGemmTensorViews(
-                {a_ptr}, {b_ptr}, {/*ds_ptr*/}, c_ptr, kargs, splitk_batch_offset);
+                {a_ptr}, {b_ptr}, {/*ds_ptr*/}, c_ptr, kargs, splitk_batch_offset.splitted_k);

         const auto& gemm_pad_views = Base::MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows =
diff --git a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
index 8117d65758..cfba8b6c9d 100644
--- a/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/universal_gemm_kernel.hpp
@@ -579,7 +579,7 @@ struct UniversalGemmKernel
                        const std::array& ds_ptr,
                        EDataType* e_ptr,
                        const KernelArgs& kargs,
-                       const SplitKBatchOffset& splitk_batch_offset)
+                       const index_t k_size)
     {
         static_assert(!TilePartitioner::BlockGemmShape::PermuteA, "Not implemented!");
@@ -591,7 +591,7 @@ struct UniversalGemmKernel
             {
                 return make_naive_tensor_view(
                     static_cast(as_ptr[i]),
-                    make_tuple(kargs.M, splitk_batch_offset.splitted_k),
+                    make_tuple(kargs.M, k_size),
                     make_tuple(kargs.stride_As[i], 1),
                     number{},
                     number<1>{});
@@ -600,7 +600,7 @@ struct UniversalGemmKernel
             {
                 return make_naive_tensor_view(
                     static_cast(as_ptr[i]),
-                    make_tuple(splitk_batch_offset.splitted_k, kargs.M),
+                    make_tuple(k_size, kargs.M),
                     make_tuple(kargs.stride_As[i], 1),
                     number{},
                     number<1>{});
@@ -617,7 +617,7 @@ struct UniversalGemmKernel
         if constexpr(TilePartitioner::BlockGemmShape::PermuteB)
         {
             constexpr index_t K1 = GemmPipeline::GetSmemPackB();
-            const index_t K0 = splitk_batch_offset.splitted_k / K1;
+            const index_t K0 = k_size / K1;
             constexpr index_t VectorSizeB = std::min(K1, GemmPipeline::GetVectorSizeB());
             const auto b_k0_n_k1_desc =
@@ -638,7 +638,7 @@ struct UniversalGemmKernel
             {
                 return make_naive_tensor_view(
                     bs_ptr[i],
-                    make_tuple(splitk_batch_offset.splitted_k, kargs.N),
+                    make_tuple(k_size, kargs.N),
                     make_tuple(kargs.stride_Bs[i], 1),
                     number{},
                     number<1>{});
@@ -649,7 +649,7 @@ struct UniversalGemmKernel
         if constexpr(TilePartitioner::BlockGemmShape::PermuteB)
         {
             constexpr index_t K1 = GemmPipeline::GetSmemPackB();
-            const index_t K0 = splitk_batch_offset.splitted_k / K1;
+            const index_t K0 = k_size / K1;
             constexpr index_t VectorSizeB = std::min(K1, GemmPipeline::GetVectorSizeB());
             const auto b_k0_n_k1_desc =
@@ -672,7 +672,7 @@ struct UniversalGemmKernel
         {
             index_t kFlatK =
                 GemmPipeline::BlockGemmShape::flatKPerWarp *
-                (splitk_batch_offset.splitted_k /
+                (k_size /
                  TilePartitioner::BlockGemmShape::WarpTile::at(number<2>{}));
             index_t kFlatN = kargs.N * kargs.K / kFlatK;
@@ -687,7 +687,7 @@ struct UniversalGemmKernel
             {
                 return make_naive_tensor_view(
                     bs_ptr[i],
-                    make_tuple(kargs.N, splitk_batch_offset.splitted_k),
+                    make_tuple(kargs.N, k_size),
                     make_tuple(kargs.stride_Bs[i], 1),
                     number{},
                     number<1>{});
@@ -962,7 +962,7 @@ struct UniversalGemmKernel
         // Create Gemm tensor views, pad views and tile windows
         const auto& gemm_tensor_views_tuple = MakeGemmTensorViews(
-            as_ptr, bs_ptr, ds_ptr, e_ptr, kargs, splitk_batch_offset);
+            as_ptr, bs_ptr, ds_ptr, e_ptr, kargs, splitk_batch_offset.splitted_k);
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);
@@ -1018,7 +1018,7 @@ struct UniversalGemmKernel
         // Create Gemm tensor views, pad views and tile windows
         const auto& gemm_tensor_views_tuple = MakeGemmTensorViews(
-            as_ptr, bs_ptr, ds_ptr, e_ptr, kargs, splitk_batch_offset);
+            as_ptr, bs_ptr, ds_ptr, e_ptr, kargs, splitk_batch_offset.splitted_k);
         const auto& gemm_pad_views = MakeGemmPadViews(gemm_tensor_views_tuple);
         auto gemm_tile_windows = MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n);

From a9f1861f58ca5db43a53eaf17e486257e17b5357 Mon Sep 17 00:00:00 2001
From: Emily Martins
Date: Tue, 26 Aug 2025 18:03:00 +0000
Subject: [PATCH 2/9] StreamK Kernel RunGemm Implementation

Stream K cannot simply use UniversalGemmKernel's RunGemm for the following
reasons:

1. The UniversalGemmKernel::RunGemm function computes num_loop based on a
   static function of the TilePartitioner. In contrast, for Stream K, num_loop
   must be computed using a member function (namely GetCurrentIterLength from
   PR #2708).
2. The UniversalGemmKernel::RunGemm function requires the use of a
   SplitKBatchOffset object, which is not used for Stream K since we are not
   using Split K.

Thus, this change adds a RunGemm function in the StreamKKernel class.
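For reference, a minimal sketch of the two call shapes (abbreviated, not the
verbatim kernel code; iter_start/iter_end come from the partitioner's
GetBlockItr):

    // Universal GEMM / Split-K: one loop count per launch, derived from a
    // static TilePartitioner function and the SplitKBatchOffset helper.
    const index_t num_loop = TilePartitioner::GetLoopNum(splitk_batch_offset.splitted_k);

    // Stream-K: the loop count varies per WG and per Stream-K loop iteration,
    // so it must come from a member function on the partitioner instance.
    const index_t num_loop = kargs.tile_partitioner.GetCurrentIterLength(
        iter_start, iter_end, total_iter_length);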
--- .../ops/gemm/kernel/streamk_gemm_kernel.hpp | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp index 77c431e49c..f1a98dc30a 100644 --- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp @@ -171,6 +171,47 @@ struct StreamKKernel host_args.num_sk_blocks}}; } + template + CK_TILE_DEVICE static void + RunGemm(const std::array& as_ptr, + const std::array& bs_ptr, + const std::array& ds_ptr, + CDataType* c_ptr, + void* smem_ptr_0, + const typename UniversalGemmKernel::KernelArgs& kargs, + const index_t num_loop, + const index_t block_idx_m, + const index_t block_idx_n, + const index_t k_size) + { + // Create Gemm tensor views, pad views and tile windows + const auto& gemm_tensor_views_tuple = + UniversalGemmKernel::template MakeGemmTensorViews( + as_ptr, bs_ptr, ds_ptr, c_ptr, kargs, k_size); + + const auto& gemm_pad_views = UniversalGemmKernel::MakeGemmPadViews(gemm_tensor_views_tuple); + auto gemm_tile_windows = + UniversalGemmKernel::MakeGemmTileWindows(gemm_pad_views, block_idx_m, block_idx_n); + + // Run GEMM cooperatively by whole workgroup. + const auto& as_block_window = gemm_tile_windows.at(UniversalGemmKernel::I0); + const auto& bs_block_window = gemm_tile_windows.at(UniversalGemmKernel::I1); + const auto& ds_block_window = gemm_tile_windows.at(UniversalGemmKernel::I2); + + const auto& c_block_tile = GemmPipeline{}(as_block_window[UniversalGemmKernel::I0], + bs_block_window[UniversalGemmKernel::I0], + num_loop, + smem_ptr_0); + + if(UseDefaultScheduler || (get_warp_id() == 0)) + { + // Run Epilogue Pipeline + auto& c_block_window = gemm_tile_windows.at(UniversalGemmKernel::I3); + + EpiloguePipeline{}(c_block_window, c_block_tile, ds_block_window, smem_ptr_0); + } + } + CK_TILE_HOST static bool IsSupportedArgument(const typename UniversalGemmKernel::KernelArgs& kargs) { From 55ea03eddc97ee642db17fcfe83ed5ca87de0985 Mon Sep 17 00:00:00 2001 From: Astha Rai Date: Wed, 27 Aug 2025 20:38:56 +0000 Subject: [PATCH 3/9] initial implementation for operator() for StreamKKernel: adding stream-k algorithm and calls to RunGemm --- .../ops/gemm/kernel/streamk_gemm_kernel.hpp | 54 +++++++++++++++++-- 1 file changed, 51 insertions(+), 3 deletions(-) diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp index f1a98dc30a..212d6904c4 100644 --- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp @@ -240,9 +240,57 @@ struct StreamKKernel kargs.workspace_ptr = workspace_ptr; } - // Temporary placeholder to support the Occupancy() static function. 
-    // Since the Occupancy function uses kentry, this class must have an operator() function
-    CK_TILE_DEVICE void operator()(StreamKKernelArgs /*kargs*/) const {}
+    CK_TILE_DEVICE void operator()(StreamKKernelArgs kargs) const
+    {
+        // allocate LDS
+        __shared__ char smem_ptr_0[UniversalGemmKernel::GetSmemSize()];
+
+        uint32_t block_idx = ck_tile::get_block_1d_id();
+
+        uint32_t iter_start, iter_end;
+        kargs.tile_partitioner.GetBlockItr(block_idx, iter_start, iter_end);
+        uint32_t total_iter_length = iter_end - iter_start;
+
+        while(true)
+        {
+
+            uint32_t current_iter_length =
+                __builtin_amdgcn_readfirstlane(kargs.tile_partitioner.GetCurrentIterLength(
+                    iter_start, iter_end, total_iter_length));
+
+            uint32_t tile_idx, iter_offset;
+            kargs.tile_partitioner.GetTileIdxWithOffset(iter_end - 1, tile_idx, iter_offset);
+            iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1);
+            auto spatial_idx = kargs.tile_partitioner.GetOutputTileIndex(tile_idx);
+
+            index_t i_m = static_cast(spatial_idx[UniversalGemmKernel::I0]);
+            index_t i_n = static_cast(spatial_idx[UniversalGemmKernel::I1]);
+            index_t i_k = static_cast(iter_offset) * TilePartitioner::KPerBlock;
+            index_t k_size = static_cast(current_iter_length * TilePartitioner::KPerBlock);
+
+            const ADataType* a_ptr = static_cast(kargs.as_ptr[0]) + i_k;
+
+            const BDataType* b_ptr = static_cast(kargs.bs_ptr[0]) + i_k;
+
+            CDataType* c_ptr = static_cast(kargs.e_ptr);
+
+            RunGemm({a_ptr},
+                    {b_ptr},
+                    {/*ds_ptr*/},
+                    c_ptr,
+                    smem_ptr_0,
+                    kargs,
+                    current_iter_length,
+                    i_m,
+                    i_n,
+                    k_size);
+
+            iter_end -= current_iter_length;
+            if(iter_end <= iter_start)
+                break;
+            block_sync_lds();
+        }
+    }

     private:
     CK_TILE_HOST static int NumCU()

From 59ca723c83a0d0cbd99d0bb36ee2945164bfbb73 Mon Sep 17 00:00:00 2001
From: Emily Martins
Date: Wed, 3 Sep 2025 22:46:36 +0000
Subject: [PATCH 4/9] Fix indexing and offset issues for StreamK

These changes do the following:
- Ensure offsets along the M and N dimensions are multiplied by MPerBlock or
  NPerBlock, respectively. This ensures tile window origins are at the correct
  locations.
- Fix a bug in the tile partitioner's GetTileIdxWithOffset. Now, we apply
  divmod to the given references to ensure correct values are available to the
  caller.
- Added documentation in the Stream-K operator()
---
 .../ops/gemm/kernel/gemm_tile_partitioner.hpp |  4 +-
 .../ops/gemm/kernel/streamk_gemm_kernel.hpp   | 37 +++++++++++++++----
 2 files changed, 30 insertions(+), 11 deletions(-)

diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
index 92ae6411a5..4f53c20b5b 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
@@ -672,9 +672,7 @@ struct StreamKTilePartitioner
     CK_TILE_DEVICE void
     GetTileIdxWithOffset(uint32_t iter, uint32_t& tile_idx, uint32_t& iter_offset) const noexcept
     {
-        uint32_t tile_idx_val = static_cast(tile_idx);
-        uint32_t iter_offset_val = static_cast(iter_offset);
-        k_iters_per_tile.divmod(iter, tile_idx_val, iter_offset_val);
+        k_iters_per_tile.divmod(iter, tile_idx, iter_offset);
     }

     /**
diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
index 212d6904c4..f1e0a7c057 100644
--- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
@@ -240,40 +240,60 @@ struct StreamKKernel
         kargs.workspace_ptr = workspace_ptr;
     }

+    /// @brief Entry point for the Stream-K Kernel, performing the main Stream-K loop.
     CK_TILE_DEVICE void operator()(StreamKKernelArgs kargs) const
     {
-        // allocate LDS
+        // Allocate LDS
         __shared__ char smem_ptr_0[UniversalGemmKernel::GetSmemSize()];

         uint32_t block_idx = ck_tile::get_block_1d_id();

+        // Determine the K offset of the first and final macro tile in the A and B tensors along the
+        // K dimension.
         uint32_t iter_start, iter_end;
         kargs.tile_partitioner.GetBlockItr(block_idx, iter_start, iter_end);
+
+        // An "iteration" denotes the multiplication of one macro tile in A with a macro tile in B.
+        // The total iteration length is the total number of such multiplications this WG performs.
         uint32_t total_iter_length = iter_end - iter_start;

+        // Main Stream-K loop
         while(true)
         {
-
+            // Determine the number of macro tiles in A and B this WG is responsible for in the
+            // current C macro tile.
             uint32_t current_iter_length =
                 __builtin_amdgcn_readfirstlane(kargs.tile_partitioner.GetCurrentIterLength(
                     iter_start, iter_end, total_iter_length));

+            // Determine the 1D tile_idx and the iter_offset for this WG.
+            // The tile_idx is the 1D macro tile index in the C tensor.
+            // The iter_offset is the starting macro tile index in the K dimension for the WG in the
+            // current iteration of the while loop.
             uint32_t tile_idx, iter_offset;
             kargs.tile_partitioner.GetTileIdxWithOffset(iter_end - 1, tile_idx, iter_offset);
             iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1);
+
+            // Get the 2D tile index in the C tensor for this WG using the 1D index (i.e. tile_idx)
            auto spatial_idx = kargs.tile_partitioner.GetOutputTileIndex(tile_idx);

-            index_t i_m = static_cast(spatial_idx[UniversalGemmKernel::I0]);
-            index_t i_n = static_cast(spatial_idx[UniversalGemmKernel::I1]);
-            index_t i_k = static_cast(iter_offset) * TilePartitioner::KPerBlock;
+            // Get the offsets in A, B, C tensors.
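+            // Worked example (hypothetical numbers, assuming the 128x128x32
+            // macro tile used by the tests added later in this series): with
+            // spatial_idx = (2, 3) and iter_offset = 5, the window origins are
+            // i_m = 2 * 128 = 256, i_n = 3 * 128 = 384, and the K offset is
+            // i_k = 5 * 32 = 160 elements into A and B.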
+ index_t i_m = static_cast(spatial_idx[UniversalGemmKernel::I0] * + TilePartitioner::MPerBlock); + index_t i_n = static_cast(spatial_idx[UniversalGemmKernel::I1] * + TilePartitioner::NPerBlock); + index_t i_k = static_cast(iter_offset) * TilePartitioner::KPerBlock; + + // Determine the total size along the K dimension the WG is using in this iteration + // (used to construct tensor views). index_t k_size = static_cast(current_iter_length * TilePartitioner::KPerBlock); + // Update pointer offsets for A, B, and C. const ADataType* a_ptr = static_cast(kargs.as_ptr[0]) + i_k; - const BDataType* b_ptr = static_cast(kargs.bs_ptr[0]) + i_k; + CDataType* c_ptr = static_cast(kargs.e_ptr); - CDataType* c_ptr = static_cast(kargs.e_ptr); - + // Run the GEMM pipeline and Epilogue. RunGemm({a_ptr}, {b_ptr}, {/*ds_ptr*/}, @@ -285,6 +305,7 @@ struct StreamKKernel i_n, k_size); + // Prepare for next Stream-K loop iteration. iter_end -= current_iter_length; if(iter_end <= iter_start) break; From 8d976cf532a22d54d0b42da0fab69f3e7d658e19 Mon Sep 17 00:00:00 2001 From: Emily Martins Date: Thu, 4 Sep 2025 00:05:11 +0000 Subject: [PATCH 5/9] Initial gtests for Stream-K These changes add an initial gtest suite for the CK Tile Stream-K kernel. Currently, due to bugs in the StreamKTilePartitioner (which will be handled in a future PR), there are validation issues for certain cases which may differ on different architectures. Thus, we opted to run cases that are only fully data-parallel (skipping others). A guard was added to Stream-K's IsSupportedArgument method to ensure that callers are aware of this constraint. Additionally, to ensure testing reproducibility, options for setting the number of CUs and occupancy were added to MakeKernelArgs. --- .../ops/gemm/kernel/streamk_gemm_kernel.hpp | 30 +- test/ck_tile/CMakeLists.txt | 1 + test/ck_tile/gemm_streamk/CMakeLists.txt | 6 + .../gemm_streamk/test_gemm_streamk.cpp | 14 + .../gemm_streamk/test_gemm_streamk_cases.inc | 109 ++++++ .../gemm_streamk/test_gemm_streamk_types.hpp | 25 ++ .../gemm_streamk/test_gemm_streamk_util.hpp | 316 ++++++++++++++++++ 7 files changed, 493 insertions(+), 8 deletions(-) create mode 100644 test/ck_tile/gemm_streamk/CMakeLists.txt create mode 100644 test/ck_tile/gemm_streamk/test_gemm_streamk.cpp create mode 100644 test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc create mode 100644 test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp create mode 100644 test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp index f1e0a7c057..69d2e4b511 100644 --- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp @@ -141,11 +141,17 @@ struct StreamKKernel return UniversalGemmKernel::BlockSize(); } - CK_TILE_HOST static StreamKKernelArgs MakeKernelArgs(const StreamKHostArgs& host_args) + /// @brief Constructs kernel arguments for the Stream-K kernel. + /// @param host_args Stream-K host arguments. + /// @param num_cu Number of compute units (CUs). The default is the number of CUs on the device. + /// The caller may select their own to assist with test reproducibility, etc. + /// @param occupancy The maximum number of active blocks per CU for this kernel. The caller may + /// select their own to assist with test reproducibility, etc. + /// @return The kernel arguments for Stream-K. 
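+    ///
+    /// Hypothetical usage (mirroring the tests added later in this series),
+    /// pinning both knobs for reproducibility:
+    ///   auto kargs = Kernel::MakeKernelArgs(host_args, /*num_cu=*/104, /*occupancy=*/1);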
+ CK_TILE_HOST static StreamKKernelArgs MakeKernelArgs(const StreamKHostArgs& host_args, + int num_cu = NumCU(), + int occupancy = Occupancy()) { - uint32_t occupancy = static_cast(Occupancy()); - uint32_t num_cu = static_cast(NumCU()); - return StreamKKernelArgs{{host_args.as_ptr, host_args.bs_ptr, host_args.ds_ptr, @@ -166,8 +172,8 @@ struct StreamKKernel TilePartitioner{static_cast(host_args.M), static_cast(host_args.N), static_cast(host_args.K), - num_cu, - occupancy, + static_cast(num_cu), + static_cast(occupancy), host_args.num_sk_blocks}}; } @@ -212,9 +218,17 @@ struct StreamKKernel } } - CK_TILE_HOST static bool - IsSupportedArgument(const typename UniversalGemmKernel::KernelArgs& kargs) + CK_TILE_HOST static bool IsSupportedArgument(const StreamKKernelArgs& kargs) { + if(kargs.tile_partitioner.sk_num_blocks != 0) + { + if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING))) + { + CK_TILE_ERROR("CK Tile Stream-K currently only supports 0 SK blocks (i.e., " + "data-parallel only)."); + } + return false; + } return UniversalGemmKernel::IsSupportedArgument(kargs); } diff --git a/test/ck_tile/CMakeLists.txt b/test/ck_tile/CMakeLists.txt index 993df2ec40..32230bbce2 100644 --- a/test/ck_tile/CMakeLists.txt +++ b/test/ck_tile/CMakeLists.txt @@ -4,6 +4,7 @@ add_subdirectory(gemm_weight_preshuffle) add_subdirectory(batched_gemm) add_subdirectory(grouped_gemm) add_subdirectory(gemm_multi_d) +add_subdirectory(gemm_streamk) add_subdirectory(data_type) add_subdirectory(container) add_subdirectory(elementwise) diff --git a/test/ck_tile/gemm_streamk/CMakeLists.txt b/test/ck_tile/gemm_streamk/CMakeLists.txt new file mode 100644 index 0000000000..07b6ed3d63 --- /dev/null +++ b/test/ck_tile/gemm_streamk/CMakeLists.txt @@ -0,0 +1,6 @@ +# Currently test_ck_tile_streamk is only built on gfx9 +if(GPU_TARGETS MATCHES "gfx9") + add_gtest_executable(test_ck_tile_streamk test_gemm_streamk.cpp) +else() + message(DEBUG "Skipping test_ck_tile_streamk tests for current target") +endif() diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk.cpp b/test/ck_tile/gemm_streamk/test_gemm_streamk.cpp new file mode 100644 index 0000000000..99c3fb397f --- /dev/null +++ b/test/ck_tile/gemm_streamk/test_gemm_streamk.cpp @@ -0,0 +1,14 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include "test_gemm_streamk_types.hpp" +#include "test_gemm_streamk_util.hpp" +#include "gtest/gtest.h" + +#define TEST_SUITE_NAME TestCkTileStreamK + +TYPED_TEST_SUITE(TestCkTileStreamK, KernelTypesStreamK); + +#include "test_gemm_streamk_cases.inc" + +#undef TEST_SUITE_NAME diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc b/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc new file mode 100644 index 0000000000..fba3c1ae04 --- /dev/null +++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc @@ -0,0 +1,109 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#pragma once + +#ifndef TEST_STREAM_K_CASES_INC +#define TEST_STREAM_K_CASES_INC + +TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_DP) +{ + + ck_tile::index_t M = 256; + ck_tile::index_t N = 256; + ck_tile::index_t K = 256; + uint32_t num_sk_blocks = 0; + + this->Run(M, N, K, num_sk_blocks); +} + +TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_SKBlocks4) +{ + + ck_tile::index_t M = 256; + ck_tile::index_t N = 256; + ck_tile::index_t K = 256; + uint32_t num_sk_blocks = 4; + + this->Run(M, N, K, num_sk_blocks); +} + +TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_SKBlocks8) +{ + + ck_tile::index_t M = 256; + ck_tile::index_t N = 256; + ck_tile::index_t K = 256; + uint32_t num_sk_blocks = 8; + + this->Run(M, N, K, num_sk_blocks); +} + +TYPED_TEST(TEST_SUITE_NAME, StreamK_M512_N512_K512_DP) +{ + + ck_tile::index_t M = 512; + ck_tile::index_t N = 512; + ck_tile::index_t K = 512; + uint32_t num_sk_blocks = 0; + + this->Run(M, N, K, num_sk_blocks); +} + +TYPED_TEST(TEST_SUITE_NAME, StreamK_M512_N512_K512_SKBlocks16) +{ + + ck_tile::index_t M = 512; + ck_tile::index_t N = 512; + ck_tile::index_t K = 512; + uint32_t num_sk_blocks = 16; + + this->Run(M, N, K, num_sk_blocks); +} + +TYPED_TEST(TEST_SUITE_NAME, StreamK_M512_N512_K512_SKBlocks8) +{ + + ck_tile::index_t M = 512; + ck_tile::index_t N = 512; + ck_tile::index_t K = 512; + uint32_t num_sk_blocks = 8; + + this->Run(M, N, K, num_sk_blocks); +} + +TYPED_TEST(TEST_SUITE_NAME, StreamK_M3840_N4096_K4096_DP) +{ + + ck_tile::index_t M = 3840; + ck_tile::index_t N = 4096; + ck_tile::index_t K = 4096; + uint32_t num_sk_blocks = 0; + + this->Run(M, N, K, num_sk_blocks); +} + +TYPED_TEST(TEST_SUITE_NAME, StreamK_M3840_N4096_K4096_SKBlocks960) +{ + + ck_tile::index_t M = 3840; + ck_tile::index_t N = 4096; + ck_tile::index_t K = 4096; + uint32_t num_sk_blocks = 960; + + this->Run(M, N, K, num_sk_blocks); +} + +TYPED_TEST(TEST_SUITE_NAME, StreamK_Unsupported_Reduction) +{ + + ck_tile::index_t M = 3840; + ck_tile::index_t N = 4096; + ck_tile::index_t K = 4096; + uint32_t num_sk_blocks = 64; + + EXPECT_THROW(this->Run(M, N, K, num_sk_blocks, ck_tile::StreamKReductionStrategy::Reduction), + std::runtime_error); +} + +#endif diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp new file mode 100644 index 0000000000..399f3f11e8 --- /dev/null +++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_types.hpp @@ -0,0 +1,25 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. +// SPDX-License-Identifier: MIT + +#include +#include + +#include "gtest/gtest.h" + +#include "ck_tile/host.hpp" + +using F16 = ck_tile::half_t; +using F32 = float; +using BF16 = ck_tile::bf16_t; + +using Row = ck_tile::tensor_layout::gemm::RowMajor; +using Col = ck_tile::tensor_layout::gemm::ColumnMajor; + +// clang-format off +using KernelTypesStreamK = ::testing::Types< +// ALayout BLayout CLayout ADataType BDataType AccDataType CDataType + std::tuple< Row, Col, Row, F16, F16, F32, F16>, + std::tuple< Row, Col, Row, BF16, BF16, F32, BF16> +>; + +// clang-format on diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp new file mode 100644 index 0000000000..3ebc8449aa --- /dev/null +++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp @@ -0,0 +1,316 @@ +// Copyright © Advanced Micro Devices, Inc., or its affiliates. 
+// SPDX-License-Identifier: MIT + +#include +#include +#include +#include +#include + +#include "ck_tile/host.hpp" +#include "ck_tile/ops/epilogue.hpp" +#include "ck_tile/ops/gemm.hpp" + +template +auto calculate_rtol_atol(const ck_tile::index_t K, + const ck_tile::index_t kbatch, + const float max_accumulated_value) +{ + using ComputeType = + std::conditional_t; + // Calculate thresholds + const auto rtol = ck_tile::get_relative_threshold( + ck_tile::integer_divide_ceil(K, kbatch)); + const auto atol = ck_tile::get_absolute_threshold( + max_accumulated_value / kbatch, ck_tile::integer_divide_ceil(K, kbatch)); + + // The logic below may need to become more advanced once bugs in Stream-K Tile Partitioner are + // resolved. Because the number of WGs contributing to a macro tile in C may not be the same for + // all macro tiles in C. + + // Calculate error due to more than 1 WG contributing to the same macro tile in C + const auto rtol_split_k = + ck_tile::get_relative_threshold(kbatch); + const auto atol_split_k = ck_tile::get_absolute_threshold( + max_accumulated_value, kbatch); + // Use higher threshold + return ck_tile::make_tuple(std::max(rtol, rtol_split_k), std::max(atol, atol_split_k)); +} + +template +class TestCkTileStreamK : public ::testing::Test +{ + protected: + using ALayout = std::tuple_element_t<0, Tuple>; + using BLayout = std::tuple_element_t<1, Tuple>; + using CLayout = std::tuple_element_t<2, Tuple>; + using ADataType = std::tuple_element_t<3, Tuple>; + using BDataType = std::tuple_element_t<4, Tuple>; + using AccDataType = std::tuple_element_t<5, Tuple>; + using CDataType = std::tuple_element_t<6, Tuple>; + using DsLayout = ck_tile::tuple<>; + using DsDataType = ck_tile::tuple<>; + + template + void invoke_streamk(const ck_tile::StreamKHostArgs& args, + const ck_tile::stream_config& s, + int num_cu, + int occupancy) + { + + constexpr ck_tile::index_t M_Tile = 128; + constexpr ck_tile::index_t N_Tile = 128; + constexpr ck_tile::index_t K_Tile = 32; + + constexpr ck_tile::index_t M_Warp = 2; + constexpr ck_tile::index_t N_Warp = 2; + constexpr ck_tile::index_t K_Warp = 1; + + constexpr ck_tile::index_t M_Warp_Tile = 32; + constexpr ck_tile::index_t N_Warp_Tile = 32; + constexpr ck_tile::index_t K_Warp_Tile = 16; + + constexpr bool kPadM = PadM; + constexpr bool kPadN = PadN; + constexpr bool kPadK = PadK; + constexpr bool preshuffle = Preshuffle; + + constexpr bool DoubleSmemBuffer = false; + constexpr int kBlockPerCu = 1; + constexpr bool StructuredSparsity = false; + constexpr bool NumWaveGroup = 1; + + using GemmShape = + ck_tile::TileGemmShape, + ck_tile::sequence, + ck_tile::sequence>; + + using TilePartitioner = ck_tile::StreamKTilePartitioner; + + using Traits = ck_tile::TileGemmTraits; + + using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits; + // For initial testing, we will just test with one pipeline + // More extensive testing is coming later and will test other pipelines + using GemmPipelineProblem = + ck_tile::GemmPipelineProblem; + + using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem; + + const ck_tile::index_t k_grain = args.k_batch * K_Tile; + const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * K_Tile; + const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); + const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); + const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); + + const auto Run = [&](const auto has_hot_loop_, + const auto tail_number_, + const auto 
memory_operation_) {
+            constexpr bool has_hot_loop_v = has_hot_loop_.value;
+            constexpr auto tail_number_v = tail_number_.value;
+            constexpr auto memory_operation = memory_operation_.value;
+            constexpr auto scheduler = ck_tile::GemmPipelineScheduler::Intrawave;
+
+            using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem;
+
+            using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem;
+
+            using GemmEpilogue = ck_tile::CShuffleEpilogue<
+                ck_tile::CShuffleEpilogueProblem,
+                AccDataType,
+                CDataType,
+                ck_tile::tuple<>,
+                CLayout,
+                ck_tile::element_wise::PassThrough,
+                TilePartitioner::MPerBlock,
+                TilePartitioner::NPerBlock,
+                M_Warp,
+                N_Warp,
+                M_Warp_Tile,
+                N_Warp_Tile,
+                K_Warp_Tile,
+                UniversalGemmProblem::TransposeC,
+                memory_operation>>;
+
+            using Kernel = ck_tile::StreamKKernel;
+
+            auto kargs = Kernel::MakeKernelArgs(args, num_cu, occupancy);
+
+            if(!Kernel::IsSupportedArgument(kargs))
+            {
+                EXPECT_TRUE(false);
+            }
+
+            dim3 grid_dims = Kernel::GridSize(kargs.tile_partitioner);
+            dim3 block_dims = Kernel::BlockSize();
+
+            ck_tile::launch_kernel(
+                s, ck_tile::make_kernel(Kernel{}, grid_dims, block_dims, 0, kargs));
+        };
+
+        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
+            if(args.k_batch == 1)
+            {
+                Run(has_hot_loop_,
+                    tail_number_,
+                    ck_tile::integral_constant{});
+            }
+            else
+            {
+                std::cout << "Stream K is only run with k_batch of 1" << std::endl;
+                EXPECT_TRUE(false);
+            }
+        };
+
+        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+    }
+
+    public:
+    // Since Stream-K is built on gfx9, the lower bound for CUs is 104. Thus, we default num_cu to
+    // 104 and occupancy to 1 to ensure tests are reproducible on different architectures.
+    void Run(ck_tile::index_t M,
+             ck_tile::index_t N,
+             ck_tile::index_t K,
+             uint32_t num_sk_blocks = 0xffffffff,
+             ck_tile::StreamKReductionStrategy reduction_strategy =
+                 ck_tile::StreamKReductionStrategy::Atomic,
+             int occupancy = 1,
+             int num_cu = 104,
+             ck_tile::index_t stride_A = 0,
+             ck_tile::index_t stride_B = 0,
+             ck_tile::index_t stride_C = 0)
+    {
+
+        using namespace ck_tile::literals;
+
+        if(reduction_strategy == ck_tile::StreamKReductionStrategy::Reduction)
+        {
+            throw std::runtime_error("Reduction Strategy is currently unsupported!\n");
+        }
+
+        if(num_sk_blocks != 0)
+        {
+            GTEST_SKIP() << "CK Tile Stream K currently only supports DP.";
+        }
+
+        auto f_host_tensor_descriptor = [](std::size_t row,
+                                           std::size_t col,
+                                           std::size_t stride,
+                                           auto layout) {
+            if constexpr(std::is_same_v)
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {stride, 1_uz});
+            }
+            else
+            {
+                return ck_tile::HostTensorDescriptor({row, col}, {1_uz, stride});
+            }
+        };
+
+        auto f_get_default_stride =
+            [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
+                if(stride == 0)
+                {
+                    if constexpr(std::is_same_v)
+                    {
+                        return col;
+                    }
+                    else
+                    {
+                        return row;
+                    }
+                }
+                else
+                    return stride;
+            };
+
+        stride_A = f_get_default_stride(M, K, stride_A, ALayout{});
+        stride_B = f_get_default_stride(K, N, stride_B, BLayout{});
+        stride_C = f_get_default_stride(M, N, stride_C, CLayout{});
+
+        ck_tile::HostTensor a_m_k(f_host_tensor_descriptor(M, K, stride_A, ALayout{}));
+        ck_tile::HostTensor b_k_n(f_host_tensor_descriptor(K, N, stride_B, BLayout{}));
+        ck_tile::HostTensor c_m_n_dev_result(
+            f_host_tensor_descriptor(M, N, stride_C, CLayout{}));
+
+        ck_tile::FillUniformDistributionIntegerValue{-5, 5, /*seed*/ 11939}(a_m_k);
+        ck_tile::FillUniformDistributionIntegerValue{-5, 5, /*seed*/ 11940}(b_k_n);
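+
+        // Note: the integer-valued fills above keep every A*B partial product
+        // exactly representable, which minimizes rounding differences between
+        // the device result (accumulated by atomics in an unspecified WG
+        // order) and the host reference computed below.
+
+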
ck_tile::DeviceMem a_m_k_dev_buf(a_m_k.get_element_space_size_in_bytes()); + ck_tile::DeviceMem b_k_n_dev_buf(b_k_n.get_element_space_size_in_bytes()); + ck_tile::DeviceMem c_m_n_dev_buf(c_m_n_dev_result.get_element_space_size_in_bytes()); + + a_m_k_dev_buf.ToDevice(a_m_k.data()); + b_k_n_dev_buf.ToDevice(b_k_n.data()); + c_m_n_dev_buf.SetZero(); + c_m_n_dev_result.SetZero(); + + ck_tile::StreamKHostArgs args{a_m_k_dev_buf.GetDeviceBuffer(), + b_k_n_dev_buf.GetDeviceBuffer(), + c_m_n_dev_buf.GetDeviceBuffer(), + M, + N, + K, + stride_A, + stride_B, + stride_C, + reduction_strategy, + num_sk_blocks}; + + invoke_streamk( + args, ck_tile::stream_config{nullptr, false, 0, 0, 1}, num_cu, occupancy); + + c_m_n_dev_buf.FromDevice(c_m_n_dev_result.data()); + + ck_tile::HostTensor c_m_n_host_ref( + f_host_tensor_descriptor(M, N, stride_C, CLayout{})); + c_m_n_host_ref.SetZero(); + + ck_tile::reference_gemm( + a_m_k, b_k_n, c_m_n_host_ref); + + const float max_accumulated_value = + *std::max_element(c_m_n_host_ref.mData.begin(), c_m_n_host_ref.mData.end()); + const auto rtol_atol = calculate_rtol_atol( + K, /*kbatch*/ 1, max_accumulated_value); + + bool pass = ck_tile::check_err(c_m_n_dev_result, + c_m_n_host_ref, + "Error: Incorrect results!", + rtol_atol.at(ck_tile::number<0>{}), + rtol_atol.at(ck_tile::number<1>{})); + + EXPECT_TRUE(pass); + }; +}; From 087571223379d7e829ce26ea535c1f6f593edea8 Mon Sep 17 00:00:00 2001 From: Emily Martins Date: Tue, 9 Sep 2025 17:48:29 +0000 Subject: [PATCH 6/9] Use GemmPipeline operator() variant that takes hot loop and tail num In Stream-K, the num_loop value varies per WG and per iteration of a Stream-K loop. So instead, we use the version of the GemmPipeline's operator() function that takes in has_hot_loop and tail_num. This is similar to what is done in Grouped GEMM. --- .../ops/gemm/kernel/streamk_gemm_kernel.hpp | 9 +++ .../gemm_streamk/test_gemm_streamk_util.hpp | 59 +++++-------------- 2 files changed, 24 insertions(+), 44 deletions(-) diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp index 69d2e4b511..913f55da17 100644 --- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp +++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp @@ -204,9 +204,18 @@ struct StreamKKernel const auto& bs_block_window = gemm_tile_windows.at(UniversalGemmKernel::I1); const auto& ds_block_window = gemm_tile_windows.at(UniversalGemmKernel::I2); + // Since num_loop can vary per WG and per iteration of the Stream-K while loop, we compute + // has_hot_loop and tail_num here. This is a similar pattern used by grouped GEMM. In this + // case, we call the GemmPipeline's operator() function that takes both has_hot_loop and + // tail_num. 
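+        // Illustrative example (hypothetical numbers): with KPerBlock = 32, a
+        // data-parallel slice covering the full K = 4096 gives num_loop = 128
+        // and takes the hot-loop path, while a short Stream-K remainder slice
+        // with num_loop = 2 can be served entirely by the tail handling.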
+ const bool has_hot_loop = GemmPipeline::BlockHasHotloop(num_loop); + const TailNumber tail_num = GemmPipeline::GetBlockLoopTailNum(num_loop); + const auto& c_block_tile = GemmPipeline{}(as_block_window[UniversalGemmKernel::I0], bs_block_window[UniversalGemmKernel::I0], num_loop, + has_hot_loop, + tail_num, smem_ptr_0); if(UseDefaultScheduler || (get_warp_id() == 0)) diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp index 3ebc8449aa..17eb1a31e1 100644 --- a/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp +++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp @@ -92,8 +92,6 @@ class TestCkTileStreamK : public ::testing::Test using TilePartitioner = ck_tile::StreamKTilePartitioner; - using Traits = ck_tile::TileGemmTraits; - using GemmUniversalTraits = ck_tile::TileGemmUniversalTraits; - // For initial testing, we will just test with one pipeline - // More extensive testing is coming later and will test other pipelines - using GemmPipelineProblem = - ck_tile::GemmPipelineProblem; - - using BaseGemmPipeline = ck_tile::BaseGemmPipelineAgBgCrMem; - - const ck_tile::index_t k_grain = args.k_batch * K_Tile; - const ck_tile::index_t K_split = (args.K + k_grain - 1) / k_grain * K_Tile; - const ck_tile::index_t num_loop = TilePartitioner::GetLoopNum(K_split); - const bool has_hot_loop = BaseGemmPipeline::BlockHasHotloop(num_loop); - const ck_tile::TailNumber tail_num = BaseGemmPipeline::GetBlockLoopTailNum(num_loop); - - const auto Run = [&](const auto has_hot_loop_, - const auto tail_number_, - const auto memory_operation_) { - constexpr bool has_hot_loop_v = has_hot_loop_.value; - constexpr auto tail_number_v = tail_number_.value; + + const auto Run = [&](const auto memory_operation_) { constexpr auto memory_operation = memory_operation_.value; constexpr auto scheduler = ck_tile::GemmPipelineScheduler::Intrawave; + // We create the GEMM pipeline without specifying has_hot_loop or tail_num. + // This is because num_loop can vary (a) per WG and (b) per iteration of the Stream-K + // while loop. Instead, has_hot_loop and tail_num are determined in the Stream-K + // Kernel's RunGemm function. This is a similar pattern used by grouped GEMM. using UniversalGemmProblem = ck_tile::UniversalGemmPipelineProblem; - + scheduler>; + // For initial testing, we will just test with one pipeline. + // More extensive testing is coming later and will test other pipelines. 
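+            // Other ck_tile pipelines (e.g., a compute-bound variant such as
+            // GemmPipelineAgBgCrCompV3, assuming the usual naming) should drop
+            // in here by swapping only the type alias below.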
            using GemmPipeline = ck_tile::GemmPipelineAgBgCrMem;

            using GemmEpilogue = ck_tile::CShuffleEpilogue<
@@ -173,26 +158,12 @@ class TestCkTileStreamK : public ::testing::Test
                s, ck_tile::make_kernel(Kernel{}, grid_dims, block_dims, 0, kargs));
        };

-        const auto RunSplitk = [&](const auto has_hot_loop_, const auto tail_number_) {
-            if(args.k_batch == 1)
-            {
-                Run(has_hot_loop_,
-                    tail_number_,
-                    ck_tile::integral_constant{});
-            }
-            else
-            {
-                std::cout << "Stream K is only run with k_batch of 1" << std::endl;
-                EXPECT_TRUE(false);
-            }
-        };
-
-        BaseGemmPipeline::TailHandler(RunSplitk, has_hot_loop, tail_num);
+        Run(ck_tile::integral_constant{});
    }

    public:

From 90491b0f20dc14a97637af94aadf96f26059cd03 Mon Sep 17 00:00:00 2001
From: Astha Rai
Date: Fri, 12 Sep 2025 18:35:41 +0000
Subject: [PATCH 7/9] changes from review: comments, move readfirstlane, remove ifndef
---
 include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp | 9 ++++-----
 test/ck_tile/gemm_streamk/CMakeLists.txt                | 1 +
 test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc   | 5 -----
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
index 913f55da17..db019819fd 100644
--- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
@@ -278,16 +278,15 @@ struct StreamKKernel
         // An "iteration" denotes the multiplication of one macro tile in A with a macro tile in B.
         // The total iteration length is the total number of such multiplications this WG performs.
-        uint32_t total_iter_length = iter_end - iter_start;
+        uint32_t total_iter_length = __builtin_amdgcn_readfirstlane(iter_end - iter_start);

         // Main Stream-K loop
         while(true)
         {
             // Determine the number of macro tiles in A and B this WG is responsible for in the
             // current C macro tile.
-            uint32_t current_iter_length =
-                __builtin_amdgcn_readfirstlane(kargs.tile_partitioner.GetCurrentIterLength(
-                    iter_start, iter_end, total_iter_length));
+            uint32_t current_iter_length = kargs.tile_partitioner.GetCurrentIterLength(
+                iter_start, iter_end, total_iter_length);

             // Determine the 1D tile_idx and the iter_offset for this WG.
             // The tile_idx is the 1D macro tile index in the C tensor.
             // The iter_offset is the starting macro tile index in the K dimension for the WG in the
             // current iteration of the while loop.
             uint32_t tile_idx, iter_offset;
             kargs.tile_partitioner.GetTileIdxWithOffset(iter_end - 1, tile_idx, iter_offset);
-            iter_offset = __builtin_amdgcn_readfirstlane(iter_offset - current_iter_length + 1);
+            iter_offset = iter_offset - current_iter_length + 1;
             // Get the 2D tile index in the C tensor for this WG using the 1D index (i.e. tile_idx)
             auto spatial_idx = kargs.tile_partitioner.GetOutputTileIndex(tile_idx);

diff --git a/test/ck_tile/gemm_streamk/CMakeLists.txt b/test/ck_tile/gemm_streamk/CMakeLists.txt
index 07b6ed3d63..e00874ba07 100644
--- a/test/ck_tile/gemm_streamk/CMakeLists.txt
+++ b/test/ck_tile/gemm_streamk/CMakeLists.txt
@@ -1,5 +1,6 @@
 # Currently test_ck_tile_streamk is only built on gfx9
 if(GPU_TARGETS MATCHES "gfx9")
+    # TODO: support all arches
     add_gtest_executable(test_ck_tile_streamk test_gemm_streamk.cpp)
 else()
     message(DEBUG "Skipping test_ck_tile_streamk tests for current target")
diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc b/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc
index fba3c1ae04..2fd73b882b 100644
--- a/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc
+++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc
@@ -3,9 +3,6 @@

 #pragma once

-#ifndef TEST_STREAM_K_CASES_INC
-#define TEST_STREAM_K_CASES_INC
-
 TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_DP)
 {
@@ -105,5 +102,3 @@ TYPED_TEST(TEST_SUITE_NAME, StreamK_Unsupported_Reduction)
     EXPECT_THROW(this->Run(M, N, K, num_sk_blocks, ck_tile::StreamKReductionStrategy::Reduction),
                  std::runtime_error);
 }
-
-#endif

From 81d59d96f1d93b10a25f8f2b0ebc1c0924008fc7 Mon Sep 17 00:00:00 2001
From: Emily Martins
Date: Fri, 12 Sep 2025 19:05:06 +0000
Subject: [PATCH 8/9] Switch direction of C tensor traversal & add padding
 guard

Prior to this change, WGs travelled backwards through their assigned macro
tiles in the C tensor. For instance, if WG0 is responsible for C tiles 0 and
1, it would first visit tile 1 then tile 0. This means that iter_end
decrements in each iteration of the stream-K while loop. Since we are working
with unsigned integers, the subtraction operation may not be safe. Thus, this
change makes it such that WGs travel forward, so that their iter_start is
incremented and their iter_end remains fixed.

Additionally, we added a guard against WGs that are neither sk_blocks nor
dp_blocks to ensure such WGs do not participate in the GEMM.

Together, these changes make it such that the algorithm is correct when
sk_blocks is greater than zero.
---
 .../ops/gemm/kernel/gemm_tile_partitioner.hpp | 15 +++++------
 .../ops/gemm/kernel/streamk_gemm_kernel.hpp   | 27 ++++++++++---------
 .../gemm_streamk/test_gemm_streamk_cases.inc  | 15 +++++++++--
 .../gemm_streamk/test_gemm_streamk_util.hpp   |  5 ----
 4 files changed, 34 insertions(+), 28 deletions(-)

diff --git a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
index 4f53c20b5b..a891d4df55 100644
--- a/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
+++ b/include/ck_tile/ops/gemm/kernel/gemm_tile_partitioner.hpp
@@ -646,16 +646,13 @@ struct StreamKTilePartitioner
      * @brief Get length of loop iterations for stream-k loop
      */
     CK_TILE_DEVICE uint32_t GetCurrentIterLength(uint32_t iter_start,
-                                                 uint32_t iter_end,
-                                                 uint32_t total_iter_length) const noexcept
+                                                 uint32_t iter_end) const noexcept
     {
-        uint32_t iter_length_mod, iter_length_quo /*unused*/;
-        k_iters_per_tile.divmod(iter_end, iter_length_quo, iter_length_mod);
-        uint32_t total_iter_length_val = static_cast(total_iter_length);
-        uint32_t current_iter_length =
-            min(iter_length_mod == 0 ? (iter_end - iter_start) : iter_length_mod,
-                total_iter_length_val);
-        return current_iter_length;
+        // A WG's iter_end is either in the current C macro tile or not.
+        // If it is not, then the macro tile boundary is where the WG must stop.
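+        // Worked example (hypothetical numbers): with k_iters_per_tile = 8 and
+        // iter_start = 13, the distance to the boundary is 8 - (13 % 8) = 3.
+        // With iter_end = 20, this slice covers min(13 + 3, 20) - 13 = 3
+        // iterations (13..15) and finishes the current tile; with iter_end =
+        // 15 it covers only 2 and stops at the WG's own end.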
+        uint32_t distance_to_tile_boundary =
+            k_iters_per_tile.get() - (iter_start % k_iters_per_tile.get());
+        return min(iter_start + distance_to_tile_boundary, iter_end) - iter_start;
     }

     /**
diff --git a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
index db019819fd..5df1f092d7 100644
--- a/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
+++ b/include/ck_tile/ops/gemm/kernel/streamk_gemm_kernel.hpp
@@ -229,12 +229,11 @@ struct StreamKKernel
     CK_TILE_HOST static bool IsSupportedArgument(const StreamKKernelArgs& kargs)
     {
-        if(kargs.tile_partitioner.sk_num_blocks != 0)
+        if(kargs.reduction_strategy == StreamKReductionStrategy::Reduction)
         {
             if(ck_tile::EnvIsEnabled(CK_TILE_ENV(CK_TILE_LOGGING)))
             {
-                CK_TILE_ERROR("CK Tile Stream-K currently only supports 0 SK blocks (i.e., "
-                              "data-parallel only).");
+                CK_TILE_ERROR("CK Tile Stream-K only supports the atomic reduction strategy.");
             }
             return false;
         }
@@ -271,30 +270,34 @@ struct StreamKKernel

         uint32_t block_idx = ck_tile::get_block_1d_id();

+        bool is_padding_block =
+            __builtin_amdgcn_readfirstlane(block_idx >= kargs.tile_partitioner.sk_num_blocks &&
+                                           block_idx < kargs.tile_partitioner.dp_start_block_idx);
+
+        // Padding blocks make it such that the DP blocks are aligned with the number of CUs; they
+        // should not partake in the GEMM
+        if(is_padding_block)
+            return;
+
         // Determine the K offset of the first and final macro tile in the A and B tensors along the
         // K dimension.
         uint32_t iter_start, iter_end;
         kargs.tile_partitioner.GetBlockItr(block_idx, iter_start, iter_end);

-        // An "iteration" denotes the multiplication of one macro tile in A with a macro tile in B.
-        // The total iteration length is the total number of such multiplications this WG performs.
-        uint32_t total_iter_length = __builtin_amdgcn_readfirstlane(iter_end - iter_start);
-
         // Main Stream-K loop
         while(true)
         {
             // Determine the number of macro tiles in A and B this WG is responsible for in the
             // current C macro tile.
-            uint32_t current_iter_length = kargs.tile_partitioner.GetCurrentIterLength(
-                iter_start, iter_end, total_iter_length);
+            uint32_t current_iter_length = __builtin_amdgcn_readfirstlane(
+                kargs.tile_partitioner.GetCurrentIterLength(iter_start, iter_end));

             // Determine the 1D tile_idx and the iter_offset for this WG.
             // The tile_idx is the 1D macro tile index in the C tensor.
             // The iter_offset is the starting macro tile index in the K dimension for the WG in the
             // current iteration of the while loop.
             uint32_t tile_idx, iter_offset;
-            kargs.tile_partitioner.GetTileIdxWithOffset(iter_end - 1, tile_idx, iter_offset);
-            iter_offset = iter_offset - current_iter_length + 1;
+            kargs.tile_partitioner.GetTileIdxWithOffset(iter_start, tile_idx, iter_offset);

             // Get the 2D tile index in the C tensor for this WG using the 1D index (i.e. tile_idx)
             auto spatial_idx = kargs.tile_partitioner.GetOutputTileIndex(tile_idx);
@@ -328,7 +331,7 @@ struct StreamKKernel
                     k_size);

             // Prepare for next Stream-K loop iteration.
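+            // Forward-traversal example (hypothetical numbers): a WG assigned
+            // iters [5, 12) with k_iters_per_tile = 4 first processes iters
+            // 5..7 (finishing tile 1), advances iter_start to 8, processes
+            // iters 8..11 (all of tile 2), and then exits since iter_start
+            // reaches iter_end.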
-            iter_end -= current_iter_length;
+            iter_start += current_iter_length;
             if(iter_end <= iter_start)
                 break;
             block_sync_lds();
diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc b/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc
index 2fd73b882b..788687d4ca 100644
--- a/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc
+++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc
@@ -25,6 +25,17 @@ TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_SKBlocks4)
     this->Run(M, N, K, num_sk_blocks);
 }

+TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_SKBlocks12)
+{
+
+    ck_tile::index_t M = 256;
+    ck_tile::index_t N = 256;
+    ck_tile::index_t K = 256;
+    uint32_t num_sk_blocks = 12;
+
+    this->Run(M, N, K, num_sk_blocks);
+}
+
 TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_SKBlocks8)
 {
@@ -80,13 +91,13 @@ TYPED_TEST(TEST_SUITE_NAME, StreamK_M3840_N4096_K4096_DP)
     this->Run(M, N, K, num_sk_blocks);
 }

-TYPED_TEST(TEST_SUITE_NAME, StreamK_M3840_N4096_K4096_SKBlocks960)
+TYPED_TEST(TEST_SUITE_NAME, StreamK_M3840_N4096_K4096_SKBlocks64)
 {

     ck_tile::index_t M = 3840;
     ck_tile::index_t N = 4096;
     ck_tile::index_t K = 4096;
-    uint32_t num_sk_blocks = 960;
+    uint32_t num_sk_blocks = 64;

     this->Run(M, N, K, num_sk_blocks);
 }
diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp
index 17eb1a31e1..b8a55b024d 100644
--- a/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp
+++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_util.hpp
@@ -189,11 +189,6 @@ class TestCkTileStreamK : public ::testing::Test
             throw std::runtime_error("Reduction Strategy is currently unsupported!\n");
         }

-        if(num_sk_blocks != 0)
-        {
-            GTEST_SKIP() << "CK Tile Stream K currently only supports DP.";
-        }
-
         auto f_host_tensor_descriptor = [](std::size_t row,
                                            std::size_t col,
                                            std::size_t stride,
                                            auto layout) {

From f932ebbabcab7eb375576c7802ee0314a4c613f5 Mon Sep 17 00:00:00 2001
From: Emily Martins
Date: Fri, 12 Sep 2025 22:50:59 +0000
Subject: [PATCH 9/9] Disable StreamK_M256_N256_K256_SKBlocks12 test case

This instance involves >=3 WGs contributing to each macro tile in C. Due to
the use of atomics, this results in precision errors. These errors will not
persist once the reduction strategy is implemented. We will re-enable this
test then.
---
 test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc b/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc
index 788687d4ca..1db7ef0fb0 100644
--- a/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc
+++ b/test/ck_tile/gemm_streamk/test_gemm_streamk_cases.inc
@@ -25,8 +25,11 @@ TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_SKBlocks4)
     this->Run(M, N, K, num_sk_blocks);
 }

+// TODO: Re-enable this test once reduction is implemented
 TYPED_TEST(TEST_SUITE_NAME, StreamK_M256_N256_K256_SKBlocks12)
 {
+    GTEST_SKIP() << "Skipping this test: There are precision issues with atomics due to >=3 WGs "
+                    "contributing to each macro tile in C";

     ck_tile::index_t M = 256;