Add instance traits for two more grouped forward convolutions (#3112)

shumway · web-flow · commit cafaeb6b7bac · 2025-10-29T16:04:13.000+01:00
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
diff --git a/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp b/experimental/builder/include/ck_tile/builder/reflect/instance_traits_util.hpp
@@ -15,6 +15,7 @@
 #include <ck/utility/data_type.hpp>
 #include <ck/utility/sequence.hpp>
 #include <ck/utility/blkgemmpipe_scheduler.hpp>
+#include <ck/utility/loop_scheduler.hpp>
 #include <ck/tensor_operation/gpu/device/tensor_layout.hpp>
 #include <ck_tile/ops/common/tensor_layout.hpp>
 #include <ck/tensor_operation/gpu/element/element_wise_operation.hpp>
@@ -160,6 +161,17 @@ constexpr std::string_view pipeline_version_name(ck::BlockGemmPipelineVersion ve
     }
 }
 
+// Convert LoopScheduler enum to string ("Unknown" if the value matches no enumerator)
+constexpr std::string_view loop_scheduler_name(ck::LoopScheduler sched)
+{
+    switch(sched)
+    {
+    case ck::LoopScheduler::Default: return "Default";
+    case ck::LoopScheduler::Interwave: return "Interwave";
+    }
+    return "Unknown"; // no default: label above, so this is the only path for other values
+}
+
 // Convert std::array to string
 template <typename T, std::size_t N>
 inline std::string array_to_string(const std::array<T, N>& arr)
diff --git a/experimental/builder/test/CMakeLists.txt b/experimental/builder/test/CMakeLists.txt
@@ -26,7 +26,9 @@ add_ck_builder_test(test_inline_diff test_inline_diff.cpp)
 
 # Testing the virtual GetInstanceString methods requires kernel compilation.
 add_ck_builder_test(test_get_instance_string
-    test_get_instance_string.cpp)
+    test_get_instance_string_fwd_grp_conv_v3.cpp
+    test_get_instance_string_fwd_grp_conv.cpp
+    test_get_instance_string_fwd_grp_conv_large_tensor.cpp)
 
 # Testing the fwd convolution builder requires kernel compilation.
 # To enable parallel compilation, the individual tests are split into separate files.
diff --git a/experimental/builder/test/test_fwd_instance_traits.cpp b/experimental/builder/test/test_fwd_instance_traits.cpp
diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv.cpp
@@ -0,0 +1,104 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <gtest/gtest.h>
+#include <ck_tile/builder/reflect/instance_traits.hpp>
+#include <ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp>
+#include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_instance.hpp>
+
+// GetInstanceString must work through the abstract base type for the non-V3 variant
+TEST(GetInstanceString, ReturnsStringForFwdGrpConvInstance)
+{
+    // Local shorthands for the long CK names repeated below
+    namespace inst = ck::tensor_operation::device::instance;
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    // Instance list produced by the library's template helper
+    using InstanceList = inst::device_grouped_conv_fwd_xdl_f16_instances<
+        2,                     // NDimSpatial
+        inst::GNHWC,           // ALayout
+        inst::GKYXC,           // BLayout
+        inst::Empty_Tuple,     // DsLayout
+        inst::GNHWK,           // ELayout
+        inst::ConvFwdDefault>; // ConvForwardSpecialization
+
+    // Element 0 of the list is the kernel under test
+    using FirstInstance = std::tuple_element_t<0, InstanceList>;
+
+    // Base interface the kernel is queried through
+    using AbstractOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
+        2,                 // NDimSpatial
+        inst::GNHWC,       // ALayout
+        inst::GKYXC,       // BLayout
+        inst::Empty_Tuple, // DsLayout
+        inst::GNHWK,       // ELayout
+        ck::half_t,        // ADataType
+        ck::half_t,        // BDataType
+        ck::Tuple<>,       // DsDataType
+        ck::half_t,        // EDataType
+        PassThrough,       // AElementwiseOperation
+        PassThrough,       // BElementwiseOperation
+        PassThrough,       // CDEElementwiseOperation
+        ck::half_t,        // AComputeType
+        ck::half_t>;       // BComputeType
+
+    // Default-construct the concrete kernel
+    FirstInstance kernel;
+
+    // View it only through the base type, then ask for its instance string
+    const AbstractOp& as_base = kernel;
+    const std::string result  = as_base.GetInstanceString();
+
+    // Expected string for element 0 of device_grouped_conv_fwd_xdl_f16_instances
+    const std::string golden = "DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle"
+                               "<2"             // NDimSpatial
+                               ",GNHWC"         // ALayout
+                               ",GKYXC"         // BLayout
+                               ",EmptyTuple"    // DsLayout
+                               ",GNHWK"         // ELayout
+                               ",fp16"          // ADataType
+                               ",fp16"          // BDataType
+                               ",fp32"          // AccDataType
+                               ",fp16"          // CShuffleDataType
+                               ",EmptyTuple"    // DsDataType
+                               ",fp16"          // EDataType
+                               ",PassThrough"   // AElementwiseOperation
+                               ",PassThrough"   // BElementwiseOperation
+                               ",PassThrough"   // CDEElementwiseOperation
+                               ",Default"       // ConvForwardSpecialization
+                               ",MNKPadding"    // GemmSpec
+                               ",1"             // NumGemmKPrefetchStage
+                               ",64"            // BlockSize
+                               ",64"            // MPerBlock
+                               ",64"            // NPerBlock
+                               ",32"            // KPerBlock
+                               ",8"             // AK1
+                               ",8"             // BK1
+                               ",32"            // MPerXDL
+                               ",32"            // NPerXDL
+                               ",2"             // MXdlPerWave
+                               ",2"             // NXdlPerWave
+                               ",Seq(4,16,1)"   // ABlockTransferThreadClusterLengths
+                               ",Seq(1,0,2)"    // ABlockTransferThreadClusterArrangeOrder
+                               ",Seq(1,0,2)"    // ABlockTransferSrcAccessOrder
+                               ",2"             // ABlockTransferSrcVectorDim
+                               ",1"             // ABlockTransferSrcScalarPerVector
+                               ",8"             // ABlockTransferDstScalarPerVector_AK1
+                               ",1"             // ABlockLdsExtraM
+                               ",Seq(4,16,1)"   // BBlockTransferThreadClusterLengths
+                               ",Seq(1,0,2)"    // BBlockTransferThreadClusterArrangeOrder
+                               ",Seq(1,0,2)"    // BBlockTransferSrcAccessOrder
+                               ",2"             // BBlockTransferSrcVectorDim
+                               ",1"             // BBlockTransferSrcScalarPerVector
+                               ",8"             // BBlockTransferDstScalarPerVector_BK1
+                               ",1"             // BBlockLdsExtraN
+                               ",1"             // CShuffleMXdlPerWavePerShuffle
+                               ",1"             // CShuffleNXdlPerWavePerShuffle
+                               ",Seq(1,16,1,4)" // CDEBlockTransferClusterLengths
+                               ",1"             // CDEBlockTransferScalarPerVector_NPerBlock
+                               ",fp16"          // AComputeDataType
+                               ",fp16"          // BComputeDataType
+                               ",Default"       // LoopScheduler
+                               ",1>";           // NumGroupsToMerge
+    EXPECT_EQ(result, golden);
+}
diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_large_tensor.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_large_tensor.cpp
@@ -0,0 +1,103 @@
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2025, Advanced Micro Devices, Inc. All rights reserved.
+
+#include <gtest/gtest.h>
+#include <ck_tile/builder/reflect/instance_traits.hpp>
+#include <ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp>
+#include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_large_tensor_instance.hpp>
+
+// GetInstanceString must work through the abstract base type for the large tensor variant
+TEST(GetInstanceString, ReturnsStringForFwdGrpConvLargeTensorInstance)
+{
+    // Local shorthands for the long CK names repeated below
+    namespace inst = ck::tensor_operation::device::instance;
+    using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+    // Instance list produced by the library's template helper
+    using InstanceList = inst::device_grouped_conv_fwd_xdl_large_tensor_f16_instances<
+        2,                     // NDimSpatial
+        inst::GNHWC,           // ALayout
+        inst::GKYXC,           // BLayout
+        inst::Empty_Tuple,     // DsLayout
+        inst::GNHWK,           // ELayout
+        inst::ConvFwdDefault>; // ConvForwardSpecialization
+
+    // Element 0 of the list is the kernel under test
+    using FirstInstance = std::tuple_element_t<0, InstanceList>;
+
+    // Base interface the kernel is queried through
+    using AbstractOp = ck::tensor_operation::device::DeviceGroupedConvFwdMultipleABD<
+        2,                 // NDimSpatial
+        inst::GNHWC,       // ALayout
+        inst::GKYXC,       // BLayout
+        inst::Empty_Tuple, // DsLayout
+        inst::GNHWK,       // ELayout
+        ck::half_t,        // ADataType
+        ck::half_t,        // BDataType
+        ck::Tuple<>,       // DsDataType
+        ck::half_t,        // EDataType
+        PassThrough,       // AElementwiseOperation
+        PassThrough,       // BElementwiseOperation
+        PassThrough,       // CDEElementwiseOperation
+        ck::half_t,        // AComputeType
+        ck::half_t>;       // BComputeType
+
+    // Default-construct the concrete kernel
+    FirstInstance kernel;
+
+    // View it only through the base type, then ask for its instance string
+    const AbstractOp& as_base = kernel;
+    const std::string result  = as_base.GetInstanceString();
+
+    // Expected string for element 0 of device_grouped_conv_fwd_xdl_large_tensor_f16_instances
+    const std::string golden = "DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor"
+                               "<2"             // NDimSpatial
+                               ",GNHWC"         // ALayout
+                               ",GKYXC"         // BLayout
+                               ",EmptyTuple"    // DsLayout
+                               ",GNHWK"         // ELayout
+                               ",fp16"          // ADataType
+                               ",fp16"          // BDataType
+                               ",fp32"          // AccDataType
+                               ",fp16"          // CShuffleDataType
+                               ",EmptyTuple"    // DsDataType
+                               ",fp16"          // EDataType
+                               ",PassThrough"   // AElementwiseOperation
+                               ",PassThrough"   // BElementwiseOperation
+                               ",PassThrough"   // CDEElementwiseOperation
+                               ",Default"       // ConvForwardSpecialization
+                               ",MNKPadding"    // GemmSpec
+                               ",1"             // NumGemmKPrefetchStage
+                               ",64"            // BlockSize
+                               ",64"            // MPerBlock
+                               ",64"            // NPerBlock
+                               ",32"            // KPerBlock
+                               ",8"             // AK1
+                               ",8"             // BK1
+                               ",32"            // MPerXDL
+                               ",32"            // NPerXDL
+                               ",2"             // MXdlPerWave
+                               ",2"             // NXdlPerWave
+                               ",Seq(4,16,1)"   // ABlockTransferThreadClusterLengths
+                               ",Seq(1,0,2)"    // ABlockTransferThreadClusterArrangeOrder
+                               ",Seq(1,0,2)"    // ABlockTransferSrcAccessOrder
+                               ",2"             // ABlockTransferSrcVectorDim
+                               ",1"             // ABlockTransferSrcScalarPerVector
+                               ",8"             // ABlockTransferDstScalarPerVector_AK1
+                               ",1"             // ABlockLdsExtraM
+                               ",Seq(4,16,1)"   // BBlockTransferThreadClusterLengths
+                               ",Seq(1,0,2)"    // BBlockTransferThreadClusterArrangeOrder
+                               ",Seq(1,0,2)"    // BBlockTransferSrcAccessOrder
+                               ",2"             // BBlockTransferSrcVectorDim
+                               ",1"             // BBlockTransferSrcScalarPerVector
+                               ",8"             // BBlockTransferDstScalarPerVector_BK1
+                               ",1"             // BBlockLdsExtraN
+                               ",1"             // CShuffleMXdlPerWavePerShuffle
+                               ",1"             // CShuffleNXdlPerWavePerShuffle
+                               ",Seq(1,16,1,4)" // CDEBlockTransferClusterLengths
+                               ",1"             // CDEBlockTransferScalarPerVector_NPerBlock
+                               ",fp16"          // AComputeDataType
+                               ",fp16"          // BComputeDataType
+                               ",Default>";     // LoopScheduler
+    EXPECT_EQ(result, golden);
+}
diff --git a/experimental/builder/test/test_get_instance_string_fwd_grp_conv_v3.cpp b/experimental/builder/test/test_get_instance_string_fwd_grp_conv_v3.cpp
@@ -6,8 +6,8 @@
 #include <ck/tensor_operation/gpu/device/device_grouped_conv_fwd_multiple_abd.hpp>
 #include <ck/library/tensor_operation_instance/gpu/grouped_conv_fwd/device_grouped_conv_fwd_xdl_comp_instance.hpp>
 
-// Test GetInstanceString through base class pointer
-TEST(GetInstanceStringTest, GetInstanceStringThroughBaseClass)
+// Test GetInstanceString through base class pointer for V3 variant
+TEST(GetInstanceString, ReturnsStringForFwdGrpConvV3Instance)
 {
     // Use the template helper to get a working instance configuration
     using InstanceTuple =
diff --git a/experimental/builder/test/test_instance_traits_util.cpp b/experimental/builder/test/test_instance_traits_util.cpp
@@ -199,6 +199,14 @@ TEST(InstanceTraitsUtil, PipelineVersionNameReturnsCorrectStrings)
                 ElementsAre("v1", "v2", "v3", "v4", "v5"));
 }
 
+TEST(InstanceTraitsUtil, LoopSchedulerNameReturnsCorrectStrings)
+{
+    using enum ck::LoopScheduler; // lets Default / Interwave be named unqualified below
+    const std::vector<std::string_view> names = {loop_scheduler_name(Default),
+                                                 loop_scheduler_name(Interwave)};
+    EXPECT_THAT(names, ElementsAre("Default", "Interwave")); // takes a value, not a declaration
+}
+
 TEST(InstanceTraitsUtil, TupleNameReturnsEmptyTupleForEmptyTuple)
 {
     EXPECT_EQ(tuple_name<ck::Tuple<>>(), "EmptyTuple");
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
@@ -28,6 +28,9 @@
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
 #include "ck/host_utility/io.hpp"
+#ifdef CK_EXPERIMENTAL_BUILDER
+#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp"
+#endif
 
 namespace ck {
 namespace tensor_operation {
@@ -2063,6 +2066,19 @@ struct DeviceGroupedConvFwdMultipleABD_Xdl_CShuffle
         return str.str();
     }
 
+#ifdef CK_EXPERIMENTAL_BUILDER
+    std::string GetInstanceString() const override
+    {
+        // Refuse to compile when reflection has no traits for this exact instantiation.
+        static_assert(ck_tile::reflect::HasInstanceTraits<DeviceOp>,
+                      "Specialization of instance_traits not found. Please check that a "
+                      "specialization exists in file ck_tile/builder/reflect/"
+                      "instance_traits_device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp "
+                      "for the given template parameters.");
+        return ck_tile::reflect::instance_string<DeviceOp>();
+    }
+#endif
+
     size_t GetWorkSpaceSize(const BaseArgument* p_arg) const override
     {
         auto arg = dynamic_cast<const Argument*>(p_arg);
diff --git a/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp b/include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp
@@ -24,6 +24,9 @@
 #include "ck/host_utility/device_prop.hpp"
 #include "ck/host_utility/kernel_launch.hpp"
 #include "ck/host_utility/io.hpp"
+#ifdef CK_EXPERIMENTAL_BUILDER
+#include "ck_tile/builder/reflect/instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp"
+#endif
 
 namespace ck {
 namespace tensor_operation {
@@ -1220,6 +1223,20 @@ struct DeviceGroupedConvFwdMultipleD_Xdl_CShuffle_Large_Tensor
 
         return str.str();
     }
+
+#ifdef CK_EXPERIMENTAL_BUILDER
+    std::string GetInstanceString() const override
+    {
+        // Refuse to compile when reflection has no traits for this exact instantiation.
+        static_assert(
+            ck_tile::reflect::HasInstanceTraits<DeviceOp>,
+            "Specialization of instance_traits not found. Please check that a "
+            "specialization exists in file ck_tile/builder/reflect/"
+            "instance_traits_device_grouped_conv_fwd_multiple_d_xdl_large_tensor_cshuffle.hpp "
+            "for the given template parameters.");
+        return ck_tile::reflect::instance_string<DeviceOp>();
+    }
+#endif
 };
 
 } // namespace device

Original file line number	Diff line number	Diff line change
`@@ -199,6 +199,14 @@ TEST(InstanceTraitsUtil, PipelineVersionNameReturnsCorrectStrings)`
`199`	`199`	`ElementsAre("v1", "v2", "v3", "v4", "v5"));`
`200`	`200`	`}`
`201`	`201`
	`202`	`+TEST(InstanceTraitsUtil, LoopSchedulerNameReturnsCorrectStrings)`
	`203`	`+{`
	`204`	`+ using enum ck::LoopScheduler; // lets Default / Interwave be named unqualified below`
	`205`	`+ const std::vector<std::string_view> names = {loop_scheduler_name(Default),`
	`206`	`+ loop_scheduler_name(Interwave)};`
	`207`	`+ EXPECT_THAT(names, ElementsAre("Default", "Interwave")); // takes a value, not a declaration`
	`208`	`+}`
	`209`	`+`
`202`	`210`	`TEST(InstanceTraitsUtil, TupleNameReturnsEmptyTupleForEmptyTuple)`
`203`	`211`	`{`
`204`	`212`	`EXPECT_EQ(tuple_name<ck::Tuple<>>(), "EmptyTuple");`