Force unrolling of certain loops (#497)

NikhilAPatel · facebook-github-bot · commit e6b83664749b · 2025-09-30T00:01:24.000-07:00
Summary:

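Force the unrolling of three loops in the grouped GEMM CuTe DSL kernel (the A/B TMA load barrier initialization, the accumulator barrier initialization, and the accumulator subtile loop) by replacing `range` with `range_constexpr`, to increase performance.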

Differential Revision: D83513863
diff --git a/tritonbench/operators/grouped_gemm/cutedsl/kernels.py b/tritonbench/operators/grouped_gemm/cutedsl/kernels.py
@@ -573,7 +573,7 @@ def kernel(
 
         #  init barrier for loading A, B with TMA
         if warp_idx == self.epilog_warp_id[0]:
-            for k_stage in range(self.num_ab_stage):
+            for k_stage in range_constexpr(self.num_ab_stage):  # noqa: F821
                 num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1
                 with cute.arch.elect_one():
                     cute.arch.mbarrier_init(ab_full_mbar_ptr + k_stage, 1)
@@ -582,7 +582,7 @@ def kernel(
                     )
         # Accumulator barrier init
         if warp_idx == self.mma_warp_id:
-            for acc_stage in range(self.num_acc_stage):
+            for acc_stage in range_constexpr(self.num_acc_stage):  # noqa: F821
                 with cute.arch.elect_one():
                     cute.arch.mbarrier_init(acc_full_mbar_ptr + acc_stage, 1)
                     cute.arch.mbarrier_init(
@@ -1287,7 +1287,7 @@ def kernel(
                 #
                 subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3])
                 num_prev_subtiles = tile_sched.num_tiles_executed * subtile_cnt
-                for subtile_idx in range(subtile_cnt):
+                for subtile_idx in range_constexpr(subtile_cnt):  # noqa: F821
                     #
                     # Load accumulator from tensor memory buffer to register
                     #

Original file line number	Diff line number	Diff line change
`@@ -573,7 +573,7 @@ def kernel(`
`573`	`573`
`574`	`574`	`# init barrier for loading A, B with TMA`
`575`	`575`	`if warp_idx == self.epilog_warp_id[0]:`
`576`		`- for k_stage in range(self.num_ab_stage):`
	`576`	`+ for k_stage in range_constexpr(self.num_ab_stage): # noqa: F821`
`577`	`577`	`num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1`
`578`	`578`	`with cute.arch.elect_one():`
`579`	`579`	`cute.arch.mbarrier_init(ab_full_mbar_ptr + k_stage, 1)`
`@@ -582,7 +582,7 @@ def kernel(`
`582`	`582`	`)`
`583`	`583`	`# Accumulator barrier init`
`584`	`584`	`if warp_idx == self.mma_warp_id:`
`585`		`- for acc_stage in range(self.num_acc_stage):`
	`585`	`+ for acc_stage in range_constexpr(self.num_acc_stage): # noqa: F821`
`586`	`586`	`with cute.arch.elect_one():`
`587`	`587`	`cute.arch.mbarrier_init(acc_full_mbar_ptr + acc_stage, 1)`
`588`	`588`	`cute.arch.mbarrier_init(`
`@@ -1287,7 +1287,7 @@ def kernel(`
`1287`	`1287`	`#`
`1288`	`1288`	`subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3])`
`1289`	`1289`	`num_prev_subtiles = tile_sched.num_tiles_executed * subtile_cnt`
`1290`		`- for subtile_idx in range(subtile_cnt):`
	`1290`	`+ for subtile_idx in range_constexpr(subtile_cnt): # noqa: F821`
`1291`	`1291`	`#`
`1292`	`1292`	`# Load accumulator from tensor memory buffer to register`
`1293`	`1293`	`#`