Skip to content

Commit e6b8366

Browse files
NikhilAPatel and facebook-github-bot
authored and committed
Force unrolling of certain loops (#497)
Summary: Force the unrolling of 3 loops through the use of `range_constexpr` to increase performance. Differential Revision: D83513863
1 parent dee73ff commit e6b8366

File tree

1 file changed

+3
-3
lines changed
  • tritonbench/operators/grouped_gemm/cutedsl

1 file changed

+3
-3
lines changed

tritonbench/operators/grouped_gemm/cutedsl/kernels.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -573,7 +573,7 @@ def kernel(
573573

574574
# init barrier for loading A, B with TMA
575575
if warp_idx == self.epilog_warp_id[0]:
576-
for k_stage in range(self.num_ab_stage):
576+
for k_stage in range_constexpr(self.num_ab_stage): # noqa: F821
577577
num_tma_producer = self.num_mcast_ctas_a + self.num_mcast_ctas_b - 1
578578
with cute.arch.elect_one():
579579
cute.arch.mbarrier_init(ab_full_mbar_ptr + k_stage, 1)
@@ -582,7 +582,7 @@ def kernel(
582582
)
583583
# Accumulator barrier init
584584
if warp_idx == self.mma_warp_id:
585-
for acc_stage in range(self.num_acc_stage):
585+
for acc_stage in range_constexpr(self.num_acc_stage): # noqa: F821
586586
with cute.arch.elect_one():
587587
cute.arch.mbarrier_init(acc_full_mbar_ptr + acc_stage, 1)
588588
cute.arch.mbarrier_init(
@@ -1287,7 +1287,7 @@ def kernel(
12871287
#
12881288
subtile_cnt = cute.size(tTR_tAcc.shape, mode=[3])
12891289
num_prev_subtiles = tile_sched.num_tiles_executed * subtile_cnt
1290-
for subtile_idx in range(subtile_cnt):
1290+
for subtile_idx in range_constexpr(subtile_cnt): # noqa: F821
12911291
#
12921292
# Load accumulator from tensor memory buffer to register
12931293
#

0 commit comments

Comments
 (0)