From d637a669d425dcff00ba45df90ca896f8251d9ac Mon Sep 17 00:00:00 2001
From: Janani Sriram
Date: Thu, 28 Aug 2025 10:08:40 -0700
Subject: [PATCH] Validate exhaustive autotuning for FP8 Inductor templates
 (#355)

Summary:

X-link: https://github.com/pytorch/pytorch/pull/161442

Validate exhaustive autotuning for FP8 Inductor templates: the scaled MM
templates require `block_k >= 32`. Before this change, exhaustive autotuning
defaulted to a limited set of autotuning configs, because the constraints of
exhaustive autotuning on FP8 shapes had not been tested.

Reviewed By: coconutruben

Differential Revision: D80958642
---
 tritonbench/operators/fp8_gemm/fp8_gemm.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tritonbench/operators/fp8_gemm/fp8_gemm.py b/tritonbench/operators/fp8_gemm/fp8_gemm.py
index e35e34bb8..7acf87c1d 100644
--- a/tritonbench/operators/fp8_gemm/fp8_gemm.py
+++ b/tritonbench/operators/fp8_gemm/fp8_gemm.py
@@ -17,6 +17,10 @@
 from .tutorial import matmul as tutorial_matmul
 
+torch._dynamo.config.recompile_limit = (
+    10000  # Set a high recompile limit to allow for exhaustive autotuning
+)
+
 logger = logging.getLogger(__name__)
 
 try:
     from .persistent import (
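
Usage sketch (not part of the patch): a minimal example of how exhaustive GEMM
autotuning might be exercised for an FP8 scaled matmul once the recompile
limit is raised as above. The `max_autotune_gemm_search_space` knob, the
tensor shapes, and the direct `torch._scaled_mm` call are illustrative
assumptions, not code taken from `fp8_gemm.py`.

    import torch
    import torch._dynamo
    import torch._inductor.config as inductor_config

    # Mirror the patch: allow many recompiles so autotuning can cover many shapes.
    torch._dynamo.config.recompile_limit = 10000
    # Assumed Inductor knob requesting the full GEMM autotuning search space.
    inductor_config.max_autotune_gemm_search_space = "EXHAUSTIVE"

    def fp8_scaled_mm(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
        # Per-tensor scales; torch._scaled_mm expects the second operand column-major.
        scale_a = torch.tensor(1.0, device=a.device)
        scale_b = torch.tensor(1.0, device=b.device)
        return torch._scaled_mm(
            a, b.t(), scale_a=scale_a, scale_b=scale_b, out_dtype=torch.bfloat16
        )

    compiled_mm = torch.compile(fp8_scaled_mm, mode="max-autotune")

    if torch.cuda.is_available():
        # Hypothetical problem sizes; K is large enough for block_k >= 32 tiles.
        m, n, k = 1024, 1024, 1024
        a = torch.randn(m, k, device="cuda").to(torch.float8_e4m3fn)
        b = torch.randn(n, k, device="cuda").to(torch.float8_e4m3fn)
        out = compiled_mm(a, b)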