Commit 9c5b464

[None][feat] Apply AutoTuner to fp8_block_scale_deep_gemm to trigger JIT ahead of time. (#7113)
Because deep_gemm.fp8_gemm_nt triggers many JIT compilations during the inference phase, these shapes need to be swept ahead of time. The AutoTuner framework is applied to achieve this, retaining the potential capability to tune the swap_ab flag. Signed-off-by: Yukun He <[email protected]>
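In practice, the new op is exercised the way the updated unit test below does it: one call inside an autotune() context lets the AutoTuner sweep the token-count buckets declared in fp8SwapABGemmRunner.tuning_config and JIT-compile the deep_gemm kernels before serving. A minimal sketch (not part of this commit; it reuses the unit-test helper per_block_cast_to_fp8_e8m0 and illustrative shapes, so it only runs from the tests/unittest directory on an SM100 GPU with tensorrt_llm installed):

import torch
from _torch.helpers import per_block_cast_to_fp8_e8m0  # unit-test helper
from tensorrt_llm._torch.autotuner import autotune

m, k, n = 4096, 7168, 4096  # illustrative shapes
a = torch.randn((m, k), device='cuda', dtype=torch.bfloat16)
b = torch.randn((n, k), device='cuda', dtype=torch.bfloat16)
b_fp8, b_sf = per_block_cast_to_fp8_e8m0(b)  # FP8 block-scaled weight

# One profiled call; the bucket sweep and the deep_gemm JIT happen inside the AutoTuner.
with autotune():
    out = torch.ops.trtllm.fp8_swap_ab_gemm(a, b_fp8, b_sf)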
1 parent c038fb3 commit 9c5b464

File tree

3 files changed: +106 -18 lines changed

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 90 additions & 0 deletions
@@ -4,6 +4,8 @@
 import torch
 
 import tensorrt_llm.quantization.utils.fp4_utils as fp4_utils
+import tensorrt_llm.quantization.utils.fp8_utils as fp8_utils
+from tensorrt_llm import deep_gemm
 from tensorrt_llm._utils import get_sm_version
 
 from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec,
@@ -890,6 +892,94 @@ def _(
     return input.new_empty((M, N), dtype=output_dtype)
 
 
+def fp8_swap_ab_gen_tuning_buckets(x: int):
+    buckets = tuple(range(8, 128, 8))
+    if x >= 128:
+        buckets += tuple(range(128, x, 128))
+    return buckets
+
+
+class fp8SwapABGemmRunner(TunableRunner):
+    tuning_config = TuningConfig(
+        dynamic_tensor_specs=(DynamicTensorSpec(
+            0, 0, fp8_swap_ab_gen_tuning_buckets), ),
+        tune_max_num_tokens=4096,
+    )
+
+    def __init__(self, output_dtype: torch.dtype, disable_ue8m0_cast: bool):
+        self.output_dtype = output_dtype
+        self.disable_ue8m0_cast = disable_ue8m0_cast
+
+    def get_valid_tactics(
+        self,
+        inputs: List[torch.Tensor],
+        profile: OptimizationProfile,
+    ) -> List[int]:
+        # Encode swap_ab as False (0) and True (1). Currently only one tactic is added here.
+        return [0]
+
+    def forward(
+        self,
+        inputs: List[torch.Tensor],
+        tactic: int = -1,
+    ) -> torch.Tensor:
+        input, weight, weight_scale = inputs
+        a, a_sf = fp8_utils.per_token_quant_and_transform(input)
+        output = torch.empty(
+            (input.size(0), weight.size(0)),
+            device=input.device,
+            dtype=self.output_dtype,
+        )
+        # TODO: add swap_ab=(tactic == 1) to determine the swap_ab value.
+        # Treat the default tactic=-1 as swap_ab=False.
+        deep_gemm.fp8_gemm_nt(
+            (a, a_sf),
+            (weight, weight_scale),
+            output,
+            disable_ue8m0_cast=self.disable_ue8m0_cast,
+        )
+        return output
+
+
+@torch.library.custom_op("trtllm::fp8_swap_ab_gemm", mutates_args=())
+def fp8_swap_ab_gemm(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    output_dtype: torch.dtype = torch.bfloat16,
+    disable_ue8m0_cast: bool = False,
+    tune_max_num_tokens: int = 4096,
+) -> torch.Tensor:
+    tuner = AutoTuner.get()
+    fp8_swap_ab_gemm_runner = fp8SwapABGemmRunner(
+        output_dtype,
+        disable_ue8m0_cast,
+    )
+    fp8SwapABGemmRunner.tuning_config.tune_max_num_tokens = tune_max_num_tokens
+    _, best_tactic = tuner.choose_one(
+        "trtllm::fp8_swap_ab_gemm",
+        [fp8_swap_ab_gemm_runner],
+        fp8SwapABGemmRunner.tuning_config,
+        [input, weight, weight_scale],
+    )
+    return fp8_swap_ab_gemm_runner(
+        inputs=[input, weight, weight_scale],
+        tactic=best_tactic,
+    )
+
+
+@fp8_swap_ab_gemm.register_fake
+def _(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    weight_scale: torch.Tensor,
+    output_dtype: torch.dtype = torch.bfloat16,
+    disable_ue8m0_cast: bool = False,
+    tune_max_num_tokens: int = 4096,
+) -> torch.Tensor:
+    return input.new_empty((input.size(0), weight.size(0)), dtype=output_dtype)
+
+
 def get_event(event_idx: int):
     from ..utils import get_model_extra_attrs
     extra_attrs = get_model_extra_attrs()
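For a sense of which shapes get swept, the bucket generator added above can be evaluated by hand; with the default tune_max_num_tokens this gives (not part of the diff, just the function's output):

fp8_swap_ab_gen_tuning_buckets(4096)
# -> (8, 16, 24, ..., 112, 120, 128, 256, 384, ..., 3840, 3968)
# i.e. every multiple of 8 below 128, then every multiple of 128 below 4096.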

tensorrt_llm/_torch/modules/linear.py

Lines changed: 6 additions & 10 deletions
@@ -12,7 +12,6 @@
 from torch.nn.parameter import Parameter
 
 import tensorrt_llm.quantization.utils.fp4_utils as fp4_utils
-import tensorrt_llm.quantization.utils.fp8_utils as fp8_utils
 from tensorrt_llm._torch.peft.lora.layer import LoraLayer
 from tensorrt_llm.functional import (AllReduceFusionOp, AllReduceParams,
                                      AllReduceStrategy)
@@ -591,15 +590,12 @@ def apply(self, module: Linear, input: torch.Tensor,
                     act_input_fp8, module.weight, act_input_sf,
                     module.weight_scale)
             else:
-                from tensorrt_llm import deep_gemm
-                a, a_sf = fp8_utils.per_token_quant_and_transform(input)
-                output = torch.empty((input.shape[0], module.weight.shape[0]),
-                                     device=input.device,
-                                     dtype=torch.bfloat16)
-                deep_gemm.fp8_gemm_nt((a, a_sf),
-                                      (module.weight, module.weight_scale),
-                                      output,
-                                      disable_ue8m0_cast=True)
+                output = torch.ops.trtllm.fp8_swap_ab_gemm(
+                    input,
+                    module.weight,
+                    module.weight_scale,
+                    disable_ue8m0_cast=True,
+                )
         else:
             act_input_fp8, act_input_sf = torch.ops.trtllm.fp8_quantize_1x128(
                 input)

tests/unittest/_torch/thop/test_fp8_block_scale_gemm.py

Lines changed: 10 additions & 8 deletions
@@ -19,10 +19,11 @@
 import pytest
 import torch
 from _torch.helpers import (calc_diff, per_block_cast_to_fp8,
-                            per_block_cast_to_fp8_e8m0,
-                            per_token_cast_to_fp8_e8m0)
+                            per_block_cast_to_fp8_e8m0)
 from utils.util import getSMVersion
 
+from tensorrt_llm._torch.autotuner import autotune
+
 
 @pytest.mark.skipif(
     getSMVersion() != 100,
@@ -46,16 +47,17 @@ def test_fp8_block_scale_deep_gemm(dtype, m, k, n):
     a = torch.randn((m, k), device='cuda', dtype=dtype)
     b = torch.randn((n, k), device='cuda', dtype=dtype)
 
-    act_a_fp8, act_a_sf = per_token_cast_to_fp8_e8m0(a)
     act_b_fp8, act_b_sf = per_block_cast_to_fp8_e8m0(b)
 
     output_expected = a @ b.t()
-    from tensorrt_llm import deep_gemm
-    output = torch.empty((act_a_fp8.shape[0], act_b_fp8.shape[0]),
-                         device=act_a_fp8.device,
-                         dtype=torch.bfloat16)
 
-    deep_gemm.fp8_gemm_nt((act_a_fp8, act_a_sf), (act_b_fp8, act_b_sf), output)
+    with autotune():
+        output = torch.ops.trtllm.fp8_swap_ab_gemm(
+            a,
+            act_b_fp8,
+            act_b_sf,
+        )
+
     diff = calc_diff(output, output_expected)
     assert diff < 1e-2
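Assuming the usual unittest layout (the _torch.helpers import above implies running from the tests/unittest directory), the updated case can be exercised on an SM100 machine with something like:

pytest _torch/thop/test_fp8_block_scale_gemm.py -k fp8_block_scale_deep_gemm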
