[moe training] add bench script for fp8 rowwise kernels and update autotune configs

danielvegamyhre · danielvegamyhre · commit c621ce57b9ff · 2025-08-05T17:55:53.000-07:00
stack-info: PR: #2697, branch: danielvegamyhre/stack/31
diff --git a/test/prototype/moe_training/test_kernels.py b/test/prototype/moe_training/test_kernels.py
@@ -109,8 +109,6 @@ def test_fp8_rowwise_3d_transpose_rhs(round_scales_to_power_of_2: bool):
         target_dtype=torch.float8_e4m3fn,
         round_scales_to_power_of_2=round_scales_to_power_of_2,
     )
-    # Pytorch impl keeps the empty scaled dimension, so we need to squeeze it out.
-    ref_scales = ref_scales.squeeze(1)
 
     triton_fp8, triton_scales = triton_fp8_rowwise_3d_transpose_rhs(
         x,
diff --git a/torchao/prototype/moe_training/benchmarks/benchmark_per_group_scaling_kernels.py b/torchao/prototype/moe_training/benchmarks/benchmark_per_group_scaling_kernels.py
diff --git a/torchao/prototype/moe_training/benchmarks/benchmark_rowwise_3d_quant_kernels.py b/torchao/prototype/moe_training/benchmarks/benchmark_rowwise_3d_quant_kernels.py
@@ -0,0 +1,151 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+# this benchmarking script is a modified version of the original script from: https://github.com/drisspg/transformer_nuggets/blob/main/transformer_nuggets/utils/benchmark.py
+
+import itertools
+from dataclasses import dataclass
+from typing import List
+
+import torch
+from tabulate import tabulate
+from tqdm import tqdm
+from triton.testing import do_bench
+
+from torchao.prototype.moe_training.kernels.float8_rowwise import (
+    triton_fp8_rowwise_3d_transpose_rhs,
+)
+from torchao.prototype.moe_training.utils import (
+    torch_to_3d_rowwise_float8_transpose_rhs,
+)
+
+# Device on which the benchmark input tensor is allocated.
+device = torch.device("cuda")
+
+# Needed since changing args to function causes recompiles
+torch._dynamo.config.cache_size_limit = 1000
+
+
+@dataclass(frozen=True)
+class ExperimentConfig:
+    """Immutable description of one benchmark case."""
+
+    # dtype of the randomly generated high-precision input tensor.
+    high_precision_dtype: torch.dtype
+    # Three dimensions passed to torch.randn; the last two are transposed
+    # before the tensor is handed to the kernels under test.
+    input_shape: tuple[int, int, int]
+
+
+# Timings collected for a single benchmark case.
+@dataclass(frozen=True)
+class ExperimentResult:
+    # Measured time of the torch.compile reference implementation, in microseconds.
+    torch_time_us: float
+    # Measured time of the Triton kernel, in microseconds.
+    triton_time_us: float
+
+
+# Pairs a benchmark configuration with the timings it produced.
+@dataclass(frozen=True)
+class Experiment:
+    # The case that was run.
+    config: ExperimentConfig
+    # The timings measured for that case.
+    result: ExperimentResult
+
+
+def get_configs() -> List[ExperimentConfig]:
+    """Return one ExperimentConfig per (input shape, dtype) combination."""
+    # Llama4 and DeepSeekV3 shapes
+    shapes = [(8, 4096, 1024), (16, 5120 * 4, 5120)]
+    dtypes = [torch.bfloat16]
+    return [
+        ExperimentConfig(high_precision_dtype=dtype, input_shape=shape)
+        for shape, dtype in itertools.product(shapes, dtypes)
+    ]
+
+
+def run_experiment(config: ExperimentConfig) -> ExperimentResult:
+    """Benchmark the torch.compile reference against the Triton kernel.
+
+    Args:
+        config: Shape and dtype of the random input tensor to generate.
+
+    Returns:
+        ExperimentResult with the measured time of each implementation,
+        in microseconds.
+    """
+    # Expert weights will be passed in transposed and column major in practice
+    input_tensor = torch.randn(
+        *config.input_shape,
+        dtype=config.high_precision_dtype,
+        device=device,
+    ).transpose(-2, -1)
+
+    def warmup(func, *args, **kwargs):
+        # Call the function a few times before it is timed.
+        for _ in range(10):
+            func(*args, **kwargs)
+
+    def run_torch(input_tensor: torch.Tensor):
+        out = torch_to_3d_rowwise_float8_transpose_rhs(
+            input_tensor,
+            target_dtype=torch.float8_e4m3fn,
+            round_scales_to_power_of_2=True,
+        )
+        torch.cuda.synchronize()
+        return out
+
+    def run_triton(input_tensor: torch.Tensor):
+        _ = triton_fp8_rowwise_3d_transpose_rhs(
+            input_tensor,
+            output_dtype=torch.float8_e4m3fn,
+            round_scales_to_power_of_2=True,
+        )
+        torch.cuda.synchronize()
+
+    # bench torch
+    compiled_run_torch = torch.compile(run_torch)
+    # Warm up the compiled callable, since that is the one being timed;
+    # warming up the eager function would leave compilation to the benchmark.
+    warmup(compiled_run_torch, input_tensor)
+    torch_time_us = benchmark_cuda_function_in_microseconds(
+        compiled_run_torch,
+        input_tensor,
+    )
+
+    # bench triton
+    warmup(run_triton, input_tensor)
+    triton_time_us = benchmark_cuda_function_in_microseconds(
+        run_triton,
+        input_tensor,
+    )
+
+    return ExperimentResult(
+        torch_time_us=torch_time_us,
+        triton_time_us=triton_time_us,
+    )
+
+
+def print_results(experiments: List[Experiment]):
+    """Print a table with one row per experiment: input shape and both timings.
+
+    Args:
+        experiments: Completed experiments to tabulate.
+    """
+    headers = [
+        "input_shape",
+        "torch_time_us",
+        "triton_time_us",
+    ]
+    rows = []
+    for experiment in experiments:
+        # Render all dimensions at one nesting level, e.g. "(8, 4096, 1024)".
+        # Putting two values inside a single pair of f-string braces formats
+        # them as a tuple and yields "(8, (4096, 1024))".
+        dims = ", ".join(str(dim) for dim in experiment.config.input_shape)
+        input_shape = f"({dims})"
+        rows.append(
+            [
+                input_shape,
+                experiment.result.torch_time_us,
+                experiment.result.triton_time_us,
+            ]
+        )
+    print(tabulate(rows, headers=headers))
+
+
+def benchmark_cuda_function_in_microseconds(f, *args):
+    """Time ``f(*args)`` with ``do_bench`` and return the median, scaled by 1e3."""
+
+    def call():
+        return f(*args)
+
+    median = do_bench(call, return_mode="median")
+    # Scale the do_bench result by 1e3 to express it in microseconds.
+    return median * 1e3
+
+
+def main():
+    """Seed the RNG, run every configured experiment, and print the results."""
+    torch.random.manual_seed(123)
+    experiments = [
+        Experiment(config=cfg, result=run_experiment(cfg))
+        for cfg in tqdm(get_configs())
+    ]
+
+    # Use Tabulate to print results
+    print_results(experiments)
+
+
+# Run the benchmark only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
diff --git a/torchao/prototype/moe_training/kernels/float8_rowwise.py b/torchao/prototype/moe_training/kernels/float8_rowwise.py
@@ -26,16 +26,18 @@
     torch.float64: tl.float64,
 }
 
-block_sizes = [16]
-num_warps = [4]
-num_stages = [2]
+block_sizes_n = [32, 128, 512]  # large dim (output_features)
+block_sizes_k = [32, 128, 512]  # small dim (input_features)
+num_warps = [8]
+num_stages = [2, 3]
 kernel_configs_2D = [
     triton.Config(
-        {"BLOCK_SIZE_N": block_size, "BLOCK_SIZE_K": block_size * 2},
+        {"BLOCK_SIZE_N": block_size_n, "BLOCK_SIZE_K": block_size_k},
         num_warps=warps,
         num_stages=stages,
     )
-    for block_size in block_sizes
+    for block_size_n in block_sizes_n
+    for block_size_k in block_sizes_k
     for warps in num_warps
     for stages in num_stages
 ]
@@ -62,8 +64,9 @@ def triton_fp8_rowwise_3d_transpose_rhs(
 
     # allocate on-device buffers for output and scales
     # output shape = input.transpose(-2, -1).shape = (E, N, K) in column major layout
-    output_buffer = torch.empty((e, k, n), dtype=output_dtype, device=hp_tensor.device)
-    output_buffer = output_buffer.transpose(-2, -1)
+    output_buffer = torch.empty_like(
+        hp_tensor, dtype=output_dtype, device=hp_tensor.device
+    )
     scales_buffer = torch.full(
         (e, k), float("inf"), dtype=torch.float32, device=hp_tensor.device
     )
diff --git a/torchao/prototype/moe_training/utils.py b/torchao/prototype/moe_training/utils.py
@@ -146,7 +146,7 @@ def torch_to_float8_per_group_rowwise(
 
 
 def torch_to_3d_rowwise_float8_transpose_rhs(
-    input_hp: torch.Tensor,  # (E, K, N)
+    input_hp_t: torch.Tensor,  # (E, K, N)
     target_dtype: torch.dtype = torch.float8_e4m3fn,
     round_scales_to_power_of_2: bool = False,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
@@ -162,17 +162,18 @@ def torch_to_3d_rowwise_float8_transpose_rhs(
         Output shape: (E, N, K)
         Scales shape: (E, 1, K
     """
-    input_hp_t = input_hp.transpose(-2, -1)  # (E, N, K)
+    assert _is_column_major(input_hp_t), "input tensor must be column-major"
+    input_hp = input_hp_t.transpose(-2, -1)  # (E, N, K)
     scales = tensor_to_scale(
-        input_hp_t,
+        input_hp,
         target_dtype,
         scaling_granularity=ScalingGranularity.AXISWISE,
         axiswise_dim=-2,
         round_scales_to_power_of_2=round_scales_to_power_of_2,
     )  # (E, 1, K)
 
     # Apply scales to tensor and convert to float8.
-    tensor_scaled = input_hp_t.to(torch.float32) * scales
+    tensor_scaled = input_hp.to(torch.float32) * scales
     float8_tensor = to_fp8_saturated(tensor_scaled, target_dtype)
 
     # To column major

Original file line number	Diff line number	Diff line change
`@@ -109,8 +109,6 @@ def test_fp8_rowwise_3d_transpose_rhs(round_scales_to_power_of_2: bool):`
`109`	`109`	`target_dtype=torch.float8_e4m3fn,`
`110`	`110`	`round_scales_to_power_of_2=round_scales_to_power_of_2,`
`111`	`111`	`)`
`112`		`- # Pytorch impl keeps the empty scaled dimension, so we need to squeeze it out.`
`113`		`- ref_scales = ref_scales.squeeze(1)`
`114`	`112`
`115`	`113`	`triton_fp8, triton_scales = triton_fp8_rowwise_3d_transpose_rhs(`
`116`	`114`	`x,`