@@ -63,13 +63,32 @@ def _get_dtype(self):
         return torch.float16
 
     def get_input_iter(self):
-        def _get_scale_per_tensor(x: torch.Tensor, custom_scale: float = None) -> torch.Tensor:
+        def _get_scale_per_tensor(
+            x: torch.Tensor, custom_scale: float = None
+        ) -> torch.Tensor:
             # For tensor-wise scaling, kernel requires a float32 scale tensor
             if custom_scale:
                 return torch.tensor(custom_scale, dtype=torch.float32, device=x.device)
             scale = torch.finfo(torch.float8_e4m3fn).max / x.abs().max()
             return scale.to(torch.float32)
 
+        def _get_scale_per_row(
+            x: torch.Tensor, transpose: bool = False
+        ) -> torch.Tensor:
+            if transpose:  # scale_b.shape should be [1, N]
+                scale = (
+                    torch.finfo(torch.float8_e4m3fn).max
+                    / x.abs().max(dim=0, keepdim=True).values
+                )
+            else:  # scale_a.shape should be [M, 1]
+                scale = (
+                    torch.finfo(torch.float8_e4m3fn).max
+                    / x.abs().max(dim=1, keepdim=True).values
+                )
+            return scale.to(
+                torch.float32
+            )  # For row-wise scaling, kernel requires a float32 scale tensor
+
         def args(m, n, k):
             a = torch.randn(m, k, device=self.device).to(self._get_dtype())
             b = (
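For reference, a minimal standalone sketch of the scaling math these helpers implement (the sizes and tensors here are made up for illustration, and it assumes a PyTorch build with float8 dtypes): per-tensor scaling yields a 0-d float32 scale, while per-row scaling yields an [M, 1] scale for the left operand and a [1, N] scale for the right one.

```python
import torch

# Standalone sketch (not the benchmark code): shows the scale shapes the
# helpers above produce. Assumes torch.finfo supports float8_e4m3fn.
M, N, K = 4, 8, 16
a = torch.randn(M, K, dtype=torch.float16)
b = torch.randn(K, N, dtype=torch.float16)

fp8_max = torch.finfo(torch.float8_e4m3fn).max  # 448.0 for e4m3fn

# Per-tensor: a single 0-d float32 scale for the whole matrix.
scale_a_tensor = (fp8_max / a.abs().max()).to(torch.float32)

# Per-row: reduce over the K dimension, keeping the other one.
scale_a_row = (fp8_max / a.abs().max(dim=1, keepdim=True).values).to(torch.float32)
scale_b_row = (fp8_max / b.abs().max(dim=0, keepdim=True).values).to(torch.float32)

assert scale_a_tensor.ndim == 0
assert scale_a_row.shape == (M, 1)  # matches the scale_a comment in the diff
assert scale_b_row.shape == (1, N)  # matches the scale_b comment in the diff
```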
@@ -80,26 +99,33 @@ def args(m, n, k):
             )
 
             if self.extra_args.scaling_rowwise:
-                M, N = a.shape[0], b.shape[1]
-                scale_a = torch.ones((M, 1), dtype=torch.float32, device=a.device)
-                scale_b = torch.ones((1, N), dtype=torch.float32, device=b.device)
+                scale_a = _get_scale_per_row(a)
+                scale_b = _get_scale_per_row(b, transpose=True)
             else:
-                scale_a = _get_scale_per_tensor(a, custom_scale=self.extra_args.per_tensor_scale_a)
-                scale_b = _get_scale_per_tensor(b, custom_scale=self.extra_args.per_tensor_scale_b)
+                scale_a = _get_scale_per_tensor(
+                    a, custom_scale=self.extra_args.per_tensor_scale_a
+                )
+                scale_b = _get_scale_per_tensor(
+                    b, custom_scale=self.extra_args.per_tensor_scale_b
+                )
 
             # Kernels expect dtype=float8_e4m3fn
             a = a.to(torch.float8_e4m3fn)
             b = b.to(torch.float8_e4m3fn)
 
             return (a, b, scale_a, scale_b)
 
-        if hasattr(self, 'external_shapes') and self.external_shapes:  # Check for external shapes loaded from input-loader
+        if (
+            hasattr(self, "external_shapes") and self.external_shapes
+        ):  # Check for external shapes loaded from input-loader
             for shape in self.external_shapes:
                 if len(shape) == 3:
                     m, n, k = shape
                     yield args(m, n, k)
                 else:
-                    logger.warning(f"Skipping invalid shape: {shape}, expected [M, N, K]")
+                    logger.warning(
+                        f"Skipping invalid shape: {shape}, expected [M, N, K]"
+                    )
         elif self.extra_args.llama:
             for m, n, k, _bias in llama_shapes():
                 yield args(m, n, k)
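For context, here is a hedged sketch of how a downstream kernel might consume the yielded `(a, b, scale_a, scale_b)` tuple. It uses `torch._scaled_mm`, a private API whose signature has changed across PyTorch releases; the benchmark's actual backends may call different kernels, so treat this as an assumption-laden illustration rather than the benchmark's code path.

```python
import torch

# Hypothetical consumer of a yielded (a, b, scale_a, scale_b) tuple.
# torch._scaled_mm is private and version-dependent; this follows the
# newer schema where the scales are required tensor arguments.
if torch.cuda.is_available():
    M, N, K = 16, 32, 64
    a = torch.randn(M, K, device="cuda").to(torch.float8_e4m3fn)
    # _scaled_mm expects the second operand column-major; building b as
    # (N, K) and transposing yields a column-major (K, N) view.
    b = torch.randn(N, K, device="cuda").to(torch.float8_e4m3fn).t()
    scale_a = torch.tensor(1.0, dtype=torch.float32, device="cuda")
    scale_b = torch.tensor(1.0, dtype=torch.float32, device="cuda")
    out = torch._scaled_mm(a, b, scale_a, scale_b, out_dtype=torch.float16)
    print(out.shape)  # torch.Size([16, 32])
```

Row-wise scales of shape [M, 1] and [1, N], as produced by `_get_scale_per_row`, are only accepted by sufficiently recent PyTorch builds on hardware with fp8 support.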