
Commit 9902c28

jananisriram authored and facebook-github-bot committed
Add input loader module for fp8_gemm (#336)
Summary: Add input loader module for fp8_gemm kernels. This diff enables us to invoke custom shapes in fp8_gemm benchmarking by passing a `json` file into the TritonBench `--input-loader` arg.

Reviewed By: NikhilAPatel, xuzhao9

Differential Revision: D80565971
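The JSON schema for the new input loader isn't spelled out in this diff, but the shape-iteration code below expects each entry to be an `[M, N, K]` triple. A plausible shape file might look like the following sketch (the file name `shapes.json` is illustrative, not taken from the commit):

```json
[
    [1024, 1024, 1024],
    [4096, 4096, 8192],
    [16384, 8192, 1024]
]
```

Such a file would then be passed as, e.g., `--input-loader shapes.json` on the TritonBench command line; as the diff below shows, any malformed entry is skipped with a warning rather than aborting the run.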
1 parent f7dc7d1 · commit 9902c28

File tree

1 file changed: +9 -2 lines changed

tritonbench/operators/fp8_gemm/fp8_gemm.py

Lines changed: 9 additions & 2 deletions
@@ -65,7 +65,14 @@ def args(m, n, k):
             )
             return (a, b)
 
-        if self.extra_args.llama:
+        if hasattr(self, 'external_shapes') and self.external_shapes:  # Check for external shapes loaded from input-loader
+            for shape in self.external_shapes:
+                if len(shape) == 3:
+                    m, n, k = shape
+                    yield args(m, n, k)
+                else:
+                    logger.warning(f"Skipping invalid shape: {shape}, expected [M, N, K]")
+        elif self.extra_args.llama:
             for m, n, k, _bias in llama_shapes():
                 yield args(m, n, k)
         elif self.extra_args.m:
@@ -115,7 +122,7 @@ def pt2_fp8_gemm(self, a, b) -> Callable:
             out_dtype = torch.bfloat16
         else:
             scale_a = torch.tensor(1.0, device=a.device)
-            scale_b = torch.tensor(1.0, device=a.device)
+            scale_b = torch.tensor(1.0, device=b.device)
             out_dtype = torch.float16
         f = lambda a, b: torch._scaled_mm(
             a, b, scale_a, scale_b, use_fast_accum=True, out_dtype=out_dtype
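
Two notes on the diff. The second hunk fixes a copy-paste bug: `scale_b` is now allocated on `b`'s device instead of `a`'s, which matters whenever the two operands live on different devices. And the loader module that populates `self.external_shapes` is not part of this file; a minimal sketch of what it might do, assuming the JSON file is a flat list of `[M, N, K]` triples (the helper name `load_external_shapes` is hypothetical):

```python
import json


def load_external_shapes(path: str) -> list:
    """Hypothetical helper: parse an --input-loader JSON file into a list of
    [M, N, K] shape triples. The real loader module is not shown in this diff."""
    with open(path) as f:
        shapes = json.load(f)
    if not isinstance(shapes, list):
        raise ValueError(f"expected a JSON list of shapes, got {type(shapes).__name__}")
    # Per-shape validation (len == 3) is deferred to the benchmark's input
    # iterator, which logs and skips malformed entries instead of failing.
    return shapes


# The operator would then stash the result where the new branch looks for it:
# self.external_shapes = load_external_shapes(args.input_loader)
```

The `hasattr` guard in the first hunk keeps the change backward compatible: operators that never set `external_shapes` fall through to the existing `--llama` / `--m` paths unchanged.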
