pytorch · danielvegamyhre · Jul 15, 2025 · Jul 16, 2025
@@ -56,11 +56,20 @@ def __init__(self, job_config: JobConfig, parallel_dims: ParallelDims):
         self.filter_fqns = mx_job_config.filter_fqns
 
         # Configure MXFP8
-        from torchao.prototype.mx_formats.config import MXLinearConfig
+        from torchao.prototype.mx_formats.config import (
+            MXFP8CastKernelChoice,
+            MXLinearConfig,
+        )
 
         config = MXLinearConfig.from_recipe_name(NAME_MAP[mx_job_config.recipe_name])
-        config.use_fp8_dim1_cast_triton_kernel = (
-            mx_job_config.use_fp8_dim1_cast_triton_kernel
+
+        # Normalize the configured name once so the enum lookup and the
+        # "NONE" sentinel check both see the same upper-cased value.
+        kernel_choice = mx_job_config.mxfp8_dim1_cast_kernel_choice.upper()
+        config.mxfp8_dim1_cast_kernel_choice = (
+            MXFP8CastKernelChoice[kernel_choice]
+            if kernel_choice != "NONE"
+            else None
         )
         self.config = config
 

@@ -523,7 +523,7 @@ class Float8:
 
 @dataclass
 class MX:
-    use_fp8_dim1_cast_triton_kernel: bool = True
+    mxfp8_dim1_cast_kernel_choice: Literal["triton", "cuda", "none"] = "triton"
     """Temp work around for inductor performance gap"""
 
     recipe_name: Literal["mxfp8"] = "mxfp8"