Stacked PRs:
* __->__#1427
---
make mxfp8 dim1 cast kernel configurable
## Summary
- We recently added a new CUDA kernel for the mxfp8 dim1 cast which is
~1.4x faster than the existing Triton kernel and torch.compile, and using
it yields an e2e training speedup of +1.5-2.5% TPS with Llama3 8b
using FSDP=4/8 (pytorch/ao#2513). The
integration work for composability with torch.compile + FSDP is complete
as well: pytorch/ao#2564
- This PR updates the mxfp8 user-facing API, replacing the boolean flag
`--mx.use_triton_for_dim1_cast=[true|false]` with
`--mx.mxfp8_dim1_cast_kernel_choice=[triton|cuda|torch]` (see the sketch after this list).
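For illustration only, here is a minimal sketch of how a string-valued kernel choice might be validated and dispatched in place of the old boolean flag. The enum, helper names, and backend stubs below are hypothetical and are not the actual torchao/torchtitan implementation.

```python
from enum import Enum


class MXFP8Dim1CastKernelChoice(Enum):
    """Hypothetical enum mirroring the [triton|cuda|torch] config values."""
    TRITON = "triton"
    CUDA = "cuda"
    TORCH = "torch"


# Placeholder backends; the real kernels live in torchao and are not shown here.
def _cast_dim1_triton(x):
    raise NotImplementedError("placeholder for the Triton kernel path")


def _cast_dim1_cuda(x):
    raise NotImplementedError("placeholder for the CUDA kernel path")


def _cast_dim1_torch(x):
    raise NotImplementedError("placeholder for the torch.compile path")


_DISPATCH = {
    MXFP8Dim1CastKernelChoice.TRITON: _cast_dim1_triton,
    MXFP8Dim1CastKernelChoice.CUDA: _cast_dim1_cuda,
    MXFP8Dim1CastKernelChoice.TORCH: _cast_dim1_torch,
}


def cast_dim1(x, choice: str = "triton"):
    """Select the dim1 cast implementation from a config string."""
    try:
        kernel = MXFP8Dim1CastKernelChoice(choice.lower())
    except ValueError:
        valid = ", ".join(c.value for c in MXFP8Dim1CastKernelChoice)
        raise ValueError(f"unknown kernel choice {choice!r}; expected one of: {valid}")
    return _DISPATCH[kernel](x)
```

One upside of an enum-style choice over the old boolean is that adding another backend later only requires a new enum member and dispatch entry rather than another flag.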
## Test plan
- Triton: `NGPU=8 CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh --training.steps=100 --model.converters="mx" --mx.recipe_name="mxfp8" --training.compile --mx.mxfp8_dim1_cast_kernel_choice="triton"`
- CUDA: `NGPU=8 CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh --training.steps=100 --model.converters="mx" --mx.recipe_name="mxfp8" --training.compile --mx.mxfp8_dim1_cast_kernel_choice="cuda"`
- Torch: `NGPU=8 CONFIG_FILE="./torchtitan/models/llama3/train_configs/llama3_8b.toml" ./run_train.sh --training.steps=100 --model.converters="mx" --mx.recipe_name="mxfp8" --training.compile --mx.mxfp8_dim1_cast_kernel_choice="torch"`
## Limitations
- TP is currently not supported yet, as both the Triton kernel and CUDA
kernel are affected by an issue: `RuntimeError: Attempting to use
FunctionalTensor on its own. Instead, please use it with a corresponding
FunctionalTensorMode()`. This is a known issue we were talking to Brian
about, will continue following up on it.