
Commit bb930b6

integration of new mxfp8 casting cuda kernel
stack-info: PR: #2564, branch: danielvegamyhre/stack/13
1 parent 95d13d5 commit bb930b6

File tree: 5 files changed (+164, -27 lines)


test/prototype/mx_formats/test_mx_dtensor.py

Lines changed: 12 additions & 1 deletion
@@ -25,6 +25,7 @@
 from tqdm import tqdm
 
 from torchao.prototype.mx_formats import MXLinearConfig
+from torchao.prototype.mx_formats.config import MXFP8CastKernelChoice
 from torchao.prototype.mx_formats.mx_tensor import MXTensor
 from torchao.testing.training.dtensor_utils import (
     _test_lowp_mlp_tensor_parallelism_base,
@@ -82,7 +83,7 @@ def _test_mxfp8_mlp_tensor_parallelism(mesh: DeviceMesh, size=128):
 def _test_mxfp8_mlp_tensor_parallelism_dim1_triton(mesh: DeviceMesh, size=128):
     config = MXLinearConfig.from_recipe_name("mxfp8_emulated")
     config.block_size = 32
-    config.use_fp8_dim1_cast_triton_kernel = True
+    config.mxfp8_cast_kernel_choice = MXFP8CastKernelChoice.TRITON
     _test_lowp_mlp_tensor_parallelism_base(
         mesh, config, size, compile=False, allgather_in_lowp=False
     )
@@ -93,12 +94,22 @@ def _test_mxfp8_mlp_tensor_parallelism_dim1_triton(mesh: DeviceMesh, size=128):
 # )
 
 
+def _test_mxfp8_mlp_tensor_parallelism_dim1_cuda(mesh: DeviceMesh, size=128):
+    config = MXLinearConfig.from_recipe_name("mxfp8_emulated")
+    config.block_size = 32
+    config.mxfp8_cast_kernel_choice = MXFP8CastKernelChoice.CUDA
+    _test_lowp_mlp_tensor_parallelism_base(
+        mesh, config, size, compile=False, allgather_in_lowp=False
+    )
+
+
 if __name__ == "__main__":
     device_mesh = setup_distributed()
     tests = [
         _test_dtensor_cast_to_mxfp8,
         _test_mxfp8_mlp_tensor_parallelism,
         _test_mxfp8_mlp_tensor_parallelism_dim1_triton,
+        _test_mxfp8_mlp_tensor_parallelism_dim1_cuda,
     ]
 
     for test in tqdm(tests, desc="Running tests"):

test/prototype/mx_formats/test_mx_linear.py

Lines changed: 24 additions & 11 deletions
@@ -12,6 +12,7 @@
 import torch.nn.functional as F
 
 from torchao.prototype.mx_formats.config import (
+    MXFP8CastKernelChoice,
     MXGemmKernelChoice,
     MXInferenceLinearConfig,
     MXLinearConfig,
@@ -81,16 +82,21 @@ def run_around_tests():
 @pytest.mark.parametrize("elem_dtype", elem_dtypes)
 @pytest.mark.parametrize("bias", [True, False])
 @pytest.mark.parametrize("input_shape", [(128, 256), (1, 128, 256), (1, 1, 128, 256)])
-@pytest.mark.parametrize("use_fp8_dim1_cast_triton_kernel", [False, True])
-def test_linear_eager_vs_hp(
-    elem_dtype, bias, input_shape, use_fp8_dim1_cast_triton_kernel
-):
+@pytest.mark.parametrize(
+    "mxfp8_cast_kernel_choice",
+    [
+        MXFP8CastKernelChoice.TORCH,
+        MXFP8CastKernelChoice.TRITON,
+        MXFP8CastKernelChoice.CUDA,
+    ],
+)
+def test_linear_eager_vs_hp(elem_dtype, bias, input_shape, mxfp8_cast_kernel_choice):
     """
     Smoke test for training linear module with mx weight, compares the following:
     * baseline: float32
     * experiment: emulated MX
     """
-    if use_fp8_dim1_cast_triton_kernel:
+    if mxfp8_cast_kernel_choice != MXFP8CastKernelChoice.TORCH:
         if elem_dtype != (
             torch.float8_e4m3fn,
             torch.float8_e4m3fn,
@@ -109,11 +115,11 @@ def test_linear_eager_vs_hp(
     )
     m_mx = copy.deepcopy(m)
     config = MXLinearConfig(
-        block_size=4,
+        block_size=32,  # Only 32 is supported for now
         elem_dtype=elem_dtype[0],
         elem_dtype_weight_override=elem_dtype[1],
         elem_dtype_grad_output_override=elem_dtype[2],
-        use_fp8_dim1_cast_triton_kernel=use_fp8_dim1_cast_triton_kernel,
+        mxfp8_cast_kernel_choice=mxfp8_cast_kernel_choice,
     )
     quantize_(m_mx, config)
 
@@ -227,8 +233,15 @@ def test_activation_checkpointing():
 @pytest.mark.parametrize("bias", [False, True])
 # TODO(future PR): figure out why torch.compile does not match eager when
 # autocast is on
-@pytest.mark.parametrize("use_fp8_dim1_cast_triton_kernel", [False, True])
-def test_linear_compile(hp_dtype, recipe_name, bias, use_fp8_dim1_cast_triton_kernel):
+@pytest.mark.parametrize(
+    "mxfp8_cast_kernel_choice",
+    [
+        MXFP8CastKernelChoice.TORCH,
+        MXFP8CastKernelChoice.TRITON,
+        MXFP8CastKernelChoice.CUDA,
+    ],
+)
+def test_linear_compile(hp_dtype, recipe_name, bias, mxfp8_cast_kernel_choice):
     """
     Verify that compile does not change numerics of MX linear fw + bw
     """
@@ -246,7 +259,7 @@ def test_linear_compile(hp_dtype, recipe_name, bias, use_fp8_dim1_cast_triton_ke
         # TODO(future PR): fix this, things are clearly broken with bias=True
         pytest.skip("this test is broken for non-emulated recipes with bias=True")
 
-    if use_fp8_dim1_cast_triton_kernel:
+    if mxfp8_cast_kernel_choice != MXFP8CastKernelChoice.TORCH:
         if recipe_name not in ("mxfp8_emulated", "mxfp8_cublas"):
             pytest.skip("unsupported configuration")
         if not is_sm_at_least_89():
@@ -267,7 +280,7 @@ def test_linear_compile(hp_dtype, recipe_name, bias, use_fp8_dim1_cast_triton_ke
         nn.Linear(K, N, bias=bias, device="cuda", dtype=hp_dtype),
     )
     config = MXLinearConfig.from_recipe_name(recipe_name)
-    config.use_fp8_dim1_cast_triton_kernel = use_fp8_dim1_cast_triton_kernel
+    config.mxfp8_cast_kernel_choice = mxfp8_cast_kernel_choice
 
     quantize_(m_mx, config=config)
     m_mx_c = copy.deepcopy(m_mx)
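
For context on what the new parametrization exercises: test_linear_compile checks that compiling an MX-quantized linear module does not change its forward/backward numerics, now for each of the three cast kernel choices. Below is a condensed sketch of that property, not code from this commit; the module shape, dtype, and recipe are illustrative, and it assumes a CUDA GPU that supports the chosen cast kernel.

    import copy

    import torch
    import torch.nn as nn

    from torchao.prototype.mx_formats.config import MXFP8CastKernelChoice, MXLinearConfig
    from torchao.quantization import quantize_

    # Pick a recipe and a cast kernel; CUDA is the new option added by this commit.
    config = MXLinearConfig.from_recipe_name("mxfp8_emulated")
    config.mxfp8_cast_kernel_choice = MXFP8CastKernelChoice.CUDA  # or TRITON / TORCH

    # Quantize a toy module, then compile a copy and compare numerics.
    m = nn.Sequential(nn.Linear(256, 256, bias=False, device="cuda", dtype=torch.bfloat16))
    quantize_(m, config=config)
    m_c = torch.compile(copy.deepcopy(m))

    x = torch.randn(128, 256, device="cuda", dtype=torch.bfloat16)
    torch.testing.assert_close(m(x), m_c(x))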

torchao/prototype/mx_formats/config.py

Lines changed: 9 additions & 4 deletions
@@ -33,6 +33,12 @@ class MXGemmKernelChoice(Enum):
     CUBLAS = "cublas"
 
 
+class MXFP8CastKernelChoice(Enum):
+    TRITON = "triton"
+    CUDA = "cuda"
+    TORCH = "torch"
+
+
 # Pre-made recipes for common configurations
 class MXLinearRecipeName(Enum):
     MXFP8_EMULATED = "mxfp8_emulated"
@@ -85,10 +91,10 @@ class MXLinearConfig(AOBaseConfig):
     # on the given hardware an exception will be thrown
     gemm_kernel_choice: MXGemmKernelChoice = MXGemmKernelChoice.EMULATED
 
-    # If True, uses a custom triton kernel for cast to mxfp8 across dim1
+    # define which kernel to use for mxfp8 casting
     # TODO(1945): remove this config option once torch.compile gives us
     # a fast kernel
-    use_fp8_dim1_cast_triton_kernel: bool = False
+    mxfp8_cast_kernel_choice: MXFP8CastKernelChoice = MXFP8CastKernelChoice.TRITON
 
     # If True, uses a custom triton kernel for fp4 dequantize
     use_fp4_custom_triton_dequant_kernel: bool = False
@@ -146,8 +152,7 @@ def short_str(self) -> str:
         if self.elem_dtype_grad_output_override is not None:
             s += f", lp_go_override={DTYPE_TO_SHORT_STR[self.elem_dtype_grad_output_override]}"
         s += f", kernel={self.gemm_kernel_choice.value}"
-        if self.use_fp8_dim1_cast_triton_kernel:
-            s += ", use_fp8_dim1_cast_triton_kernel=True"
+        s += f", mxfp8_cast_kernel_choice={self.mxfp8_cast_kernel_choice.value}"
         if self.use_fp4_custom_triton_dequant_kernel:
             s += ", use_fp4_custom_triton_dequant_kernel=True"
         return s
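
For downstream users of MXLinearConfig, the boolean use_fp8_dim1_cast_triton_kernel flag is replaced by the MXFP8CastKernelChoice enum, the stated default moves from False to MXFP8CastKernelChoice.TRITON, and short_str now always reports the selected choice. A minimal migration sketch, assuming only the public names shown in this diff:

    from torchao.prototype.mx_formats.config import MXFP8CastKernelChoice, MXLinearConfig

    config = MXLinearConfig.from_recipe_name("mxfp8_cublas")

    # Before this commit:
    #   config.use_fp8_dim1_cast_triton_kernel = True
    # After this commit (TORCH keeps the plain torch / torch.compile cast path):
    config.mxfp8_cast_kernel_choice = MXFP8CastKernelChoice.TRITON  # or TORCH / CUDA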

torchao/prototype/mx_formats/kernels.py

Lines changed: 41 additions & 3 deletions
@@ -1404,6 +1404,7 @@ def triton_scale_swizzle(
     scale_cols,
     output_ptr,
     input_row_stride,
+    input_col_stride,
     output_block_stride,
     BLOCK_ROWS: tl.constexpr,
     BLOCK_COLS: tl.constexpr,
@@ -1423,7 +1424,7 @@
     mask = (global_rows < scale_rows) & (global_cols < scale_cols)
 
     input_scales = tl.load(
-        scale_ptr + global_rows * input_row_stride + global_cols,
+        scale_ptr + global_rows * input_row_stride + global_cols * input_col_stride,
         mask=mask,
         other=0.0,
     )
@@ -1463,7 +1464,6 @@ def triton_mx_block_rearrange(scale_tensor: torch.Tensor) -> torch.Tensor:
     assert scale_tensor.element_size() == 1, (
         "Expected element size to be 1 byte (8 bits)"
     )
-    assert scale_tensor.is_contiguous(), "Input tensor must be contiguous"
 
     rows, cols = scale_tensor.shape
 
@@ -1476,7 +1476,8 @@ def triton_mx_block_rearrange(scale_tensor: torch.Tensor) -> torch.Tensor:
     out = scale_tensor.new_empty((padded_rows, padded_cols))
 
     # Input stride (for row-major format)
-    input_row_stride = cols
+    input_row_stride = scale_tensor.stride()[0]
+    input_col_stride = scale_tensor.stride()[1]
 
     # We probably want handle multiple blocks per tile but for now keep it simple
     BLOCK_ROWS, BLOCK_COLS = 128, 4
@@ -1495,6 +1496,7 @@ def triton_mx_block_rearrange(scale_tensor: torch.Tensor) -> torch.Tensor:
         cols,
         out.view(torch.uint8),
         input_row_stride,
+        input_col_stride,
         output_block_stride,
         BLOCK_ROWS=BLOCK_ROWS,
         BLOCK_COLS=BLOCK_COLS,
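
The swizzle-kernel changes above let triton_mx_block_rearrange accept non-contiguous scale tensors: instead of assuming a row-major layout (offset = row * cols + col), the kernel now receives both strides and indexes with row * input_row_stride + col * input_col_stride, which is why the contiguity assert can be dropped. A standalone illustration of that addressing (plain torch, not code from this commit):

    import torch

    scales = torch.arange(32 * 4, dtype=torch.uint8).reshape(32, 4)  # contiguous, strides (4, 1)
    scales_t = scales.t()  # shape (4, 32), strides (1, 4), not contiguous

    r, c = 2, 3
    offset = r * scales_t.stride(0) + c * scales_t.stride(1)
    assert scales_t[r, c] == scales.flatten()[offset]  # strided indexing into the same storage
    assert not scales_t.is_contiguous()  # would have tripped the removed assert
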
@@ -1812,6 +1814,42 @@ def _(
 
         return output_rowwise, output_colwise, scales_rowwise, scales_colwise
 
+    @register_sharding(torch.ops.torchao.mxfp8_quantize_cuda.default)
+    def custom_mxfp8_quantize_cuda_dim1_sharding(
+        x: torch.Tensor,
+        rowwise: bool = False,
+        colwise: bool = True,
+        scaling_mode: str = "floor",
+    ):
+        # This function signature can be used to understand the shardings:
+        # _, colwise_data, _, colwise_scales = mxfp8_quantize_cuda(x, rowwise=False, colwise=True)
+
+        # When inputs and scale are replicated, we return a quantized output tensor (replicated).
+        inputs_replicated = [None, Replicate(), None, Replicate()]
+        outputs_replicated = [None, Replicate(), None, None]
+        rule_for_input_replicated = (
+            inputs_replicated,
+            outputs_replicated,
+        )
+
+        # When inputs and scale are sharded along dim 0,
+        # we return a quantized output tensor (sharded along dim1 due to transpose).
+        inputs_sharded_dim0 = [None, Shard(0), None, Shard(0)]
+        outputs_sharded_dim1 = [None, Shard(1), None, None]
+        rule_for_input_sharded_dim0 = (inputs_sharded_dim0, outputs_sharded_dim1)
+
+        # When inputs and scale are sharded along dim 1,
+        # we return a quantized output tensor (sharded along dim0 due to transpose).
+        inputs_sharded_dim1 = [None, Shard(1), None, Shard(1)]
+        outputs_sharded_dim0 = [None, Shard(0), None, None]
+        rule_for_input_sharded_dim1 = (inputs_sharded_dim1, outputs_sharded_dim0)
+
+        acceptable_shardings = [
+            rule_for_input_replicated,
+            rule_for_input_sharded_dim0,
+            rule_for_input_sharded_dim1,
+        ]
+        return acceptable_shardings
 else:
 
     def mxfp8_quantize_cuda(
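
The register_sharding rules above encode how the colwise mxfp8 quantization interacts with DTensor sharding: the colwise output is laid out like the transpose of the input, so a dim-0 shard of x corresponds to a dim-1 shard of the colwise output (and vice versa), while replicated inputs produce replicated outputs. A standalone sketch of that shape reasoning with plain tensors, no DTensor and no quantization (the transpose here only stands in for the colwise output layout):

    import torch

    x = torch.randn(8, 4)
    colwise_like = x.t().contiguous()  # stand-in for the layout of the colwise-quantized output

    # Shard x along dim 0 across 2 "ranks"; the matching output shards sit along dim 1.
    x_shards = torch.chunk(x, 2, dim=0)
    out_shards = torch.chunk(colwise_like, 2, dim=1)

    for x_shard, out_shard in zip(x_shards, out_shards):
        torch.testing.assert_close(out_shard, x_shard.t().contiguous())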
