
Commit 7ce84dd (parent 3a5d3ba)

add wrapper with dtensor handling for mxfp8 dim1 cast kernel

2 files changed (+56 −4 lines)


torchao/prototype/mx_formats/kernels.py

Lines changed: 7 additions & 3 deletions
@@ -1811,11 +1811,15 @@ def custom_mxfp8_quantize_cuda_dim1_sharding(
         colwise: bool = True,
         scaling_mode: str = "floor",
     ):
-        replicate = ([Replicate(), Replicate()], [Replicate(), None])
+        # _, colwise_data, _, colwise_scales = mxfp8_quantize_cuda(x, rowwise, colwise, scaling_mode)
+        replicate = (
+            [None, Replicate(), None, Replicate()],
+            [None, Replicate(), None, None],
+        )
         # Note that the data is returned transposed, which is why
         # we flip the sharding dim below
-        shard_dim0 = ([Shard(1), Shard(1)], [Shard(0), None])
-        shard_dim1 = ([Shard(0), Shard(0)], [Shard(1), None])
+        shard_dim0 = ([None, Shard(1), None, Shard(1)], [None, Shard(0), None, None])
+        shard_dim1 = ([None, Shard(0), None, Shard(0)], [None, Shard(1), None, None])
         acceptable_shardings = [replicate, shard_dim0, shard_dim1]
         return acceptable_shardings
 else:
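
For context, the comment added in this hunk shows why the placement lists grew to four entries: mxfp8_quantize_cuda returns four tensors, of which the second and fourth are the colwise data and colwise scales, so each sharding strategy now carries one placement slot per return value and per argument, with None where a slot does not participate. Below is a minimal sketch of the colwise path, assuming a GPU build where the CUDA kernel is available; the shapes and dtype are illustrative, not from the commit.

# Illustrative only -- not part of the commit. Assumes a GPU build where the
# mxfp8 CUDA kernel is available.
import torch
from torchao.prototype.mx_formats.kernels import mxfp8_quantize_cuda

x = torch.randn(256, 512, device="cuda", dtype=torch.bfloat16)

# Request only the colwise (dim1) cast; the unnamed outputs are unused here,
# mirroring the comment added in the hunk above.
_, colwise_data, _, colwise_scales = mxfp8_quantize_cuda(
    x, rowwise=False, colwise=True, scaling_mode="floor"
)

# Per the in-code note, the colwise data comes back transposed, which is why
# the sharding strategies above flip Shard(0) and Shard(1).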

torchao/prototype/mx_formats/mx_linear.py

Lines changed: 49 additions & 1 deletion
@@ -19,7 +19,10 @@
     MXInferenceLinearConfig,
     MXLinearConfig,
 )
-from torchao.prototype.mx_formats.kernels import triton_to_mxfp8_dim1
+from torchao.prototype.mx_formats.kernels import (
+    mxfp8_quantize_cuda,
+    triton_to_mxfp8_dim1,
+)
 from torchao.prototype.mx_formats.mx_tensor import MXTensor
 from torchao.quantization.transform_module import (
     register_quantize_module_handler,
@@ -66,6 +69,51 @@ def _triton_to_mxfp8_dim1_wrapper(
     return mx_tensor


+def _cuda_to_mxfp8_dim1_wrapper(
+    a, block_size, elem_dtype, hp_dtype, gemm_kernel_choice
+):
+    _, a_data, _, a_scale = mxfp8_quantize_cuda(
+        a,
+        rowwise=False,
+        colwise=True,
+        scaling_mode="floor",
+    )
+    if isinstance(a_data, DTensor):
+        assert isinstance(a_scale, DTensor)
+        a_data_local = a_data.to_local()
+        a_scale_local = a_scale.to_local()
+        inner = MXTensor(
+            a_scale_local,
+            a_data_local.t(),
+            elem_dtype,
+            block_size,
+            hp_dtype,
+            False,
+            gemm_kernel_choice,
+            False,
+        )
+        mx_tensor = DTensor.from_local(
+            inner,
+            a_data.device_mesh,
+            a_data.placements,
+            run_check=False,
+            shape=a_data.t().size(),
+            stride=a_data.t().stride(),
+        )
+    else:
+        mx_tensor = MXTensor(
+            a_scale,
+            a_data.t(),
+            elem_dtype,
+            block_size,
+            hp_dtype,
+            False,
+            gemm_kernel_choice,
+            False,
+        )
+    return mx_tensor
+
+
 @torch._dynamo.allow_in_graph
 class mx_mm(torch.autograd.Function):
     # There are three gemms in a forward + backward of a Linear layer:
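
A hedged usage sketch of the new wrapper, not part of the commit: how _cuda_to_mxfp8_dim1_wrapper might be called on a plain weight tensor. The block size, element dtype, and MXGemmKernelChoice.CUBLAS value (and its import path) are illustrative assumptions; when the input is a DTensor, the same call instead returns a DTensor whose local tensor is the MXTensor, as the DTensor branch above shows.

# Illustrative usage sketch -- parameter values below are assumptions.
import torch
from torchao.prototype.mx_formats.config import MXGemmKernelChoice  # assumed import path
from torchao.prototype.mx_formats.mx_linear import _cuda_to_mxfp8_dim1_wrapper

w = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)

# Cast along dim1 (colwise) via the CUDA kernel; the result is an MXTensor,
# or a DTensor wrapping an MXTensor when `w` itself is a DTensor.
w_mx_dim1 = _cuda_to_mxfp8_dim1_wrapper(
    w,
    block_size=32,                    # assumed MX block size
    elem_dtype=torch.float8_e4m3fn,   # assumed low-precision element dtype
    hp_dtype=w.dtype,                 # original high-precision dtype
    gemm_kernel_choice=MXGemmKernelChoice.CUBLAS,  # assumed kernel choice
)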
