Skip to content

Commit f93b1e6

Browse files
Register custom sharding for the mxfp8 dim1-cast CUDA kernel
stack-info: PR: #2551, branch: danielvegamyhre/stack/7
1 parent 4b0038f commit f93b1e6

File tree

1 file changed

+14
-0
lines changed

1 file changed

+14
-0
lines changed

torchao/prototype/mx_formats/kernels.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1812,6 +1812,20 @@ def _(
18121812

18131813
return output_rowwise, output_colwise, scales_rowwise, scales_colwise
18141814

1815+
@register_sharding(torch.ops.torchao.mxfp8_quantize_cuda.default)
def custom_mxfp8_quantize_cuda_dim1_sharding(
    x: torch.Tensor,
    rowwise: bool = False,
    colwise: bool = True,
    scaling_mode: str = "floor",
):
    """Enumerate the sharding strategies accepted by the mxfp8 quantize CUDA op.

    Each strategy is a ``(output_placements, input_placements)`` pair as
    expected by ``register_sharding``; ``None`` marks a non-tensor input.

    Args:
        x: Input tensor to be quantized (shape/dtype constraints enforced by
            the kernel itself, not here).
        rowwise: Whether a row-wise cast is requested; unused for strategy
            enumeration but must mirror the op signature.
        colwise: Whether a column-wise cast is requested; likewise unused here.
        scaling_mode: Scale-computation mode forwarded to the kernel;
            presumably "floor" rounding of the shared exponent — confirm
            against the kernel implementation.

    Returns:
        A list of acceptable sharding strategies: fully replicated, input
        sharded on dim 0, and input sharded on dim 1.
    """
    fully_replicated = ([Replicate(), Replicate()], [Replicate(), None])
    # The kernel emits its data transposed relative to the input, so an
    # input sharded along one dim yields outputs sharded along the other —
    # hence the flipped Shard dims between inputs and outputs below.
    input_sharded_dim0 = ([Shard(1), Shard(1)], [Shard(0), None])
    input_sharded_dim1 = ([Shard(0), Shard(0)], [Shard(1), None])
    return [fully_replicated, input_sharded_dim0, input_sharded_dim1]
18151829
else:
18161830

18171831
def mxfp8_quantize_cuda(

0 commit comments

Comments
 (0)