mx: make CUDA kernel the default for dim1 cast in mxfp8_cublas recipe

vkuzo · vkuzo · commit 775d2a21acae · 2025-08-04T10:01:54.000-07:00
Summary: As titled. The CUDA kernel for the dim1 cast is the fastest option right now, so it should be the default for the mxfp8_cublas recipe.

Test Plan:
```bash
pytest test/prototype/mx_formats/ -s -x
```

Reviewers:
Subscribers:
Tasks:
Tags:
ghstack-source-id: 52eb5cd
ghstack-comment-id: 3144899603
Pull-Request: #2661
diff --git a/torchao/prototype/mx_formats/config.py b/torchao/prototype/mx_formats/config.py
@@ -184,8 +184,10 @@ def from_recipe_name(
         if recipe_name is MXLinearRecipeName.MXFP8_EMULATED:
             return MXLinearConfig()
         elif recipe_name is MXLinearRecipeName.MXFP8_CUBLAS:
-            # TODO(future PR): default to CUDA dim1 kernel
-            return MXLinearConfig(gemm_kernel_choice=MXGemmKernelChoice.CUBLAS)
+            return MXLinearConfig(
+                gemm_kernel_choice=MXGemmKernelChoice.CUBLAS,
+                mxfp8_cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
+            )
         elif recipe_name is MXLinearRecipeName.MXFP8_CUBLAS_RCEIL:
             return MXLinearConfig(
                 gemm_kernel_choice=MXGemmKernelChoice.CUBLAS,