 from torchao.prototype.moe_training.kernels import (
     triton_fp8_col_major_jagged_colwise_scales,
     triton_fp8_row_major_jagged_rowwise_scales,
+    triton_fp8_rowwise_3d_transpose_rhs,
 )
 from torchao.prototype.moe_training.utils import (
     _is_column_major,
@@ -44,7 +45,7 @@ def _scaled_grouped_mm(
         out_dtype (Optional[torch.dtype]): The dtype of the output tensor. Currently only torch.bfloat16 is supported.
     """
     # TODO: Remove once prototype is more mature. This is currently very useful for development and debugging.
-    logger.info("Using scaled_grouped_mm")
+    # logger.info("Using scaled_grouped_mm")
     return _Float8GroupedMM.apply(
         A,
         B_t,
@@ -127,20 +128,11 @@ def forward(
         # Precompute non-transposed B column-major for backward, to save memory by storing the
         # low precision B tensor instead of the high precision B tensor.
         # In the backward this is needed for grad_A: grad_output @ B.
-        B = B_t.contiguous().transpose(-2, -1)
-
-        # - B shape: (E, N, K)
-        # - B scales must be computed rowwise keeping the outer/final dim, so:
-        #   - B_scale shape: (E, 1, K)
-        B_scales = tensor_to_scale(
-            B,
-            torch.float8_e4m3fn,
-            scaling_granularity=ScalingGranularity.AXISWISE,
-            axiswise_dim=-2,
+        B_fp8_col_major, B_scales = triton_fp8_rowwise_3d_transpose_rhs(
+            B_t,
+            output_dtype=torch.float8_e4m3fn,
             round_scales_to_power_of_2=True,
         )
-        B_scaled = B.to(torch.float32) * B_scales
-        B_fp8_col_major = to_fp8_saturated(B_scaled, torch.float8_e4m3fn)
 
         # Store what we need for backward.
         ctx.save_for_backward(A, B_fp8_col_major, B_scales, offs)
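
For context on what the fused kernel computes, below is a minimal eager-mode sketch of the path removed above. It assumes the tensor_to_scale / to_fp8_saturated helpers and the ScalingGranularity enum are importable from torchao.float8 (import paths are an assumption here); the Triton kernel is expected to produce the same column-major fp8 tensor and (E, 1, K) rowwise scales in a single pass instead of materializing the intermediate transposed and scaled tensors.

# Minimal eager-mode sketch of what triton_fp8_rowwise_3d_transpose_rhs replaces
# (based on the removed lines above; not the kernel implementation itself).
import torch
from torchao.float8.config import ScalingGranularity
from torchao.float8.float8_utils import tensor_to_scale, to_fp8_saturated

def fp8_rowwise_transpose_rhs_reference(B_t: torch.Tensor):
    # B_t: (E, K, N) high-precision expert weights. Produce the non-transposed
    # B in column-major layout, needed in backward for grad_A = grad_output @ B.
    B = B_t.contiguous().transpose(-2, -1)  # (E, N, K), column-major

    # Rowwise scales keeping the outer/final dim: shape (E, 1, K).
    B_scales = tensor_to_scale(
        B,
        torch.float8_e4m3fn,
        scaling_granularity=ScalingGranularity.AXISWISE,
        axiswise_dim=-2,
        round_scales_to_power_of_2=True,
    )
    B_scaled = B.to(torch.float32) * B_scales
    B_fp8_col_major = to_fp8_saturated(B_scaled, torch.float8_e4m3fn)
    return B_fp8_col_major, B_scales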
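
On round_scales_to_power_of_2=True: restricting scales to powers of two means scaling and unscaling only shift the floating-point exponent, so no extra mantissa rounding error is introduced beyond the fp8 quantization itself. A common way to implement such rounding (an assumption for illustration, not necessarily the library's exact code) is:

# Hypothetical helper: round a positive scale down to the nearest power of two.
def round_scale_down_to_power_of_2(scale: torch.Tensor) -> torch.Tensor:
    # exp2(floor(log2(x))) is the largest power of two <= x (assumes scale > 0, finite).
    return torch.exp2(torch.floor(torch.log2(scale)))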