Commit c8e0897

mxfp8 grouped mm backward pass
stack-info: PR: #2632, branch: danielvegamyhre/stack/24
1 parent 5376f65 commit c8e0897
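The backward pass added here computes the standard grouped-matmul gradients, with each GEMM operand quantized to mxfp8 before the grouped multiply. As a minimal bf16 sketch of the per-group math (hypothetical shapes; plain matmuls stand in for the mxfp8-emulated grouped GEMMs used in the diff):

import torch

# Hypothetical sizes: M tokens routed across E experts, contraction dim K, output dim N.
M, K, N, E = 128, 64, 32, 2
A = torch.randn(M, K, dtype=torch.bfloat16)         # input tokens
B_t = torch.randn(E, K, N, dtype=torch.bfloat16)    # per-expert weights
grad_out = torch.randn(M, N, dtype=torch.bfloat16)  # upstream gradient
offs = [64, 128]  # end offset of each expert's token group along M

start = 0
for e, end in enumerate(offs):
    # grad_A = grad_out @ B: (m_e, N) @ (N, K) -> (m_e, K)
    grad_A_group = grad_out[start:end] @ B_t[e].transpose(-2, -1)
    # grad_B = grad_out_t @ A: (N, m_e) @ (m_e, K) -> (N, K)
    grad_B_group = grad_out[start:end].t() @ A[start:end]
    start = end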

2 files changed (+202, -3 lines)

torchao/prototype/moe_training/scaled_grouped_mm.py

Lines changed: 115 additions & 2 deletions
@@ -17,6 +17,8 @@
 )
 from torchao.prototype.moe_training.utils import (
     _is_column_major,
+    _to_mxfp8_per_group_rowwise,
+    _to_mxfp8_per_group_colwise,
 )
 from torchao.prototype.mx_formats.mx_tensor import to_mx

@@ -300,6 +302,7 @@ def forward(

         # Store what we need for backward.
         ctx.save_for_backward(A, B_t, offs)
+        ctx.block_size = block_size
         ctx.out_dtype = out_dtype

         # Perform scaled grouped GEMM and return result.
@@ -317,8 +320,52 @@ def forward(
         return out

     @staticmethod
-    def backward(ctx, grad_output: torch.Tensor):
-        raise NotImplementedError
+    def backward(ctx, grad_out: torch.Tensor):
+        A, B_t, offs = ctx.saved_tensors
+        block_size = ctx.block_size
+        out_dtype = ctx.out_dtype
+
+        # Compute grad_A.
+        # grad_A = grad_out @ B
+        # i.e. scaled grouped mm of (M,N) @ (E,N,K) = (M,K)
+        grad_out_scale, grad_out_mx = to_mx(
+            grad_out, elem_dtype=torch.float8_e4m3fn, block_size=block_size
+        )
+
+        B_t_scale, B_t_mx = _to_mxfp8_3d_expert_weights_dim1(
+            B_t.transpose(-2, -1).contiguous(),
+            block_size=block_size,
+            elem_dtype=torch.float8_e4m3fn,
+        )
+
+        grad_A = emulated_mxfp8_scaled_grouped_mm(
+            grad_out_mx,
+            grad_out_scale,
+            B_t_mx,
+            B_t_scale,
+            offs=offs,
+            out_dtype=out_dtype,
+        )
+
+        # Compute grad_B = grad_out_t @ A
+        grad_out_t_mx, grad_out_t_scale = _to_mxfp8_per_group_rowwise(
+            grad_out,
+            offs=offs,
+            block_size=block_size,
+        )
+        A_mx, A_scale = _to_mxfp8_per_group_colwise(
+            A,
+            offs=offs,
+            block_size=block_size,
+        )
+        grad_B = emulated_mxfp8_scaled_grouped_mm(
+            grad_out_t_mx,
+            grad_out_t_scale,
+            A_mx,
+            A_scale,
+            offs=offs,
+        )
+        return grad_A, grad_B, None, None, None


 def _to_mxfp8_3d_expert_weights_dim1(
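The five values returned by backward line up one-to-one with forward's inputs — judging from this hunk, (A, B_t, offs, block_size, out_dtype) — with None for the three non-tensor arguments. A hedged sanity-check sketch of what the mxfp8 backward should approximate, assuming a build and GPU where torch._grouped_mm (already used in this file) is differentiable in bf16:

import torch

M, K, N, E = 128, 64, 32, 2
A = torch.randn(M, K, dtype=torch.bfloat16, device="cuda", requires_grad=True)
B_t = torch.randn(E, K, N, dtype=torch.bfloat16, device="cuda", requires_grad=True)
offs = torch.tensor([64, 128], dtype=torch.int32, device="cuda")

# Reference bf16 grouped GEMM and its autograd gradients.
out = torch._grouped_mm(A, B_t, offs=offs, out_dtype=torch.bfloat16)
out.backward(torch.ones_like(out))
# A.grad and B_t.grad are the targets the emulated mxfp8 backward approximates,
# up to mxfp8 quantization error.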
@@ -352,6 +399,26 @@ def emulated_mxfp8_scaled_grouped_mm(
     offs: Optional[torch.Tensor] = None,
     out_dtype: Optional[torch.dtype] = torch.bfloat16,
     block_size: int = 32,
+) -> torch.Tensor:
+    if A_mx.ndim == 2 and B_t_mx.ndim == 3:
+        return _emulated_mxfp8_scaled_grouped_mm_2d_3d(
+            A_mx, A_scale, B_t_mx, B_t_scale, offs, out_dtype, block_size
+        )
+    elif A_mx.ndim == 2 and B_t_mx.ndim == 2:
+        return _emulated_mxfp8_scaled_grouped_mm_2d_2d(
+            A_mx, A_scale, B_t_mx, B_t_scale, offs, out_dtype, block_size
+        )
+    else:
+        raise NotImplementedError
+
+def _emulated_mxfp8_scaled_grouped_mm_2d_3d(
+    A_mx: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_t_mx: torch.Tensor,
+    B_t_scale: torch.Tensor,
+    offs: Optional[torch.Tensor] = None,
+    out_dtype: Optional[torch.dtype] = torch.bfloat16,
+    block_size: int = 32,
 ) -> torch.Tensor:
     # Dequantize input
     # A_mx shape: (M, K)
@@ -397,3 +464,49 @@ def emulated_mxfp8_scaled_grouped_mm(
     # Perform bf16 grouped GEMM.
     out = torch._grouped_mm(A, B_t, offs=offs, out_dtype=out_dtype)
     return out
+
+
+def _emulated_mxfp8_scaled_grouped_mm_2d_2d(
+    A_mx: torch.Tensor,
+    A_scale: torch.Tensor,
+    B_t_mx: torch.Tensor,
+    B_t_scale: torch.Tensor,
+    offs: torch.Tensor,
+    out_dtype: Optional[torch.dtype] = torch.bfloat16,
+    block_size: int = 32,
+) -> torch.Tensor:
+    A = torch.empty(A_mx.shape, dtype=torch.bfloat16, device=A_mx.device, requires_grad=A_mx.requires_grad)
+    B_t = torch.empty(B_t_mx.shape, dtype=torch.bfloat16, device=B_t_mx.device, requires_grad=B_t_mx.requires_grad)
+
+    # Dequantize input per each scaling group.
+    scales_start_idx = 0
+    group_start_idx = 0
+    for group_end_idx in offs.tolist():
+        # -- Dequantize A tensor
+        # A_group shape: (M, group_size)
+        # A_scale shape: (M, group_size // block_size)
+        A_group = A_mx[:, group_start_idx:group_end_idx]
+        A_group_shape = A_group.shape
+
+        # Get scales for this group.
+        # scales shape: (M, group_size // block_size)
+        group_size = group_end_idx - group_start_idx
+        num_scale_cols = group_size // block_size
+        scales = A_scale[:, scales_start_idx : scales_start_idx + num_scale_cols]
+
+        # Reshape to be able to do per-scaling-group multiplication.
+        # A_group shape: (M, group_size // block_size, block_size)
+        # scales shape: (M, group_size // block_size, 1)
+        A_group = A_group.reshape(*A_group.shape[:-1], A_group.shape[-1] // block_size, block_size)
+        scales = scales.unsqueeze(-1)
+
+        # Rescale and cast to bfloat16.
+        A_group = A_group.to(torch.bfloat16) * scales.to(torch.bfloat16)
+
+        # Reshape back to original shape and write into the dequantized buffer.
+        # A_group shape: (M, group_size)
+        A_group = A_group.reshape(A_group_shape)
+        A[:, group_start_idx:group_end_idx] = A_group
+
+
+        # -- Dequantize B_t tensor

torchao/prototype/moe_training/utils.py

Lines changed: 87 additions & 1 deletion
@@ -5,8 +5,9 @@

 from torchao.float8.config import ScalingGranularity
 from torchao.float8.float8_utils import tensor_to_scale, to_fp8_saturated
+from torchao.prototype.mx_formats.mx_tensor import to_mx

-
+# --- float8 rowwise scaling ---
 def _to_2d_jagged_float8_tensor_colwise(
     A_col_major: torch.Tensor,
     offs: torch.Tensor,
@@ -142,6 +143,91 @@ def _to_2d_jagged_float8_tensor_rowwise(

     return x_fp8, x_scales

+# --- mxfp8 scaling ---
+def _to_mxfp8_per_group_rowwise(
+    x: torch.Tensor,
+    offs: torch.Tensor,
+    block_size: int = 32,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    This is a reference implementation used for testing correctness; it is not performant.
+
+    This function converts the 2D input tensor to an mxfp8 tensor along dim 0, with per-token-group
+    scaling, where groups are determined based on the offsets.
+
+    Args:
+        x (torch.Tensor): The input tensor to be converted to a jagged mxfp8 tensor.
+
+    Returns:
+        A tuple containing the jagged mxfp8 tensor and the scales used for the conversion.
+    """
+    assert x.ndim == 2, "input tensor must be 2D"
+
+    x_mx = torch.empty_like(x, dtype=torch.float8_e4m3fn)
+    x_scales = None
+
+    start_idx = 0
+    for end_idx in offs.tolist():
+        # Get the subtensor of x for this group: all rows, with the next group of columns.
+        subtensor = x[:, start_idx:end_idx]  # (M, local_group_size)
+
+        # Perform mxfp8 conversion on the logically distinct subtensor.
+        scales, mx_subtensor = to_mx(subtensor, elem_dtype=torch.float8_e4m3fn, block_size=block_size)
+
+        # Store this portion of the resulting mxfp8 tensor and scales.
+        x_mx[:, start_idx:end_idx] = mx_subtensor
+        if x_scales is None:
+            x_scales = scales
+        else:
+            x_scales = torch.cat((x_scales, scales), dim=1)
+
+        # Update start index for next group.
+        start_idx = end_idx
+
+    return x_mx, x_scales
+
+def _to_mxfp8_per_group_colwise(
+    A_col_major: torch.Tensor,  # (K, N)
+    offs: torch.Tensor,
+    block_size: int = 32,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    This is a reference implementation used for testing correctness; it is not performant.
+
+    This function converts the 2D input tensor to an mxfp8 tensor along dim 1, with per-token-group
+    scaling, where groups are determined based on the offsets.
+
+    Args:
+        A_col_major (torch.Tensor): The input tensor to be converted to an mxfp8 tensor.
+
+    Returns:
+        A tuple containing the mxfp8 tensor and the scales used for the conversion.
+    """
+    assert A_col_major.ndim == 2, "A must be 2D"
+
+    A_fp8_col_major = torch.empty_like(A_col_major, dtype=torch.float8_e4m3fn)
+    A_scales = None
+
+    start_idx = 0
+    for end_idx in offs.tolist():
+        # Get the subtensor of A for this group: the next group of rows, with all columns for each.
+        subtensor = A_col_major[start_idx:end_idx, :]  # (local_group_size, N)
+
+        # Convert to mxfp8 along dim 1, by transposing, converting, and transposing back.
+        scales, mx_subtensor = to_mx(subtensor.transpose(-2, -1), elem_dtype=torch.float8_e4m3fn, block_size=block_size)
+        scales, mx_subtensor = scales.transpose(-2, -1), mx_subtensor.transpose(-2, -1)
+
+        # Store this portion of the resulting mxfp8 tensor and scales.
+        A_fp8_col_major[start_idx:end_idx, :] = mx_subtensor
+        if A_scales is None:
+            A_scales = scales
+        else:
+            A_scales = torch.cat((A_scales, scales))
+
+        # Update start index for next group.
+        start_idx = end_idx
+
+    return A_fp8_col_major, A_scales

 def _is_column_major(x: torch.Tensor) -> bool:
     """
