 import torch
 from torch import nn
 
+from torchao.core.config import AOBaseConfig
+from torchao.prototype.blockwise_fp8.deep_gemm_utils import (
+    scaled_mm_deep_gemm_128_1_128_1,
+    scaled_mm_deep_gemm_128_1_128_128,
+)
 from torchao.prototype.blockwise_fp8.kernels import (
-    blockwise_fp8_gemm,
     fp8_blockwise_act_quant,
+    triton_quantize_fp8_block,
+)
+from torchao.quantization.transform_module import (
+    register_quantize_module_handler,
 )
 
 
-class BlockwiseQuantLinear(nn.Module):
+class fp8_blockwise_mm(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, x, weight, block_size):
+        assert block_size == 128, "Only support block_size=128"
+
+        # Temporarily reshape x to 2D tensor
+        x_orig_shape = x.shape
+        x = x.reshape(-1, x_orig_shape[-1])
+
+        # Triton kernel from DeepGEMM currently has the fastest activation quantization (1 x block_size)
+        x_fp8, x_scale = fp8_blockwise_act_quant(x, block_size)
+
+        # fbgemm currently has the fastest weight quantization (block_size x block_size)
+        weight_t_fp8, weight_t_scale = triton_quantize_fp8_block(
+            weight,
+            block_m=block_size,
+            block_k=block_size,
+            k_major=True,  # For [M,K] -> [K,M] in column-major
+        )
+
+        # DeepGEMM for blockwise GEMM where activation has (1 x block_size) scaling granularity
+        # and weight has (block_size x block_size) scaling granularity.
+        out = scaled_mm_deep_gemm_128_1_128_128(
+            x_fp8,
+            x_scale,
+            weight_t_fp8,
+            weight_t_scale,
+        )
+        ctx.save_for_backward(x, weight)
+        ctx.block_size = block_size
+        return out
+
+    @staticmethod
+    def backward(ctx, grad_output):
+        x, weight = ctx.saved_tensors
+        block_size = ctx.block_size
+
+        # left operand must be row-major
+        grad_output_fp8, grad_output_scale = fp8_blockwise_act_quant(
+            grad_output,
+            block_size,
+        )
+
+        # right operand must be column-major
+        weight_t_fp8, weight_t_scale = triton_quantize_fp8_block(
+            weight,
+            block_m=block_size,
+            block_k=block_size,
+            k_major=False,  # For [M,K] -> [K,M] in row-major
+        )
+        weight_t_fp8 = weight_t_fp8.t().contiguous().t()  # To col-major
+
+        # DeepGEMM for blockwise GEMM where left operand has (1 x block_size) scaling granularity
+        # and right operand has (block_size x block_size) scaling granularity.
+        # grad_x = grad_output @ weight.T
+        grad_x = scaled_mm_deep_gemm_128_1_128_128(
+            grad_output_fp8,
+            weight_t_fp8,
+            1.0 / grad_output_scale,
+            1.0 / weight_t_scale,
+        )
+
+        # left operand must be row-major
+        grad_output_t_fp8, grad_output_t_scale = fp8_blockwise_act_quant(
+            grad_output.t().contiguous(),
+            block_size,
+        )
+
+        # right operand must be column-major
+        x_fp8, x_scale = fp8_blockwise_act_quant(
+            x,
+            block_size,
+        )
+        x_fp8 = x_fp8.t().contiguous().t()  # To col-major
+
+        # DeepGEMM for blockwise GEMM where both operands have (1 x block_size) scaling granularity.
+        # grad_weight = grad_output.T @ x
+        grad_weight = scaled_mm_deep_gemm_128_1_128_1(
+            grad_output_t_fp8,
+            x_fp8,
+            1.0 / grad_output_t_scale,
+            1.0 / x_scale,
+        )
+        return grad_x, grad_weight, None  # one gradient per forward input (x, weight, block_size)
+
+
+class Float8BlockwiseLinear(nn.Linear):
     """
     Custom linear layer with support for quantized weights and optional bias.
 
@@ -25,53 +119,60 @@ class BlockwiseQuantLinear(nn.Module):
         dtype (torch.dtype): Data type for the weights. Defaults to torch.float8_e4m3fn.
     """
 
-    dtype = torch.bfloat16
+    supported_dtypes = [
+        torch.bfloat16,
+    ]
 
     def __init__(
         self,
-        in_features: int,
-        out_features: int,
-        bias: bool = False,
+        *args,
         block_size: int = 128,
-        dtype: torch.dtype = torch.float8_e4m3fn,
+        dtype=torch.bfloat16,
+        **kwargs,
     ):
-        super().__init__()
-        supported_dtypes = [
-            torch.float8_e4m3fn,
-            torch.float8_e5m2,
-        ]
-        assert dtype in supported_dtypes, (
-            f"Unsupported dtype: {dtype}. Supported dtypes: {supported_dtypes}"
-        )
-        scale_in_features = (in_features + block_size - 1) // block_size
-        scale_out_features = (out_features + block_size - 1) // block_size
-        self.weight = nn.Parameter(torch.empty(out_features, in_features, dtype=dtype))
-        self.weight.scale = self.scale = nn.Parameter(
-            torch.empty(scale_out_features, scale_in_features, dtype=torch.float32)
+        super().__init__(*args, **kwargs)
+
+        assert dtype in self.supported_dtypes, (
+            f"Unsupported dtype: {dtype}. Supported dtypes: {self.supported_dtypes}"
         )
         self.block_size = block_size
-        self.dtype
-
-        if bias:
-            self.bias = nn.Parameter(torch.empty(out_features))
-        else:
-            self.register_parameter("bias", None)
+        self.dtype = dtype
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Forward pass for the custom linear layer.
 
         Args:
-            x (torch.Tensor): Input tensor.
+            x (torch.Tensor): input tensor.
 
         Returns:
             torch.Tensor: Transformed tensor after linear computation.
         """
-        x, scale = fp8_blockwise_act_quant(x, self.block_size, self.dtype)
-        y = blockwise_fp8_gemm(
-            x, scale, self.weight, self.weight.scale, self.block_size
-        )
+        return fp8_blockwise_mm.apply(x, self.weight, self.block_size)
+
+    @classmethod
+    def from_float(
+        cls,
+        mod,
+    ):
+        assert mod.bias is None, "unsupported"
+        assert mod.in_features % 128 == 0, "unsupported"
+        assert mod.out_features % 128 == 0, "unsupported"
+        with torch.device("meta"):
+            new_mod = cls(
+                mod.in_features,
+                mod.out_features,
+                bias=False,
+            )
+        new_mod.weight = mod.weight
+        new_mod.bias = mod.bias
+        return new_mod
+
+
+class Float8BlockwiseLinearConfig(AOBaseConfig):
+    pass
+
 
-        if self.bias is not None:
-            y += self.bias
-        return y
+@register_quantize_module_handler(Float8BlockwiseLinearConfig)
+def _deep_gemm_float8_inference_linear_transform(module, config):
+    return Float8BlockwiseLinear.from_float(module)
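
Minimal usage sketch (not part of the diff) showing how the registered config is intended to plug into torchao's `quantize_` API. It assumes a CUDA setup with the DeepGEMM and fbgemm kernels available, that `Float8BlockwiseLinearConfig` is importable from the module this PR touches (path shown below is an assumption), and feature dimensions divisible by 128 with no bias, per the asserts in `from_float`; the model and shapes are illustrative only.

import torch
from torch import nn

from torchao.quantization import quantize_
# Assumed import path for this PR's config; adjust to wherever the module actually lands.
from torchao.prototype.blockwise_fp8.blockwise_linear import Float8BlockwiseLinearConfig

# Toy model: dims must be multiples of 128 and bias must be disabled.
model = nn.Sequential(nn.Linear(1024, 512, bias=False)).cuda().to(torch.bfloat16)

# quantize_ dispatches to the handler registered via register_quantize_module_handler,
# swapping each nn.Linear for a Float8BlockwiseLinear built by from_float.
quantize_(model, Float8BlockwiseLinearConfig())

x = torch.randn(256, 1024, device="cuda", dtype=torch.bfloat16, requires_grad=True)
y = model(x)         # forward: fp8_blockwise_mm quantizes x and weight, then calls DeepGEMM
y.sum().backward()   # backward: grad_x and grad_weight computed via the two blockwise GEMMs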