Commit c21b2e5
mx: expose scaling calculation methods in training UX
Summary:

Test Plan: performance on individual cast

```bash
(pytorch_nightly) [[email protected] ~/local/ao (20250728_mx_expose_scale)]$ python benchmarks/mx_formats/cast_bench.py --mode dim0_mx_floor
M 16384 K 16384 BLOCK_SIZE 32
GPU: NVIDIA B200
torch version: 2.9.0.dev20250724+cu128
triton version: 3.4.0
mode: dim0_mx_floor
time_us 184.38400328159332
mem_bw_gbps 4413.045391781173

(pytorch_nightly) [[email protected] ~/local/ao (20250728_mx_expose_scale)]$ python benchmarks/mx_formats/cast_bench.py --mode dim0_mx_rceil
M 16384 K 16384 BLOCK_SIZE 32
GPU: NVIDIA B200
torch version: 2.9.0.dev20250724+cu128
triton version: 3.4.0
mode: dim0_mx_rceil
time_us 143.39199662208557
mem_bw_gbps 5674.619191924083
```

Reviewers:

Subscribers:

Tasks:

Tags:

ghstack-source-id: aec9d07
ghstack-comment-id: 3129597761
Pull-Request: #2620
1 parent d05e54f commit c21b2e5

7 files changed, +187 -39 lines changed
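For context, a minimal sketch of how the new knob is meant to surface in the training UX, based on the config and test changes below (the `quantize_` import path and the toy model are assumptions for illustration, not part of this commit):

```python
import torch
import torch.nn as nn

from torchao.prototype.mx_formats.config import (
    MXFP8Dim1CastKernelChoice,
    MXLinearConfig,
    MXLinearRecipeName,
    ScaleCalculationMode,
)
from torchao.quantization import quantize_  # assumed import path

m = nn.Sequential(nn.Linear(256, 256, bias=False)).cuda().to(torch.bfloat16)

# option 1: use the new recipe, which selects rceil scale calculation
config = MXLinearConfig.from_recipe_name(MXLinearRecipeName.MXFP8_CUBLAS_RCEIL)

# option 2: start from an existing recipe and override the cast kernel and
# scale calculation mode, mirroring what the updated tests do
config = MXLinearConfig.from_recipe_name(MXLinearRecipeName.MXFP8_CUBLAS)
config.mxfp8_cast_kernel_choice = MXFP8Dim1CastKernelChoice.CUDA
config.scale_calculation_mode = ScaleCalculationMode.RCEIL

quantize_(m, config=config)
```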

benchmarks/float8/float8_roofline.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -170,7 +170,7 @@ def get_gemm_times(
     elif float8_recipe_name in ("rowwise", "rowwise_with_gw_hp"):
         scale_a = torch.ones(M, 1, device=device)
         scale_b = torch.ones(1, N, device=device)
-    elif mx_recipe_name == "mxfp8_cublas":
+    elif mx_recipe_name in ("mxfp8_cublas", "mxfp8_cublas_rceil"):
         scale_a = torch.ones(M, K // 32, device=device, dtype=torch.float8_e8m0fnu)
         scale_b = torch.ones(N, K // 32, device=device, dtype=torch.float8_e8m0fnu)
     else:
```

benchmarks/mx_formats/cast_bench.py

Lines changed: 47 additions & 4 deletions
```diff
@@ -11,6 +11,7 @@
 import triton
 from triton.testing import do_bench

+from torchao.prototype.mx_formats.config import ScaleCalculationMode
 from torchao.prototype.mx_formats.kernels import (
     triton_to_mxfp8_dim1,
 )
@@ -53,14 +54,18 @@ def scale_dim0_dim1_reference(
     return x_hp_d0_normalized, x_hp_d1_normalized.t(), amax_dim0, amax_dim1


-def to_mx_dim0_reference(x_hp, block_size):
-    scale_d0, data_d0 = to_mx(x_hp, torch.float8_e4m3fn, block_size)
+def to_mx_dim0_reference(x_hp, block_size, scaling_mode=ScaleCalculationMode.FLOOR):
+    scale_d0, data_d0 = to_mx(
+        x_hp, torch.float8_e4m3fn, block_size, scaling_mode=scaling_mode
+    )
     return data_d0, scale_d0


-def to_mx_dim1_reference(x_hp, block_size):
+def to_mx_dim1_reference(x_hp, block_size, scaling_mode=ScaleCalculationMode.FLOOR):
     x_hp = x_hp.t().contiguous()
-    scale_d1, data_d1 = to_mx(x_hp, torch.float8_e4m3fn, block_size)
+    scale_d1, data_d1 = to_mx(
+        x_hp, torch.float8_e4m3fn, block_size, scaling_mode=scaling_mode
+    )
     return data_d1.t(), scale_d1

@@ -84,7 +89,9 @@ def run(
         "dim1",
         "dim0_dim1",
         "dim0_mx_floor",
+        "dim0_mx_rceil",
         "dim1_mx_floor",
+        "dim1_mx_rceil",
         "dim1_mx_triton_floor",
         "dim1_mx_cuda_floor",
         "dim1_mx_cuda_rceil",
```
```diff
@@ -165,6 +172,24 @@ def run(
         bytes_w = (y_d0.numel() + s_d0.numel()) * bytes_per_el_fp8
         bps = (bytes_r + bytes_w) / (time_us / 1e6)

+    elif mode == "dim0_mx_rceil":
+        to_mx_dim0_reference_c = torch.compile(to_mx_dim0_reference)
+        y_d0, s_d0 = to_mx_dim0_reference_c(x, BLOCK_SIZE, ScaleCalculationMode.RCEIL)
+
+        for _ in range(2):
+            __ = to_mx_dim0_reference_c(x, BLOCK_SIZE, ScaleCalculationMode.RCEIL)
+        time_us = benchmark_cuda_function_in_microseconds(
+            lambda x, b: to_mx_dim0_reference_c(x, BLOCK_SIZE, ScaleCalculationMode.RCEIL),
+            x,
+            BLOCK_SIZE,
+        )
+
+        assert y_d0.dtype == torch.float8_e4m3fn
+        assert s_d0.dtype == torch.float8_e8m0fnu
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d0.numel() + s_d0.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
     elif mode == "dim1_mx_floor":
         to_mx_dim1_reference_c = torch.compile(to_mx_dim1_reference)
         y_d1, s_d1 = to_mx_dim1_reference_c(x, BLOCK_SIZE)
```
```diff
@@ -183,6 +208,24 @@ def run(
         bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
         bps = (bytes_r + bytes_w) / (time_us / 1e6)

+    elif mode == "dim1_mx_rceil":
+        to_mx_dim1_reference_c = torch.compile(to_mx_dim1_reference)
+        y_d1, s_d1 = to_mx_dim1_reference_c(x, BLOCK_SIZE, ScaleCalculationMode.RCEIL)
+
+        for _ in range(2):
+            __ = to_mx_dim1_reference_c(x, BLOCK_SIZE, ScaleCalculationMode.RCEIL)
+        time_us = benchmark_cuda_function_in_microseconds(
+            lambda x, b: to_mx_dim1_reference_c(x, BLOCK_SIZE, ScaleCalculationMode.RCEIL),
+            x,
+            BLOCK_SIZE,
+        )
+
+        assert y_d1.dtype == torch.float8_e4m3fn
+        assert s_d1.dtype == torch.float8_e8m0fnu
+        bytes_r = x.numel() * bytes_per_el_bf16
+        bytes_w = (y_d1.numel() + s_d1.numel()) * bytes_per_el_fp8
+        bps = (bytes_r + bytes_w) / (time_us / 1e6)
+
     elif mode == "dim1_mx_triton_floor":
         y_d1, s_d1 = triton_to_mxfp8_dim1(x, inner_block_size=BLOCK_SIZE)

```
test/prototype/mx_formats/test_mx_linear.py

Lines changed: 52 additions & 5 deletions
```diff
@@ -14,6 +14,7 @@
     MXFP8Dim1CastKernelChoice,
     MXLinearConfig,
     MXLinearRecipeName,
+    ScaleCalculationMode,
 )
 from torchao.prototype.mx_formats.constants import (
     DTYPE_FP6_E2M3,
@@ -78,7 +79,18 @@ def run_around_tests():
         MXFP8Dim1CastKernelChoice.CUDA,
     ],
 )
-def test_linear_eager_vs_hp(elem_dtype, bias, input_shape, mxfp8_cast_kernel_choice):
+@pytest.mark.parametrize(
+    "scale_calculation_mode",
+    [
+        ScaleCalculationMode.FLOOR,
+        ScaleCalculationMode.CEIL,
+        ScaleCalculationMode.EVEN,
+        ScaleCalculationMode.RCEIL,
+    ],
+)
+def test_linear_eager_vs_hp(
+    elem_dtype, bias, input_shape, mxfp8_cast_kernel_choice, scale_calculation_mode
+):
     """
     Smoke test for training linear module with mx weight, compares the following:
     * baseline: float32
@@ -94,6 +106,16 @@ def test_linear_eager_vs_hp(elem_dtype, bias, input_shape, mxfp8_cast_kernel_cho
     elif not is_sm_at_least_89():
         pytest.skip("CUDA capability >= 8.9 required for float8 in triton")

+    if mxfp8_cast_kernel_choice == MXFP8Dim1CastKernelChoice.TRITON:
+        if scale_calculation_mode != ScaleCalculationMode.FLOOR:
+            pytest.skip("unsupported configuration")
+    elif mxfp8_cast_kernel_choice == MXFP8Dim1CastKernelChoice.CUDA:
+        if scale_calculation_mode not in (
+            ScaleCalculationMode.FLOOR,
+            ScaleCalculationMode.RCEIL,
+        ):
+            pytest.skip("unsupported configuration")
+
     # elem_dtype is a tuple of (input, weight, gradient) dtypes.
     grad_shape = list(input_shape)
     grad_shape[-1] = 256
@@ -108,6 +130,7 @@ def test_linear_eager_vs_hp(elem_dtype, bias, input_shape, mxfp8_cast_kernel_cho
         elem_dtype_weight_override=elem_dtype[1],
         elem_dtype_grad_output_override=elem_dtype[2],
         mxfp8_cast_kernel_choice=mxfp8_cast_kernel_choice,
+        scale_calculation_mode=scale_calculation_mode,
     )
     quantize_(m_mx, config)

@@ -125,9 +148,9 @@ def test_linear_eager_vs_hp(elem_dtype, bias, input_shape, mxfp8_cast_kernel_cho
     y_ref.backward(g)
     y_mx.backward(g)

-    y_sqnr = compute_error(y_ref, y_mx)
-    w_g_sqnr = compute_error(m[0].weight.grad, getattr(m_mx, "0").weight.grad)
-    x_g_sqnr = compute_error(x_ref.grad, x.grad)
+    y_sqnr = compute_error(y_ref, y_mx).item()
+    w_g_sqnr = compute_error(m[0].weight.grad, getattr(m_mx, "0").weight.grad).item()
+    x_g_sqnr = compute_error(x_ref.grad, x.grad).item()

     if elem_dtype == (torch.float8_e4m3fn, torch.float8_e4m3fn, torch.float8_e4m3fn):
         assert y_sqnr >= 18.0
@@ -229,7 +252,20 @@ def test_activation_checkpointing():
         MXFP8Dim1CastKernelChoice.CUDA,
     ],
 )
-def test_linear_compile(hp_dtype, recipe_name, bias, mxfp8_cast_kernel_choice):
+@pytest.mark.parametrize(
+    "scale_calculation_mode",
+    [
+        ScaleCalculationMode.FLOOR,
+        ScaleCalculationMode.CEIL,
+        # even + compile does not work yet:
+        # https://gist.github.com/vkuzo/1a04845cd503b1c75291aa1ea3bf79c4
+        # ScaleCalculationMode.EVEN,
+        ScaleCalculationMode.RCEIL,
+    ],
+)
+def test_linear_compile(
+    hp_dtype, recipe_name, bias, mxfp8_cast_kernel_choice, scale_calculation_mode
+):
     """
     Verify that compile does not change numerics of MX linear fw + bw
     """
@@ -255,6 +291,16 @@ def test_linear_compile(hp_dtype, recipe_name, bias, mxfp8_cast_kernel_choice):
     if hp_dtype != torch.bfloat16:
         pytest.skip("unsupported configuration")

+    if mxfp8_cast_kernel_choice == MXFP8Dim1CastKernelChoice.TRITON:
+        if scale_calculation_mode != ScaleCalculationMode.FLOOR:
+            pytest.skip("unsupported configuration")
+    elif mxfp8_cast_kernel_choice == MXFP8Dim1CastKernelChoice.CUDA:
+        if scale_calculation_mode not in (
+            ScaleCalculationMode.FLOOR,
+            ScaleCalculationMode.RCEIL,
+        ):
+            pytest.skip("unsupported configuration")
+
     if hp_dtype == torch.bfloat16 and recipe_name != "mxfp8_cublas":
         # TODO(future PR): properly enable float32 + bfloat16 for every
         # recipe, this needs a cleanup of out_dtype (needs to match in-hp-dtype, even
@@ -269,6 +315,7 @@ def test_linear_compile(hp_dtype, recipe_name, bias, mxfp8_cast_kernel_choice):
     )
     config = MXLinearConfig.from_recipe_name(recipe_name)
     config.mxfp8_cast_kernel_choice = mxfp8_cast_kernel_choice
+    config.scale_calculation_mode = scale_calculation_mode

     quantize_(m_mx, config=config)
     m_mx_c = copy.deepcopy(m_mx)
```
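To exercise the new parametrizations locally, a typical invocation would be `pytest test/prototype/mx_formats/test_mx_linear.py -k "test_linear_eager_vs_hp or test_linear_compile" -v` (standard pytest filtering by test name; not part of this commit).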

torchao/prototype/mx_formats/config.py

Lines changed: 54 additions & 0 deletions
```diff
@@ -46,10 +46,34 @@ class MXFP8Dim1CastKernelChoice(Enum):
 class MXLinearRecipeName(Enum):
     MXFP8_EMULATED = "mxfp8_emulated"
     MXFP8_CUBLAS = "mxfp8_cublas"
+    MXFP8_CUBLAS_RCEIL = "mxfp8_cublas_rceil"
     MXFP4_EMULATED = "mxfp4_emulated"
     MXFP4_CUTLASS = "mxfp4_cutlass"


+class ScaleCalculationMode(Enum):
+    """
+    Enum representing the different methods for calculating MX block scaling.
+    There are four methods available:
+    FLOOR: recommended by the OCP MX Spec 1.0, uses X = 2^floor(log2(max_abs(v)) - max_exp).
+        It can overflow for large values and is a poor fit for gradient quantization.
+    CEIL: avoids overflow, but small values may be flushed to 0 by the larger scaling factor.
+        It uses X = 2^ceil(log2(max_abs(v)) - max_exp).
+    EVEN: a trade-off between FLOOR and CEIL. It uses X = 2^(floor(log2(rounding(max_abs(v))) - max_exp)).
+        It provides better accuracy for MX4 training compared to FLOOR and CEIL.
+    RCEIL: applies ceil to the ratio of max_abs(v) and max_pos, as described in
+        https://docs.nvidia.com/cuda/cublas/index.html#d-block-quantization,
+        section "Computing scaling and conversion factors for FP8 with UE8M0 scales".
+    """
+
+    FLOOR = "floor"
+    CEIL = "ceil"
+    # Note: `even` does not work with torch.compile yet:
+    # https://gist.github.com/vkuzo/1a04845cd503b1c75291aa1ea3bf79c4
+    EVEN = "even"
+    RCEIL = "rceil"
+
+
 def _validate_elem_dtype(elem_dtype):
     assert elem_dtype in SUPPORTED_ELEM_DTYPES, (
         f"elem_dtype: expected one of {SUPPORTED_ELEM_DTYPES}, got {elem_dtype}"
```
```diff
@@ -75,6 +99,22 @@ def _validate_gemm_kernel_choice(gemm_kernel_choice, block_size, elem_dtype):
     )


+def _validate_mxfp8_cast_kernel_choice(
+    mxfp8_cast_kernel_choice, scale_calculation_mode
+):
+    if mxfp8_cast_kernel_choice == MXFP8Dim1CastKernelChoice.TRITON:
+        assert scale_calculation_mode == ScaleCalculationMode.FLOOR, (
+            f"unsupported ScaleCalculationMode value {scale_calculation_mode} for dim1 triton cast"
+        )
+    elif mxfp8_cast_kernel_choice == MXFP8Dim1CastKernelChoice.CUDA:
+        assert scale_calculation_mode in (
+            ScaleCalculationMode.FLOOR,
+            ScaleCalculationMode.RCEIL,
+        ), (
+            f"unsupported ScaleCalculationMode value {scale_calculation_mode} for dim1 cuda cast"
+        )
+
+
 @dataclass
 class MXLinearConfig(AOBaseConfig):
     # block size for scaling, default is 32 to match
```
```diff
@@ -104,6 +144,8 @@ class MXLinearConfig(AOBaseConfig):
     # If True, uses a custom triton kernel for fp4 dequantize
     use_fp4_custom_triton_dequant_kernel: bool = False

+    scale_calculation_mode: ScaleCalculationMode = ScaleCalculationMode.FLOOR
+
     def __post_init__(self):
         _validate_elem_dtype(self.elem_dtype)
         _validate_gemm_kernel_choice(
```
```diff
@@ -115,6 +157,9 @@ def __post_init__(self):
         if self.elem_dtype_grad_output_override is not None:
             _validate_elem_dtype(self.elem_dtype_grad_output_override)
             assert self.gemm_kernel_choice == MXGemmKernelChoice.EMULATED, "unsupported"
+        _validate_mxfp8_cast_kernel_choice(
+            self.mxfp8_cast_kernel_choice, self.scale_calculation_mode
+        )

     @staticmethod
     def from_recipe_name(
```
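A quick illustration of what the new validation enforces, as a hypothetical snippet (constructing the config directly, with only the two relevant fields set):

```python
from torchao.prototype.mx_formats.config import (
    MXFP8Dim1CastKernelChoice,
    MXLinearConfig,
    ScaleCalculationMode,
)

# the triton dim1 cast kernel only supports FLOOR scaling, so this should
# trip the new assert in __post_init__
try:
    MXLinearConfig(
        mxfp8_cast_kernel_choice=MXFP8Dim1CastKernelChoice.TRITON,
        scale_calculation_mode=ScaleCalculationMode.RCEIL,
    )
except AssertionError as e:
    print(e)  # unsupported ScaleCalculationMode value ... for dim1 triton cast
```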
```diff
@@ -134,7 +179,14 @@ def from_recipe_name(
         if recipe_name is MXLinearRecipeName.MXFP8_EMULATED:
             return MXLinearConfig()
         elif recipe_name is MXLinearRecipeName.MXFP8_CUBLAS:
+            # TODO(future PR): default to CUDA dim1 kernel
             return MXLinearConfig(gemm_kernel_choice=MXGemmKernelChoice.CUBLAS)
+        elif recipe_name is MXLinearRecipeName.MXFP8_CUBLAS_RCEIL:
+            return MXLinearConfig(
+                gemm_kernel_choice=MXGemmKernelChoice.CUBLAS,
+                mxfp8_cast_kernel_choice=MXFP8Dim1CastKernelChoice.CUDA,
+                scale_calculation_mode=ScaleCalculationMode.RCEIL,
+            )
         elif recipe_name is MXLinearRecipeName.MXFP4_EMULATED:
             return MXLinearConfig(elem_dtype=torch.float4_e2m1fn_x2)
         elif recipe_name is MXLinearRecipeName.MXFP4_CUTLASS:
```
```diff
@@ -160,4 +212,6 @@ def short_str(self) -> str:
         s += f", mxfp8_cast_kernel_choice={self.mxfp8_cast_kernel_choice.value}"
         if self.use_fp4_custom_triton_dequant_kernel:
             s += ", use_fp4_custom_triton_dequant_kernel=True"
+        if self.scale_calculation_mode != ScaleCalculationMode.FLOOR:
+            s += f", scale_calculation_mode={self.scale_calculation_mode.value}"
         return s
```
