@@ -749,13 +749,37 @@ def nvfp4_quantize(
         AssertionError: If input dtype is not supported, tensor size is not
             divisible by block_size, tensor is not contiguous, or block_size != 16
     """
+    return _nvfp4_quantize(data_hp, block_size, per_tensor_scale)
+
+
+class _Float8Round(torch.autograd.Function):
+    """
+    Cast a tensor to float8 and back to float32 with backward STE.
+    """
+
+    @staticmethod
+    def forward(ctx, x: torch.Tensor) -> torch.Tensor:
+        return x.to(torch.float8_e4m3fn).to(torch.float32)
+
+    @staticmethod
+    def backward(ctx, gy: torch.Tensor) -> torch.Tensor:
+        return gy
+
+
+def _nvfp4_quantize(
+    data_hp: torch.Tensor,
+    block_size: int = 16,
+    per_tensor_scale: Optional[torch.Tensor] = None,
+    skip_dtype_cast_and_packing: bool = False,
+) -> tuple[torch.Tensor, torch.Tensor]:
     assert data_hp.dtype in (torch.bfloat16, torch.float), (
         f"{data_hp.dtype} not supported"
     )
     assert data_hp.size(-1) % block_size == 0, "K dim must be divisible by block_size"
     assert data_hp.is_contiguous(), "Only support contiguous data for now"
     assert block_size == 16, "NVFP4 requires block_size=16"
 
+    orig_dtype = data_hp.dtype
     orig_shape = data_hp.shape
     # Convert to float32 early for consistent precision with Triton implementation
     data_hp = data_hp.float().reshape(orig_shape[0], -1, block_size)
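
A minimal sketch (not part of the diff) of what the new _Float8Round function does: the forward pass snaps values onto the float8_e4m3fn grid, while the backward pass acts as a straight-through estimator and passes gradients through unchanged. The class body is copied from the hunk above; the input values are arbitrary.

import torch

class _Float8Round(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x: torch.Tensor) -> torch.Tensor:
        # Round-trip through float8_e4m3fn: values land on the fp8 grid.
        return x.to(torch.float8_e4m3fn).to(torch.float32)

    @staticmethod
    def backward(ctx, gy: torch.Tensor) -> torch.Tensor:
        # Straight-through estimator: the rounding is treated as identity in backward.
        return gy

x = torch.tensor([0.1234, 3.3, 300.0], requires_grad=True)
y = _Float8Round.apply(x)
y.sum().backward()
print(y)       # rounded values, e.g. tensor([0.1250, 3.2500, 288.0000], grad_fn=...)
print(x.grad)  # tensor([1., 1., 1.]) -- gradients pass straight through
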
@@ -767,10 +791,8 @@ def nvfp4_quantize(
     out_scales = None
     if per_tensor_scale is None:
         # We are doing single level scaling
-        block_scale_fp8 = torch.clamp(block_scale, min=E4M3_EPS, max=F8E4M3_MAX).to(
-            torch.float8_e4m3fn
-        )
-        block_scale_fp32 = block_scale_fp8.to(torch.float32)
+        block_scale_fp8 = torch.clamp(block_scale, min=E4M3_EPS, max=F8E4M3_MAX)
+        block_scale_fp32 = _Float8Round.apply(block_scale_fp8)
         data_scaled = data_hp / block_scale_fp32.unsqueeze(-1)
         out_scales = block_scale_fp8
     else:
@@ -782,8 +804,8 @@ def nvfp4_quantize(
         scaled_block_scales = block_scale_fp32 / per_tensor_scale
         scaled_block_scales_fp8 = torch.clamp(
             scaled_block_scales, min=E4M3_EPS, max=F8E4M3_MAX
-        ).to(torch.float8_e4m3fn)
-        scaled_block_scales_fp32 = scaled_block_scales_fp8.to(torch.float32)
+        )
+        scaled_block_scales_fp32 = _Float8Round.apply(scaled_block_scales_fp8)
         # We "temporarily" dequant the scaled_block_scales_fp32 to get the per_tensor_scale
         # To apply to data
         total_scale = per_tensor_scale * scaled_block_scales_fp32
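
A hedged numeric sketch of the two-level scaling math in this hunk. E4M3_EPS and F8E4M3_MAX are module constants not shown in the diff; the values used below are assumptions for illustration, and the explicit fp8 round-trip stands in for _Float8Round.apply.

import torch

F8E4M3_MAX = 448.0      # assumed value of the module constant (float8_e4m3fn max)
E4M3_EPS = 2.0 ** -6    # assumed stand-in for the module constant

per_tensor_scale = torch.tensor(2.5)
block_scale_fp32 = torch.tensor([1.0, 7.5, 1200.0])  # example per-block scales

# Re-scale the block scales by the per-tensor scale, clamp into e4m3 range,
# then round onto the float8_e4m3fn grid (what _Float8Round.apply does above).
scaled = torch.clamp(block_scale_fp32 / per_tensor_scale, min=E4M3_EPS, max=F8E4M3_MAX)
scaled_rounded = scaled.to(torch.float8_e4m3fn).to(torch.float32)

# The scale actually applied to the data is the product of the two levels.
total_scale = per_tensor_scale * scaled_rounded
print(scaled_rounded)  # e.g. tensor([  0.4062,   3.0000, 448.0000])
print(total_scale)     # e.g. tensor([   1.0156,    7.5000, 1120.0000])
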
@@ -792,8 +814,11 @@ def nvfp4_quantize(
 
     data_scaled = torch.clamp(data_scaled, -F4_E2M1_MAX, F4_E2M1_MAX)
     data_scaled = data_scaled.view(orig_shape)
-    data_lp = f32_to_f4_unpacked(data_scaled)
-    # TODO: NotImplementedError: "copy_kernel" not implemented for 'Float4_e2m1fn_x2'
-    # data_lp = pack_uint4(data_lp).view(torch.float4_e2m1fn_x2)
-    data_lp = pack_uint4(data_lp)
-    return out_scales, data_lp
+    if skip_dtype_cast_and_packing:
+        return out_scales.to(torch.float32), data_scaled.to(orig_dtype)
+    else:
+        data_lp = f32_to_f4_unpacked(data_scaled)
+        # TODO: NotImplementedError: "copy_kernel" not implemented for 'Float4_e2m1fn_x2'
+        # data_lp = pack_uint4(data_lp).view(torch.float4_e2m1fn_x2)
+        data_lp = pack_uint4(data_lp)
+        return out_scales.to(torch.float8_e4m3fn), data_lp
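
To make the end-to-end effect concrete, here is a hedged sketch of the single-level path followed by dequantization, mirroring the skip_dtype_cast_and_packing=True branch (no FP4 cast, no uint4 packing). The constants and the amax-based block_scale recipe are assumptions for illustration; the diff itself does not show how block_scale is computed.

import torch

F4_E2M1_MAX = 6.0       # assumed value: largest magnitude representable in FP4 E2M1
F8E4M3_MAX = 448.0      # assumed value of the module constant
E4M3_EPS = 2.0 ** -6    # assumed stand-in for the module constant
block_size = 16

data_hp = torch.randn(4, 64, dtype=torch.bfloat16)
orig_dtype, orig_shape = data_hp.dtype, data_hp.shape
data_f32 = data_hp.float().reshape(orig_shape[0], -1, block_size)

# Assumed block-scale recipe: per-block amax divided by the FP4 maximum.
block_scale = data_f32.abs().amax(dim=-1) / F4_E2M1_MAX

# Single-level path from the diff: clamp, round onto the fp8 grid, divide.
block_scale_fp8 = torch.clamp(block_scale, min=E4M3_EPS, max=F8E4M3_MAX)
block_scale_fp32 = block_scale_fp8.to(torch.float8_e4m3fn).to(torch.float32)
data_scaled = data_f32 / block_scale_fp32.unsqueeze(-1)
data_scaled = torch.clamp(data_scaled, -F4_E2M1_MAX, F4_E2M1_MAX).view(orig_shape)

# With skip_dtype_cast_and_packing=True the function returns the scales as float32
# and data_scaled cast back to orig_dtype, skipping f32_to_f4_unpacked/pack_uint4.
out_scales, data_fq = block_scale_fp8.to(torch.float32), data_scaled.to(orig_dtype)

# Dequantizing with the rounded block scales approximately recovers the input;
# the remaining error comes from fp8 scale rounding and clamping (no FP4 rounding yet).
recon = data_scaled.view(orig_shape[0], -1, block_size) * block_scale_fp32.unsqueeze(-1)
print((recon.view(orig_shape) - data_hp.float()).abs().max())
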