Commit e19cb46
Bump version for float8 dynamic quant and weight only quant configs

Summary: This PR changes the default VERSION for Float8DynamicActivationFloat8WeightConfig and Float8WeightOnlyConfig from 1 to 2, and deprecates the VERSION 1 configs and VERSION 1 quantized models; more details in #2649. It also extends the current config serialization to work with multiple config versions.

Test Plan: Serialize a model with a VERSION 1 config, load it back, and check that the deprecation warnings are properly printed:

```
python test/integration/test_loading_deprecated_checkpoint.py
```

Reviewers:

Subscribers:

Tasks:

Tags:
1 parent: fdcb0c4

File tree: 7 files changed, +120 −34 lines

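To make the serialization change concrete, here is a minimal sketch of the round trip, not part of this commit; it assumes the imports below resolve and that `config_to_dict`/`config_from_dict` accept and return the `{"_type", "_version", "_data"}` payload built in `torchao/core/config.py` further down.

```python
# Minimal sketch; assumes config_to_dict returns the
# {"_type", "_version", "_data"} payload shown in torchao/core/config.py.
from torchao.core.config import config_from_dict, config_to_dict
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig

# The default is now VERSION 2; VERSION=1 opts back into the deprecated
# AffineQuantizedTensor-based path.
old_style = Float8DynamicActivationFloat8WeightConfig(VERSION=1)

data = config_to_dict(old_style)
assert data["_version"] == 1  # the instance VERSION wins over the class default (2)

# Deserializing a stored VERSION 1 config now emits a warning instead of
# raising VersionMismatchError, which this commit removes.
restored = config_from_dict(data)
```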

test/dtypes/test_affine_quantized_float.py

Lines changed: 19 additions & 6 deletions

```diff
@@ -319,7 +319,8 @@ def test_mm_float8dq_per_row(
         )
         test_linear = copy.deepcopy(ref_linear)
         quantize_(
-            test_linear, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+            test_linear,
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerRow(), VERSION=1),
         )
 
         quant_weight = test_linear.weight
@@ -471,7 +472,10 @@ def test_float8_tensor_slicing_basic(self, granularity):
         # Create and quantize a model
         model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
         quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
+            model,
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=granularity, VERSION=1
+            ),
         )
 
         weight_impl = model.weight.original_weight_tensor.tensor_impl
@@ -505,7 +509,10 @@ def test_float8_tensor_slicing_per_tensor(self):
         # Create and quantize with per-tensor granularity
         model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
         quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor())
+            model,
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=PerTensor(), VERSION=1
+            ),
         )
 
         original_weight = model.weight
@@ -536,7 +543,8 @@ def test_float8_tensor_slicing_per_row(self):
         # Create and quantize with per-row granularity
         model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
         quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+            model,
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerRow(), VERSION=1),
         )
 
         original_weight = model.weight  # Shape: (32, 64)
@@ -574,7 +582,10 @@ def test_float8_tensor_slicing_edge_cases(self):
         # Create and quantize a model
         model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
         quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor())
+            model,
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=PerTensor(), VERSION=1
+            ),
         )
 
         original_weight = model.weight
@@ -612,7 +623,9 @@ def test_float8_tensor_slicing_functional_correctness(self, granularity):
         quant_model = copy.deepcopy(ref_model)
         quantize_(
             quant_model,
-            Float8DynamicActivationFloat8WeightConfig(granularity=granularity),
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=granularity, VERSION=1
+            ),
         )
 
         # Create input with batch size that works well with slicing
```
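The edits above all pin `VERSION=1` so the existing AffineQuantizedTensor assertions keep passing. For regular users the practical effect is the inverse, sketched below (a minimal sketch assuming a CUDA device and a torchao build that includes this commit):

```python
import torch
from torchao.quantization import (
    Float8DynamicActivationFloat8WeightConfig,
    PerRow,
    quantize_,
)

model = torch.nn.Linear(64, 32, bias=False).to("cuda").to(torch.bfloat16)

# New default: VERSION 2, backed by Float8Tensor.
quantize_(model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow()))

# The old behavior (as pinned in the tests above) now needs VERSION=1
# explicitly and emits a deprecation warning:
#   Float8DynamicActivationFloat8WeightConfig(granularity=PerRow(), VERSION=1)
```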

test/float8/test_base.py

Lines changed: 3 additions & 3 deletions

```diff
@@ -477,10 +477,10 @@ def test_quantize(self):
         m = nn.Sequential(nn.Linear(32, 32)).cuda()
         m = convert_to_float8_training(m)
         assert isinstance(m[0], Float8Linear), "Module is not a Float8Linear"
-        from torchao.quantization.quant_api import float8_weight_only, quantize_
+        from torchao.quantization import Float8WeightOnlyConfig, quantize_
 
-        quantize_(m, float8_weight_only())
-        assert m[0].weight.tensor_impl.float8_data.dtype == torch.float8_e4m3fn, (
+        quantize_(m, Float8WeightOnlyConfig())
+        assert m[0].weight.qdata.dtype == torch.float8_e4m3fn, (
             "Post quantization dtype should be torch.float8_e4m3fn"
         )
         with torch.no_grad():
```
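The rewritten assert reflects where the raw float8 bytes live in each version: `weight.tensor_impl.float8_data` on the VERSION 1 AffineQuantizedTensor versus `weight.qdata` on the VERSION 2 Float8Tensor. A minimal sketch of the new access path (assumes a CUDA device):

```python
import torch
import torch.nn as nn
from torchao.quantization import Float8WeightOnlyConfig, quantize_

m = nn.Sequential(nn.Linear(32, 32)).cuda()
quantize_(m, Float8WeightOnlyConfig())  # VERSION 2 is now the default

# VERSION 2 (Float8Tensor) exposes the float8 payload as `qdata`;
# under VERSION 1 the same bytes lived at `weight.tensor_impl.float8_data`.
assert m[0].weight.qdata.dtype == torch.float8_e4m3fn
```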
test/integration/test_loading_deprecated_checkpoint.py (new file; path from the test plan above)

Lines changed: 67 additions & 0 deletions

```diff
@@ -0,0 +1,67 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+import unittest
+import warnings
+
+import torch
+from torch.testing._internal import common_utils
+from torch.testing._internal.common_utils import (
+    TestCase,
+    run_tests,
+)
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from torchao.utils import is_sm_at_least_89
+
+_MODEL_NAMES = [
+    "torchao-testing/opt-125m-float8dq-row-v1-0.13-dev",
+]
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@unittest.skipIf(not is_sm_at_least_89(), "Need sm89+")
+class TestLoadingDeprecatedCheckpoint(TestCase):
+    @common_utils.parametrize("model_name", _MODEL_NAMES)
+    def test_load_model_and_run(self, model_name):
+        """Test that we print the correct warning message when loading a deprecated checkpoint"""
+        # Load and quantize model
+        with warnings.catch_warnings(record=True) as caught_warnings:
+            quantized_model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype="bfloat16",
+                device_map="cuda",
+            )
+            got_expected_message = any(
+                "Stored version is not the same as current default version of the config"
+                in str(w.message)
+                for w in caught_warnings
+            )
+            assert got_expected_message, "Didn't get expected message"
+
+            got_expected_message = any(
+                "Models quantized with VERSION 1 of Float8DynamicActivationFloat8WeightConfig are deprecated"
+                in str(w.message)
+                for w in caught_warnings
+            )
+            assert got_expected_message, "Didn't get expected message"
+
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        prompt = ("Hello, my name is",)
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+        ).to("cuda")
+        generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
+        # make sure it runs
+        _ = tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+
+common_utils.instantiate_parametrized_tests(TestLoadingDeprecatedCheckpoint)
+
+if __name__ == "__main__":
+    run_tests()
```

torchao/core/config.py

Lines changed: 13 additions & 21 deletions

```diff
@@ -8,13 +8,13 @@
 import enum
 import importlib
 import json
+import warnings
 from typing import Any, ClassVar, Dict
 
 import torch
 
 __all__ = [
     "AOBaseConfig",
-    "VersionMismatchError",
     "config_from_dict",
     "config_to_dict",
     "ALLOWED_AO_MODULES",
@@ -50,20 +50,6 @@ def _transform(
     VERSION: ClassVar[int] = 1
 
 
-class VersionMismatchError(Exception):
-    """Raised when trying to deserialize a config with a different version"""
-
-    def __init__(self, type_path, stored_version, current_version):
-        self.type_path = type_path
-        self.stored_version = stored_version
-        self.current_version = current_version
-        message = (
-            f"Version mismatch for {type_path}: "
-            f"stored version {stored_version} != current version {current_version}"
-        )
-        super().__init__(message)
-
-
 class ConfigJSONEncoder(json.JSONEncoder):
     """Custom JSON encoder for AOBaseConfig objects"""
 
@@ -80,7 +66,9 @@ def default(self, o):
         return {
             # Only store the class name, not the full module path
             "_type": o.__class__.__name__,
-            "_version": getattr(o.__class__, "VERSION", 1),
+            # not using class VERSION since we might be explicitly
+            # setting a different VERSION for the object itself
+            "_version": getattr(o, "VERSION", 1),
             "_data": data_dict,
         }
 
@@ -94,7 +82,9 @@ def default(self, o):
 
         return {
             "_type": o.__class__.__name__,
-            "_version": getattr(o.__class__, "VERSION", 1),
+            # not using class VERSION since we might be explicitly
+            # setting a different VERSION for the object itself
+            "_version": getattr(o, "VERSION", 1),
             "_data": processed_data,
         }
 
@@ -109,7 +99,9 @@ def default(self, o):
         return {
             # Only store the class name for dataclasses too
             "_type": o.__class__.__name__,
-            "_version": getattr(o.__class__, "VERSION", 1),
+            # not using class VERSION since we might be explicitly
+            # setting a different VERSION for the object itself
+            "_version": getattr(o, "VERSION", 1),
             "_data": data_dict,
         }
 
@@ -206,7 +198,6 @@ def config_from_dict(data: Dict[str, Any]) -> AOBaseConfig:
         An instance of the appropriate AOBaseConfig subclass
 
     Raises:
-        VersionMismatchError: If the stored version doesn't match the class version
         ValueError: If deserialization fails for other reasons
     """
     if not isinstance(data, dict):
@@ -241,10 +232,11 @@ def config_from_dict(data: Dict[str, Any]) -> AOBaseConfig:
             f"Failed to find class {type_path} in any of the allowed modules: {allowed_modules_str}"
         )
 
-    # Check version - require exact match
     current_version = getattr(cls, "VERSION", 1)
     if stored_version != current_version:
-        raise VersionMismatchError(type_path, stored_version, current_version)
+        warnings.warn(
+            f"Stored version is not the same as current default version of the config: {stored_version=}, {current_version=}, please check the deprecation warning"
+        )
 
     # Handle the case where obj_data is not a dictionary
     if not isinstance(obj_data, dict):
```
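The encoder now reads `VERSION` off the instance rather than the class, because an instance can pin a version different from the class default. A self-contained sketch with a hypothetical config class:

```python
from dataclasses import dataclass


@dataclass
class DemoConfig:  # hypothetical stand-in for an AOBaseConfig subclass
    VERSION: int = 2  # class-level default, like the bumped configs


pinned_old = DemoConfig(VERSION=1)

# Instance lookup sees the explicitly pinned version...
assert getattr(pinned_old, "VERSION", 1) == 1
# ...while a class lookup always reports the default and would mis-serialize it.
assert getattr(DemoConfig, "VERSION", 1) == 2
```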

torchao/dtypes/floatx/float8_layout.py

Lines changed: 4 additions & 0 deletions

```diff
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
+import warnings
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -109,6 +110,9 @@ def __init__(
         transposed: bool,
         _layout: Layout,
     ):
+        warnings.warn(
+            "Models quantized with VERSION 1 of Float8DynamicActivationFloat8WeightConfig are deprecated and will no longer be supported in March 2026 (9 months), please upgrade torchao and quantize again, or download a newer torchao checkpoint, see https://github.com/pytorch/ao/issues/2649 for more details"
+        )
         self.float8_data = float8_data
         self.scale = scale
         self.transposed = transposed
```

torchao/quantization/quant_api.py

Lines changed: 11 additions & 4 deletions

```diff
@@ -1489,15 +1489,15 @@ class Float8WeightOnlyConfig(AOBaseConfig):
     Args:
         weight_dtype (torch.dtype): The target data type for weight quantization. Default is torch.float8_e4m3fn.
         set_inductor_config (bool): if True, adjusts `torchinductor` settings to recommended values.
-        VERSION (int): the version of the config, version 1 is using AffineQuantizedTensor that we plan to deprecate/split, version 2 is using Float8Tensor
+        VERSION (int): the version of the config, version 1 is using AffineQuantizedTensor that we plan to deprecate/split, version 2 is using Float8Tensor (default)
 
     Note:
         The actual matmul will be computed in original precision of the weight tensor.
     """
 
     weight_dtype: torch.dtype = e4m3_dtype
     set_inductor_config: bool = True
-    VERSION: int = 1
+    VERSION: int = 2
 
 
 # for BC
@@ -1506,6 +1506,9 @@
 
 def _float8_weight_only_quant_tensor(weight, config):
     if config.VERSION == 1:
+        warnings.warn(
+            "VERSION 1 of Float8WeightOnlyConfig is deprecated and will no longer be supported in March 2026 (9 months), please use VERSION 2, see https://github.com/pytorch/ao/issues/2649 for more details"
+        )
         from torchao.dtypes import to_affine_quantized_floatx
 
         block_size = tuple([1 for _ in range(weight.dim() - 1)] + [weight.shape[-1]])
@@ -1629,7 +1632,7 @@ class Float8DynamicActivationFloat8WeightConfig(AOBaseConfig):
         activation_value_ub (Optional[float]): the upper bound for activation value for calculating scale
         kernel_preference (KernelPreference): kernel preference for ops like matmul, grouped matmul etc., by default (KernelPreference.AUTO) it will be chosen for the user based on hardware or other information, this only needs to be set in weight
         set_inductor_config (bool): if True, adjusts `torchinductor` settings to recommended values.
-        VERSION (int): the version of the config, version 1 is using AffineQuantizedTensor that we plan to deprecate/split, version 2 is using Float8Tensor
+        VERSION (int): the version of the config, version 1 is using AffineQuantizedTensor that we plan to deprecate/split, version 2 is using Float8Tensor (default)
 
     """
 
@@ -1641,7 +1644,7 @@
     activation_value_ub: Optional[float] = None
     kernel_preference: KernelPreference = KernelPreference.AUTO
     set_inductor_config: bool = True
-    VERSION: int = 1
+    VERSION: int = 2
 
     def __post_init__(self):
         if self.mm_config is None:
@@ -1680,6 +1683,10 @@ def _float8_dynamic_activation_float8_weight_quantize_tensor(weight, config):
         )
 
     if config.VERSION == 1:
+        warnings.warn(
+            "VERSION 1 of Float8DynamicActivationFloat8WeightConfig is deprecated and will no longer be supported in March 2026 (9 months), please use VERSION 2, see https://github.com/pytorch/ao/issues/2649 for more details"
+        )
+
         block_size = get_block_size(weight.shape[-2:], weight_granularity)
         if weight.dim() == 3:
             block_size = tuple([1] + list(block_size))
```
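Both transform functions follow the same dispatch shape: warn and keep the legacy path on VERSION 1, otherwise take the new Float8Tensor path. A simplified sketch of that pattern; `_quantize_v1` and `_quantize_v2` are hypothetical stand-ins for the real code paths:

```python
import warnings


def _quantize_v1(weight, config):
    """Hypothetical stand-in for the AffineQuantizedTensor (VERSION 1) path."""
    raise NotImplementedError


def _quantize_v2(weight, config):
    """Hypothetical stand-in for the Float8Tensor (VERSION 2) path."""
    raise NotImplementedError


def _float8_quantize_tensor(weight, config):
    # Legacy path: keep working for now, but tell the user it is going away.
    if config.VERSION == 1:
        warnings.warn(
            "VERSION 1 is deprecated and will no longer be supported in "
            "March 2026, please use VERSION 2, see "
            "https://github.com/pytorch/ao/issues/2649 for more details"
        )
        return _quantize_v1(weight, config)
    return _quantize_v2(weight, config)
```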

torchao/quantization/utils.py

Lines changed: 3 additions & 0 deletions

```diff
@@ -681,3 +681,6 @@ def recommended_inductor_config_setter():
     torch._inductor.config.fx_graph_cache = True
     torch._inductor.config.triton.unique_kernel_names = True
     torch.set_float32_matmul_precision("high")
+
+
+AQT_FLOAT8_DEPRECATION_WARNING = ""
```
