Bump version for float8 dynamic quant and weight only quant configs

jerryzh168 · jerryzh168 · commit 99a86bcc43c3 · 2025-08-04T18:31:22.000-07:00
Summary: This PR changes the default VERSION for Float8DynamicActivationFloat8WeightConfig and Float8WeightOnlyConfig from 1 to 2 and deprecates the VERSION 1 config and VERSION 1 quantized models; more details in #2649. It also extends the current config serialization to work with multiple config versions. Deprecation Note: ``` from transformers import AutoModelForCausalLM, AutoTokenizer model_name = "torchao-testing/opt-125m-float8dq-row-v1-0.13-dev" quantized_model = AutoModelForCausalLM.from_pretrained( model_name, torch_dtype="bfloat16", device_map="cuda", ) /data/users/jerryzh/ao/torchao/core/config.py:249: UserWarning: Stored version is not the same as current default version of the config: stored_version=1, current_version=2, please check the deprecation warning warnings.warn( /data/users/jerryzh/ao/torchao/dtypes/floatx/float8_layout.py:113: UserWarning: Models quantized with VERSION 1 of Float8DynamicActivationFloat8WeightConfig is deprecated and will no longer be supported in a future release, please upgrade torchao and quantize again, or download a newer torchao checkpoint, see #2649 for more details warnings.warn( ``` Suggestion: upgrade torchao to 0.13 or later and generate the checkpoint again: ``` quantize_(model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())) ``` Or download the checkpoint again (please let us know if the checkpoint is not updated). Test Plan: tested by serializing a model with a VERSION 1 config, loading it, and checking that the warnings are properly printed: ``` python test/integration/test_loading_deprecated_checkpoint.py ``` Reviewers: Subscribers: Tasks: Tags: stack-info: PR: #2650, branch: jerryzh168/stack/14
diff --git a/test/core/test_config.py b/test/core/test_config.py
@@ -7,6 +7,7 @@
 import json
 import os
 import tempfile
+import warnings
 from dataclasses import dataclass
 from unittest import mock
 
@@ -15,7 +16,6 @@
 
 from torchao.core.config import (
     AOBaseConfig,
-    VersionMismatchError,
     config_from_dict,
     config_to_dict,
 )
@@ -176,7 +176,7 @@ def test_disallowed_modules():
 
 
 def test_version_mismatch():
-    """Test that version mismatch raises an error during reconstruction."""
+    """Test that version mismatch prints a warning during reconstruction."""
     # Create a config
     dummy_config = DummyNonAllowedConfig()
     reconstructable = config_to_dict(dummy_config)
@@ -186,11 +186,13 @@ def test_version_mismatch():
 
     # Patch to allow the module but should still fail due to version mismatch
     with mock.patch("torchao.core.config.ALLOWED_AO_MODULES", {__name__}):
-        with pytest.raises(
-            VersionMismatchError,
-            match="Version mismatch for DummyNonAllowedConfig: stored version 1 != current version 2",
-        ):
+        with warnings.catch_warnings(record=True) as caught_warnings:
             config_from_dict(reconstructable)
+            assert any(
+                "Stored version is not the same as current default version of the config"
+                in str(w.message)
+                for w in caught_warnings
+            ), "Didn't get expected warning message for version mismatch"
 
 
 def test_default_version():
diff --git a/test/dtypes/test_affine_quantized_float.py b/test/dtypes/test_affine_quantized_float.py
@@ -320,7 +320,8 @@ def test_mm_float8dq_per_row(
         )
         test_linear = copy.deepcopy(ref_linear)
         quantize_(
-            test_linear, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+            test_linear,
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerRow(), VERSION=1),
         )
 
         quant_weight = test_linear.weight
@@ -472,7 +473,10 @@ def test_float8_tensor_slicing_basic(self, granularity):
         # Create and quantize a model
         model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
         quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
+            model,
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=granularity, VERSION=1
+            ),
         )
 
         weight_impl = model.weight.original_weight_tensor.tensor_impl
@@ -506,7 +510,10 @@ def test_float8_tensor_slicing_per_tensor(self):
         # Create and quantize with per-tensor granularity
         model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
         quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor())
+            model,
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=PerTensor(), VERSION=1
+            ),
         )
 
         original_weight = model.weight
@@ -537,7 +544,8 @@ def test_float8_tensor_slicing_per_row(self):
         # Create and quantize with per-row granularity
         model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
         quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
+            model,
+            Float8DynamicActivationFloat8WeightConfig(granularity=PerRow(), VERSION=1),
         )
 
         original_weight = model.weight  # Shape: (32, 64)
@@ -575,7 +583,10 @@ def test_float8_tensor_slicing_edge_cases(self):
         # Create and quantize a model
         model = torch.nn.Linear(64, 32, bias=False).to(device).to(dtype)
         quantize_(
-            model, Float8DynamicActivationFloat8WeightConfig(granularity=PerTensor())
+            model,
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=PerTensor(), VERSION=1
+            ),
         )
 
         original_weight = model.weight
@@ -613,7 +624,9 @@ def test_float8_tensor_slicing_functional_correctness(self, granularity):
         quant_model = copy.deepcopy(ref_model)
         quantize_(
             quant_model,
-            Float8DynamicActivationFloat8WeightConfig(granularity=granularity),
+            Float8DynamicActivationFloat8WeightConfig(
+                granularity=granularity, VERSION=1
+            ),
         )
 
         # Create input with batch size that works well with slicing
diff --git a/test/float8/test_base.py b/test/float8/test_base.py
@@ -473,10 +473,10 @@ def test_quantize(self):
         m = nn.Sequential(nn.Linear(32, 32)).cuda()
         m = convert_to_float8_training(m)
         assert isinstance(m[0], Float8Linear), "Module is not a Float8Linear"
-        from torchao.quantization.quant_api import float8_weight_only, quantize_
+        from torchao.quantization import Float8WeightOnlyConfig, quantize_
 
-        quantize_(m, float8_weight_only())
-        assert m[0].weight.tensor_impl.float8_data.dtype == torch.float8_e4m3fn, (
+        quantize_(m, Float8WeightOnlyConfig())
+        assert m[0].weight.qdata.dtype == torch.float8_e4m3fn, (
             "Post quantization dtype should be torch.float8_e4m3fn"
         )
         with torch.no_grad():
diff --git a/test/integration/test_loading_deprecated_checkpoint.py b/test/integration/test_loading_deprecated_checkpoint.py
@@ -0,0 +1,76 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD 3-Clause license found in the
+# LICENSE file in the root directory of this source tree.
+import unittest
+import warnings
+
+import torch
+from torch.testing._internal import common_utils
+from torch.testing._internal.common_utils import (
+    TestCase,
+    run_tests,
+)
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from torchao.utils import is_sm_at_least_89
+
+# Hugging Face Hub checkpoints that were serialized with a deprecated config
+# VERSION; loading each of them is expected to emit deprecation warnings.
+_MODEL_NAMES = [
+    "torchao-testing/opt-125m-float8dq-row-v1-0.13-dev",
+]
+
+
+@unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available")
+@unittest.skipIf(not is_sm_at_least_89(), "Need sm89+")
+class TestLoadingDeprecatedCheckpoint(TestCase):
+    """Checks that deprecated checkpoints still load, warn, and can generate."""
+
+    @common_utils.parametrize("model_name", _MODEL_NAMES)
+    def test_load_model_and_run(self, model_name):
+        """Test that we print correct warning message when loading a deprecated checkpoint"""
+        # Load the pre-quantized checkpoint while recording every warning.
+        with warnings.catch_warnings(record=True) as caught_warnings:
+            # Without an explicit "always" filter, ambient filters (e.g.
+            # -W ignore or PYTHONWARNINGS) could suppress the warnings this
+            # test asserts on.
+            warnings.simplefilter("always")
+            quantized_model = AutoModelForCausalLM.from_pretrained(
+                model_name,
+                torch_dtype="bfloat16",
+                device_map="cuda",
+            )
+
+        messages = [str(w.message) for w in caught_warnings]
+        assert any(
+            "Stored version is not the same as current default version of the config"
+            in message
+            for message in messages
+        ), "Didn't get expected warning message for version mismatch"
+
+        assert any(
+            "Models quantized with VERSION 1 of Float8DynamicActivationFloat8WeightConfig is deprecated"
+            in message
+            for message in messages
+        ), "Didn't get expected warning message for deprecation"
+
+        # Make sure the deprecated checkpoint can still run generation.
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        prompt = ["Hello, my name is"]
+        inputs = tokenizer(
+            prompt,
+            return_tensors="pt",
+        ).to("cuda")
+        generated_ids = quantized_model.generate(**inputs, max_new_tokens=128)
+        # Decoding is only a smoke check; the decoded text is not inspected.
+        _ = tokenizer.batch_decode(
+            generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+
+
+common_utils.instantiate_parametrized_tests(TestLoadingDeprecatedCheckpoint)
+
+if __name__ == "__main__":
+    run_tests()
diff --git a/test/quantization/quantize_/workflows/float8/test_float8_tensor.py b/test/quantization/quantize_/workflows/float8/test_float8_tensor.py
@@ -184,7 +184,6 @@ def test_fp8_linear_variants(
                 config = Float8DynamicActivationFloat8WeightConfig(
                     granularity=granularity,
                     kernel_preference=kernel_preference,
-                    VERSION=2,
                 )
             else:
                 assert mode == "weight-only", f"Unsupported mode: {mode}"
@@ -210,9 +209,7 @@ def test_fp8_linear_variants(
         "AssertionError: tensor(False, device='cuda:0') is not true : sqnr: -2.90625, will fix a bit later",
     )
     def test_slice(self, granularity):
-        config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=granularity, VERSION=2
-        )
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         dtype = torch.bfloat16
         device = "cuda"
         dummy = torch.nn.Linear(256, 256, bias=False, dtype=dtype, device=device)
@@ -273,9 +270,7 @@ def test_slice(self, granularity):
 
     @common_utils.parametrize("granularity", [PerTensor(), PerRow()])
     def test_slice_preserves_aliasing(self, granularity):
-        config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=granularity, VERSION=2
-        )
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         l = torch.nn.Linear(1024, 1024).to("cuda").to(torch.bfloat16)
         l.weight = torch.nn.Parameter(
             torch.zeros(1024, 1024, dtype=torch.bfloat16, device="cuda")
@@ -296,9 +291,7 @@ def test_slice_and_copy_similar_to_vllm(self, granularity):
 
         dtype = torch.bfloat16
         device = "cuda"
-        config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=granularity, VERSION=2
-        )
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         l = torch.nn.Linear(1024, 1024, device="cuda", dtype=dtype)
         quantize_(l, config)
 
@@ -335,9 +328,7 @@ def test_slice_and_copy_similar_to_vllm(self, granularity):
     @unittest.skipIf(not is_sm_at_least_90(), "Nedd sm90+")
     def test_bmm(self):
         # only support per row quantization
-        config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=PerRow(), VERSION=2
-        )
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=PerRow())
 
         class M(torch.nn.Module):
             def __init__(self, weight):
@@ -369,9 +360,7 @@ def forward(self, x):
         ],
     )
     def test_to_device(self, granularity, sizes):
-        config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=granularity, VERSION=2
-        )
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         M, N, K = sizes
         dtype = torch.bfloat16
         for device in self.GPU_DEVICES:
@@ -401,9 +390,7 @@ def test_to_device(self, granularity, sizes):
         ],
     )
     def test_cat(self, granularity, sizes):
-        config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=granularity, VERSION=2
-        )
+        config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         dtype = torch.bfloat16
         device = "cuda"
         M, N, K = sizes
@@ -461,9 +448,7 @@ def test_moe_weight_reshape_ops(self):
         dtype = torch.bfloat16
         device = "cuda"
 
-        bmm_config = Float8DynamicActivationFloat8WeightConfig(
-            granularity=granularity, VERSION=2
-        )
+        bmm_config = Float8DynamicActivationFloat8WeightConfig(granularity=granularity)
         moe_config = MoEQuantConfig(bmm_config)
 
         batch_size = 4
diff --git a/torchao/core/config.py b/torchao/core/config.py
@@ -8,13 +8,13 @@
 import enum
 import importlib
 import json
+import warnings
 from typing import Any, ClassVar, Dict
 
 import torch
 
 __all__ = [
     "AOBaseConfig",
-    "VersionMismatchError",
     "config_from_dict",
     "config_to_dict",
     "ALLOWED_AO_MODULES",
@@ -61,20 +61,6 @@ def _transform(
     VERSION: ClassVar[int] = _DEFAULT_VERSION
 
 
-class VersionMismatchError(Exception):
-    """Raised when trying to deserialize a config with a different version"""
-
-    def __init__(self, type_path, stored_version, current_version):
-        self.type_path = type_path
-        self.stored_version = stored_version
-        self.current_version = current_version
-        message = (
-            f"Version mismatch for {type_path}: "
-            f"stored version {stored_version} != current version {current_version}"
-        )
-        super().__init__(message)
-
-
 class ConfigJSONEncoder(json.JSONEncoder):
     """Custom JSON encoder for AOBaseConfig objects"""
 
@@ -91,7 +77,9 @@ def default(self, o):
             return {
                 # Only store the class name, not the full module path
                 "_type": o.__class__.__name__,
-                "_version": getattr(o.__class__, "VERSION", 1),
+                # not using class VERSION since we might be explicitly
+                # setting a different VERSION for the object itself
+                "_version": getattr(o, "VERSION", 1),
                 "_data": data_dict,
             }
 
@@ -105,7 +93,9 @@ def default(self, o):
 
             return {
                 "_type": o.__class__.__name__,
-                "_version": getattr(o.__class__, "VERSION", 1),
+                # not using class VERSION since we might be explicitly
+                # setting a different VERSION for the object itself
+                "_version": getattr(o, "VERSION", 1),
                 "_data": processed_data,
             }
 
@@ -120,7 +110,9 @@ def default(self, o):
             return {
                 # Only store the class name for dataclasses too
                 "_type": o.__class__.__name__,
-                "_version": getattr(o.__class__, "VERSION", 1),
+                # not using class VERSION since we might be explicitly
+                # setting a different VERSION for the object itself
+                "_version": getattr(o, "VERSION", 1),
                 "_data": data_dict,
             }
 
@@ -218,7 +210,6 @@ def config_from_dict(data: Dict[str, Any]) -> AOBaseConfig:
         An instance of the appropriate AOBaseConfig subclass
 
     Raises:
-        VersionMismatchError: If the stored version doesn't match the class version
         ValueError: If deserialization fails for other reasons
     """
     if not isinstance(data, dict):
@@ -253,10 +244,11 @@ def config_from_dict(data: Dict[str, Any]) -> AOBaseConfig:
             f"Failed to find class {type_path} in any of the allowed modules: {allowed_modules_str}"
         )
 
-    # Check version - require exact match
     current_version = getattr(cls, "VERSION", 1)
     if stored_version != current_version:
-        raise VersionMismatchError(type_path, stored_version, current_version)
+        warnings.warn(
+            f"Stored version is not the same as current default version of the config: {stored_version=}, {current_version=}, please check the deprecation warning"
+        )
 
     # Handle the case where obj_data is not a dictionary
     if not isinstance(obj_data, dict):
diff --git a/torchao/dtypes/floatx/float8_layout.py b/torchao/dtypes/floatx/float8_layout.py
@@ -3,6 +3,7 @@
 #
 # This source code is licensed under the BSD 3-Clause license found in the
 # LICENSE file in the root directory of this source tree.
+import warnings
 from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple, Union
 
@@ -109,6 +110,9 @@ def __init__(
         transposed: bool,
         _layout: Layout,
     ):
+        warnings.warn(
+            "Models quantized with VERSION 1 of Float8DynamicActivationFloat8WeightConfig is deprecated and will no longer be supported in a future release, please upgrade torchao and quantize again, or download a newer torchao checkpoint, see https://github.com/pytorch/ao/issues/2649 for more details"
+        )
         self.float8_data = float8_data
         self.scale = scale
         self.transposed = transposed
diff --git a/torchao/quantization/quant_api.py b/torchao/quantization/quant_api.py