[Transform] Serialize with tied weights #370


Open
wants to merge 94 commits into base: main

Commits (94)
d8a10ec
add utilities
kylesayrs May 30, 2025
d2af054
add tests
kylesayrs May 30, 2025
e32d5b5
add additional tests
kylesayrs May 30, 2025
9d0518b
add utils and tests
kylesayrs May 30, 2025
8c5a2d9
Implement transform factories
kylesayrs May 30, 2025
809e367
Merge branch 'kylesayrs/transform_utils' into kylesayrs/transform_fac…
kylesayrs May 30, 2025
8d613b3
add permutations
kylesayrs May 31, 2025
57d171a
add delete_offload_module
kylesayrs May 31, 2025
d77bcef
Merge branch 'kylesayrs/transform-accelerate-utilities' into kylesayr…
kylesayrs May 31, 2025
ab73b43
Merge branch 'kylesayrs/transform-accelerate-utilities' into kylesayr…
kylesayrs May 31, 2025
4b55733
Merge branch 'kylesayrs/transform_factory' into kylesayrs/transform_p…
kylesayrs May 31, 2025
aa7d21b
key inverses by weight
kylesayrs May 31, 2025
6901e02
fix tests
kylesayrs May 31, 2025
47ae9fe
standardize random hadamard
kylesayrs May 31, 2025
34f1343
Merge branch 'kylesayrs/transform_utils' into kylesayrs/transform_fac…
kylesayrs May 31, 2025
1039100
prepend input hooks
kylesayrs May 31, 2025
5677553
Merge remote-tracking branch 'origin' into kylesayrs/transform_utils
kylesayrs Jun 5, 2025
68ec14e
apply sqrt division first
kylesayrs Jun 5, 2025
a62418a
Merge branch 'kylesayrs/transform_utils' into kylesayrs/transform_fac…
kylesayrs Jun 5, 2025
b117523
use divided hadamards
kylesayrs Jun 5, 2025
a46f754
fix typo
kylesayrs Jun 5, 2025
cb1cb52
add random option
kylesayrs Jun 5, 2025
7c02bb2
Merge branch 'kylesayrs/transform_utils' into kylesayrs/transform_fac…
kylesayrs Jun 5, 2025
02af1e9
use random seeds, rename matrix multiply
kylesayrs Jun 5, 2025
f45f3e9
add deterministic generation to random matrix
kylesayrs Jun 5, 2025
7a7abdf
fix perm math
kylesayrs Jun 5, 2025
6e52894
update docstrings
kylesayrs Jun 5, 2025
7230933
update docstrings
kylesayrs Jun 5, 2025
f74fe3e
Merge branch 'kylesayrs/transform_factory' into kylesayrs/transform_p…
kylesayrs Jun 5, 2025
92ddea9
cleanup
kylesayrs Jun 5, 2025
779956f
cleanup 2
kylesayrs Jun 5, 2025
fbd2939
Merge branch 'kylesayrs/transform_utils' into kylesayrs/transform_fac…
kylesayrs Jun 5, 2025
dd72b6a
make seed optional
kylesayrs Jun 5, 2025
4ae491d
Merge branch 'kylesayrs/transform_factory' into kylesayrs/transform_p…
kylesayrs Jun 5, 2025
da19b0f
remove iterable check and missing return value
kylesayrs Jun 9, 2025
7ab17ce
Merge branch 'main' into kylesayrs/transform_permutations
kylesayrs Jun 10, 2025
33df50f
Merge remote-tracking branch 'origin' into kylesayrs/transform_permut…
kylesayrs Jun 10, 2025
6e1ec39
Remove unrelated changes
kylesayrs Jun 10, 2025
938e702
simplify code
kylesayrs Jun 10, 2025
27bc0b3
implement apply, use in tests
kylesayrs Jun 10, 2025
a27db62
use hadamards database file
kylesayrs Jun 11, 2025
ce63955
try manifest
kylesayrs Jun 11, 2025
7ae5863
try setup, update hadamards list
kylesayrs Jun 11, 2025
67675c3
fix setup
kylesayrs Jun 11, 2025
f061db9
add docstrings, cleanup
kylesayrs Jun 11, 2025
4a84ce1
fix setup, thank you @dbarbuzzi
kylesayrs Jun 11, 2025
cde1066
remove numpy, add tests
kylesayrs Jun 11, 2025
1ba6195
solidify dtype, add gpu tests
kylesayrs Jun 11, 2025
c373345
fix docstring
kylesayrs Jun 11, 2025
fbaf47a
add device option
kylesayrs Jun 11, 2025
5a887f4
construct on execution device, cache on offload device
kylesayrs Jun 11, 2025
310fe6d
save construction device changes for later
kylesayrs Jun 11, 2025
b715329
construct on execution device, cache on offload device
kylesayrs Jun 11, 2025
249323c
cite nja sloane
kylesayrs Jun 11, 2025
1823af4
Merge branch 'kylesayrs/extend-hadamard', remote-tracking branch 'ori…
kylesayrs Jun 11, 2025
94a0bf5
Merge remote-tracking branch 'origin' into kylesayrs/extend-hadamard
kylesayrs Jun 11, 2025
cf066e0
Merge branch 'kylesayrs/extend-hadamard' into kylesayrs/transform_con…
kylesayrs Jun 11, 2025
c1a4a34
remove dreg
kylesayrs Jun 11, 2025
5807ee1
put on device via safe_open
kylesayrs Jun 11, 2025
ccb88ed
nits and docstrings
kylesayrs Jun 12, 2025
feba695
update docstring
kylesayrs Jun 12, 2025
c8f6b53
Merge branch 'kylesayrs/extend-hadamard' into kylesayrs/transform_con…
kylesayrs Jun 12, 2025
e7f08e1
Merge branch 'kylesayrs/transform_construct_cache_device' into kylesa…
kylesayrs Jun 12, 2025
75b9307
Merge remote-tracking branch 'origin' into kylesayrs/transform_permut…
kylesayrs Jun 12, 2025
b6a0dd4
Merge remote-tracking branch 'origin' into kylesayrs/transform_constr…
kylesayrs Jun 13, 2025
955f2f5
Merge
kylesayrs Jun 23, 2025
226f367
merge with construct: construct in float32
kylesayrs Jun 23, 2025
9745acb
Merge remote-tracking branch 'origin' into kylesayrs/transform_apply
kylesayrs Jun 23, 2025
fd3390a
construct with same dtype, constructing on fp32 found no difference
kylesayrs Jun 23, 2025
3c55003
Merge branch 'kylesayrs/transform_construct_cache_device' into kylesa…
kylesayrs Jun 23, 2025
ad29c15
remove unnecessary imports
kylesayrs Jun 23, 2025
85f40b5
bugfixes (#375)
brian-dellabetta Jul 2, 2025
500af9b
use factory_kwargs
kylesayrs Jul 7, 2025
8e36540
add frozen dict to deps
kylesayrs Jul 7, 2025
48653ec
Merge remote-tracking branch 'origin' into kylesayrs/transform_permut…
kylesayrs Jul 7, 2025
56df0f7
fix style
kylesayrs Jul 7, 2025
a251569
merge
kylesayrs Jul 7, 2025
cb5a32b
Merge remote-tracking branch 'origin' into kylesayrs/transform_apply
kylesayrs Jul 7, 2025
06e0346
Merge branch 'kylesayrs/transform_permutations' into kylesayrs/transf…
kylesayrs Jul 7, 2025
0a4fea5
Merge branch 'kylesayrs/transform_construct_cache_device' into kylesa…
kylesayrs Jul 7, 2025
49740c6
use delete_offload_module
kylesayrs Jul 7, 2025
7dc182b
Merge remote-tracking branch 'origin' into kylesayrs/transform_constr…
kylesayrs Jul 7, 2025
80db2ce
Merge branch 'kylesayrs/transform_construct_cache_device' into kylesa…
kylesayrs Jul 7, 2025
e06bbad
add docstrign
kylesayrs Jul 7, 2025
438bc13
Merge remote-tracking branch 'origin' into kylesayrs/transform_apply
kylesayrs Jul 7, 2025
fd77ecc
use parametrize
kylesayrs Jul 8, 2025
5a95fd2
populate _dynamic_tied_weights_keys
kylesayrs Jun 28, 2025
b009f47
ensure serializable
kylesayrs Jul 8, 2025
2e362d2
remove extra space
kylesayrs Jul 8, 2025
c6abb96
apply style
kylesayrs Jul 8, 2025
3da59a0
Merge remote-tracking branch 'origin' into kylesayrs/transform_save
kylesayrs Jul 10, 2025
97345b0
merge dregs
kylesayrs Jul 10, 2025
4085613
skip offloading tests until transformers changes land
kylesayrs Jul 10, 2025
85419e2
Merge remote-tracking branch 'origin' into kylesayrs/transform_save
kylesayrs Jul 24, 2025
12 changes: 8 additions & 4 deletions src/compressed_tensors/quantization/lifecycle/forward.py
@@ -112,17 +112,21 @@ def dequantize(
             if scale.shape[1] == 1:
                 args = QuantizationArgs(strategy=QuantizationStrategy.CHANNEL)
             # Scale height matches input or is 1 -> group quantization across columns
-            #
+            #
             # Example 1: scale.shape[0] == 1
             # x_q: (4, 8), scale: (1, 4) -> 2 columns per group
             #
-            # Example 2: scale.shape[0] == x_q.shape[0]
+            # Example 2: scale.shape[0] == x_q.shape[0]
             # x_q: (4, 8), scale: (4, 4) -> 2 elements per group (per row)
             elif (scale.shape[0] == 1) or (scale.shape[0] == x_q.shape[0]):
                 group_size = int(x_q.shape[1] / scale.shape[1])
-                args = QuantizationArgs(strategy=QuantizationStrategy.GROUP, group_size=group_size)
+                args = QuantizationArgs(
+                    strategy=QuantizationStrategy.GROUP, group_size=group_size
+                )
             else:
-                args = QuantizationArgs(strategy=QuantizationStrategy.BLOCK, block_structure=scale.shape)
+                args = QuantizationArgs(
+                    strategy=QuantizationStrategy.BLOCK, block_structure=scale.shape
+                )
         else:
             raise ValueError(
                 f"Could not infer a quantization strategy from scale with {scale.ndim} "
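As a side note (not part of the diff), a minimal sketch of the shape arithmetic the comments above describe; the tensors here are hypothetical examples:

import torch

x_q = torch.zeros(4, 8)    # quantized tensor
scale = torch.zeros(4, 4)  # one scale per row and per group of columns

# mirrors the elif branch: scale has the same number of rows as x_q,
# so columns are grouped and the group size is inferred from the widths
group_size = x_q.shape[1] // scale.shape[1]
print(group_size)  # 2 elements per group, matching "Example 2" above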
12 changes: 7 additions & 5 deletions src/compressed_tensors/quantization/lifecycle/initialize.py
@@ -185,27 +185,29 @@ def _initialize_scale_zero_point(
         elif quantization_args.strategy == QuantizationStrategy.BLOCK:
             # For block quantization, scale shape should match number of blocks - only for weights
             if quantization_args.block_structure is None:
-                raise ValueError("Block quantization requires block_structure to be specified")
+                raise ValueError(
+                    "Block quantization requires block_structure to be specified"
+                )
             block_height, block_width = quantization_args.block_structure
             rows, cols = weight_shape[-2], weight_shape[-1]
             num_rows_blocks = math.ceil(rows / block_height)
             num_cols_blocks = math.ceil(cols / block_width)
 
             # Warn if dimensions don't divide evenly
             if rows % block_height != 0 or cols % block_width != 0:
                 warnings.warn(
                     f"Block quantization: tensor shape {weight_shape} does not divide evenly "
                     f"by block structure {quantization_args.block_structure}. "
                     f"Some blocks will be incomplete which may affect quantization quality.",
-                    UserWarning
+                    UserWarning,
                 )
 
             expected_shape = (num_rows_blocks, num_cols_blocks)
     elif quantization_args.strategy == QuantizationStrategy.BLOCK:
         warnings.warn(
             f"BLOCK quantization not supported for {base_name} activations. "
             f"Falling back to tensor-level quantization.",
-            UserWarning
+            UserWarning,
         )
         expected_shape = 1
 
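For reference, a small sketch (assumed shapes, not part of the diff) of the block-count arithmetic used above, including a case where the warning fires because the shape does not divide evenly:

import math

weight_shape = (300, 500)             # rows, cols of a hypothetical weight
block_height, block_width = 128, 128  # quantization_args.block_structure

num_rows_blocks = math.ceil(weight_shape[0] / block_height)  # 3
num_cols_blocks = math.ceil(weight_shape[1] / block_width)   # 4
expected_shape = (num_rows_blocks, num_cols_blocks)          # scale / zero-point shape

# 300 % 128 != 0 and 500 % 128 != 0, so the UserWarning above would be emitted
print(expected_shape)  # (3, 4)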
7 changes: 4 additions & 3 deletions src/compressed_tensors/quantization/quant_scheme.py
@@ -64,8 +64,9 @@ def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
                 raise ValueError("Cannot apply actorder to output activations")
 
         if (
-            inputs and weights
-            and weights.strategy == QuantizationStrategy.GROUP
+            inputs
+            and weights
+            and weights.strategy == QuantizationStrategy.GROUP
             and inputs.strategy == QuantizationStrategy.GROUP
             and weights.group_size != inputs.group_size
         ):
@@ -75,7 +76,7 @@ def validate_model_after(model: "QuantizationScheme") -> "QuantizationScheme":
                 "may complicate fused kernel implementations. Consider using "
                 "TENSOR_GROUP strategy for both or matching group sizes.",
                 UserWarning,
-                stacklevel=2
+                stacklevel=2,
             )
 
         return model
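For context, a hypothetical scheme that would trigger the warning above; field names follow this repository, but treat the exact constructor arguments as an assumption rather than an excerpt from the tests:

from compressed_tensors.quantization import (
    QuantizationArgs,
    QuantizationScheme,
    QuantizationStrategy,
)

# weights and input activations both use the GROUP strategy but with different
# group sizes, which is the mismatch validate_model_after warns about
scheme = QuantizationScheme(
    targets=["Linear"],
    weights=QuantizationArgs(strategy=QuantizationStrategy.GROUP, group_size=128),
    input_activations=QuantizationArgs(strategy=QuantizationStrategy.GROUP, group_size=64),
)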
55 changes: 52 additions & 3 deletions src/compressed_tensors/transform/factory/base.py
@@ -13,7 +13,8 @@
 # limitations under the License.
 
 from abc import ABC, abstractmethod
-from typing import Optional
+from collections import defaultdict
+from typing import List, Optional, Tuple
 
 import torch
 import torch.nn.utils.parametrize as P
@@ -49,10 +50,13 @@ class TransformFactory(RegistryMixin, ABC):
     :param seed: random seed used to transform weight randomization
     """
 
+    transforms: List["TransformBase"]
+
     def __init__(self, name: str, scheme: TransformScheme, seed: Optional[int] = None):
         self.name = name
         self.scheme = scheme
         self.generator = torch.Generator()
+        self.transforms = list()
         if seed is not None:
             self.generator.manual_seed(seed)
 
@@ -90,16 +94,26 @@ def apply_to_model(self, model: Module):
             for _, module in match_named_modules(model, arg.targets, arg.ignore):
                 self._apply_to_module(module, arg)
 
+        self._update_tied_weights()
+
     def _apply_to_module(self, module: Module, args: TransformArgs):
         """
        Create transforms and apply them to the module
 
         :param module: target module to apply transforms to
         :param args: defines how the transform will be applied to the target module
         """
+        if has_offloaded_params(module):
+            if module._hf_hook.place_submodules:
+                raise NotImplementedError(
+                    "Applying transforms to offloaded submodules with "
+                    "`place_submodules=True` is not supported"
+                )
+
         # create transform as submodule
         transform_name = f"{self.name}_{args.location}"
         transform = self.create_transform(module, args)
+        self.transforms.append(transform)
         register_offload_module(module, transform_name, transform)
 
         # register input transformation hook
@@ -128,8 +142,9 @@ def input_hook(_, args):
                     raise ValueError("Offloaded training is not supported")
                 P.register_parametrization(module, "weight", transform)
 
-            # transform is no longer needed (unfusing is not supported)
-            delete_offload_module(module, transform_name)
+            else:
+                # transform is no longer needed (unfusing is not supported)
+                delete_offload_module(module, transform_name)
 
         # register output transformation hook
         elif args.location == TransformLocation.OUTPUT:
@@ -143,6 +158,35 @@ def output_hook(_, _input, output):
         else:
             raise NotImplementedError()
 
+    def _update_tied_weights(self):
+        """
+        Populate the `_dynamic_tied_weights_keys` attribute of transforms,
+        which is used by transformers to detect and remove shared pointers
+        during saving
+        """
+        # avoid issues with this method being called twice
+        for transform in self.transforms:
+            transform._dynamic_tied_weights_keys = list()
+
+        # map from data_ptrs to keys
+        ptr_to_keys: dict[int, List[Tuple[TransformBase, str]]] = defaultdict(list)
+        for transform in self.transforms:
+            for name, param in transform.named_parameters(recurse=False):
+                # NOTE: previously asserted that parent._hf_hook.place_submodules=False
+                if has_offloaded_params(transform):
+                    param = transform._hf_hook.weights_map[name]
+                ptr_to_keys[param.data_ptr()].append((transform, name))
+
+        # populate `_dynamic_tied_weights_keys` if there is more than one key
+        # and ensure that they share tensors
+        for shared_keys in ptr_to_keys.values():
+            if len(shared_keys) > 1:
+                tensor = getattr(shared_keys[0][0], shared_keys[0][1])
+
+                for transform, name in shared_keys:
+                    transform._dynamic_tied_weights_keys.append(name)
+                    setattr(transform, name, tensor)
+
+
 class TransformBase(InternalModule, ABC):
     """
@@ -151,6 +195,11 @@ class TransformBase(InternalModule, ABC):
 
     args: TransformArgs
     weight: Parameter
+    _dynamic_tied_weights_keys: List[str]
Review thread on _dynamic_tied_weights_keys:

brian-dellabetta (Contributor), Jul 8, 2025:
Should this be a set instead to avoid this issue commented above?
    # avoid issues with this method being called twice

kylesayrs (Contributor, Author):
I'm a little confused how setting this would avoid issues with double-calling _update_tied_weights?

brian-dellabetta (Contributor), Jul 14, 2025:
Rather than appending and possibly ending up with redundant/duplicate elements in the list, you'd only be including new keys.

Contributor:
Feel free to resolve this, but i do think a set would help in making _update_tied_weights idempotent so we don't have to worry about multiple calls.

+
+    def __init__(self):
+        super().__init__()
+        self._dynamic_tied_weights_keys = list()
 
     @abstractmethod
     def forward(self, value: Tensor) -> Tensor:
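Following up on the thread above, a tiny runnable sketch of the set-based bookkeeping being suggested; DummyTransform is a made-up stand-in, not code from this PR:

import torch


class DummyTransform(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self._dynamic_tied_weights_keys = set()  # set instead of list
        self.weight = torch.nn.Parameter(torch.eye(4), requires_grad=False)


transform = DummyTransform()
for _ in range(2):  # simulate _update_tied_weights being called twice
    transform._dynamic_tied_weights_keys.add("weight")

print(transform._dynamic_tied_weights_keys)  # {'weight'}, re-adding is a no-op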
11 changes: 6 additions & 5 deletions src/compressed_tensors/transform/factory/hadamard.py
@@ -12,9 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import math
 from typing import Optional, Union
 
-import math
 import torch
 from compressed_tensors.transform import TransformArgs, TransformScheme
 from compressed_tensors.transform.factory.base import TransformBase, TransformFactory
@@ -103,7 +103,8 @@ def forward(self, value: Tensor) -> Tensor:
 
         if self.args.inverse:
             weight = weight.T
-
-        return apply_transform_weight(
-            weight, value, self.args.location, self.module_type
-        ) / self._scale
+
+        return (
+            apply_transform_weight(weight, value, self.args.location, self.module_type)
+            / self._scale
+        )
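Background for the division by self._scale above: a Hadamard matrix satisfies H @ H.T == n * I, so dividing by sqrt(n) yields an orthonormal transform. The assumption here is that _scale is sqrt(n), which is how the earlier "apply sqrt division" commits read; the matrix below is just the 4x4 Sylvester construction:

import math

import torch

H = torch.tensor(
    [[1, 1, 1, 1], [1, -1, 1, -1], [1, 1, -1, -1], [1, -1, -1, 1]],
    dtype=torch.float32,
)
n = H.shape[0]
assert torch.allclose(H @ H.T, n * torch.eye(n))

Q = H / math.sqrt(n)  # the normalized weight the factory effectively applies
assert torch.allclose(Q @ Q.T, torch.eye(n))  # its inverse is just the transpose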
@@ -70,6 +70,7 @@ def _create_weight(self, size: int, dtype: dtype, device: device) -> Parameter:
 
     def _create_inverse(self, weight: Parameter) -> Parameter:
         data = high_precision_invert(weight.data)
+        data = data.contiguous()  # ensure proper serialization
         return Parameter(data, requires_grad=False)
 
 
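Why the added .contiguous() call matters: tensors derived from transposed or otherwise strided views may not be contiguous, and safetensors refuses to serialize non-contiguous tensors. A small illustration with hypothetical tensors (whether the actual inverse here is non-contiguous depends on high_precision_invert):

import torch

w = torch.randn(8, 8) + 8 * torch.eye(8)  # well-conditioned example matrix
inv = torch.linalg.inv(w).T               # the transpose is a non-contiguous view
print(inv.is_contiguous())                # False
inv = inv.contiguous()                    # now safe to hand to safetensors / save_pretrained
print(inv.is_contiguous())                # True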
7 changes: 4 additions & 3 deletions tests/test_transform/conftest.py
@@ -14,12 +14,13 @@
 
 import pytest
 import torch
-from compressed_tensors.transform import TransformArgs
+from compressed_tensors.transform import TransformArgs, TransformFactory
+from transformers import PretrainedConfig, PreTrainedModel
 
 
-class TransformableModel(torch.nn.Module):
+class TransformableModel(PreTrainedModel):
     def __init__(self, *sizes):
-        super().__init__()
+        super().__init__(config=PretrainedConfig())
         self.fcs = torch.nn.ModuleList(
             [
                 torch.nn.Linear(sizes[index], sizes[index + 1], bias=False)
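Subclassing PreTrainedModel (rather than torch.nn.Module) is what lets the tests round-trip the toy model through Hugging Face serialization. A rough, hypothetical usage sketch of that round trip, reusing TransformableModel from the fixture above; import paths follow the test files and are assumptions:

import tempfile

from compressed_tensors.transform import (
    TransformArgs,
    TransformConfig,
    TransformScheme,
    apply_transform_config,
)

model = TransformableModel(2, 4, 8)  # fixture class defined in the diff above
config = TransformConfig(
    config_groups={
        "": TransformScheme(
            type="hadamard",
            apply=[
                TransformArgs(targets="Linear", location="input"),
                TransformArgs(targets="Linear", location="output", inverse=True),
            ],
        )
    }
)
apply_transform_config(model, config)

with tempfile.TemporaryDirectory() as tmp_dir:
    # shared transform weights are deduplicated via _dynamic_tied_weights_keys
    model.save_pretrained(tmp_dir, safe_serialization=True)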
32 changes: 15 additions & 17 deletions tests/test_transform/factory/test_correctness.py
@@ -27,13 +27,13 @@
 
 
 @pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
-@pytest.mark.parametrize("randomized", (True, False))
+@pytest.mark.parametrize("randomize", (True, False))
 @pytest.mark.parametrize("head_dim", (None, 2, 4))
 @pytest.mark.parametrize("input_batch_size", (1, 5, 17))
-def test_correctness_linear(type, randomized, head_dim, input_batch_size):
+def test_correctness_linear(type, randomize, head_dim, input_batch_size):
     size = (4, 8)
     module = torch.nn.Linear(*size, bias=False)
-    scheme = TransformScheme(type=type, randomized=randomized, head_dim=head_dim)
+    scheme = TransformScheme(type=type, randomize=randomize, head_dim=head_dim)
     factory = TransformFactory.from_scheme(scheme, name="")
 
     input_tfm = factory.create_transform(
@@ -58,10 +58,10 @@ def test_correctness_linear(type, randomized, head_dim, input_batch_size):
 
 
 @pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
-@pytest.mark.parametrize("randomized", (True, False))
+@pytest.mark.parametrize("randomize", (True, False))
 @pytest.mark.parametrize("embed_loc", ("weight_output", "output"))
 @pytest.mark.parametrize("linear_loc", ("input", "weight_input"))
-def test_correctness_embedding(type, randomized, embed_loc, linear_loc):
+def test_correctness_embedding(type, randomize, embed_loc, linear_loc):
     model = torch.nn.Sequential(
         torch.nn.Embedding(2, 4),
         torch.nn.Linear(4, 8, bias=False),
@@ -74,7 +74,7 @@ def test_correctness_embedding(type, randomized, embed_loc, linear_loc):
         config_groups={
             "": TransformScheme(
                 type=type,
-                randomized=randomized,
+                randomize=randomize,
                 apply=[
                     TransformArgs(targets="Embedding", location=embed_loc),
                     TransformArgs(targets="Linear", location=linear_loc, inverse=True),
@@ -90,10 +90,10 @@ def test_correctness_embedding(type, randomized, embed_loc, linear_loc):
 
 
 @pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
-@pytest.mark.parametrize("randomized", (True, False))
+@pytest.mark.parametrize("randomize", (True, False))
 @pytest.mark.parametrize("input_batch_size", (1, 5, 17))
 def test_correctness_model(
-    type, randomized, input_batch_size, model_apply, offload=False
+    type, randomize, input_batch_size, model_apply, offload=False
 ):
     # load model
     model = model_apply[0]
@@ -109,7 +109,7 @@ def test_correctness_model(
     # apply transforms
     config = TransformConfig(
         config_groups={
-            "": TransformScheme(type=type, randomized=randomized, apply=model_apply[1])
+            "": TransformScheme(type=type, randomize=randomize, apply=model_apply[1])
         }
     )
     apply_transform_config(model, config)
@@ -122,19 +122,17 @@ def test_correctness_model(
 @requires_gpu
 @requires_accelerate()
 @pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
-@pytest.mark.parametrize("randomized", (True, False))
+@pytest.mark.parametrize("randomize", (True, False))
 @pytest.mark.parametrize("input_batch_size", (1, 5, 17))
-def test_correctness_model_offload(type, randomized, input_batch_size, model_apply):
-    test_correctness_model(
-        type, randomized, input_batch_size, model_apply, offload=True
-    )
+def test_correctness_model_offload(type, randomize, input_batch_size, model_apply):
+    test_correctness_model(type, randomize, input_batch_size, model_apply, offload=True)
 
 
 @pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
-@pytest.mark.parametrize("randomized", (True, False))
+@pytest.mark.parametrize("randomize", (True, False))
 @pytest.mark.parametrize("head_dim", (4, 8))
 @pytest.mark.parametrize("input_batch_size", (1, 5, 17))
-def test_correctness_attention_heads(type, randomized, head_dim, input_batch_size):
+def test_correctness_attention_heads(type, randomize, head_dim, input_batch_size):
     hidden_size = 64
     num_attention_heads = 8
 
@@ -151,7 +149,7 @@ def test_correctness_attention_heads(type, randomized, head_dim, input_batch_size):
         config_groups={
             "": TransformScheme(
                 type=type,
-                randomized=randomized,
+                randomize=randomize,
                 head_dim=head_dim,
                 apply=[
                     TransformArgs(targets="v_proj", location="weight_output"),
12 changes: 6 additions & 6 deletions tests/test_transform/factory/test_memory.py
@@ -29,9 +29,9 @@
 
 
 @pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
-@pytest.mark.parametrize("randomized", (True, False))
+@pytest.mark.parametrize("randomize", (True, False))
 @pytest.mark.parametrize("requires_grad", (True, False))
-def test_memory_sharing(type, randomized, requires_grad, offload=False):
+def test_memory_sharing(type, randomize, requires_grad, offload=False):
     # load model (maybe with offloading)
     model = TransformableModel(2, 2, 4, 4, 8, 8)
     if offload:
@@ -42,7 +42,7 @@ def test_memory_sharing(type, randomized, requires_grad, offload=False):
         config_groups={
             "": TransformScheme(
                 type=type,
-                randomzied=randomized,
+                randomzied=randomize,
                 requires_grad=requires_grad,
                 apply=[
                     TransformArgs(targets="Linear", location="input"),
@@ -84,9 +84,9 @@ def test_memory_sharing(type, randomized, requires_grad, offload=False):
 @requires_gpu
 @requires_accelerate()
 @pytest.mark.parametrize("type", ("hadamard", "random-hadamard"))
-@pytest.mark.parametrize("randomized", (True, False))
+@pytest.mark.parametrize("randomize", (True, False))
 def test_memory_sharing_offload(
     type,
-    randomized,
+    randomize,
 ):
-    test_memory_sharing(type, randomized, requires_grad=False, offload=True)
+    test_memory_sharing(type, randomize, requires_grad=False, offload=True)
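The property these memory tests pin down is also what makes the tied-weights serialization above necessary: distinct transform modules can point at the same storage. A hedged sketch of that check, assuming a transformed model built as in test_memory_sharing:

from compressed_tensors.transform.factory.base import TransformBase

# gather the storage pointer of every transform weight in the model
ptrs = [
    module.weight.data_ptr()
    for module in model.modules()
    if isinstance(module, TransformBase)
]

# tied transforms share storage, so there are fewer unique pointers than transforms
assert len(set(ptrs)) < len(ptrs)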