@@ -10,7 +10,12 @@
 )
 from compressed_tensors.quantization.lifecycle.forward import forward_quantize
 from compressed_tensors.quantization.utils import is_kv_cache_quant_scheme
-from compressed_tensors.utils import align_module_device, update_parameter_data
+from compressed_tensors.utils import (
+    align_module_device,
+    delete_offload_parameter,
+    register_offload_parameter,
+    update_offload_parameter,
+)
 from loguru import logger
 from torch.nn import Module
 
@@ -116,7 +121,7 @@ def call_observer(
                 value,
                 should_calculate_gparam=True,
             )
-            update_parameter_data(module, global_scale, f"{base_name}_global_scale")
+            update_offload_parameter(module, f"{base_name}_global_scale", global_scale)
         else:
             global_scale = getattr(module, f"{base_name}_global_scale", None)
 
@@ -127,22 +132,21 @@ def call_observer(
             # register or update scale & zero_point parameters (supports block shapes)
             scale_name = f"{base_name}_scale"
             zp_name = f"{base_name}_zero_point"
-            if not hasattr(module, scale_name) or getattr(module, scale_name).shape != updated_scale.shape:
-                if hasattr(module, scale_name):
-                    delattr(module, scale_name)
-                module.register_parameter(
-                    scale_name, torch.nn.Parameter(updated_scale.clone())
-                )
-            else:
-                update_parameter_data(module, updated_scale, scale_name)
-            if not hasattr(module, zp_name) or getattr(module, zp_name).shape != updated_zero_point.shape:
-                if hasattr(module, zp_name):
-                    delattr(module, zp_name)
-                module.register_parameter(
-                    zp_name, torch.nn.Parameter(updated_zero_point.clone())
-                )
-            else:
-                update_parameter_data(module, updated_zero_point, zp_name)
+            for name, value in [
+                (scale_name, updated_scale),
+                (zp_name, updated_zero_point),
+            ]:
+                if (
+                    not hasattr(module, name)
+                    or getattr(module, name).shape != value.shape
+                ):
+                    if hasattr(module, name):
+                        delete_offload_parameter(module, name)
+                    register_offload_parameter(
+                        module, name, torch.nn.Parameter(value.clone(), requires_grad=False)
+                    )
+                else:
+                    update_offload_parameter(module, name, value)
 
 
 def update_weight_global_scale(module: Module):
@@ -273,8 +277,8 @@ def calibrate_kv_cache_output_hook(module: Module, _args: Any, _output: torch.Tensor):
     kv_cache = getattr(module, "kv_cache")
     k_scale = kv_cache.k_scales[module.layer_idx]
     v_scale = kv_cache.v_scales[module.layer_idx]
-    update_parameter_data(module, k_scale, KVCacheScaleType.KEY.value)
-    update_parameter_data(module, v_scale, KVCacheScaleType.VALUE.value)
+    update_offload_parameter(module, KVCacheScaleType.KEY.value, k_scale)
+    update_offload_parameter(module, KVCacheScaleType.VALUE.value, v_scale)
 
 
 def initialize_quantized_kv_cache(module: Module):
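
The loop in the second hunk implements a register-or-update pattern: when a scale or zero point keeps its shape it is updated in place, but when calibration produces a differently shaped tensor (for example, switching to block-wise scales) the old parameter is deleted and a fresh one is registered so any offloading hooks track the new tensor. Below is a minimal sketch of that flow, assuming compressed-tensors exposes `delete_offload_parameter`, `register_offload_parameter`, and `update_offload_parameter` as free functions taking the module and parameter name; the toy `Linear` module, the `weight_scale` name, and the scale shape are illustrative only.

```python
import torch
from compressed_tensors.utils import (
    delete_offload_parameter,
    register_offload_parameter,
    update_offload_parameter,
)

module = torch.nn.Linear(16, 16)  # stand-in for a layer under calibration
new_scale = torch.ones(4, 4)      # e.g. freshly observed block-wise scales

name = "weight_scale"             # illustrative parameter name
if not hasattr(module, name) or getattr(module, name).shape != new_scale.shape:
    # First calibration step, or the scale shape changed: re-register the
    # parameter so offloading hooks track the new tensor.
    if hasattr(module, name):
        delete_offload_parameter(module, name)
    register_offload_parameter(
        module, name, torch.nn.Parameter(new_scale.clone(), requires_grad=False)
    )
else:
    # Same shape: update in place, keeping any offloaded copy in sync.
    update_offload_parameter(module, name, new_scale)
```

Using the offload-aware helpers instead of plain `register_parameter`/`delattr` is meant to keep the offloaded copy of the parameter consistent when the module is dispatched with accelerate hooks.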