
Commit c2a2016

Author: George Ohashi (committed)
Channel-wise FP8 quantization, attention modules
Signed-off-by: George Ohashi <[email protected]>
1 parent 76fc03d commit c2a2016

File tree: 3 files changed (+55, -48 lines)


examples/quantization_kv_cache/llama3_fp8_kv_example.py

Lines changed: 27 additions & 25 deletions

@@ -58,33 +58,35 @@ def process_and_tokenize(example):
                         strategy: channel
                         dynamic: false
                         symmetric: true
-                    targets: ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj']
+                    # targets: ['re:.*q_proj', 're:.*k_proj', 're:.*v_proj']
+                    targets: ['re:.*q_proj',]
+
 """
-recipe = """
-quant_stage:
-    quant_modifiers:
-        QuantizationModifier:
-            config_groups:
-                fp8_attention_q_proj:
-                    output_activations:
-                        num_bits: 8
-                        type: float
-                        strategy: group
-                        group_size: 512
-                        dynamic: false
-                        symmetric: true
-                    targets: ['re:.*q_proj']
-                fp8_attention_kv_proj:
-                    output_activations:
-                        num_bits: 8
-                        type: float
-                        strategy: group
-                        group_size: 128
-                        dynamic: false
-                        symmetric: true
-                    targets: ['re:.*k_proj', 're:.*v_proj']
+# recipe = """
+# quant_stage:
+#     quant_modifiers:
+#         QuantizationModifier:
+#             config_groups:
+#                 fp8_attention_q_proj:
+#                     output_activations:
+#                         num_bits: 8
+#                         type: float
+#                         strategy: channel
+#                         # group_size: 512
+#                         dynamic: false
+#                         symmetric: true
+#                     targets: ['re:.*q_proj']
+#                 # fp8_attention_kv_proj:
+#                 #     output_activations:
+#                 #         num_bits: 8
+#                 #         type: float
+#                 #         strategy: group
+#                 #         group_size: 128
+#                 #         dynamic: false
+#                 #         symmetric: true
+#                 #     targets: ['re:.*k_proj', 're:.*v_proj']

-"""
+# """

 # Apply algorithms.
 oneshot(
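For orientation, below is a minimal sketch of how a channel-strategy recipe like the one kept above is typically fed to llm-compressor's oneshot entrypoint. Only the recipe structure mirrors this commit; the import path, model id, dataset name, and calibration arguments are illustrative assumptions, not part of the diff.

# Hypothetical usage sketch; import path and arguments may differ by llm-compressor version.
from llmcompressor.transformers import oneshot  # assumed import path

recipe = """
quant_stage:
    quant_modifiers:
        QuantizationModifier:
            config_groups:
                fp8_attention_q_proj:
                    output_activations:
                        num_bits: 8
                        type: float
                        strategy: channel
                        dynamic: false
                        symmetric: true
                    targets: ['re:.*q_proj',]
"""

oneshot(
    model="meta-llama/Meta-Llama-3-8B-Instruct",  # assumed model id
    dataset="open_platypus",                      # assumed calibration dataset
    recipe=recipe,
    max_seq_length=2048,                          # assumed calibration settings
    num_calibration_samples=512,
)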

src/llmcompressor/modifiers/quantization/calibration.py

Lines changed: 3 additions & 1 deletion

@@ -81,7 +81,9 @@ def call_observer(module: Module, base_name: str, value: Optional[torch.Tensor]
         raise ValueError("Must provide a value to observe if not using weight observer")

     observer = getattr(module, f"{base_name}_observer")
-    updated_scale, updated_zero_point = observer(value, g_idx=g_idx)
+    updated_scale, updated_zero_point = observer(
+        value, g_idx=g_idx, base_name=base_name
+    )

     # update scale and zero point
     update_parameter_data(module, updated_scale, f"{base_name}_scale")
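A toy sketch of the calling convention this change relies on: the calibration hook resolves "<base_name>_observer" on the module and now also forwards base_name (typically "weight", "input", or "output") so the observer can tell outputs apart from weights and inputs. The stub observer below is an illustration, not the library's real observer classes.

# Toy illustration of the updated call path; StubObserver stands in for the
# library's observers and only demonstrates the new keyword.
from typing import Optional, Tuple

import torch
from torch import Tensor
from torch.nn import Module


class StubObserver(Module):
    def forward(
        self,
        observed: Tensor,
        g_idx: Optional[Tensor] = None,
        base_name: Optional[str] = None,
    ) -> Tuple[Tensor, Tensor]:
        # outputs are [batch, seq, hidden] -> reduce to the last dim;
        # weights are [out_dim, in_dim] -> reduce to dim 0
        dim = observed.ndim - 1 if base_name == "output" else 0
        reduce_dims = tuple(d for d in range(observed.ndim) if d != dim)
        scale = observed.abs().amax(dim=reduce_dims)
        return scale, torch.zeros_like(scale, dtype=torch.int8)  # symmetric


def toy_call_observer(module: Module, base_name: str, value: Tensor):
    # mirrors the diff: look up "<base_name>_observer" and forward base_name
    observer = getattr(module, f"{base_name}_observer")
    return observer(value, g_idx=None, base_name=base_name)


attn = Module()
attn.output_observer = StubObserver()
scale, zero_point = toy_call_observer(attn, "output", torch.randn(1, 2048, 1024))
print(scale.shape)  # torch.Size([1024]) -- one scale per output channel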

src/llmcompressor/observers/base.py

Lines changed: 25 additions & 22 deletions

@@ -31,7 +31,10 @@ def __init__(self, quantization_args: QuantizationArgs):

     @torch.no_grad()
     def forward(
-        self, observed: Tensor, g_idx: Optional[Tensor] = None
+        self,
+        observed: Tensor,
+        g_idx: Optional[Tensor] = None,
+        base_name: Optional[str] = None,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         maps directly to get_qparams
@@ -40,8 +43,9 @@ def forward(
         :param g_idx: optional mapping from column index to group index
         :return: tuple of scale and zero point based on last observed value
         """
+        # breakpoint()
         self.record_observed_tokens(observed)
-        return self.get_qparams(observed=observed, g_idx=g_idx)
+        return self.get_qparams(observed=observed, g_idx=g_idx, base_name=base_name)

     def calculate_qparams(
         self,
@@ -66,6 +70,7 @@ def get_qparams(
         self,
         observed: Optional[Tensor] = None,
         g_idx: Optional[Tensor] = None,
+        base_name: Optional[str] = None,
     ) -> Tuple[FloatTensor, IntTensor]:
         """
         Convenience function to wrap overwritten calculate_qparams
@@ -123,26 +128,24 @@ def get_qparams(
                     self._zero_point[:, group_index] = zero_point.squeeze(1)

             elif self.quantization_args.strategy == QuantizationStrategy.CHANNEL:
-                # assume observed is transposed, because its the output, hence use dim 0
-                # we pass in [1, 8, 2048, 128] for k_states
-                # normally per channel: (output_dim, 1) and you have as many scales as the output_dim
-                # we want 8 - num_k_head_scales? or
-                #breakpoint()
-
-                # weight --> get scales along the first dimension (output dim is first dim)
-                # weight shape (output_dim, input_dim)
-                # self._scale, self._zero_point = self.get_qparams_along_dim(observed, 0)
-                # output when applied to the weight: (output_dim, 1)
-
-
-                # for outputs:
-                self._scale, self._zero_point = self.get_qparams_along_dim(observed, 2)
-                self._scale = self._scale.squeeze(1)
-                self._zero_point = self._zero_point.squeeze(1)
-                # why is the output of self._scale: [1, 1, 1]
-
-
-
+                if base_name == "output":
+                    # the last dimension is the hidden dimension
+                    # shape of [1, 1, num_key_value_heads * head_dim]
+                    scale, zero_point = self.get_qparams_along_dim(
+                        observed, observed.ndim - 1
+                    )
+                    self._scale = (
+                        scale.squeeze()
+                    )  # shape of [num_key_value_heads * head_dim]
+                    self._zero_point = (
+                        zero_point.squeeze()
+                    )  # shape of [num_key_value_heads * head_dim]
+                else:
+                    # weight or input
+                    # assume observed is transposed, because its the output, hence use dim 0
+                    self._scale, self._zero_point = self.get_qparams_along_dim(
+                        observed, 0
+                    )

             elif self.quantization_args.strategy == QuantizationStrategy.TOKEN:
                 # use dim 1, assume the obsersed.shape = [batch, token, hidden]
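A short worked example of the shape logic in the new output branch: for a k_proj output of shape [batch, seq_len, num_key_value_heads * head_dim], reducing over every axis except the last yields one scale per hidden channel. This is a min/max sketch under an assumed symmetric FP8 (e4m3) scheme; the library's actual qparam math lives in calculate_qparams and may differ.

# Worked shape example -- assumes symmetric FP8 (e4m3) scales; mirrors only the
# dimension choice above, not the library's exact calculate_qparams logic.
import torch


def channel_scales_for_output(observed: torch.Tensor) -> torch.Tensor:
    # observed: output activations, e.g. [batch, seq_len, num_key_value_heads * head_dim]
    reduce_dims = tuple(range(observed.ndim - 1))   # every dim except the last
    max_abs = observed.abs().amax(dim=reduce_dims)  # shape: [hidden_dim]
    return max_abs / torch.finfo(torch.float8_e4m3fn).max  # one scale per channel


k_out = torch.randn(1, 2048, 8 * 128)  # 8 kv heads * head_dim 128
print(channel_scales_for_output(k_out).shape)  # torch.Size([1024])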
