@@ -2,7 +2,11 @@
 from typing import Any, Dict, List, Optional, Union
 
 import torch
-from compressed_tensors.utils import align_module_device, update_offload_parameter
+from compressed_tensors.utils import (
+    align_module_device,
+    get_execution_device,
+    update_offload_parameter,
+)
 from loguru import logger
 from pydantic import ConfigDict
 from torch.nn import Module
@@ -11,7 +15,6 @@
 from llmcompressor.core import State
 from llmcompressor.modifiers import Modifier
 from llmcompressor.modifiers.utils.pytorch_helpers import run_calibration_forward
-from llmcompressor.pytorch.utils import tensor_forward_with_input_args
 from llmcompressor.utils.fsdp.helpers import get_fsdp_parent
 from llmcompressor.utils.helpers import calibration_forward_context
 from llmcompressor.utils.pytorch.module import (
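Note on the import change: the removed helper `tensor_forward_with_input_args` is replaced downstream by moving inputs onto the module's execution device and calling `forward` directly (see the `_forward_input_with_kwargs` hunk below). A minimal sketch of that pattern, using a toy module and a simplified stand-in for `get_execution_device` (the real compressed_tensors utility also resolves devices for accelerate-offloaded modules):

```python
import torch
from torch.nn import Linear


def get_execution_device(module: torch.nn.Module) -> torch.device:
    # Simplified stand-in: the compressed_tensors utility additionally
    # handles modules whose weights are offloaded via accelerate hooks.
    return next(module.parameters()).device


module = Linear(8, 8)
inputs = torch.randn(2, 8)

# The pattern this commit adopts: align the inputs with the module's
# execution device, then call the module directly.
inputs = inputs.to(get_execution_device(module))
output = module(inputs)
```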
@@ -217,7 +220,7 @@ def _set_resolved_mappings(self, model: Module) -> None:
         self._resolved_mappings = resolved_mappings
         return
 
-    def _setup_scale_hooks(self):
+    def _setup_scale_hooks(self) -> None:
         """
         Attach a forward hook to each activation we want to smooth. This allows us to
         calculate the dynamic range during calibration
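For context on what `_setup_scale_hooks` wires up: one forward hook per target activation, recording its dynamic range on every calibration pass. A standalone sketch of that mechanism, using max-abs per output channel as a stand-in for the modifier's actual statistic:

```python
import torch

collected = {}


def create_hook_fn(name):
    def hook_fn(module, inp, out):
        # Record a per-channel dynamic-range statistic for this forward pass
        hidden = out[0] if isinstance(out, tuple) else out
        collected.setdefault(name, []).append(hidden.abs().amax(dim=0))

    return hook_fn


layer = torch.nn.Linear(4, 4)
handle = layer.register_forward_hook(create_hook_fn("layer0"))
layer(torch.randn(2, 4))
handle.remove()
```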
@@ -243,7 +246,7 @@ def hook_fn(module, inp, out):
             self.register_hook(layer, create_hook_fn(name), "forward")
 
     @torch.no_grad()
-    def _calibrate(self, model: Module, calibration_dataloader: List):
+    def _calibrate(self, model: Module, calibration_dataloader: List) -> None:
         """
         Catch the output dynamic ranges of each layer that will be smoothed by running
         forward passes with calibration_dataloader
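`_calibrate` itself delegates to `run_calibration_forward`; schematically, calibration is just no-grad forward passes that let the hooks above collect statistics as a side effect. A toy version:

```python
import torch


@torch.no_grad()
def calibrate(model: torch.nn.Module, dataloader) -> None:
    # Forward passes only; the registered hooks record activation
    # ranges as a side effect, and the outputs are discarded.
    model.eval()
    for batch in dataloader:
        model(batch)
```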
@@ -264,7 +267,7 @@ def _calibrate(self, model: Module, calibration_dataloader: List):
             calibration_dataloader,
         )
 
-    def _concat_collected_activations(self):
+    def _concat_collected_activations(self) -> None:
         """
         Concatenate the collected activation values from each forward pass into a single
         tensor for each layer
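The concatenation step is simple in isolation: each hook appends one tensor per forward pass, and afterwards each per-layer list collapses into a single tensor. Illustratively:

```python
import torch

# Two calibration batches worth of per-channel statistics for one layer
scales = {"layer0": [torch.randn(2, 8), torch.randn(2, 8)]}

for name in scales:
    # One tensor per layer, stacked along the batch dimension
    scales[name] = torch.cat(scales[name], dim=0)  # shape (4, 8)
```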
@@ -277,7 +280,7 @@ def _concat_collected_activations(self):
             self._scales[name] = torch.cat(self._scales[name], dim=0)
 
     @torch.no_grad()
-    def _apply_smoothing(self, model: Module):
+    def _apply_smoothing(self, model: Module) -> None:
         """
         Calculate the best scaling factors for each layer to smooth activations and
         apply the scaling factors to the weights of the next layer to offset the
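The core idea behind `_apply_smoothing`, independent of how the best factors are searched for: per-channel scales are folded out of one layer's output and into the next layer's weights, leaving the composed function numerically unchanged while flattening activation outliers. A sketch of that folding for a pair of linear layers (the scale search itself is outside this hunk):

```python
import torch


@torch.no_grad()
def smooth_pair(prev: torch.nn.Linear, nxt: torch.nn.Linear, scales: torch.Tensor) -> None:
    # Divide prev's output channels by the scales and multiply nxt's
    # input channels by the same scales: nxt(prev(x)) is unchanged,
    # but the activations flowing between the two layers shrink.
    prev.weight.div_(scales.view(-1, 1))
    if prev.bias is not None:
        prev.bias.div_(scales)
    nxt.weight.mul_(scales.view(1, -1))


prev, nxt = torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)
x = torch.randn(2, 8)
before = nxt(prev(x))
smooth_pair(prev, nxt, torch.rand(8) + 0.5)
after = nxt(prev(x))
assert torch.allclose(before, after, atol=1e-5)
```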
@@ -484,7 +487,7 @@ def _compute_loss(
         fp16_output: torch.Tensor,
         int_w_output: torch.Tensor,
         device: torch.device,
-    ):
+    ) -> torch.Tensor:
         loss = 0.0
         fp16_output_flat = fp16_output.view(-1)
         int_w_output_flat = int_w_output.view(-1)
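`_compute_loss` compares the module output at original precision against the output produced with quantized weights; the flattened views above support accumulating the squared error in chunks on `device`. Stripped to its core, the quantity is a mean squared error, as in this sketch:

```python
import torch


def compute_loss(fp16_output: torch.Tensor, int_w_output: torch.Tensor) -> torch.Tensor:
    # Mean squared error between the full-precision output and the
    # quantized-weight output; lower means a better smoothing scale.
    diff = fp16_output.view(-1) - int_w_output.view(-1)
    return (diff.float() ** 2).mean()
```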
@@ -579,7 +582,7 @@ def _forward_input_with_kwargs(
         module: Module,
         inputs: torch.Tensor,
         input_kwargs: Optional[Dict[str, Any]] = None,
-    ):
+    ) -> torch.Tensor:
         """
         Forward pass with input arguments
 
@@ -590,43 +593,44 @@ def _forward_input_with_kwargs(
         """
         kwargs = input_kwargs or self._module_kwargs
         kwargs = _sanitize_kwargs(kwargs, module)
-        return tensor_forward_with_input_args(
-            module=module,
-            inputs=inputs,
-            input_kwargs=kwargs,
-        )[0]
+
+        inputs = inputs.to(get_execution_device(module))
+
+        return module(inputs, **kwargs)[0]
 
 
-def _sanitize_kwargs(inputs_kwargs, module):
+def _sanitize_kwargs(input_kwargs: Dict[str, Any], module: Module) -> Dict[str, Any]:
     """
-    Remove the arguments that are not supported in the module's
-    forward pass to avoid breaking behaviour between different versions
-    of transformers.
+    Sanitize input keyword arguments to match the module's forward method signature,
+    excluding `use_cache`, which is not desired to be passed into the module.
 
     Args:
         inputs_kwargs (`dict`):
             The input dictionary to pass to the model layer
         module (`torch.nn.Module`):
             Target module to quantize.
     """
+
     params = inspect.signature(module.forward).parameters
-    sanitized_kwargs = {}
-    for k, v in inputs_kwargs.items():
-        if k in params and k != "use_cache":
-            sanitized_kwargs[k] = v
-    # In case forward pass has optional dependencies that don't default to None.
+
+    # Filter out any kwargs not in module.forward signature
+    sanitized_kwargs = {k: v for k, v in input_kwargs.items() if k in params}
+
+    # Edge Case: forward pass has optional dependencies that don't default to None.
     # This is the case for `LlamaAttention.forward` which has input
     # `attention_mask: Optional[torch.Tensor],` (with no `= None` default)
     # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L246
     for k, v in params.items():
         if (
             k not in sanitized_kwargs
-            and k != "use_cache"
             and v.default is inspect.Parameter.empty
             and str(v.annotation).startswith("typing.Optional")
         ):
             sanitized_kwargs[k] = None
 
+    # Exclude `use_cache` entirely
+    sanitized_kwargs.pop("use_cache", None)
+
     return sanitized_kwargs
 