
Commit 6c26ad5

Merge remote-tracking branch 'origin' into kylesayrs/shared-pipelines

2 parents 673fe04 + ffa570c

File tree

10 files changed: +213 -128 lines changed

.github/workflows/test-check-transformers.yaml

Lines changed: 5 additions & 0 deletions
@@ -8,6 +8,7 @@ on:
 
 env:
   CADENCE: "commit"
+  HF_TOKEN: ${{ secrets.HF_TOKEN_READ }}
 
 jobs:
   detect-changes:
@@ -95,6 +96,10 @@ jobs:
         if: (success() || failure()) && steps.install.outcome == 'success'
         run: |
           pytest -v tests/llmcompressor/transformers/obcq
+      - name: Running Tracing Tests
+        if: (success() || failure()) && steps.install.outcome == 'success'
+        run: |
+          pytest -v tests/llmcompressor/transformers/tracing
       - name: Running KV Cache Tests
         if: (success() || failure()) && steps.install.outcome == 'success'
         run: |
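
The added HF_TOKEN environment variable is what lets the new tracing test step download gated Hugging Face models in CI. As a rough sketch (not part of this commit, and assuming the tests rely on the standard huggingface_hub behaviour of reading HF_TOKEN from the environment), the token is consumed roughly like this:

import os

from huggingface_hub import login

# Sketch only: huggingface_hub also picks up HF_TOKEN automatically; an
# explicit login is shown here just to make the authentication flow visible.
token = os.environ.get("HF_TOKEN")
if token:
    login(token=token)  # enables gated model downloads during the tracing tests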

src/llmcompressor/modifiers/awq/base.py

Lines changed: 27 additions & 23 deletions
@@ -2,7 +2,11 @@
 from typing import Any, Dict, List, Optional, Union
 
 import torch
-from compressed_tensors.utils import align_module_device, update_offload_parameter
+from compressed_tensors.utils import (
+    align_module_device,
+    get_execution_device,
+    update_offload_parameter,
+)
 from loguru import logger
 from pydantic import ConfigDict
 from torch.nn import Module
@@ -11,7 +15,6 @@
 from llmcompressor.core import State
 from llmcompressor.modifiers import Modifier
 from llmcompressor.modifiers.utils.pytorch_helpers import run_calibration_forward
-from llmcompressor.pytorch.utils import tensor_forward_with_input_args
 from llmcompressor.utils.fsdp.helpers import get_fsdp_parent
 from llmcompressor.utils.helpers import calibration_forward_context
 from llmcompressor.utils.pytorch.module import (
@@ -217,7 +220,7 @@ def _set_resolved_mappings(self, model: Module) -> None:
         self._resolved_mappings = resolved_mappings
         return
 
-    def _setup_scale_hooks(self):
+    def _setup_scale_hooks(self) -> None:
         """
         Attach a forward hook to each activation we want to smooth. This allows us to
         calculate the dynamic range during calibration
@@ -243,7 +246,7 @@ def hook_fn(module, inp, out):
             self.register_hook(layer, create_hook_fn(name), "forward")
 
     @torch.no_grad()
-    def _calibrate(self, model: Module, calibration_dataloader: List):
+    def _calibrate(self, model: Module, calibration_dataloader: List) -> None:
         """
         Catch the output dynamic ranges of each layer that will be smoothed by running
         forward passes with calibration_dataloader
@@ -264,7 +267,7 @@ def _calibrate(self, model: Module, calibration_dataloader: List):
             calibration_dataloader,
         )
 
-    def _concat_collected_activations(self):
+    def _concat_collected_activations(self) -> None:
         """
         Concatenate the collected activation values from each forward pass into a single
         tensor for each layer
@@ -277,7 +280,7 @@ def _concat_collected_activations(self):
             self._scales[name] = torch.cat(self._scales[name], dim=0)
 
     @torch.no_grad()
-    def _apply_smoothing(self, model: Module):
+    def _apply_smoothing(self, model: Module) -> None:
         """
         Calculate the best scaling factors for each layer to smooth activations and
         apply the scaling factors to the weights of the next layer to offset the
@@ -484,7 +487,7 @@ def _compute_loss(
         fp16_output: torch.Tensor,
         int_w_output: torch.Tensor,
         device: torch.device,
-    ):
+    ) -> torch.Tensor:
         loss = 0.0
         fp16_output_flat = fp16_output.view(-1)
         int_w_output_flat = int_w_output.view(-1)
@@ -579,7 +582,7 @@ def _forward_input_with_kwargs(
         module: Module,
         inputs: torch.Tensor,
         input_kwargs: Optional[Dict[str, Any]] = None,
-    ):
+    ) -> torch.Tensor:
         """
         Forward pass with input arguments
 
@@ -590,43 +593,44 @@ def _forward_input_with_kwargs(
         """
         kwargs = input_kwargs or self._module_kwargs
         kwargs = _sanitize_kwargs(kwargs, module)
-        return tensor_forward_with_input_args(
-            module=module,
-            inputs=inputs,
-            input_kwargs=kwargs,
-        )[0]
+
+        inputs = inputs.to(get_execution_device(module))
+
+        return module(inputs, **kwargs)[0]
 
 
-def _sanitize_kwargs(inputs_kwargs, module):
+def _sanitize_kwargs(input_kwargs: Dict[str, Any], module: Module) -> Dict[str, Any]:
     """
-    Remove the arguments that are not supported in the module's
-    forward pass to avoid breaking behaviour between different versions
-    of transformers.
+    Sanitize input keyword arguments to match the module's forward method signature,
+    excluding `use_cache` which is not desired to be passed into module.
 
     Args:
         inputs_kwargs (`dict`):
            The input dictionary to pass to the model layer
        module (`torch.nn.Module`):
            Target module to quantize.
    """
+
     params = inspect.signature(module.forward).parameters
-    sanitized_kwargs = {}
-    for k, v in inputs_kwargs.items():
-        if k in params and k != "use_cache":
-            sanitized_kwargs[k] = v
-    # In case forward pass has optional dependencies that don't default to None.
+
+    # Filter out any kwargs not in module.forward signature
+    sanitized_kwargs = {k: v for k, v in input_kwargs.items() if k in params}
+
+    # Edge Case: forward pass has optional dependencies that don't default to None.
     # This is the case for `LlamaAttention.forward` which has input
     # `attention_mask: Optional[torch.Tensor],` (with no `= None` default)
     # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py#L246
     for k, v in params.items():
         if (
             k not in sanitized_kwargs
-            and k != "use_cache"
             and v.default is inspect.Parameter.empty
             and str(v.annotation).startswith("typing.Optional")
         ):
             sanitized_kwargs[k] = None
 
+    # Exclude `use_cache` entirely
+    sanitized_kwargs.pop("use_cache", None)
+
     return sanitized_kwargs
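
For context on the rewritten helper, here is a minimal behavioural sketch (not part of the commit; ToyAttention is a hypothetical module and the import path is assumed from this diff): unknown kwargs are dropped, required Optional parameters with no default are filled with None, and use_cache is always excluded.

from typing import Optional

import torch
from torch import nn

from llmcompressor.modifiers.awq.base import _sanitize_kwargs  # assumed import path


class ToyAttention(nn.Module):
    # Mimics LlamaAttention's `attention_mask: Optional[torch.Tensor]` with no default
    def forward(self, hidden_states, attention_mask: Optional[torch.Tensor], use_cache=False):
        return hidden_states


kwargs = {"use_cache": True, "position_ids": torch.arange(4)}
print(_sanitize_kwargs(kwargs, ToyAttention()))
# -> {'attention_mask': None}: position_ids is not in the signature and is dropped,
#    attention_mask is a required Optional and is filled with None, use_cache is removed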

src/llmcompressor/modifiers/quantization/cache.py

Lines changed: 29 additions & 8 deletions
@@ -87,8 +87,12 @@ def update(
             v_observer_name, quantization_args=self.quantization_args
         )
 
-        self.k_observers.append(k_observer)
-        self.v_observers.append(v_observer)
+        # NOTE: User may ignore some layers in configuration,
+        # meaning len(self.k_observers) <= layer_idx-1
+        # Must account for that case by padding list so that
+        # index of lists corresponds to layer_idx
+        _pad_and_append_at_idx_(self.k_observers, layer_idx, k_observer)
+        _pad_and_append_at_idx_(self.v_observers, layer_idx, v_observer)
 
         q_key_states = self._quantize(
             key_states.contiguous(), KVCacheScaleType.KEY, layer_idx
@@ -151,12 +155,8 @@ def _quantize(self, tensor, kv_type, layer_idx):
             zps = self.v_zps
 
         scale, zp = observer(tensor)
-        if len(scales) <= layer_idx:
-            scales.append(scale)
-            zps.append(zp)
-        else:
-            scales[layer_idx] = scale
-            zps[layer_idx] = scale
+        _pad_and_append_at_idx_(scales, layer_idx, scale)
+        _pad_and_append_at_idx_(zps, layer_idx, zp)
 
         q_tensor = quantize(
             x=tensor,
@@ -185,3 +185,24 @@ def _dequantize(self, qtensor, kv_type, layer_idx):
             args=self.quantization_args,
         )
         return qdq_tensor
+
+
+# NOTE: Using _ suffix to denote l is modified in place
+def _pad_and_append_at_idx_(lst: List, idx: int, val: Any) -> list:
+    """
+    Append value val to list lst at index idx, right padding if necessary
+    Needed because user may ignore some layers in configuration, meaning
+    len(lst) <= idx-1
+
+    >>> _pad_and_append_at_idx_([0,1,2], 5, 5)
+    [0, 1, 2, None, None, 5]
+    >>> _pad_and_append_at_idx_([0,1,2], 3, 8)
+    [0, 1, 2, 8]
+    >>> _pad_and_append_at_idx_([0,1,2], 1, 5)
+    [0, 5, 2]
+    """
+    num_to_pad = idx - len(lst) + 1
+    if num_to_pad > 0:
+        lst += [None] * num_to_pad
+    lst[idx] = val
+    return lst
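
A quick illustration of why padded insertion matters here (a sketch, not part of the commit; the import path is assumed from this diff): when early layers are ignored by the quantization config, the first observer can arrive with a non-zero layer_idx, and a plain append would store it at the wrong index.

from llmcompressor.modifiers.quantization.cache import _pad_and_append_at_idx_  # assumed path

observers = []
observers.append("k_observer_for_layer_2")   # plain append lands at index 0,
print(observers)                             # misaligned with layer_idx == 2

padded = []
_pad_and_append_at_idx_(padded, 2, "k_observer_for_layer_2")
print(padded)                                # [None, None, 'k_observer_for_layer_2']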

src/llmcompressor/pytorch/utils/helpers.py

Lines changed: 0 additions & 41 deletions
@@ -2,8 +2,6 @@
 Utility / helper functions
 """
 
-import functools
-import inspect
 import random
 from typing import Any, Dict, Iterable, List, Mapping, OrderedDict, Tuple, Union
 
@@ -26,8 +24,6 @@
     "tensors_to_precision",
     "tensors_module_forward",
     "tensor_sparsity",
-    "tensor_forward_with_input_args",
-    "sanitize_kwargs_for_module",
     "get_linear_layers",
     "get_quantized_layers",
     "set_deterministic_seeds",
@@ -204,43 +200,6 @@ def tensor_sparsity(
     return zeros.float() / float(total)
 
 
-def sanitize_kwargs_for_module(
-    kwargs: Dict[str, Any], module: Module
-) -> Dict[str, Any]:
-    """
-    Sanitize the kwargs for a Module by removing any keys that are not
-    in the signature of the forward method.
-    :param kwargs: the kwargs to sanitize
-    :param module: the Module to sanitize the kwargs for
-    :return: the sanitized kwargs for the callable object
-    """
-    if not isinstance(kwargs, dict):
-        raise TypeError(f"Expected a dictionary as kwargs, but got {kwargs}")
-
-    allowed_params = inspect.signature(module.forward).parameters
-    return {key: value for key, value in kwargs.items() if key in allowed_params}
-
-
-def tensor_forward_with_input_args(
-    module: Module, inputs: Tensor, input_kwargs: Dict[str, Any]
-) -> Tensor:
-    """
-    Forward the given inputs through the given module with the given input_kwargs.
-    This function is a wrapper around tensors_module_forward that ensures that the
-    input_kwargs are sanitized and passed to the module as keyword arguments during
-    the forward pass.
-    :param module: the module to forward the inputs through
-    :param inputs: the inputs to forward through the module
-    :param input_kwargs: the keyword arguments to pass to the
-        module during the forward pass
-    :return: the output of the module after forwarding the inputs through it
-    """
-    inputs = inputs.to(next(module.parameters()).device)
-    input_kwargs = sanitize_kwargs_for_module(input_kwargs, module)
-
-    return tensors_module_forward(inputs, functools.partial(module, **input_kwargs))
-
-
 ##############################
 #
 # pytorch module helper functions

src/llmcompressor/transformers/finetune/data/peoples_speech.py

Lines changed: 7 additions & 7 deletions
@@ -26,20 +26,20 @@ class PeoplesSpeech(TextGenerationDataset):
     :param processor: processor or tokenizer to use on dataset
     """
 
-    def __init__(self, data_args: "DataArgs", split: str, processor: Processor):
-        data_args = deepcopy(data_args)
-        data_args.dataset = "MLCommons/peoples_speech"
-        data_args.dataset_config_name = "test"
-        if not data_args.overwrite_cache:
+    def __init__(self, dataset_args: "DataArgs", split: str, processor: Processor):
+        dataset_args = deepcopy(dataset_args)
+        dataset_args.dataset = "MLCommons/peoples_speech"
+        dataset_args.dataset_config_name = "test"
+        if not dataset_args.overwrite_cache:
             logger.warning(
                 "Because audio processors are more complex, dataset mapping functions "
                 "vary with model architecture and their results cannot be cached. "
                 "Setting overwrite_cache=True"
             )
-            data_args.overwrite_cache = True
+            dataset_args.overwrite_cache = True
         self.processor_type = processor.__class__.__name__
 
-        super().__init__(data_args=data_args, split=split, processor=processor)
+        super().__init__(dataset_args=dataset_args, split=split, processor=processor)
 
     def dataset_template(self, example):
         audio = example["audio"]["array"]

src/llmcompressor/transformers/tracing/debug.py

Lines changed: 15 additions & 5 deletions
@@ -12,6 +12,8 @@
 from llmcompressor.transformers import TextGenerationDataset
 from llmcompressor.args import DatasetArguments
 
+from llmcompressor.utils.dev import skip_weights_download
+
 __all__ = [
     "get_model_class"
 ]
@@ -24,6 +26,7 @@ def parse_args():
     parser.add_argument("--sequential_targets", type=str, nargs="*", default=None, metavar="TARGET", help="List of targets for sequential tracing")  # noqa: E501
     parser.add_argument("--ignore", type=str, nargs="*", default=[], metavar="PATTERN", help="List of patterns to ignore during tracing")  # noqa: E501
     parser.add_argument("--modality", type=str, default="text", help="Modality of calibration dataset, defaults to text")  # noqa: E501
+    parser.add_argument("--trust_remote_code", type=bool, default=False, help="Whether to trust model remote code")  # noqa: E501
     return parser.parse_args()
 
 
@@ -33,6 +36,7 @@ def trace(
     sequential_targets: Optional[Union[List[str], str]] = None,
     ignore: Union[List[str], str] = [],
     modality: str = "text",
+    trust_remote_code: bool = True
 ):
     """
     Debug traceability by tracing a pre-trained model into subgraphs
@@ -44,6 +48,7 @@ def trace(
         inference
     :param ignore: patterns to ignore during tracing
     :param modality: data modality for dummy tracing data, defaults to 'text'
+    :param trust_remote_code: trust remote model code
 
     Example usage from CLI
     llmcompressor.trace \
@@ -54,12 +59,16 @@ def trace(
         --modality text
     """
     # Load model
-    model = model_class.from_pretrained(
-        model_id,
-        device_map="auto",
-        torch_dtype="auto",
+    with skip_weights_download(model_class):
+        model = model_class.from_pretrained(
+            model_id,
+            device_map="cpu",
+            torch_dtype="auto",
+            trust_remote_code=trust_remote_code,
+        )
+    processor = AutoProcessor.from_pretrained(
+        model_id, trust_remote_code=trust_remote_code
     )
-    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
     print("Loaded model")
 
     # Prepare sample data
@@ -138,6 +147,7 @@ def main():
         sequential_targets=args.sequential_targets,
         ignore=args.ignore,
         modality=args.modality,
+        trust_remote_code=args.trust_remote_code
     )
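
For reference, a minimal usage sketch of the updated entrypoint (not part of the commit; the model id and sequential target are placeholders, and the remaining trace() parameters are assumed from the hunks above):

from transformers import AutoModelForCausalLM

from llmcompressor.transformers.tracing.debug import trace

trace(
    model_id="meta-llama/Meta-Llama-3-8B-Instruct",  # placeholder model id
    model_class=AutoModelForCausalLM,
    sequential_targets=["LlamaDecoderLayer"],        # placeholder target
    ignore=["lm_head"],
    modality="text",
    trust_remote_code=False,
)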

src/llmcompressor/transformers/tracing/idefics3.py

Lines changed: 3 additions & 1 deletion
@@ -285,7 +285,7 @@ def __init__(self, config: Idefics3Config):
 
     def forward(
         self,
-        input_ids: torch.LongTensor = None,
+        input_ids: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
         past_key_values: Optional[List[torch.FloatTensor]] = None,
@@ -296,6 +296,7 @@ def forward(
         use_cache: Optional[bool] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, Idefics3BaseModelOutputWithPast]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
@@ -394,6 +395,7 @@ def forward(
             use_cache=use_cache,
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
             return_dict=return_dict,
         )