 
 import pytest
 import torch
 
-from transformers import PretrainedConfig, PreTrainedModel
+from transformers import (
+    AutoModelForCausalLM,
+    MllamaForConditionalGeneration,
+    PretrainedConfig,
+    PreTrainedModel,
+)
 
 from llmcompressor.utils import (
     ALL_TOKEN,
     DisableQuantization,
     calibration_forward_context,
     convert_to_bool,
+    disable_cache,
     flatten_iterable,
     getattr_chain,
     interpolate,
     patch_attr,
     validate_str_iterable,
 )
+from llmcompressor.utils.dev import skip_weights_download
+from tests.testing_utils import requires_gpu
 
 
 @pytest.mark.unit
@@ -173,3 +181,25 @@ def test_patch_attr():
     assert obj.attribute == "patched"
     obj.attribute = "modified"
     assert not hasattr(obj, "attribute")
+
+
+@requires_gpu
+@pytest.mark.unit
+@pytest.mark.parametrize(
+    "model_cls,model_stub",
+    [
+        (MllamaForConditionalGeneration, "meta-llama/Llama-3.2-11B-Vision-Instruct"),
+        (AutoModelForCausalLM, "nm-testing/llama2.c-stories15M"),
+    ],
+)
+def test_disable_cache(model_cls, model_stub):
+    with skip_weights_download(model_cls):
+        model = model_cls.from_pretrained(model_stub, device_map="cuda")
+    inputs = {key: value.to(model.device) for key, value in model.dummy_inputs.items()}
+
+    with disable_cache(model):
+        output = model(**inputs)
+        assert output.past_key_values is None
+
+    output = model(**inputs)
+    assert output.past_key_values is not None
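
For context, the new test drives `llmcompressor.utils.disable_cache` together with `skip_weights_download`. A minimal sketch of what such a context manager could look like, assuming it simply toggles `use_cache` on the model config for the duration of the block (the actual llmcompressor helper may differ, e.g. in how it handles multimodal configs):

```python
# Hedged sketch only -- not the actual llmcompressor implementation.
# Assumption: disabling the KV cache amounts to forcing `use_cache=False`
# on the model config, so forward() returns past_key_values=None.
from contextlib import contextmanager

from transformers import PreTrainedModel


@contextmanager
def disable_cache_sketch(model: PreTrainedModel):
    # Multimodal wrappers often keep `use_cache` on a nested text config.
    config = getattr(model.config, "text_config", model.config)
    original = getattr(config, "use_cache", True)
    config.use_cache = False
    try:
        yield
    finally:
        # Restore the previous setting even if the forward pass raised.
        config.use_cache = original
```

With a sketch like this, `model(**inputs)` inside the block returns `past_key_values is None`, and a call after the block returns a populated cache again, matching the assertions in `test_disable_cache`.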