
Commit 2c608d1

update
1 parent b69d099 commit 2c608d1

File tree: 4 files changed (+25, -39 lines)

tests/quantization/bnb/test_4bit.py

Lines changed: 5 additions & 4 deletions
@@ -45,7 +45,6 @@
     require_peft_backend,
     require_torch,
     require_torch_accelerator,
-    require_torch_version_greater,
     require_transformers_version_greater,
     slow,
     torch_device,
@@ -861,7 +860,7 @@ def test_fp4_double_safe(self):
         self.test_serialization(quant_type="fp4", double_quant=True, safe_serialization=True)


-@require_torch_version_greater("2.7.1")
+# @require_torch_version_greater("2.7.1")
 class Bnb4BitCompileTests(QuantCompileTests):
     quantization_config = PipelineQuantizationConfig(
         quant_backend="bitsandbytes_8bit",
@@ -880,5 +879,7 @@ def test_torch_compile(self):
     def test_torch_compile_with_cpu_offload(self):
         super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)

-    def test_torch_compile_with_group_offload(self):
-        super()._test_torch_compile_with_group_offload_leaf_stream(quantization_config=self.quantization_config)
+    def test_torch_compile_with_group_offload_leaf(self):
+        super()._test_torch_compile_with_group_offload_leaf(
+            quantization_config=self.quantization_config, use_stream=True
+        )
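This hunk drops the require_torch_version_greater import and comments out the version gate on Bnb4BitCompileTests. For readers unfamiliar with that gate, below is a minimal sketch of how such a decorator is commonly written with unittest.skipUnless; it is only an illustration, not the actual helper from diffusers.utils.testing_utils.

    import unittest

    import torch
    from packaging import version


    def require_torch_version_greater(required_version: str):
        # Skip the decorated test unless the installed torch is strictly newer
        # than required_version (illustrative sketch, not the real diffusers helper).
        return unittest.skipUnless(
            version.parse(torch.__version__) > version.parse(required_version),
            f"test requires torch version greater than {required_version}",
        )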

tests/quantization/bnb/test_mixed_int8.py

Lines changed: 3 additions & 3 deletions
@@ -844,7 +844,7 @@ def test_torch_compile_with_cpu_offload(self):
         )

     @pytest.mark.xfail(reason="Test fails because of an offloading problem from Accelerate with confusion in hooks.")
-    def test_torch_compile_with_group_offload(self):
-        super()._test_torch_compile_with_group_offload_leaf_stream(
-            quantization_config=self.quantization_config, torch_dtype=torch.float16
+    def test_torch_compile_with_group_offload_leaf(self):
+        super()._test_torch_compile_with_group_offload_leaf(
+            quantization_config=self.quantization_config, torch_dtype=torch.float16, use_stream=True
         )
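The pytest.mark.xfail marker kept above records the group-offload failure as an expected failure instead of breaking the suite. A minimal, self-contained sketch of that behaviour follows; the test name and exception are illustrative, not part of this commit.

    import pytest


    @pytest.mark.xfail(reason="Known offloading hook confusion in Accelerate.")
    def test_group_offload_known_issue():
        # The raised error is reported as "xfail"; if the underlying bug is fixed
        # and this passes, pytest reports it as "xpass".
        raise RuntimeError("hook confusion during offload")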

tests/quantization/test_torch_compile_utils.py

Lines changed: 4 additions & 23 deletions
@@ -64,7 +64,9 @@ def _test_torch_compile_with_cpu_offload(self, quantization_config, torch_dtype=
         # small resolutions to ensure speedy execution.
         pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)

-    def _test_torch_compile_with_group_offload_leaf(self, quantization_config, torch_dtype=torch.bfloat16):
+    def _test_torch_compile_with_group_offload_leaf(
+        self, quantization_config, torch_dtype=torch.bfloat16, *, use_stream: bool = False
+    ):
         torch._dynamo.config.cache_size_limit = 10000

         pipe = self._init_pipeline(quantization_config, torch_dtype)
@@ -73,28 +75,7 @@ def _test_torch_compile_with_group_offload_leaf(self, quantization_config, torch
             "offload_device": torch.device("cpu"),
             "offload_type": "leaf_level",
             "num_blocks_per_group": 1,
-            "use_stream": False,
-        }
-        pipe.transformer.enable_group_offload(**group_offload_kwargs)
-        pipe.transformer.compile()
-        for name, component in pipe.components.items():
-            if name != "transformer" and isinstance(component, torch.nn.Module):
-                if torch.device(component.device).type == "cpu":
-                    component.to("cuda")
-
-        for _ in range(2):
-            # small resolutions to ensure speedy execution.
-            pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)
-
-    def _test_torch_compile_with_group_offload_leaf_stream(self, quantization_config, torch_dtype=torch.bfloat16):
-        torch._dynamo.config.cache_size_limit = 10000
-
-        pipe = self._init_pipeline(quantization_config, torch_dtype)
-        group_offload_kwargs = {
-            "onload_device": torch.device("cuda"),
-            "offload_device": torch.device("cpu"),
-            "offload_type": "leaf_level",
-            "use_stream": True,
+            "use_stream": use_stream,
         }
         pipe.transformer.enable_group_offload(**group_offload_kwargs)
         pipe.transformer.compile()
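With the two helpers merged, a single code path now covers both offloading modes, toggled by use_stream. The sketch below shows the same flow outside the test harness; the checkpoint id is a placeholder and the real tests construct their pipeline via _init_pipeline.

    import torch
    from diffusers import DiffusionPipeline

    # Placeholder checkpoint id; the tests build their pipeline through _init_pipeline.
    pipe = DiffusionPipeline.from_pretrained("<model-id>", torch_dtype=torch.bfloat16)

    # Mirrors group_offload_kwargs from _test_torch_compile_with_group_offload_leaf.
    pipe.transformer.enable_group_offload(
        onload_device=torch.device("cuda"),
        offload_device=torch.device("cpu"),
        offload_type="leaf_level",
        num_blocks_per_group=1,
        use_stream=True,  # False exercises the former non-stream variant
    )
    pipe.transformer.compile()

    # Keep every non-transformer module on the accelerator so only the transformer is offloaded.
    for name, component in pipe.components.items():
        if name != "transformer" and isinstance(component, torch.nn.Module):
            if torch.device(component.device).type == "cpu":
                component.to("cuda")

    for _ in range(2):
        # Small resolutions keep execution fast, as in the helper.
        pipe("a dog", num_inference_steps=3, max_sequence_length=16, height=256, width=256)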

tests/quantization/torchao/test_torchao.py

Lines changed: 13 additions & 9 deletions
@@ -19,6 +19,7 @@
 from typing import List

 import numpy as np
+from parameterized import parameterized
 from transformers import AutoTokenizer, CLIPTextModel, CLIPTokenizer, T5EncoderModel

 from diffusers import (
@@ -648,10 +649,17 @@ def test_torch_compile_with_cpu_offload(self):
         super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)

     @unittest.skip(
-        "Changing the device of AQT tensor, with `param.data = param.data.to(device)` as done in group offloading implementation "
-        "is unsupported in TorchAO. When compiling, FakeTensor device mismatch causes failure."
+        """
+        For `use_stream=False`:
+        - Changing the device of AQT tensor, with `param.data = param.data.to(device)` as done in group offloading implementation
+          is unsupported in TorchAO. When compiling, FakeTensor device mismatch causes failure.
+        For `use_stream=True`:
+        Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO.
+        """
     )
+    @parameterized.expand([False, True])
     def test_torch_compile_with_group_offload_leaf(self):
+        # For use_stream=False:
         # If we run group offloading without compilation, we will see:
         # RuntimeError: Attempted to set the storage of a tensor on device "cpu" to a storage on different device "cuda:0". This is no longer allowed; the devices must match.
         # When running with compilation, the error ends up being different:
@@ -660,14 +668,10 @@ def test_torch_compile_with_group_offload_leaf(self):
         # Looks like something that will have to be looked into upstream.
         # for linear layers, weight.tensor_impl shows cuda... but:
         # weight.tensor_impl.{data,scale,zero_point}.device will be cpu
-        super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)

-    @unittest.skip(
-        "Using non-default stream requires ability to pin tensors. AQT does not seem to support this yet in TorchAO."
-    )
-    def test_torch_compile_with_group_offload_leaf_stream(self):
-        # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=<OpOverload(op='aten.is_pinned', overload='default')>, types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), arg_types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), kwarg_types={}
-        super()._test_torch_compile_with_group_offload_leaf_stream(quantization_config=self.quantization_config)
+        # For use_stream=True:
+        # # NotImplementedError: AffineQuantizedTensor dispatch: attempting to run unimplemented operator/function: func=<OpOverload(op='aten.is_pinned', overload='default')>, types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), arg_types=(<class 'torchao.dtypes.affine_quantized_tensor.AffineQuantizedTensor'>,), kwarg_types={}
+        super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)


 # Slices for these tests have been obtained on our aws-g6e-xlarge-plus runners
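The @parameterized.expand([False, True]) decorator added above generates one test case per listed value. A minimal sketch of that mechanism follows; the class and method names are illustrative, and the generated cases receive the value as an extra argument.

    import unittest

    from parameterized import parameterized


    class GroupOffloadLeafTests(unittest.TestCase):
        @parameterized.expand([False, True])
        def test_group_offload_leaf(self, use_stream):
            # parameterized.expand emits test_group_offload_leaf_0 (False) and
            # test_group_offload_leaf_1 (True), passing the value as an argument.
            self.assertIn(use_stream, (False, True))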
