Commit fb99d94

update

1 parent 8173a29

3 files changed, +10 -8 lines

src/diffusers/hooks/group_offloading.py

Lines changed: 0 additions & 4 deletions
@@ -219,7 +219,6 @@ def initialize_hook(self, module: torch.nn.Module) -> torch.nn.Module:
         return module
 
     def pre_forward(self, module: torch.nn.Module, *args, **kwargs):
-        breakpoint()
         # If there wasn't an onload_leader assigned, we assume that the submodule that first called its forward
         # method is the onload_leader of the group.
         if self.group.onload_leader is None:
@@ -286,7 +285,6 @@ def callback():
         return module
 
     def post_forward(self, module, output):
-        breakpoint()
         # At this point, for the current modules' submodules, we know the execution order of the layers. We can now
         # remove the layer execution tracker hooks and apply prefetching by setting the next_group attribute for each
         # group offloading hook.
@@ -626,9 +624,7 @@ def _apply_group_offloading_leaf_level(
     modules_with_group_offloading = set()
     for name, submodule in module.named_modules():
         if not isinstance(submodule, _SUPPORTED_PYTORCH_LAYERS):
-            print("unsupported module", name, type(submodule))
             continue
-        print("applying group offloading to", name, type(submodule))
         group = ModuleGroup(
             modules=[submodule],
             offload_device=offload_device,
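The comment kept in pre_forward above describes the hook's leader-election rule: the first submodule whose forward runs is taken as the group's onload_leader. A generic sketch of that pattern using plain PyTorch forward pre-hooks (illustrative only, not the diffusers implementation; the Group class and hook names below are made up):

import torch

# Minimal stand-in for a module group; only the onload_leader field matters here.
class Group:
    def __init__(self):
        self.onload_leader = None

def make_pre_hook(group):
    def pre_hook(module, args):
        # The first submodule to run its forward claims leadership of the group.
        if group.onload_leader is None:
            group.onload_leader = module
    return pre_hook

group = Group()
model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU())
for submodule in model:
    submodule.register_forward_pre_hook(make_pre_hook(group))

model(torch.randn(1, 4))
assert group.onload_leader is model[0]  # the Linear layer ran its forward first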

tests/quantization/test_torch_compile_utils.py

Lines changed: 1 addition & 1 deletion
@@ -76,7 +76,7 @@ def _test_torch_compile_with_group_offload_leaf(self, quantization_config, torch
             "use_stream": False,
         }
         pipe.transformer.enable_group_offload(**group_offload_kwargs)
-        # pipe.transformer.compile()
+        pipe.transformer.compile()
         for name, component in pipe.components.items():
             if name != "transformer" and isinstance(component, torch.nn.Module):
                 if torch.device(component.device).type == "cpu":
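The one-line change above re-enables torch.compile on the transformer after group offloading has been set up. A minimal usage sketch of that combination (assumptions: the model id and the device/offload_type values below are placeholders; only "use_stream": False and the enable_group_offload/compile calls are taken from this diff):

import torch
from diffusers import DiffusionPipeline

# Placeholder checkpoint; the pipeline actually used by the test is not shown in this diff.
pipe = DiffusionPipeline.from_pretrained("some/model-id", torch_dtype=torch.bfloat16)

group_offload_kwargs = {
    "onload_device": torch.device("cuda"),
    "offload_device": torch.device("cpu"),
    "offload_type": "leaf_level",
    "use_stream": False,
}
pipe.transformer.enable_group_offload(**group_offload_kwargs)
pipe.transformer.compile()  # compile the transformer in place after the offloading hooks are installed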

tests/quantization/torchao/test_torchao.py

Lines changed: 9 additions & 3 deletions
@@ -639,13 +639,19 @@ class TorchAoCompileTest(QuantCompileTests):
     def test_torch_compile(self):
         super()._test_torch_compile(quantization_config=self.quantization_config)
 
+    @unittest.skip(
+        "Changing the device of AQT tensor with module._apply (called from doing module.to() in accelerate) does not work."
+    )
     def test_torch_compile_with_cpu_offload(self):
+        # RuntimeError: _apply(): Couldn't swap Linear.weight
         super()._test_torch_compile_with_cpu_offload(quantization_config=self.quantization_config)
 
+    @unittest.skip(
+        "Changing the device of AQT tensor with .to() does not work. Needs to be discussed with TorchAO team."
+    )
     def test_torch_compile_with_group_offload_leaf(self):
-        from diffusers.utils.logging import set_verbosity_debug
-
-        set_verbosity_debug()
+        # for linear layers, weight.tensor_impl shows cuda... but:
+        # weight.tensor_impl.{data,scale,zero_point}.device will be cpu
         super()._test_torch_compile_with_group_offload_leaf(quantization_config=self.quantization_config)
 
     @unittest.skip(
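The new skip reasons and inline comments describe a device mismatch on torchao AffineQuantizedTensor (AQT) weights: the quantized weight wrapper can report cuda while its inner tensors remain on cpu. A small diagnostic sketch of that check (assumptions: the weight exposes tensor_impl with data/scale/zero_point attributes, as named in the comment above; everything is probed defensively since the exact layout can differ):

import torch

def report_aqt_devices(linear: torch.nn.Linear) -> None:
    # Compare the device reported by the quantized weight wrapper with the
    # devices of its inner tensors; a mismatch is what the skipped tests hit.
    weight = linear.weight
    print("weight.device:", weight.device)
    impl = getattr(weight, "tensor_impl", None)
    if impl is None:
        print("no tensor_impl attribute; weight does not look like an AQT weight")
        return
    for attr in ("data", "scale", "zero_point"):
        inner = getattr(impl, attr, None)
        if isinstance(inner, torch.Tensor):
            print(f"weight.tensor_impl.{attr}.device:", inner.device)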
