make models and pipeline docs device agnostic, to cover accelerators other than CUDA #12161

Open · wants to merge 4 commits into main

README.md (12 changes: 9 additions & 3 deletions)

@@ -71,26 +71,32 @@ Generating outputs is super easy with 🤗 Diffusers. To generate an image from

```python
from diffusers import DiffusionPipeline
from diffusers.utils.torch_utils import get_device
import torch

device = get_device()

pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16)
pipeline.to("cuda")
pipeline.to(device)
pipeline("An image of a squirrel in Picasso style").images[0]
```
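
The snippet above uses `get_device` from `diffusers.utils.torch_utils` to resolve an available accelerator instead of hard-coding `"cuda"`. As a rough mental model, a helper like this typically probes the supported backends in order and falls back to the CPU; the sketch below is illustrative only, not the library's actual implementation:

```python
import torch

# Illustrative sketch of a device-resolution helper; the real get_device in
# diffusers.utils.torch_utils may differ, so prefer importing it as shown above.
def resolve_device() -> str:
    if torch.cuda.is_available():  # NVIDIA or ROCm GPUs
        return "cuda"
    if hasattr(torch, "xpu") and torch.xpu.is_available():  # Intel GPUs
        return "xpu"
    if torch.backends.mps.is_available():  # Apple Silicon
        return "mps"
    return "cpu"

device = resolve_device()  # e.g. "cuda", "xpu", "mps", or "cpu"
```
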

You can also dig into the models and schedulers toolbox to build your own diffusion system:

```python
from diffusers import DDPMScheduler, UNet2DModel
from diffusers.utils.torch_utils import get_device
from PIL import Image
import torch

device = get_device()

scheduler = DDPMScheduler.from_pretrained("google/ddpm-cat-256")
model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to("cuda")
model = UNet2DModel.from_pretrained("google/ddpm-cat-256").to(device)
scheduler.set_timesteps(50)

sample_size = model.config.sample_size
noise = torch.randn((1, 3, sample_size, sample_size), device="cuda")
noise = torch.randn((1, 3, sample_size, sample_size), device=device)
input = noise

for t in scheduler.timesteps:

docs/source/en/api/models/autoencoderkl_cogvideox.md (5 changes: 4 additions & 1 deletion)

@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.

```python
from diffusers import AutoencoderKLCogVideoX
from diffusers.utils.torch_utils import get_device
import torch

vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16).to("cuda")
device = get_device()

vae = AutoencoderKLCogVideoX.from_pretrained("THUDM/CogVideoX-2b", subfolder="vae", torch_dtype=torch.float16).to(device)
```

## AutoencoderKLCogVideoX

docs/source/en/api/models/autoencoderkl_ltx_video.md (5 changes: 4 additions & 1 deletion)

@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.

```python
from diffusers import AutoencoderKLLTXVideo
from diffusers.utils.torch_utils import get_device
import torch

vae = AutoencoderKLLTXVideo.from_pretrained("Lightricks/LTX-Video", subfolder="vae", torch_dtype=torch.float32).to("cuda")
device = get_device()

vae = AutoencoderKLLTXVideo.from_pretrained("Lightricks/LTX-Video", subfolder="vae", torch_dtype=torch.float32).to(device)
```

## AutoencoderKLLTXVideo

docs/source/en/api/models/autoencoderkl_mochi.md (5 changes: 4 additions & 1 deletion)

@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.

```python
from diffusers import AutoencoderKLMochi
from diffusers.utils.torch_utils import get_device
import torch

vae = AutoencoderKLMochi.from_pretrained("genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32).to("cuda")
device = get_device()

vae = AutoencoderKLMochi.from_pretrained("genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32).to(device)
```

## AutoencoderKLMochi

docs/source/en/api/models/cogvideox_transformer3d.md (5 changes: 4 additions & 1 deletion)

@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.

```python
from diffusers import CogVideoXTransformer3DModel
from diffusers.utils.torch_utils import get_device
import torch

transformer = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.float16).to("cuda")
device = get_device()

transformer = CogVideoXTransformer3DModel.from_pretrained("THUDM/CogVideoX-2b", subfolder="transformer", torch_dtype=torch.float16).to(device)
```

## CogVideoXTransformer3DModel

docs/source/en/api/models/cogview4_transformer2d.md (5 changes: 4 additions & 1 deletion)

@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.

```python
from diffusers import CogView4Transformer2DModel
from diffusers.utils.torch_utils import get_device
import torch

transformer = CogView4Transformer2DModel.from_pretrained("THUDM/CogView4-6B", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
device = get_device()

transformer = CogView4Transformer2DModel.from_pretrained("THUDM/CogView4-6B", subfolder="transformer", torch_dtype=torch.bfloat16).to(device)
```

## CogView4Transformer2DModel

docs/source/en/api/models/consisid_transformer3d.md (5 changes: 4 additions & 1 deletion)

@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.

```python
from diffusers import ConsisIDTransformer3DModel
from diffusers.utils.torch_utils import get_device
import torch

transformer = ConsisIDTransformer3DModel.from_pretrained("BestWishYsh/ConsisID-preview", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
device = get_device()

transformer = ConsisIDTransformer3DModel.from_pretrained("BestWishYsh/ConsisID-preview", subfolder="transformer", torch_dtype=torch.bfloat16).to(device)
```

## ConsisIDTransformer3DModel

docs/source/en/api/models/ltx_video_transformer3d.md (5 changes: 4 additions & 1 deletion)

@@ -17,8 +17,11 @@ The model can be loaded with the following code snippet.

```python
from diffusers import LTXVideoTransformer3DModel
from diffusers.utils.torch_utils import get_device
import torch

transformer = LTXVideoTransformer3DModel.from_pretrained("Lightricks/LTX-Video", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
device = get_device()

transformer = LTXVideoTransformer3DModel.from_pretrained("Lightricks/LTX-Video", subfolder="transformer", torch_dtype=torch.bfloat16).to(device)
```

## LTXVideoTransformer3DModel

docs/source/en/api/pipelines/consistency_models.md (3 changes: 2 additions & 1 deletion)

@@ -29,8 +29,9 @@ For an additional speed-up, use `torch.compile` to generate multiple images in <
```diff
import torch
from diffusers import ConsistencyModelPipeline
from diffusers.utils.torch_utils import get_device

device = "cuda"
device = get_device()
# Load the cd_bedroom256_lpips checkpoint.
model_id_or_path = "openai/diffusers-cd_bedroom256_lpips"
pipe = ConsistencyModelPipeline.from_pretrained(model_id_or_path, torch_dtype=torch.float16)
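# Illustrative continuation (the rest of this hunk is collapsed): move the
# pipeline to the resolved device and optionally compile the UNet for speed.
pipe.to(device)
pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)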

docs/source/en/api/pipelines/flux.md (60 changes: 44 additions & 16 deletions)

@@ -102,12 +102,15 @@ out.save("image.png")
import torch
from diffusers import FluxFillPipeline
from diffusers.utils import load_image
from diffusers.utils.torch_utils import get_device

device = get_device()

image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/cup.png")
mask = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/cup_mask.png")

repo_id = "black-forest-labs/FLUX.1-Fill-dev"
pipe = FluxFillPipeline.from_pretrained(repo_id, torch_dtype=torch.bfloat16).to("cuda")
pipe = FluxFillPipeline.from_pretrained(repo_id, torch_dtype=torch.bfloat16).to(device)

image = pipe(
prompt="a white paper cup",
@@ -131,8 +134,11 @@ import torch
from controlnet_aux import CannyDetector
from diffusers import FluxControlPipeline
from diffusers.utils import load_image
from diffusers.utils.torch_utils import get_device

device = get_device()

pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Canny-dev", torch_dtype=torch.bfloat16).to("cuda")
pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Canny-dev", torch_dtype=torch.bfloat16).to(device)

prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
@@ -159,8 +165,11 @@ import torch
from controlnet_aux import CannyDetector
from diffusers import FluxControlPipeline
from diffusers.utils import load_image
from diffusers.utils.torch_utils import get_device

device = get_device()

pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to(device)
pipe.load_lora_weights("black-forest-labs/FLUX.1-Canny-dev-lora")

prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
@@ -189,9 +198,12 @@ image.save("output.png")
import torch
from diffusers import FluxControlPipeline, FluxTransformer2DModel
from diffusers.utils import load_image
from diffusers.utils.torch_utils import get_device
from image_gen_aux import DepthPreprocessor

pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Depth-dev", torch_dtype=torch.bfloat16).to("cuda")
device = get_device()

pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-Depth-dev", torch_dtype=torch.bfloat16).to(device)

prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
@@ -218,9 +230,12 @@ Depth Control is also possible with a LoRA variant of this condition. The usage
import torch
from diffusers import FluxControlPipeline, FluxTransformer2DModel
from diffusers.utils import load_image
from diffusers.utils.torch_utils import get_device
from image_gen_aux import DepthPreprocessor

pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
device = get_device()

pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to(device)
pipe.load_lora_weights("black-forest-labs/FLUX.1-Depth-dev-lora")

prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
@@ -251,9 +266,10 @@ image.save("output.png")
import torch
from diffusers import FluxPriorReduxPipeline, FluxPipeline
from diffusers.utils import load_image
device = "cuda"
dtype = torch.bfloat16
from diffusers.utils.torch_utils import get_device

device = get_device()
dtype = torch.bfloat16

repo_redux = "black-forest-labs/FLUX.1-Redux-dev"
repo_base = "black-forest-labs/FLUX.1-dev"
@@ -284,11 +300,14 @@ Flux Kontext is a model that allows in-context control of the image generation p
import torch
from diffusers import FluxKontextPipeline
from diffusers.utils import load_image
from diffusers.utils.torch_utils import get_device

device = get_device()

pipe = FluxKontextPipeline.from_pretrained(
"black-forest-labs/FLUX.1-Kontext-dev", torch_dtype=torch.bfloat16
)
pipe.to("cuda")
pipe.to(device)

image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png").convert("RGB")
prompt = "Make Pikachu hold a sign that says 'Black Forest Labs is awesome', yarn art style, detailed, vibrant colors"
@@ -305,13 +324,16 @@ Flux Kontext comes with an integrity safety checker, which should be run after t

```python
from flux.content_filters import PixtralContentFilter
from diffusers.utils.torch_utils import get_device

device = get_device()

# ... pipeline invocation to generate images

integrity_checker = PixtralContentFilter(torch.device("cuda"))
integrity_checker = PixtralContentFilter(torch.device(device))
image_ = np.array(image) / 255.0
image_ = 2 * image_ - 1
image_ = torch.from_numpy(image_).to("cuda", dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2)
image_ = torch.from_numpy(image_).to(device, dtype=torch.float32).unsqueeze(0).permute(0, 3, 1, 2)
if integrity_checker.test_image(image_):
raise ValueError("Your image has been flagged. Choose another prompt/image or try again.")
```
@@ -371,10 +393,13 @@ An IP-Adapter lets you prompt Flux with images, in addition to the text prompt.
import torch
from diffusers import FluxPipeline
from diffusers.utils import load_image
from diffusers.utils.torch_utils import get_device

device = get_device()

pipe = FluxPipeline.from_pretrained(
"black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
).to("cuda")
).to(device)

image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flux_ip_adapter_input.jpg").resize((1024, 1024))

@@ -411,7 +436,7 @@ Flux is a very large model and requires ~50GB of RAM/VRAM to load all the modeli

[Group offloading](../../optimization/memory#group-offloading) lowers VRAM usage by offloading groups of internal layers rather than the whole model or weights. You need to use [`~hooks.apply_group_offloading`] on all the model components of a pipeline. The `offload_type` parameter allows you to toggle between block and leaf-level offloading. Setting it to `leaf_level` offloads the lowest leaf-level parameters to the CPU instead of offloading at the module level.

On CUDA devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference.
On accelerator devices that support asynchronous data streaming, set `use_stream=True` to overlap data transfer and computation to accelerate inference.

> [!TIP]
> It is possible to mix block and leaf-level offloading for different components in a pipeline.
@@ -420,6 +445,9 @@ On CUDA devices that support asynchronous data streaming, set `use_stream=True`
import torch
from diffusers import FluxPipeline
from diffusers.hooks import apply_group_offloading
from diffusers.utils.torch_utils import get_device

device = get_device()

model_id = "black-forest-labs/FLUX.1-dev"
dtype = torch.bfloat16
@@ -432,27 +460,27 @@ apply_group_offloading(
pipe.transformer,
offload_type="leaf_level",
offload_device=torch.device("cpu"),
onload_device=torch.device("cuda"),
onload_device=torch.device(device),
use_stream=True,
)
apply_group_offloading(
pipe.text_encoder,
offload_device=torch.device("cpu"),
onload_device=torch.device("cuda"),
onload_device=torch.device(device),
offload_type="leaf_level",
use_stream=True,
)
apply_group_offloading(
pipe.text_encoder_2,
offload_device=torch.device("cpu"),
onload_device=torch.device("cuda"),
onload_device=torch.device(device),
offload_type="leaf_level",
use_stream=True,
)
apply_group_offloading(
pipe.vae,
offload_device=torch.device("cpu"),
onload_device=torch.device("cuda"),
onload_device=torch.device(device),
offload_type="leaf_level",
use_stream=True,
)
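
# Illustrative usage sketch (not part of this diff): with the group offloading
# hooks applied above, inference runs as usual; the prompt and generation
# parameters below are hypothetical examples.
prompt = "A cat holding a sign that says hello world"
image = pipe(prompt, num_inference_steps=28, guidance_scale=3.5).images[0]
image.save("flux-group-offloading.png")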

docs/source/en/api/pipelines/hunyuandit.md (5 changes: 4 additions & 1 deletion)

@@ -52,11 +52,14 @@ First, load the pipeline:

```python
from diffusers import HunyuanDiTPipeline
from diffusers.utils.torch_utils import get_device
import torch

device = get_device()

pipeline = HunyuanDiTPipeline.from_pretrained(
"Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16
).to("cuda")
).to(device)
```

Then change the memory layout of the pipeline's `transformer` and `vae` components to `torch.channels_last`:

docs/source/en/api/pipelines/kolors.md (5 changes: 4 additions & 1 deletion)

@@ -31,9 +31,12 @@ The abstract from the technical report is:
import torch

from diffusers import DPMSolverMultistepScheduler, KolorsPipeline
from diffusers.utils.torch_utils import get_device

device = get_device()

pipe = KolorsPipeline.from_pretrained("Kwai-Kolors/Kolors-diffusers", torch_dtype=torch.float16, variant="fp16")
pipe.to("cuda")
pipe.to(device)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)

image = pipe(

docs/source/en/api/pipelines/latte.md (5 changes: 4 additions & 1 deletion)

@@ -41,10 +41,13 @@ First, load the pipeline:
```python
import torch
from diffusers import LattePipeline
from diffusers.utils.torch_utils import get_device

device = get_device()

pipeline = LattePipeline.from_pretrained(
"maxin-cn/Latte-1", torch_dtype=torch.float16
).to("cuda")
).to(device)
```

Then change the memory layout of the pipeline's `transformer` and `vae` components to `torch.channels_last`:
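
The collapsed lines that follow presumably perform that conversion; a minimal sketch, assuming the `pipeline` object created above, looks like this:

```python
# Illustrative sketch (not copied from the collapsed diff): switch the
# transformer and VAE to the channels-last memory format.
pipeline.transformer.to(memory_format=torch.channels_last)
pipeline.vae.to(memory_format=torch.channels_last)
```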