make model docs device agnostic (2) #40256


Merged
11 commits merged on Aug 19, 2025
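The change is mechanical across every touched doc page: hard-coded `"cuda"` strings are replaced with the device the model actually lives on (`model.device`), and CUDA-only device selection is replaced with `infer_device()`. A minimal sketch of the pattern, assuming `accelerate` is installed for `device_map="auto"` (the checkpoint and prompt below are placeholders, not taken from any one file):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint; every page in this PR applies the same pattern to its own model.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2",
    device_map="auto",  # lets Accelerate place the model on the best available device, falling back to CPU
)

# Before: inputs were moved with .to("cuda"), which fails on machines without an NVIDIA GPU.
# After:  .to(model.device) follows the model to wherever device_map placed it.
inputs = tokenizer("Plants create energy through photosynthesis.", return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model(**inputs)
```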
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/bart.md
@@ -65,7 +65,7 @@ model = AutoModelForMaskedLM.from_pretrained(
device_map="auto",
attn_implementation="sdpa"
)
inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to("cuda")
inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/barthez.md
@@ -68,7 +68,7 @@ model = AutoModelForMaskedLM.from_pretrained(
torch_dtype=torch.float16,
device_map="auto",
)
inputs = tokenizer("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.", return_tensors="pt").to("cuda")
inputs = tokenizer("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/bartpho.md
@@ -82,7 +82,7 @@ Quang tổng hợp hay gọi tắt là quang hợp là quá trình thu nhận v
tảo và một số vi khuẩn để tạo ra hợp chất hữu cơ phục vụ bản thân cũng như làm nguồn thức ăn cho hầu hết các sinh vật
trên Trái Đất. Quang hợp trong thực vật thường liên quan đến chất tố diệp lục màu xanh lá cây và tạo ra oxy như một sản phẩm phụ
"""
-inputs = tokenizer(text, return_tensors="pt").to("cuda")
+inputs = tokenizer(text, return_tensors="pt").to(model.device)

outputs = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/bertweet.md
@@ -67,7 +67,7 @@ model = AutoModelForMaskedLM.from_pretrained(
torch_dtype=torch.float16,
device_map="auto"
)
inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to("cuda")
inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/big_bird.md
@@ -64,7 +64,7 @@ model = AutoModelForMaskedLM.from_pretrained(
torch_dtype=torch.float16,
device_map="auto",
)
inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to("cuda")
inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
4 changes: 2 additions & 2 deletions docs/source/en/model_doc/bigbird_pegasus.md
@@ -72,7 +72,7 @@ input_text = """Plants are among the most remarkable and essential life forms on
Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
@@ -115,7 +115,7 @@ input_text = """Plants are among the most remarkable and essential life forms on
Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/blip.md
@@ -73,7 +73,7 @@ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/
image = Image.open(requests.get(url, stream=True).raw)

question = "What is the weather in this image?"
-inputs = processor(images=image, text=question, return_tensors="pt").to("cuda", torch.float16)
+inputs = processor(images=image, text=question, return_tensors="pt").to(model.device, torch.float16)

output = model.generate(**inputs)
processor.batch_decode(output, skip_special_tokens=True)[0]
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/clap.md
@@ -48,7 +48,7 @@ tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

texts = ["the sound of a cat", "the sound of a dog", "music playing"]

-inputs = tokenizer(texts, padding=True, return_tensors="pt").to("cuda")
+inputs = tokenizer(texts, padding=True, return_tensors="pt").to(model.device)

with torch.no_grad():
text_features = model.get_text_features(**inputs)
4 changes: 2 additions & 2 deletions docs/source/en/model_doc/code_llama.md
@@ -74,7 +74,7 @@ model = AutoModelForCausalLM.from_pretrained(

# basic code generation
prompt = "# Function to calculate the factorial of a number\ndef factorial(n):"
-input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
+input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(
**input_ids,
@@ -121,7 +121,7 @@ model = AutoModelForCausalLM.from_pretrained(
)

prompt = "# Write a Python function to check if a string is a palindrome\ndef is_palindrome(s):"
-input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
+input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(**input_ids, max_new_tokens=200, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
16 changes: 8 additions & 8 deletions docs/source/en/model_doc/csm.md
@@ -38,10 +38,10 @@ CSM can be used to simply generate speech from a text prompt:

```python
import torch
-from transformers import CsmForConditionalGeneration, AutoProcessor
+from transformers import CsmForConditionalGeneration, AutoProcessor, infer_device

model_id = "sesame/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"
device = infer_device()

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
@@ -72,11 +72,11 @@ CSM can be used to generate speech given a conversation, allowing consistency in

```python
import torch
-from transformers import CsmForConditionalGeneration, AutoProcessor
+from transformers import CsmForConditionalGeneration, AutoProcessor, infer_device
from datasets import load_dataset, Audio

model_id = "sesame/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"
device = infer_device()

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
@@ -117,11 +117,11 @@ CSM supports batched inference!

```python
import torch
-from transformers import CsmForConditionalGeneration, AutoProcessor
+from transformers import CsmForConditionalGeneration, AutoProcessor, infer_device
from datasets import load_dataset, Audio

model_id = "sesame/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"
device = infer_device()

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
@@ -306,11 +306,11 @@ print("="*50)
CSM Transformers integration supports training!

```python
-from transformers import CsmForConditionalGeneration, AutoProcessor
+from transformers import CsmForConditionalGeneration, AutoProcessor, infer_device
from datasets import load_dataset, Audio

model_id = "sesame/csm-1b"
device = "cuda"
device = infer_device()

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
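`infer_device`, now imported from `transformers` in these snippets, picks the best available accelerator instead of assuming CUDA. On an older installation that does not export it, a rough stand-in could look like the following (a hypothetical helper sketched only for illustration, not the library's implementation):

```python
import torch

def infer_device_fallback() -> str:
    """Hypothetical replacement for transformers.infer_device on older releases."""
    if torch.cuda.is_available():
        return "cuda"
    if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
        return "mps"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"

device = infer_device_fallback()  # e.g. "cuda", "mps", "xpu", or "cpu"
```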
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/cvt.md
@@ -69,7 +69,7 @@ model = AutoModelForImageClassification.from_pretrained(

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
-inputs = image_processor(image, return_tensors="pt").to("cuda")
+inputs = image_processor(image, return_tensors="pt").to(model.device)

with torch.no_grad():
logits = model(**inputs).logits
6 changes: 3 additions & 3 deletions docs/source/en/model_doc/dbrx.md
@@ -58,7 +58,7 @@ model = DbrxForCausalLM.from_pretrained(

input_text = "What does it take to build a great LLM?"
messages = [{"role": "user", "content": input_text}]
-input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
+input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))
@@ -80,7 +80,7 @@ model = DbrxForCausalLM.from_pretrained(

input_text = "What does it take to build a great LLM?"
messages = [{"role": "user", "content": input_text}]
-input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
+input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))
@@ -102,7 +102,7 @@ model = DbrxForCausalLM.from_pretrained(

input_text = "What does it take to build a great LLM?"
messages = [{"role": "user", "content": input_text}]
-input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
+input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))
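`apply_chat_template(..., return_dict=True, return_tensors="pt")` returns a `BatchEncoding`, so the whole dictionary of tensors moves in one `.to(model.device)` call, just like plain tokenizer output. A minimal sketch of that pattern with a small placeholder checkpoint (the name below is an assumption chosen only for size; any chat-tuned model behaves the same way):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint; substitute any chat model you have access to.
checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")

messages = [{"role": "user", "content": "What does it take to build a great LLM?"}]
inputs = tokenizer.apply_chat_template(
    messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)  # BatchEncoding.to moves every tensor it contains

outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```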
6 changes: 3 additions & 3 deletions docs/source/en/model_doc/depth_pro.md
@@ -46,17 +46,17 @@ The DepthPro model processes an input image by first downsampling it at multiple
>>> import requests
>>> from PIL import Image
>>> import torch
->>> from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation
+>>> from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation, infer_device

->>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+>>> device = infer_device()

>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = DepthProImageProcessorFast.from_pretrained("apple/DepthPro-hf")
>>> model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf").to(device)

->>> inputs = image_processor(images=image, return_tensors="pt").to(device)
+>>> inputs = image_processor(images=image, return_tensors="pt").to(model.device)

>>> with torch.no_grad():
... outputs = model(**inputs)
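Because the model is still placed with an explicit `model = ... .to(device)` call here, `model.device` and `device` point at the same place; moving the inputs with `.to(model.device)` simply keeps them in sync if the placement logic ever changes. Transformers models expose a `device` property for this; a plain `torch.nn.Module` does not, but the same information can be read from its parameters. A tiny illustration with a generic `nn.Linear` (not the DepthPro classes):

```python
import torch
from torch import nn

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Linear(4, 2).to(device)

# Plain nn.Module has no .device attribute; read it off the parameters instead.
model_device = next(model.parameters()).device

x = torch.randn(1, 4).to(model_device)  # always lands wherever the model lives
print(model(x).shape)  # torch.Size([1, 2])
```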
12 changes: 6 additions & 6 deletions docs/source/en/model_doc/dia.md
@@ -42,9 +42,9 @@ tokens and decodes them back into audio.
### Generation with Text

```python
-from transformers import AutoProcessor, DiaForConditionalGeneration
+from transformers import AutoProcessor, DiaForConditionalGeneration, infer_device

torch_device = "cuda"
torch_device = infer_device()
model_checkpoint = "nari-labs/Dia-1.6B-0626"

text = ["[S1] Dia is an open weights text to dialogue model."]
@@ -64,9 +64,9 @@ processor.save_audio(outputs, "example.wav")

```python
from datasets import load_dataset, Audio
-from transformers import AutoProcessor, DiaForConditionalGeneration
+from transformers import AutoProcessor, DiaForConditionalGeneration, infer_device

torch_device = "cuda"
torch_device = infer_device()
model_checkpoint = "nari-labs/Dia-1.6B-0626"

ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
@@ -91,9 +91,9 @@ processor.save_audio(outputs, "example_with_audio.wav")

```python
from datasets import load_dataset, Audio
-from transformers import AutoProcessor, DiaForConditionalGeneration
+from transformers import AutoProcessor, DiaForConditionalGeneration, infer_device

torch_device = "cuda"
torch_device = infer_device()
model_checkpoint = "nari-labs/Dia-1.6B-0626"

ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/distilbert.md
@@ -72,7 +72,7 @@ model = AutoModelForSequenceClassification.from_pretrained(
device_map="auto",
attn_implementation="sdpa"
)
inputs = tokenizer("I love using Hugging Face Transformers!", return_tensors="pt").to("cuda")
inputs = tokenizer("I love using Hugging Face Transformers!", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/dit.md
@@ -70,7 +70,7 @@ model = AutoModelForImageClassification.from_pretrained(
)
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/dit-example.jpg"
image = Image.open(requests.get(url, stream=True).raw)
-inputs = image_processor(image, return_tensors="pt").to("cuda")
+inputs = image_processor(image, return_tensors="pt").to(model.device)

with torch.no_grad():
logits = model(**inputs).logits
6 changes: 3 additions & 3 deletions docs/source/en/model_doc/gemma2.md
@@ -52,7 +52,7 @@ pipe = pipeline(
task="text-generation",
model="google/gemma-2-9b",
torch_dtype=torch.bfloat16,
device="cuda",
device_map="auto",
)

pipe("Explain quantum computing simply. ", max_new_tokens=50)
@@ -74,7 +74,7 @@ model = AutoModelForCausalLM.from_pretrained(
)

input_text = "Explain quantum computing simply."
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=32, cache_implementation="static")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@@ -108,7 +108,7 @@ model = AutoModelForCausalLM.from_pretrained(
)

input_text = "Explain quantum computing simply."
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=32, cache_implementation="static")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
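For `pipeline`, the device-agnostic counterpart of `device="cuda"` is `device_map="auto"`, which needs `accelerate` installed and places the model on whatever accelerator is available, falling back to CPU. A short sketch of the changed call, assuming you have access to the gated Gemma 2 weights (any causal LM checkpoint shows the same behaviour):

```python
import torch
from transformers import pipeline

pipe = pipeline(
    task="text-generation",
    model="google/gemma-2-9b",   # gated checkpoint; substitute any causal LM you can download
    torch_dtype=torch.bfloat16,
    device_map="auto",           # replaces device="cuda"
)
print(pipe("Explain quantum computing simply.", max_new_tokens=50))
```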
6 changes: 3 additions & 3 deletions docs/source/en/model_doc/glm.md
@@ -61,8 +61,8 @@ Tips:
In the following, we demonstrate how to use `glm-4-9b-chat` for inference. Note that we use the ChatML format for dialog; in this demo we show how to leverage `apply_chat_template` for this purpose.

```python
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
->>> device = "cuda" # the device to load the model onto
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+>>> device = infer_device() # the device to load the model onto

>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto", trust_remote_code=True)
>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat")
@@ -73,7 +73,7 @@ In the following, we demonstrate how to use `glm-4-9b-chat` for the inference. N

>>> text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

->>> model_inputs = tokenizer([text], return_tensors="pt").to(device)
+>>> model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True)

4 changes: 2 additions & 2 deletions docs/source/en/model_doc/gpt2.md
@@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")
input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to(model.device)

output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
@@ -102,7 +102,7 @@ model = AutoModelForCausalLM.from_pretrained(
)

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-xl")
inputs = tokenizer("Once upon a time, there was a magical forest", return_tensors="pt").to("cuda")
inputs = tokenizer("Once upon a time, there was a magical forest", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
6 changes: 3 additions & 3 deletions docs/source/en/model_doc/gpt_bigcode.md
@@ -67,15 +67,15 @@ To load and run a model using Flash Attention 2, refer to the snippet below:

```python
>>> import torch
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
->>> device = "cuda" # the device to load the model onto
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+>>> device = infer_device() # the device to load the model onto

>>> model = AutoModelForCausalLM.from_pretrained("bigcode/gpt_bigcode-santacoder", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
>>> tokenizer = AutoTokenizer.from_pretrained("bigcode/gpt_bigcode-santacoder")

>>> prompt = "def hello_world():"

->>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
+>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
>>> model.to(device)

>>> generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
10 changes: 5 additions & 5 deletions docs/source/en/model_doc/gptj.md
@@ -41,10 +41,10 @@ This model was contributed by [Stella Biderman](https://huggingface.co/stellaath
which could be used to further minimize the RAM usage:

```python
->>> from transformers import GPTJForCausalLM
+>>> from transformers import GPTJForCausalLM, infer_device
>>> import torch

>>> device = "cuda"
>>> device = infer_device()
>>> model = GPTJForCausalLM.from_pretrained(
... "EleutherAI/gpt-j-6B",
... revision="float16",
@@ -96,10 +96,10 @@ model.
...or in float16 precision:

```python
->>> from transformers import GPTJForCausalLM, AutoTokenizer
+>>> from transformers import GPTJForCausalLM, AutoTokenizer, infer_device
>>> import torch

>>> device = "cuda"
>>> device = infer_device()
>>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16).to(device)
>>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

@@ -109,7 +109,7 @@ model.
... "researchers was the fact that the unicorns spoke perfect English."
... )

->>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+>>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

>>> gen_tokens = model.generate(
... input_ids,
3 changes: 1 addition & 2 deletions docs/source/en/model_doc/helium.md
@@ -119,14 +119,13 @@ In the following, we demonstrate how to use `helium-1-preview` for the inference

```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> device = "cuda" # the device to load the model onto

>>> model = AutoModelForCausalLM.from_pretrained("kyutai/helium-1-preview-2b", device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("kyutai/helium-1-preview-2b")

>>> prompt = "Give me a short introduction to large language model."

->>> model_inputs = tokenizer(prompt, return_tensors="pt").to(device)
+>>> model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True)
