make model docs device agnostic (2) #40256


Merged
11 commits merged on Aug 19, 2025
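The change is mechanical across every touched doc page: hard-coded `"cuda"` strings are replaced with the device the model actually lives on (`model.device`), and CUDA-only device selection is replaced with `infer_device()`. A minimal sketch of the pattern, assuming `accelerate` is installed for `device_map="auto"` (the checkpoint and prompt below are placeholders, not taken from any one file):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint; every page in this PR applies the same pattern to its own model.
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained(
    "openai-community/gpt2",
    device_map="auto",  # lets Accelerate place the model on the best available device, falling back to CPU
)

# Before: inputs were moved with .to("cuda"), which fails on machines without an NVIDIA GPU.
# After:  .to(model.device) follows the model to wherever device_map placed it.
inputs = tokenizer("Plants create energy through photosynthesis.", return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model(**inputs)
```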
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/bart.md
@@ -65,7 +65,7 @@ model = AutoModelForMaskedLM.from_pretrained(
device_map="auto",
attn_implementation="sdpa"
)
inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to("cuda")
inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/barthez.md
@@ -68,7 +68,7 @@ model = AutoModelForMaskedLM.from_pretrained(
torch_dtype=torch.float16,
device_map="auto",
)
inputs = tokenizer("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.", return_tensors="pt").to("cuda")
inputs = tokenizer("Les plantes produisent <mask> grâce à un processus appelé photosynthèse.", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/bartpho.md
@@ -82,7 +82,7 @@ Quang tổng hợp hay gọi tắt là quang hợp là quá trình thu nhận v
tảo và một số vi khuẩn để tạo ra hợp chất hữu cơ phục vụ bản thân cũng như làm nguồn thức ăn cho hầu hết các sinh vật
trên Trái Đất. Quang hợp trong thực vật thường liên quan đến chất tố diệp lục màu xanh lá cây và tạo ra oxy như một sản phẩm phụ
"""
-inputs = tokenizer(text, return_tensors="pt").to("cuda")
+inputs = tokenizer(text, return_tensors="pt").to(model.device)

outputs = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20)
tokenizer.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/bertweet.md
@@ -67,7 +67,7 @@ model = AutoModelForMaskedLM.from_pretrained(
torch_dtype=torch.float16,
device_map="auto"
)
inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to("cuda")
inputs = tokenizer("Plants create <mask> through a process known as photosynthesis.", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/big_bird.md
@@ -64,7 +64,7 @@ model = AutoModelForMaskedLM.from_pretrained(
torch_dtype=torch.float16,
device_map="auto",
)
inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to("cuda")
inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
4 changes: 2 additions & 2 deletions docs/source/en/model_doc/bigbird_pegasus.md
@@ -72,7 +72,7 @@ input_text = """Plants are among the most remarkable and essential life forms on
Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
@@ -115,7 +115,7 @@ input_text = """Plants are among the most remarkable and essential life forms on
Through photosynthesis, plants capture energy from sunlight using a green pigment called chlorophyll, which is located in specialized cell structures called chloroplasts. In the presence of light, plants absorb carbon dioxide from the atmosphere through small pores in their leaves called stomata, and take in water from the soil through their root systems.
These ingredients are then transformed into glucose, a type of sugar that serves as a source of chemical energy, and oxygen, which is released as a byproduct into the atmosphere. The glucose produced during photosynthesis is not just used immediately; plants also store it as starch or convert it into other organic compounds like cellulose, which is essential for building their cellular structure.
This energy reserve allows them to grow, develop leaves, produce flowers, bear fruit, and carry out various physiological processes throughout their lifecycle."""
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/blip.md
@@ -73,7 +73,7 @@ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/
image = Image.open(requests.get(url, stream=True).raw)

question = "What is the weather in this image?"
-inputs = processor(images=image, text=question, return_tensors="pt").to("cuda", torch.float16)
+inputs = processor(images=image, text=question, return_tensors="pt").to(model.device, torch.float16)

output = model.generate(**inputs)
processor.batch_decode(output, skip_special_tokens=True)[0]
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/clap.md
@@ -48,7 +48,7 @@ tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

texts = ["the sound of a cat", "the sound of a dog", "music playing"]

-inputs = tokenizer(texts, padding=True, return_tensors="pt").to("cuda")
+inputs = tokenizer(texts, padding=True, return_tensors="pt").to(model.device)

with torch.no_grad():
text_features = model.get_text_features(**inputs)
4 changes: 2 additions & 2 deletions docs/source/en/model_doc/code_llama.md
@@ -74,7 +74,7 @@ model = AutoModelForCausalLM.from_pretrained(

# basic code generation
prompt = "# Function to calculate the factorial of a number\ndef factorial(n):"
-input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
+input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(
**input_ids,
@@ -121,7 +121,7 @@ model = AutoModelForCausalLM.from_pretrained(
)

prompt = "# Write a Python function to check if a string is a palindrome\ndef is_palindrome(s):"
-input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
+input_ids = tokenizer(prompt, return_tensors="pt").to(model.device)

output = model.generate(**input_ids, max_new_tokens=200, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
16 changes: 8 additions & 8 deletions docs/source/en/model_doc/csm.md
@@ -38,10 +38,10 @@ CSM can be used to simply generate speech from a text prompt:

```python
import torch
-from transformers import CsmForConditionalGeneration, AutoProcessor
+from transformers import CsmForConditionalGeneration, AutoProcessor, infer_device

model_id = "sesame/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"
device = infer_device()

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
@@ -72,11 +72,11 @@ CSM can be used to generate speech given a conversation, allowing consistency in

```python
import torch
-from transformers import CsmForConditionalGeneration, AutoProcessor
+from transformers import CsmForConditionalGeneration, AutoProcessor, infer_device
from datasets import load_dataset, Audio

model_id = "sesame/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"
device = infer_device()

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
@@ -117,11 +117,11 @@ CSM supports batched inference!

```python
import torch
-from transformers import CsmForConditionalGeneration, AutoProcessor
+from transformers import CsmForConditionalGeneration, AutoProcessor, infer_device
from datasets import load_dataset, Audio

model_id = "sesame/csm-1b"
device = "cuda" if torch.cuda.is_available() else "cpu"
device = infer_device()

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
@@ -306,11 +306,11 @@ print("="*50)
CSM Transformers integration supports training!

```python
-from transformers import CsmForConditionalGeneration, AutoProcessor
+from transformers import CsmForConditionalGeneration, AutoProcessor, infer_device
from datasets import load_dataset, Audio

model_id = "sesame/csm-1b"
device = "cuda"
device = infer_device()

# load the model and the processor
processor = AutoProcessor.from_pretrained(model_id)
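`infer_device`, now imported from `transformers` in these snippets, picks the best available accelerator instead of assuming CUDA. On an older installation that does not export it, a rough stand-in could look like the following (a hypothetical helper sketched only for illustration, not the library's implementation):

```python
import torch

def infer_device_fallback() -> str:
    """Hypothetical replacement for transformers.infer_device on older releases."""
    if torch.cuda.is_available():
        return "cuda"
    if getattr(torch.backends, "mps", None) and torch.backends.mps.is_available():
        return "mps"
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return "xpu"
    return "cpu"

device = infer_device_fallback()  # e.g. "cuda", "mps", "xpu", or "cpu"
```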
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/cvt.md
@@ -69,7 +69,7 @@ model = AutoModelForImageClassification.from_pretrained(

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
-inputs = image_processor(image, return_tensors="pt").to("cuda")
+inputs = image_processor(image, return_tensors="pt").to(model.device)

with torch.no_grad():
logits = model(**inputs).logits
6 changes: 3 additions & 3 deletions docs/source/en/model_doc/dbrx.md
@@ -58,7 +58,7 @@ model = DbrxForCausalLM.from_pretrained(

input_text = "What does it take to build a great LLM?"
messages = [{"role": "user", "content": input_text}]
-input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
+input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))
@@ -80,7 +80,7 @@ model = DbrxForCausalLM.from_pretrained(

input_text = "What does it take to build a great LLM?"
messages = [{"role": "user", "content": input_text}]
-input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
+input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))
@@ -102,7 +102,7 @@ model = DbrxForCausalLM.from_pretrained(

input_text = "What does it take to build a great LLM?"
messages = [{"role": "user", "content": input_text}]
-input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to("cuda")
+input_ids = tokenizer.apply_chat_template(messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=200)
print(tokenizer.decode(outputs[0]))
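`apply_chat_template(..., return_dict=True, return_tensors="pt")` returns a `BatchEncoding`, so the whole dictionary of tensors moves in one `.to(model.device)` call, just like plain tokenizer output. A minimal sketch of that pattern with a small placeholder checkpoint (the name below is an assumption chosen only for size; any chat-tuned model behaves the same way):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint; substitute any chat model you have access to.
checkpoint = "HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")

messages = [{"role": "user", "content": "What does it take to build a great LLM?"}]
inputs = tokenizer.apply_chat_template(
    messages, return_dict=True, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)  # BatchEncoding.to moves every tensor it contains

outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```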
6 changes: 3 additions & 3 deletions docs/source/en/model_doc/depth_pro.md
@@ -46,17 +46,17 @@ The DepthPro model processes an input image by first downsampling it at multiple
>>> import requests
>>> from PIL import Image
>>> import torch
->>> from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation
+>>> from transformers import DepthProImageProcessorFast, DepthProForDepthEstimation, infer_device

->>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+>>> device = infer_device()

>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> image_processor = DepthProImageProcessorFast.from_pretrained("apple/DepthPro-hf")
>>> model = DepthProForDepthEstimation.from_pretrained("apple/DepthPro-hf").to(device)

->>> inputs = image_processor(images=image, return_tensors="pt").to(device)
+>>> inputs = image_processor(images=image, return_tensors="pt").to(model.device)

>>> with torch.no_grad():
... outputs = model(**inputs)
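Because the model is still placed with an explicit `model = ... .to(device)` call here, `model.device` and `device` point at the same place; moving the inputs with `.to(model.device)` simply keeps them in sync if the placement logic ever changes. Transformers models expose a `device` property for this; a plain `torch.nn.Module` does not, but the same information can be read from its parameters. A tiny illustration with a generic `nn.Linear` (not the DepthPro classes):

```python
import torch
from torch import nn

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Linear(4, 2).to(device)

# Plain nn.Module has no .device attribute; read it off the parameters instead.
model_device = next(model.parameters()).device

x = torch.randn(1, 4).to(model_device)  # always lands wherever the model lives
print(model(x).shape)  # torch.Size([1, 2])
```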
12 changes: 6 additions & 6 deletions docs/source/en/model_doc/dia.md
@@ -42,9 +42,9 @@ tokens and decodes them back into audio.
### Generation with Text

```python
-from transformers import AutoProcessor, DiaForConditionalGeneration
+from transformers import AutoProcessor, DiaForConditionalGeneration, infer_device

torch_device = "cuda"
torch_device = infer_device()
model_checkpoint = "nari-labs/Dia-1.6B-0626"

text = ["[S1] Dia is an open weights text to dialogue model."]
@@ -64,9 +64,9 @@ processor.save_audio(outputs, "example.wav")

```python
from datasets import load_dataset, Audio
-from transformers import AutoProcessor, DiaForConditionalGeneration
+from transformers import AutoProcessor, DiaForConditionalGeneration, infer_device

torch_device = "cuda"
torch_device = infer_device()
model_checkpoint = "nari-labs/Dia-1.6B-0626"

ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
@@ -91,9 +91,9 @@ processor.save_audio(outputs, "example_with_audio.wav")

```python
from datasets import load_dataset, Audio
-from transformers import AutoProcessor, DiaForConditionalGeneration
+from transformers import AutoProcessor, DiaForConditionalGeneration, infer_device

torch_device = "cuda"
torch_device = infer_device()
model_checkpoint = "nari-labs/Dia-1.6B-0626"

ds = load_dataset("hf-internal-testing/dailytalk-dummy", split="train")
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/distilbert.md
@@ -72,7 +72,7 @@ model = AutoModelForSequenceClassification.from_pretrained(
device_map="auto",
attn_implementation="sdpa"
)
inputs = tokenizer("I love using Hugging Face Transformers!", return_tensors="pt").to("cuda")
inputs = tokenizer("I love using Hugging Face Transformers!", return_tensors="pt").to(model.device)

with torch.no_grad():
outputs = model(**inputs)
2 changes: 1 addition & 1 deletion docs/source/en/model_doc/dit.md
@@ -70,7 +70,7 @@ model = AutoModelForImageClassification.from_pretrained(
)
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/dit-example.jpg"
image = Image.open(requests.get(url, stream=True).raw)
-inputs = image_processor(image, return_tensors="pt").to("cuda")
+inputs = image_processor(image, return_tensors="pt").to(model.device)

with torch.no_grad():
logits = model(**inputs).logits
6 changes: 3 additions & 3 deletions docs/source/en/model_doc/gemma2.md
@@ -52,7 +52,7 @@ pipe = pipeline(
task="text-generation",
model="google/gemma-2-9b",
torch_dtype=torch.bfloat16,
device="cuda",
device_map="auto",
)

pipe("Explain quantum computing simply. ", max_new_tokens=50)
@@ -74,7 +74,7 @@ model = AutoModelForCausalLM.from_pretrained(
)

input_text = "Explain quantum computing simply."
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=32, cache_implementation="static")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
@@ -108,7 +108,7 @@ model = AutoModelForCausalLM.from_pretrained(
)

input_text = "Explain quantum computing simply."
-input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+input_ids = tokenizer(input_text, return_tensors="pt").to(model.device)

outputs = model.generate(**input_ids, max_new_tokens=32, cache_implementation="static")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
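For `pipeline`, the device-agnostic counterpart of `device="cuda"` is `device_map="auto"`, which needs `accelerate` installed and places the model on whatever accelerator is available, falling back to CPU. A short sketch of the changed call, assuming you have access to the gated Gemma 2 weights (any causal LM checkpoint shows the same behaviour):

```python
import torch
from transformers import pipeline

pipe = pipeline(
    task="text-generation",
    model="google/gemma-2-9b",   # gated checkpoint; substitute any causal LM you can download
    torch_dtype=torch.bfloat16,
    device_map="auto",           # replaces device="cuda"
)
print(pipe("Explain quantum computing simply.", max_new_tokens=50))
```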
6 changes: 3 additions & 3 deletions docs/source/en/model_doc/glm.md
@@ -61,8 +61,8 @@ Tips:
In the following, we demonstrate how to use `glm-4-9b-chat` for inference. Note that we use the ChatML format for dialog; in this demo we show how to leverage `apply_chat_template` for this purpose.

```python
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
->>> device = "cuda" # the device to load the model onto
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+>>> device = infer_device() # the device to load the model onto

>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto", trust_remote_code=True)
>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat")
@@ -73,7 +73,7 @@ In the following, we demonstrate how to use `glm-4-9b-chat` for the inference. N

>>> text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

->>> model_inputs = tokenizer([text], return_tensors="pt").to(device)
+>>> model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True)

4 changes: 2 additions & 2 deletions docs/source/en/model_doc/gpt2.md
@@ -58,7 +58,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")
input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to(model.device)

output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
@@ -102,7 +102,7 @@ model = AutoModelForCausalLM.from_pretrained(
)

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-xl")
inputs = tokenizer("Once upon a time, there was a magical forest", return_tensors="pt").to("cuda")
inputs = tokenizer("Once upon a time, there was a magical forest", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
6 changes: 3 additions & 3 deletions docs/source/en/model_doc/gpt_bigcode.md
@@ -67,15 +67,15 @@ To load and run a model using Flash Attention 2, refer to the snippet below:

```python
>>> import torch
->>> from transformers import AutoModelForCausalLM, AutoTokenizer
->>> device = "cuda" # the device to load the model onto
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer, infer_device
+>>> device = infer_device() # the device to load the model onto

>>> model = AutoModelForCausalLM.from_pretrained("bigcode/gpt_bigcode-santacoder", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
>>> tokenizer = AutoTokenizer.from_pretrained("bigcode/gpt_bigcode-santacoder")

>>> prompt = "def hello_world():"

->>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
+>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
>>> model.to(device)

>>> generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
10 changes: 5 additions & 5 deletions docs/source/en/model_doc/gptj.md
@@ -41,10 +41,10 @@ This model was contributed by [Stella Biderman](https://huggingface.co/stellaath
which could be used to further minimize the RAM usage:

```python
->>> from transformers import GPTJForCausalLM
+>>> from transformers import GPTJForCausalLM, infer_device
>>> import torch

>>> device = "cuda"
>>> device = infer_device()
>>> model = GPTJForCausalLM.from_pretrained(
... "EleutherAI/gpt-j-6B",
... revision="float16",
@@ -96,10 +96,10 @@ model.
...or in float16 precision:

```python
->>> from transformers import GPTJForCausalLM, AutoTokenizer
+>>> from transformers import GPTJForCausalLM, AutoTokenizer, infer_device
>>> import torch

>>> device = "cuda"
>>> device = infer_device()
>>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16).to(device)
>>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

@@ -109,7 +109,7 @@ model.
... "researchers was the fact that the unicorns spoke perfect English."
... )

->>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+>>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

>>> gen_tokens = model.generate(
... input_ids,
3 changes: 1 addition & 2 deletions docs/source/en/model_doc/helium.md
@@ -119,14 +119,13 @@ In the following, we demonstrate how to use `helium-1-preview` for the inference

```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> device = "cuda" # the device to load the model onto

>>> model = AutoModelForCausalLM.from_pretrained("kyutai/helium-1-preview-2b", device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("kyutai/helium-1-preview-2b")

>>> prompt = "Give me a short introduction to large language model."

->>> model_inputs = tokenizer(prompt, return_tensors="pt").to(device)
+>>> model_inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

>>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True)
