
Commit 26c2bd9

Update transformers multimodal docs to reflect new changes with Chat and Batching examples
1 parent d12666a commit 26c2bd9

1 file changed (+111, -44 lines)

docs/features/models/transformers_multimodal.md

Lines changed: 111 additions & 44 deletions
@@ -10,18 +10,18 @@ The Outlines `TransformersMultiModal` model inherits from `Transformers` and sha
 
 To load the model, you can use the `from_transformers` function. It takes 2 arguments:
 
-- `model`: a `transformers` model (created with `AutoModelForCausalLM` for instance)
+- `model`: a `transformers` model (created with `AutoModelForImageTextToText` for instance)
 - `tokenizer_or_processor`: a `transformers` processor (created with `AutoProcessor` for instance, it must be an instance of `ProcessorMixin`)
 
 For instance:
 
 ```python
 import outlines
-from transformers import AutoModelForCausalLM, AutoProcessor
+from transformers import AutoModelForImageTextToText, AutoProcessor
 
 # Create the transformers model and processor
-hf_model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
-hf_processor = AutoProcessor.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
+hf_model = AutoModelForImageTextToText.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
+hf_processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
 
 # Create the Outlines model
 model = outlines.from_transformers(hf_model, hf_processor)
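As an aside for readers of this excerpt: the unchanged portion of the file (visible as context in the next hunk) already calls the loaded model directly. A minimal sketch of that calling convention, assuming a hypothetical local `cat.png` and a processor whose template expects one `<image>` tag per image (LLaVA-style; neither assumption is part of the commit):

```python
from PIL import Image as PILImage

from outlines.inputs import Image

# Hypothetical local image; PIL sets .format automatically for files read from disk.
image = PILImage.open("cat.png")

# One <image> tag in the text, one Image asset in the input list.
result = model(["<image>Describe the image.", Image(image)], max_new_tokens=50)
print(result)
```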
@@ -76,54 +76,66 @@ result = model(
 print(result) # '{"specie": "cat", "color": "white", "weight": 4}'
 print(Animal.model_validate_json(result)) # specie=cat, color=white, weight=4
 ```
+!!! Warning
+
+    Make sure your prompt contains the tags expected by your processor to correctly inject the assets in the prompt. For some vision multimodal models, for instance, you need to add as many `<image>` tags in your prompt as there are image assets included in your model input. The `Chat` method, by contrast, does not require this step.
+
 
-The `TransformersMultiModal` model supports batch generation. To use it, invoke the `batch` method with a list of lists. You will receive as a result a list of completions.
+### Chat
+The `Chat` interface offers a more convenient way to work with multimodal inputs: you don't need to manually add asset tags such as `<image>`, because the model's HF processor handles the chat templating and asset placement for you automatically.
+To use it, call the model with a `Chat` instance in the multimodal chat format. Assets must be wrapped in the `outlines.inputs` types (`Image`, `Audio`, `Video`), and only the `image`, `video` and `audio` content types are supported.
 
 For instance:
 
 ```python
+import outlines
+from outlines.inputs import Chat, Image
+from transformers import AutoModelForImageTextToText, AutoProcessor
+from PIL import Image as PILImage
 from io import BytesIO
 from urllib.request import urlopen
+import torch
 
-from PIL import Image as PILImage
-from transformers import (
-    LlavaForConditionalGeneration,
-    AutoProcessor,
-)
-
-import outlines
-from outlines.inputs import Image
-
-TEST_MODEL = "trl-internal-testing/tiny-LlavaForConditionalGeneration"
-IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg"
-IMAGE_URL_2 ="https://upload.wikimedia.org/wikipedia/commons/9/98/Aldrin_Apollo_11_original.jpg"
+model_kwargs = {
+    "torch_dtype": torch.bfloat16,
+    "attn_implementation": "flash_attention_2",
+    "device_map": "auto",
+}
 
 def get_image_from_url(image_url):
     img_byte_stream = BytesIO(urlopen(image_url).read())
     image = PILImage.open(img_byte_stream).convert("RGB")
     image.format = "PNG"
     return image
-
-# Create a model
+
+# Create the model
 model = outlines.from_transformers(
-    LlavaForConditionalGeneration.from_pretrained(TEST_MODEL),
-    AutoProcessor.from_pretrained(TEST_MODEL),
+    AutoModelForImageTextToText.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", **model_kwargs),
+    AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", **model_kwargs)
 )
 
-# Call the batch method with a list of model input dicts
-result = model.batch(
-    [
-        ["<image>Describe the image.", Image(get_image_from_url(IMAGE_URL))],
-        ["<image>Describe the image.", Image(get_image_from_url(IMAGE_URL_2))],
-    ]
-)
-print(result) # ['The image shows a cat', 'The image shows an astronaut']
+IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg"
+
+# Create the chat multimodal input
+prompt = Chat([
+    {
+        "role": "user",
+        "content": [
+            {"type": "image", "image": Image(get_image_from_url(IMAGE_URL))},
+            {"type": "text", "text": "Describe the image in a few words."}
+        ],
+    }
+])
+
+# Call the model to generate a response
+response = model(prompt, max_new_tokens=50)
+print(response) # 'A Siamese cat with blue eyes is sitting on a cat tree, looking alert and curious.'
 ```
 
-### Chat
-You can use chat inputs with the `TransformersMultiModal` model. To do so, call the model with a `Chat` instance.
+### Batching
+The `TransformersMultiModal` model supports batching through the `batch` method. To use it, provide a list of prompts (in any of the formats described above) to the `batch` method; you will receive a list of completions in return.
 
-For instance:
+An example using the Chat format:
 
 ```python
 import outlines
@@ -133,18 +145,22 @@ from PIL import Image as PILImage
 from io import BytesIO
 from urllib.request import urlopen
 import torch
+from pydantic import BaseModel
 
 model_kwargs = {
     "torch_dtype": torch.bfloat16,
     "attn_implementation": "flash_attention_2",
     "device_map": "auto",
 }
 
+class Animal(BaseModel):
+    animal: str
+    color: str
+
 def get_image_from_url(image_url):
     img_byte_stream = BytesIO(urlopen(image_url).read())
     image = PILImage.open(img_byte_stream).convert("RGB")
     image.format = "PNG"
-    image.save("image.png")
     return image
 
 # Create the model
@@ -153,25 +169,76 @@ model = outlines.from_transformers(
     AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct", **model_kwargs)
 )
 
-IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg"
+IMAGE_URL_1 = "https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg"
+IMAGE_URL_2 = "https://upload.wikimedia.org/wikipedia/commons/a/af/Golden_retriever_eating_pigs_foot.jpg"
 
-# Create the chat mutimodal input
-prompt = Chat([
+# Create the chat multimodal messages
+messages = [
     {
         "role": "user",
         "content": [
-            {"type": "image", "image": Image(get_image_from_url(IMAGE_URL))},
-            {"type": "text", "text": "Describe the image in few words."}
+            {"type": "text", "text": "Describe the image in a few words."},
+            {"type": "image", "image": Image(get_image_from_url(IMAGE_URL_1))},
         ],
-    }
-])
+    },
+]
+
+messages_2 = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "text", "text": "Describe the image in a few words."},
+            {"type": "image", "image": Image(get_image_from_url(IMAGE_URL_2))},
+        ],
+    },
+]
+
+prompts = [Chat(messages), Chat(messages_2)]
 
 # Call the model to generate a response
-response = model(prompt, max_new_tokens=50)
-print(response) # 'A Siamese cat with blue eyes is sitting on a cat tree, looking alert and curious.'
+responses = model.batch(prompts, output_type=Animal, max_new_tokens=100)
+print(responses) # ['{ "animal": "cat", "color": "white and gray" }', '{ "animal": "dog", "color": "white" }']
+print([Animal.model_validate_json(i) for i in responses]) # [Animal(animal='cat', color='white and gray'), Animal(animal='dog', color='white')]
 ```
 
 
-!!! Warning
+An example using a list of lists with asset tags:
 
-    Make sure your prompt contains the tags expected by your processor to correctly inject the assets in the prompt. For some vision multimodal models for instance, you need to add as many `<image>` tags in your prompt as there are image assets included in your model input.
+```python
+from io import BytesIO
+from urllib.request import urlopen
+
+from PIL import Image as PILImage
+from transformers import (
+    LlavaForConditionalGeneration,
+    AutoProcessor,
+)
+
+import outlines
+from outlines.inputs import Image
+
+TEST_MODEL = "trl-internal-testing/tiny-LlavaForConditionalGeneration"
+IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/2/25/Siam_lilacpoint.jpg"
+IMAGE_URL_2 = "https://upload.wikimedia.org/wikipedia/commons/9/98/Aldrin_Apollo_11_original.jpg"
+
+def get_image_from_url(image_url):
+    img_byte_stream = BytesIO(urlopen(image_url).read())
+    image = PILImage.open(img_byte_stream).convert("RGB")
+    image.format = "PNG"
+    return image
+
+# Create a model
+model = outlines.from_transformers(
+    LlavaForConditionalGeneration.from_pretrained(TEST_MODEL),
+    AutoProcessor.from_pretrained(TEST_MODEL),
+)
+
+# Call the batch method with a list of model inputs
+result = model.batch(
+    [
+        ["<image>Describe the image.", Image(get_image_from_url(IMAGE_URL))],
+        ["<image>Describe the image.", Image(get_image_from_url(IMAGE_URL_2))],
+    ]
+)
+print(result) # ['The image shows a cat', 'The image shows an astronaut']
+```
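To make the warning introduced earlier in this commit concrete: when you bypass `Chat`, the number of tags must match the number of image assets. A hedged sketch (not part of the diff), reusing `model`, `get_image_from_url`, `IMAGE_URL`, and `IMAGE_URL_2` from the example above:

```python
# Illustrative only: two images in one prompt means two <image> tags in the text.
result = model(
    [
        "<image><image>Compare the two images.",
        Image(get_image_from_url(IMAGE_URL)),
        Image(get_image_from_url(IMAGE_URL_2)),
    ],
    max_new_tokens=50,
)
print(result)
```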
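The structured-output pattern shown with `batch` should carry over to a single `Chat` call as well; a sketch, assuming the Qwen model, the `Animal` schema, and `IMAGE_URL_1` defined in the Chat batching example:

```python
# Sketch: constrain one chat response to the Animal JSON schema.
prompt = Chat([
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe the animal in the image."},
            {"type": "image", "image": Image(get_image_from_url(IMAGE_URL_1))},
        ],
    }
])
response = model(prompt, output_type=Animal, max_new_tokens=100)
print(Animal.model_validate_json(response))
```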
