Skip to content

Commit 66a8d9b

Browse files
feat: add multimodal support to AnthropicChatGenerator (#2186)
* Add multimodal support to AnthropicChatGenerator #2125
* fix: resolve import formatting and line length issues
* Fixed recommended changes #2186: anthropic image support
* Fix mime type validation
* refinements

---------

Co-authored-by: anakin87 <[email protected]>
1 parent 1b40c2a commit 66a8d9b

File tree

5 files changed

+176
-24
lines changed

5 files changed

+176
-24
lines changed

integrations/anthropic/pyproject.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,8 @@ dependencies = [
5757
"pytest-cov",
5858
"pytest-rerunfailures",
5959
"mypy",
60-
"pip"
60+
"pip",
61+
"pillow", # image support
6162
]
6263

6364
[tool.hatch.envs.test.scripts]

integrations/anthropic/src/haystack_integrations/components/generators/anthropic/chat/chat_generator.py

Lines changed: 87 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
11
from datetime import datetime, timezone
2-
from typing import Any, ClassVar, Dict, List, Literal, Optional, Tuple, Union
2+
from typing import Any, ClassVar, Dict, List, Literal, Optional, Tuple, Union, cast, get_args
33

44
from haystack import component, default_from_dict, default_to_dict, logging
55
from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message
6-
from haystack.dataclasses.chat_message import ChatMessage, ChatRole, ToolCall, ToolCallResult
6+
from haystack.dataclasses.chat_message import ChatMessage, ChatRole, TextContent, ToolCall, ToolCallResult
7+
from haystack.dataclasses.image_content import ImageContent
78
from haystack.dataclasses.streaming_chunk import (
89
AsyncStreamingCallbackT,
910
ComponentInfo,
@@ -26,11 +27,23 @@
2627

2728
from anthropic import Anthropic, AsyncAnthropic
2829
from anthropic.resources.messages.messages import Message, RawMessageStreamEvent, Stream
29-
from anthropic.types import MessageParam, TextBlockParam, ToolParam, ToolResultBlockParam, ToolUseBlockParam
30+
from anthropic.types import (
31+
ImageBlockParam,
32+
MessageParam,
33+
TextBlockParam,
34+
ToolParam,
35+
ToolResultBlockParam,
36+
ToolUseBlockParam,
37+
)
3038

3139
logger = logging.getLogger(__name__)
3240

3341

# MIME types accepted for image blocks.
# See https://docs.anthropic.com/en/api/messages for supported formats
ImageFormat = Literal["image/jpeg", "image/png", "image/gif", "image/webp"]
# Runtime list derived from the Literal so the type alias and the validation list cannot drift apart.
# Annotated with `typing.List` (as the rest of this module does): a module-level annotation is evaluated
# at import time, so the builtin generic `list[...]` would fail on interpreters without PEP 585 support.
IMAGE_SUPPORTED_FORMATS: List[ImageFormat] = list(get_args(ImageFormat))
45+
46+
3447
# Mapping from Anthropic stop reasons to Haystack FinishReason values
3548
FINISH_REASON_MAPPING: Dict[str, FinishReason] = {
3649
"end_turn": "stop",
@@ -44,7 +57,7 @@
4457

4558
def _update_anthropic_message_with_tool_call_results(
4659
tool_call_results: List[ToolCallResult],
47-
content: List[Union[TextBlockParam, ToolUseBlockParam, ToolResultBlockParam]],
60+
content: List[Union[TextBlockParam, ToolUseBlockParam, ToolResultBlockParam, ImageBlockParam]],
4861
) -> None:
4962
"""
5063
Update an Anthropic message content list with tool call results.
@@ -119,13 +132,39 @@ def _convert_messages_to_anthropic_format(
119132
i += 1
120133
continue
121134

122-
content: List[Union[TextBlockParam, ToolUseBlockParam, ToolResultBlockParam]] = []
123-
124-
if message.texts and message.texts[0]:
125-
text_block = TextBlockParam(type="text", text=message.texts[0])
126-
if cache_control:
127-
text_block["cache_control"] = cache_control
128-
content.append(text_block)
135+
content: List[Union[TextBlockParam, ToolUseBlockParam, ToolResultBlockParam, ImageBlockParam]] = []
136+
137+
# Handle multimodal content (text and images) preserving order
138+
for part in message._content:
139+
if isinstance(part, TextContent) and part.text:
140+
text_block = TextBlockParam(type="text", text=part.text)
141+
if cache_control:
142+
text_block["cache_control"] = cache_control
143+
content.append(text_block)
144+
elif isinstance(part, ImageContent):
145+
if not message.is_from(ChatRole.USER):
146+
msg = "Image content is only supported for user messages"
147+
raise ValueError(msg)
148+
149+
if part.mime_type not in IMAGE_SUPPORTED_FORMATS:
150+
supported_formats = ", ".join(IMAGE_SUPPORTED_FORMATS)
151+
msg = (
152+
f"Unsupported image format: {part.mime_type}. "
153+
f"Anthropic supports the following formats: {supported_formats}"
154+
)
155+
raise ValueError(msg)
156+
157+
image_block = ImageBlockParam(
158+
type="image",
159+
source={
160+
"type": "base64",
161+
"media_type": cast(ImageFormat, part.mime_type),
162+
"data": part.base64_image,
163+
},
164+
)
165+
if cache_control:
166+
image_block["cache_control"] = cache_control
167+
content.append(image_block)
129168

130169
if message.tool_calls:
131170
tool_use_blocks = _convert_tool_calls_to_anthropic_format(message.tool_calls)
@@ -148,7 +187,10 @@ def _convert_messages_to_anthropic_format(
148187
blk["cache_control"] = cache_control
149188

150189
if not content:
151-
msg = "A `ChatMessage` must contain at least one `TextContent`, `ToolCall`, or `ToolCallResult`."
190+
msg = (
191+
"A `ChatMessage` must contain at least one `TextContent`, `ImageContent`, "
192+
"`ToolCall`, or `ToolCallResult`."
193+
)
152194
raise ValueError(msg)
153195

154196
# Anthropic only supports assistant and user roles in messages. User role is also used for tool messages.
@@ -170,7 +212,7 @@ class AnthropicChatGenerator:
170212
Completes chats using Anthropic's large language models (LLMs).
171213
172214
It uses [ChatMessage](https://docs.haystack.deepset.ai/docs/data-classes#chatmessage)
173-
format in input and output.
215+
format in input and output. Supports multimodal inputs including text and images.
174216
175217
You can customize how the text is generated by passing parameters to the
176218
Anthropic API. Use the `**generation_kwargs` argument when you initialize
@@ -182,18 +224,41 @@ class AnthropicChatGenerator:
182224
183225
Usage example:
184226
```python
185-
from haystack_integrations.components.generators.anthropic import AnthropicChatGenerator
227+
from haystack_integrations.components.generators.anthropic import (
228+
AnthropicChatGenerator,
229+
)
186230
from haystack.dataclasses import ChatMessage
187231
188-
generator = AnthropicChatGenerator(model="claude-sonnet-4-20250514",
189-
generation_kwargs={
190-
"max_tokens": 1000,
191-
"temperature": 0.7,
192-
})
193-
194-
messages = [ChatMessage.from_system("You are a helpful, respectful and honest assistant"),
195-
ChatMessage.from_user("What's Natural Language Processing?")]
232+
generator = AnthropicChatGenerator(
233+
model="claude-sonnet-4-20250514",
234+
generation_kwargs={
235+
"max_tokens": 1000,
236+
"temperature": 0.7,
237+
},
238+
)
239+
240+
messages = [
241+
ChatMessage.from_system(
242+
"You are a helpful, respectful and honest assistant"
243+
),
244+
ChatMessage.from_user("What's Natural Language Processing?"),
245+
]
196246
print(generator.run(messages=messages))
247+
```
248+
249+
Usage example with images:
250+
```python
251+
from haystack.dataclasses import ChatMessage, ImageContent
252+
253+
image_content = ImageContent.from_file_path("path/to/image.jpg")
254+
messages = [
255+
ChatMessage.from_user(
256+
content_parts=["What's in this image?", image_content]
257+
)
258+
]
259+
generator = AnthropicChatGenerator()
260+
result = generator.run(messages)
261+
```
197262
"""
198263

199264
# The parameters that can be passed to the Anthropic API https://docs.anthropic.com/claude/reference/messages_post

integrations/anthropic/tests/conftest.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
from pathlib import Path
12
from unittest.mock import patch
23

34
import pytest
@@ -43,3 +44,8 @@ def mock_chat_completion_extended_thinking():
4344

4445
mock_chat_completion_create.return_value = completion
4546
yield mock_chat_completion_create
47+
48+
@pytest.fixture()
def test_files_path():
    """Return the path of the ``test_files`` directory that sits next to this module."""
    tests_dir = Path(__file__).parent
    return tests_dir / "test_files"

integrations/anthropic/tests/test_chat_generator.py

Lines changed: 81 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,15 @@
2525
from anthropic.types.raw_message_delta_event import Delta
2626
from haystack import Pipeline
2727
from haystack.components.generators.utils import _convert_streaming_chunks_to_chat_message, print_streaming_chunk
28-
from haystack.dataclasses import ChatMessage, ChatRole, ComponentInfo, StreamingChunk, ToolCall, ToolCallDelta
28+
from haystack.dataclasses import (
29+
ChatMessage,
30+
ChatRole,
31+
ComponentInfo,
32+
ImageContent,
33+
StreamingChunk,
34+
ToolCall,
35+
ToolCallDelta,
36+
)
2937
from haystack.tools import Tool, Toolset
3038
from haystack.utils.auth import Secret
3139

@@ -1155,6 +1163,53 @@ def test_convert_message_to_anthropic_format_complex(self):
11551163
},
11561164
]
11571165

def test_convert_message_to_anthropic_format_with_image(self):
    """A user message holding text plus an image converts to a text block followed by a base64 image block."""
    png_b64 = (
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
    )
    user_message = ChatMessage.from_user(
        content_parts=["What's in this image?", ImageContent(base64_image=png_b64, mime_type="image/png")]
    )

    _system_messages, non_system_messages = _convert_messages_to_anthropic_format([user_message])

    assert len(non_system_messages) == 1
    converted = non_system_messages[0]
    assert converted["role"] == "user"
    assert len(converted["content"]) == 2

    # Order of the content parts must be preserved: text first, image second.
    text_block, image_block = converted["content"]
    assert text_block["type"] == "text"
    assert text_block["text"] == "What's in this image?"
    assert image_block["type"] == "image"
    image_source = image_block["source"]
    assert image_source["type"] == "base64"
    assert image_source["media_type"] == "image/png"
    assert image_source["data"] == png_b64
1188+
def test_convert_message_to_anthropic_format_with_unsupported_mime_type(self):
    """Converting a user message whose image has mime type ``image/bmp`` must raise ``ValueError``."""
    png_b64 = (
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
    )
    # Unsupported format
    bmp_image = ImageContent(base64_image=png_b64, mime_type="image/bmp")
    user_message = ChatMessage.from_user(content_parts=["What's in this image?", bmp_image])

    with pytest.raises(ValueError, match="Unsupported image format: image/bmp"):
        _convert_messages_to_anthropic_format([user_message])
1199+
def test_convert_message_to_anthropic_format_with_none_mime_type(self):
    """Converting a user message whose image has ``mime_type`` set to ``None`` must raise ``ValueError``."""
    png_b64 = (
        "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mP8/5+hHgAHggJ/PchI7wAAAABJRU5ErkJggg=="
    )
    image_without_mime = ImageContent(base64_image=png_b64, mime_type="image/png")
    # Build with a valid mime type first, then overwrite it with None by hand to test the validation.
    image_without_mime.mime_type = None
    user_message = ChatMessage.from_user(content_parts=["What's in this image?", image_without_mime])

    with pytest.raises(ValueError, match="Unsupported image format: None"):
        _convert_messages_to_anthropic_format([user_message])
1212+
11581213
def test_convert_message_to_anthropic_invalid(self):
11591214
"""
11601215
Test that the AnthropicChatGenerator component fails to convert an invalid ChatMessage to Anthropic format.
@@ -1564,6 +1619,7 @@ def test_convert_messages_attaches_cache_control(self):
15641619
assert non_sys[0]["content"][0]["cache_control"]["type"] == "ephemeral"
15651620

15661621
@pytest.mark.skipif(not os.environ.get("ANTHROPIC_API_KEY", None), reason="ANTHROPIC_API_KEY not set")
1622+
@pytest.mark.integration
15671623
@pytest.mark.parametrize("cache_enabled", [True, False])
15681624
def test_prompt_caching_live_run(self, cache_enabled):
15691625
generation_kwargs = {"extra_headers": {"anthropic-beta": "prompt-caching-2024-07-31"}} if cache_enabled else {}
@@ -1595,6 +1651,7 @@ def test_prompt_caching_live_run(self, cache_enabled):
15951651
assert token_usage["cache_read_input_tokens"] == 0
15961652

15971653
@pytest.mark.skipif(not os.environ.get("ANTHROPIC_API_KEY"), reason="ANTHROPIC_API_KEY not set")
1654+
@pytest.mark.integration
15981655
@pytest.mark.parametrize("cache_enabled", [True, False])
15991656
def test_prompt_caching_live_run_with_user_message(self, cache_enabled):
16001657
claude_llm = AnthropicChatGenerator(
@@ -1799,3 +1856,26 @@ async def test_live_run_async_with_tools(self, tools):
17991856
assert len(final_message.text) > 0
18001857
assert "paris" in final_message.text.lower()
18011858
assert "completion_tokens" in final_message.meta["usage"]
1859+
@pytest.mark.integration
@pytest.mark.skipif(
    not os.environ.get("ANTHROPIC_API_KEY", None),
    reason="Export an env var called ANTHROPIC_API_KEY containing the Anthropic token to run this test.",
)
def test_live_run_multimodal(self, test_files_path):
    """Integration test: send a text + image user message to the real API and check the reply text."""
    # Resize the image to keep this test fast
    apple_image = ImageContent.from_file_path(file_path=test_files_path / "apple.jpg", size=(100, 100))
    prompt = ChatMessage.from_user(content_parts=["What does this image show? Max 5 words", apple_image])

    llm = AnthropicChatGenerator(generation_kwargs={"max_tokens": 20})
    response = llm.run(messages=[prompt])

    assert "replies" in response
    replies = response["replies"]
    assert isinstance(replies, list)
    assert len(replies) > 0

    reply_text = replies[0].text
    assert reply_text
    assert len(reply_text) > 0
    # The reply is free-form, so accept any of a few keywords.
    lowered = reply_text.lower()
    assert any(keyword in lowered for keyword in ["apple", "fruit", "red"])
67.7 KB
Loading

0 commit comments

Comments
 (0)