 from haystack.dataclasses import (
     ChatMessage,
     ComponentInfo,
+    ImageContent,
     StreamingCallbackT,
+    TextContent,
     ToolCall,
     ToolCallDelta,
     select_streaming_callback,
...
     ChatCompletionMessageToolCall,
     ChatCompletionRequestAssistantMessage,
     ChatCompletionRequestMessage,
+    ChatCompletionRequestMessageContentPart,
     ChatCompletionResponseChoice,
     ChatCompletionTool,
     CreateChatCompletionResponse,
     CreateChatCompletionStreamResponse,
     Llama,
+    llama_chat_format,
 )
+from llama_cpp.llama_chat_format import Llava15ChatHandler
 from llama_cpp.llama_tokenizer import LlamaHFTokenizer
 
 logger = logging.getLogger(__name__)
...
     "function_call": "tool_calls",
 }
 
+SUPPORTED_IMAGE_FORMATS = ["image/jpeg", "image/jpg", "image/png", "image/gif", "image/webp"]
+
 
 def _convert_message_to_llamacpp_format(message: ChatMessage) -> ChatCompletionRequestMessage:
     """
@@ -50,16 +57,24 @@ def _convert_message_to_llamacpp_format(message: ChatMessage) -> ChatCompletionR
     text_contents = message.texts
     tool_calls = message.tool_calls
     tool_call_results = message.tool_call_results
+    images = message.images
 
-    if not text_contents and not tool_calls and not tool_call_results:
-        msg = "A `ChatMessage` must contain at least one `TextContent`, `ToolCall`, or `ToolCallResult`."
+    if not text_contents and not tool_calls and not tool_call_results and not images:
+        msg = (
+            "A `ChatMessage` must contain at least one `TextContent`, `ImageContent`, `ToolCall`, or `ToolCallResult`."
+        )
         raise ValueError(msg)
     elif len(text_contents) + len(tool_call_results) > 1:
-        msg = "A `ChatMessage` can only contain one `TextContent` or one `ToolCallResult`."
+        msg = "For llama.cpp compatibility, a `ChatMessage` can contain at most one `TextContent` or `ToolCallResult`."
         raise ValueError(msg)
 
     role = message._role.value
 
+    # Check that images are only in user messages
+    if images and role != "user":
+        msg = "Image content is only supported for user messages"
+        raise ValueError(msg)
+
     if role == "tool" and tool_call_results:
         if tool_call_results[0].origin.id is None:
             msg = "`ToolCall` must have a non-null `id` attribute to be used with llama.cpp."
@@ -71,12 +86,34 @@ def _convert_message_to_llamacpp_format(message: ChatMessage) -> ChatCompletionR
         }
 
     if role == "system":
-        content = text_contents[0] if text_contents else None
-        return {"role": "system", "content": content}
+        return {"role": "system", "content": text_contents[0]}
 
     if role == "user":
-        content = text_contents[0] if text_contents else None
-        return {"role": "user", "content": content}
+        # Handle multimodal content (text + images) preserving order
+        if images:
+            # Check image constraints for LlamaCpp
+            for image in images:
+                if image.mime_type not in SUPPORTED_IMAGE_FORMATS:
+                    supported_formats = ", ".join(SUPPORTED_IMAGE_FORMATS)
+                    msg = (
+                        f"Unsupported image format: {image.mime_type}. "
+                        f"LlamaCpp supports the following formats: {supported_formats}"
+                    )
+                    raise ValueError(msg)
+
+            content_parts: list[ChatCompletionRequestMessageContentPart] = []
+            for part in message._content:
+                if isinstance(part, TextContent) and part.text:
+                    content_parts.append({"type": "text", "text": part.text})
+                elif isinstance(part, ImageContent):
+                    # LlamaCpp expects base64 data URI format
+                    image_url = f"data:{part.mime_type};base64,{part.base64_image}"
+                    content_parts.append({"type": "image_url", "image_url": {"url": image_url}})
+
+            return {"role": "user", "content": content_parts}
+
+        # Simple text-only message
+        return {"role": "user", "content": text_contents[0]}
 
     if role == "assistant":
         result: ChatCompletionRequestAssistantMessage = {"role": "assistant"}
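For reference, this is the payload shape the converter now emits for a multimodal user message; a sketch assuming a JPEG `ImageContent`, with the file path as a placeholder and the base64 payload abbreviated:

```python
from haystack.dataclasses import ChatMessage, ImageContent

image_content = ImageContent.from_file_path("path/to/your/image.jpg")  # placeholder path
message = ChatMessage.from_user(content_parts=["What's in this image?", image_content])

_convert_message_to_llamacpp_format(message)
# {
#     "role": "user",
#     "content": [
#         {"type": "text", "text": "What's in this image?"},
#         {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64,/9j/4AAQ..."}},
#     ],
# }
```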
@@ -113,6 +150,7 @@ class LlamaCppChatGenerator:
 
     [llama.cpp](https://github.com/ggml-org/llama.cpp) is a project written in C/C++ for efficient inference of LLMs.
     It employs the quantized GGUF format, suitable for running these models on standard machines (even without GPUs).
+    Supports both text-only and multimodal (text + image) models like LLaVA.
 
     Usage example:
     ```python
@@ -121,7 +159,30 @@ class LlamaCppChatGenerator:
     generator = LlamaCppGenerator(model="zephyr-7b-beta.Q4_0.gguf", n_ctx=2048, n_batch=512)
 
     print(generator.run(user_message, generation_kwargs={"max_tokens": 128}))
-    # {"replies": [ChatMessage(content="John Cusack", role=<ChatRole.ASSISTANT: "assistant">, name=None, meta={...}]}
+    # {"replies": [ChatMessage(content="John Cusack", role=<ChatRole.ASSISTANT: "assistant">, name=None, meta={...})]}
     ```
+
+    Usage example with multimodal input (image + text):
+    ```python
+    from haystack.dataclasses import ChatMessage, ImageContent
+
+    # Create an image from a file path or base64
+    image_content = ImageContent.from_file_path("path/to/your/image.jpg")
+
+    # Create a multimodal message with both text and image
+    messages = [ChatMessage.from_user(content_parts=["What's in this image?", image_content])]
+
+    # Initialize with multimodal support
+    generator = LlamaCppChatGenerator(
+        model="llava-v1.5-7b-q4_0.gguf",
+        chat_handler_name="Llava15ChatHandler",  # Use the LLaVA 1.5 handler
+        model_clip_path="mmproj-model-f16.gguf",  # CLIP model
+        n_ctx=4096,  # Larger context for image processing
+    )
+    generator.warm_up()
+
+    result = generator.run(messages)
+    print(result)
     ```
     """
127
188
@@ -135,6 +196,8 @@ def __init__(
         *,
         tools: Optional[Union[List[Tool], Toolset]] = None,
         streaming_callback: Optional[StreamingCallbackT] = None,
+        chat_handler_name: Optional[str] = None,
+        model_clip_path: Optional[str] = None,
     ):
         """
         :param model: The path of a quantized model for text generation, for example, "zephyr-7b-beta.Q4_0.gguf".
@@ -153,6 +216,12 @@
             A list of tools or a Toolset for which the model can prepare calls.
             This parameter can accept either a list of `Tool` objects or a `Toolset` instance.
         :param streaming_callback: A callback function that is called when a new token is received from the stream.
+        :param chat_handler_name: Name of the chat handler for multimodal models.
+            Common options include "Llava16ChatHandler", "MoondreamChatHandler", and "Qwen25VLChatHandler".
+            For other handlers, see the
+            [llama-cpp-python documentation](https://llama-cpp-python.readthedocs.io/en/latest/#multi-modal-models).
+        :param model_clip_path: Path to the CLIP model for vision processing (e.g., "mmproj.bin").
+            Required when chat_handler_name is provided for multimodal models.
         """
 
         model_kwargs = model_kwargs or {}
@@ -166,6 +235,19 @@
 
         _check_duplicate_tool_names(list(tools or []))
 
+        handler: Optional[Llava15ChatHandler] = None
+        # Validate multimodal requirements
+        if chat_handler_name is not None:
+            if model_clip_path is None:
+                msg = "model_clip_path must be provided when chat_handler_name is specified for multimodal models"
+                raise ValueError(msg)
+            # Validate the chat handler name by looking it up in llama_chat_format
+            try:
+                handler = getattr(llama_chat_format, chat_handler_name)
+            except AttributeError as e:
+                msg = f"Failed to import chat handler '{chat_handler_name}'."
+                raise ValueError(msg) from e
+
         self.model_path = model
         self.n_ctx = n_ctx
         self.n_batch = n_batch
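`chat_handler_name` is resolved with `getattr` against `llama_cpp.llama_chat_format`, so any handler class exported by that module can be named. A minimal sketch of the lookup and of what `warm_up` later does with the result (the mmproj path is a placeholder):

```python
from llama_cpp import llama_chat_format

# Same lookup as in __init__: resolves the handler class, or raises AttributeError for unknown names
handler_cls = getattr(llama_chat_format, "Llava15ChatHandler")

# warm_up() instantiates the class with the CLIP projector and passes it to Llama(chat_handler=...);
# note that constructing the handler loads the CLIP model from disk.
chat_handler = handler_cls(clip_model_path="mmproj-model-f16.gguf")
```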
@@ -174,14 +256,25 @@
         self._model: Optional[Llama] = None
         self.tools = tools
         self.streaming_callback = streaming_callback
+        self.chat_handler_name = chat_handler_name
+        self.model_clip_path = model_clip_path
+        self._handler = handler
 
     def warm_up(self):
-        if "hf_tokenizer_path" in self.model_kwargs and "tokenizer" not in self.model_kwargs:
-            tokenizer = LlamaHFTokenizer.from_pretrained(self.model_kwargs["hf_tokenizer_path"])
-            self.model_kwargs["tokenizer"] = tokenizer
+        if self._model is not None:
+            return
 
-        if self._model is None:
-            self._model = Llama(**self.model_kwargs)
+        kwargs = self.model_kwargs.copy()
+        if "hf_tokenizer_path" in kwargs and "tokenizer" not in kwargs:
+            tokenizer = LlamaHFTokenizer.from_pretrained(kwargs["hf_tokenizer_path"])
+            kwargs["tokenizer"] = tokenizer
+
+        # Handle multimodal initialization
+        if self._handler is not None and self.model_clip_path is not None:
+            # the following call is correct, but mypy complains because handlers also have a __call__ method
+            kwargs["chat_handler"] = self._handler(clip_model_path=self.model_clip_path)  # type: ignore[call-arg]
+
+        self._model = Llama(**kwargs)
 
     def to_dict(self) -> Dict[str, Any]:
         """
@@ -200,6 +293,8 @@ def to_dict(self) -> Dict[str, Any]:
             generation_kwargs=self.generation_kwargs,
             tools=serialize_tools_or_toolset(self.tools),
             streaming_callback=callback_name,
+            chat_handler_name=self.chat_handler_name,
+            model_clip_path=self.model_clip_path,
         )
 
     @classmethod
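The two new parameters are plain strings, so they serialize as-is. A sketch of the round trip, assuming Haystack's usual `type`/`init_parameters` envelope and that the `@classmethod` following `to_dict` is the matching `from_dict`:

```python
generator = LlamaCppChatGenerator(
    model="llava-v1.5-7b-q4_0.gguf",          # placeholder paths
    chat_handler_name="Llava15ChatHandler",
    model_clip_path="mmproj-model-f16.gguf",
)

data = generator.to_dict()
# data["init_parameters"] now also carries:
#   "chat_handler_name": "Llava15ChatHandler"
#   "model_clip_path": "mmproj-model-f16.gguf"

restored = LlamaCppChatGenerator.from_dict(data)  # assumed matching classmethod
```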