Skip to content

Commit 7e44517

Browse files
Copilot and TomeHirata authored
Fix content input conversion for OpenAI Responses API (#8993)
* Initial plan

* Fix image input handling for OpenAI Responses API

Co-authored-by: TomeHirata <[email protected]>

* Fix linting issues in image conversion code

Co-authored-by: TomeHirata <[email protected]>

* Add better error handling for malformed data URIs

Co-authored-by: TomeHirata <[email protected]>

* Remove docstrings from test cases

Co-authored-by: TomeHirata <[email protected]>

* Fix image format to match OpenAI Responses API documentation

Co-authored-by: TomeHirata <[email protected]>

* Convert text items from 'text' to 'input_text' for Responses API

Co-authored-by: TomeHirata <[email protected]>

---------

Co-authored-by: copilot-swe-agent[bot] <[email protected]>
Co-authored-by: TomeHirata <[email protected]>
1 parent 4904d80 commit 7e44517

File tree

2 files changed

+161
-3
lines changed

2 files changed

+161
-3
lines changed

dspy/clients/lm.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -468,7 +468,9 @@ def _convert_chat_request_to_responses_request(request: dict[str, Any]):
468468
if isinstance(c, str):
469469
content_blocks.append({"type": "input_text", "text": c})
470470
elif isinstance(c, list):
471-
content_blocks.extend(c)
471+
# Convert each content item from Chat API format to Responses API format
472+
for item in c:
473+
content_blocks.append(_convert_content_item_to_responses_format(item))
472474
request["input"] = [{"role": msg.get("role", "user"), "content": content_blocks}]
473475

474476
# Convert `response_format` to `text.format` for Responses API
@@ -480,6 +482,38 @@ def _convert_chat_request_to_responses_request(request: dict[str, Any]):
480482
return request
481483

482484

485+
def _convert_content_item_to_responses_format(item: dict[str, Any]) -> dict[str, Any]:
486+
"""
487+
Convert a content item from Chat API format to Responses API format.
488+
489+
For images, converts from:
490+
{"type": "image_url", "image_url": {"url": "..."}}
491+
To:
492+
{"type": "input_image", "image_url": "..."}
493+
494+
For text, converts from:
495+
{"type": "text", "text": "..."}
496+
To:
497+
{"type": "input_text", "text": "..."}
498+
499+
For other types, passes through as-is.
500+
"""
501+
if item.get("type") == "image_url":
502+
image_url = item.get("image_url", {}).get("url", "")
503+
return {
504+
"type": "input_image",
505+
"image_url": image_url,
506+
}
507+
elif item.get("type") == "text":
508+
return {
509+
"type": "input_text",
510+
"text": item.get("text", ""),
511+
}
512+
513+
# For other items, return as-is
514+
return item
515+
516+
483517
def _get_headers(headers: dict[str, Any] | None = None):
484518
headers = headers or {}
485519
return {

tests/clients/test_lm.py

Lines changed: 126 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -343,8 +343,8 @@ def test_reasoning_model_requirements(model_name):
343343
lm = dspy.LM(
344344
model=model_name,
345345
)
346-
assert lm.kwargs["temperature"] == None
347-
assert lm.kwargs["max_completion_tokens"] == None
346+
assert lm.kwargs["temperature"] is None
347+
assert lm.kwargs["max_completion_tokens"] is None
348348

349349

350350
def test_dump_state():
@@ -633,3 +633,127 @@ def test_api_key_not_saved_in_json():
633633
assert saved_state["lm"]["model"] == "openai/gpt-4o-mini"
634634
assert saved_state["lm"]["temperature"] == 1.0
635635
assert saved_state["lm"]["max_tokens"] == 100
636+
637+
638+
def test_responses_api_converts_images_correctly():
    # Imported inside the test because the converter is a private helper of the LM client module.
    from dspy.clients.lm import _convert_chat_request_to_responses_request

    # Case 1: a text part followed by an image part (the image URL literal is an empty string).
    text_part = {"type": "text", "text": "What's in this image?"}
    first_image_part = {"type": "image_url", "image_url": {"url": ""}}
    request_with_base64_image = {
        "model": "openai/gpt-5-mini",
        "messages": [{"role": "user", "content": [text_part, first_image_part]}],
    }

    converted = _convert_chat_request_to_responses_request(request_with_base64_image)

    assert "input" in converted
    assert len(converted["input"]) == 1
    only_message = converted["input"][0]
    assert only_message["role"] == "user"

    blocks = only_message["content"]
    assert len(blocks) == 2
    text_block, image_block = blocks

    # The text part must come out in `input_text` form with its text intact.
    assert text_block["type"] == "input_text"
    assert text_block["text"] == "What's in this image?"

    # The image part must come out in `input_image` form with the URL flattened to a string.
    assert image_block["type"] == "input_image"
    assert image_block["image_url"] == ""

    # Case 2: a single image part that points at a regular URL.
    url_image_part = {"type": "image_url", "image_url": {"url": "https://example.com/image.jpg"}}
    request_with_url_image = {
        "model": "openai/gpt-5-mini",
        "messages": [{"role": "user", "content": [url_image_part]}],
    }

    converted = _convert_chat_request_to_responses_request(request_with_url_image)

    blocks = converted["input"][0]["content"]
    assert len(blocks) == 1
    assert blocks[0]["type"] == "input_image"
    assert blocks[0]["image_url"] == "https://example.com/image.jpg"
701+
702+
703+
def test_responses_api_with_image_input():
    # Reply that the patched `litellm.responses` returns: one completed assistant message.
    assistant_message = ResponseOutputMessage(
        id="msg_1",
        type="message",
        role="assistant",
        status="completed",
        content=[
            {"type": "output_text", "text": "This is a test answer with image input.", "annotations": []},
        ],
    )
    api_response = make_response(output_blocks=[assistant_message])

    # Chat-style user message holding a text part and an image part.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image"},
                {"type": "image_url", "image_url": {"url": ""}},
            ],
        }
    ]

    with mock.patch("litellm.responses", autospec=True, return_value=api_response) as dspy_responses:
        lm = dspy.LM(
            model="openai/gpt-5-mini",
            model_type="responses",
            cache=False,
            temperature=1.0,
            max_tokens=16000,
        )

        lm_result = lm(messages=messages)

        assert lm_result == [{"text": "This is a test answer with image input."}]

        dspy_responses.assert_called_once()
        sent_kwargs = dspy_responses.call_args.kwargs

        # The outgoing call must carry the converted `input` payload.
        assert "input" in sent_kwargs
        sent_blocks = sent_kwargs["input"][0]["content"]

        # Exactly one block should have been rewritten to `input_image` form.
        image_blocks = [block for block in sent_blocks if block.get("type") == "input_image"]
        assert len(image_blocks) == 1
        assert image_blocks[0]["image_url"] == ""

0 commit comments

Comments
 (0)