diff --git a/dspy/clients/lm.py b/dspy/clients/lm.py
index 71a8934cc4..19085bea81 100644
--- a/dspy/clients/lm.py
+++ b/dspy/clients/lm.py
@@ -468,7 +468,9 @@ def _convert_chat_request_to_responses_request(request: dict[str, Any]):
             if isinstance(c, str):
                 content_blocks.append({"type": "input_text", "text": c})
             elif isinstance(c, list):
-                content_blocks.extend(c)
+                # Convert each content item from Chat API format to Responses API format
+                for item in c:
+                    content_blocks.append(_convert_content_item_to_responses_format(item))
         request["input"] = [{"role": msg.get("role", "user"), "content": content_blocks}]
 
     # Convert `response_format` to `text.format` for Responses API
@@ -480,6 +482,38 @@ def _convert_chat_request_to_responses_request(request: dict[str, Any]):
     return request
 
 
+def _convert_content_item_to_responses_format(item: dict[str, Any]) -> dict[str, Any]:
+    """
+    Convert a content item from Chat API format to Responses API format.
+
+    For images, converts from:
+    {"type": "image_url", "image_url": {"url": "..."}}
+    To:
+    {"type": "input_image", "image_url": "..."}
+
+    For text, converts from:
+    {"type": "text", "text": "..."}
+    To:
+    {"type": "input_text", "text": "..."}
+
+    For other types, passes through as-is.
+    """
+    if item.get("type") == "image_url":
+        image_url = item.get("image_url", {}).get("url", "")
+        return {
+            "type": "input_image",
+            "image_url": image_url,
+        }
+    elif item.get("type") == "text":
+        return {
+            "type": "input_text",
+            "text": item.get("text", ""),
+        }
+
+    # For other items, return as-is
+    return item
+
+
 def _get_headers(headers: dict[str, Any] | None = None):
     headers = headers or {}
     return {
diff --git a/tests/clients/test_lm.py b/tests/clients/test_lm.py
index 336ccc41b4..a24c09a9d4 100644
--- a/tests/clients/test_lm.py
+++ b/tests/clients/test_lm.py
@@ -343,8 +343,8 @@ def test_reasoning_model_requirements(model_name):
     lm = dspy.LM(
         model=model_name,
     )
-    assert lm.kwargs["temperature"] == None
-    assert lm.kwargs["max_completion_tokens"] == None
+    assert lm.kwargs["temperature"] is None
+    assert lm.kwargs["max_completion_tokens"] is None
 
 
 def test_dump_state():
@@ -633,3 +633,127 @@ def test_api_key_not_saved_in_json():
     assert saved_state["lm"]["model"] == "openai/gpt-4o-mini"
     assert saved_state["lm"]["temperature"] == 1.0
     assert saved_state["lm"]["max_tokens"] == 100
+
+
+def test_responses_api_converts_images_correctly():
+    from dspy.clients.lm import _convert_chat_request_to_responses_request
+
+    # Test with base64 image
+    request_with_base64_image = {
+        "model": "openai/gpt-5-mini",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this image?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
+                        }
+                    }
+                ]
+            }
+        ]
+    }
+
+    result = _convert_chat_request_to_responses_request(request_with_base64_image)
+
+    assert "input" in result
+    assert len(result["input"]) == 1
+    assert result["input"][0]["role"] == "user"
+
+    content = result["input"][0]["content"]
+    assert len(content) == 2
+
+    # First item should be text converted to input_text format
+    assert content[0]["type"] == "input_text"
+    assert content[0]["text"] == "What's in this image?"
+
+    # Second item should be converted to input_image format
+    assert content[1]["type"] == "input_image"
+    assert content[1]["image_url"] == "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
+
+    # Test with URL image
+    request_with_url_image = {
+        "model": "openai/gpt-5-mini",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://example.com/image.jpg"
+                        }
+                    }
+                ]
+            }
+        ]
+    }
+
+    result = _convert_chat_request_to_responses_request(request_with_url_image)
+
+    content = result["input"][0]["content"]
+    assert len(content) == 1
+    assert content[0]["type"] == "input_image"
+    assert content[0]["image_url"] == "https://example.com/image.jpg"
+
+
+def test_responses_api_with_image_input():
+    api_response = make_response(
+        output_blocks=[
+            ResponseOutputMessage(
+                **{
+                    "id": "msg_1",
+                    "type": "message",
+                    "role": "assistant",
+                    "status": "completed",
+                    "content": [
+                        {"type": "output_text", "text": "This is a test answer with image input.", "annotations": []}
+                    ],
+                },
+            ),
+        ]
+    )
+
+    with mock.patch("litellm.responses", autospec=True, return_value=api_response) as dspy_responses:
+        lm = dspy.LM(
+            model="openai/gpt-5-mini",
+            model_type="responses",
+            cache=False,
+            temperature=1.0,
+            max_tokens=16000,
+        )
+
+        # Test with messages containing an image
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Describe this image"},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
+                        }
+                    }
+                ]
+            }
+        ]
+
+        lm_result = lm(messages=messages)
+
+        assert lm_result == [{"text": "This is a test answer with image input."}]
+
+        dspy_responses.assert_called_once()
+        call_args = dspy_responses.call_args.kwargs
+
+        # Verify the request was converted correctly
+        assert "input" in call_args
+        content = call_args["input"][0]["content"]
+
+        # Check that image was converted to input_image format
+        image_content = [c for c in content if c.get("type") == "input_image"]
+        assert len(image_content) == 1
+        assert image_content[0]["image_url"] == "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="