Fix content input conversion for OpenAI Responses API (#8993)

Copilot · TomeHirata · web-flow · commit 7e44517fdfb7 · 2025-11-06T14:02:13.000+09:00
* Initial plan

* Fix image input handling for OpenAI Responses API

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>

* Fix linting issues in image conversion code

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>

* Add better error handling for malformed data URIs

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>

* Remove docstrings from test cases

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>

* Fix image format to match OpenAI Responses API documentation

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>

* Convert text items from 'text' to 'input_text' for Responses API

Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>

---------

Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: TomeHirata <33407409+TomeHirata@users.noreply.github.com>
diff --git a/dspy/clients/lm.py b/dspy/clients/lm.py
@@ -468,7 +468,9 @@ def _convert_chat_request_to_responses_request(request: dict[str, Any]):
             if isinstance(c, str):
                 content_blocks.append({"type": "input_text", "text": c})
             elif isinstance(c, list):
-                content_blocks.extend(c)
+                # Convert each content item from Chat API format to Responses API format
+                for item in c:
+                    content_blocks.append(_convert_content_item_to_responses_format(item))
         request["input"] = [{"role": msg.get("role", "user"), "content": content_blocks}]
 
     # Convert `response_format` to `text.format` for Responses API
@@ -480,6 +482,38 @@ def _convert_chat_request_to_responses_request(request: dict[str, Any]):
     return request
 
 
+def _convert_content_item_to_responses_format(item: dict[str, Any]) -> dict[str, Any]:
+    """
+    Convert a content item from Chat API format to Responses API format.
+
+    For images, converts from:
+        {"type": "image_url", "image_url": {"url": "...", "detail": "..."}}
+    To:
+        {"type": "input_image", "image_url": "...", "detail": "..."}
+    `detail` is forwarded only when present; a bare-string `image_url` is tolerated as the URL.
+
+    For text, converts from:
+        {"type": "text", "text": "..."}
+    To:
+        {"type": "input_text", "text": "..."}
+
+    For other types, passes through as-is (the same object is returned, not a copy).
+    """
+    if item.get("type") == "image_url":
+        image = item.get("image_url") or {}  # a missing or None payload yields an empty URL
+        if isinstance(image, str):  # tolerate the {"image_url": "<url>"} shorthand
+            image = {"url": image}
+        converted = {"type": "input_image", "image_url": image.get("url", "")}
+        if image.get("detail") is not None:  # keep the caller's detail setting instead of dropping it
+            converted["detail"] = image["detail"]
+        return converted
+    elif item.get("type") == "text":
+        return {"type": "input_text", "text": item.get("text", "")}
+
+    # For other items, return as-is
+    return item
+
+
 def _get_headers(headers: dict[str, Any] | None = None):
     headers = headers or {}
     return {
diff --git a/tests/clients/test_lm.py b/tests/clients/test_lm.py
@@ -343,8 +343,8 @@ def test_reasoning_model_requirements(model_name):
     lm = dspy.LM(
         model=model_name,
     )
-    assert lm.kwargs["temperature"] == None
-    assert lm.kwargs["max_completion_tokens"] == None
+    assert lm.kwargs["temperature"] is None
+    assert lm.kwargs["max_completion_tokens"] is None
 
 
 def test_dump_state():
@@ -633,3 +633,127 @@ def test_api_key_not_saved_in_json():
         assert saved_state["lm"]["model"] == "openai/gpt-4o-mini"
         assert saved_state["lm"]["temperature"] == 1.0
         assert saved_state["lm"]["max_tokens"] == 100
+
+
+def test_responses_api_converts_images_correctly():
+    from dspy.clients.lm import _convert_chat_request_to_responses_request
+
+    # Case 1: one user message holding a text part and a base64 data-URI image part
+    request_with_base64_image = {
+        "model": "openai/gpt-5-mini",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "What's in this image?"},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
+                        }
+                    }
+                ]
+            }
+        ]
+    }
+
+    result = _convert_chat_request_to_responses_request(request_with_base64_image)
+
+    assert "input" in result
+    assert len(result["input"]) == 1  # the single chat message becomes a single input entry
+    assert result["input"][0]["role"] == "user"
+
+    content = result["input"][0]["content"]
+    assert len(content) == 2  # text part + image part
+
+    # First item: Chat-style "text" must come back as Responses-style "input_text", text unchanged
+    assert content[0]["type"] == "input_text"
+    assert content[0]["text"] == "What's in this image?"
+
+    # Second item: nested {"image_url": {"url": ...}} must be flattened to "input_image" with a string URL
+    assert content[1]["type"] == "input_image"
+    assert content[1]["image_url"] == "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
+
+    # Case 2: a message whose only content part is an image referenced by an https URL
+    request_with_url_image = {
+        "model": "openai/gpt-5-mini",
+        "messages": [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "https://example.com/image.jpg"
+                        }
+                    }
+                ]
+            }
+        ]
+    }
+
+    result = _convert_chat_request_to_responses_request(request_with_url_image)
+
+    content = result["input"][0]["content"]
+    assert len(content) == 1  # only the image part
+    assert content[0]["type"] == "input_image"
+    assert content[0]["image_url"] == "https://example.com/image.jpg"
+
+
+def test_responses_api_with_image_input():
+    api_response = make_response(  # canned reply that the patched litellm.responses returns below
+        output_blocks=[
+            ResponseOutputMessage(
+                **{
+                    "id": "msg_1",
+                    "type": "message",
+                    "role": "assistant",
+                    "status": "completed",
+                    "content": [
+                        {"type": "output_text", "text": "This is a test answer with image input.", "annotations": []}
+                    ],
+                },
+            ),
+        ]
+    )
+
+    with mock.patch("litellm.responses", autospec=True, return_value=api_response) as dspy_responses:
+        lm = dspy.LM(
+            model="openai/gpt-5-mini",
+            model_type="responses",
+            cache=False,
+            temperature=1.0,
+            max_tokens=16000,
+        )
+
+        # Chat-format messages: a text part plus a base64 data-URI image part
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": "Describe this image"},
+                    {
+                        "type": "image_url",
+                        "image_url": {
+                            "url": "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="
+                        }
+                    }
+                ]
+            }
+        ]
+
+        lm_result = lm(messages=messages)
+
+        assert lm_result == [{"text": "This is a test answer with image input."}]
+
+        dspy_responses.assert_called_once()
+        call_args = dspy_responses.call_args.kwargs  # keyword arguments of that single patched call
+
+        # The outgoing request must carry a Responses-style "input" entry
+        assert "input" in call_args
+        content = call_args["input"][0]["content"]
+
+        # Exactly one part must be an "input_image" whose image_url is the original data-URI string
+        image_content = [c for c in content if c.get("type") == "input_image"]
+        assert len(image_content) == 1
+        assert image_content[0]["image_url"] == "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNk+M9QDwADhgGAWjR9awAAAABJRU5ErkJggg=="