Skip to content

Commit 96886af

Browse files
authored
fix(responses): fix regression in support for mcp tool require_approval argument (#3731)
# What does this PR do? It prevents a tool call message from being added to the chat completions messages without a corresponding tool call result, which is needed in the case that an approval is required first or if the approval request is denied. In both these cases the tool call message is popped off the next-turn messages. Closes #3728 ## Test Plan Ran the integration tests Manual check of both approval and denial against gpt-4o Signed-off-by: Gordon Sim <[email protected]>
1 parent 5d711d4 commit 96886af

File tree

2 files changed

+6
-4
lines changed

2 files changed

+6
-4
lines changed

llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,7 @@ async def _create_streaming_response(
269269
response_tools=tools,
270270
temperature=temperature,
271271
response_format=response_format,
272-
inputs=input,
272+
inputs=all_input,
273273
)
274274

275275
# Create orchestrator and delegate streaming logic

llama_stack/providers/inline/agents/meta_reference/responses/streaming.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,8 @@ async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
175175
):
176176
yield stream_event
177177

178+
messages = next_turn_messages
179+
178180
if not function_tool_calls and not non_function_tool_calls:
179181
break
180182

@@ -187,9 +189,7 @@ async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
187189
logger.info(f"Exiting inference loop since iteration count({n_iter}) exceeds {self.max_infer_iters=}")
188190
break
189191

190-
messages = next_turn_messages
191-
192-
self.final_messages = messages.copy() + [current_response.choices[0].message]
192+
self.final_messages = messages.copy()
193193

194194
# Create final response
195195
final_response = OpenAIResponseObject(
@@ -232,9 +232,11 @@ def _separate_tool_calls(self, current_response, messages) -> tuple[list, list,
232232
non_function_tool_calls.append(tool_call)
233233
else:
234234
logger.info(f"Approval denied for {tool_call.id} on {tool_call.function.name}")
235+
next_turn_messages.pop()
235236
else:
236237
logger.info(f"Requesting approval for {tool_call.id} on {tool_call.function.name}")
237238
approvals.append(tool_call)
239+
next_turn_messages.pop()
238240
else:
239241
non_function_tool_calls.append(tool_call)
240242

0 commit comments

Comments
 (0)