diff --git a/client-sdks/stainless/openapi.yml b/client-sdks/stainless/openapi.yml
index 4f772b5670..befb734926 100644
--- a/client-sdks/stainless/openapi.yml
+++ b/client-sdks/stainless/openapi.yml
@@ -7472,6 +7472,10 @@ components:
               type: string
             type: object
           - type: 'null'
+        top_logprobs:
+          anyOf:
+          - type: integer
+          - type: 'null'
       type: object
       required:
       - input
diff --git a/docs/static/deprecated-llama-stack-spec.yaml b/docs/static/deprecated-llama-stack-spec.yaml
index eb5aa52044..5db531346e 100644
--- a/docs/static/deprecated-llama-stack-spec.yaml
+++ b/docs/static/deprecated-llama-stack-spec.yaml
@@ -4307,6 +4307,10 @@ components:
               type: string
             type: object
           - type: 'null'
+        top_logprobs:
+          anyOf:
+          - type: integer
+          - type: 'null'
       type: object
       required:
       - input
diff --git a/docs/static/llama-stack-spec.yaml b/docs/static/llama-stack-spec.yaml
index 360c8cb663..6d5b5c19c7 100644
--- a/docs/static/llama-stack-spec.yaml
+++ b/docs/static/llama-stack-spec.yaml
@@ -6114,6 +6114,10 @@ components:
               type: string
             type: object
           - type: 'null'
+        top_logprobs:
+          anyOf:
+          - type: integer
+          - type: 'null'
       type: object
       required:
       - input
diff --git a/docs/static/stainless-llama-stack-spec.yaml b/docs/static/stainless-llama-stack-spec.yaml
index 4f772b5670..befb734926 100644
--- a/docs/static/stainless-llama-stack-spec.yaml
+++ b/docs/static/stainless-llama-stack-spec.yaml
@@ -7472,6 +7472,10 @@ components:
               type: string
             type: object
           - type: 'null'
+        top_logprobs:
+          anyOf:
+          - type: integer
+          - type: 'null'
       type: object
       required:
       - input
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/agents.py b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
index 8f336ef00d..8b764f2705 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/agents.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/agents.py
@@ -113,6 +113,7 @@ async def create_openai_response(
         guardrails: list[ResponseGuardrail] | None = None,
         max_tool_calls: int | None = None,
         metadata: dict[str, str] | None = None,
+        top_logprobs: int | None = None,
     ) -> OpenAIResponseObject:
         assert self.openai_responses_impl is not None, "OpenAI responses not initialized"
         result = await self.openai_responses_impl.create_openai_response(
@@ -120,6 +121,7 @@ async def create_openai_response(
             model,
             prompt,
             instructions,
+            parallel_tool_calls,
             previous_response_id,
             conversation,
             store,
@@ -131,9 +133,9 @@ async def create_openai_response(
             include,
             max_infer_iters,
             guardrails,
-            parallel_tool_calls,
             max_tool_calls,
             metadata,
+            top_logprobs,
         )
         return result  # type: ignore[no-any-return]

diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
index c64a70473b..57c2704ab4 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/openai_responses.py
@@ -328,6 +328,7 @@ async def create_openai_response(
         model: str,
         prompt: OpenAIResponsePrompt | None = None,
         instructions: str | None = None,
+        parallel_tool_calls: bool | None = True,
         previous_response_id: str | None = None,
         conversation: str | None = None,
         store: bool | None = True,
@@ -339,9 +340,9 @@ async def create_openai_response(
         include: list[ResponseItemInclude] | None = None,
         max_infer_iters: int | None = 10,
         guardrails: list[str | ResponseGuardrailSpec] | None = None,
-        parallel_tool_calls: bool | None = None,
         max_tool_calls: int | None = None,
         metadata: dict[str, str] | None = None,
+        top_logprobs: int | None = None,
     ):
         stream = bool(stream)
         text = OpenAIResponseText(format=OpenAIResponseTextFormat(type="text")) if text is None else text
@@ -399,6 +400,7 @@ async def create_openai_response(
             max_tool_calls=max_tool_calls,
             metadata=metadata,
             include=include,
+            top_logprobs=top_logprobs,
         )

         if stream:
@@ -454,6 +456,7 @@ async def _create_streaming_response(
         max_tool_calls: int | None = None,
         metadata: dict[str, str] | None = None,
         include: list[ResponseItemInclude] | None = None,
+        top_logprobs: int | None = None,
     ) -> AsyncIterator[OpenAIResponseObjectStream]:
         # These should never be None when called from create_openai_response (which sets defaults)
         # but we assert here to help mypy understand the types
@@ -505,6 +508,7 @@ async def _create_streaming_response(
             max_tool_calls=max_tool_calls,
             metadata=metadata,
             include=include,
+            top_logprobs=top_logprobs,
         )

         # Stream the response
diff --git a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
index 6761514129..9e610c0fe9 100644
--- a/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
+++ b/src/llama_stack/providers/inline/agents/meta_reference/responses/streaming.py
@@ -138,6 +138,7 @@ def __init__(
         max_tool_calls: int | None = None,
         metadata: dict[str, str] | None = None,
         include: list[ResponseItemInclude] | None = None,
+        top_logprobs: int | None = None,
     ):
         self.inference_api = inference_api
         self.ctx = ctx
@@ -157,6 +158,7 @@ def __init__(
         self.max_tool_calls = max_tool_calls
         self.metadata = metadata
         self.include = include
+        self.top_logprobs = top_logprobs
         self.sequence_number = 0
         # Store MCP tool mapping that gets built during tool processing
         self.mcp_tool_to_server: dict[str, OpenAIResponseInputToolMCP] = (
@@ -311,6 +313,8 @@ async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
             True if self.include and ResponseItemInclude.message_output_text_logprobs in self.include else None
         )

+        top_logprobs_param = self.top_logprobs if logprobs is True else None
+
         params = OpenAIChatCompletionRequestWithExtraBody(
             model=self.ctx.model,
             messages=messages,
@@ -324,6 +328,7 @@ async def create_response(self) -> AsyncIterator[OpenAIResponseObjectStream]:
                 "include_usage": True,
             },
             logprobs=logprobs,
+            top_logprobs=top_logprobs_param,
         )

         completion_result = await self.inference_api.openai_chat_completion(params)
diff --git a/src/llama_stack_api/agents.py b/src/llama_stack_api/agents.py
index 63e6f0fd1f..2812e3a847 100644
--- a/src/llama_stack_api/agents.py
+++ b/src/llama_stack_api/agents.py
@@ -107,6 +107,7 @@ async def create_openai_response(
         ] = None,
         max_tool_calls: int | None = None,
         metadata: dict[str, str] | None = None,
+        top_logprobs: int | None = None,
     ) -> OpenAIResponseObject | AsyncIterator[OpenAIResponseObjectStream]:
         """Create a model response.

@@ -119,6 +120,7 @@ async def create_openai_response(
        :param guardrails: (Optional) List of guardrails to apply during response generation. Can be guardrail IDs (strings) or guardrail specifications.
        :param max_tool_calls: (Optional) Max number of total calls to built-in tools that can be processed in a response.
        :param metadata: (Optional) Dictionary of metadata key-value pairs to attach to the response.
+       :param top_logprobs: (Optional) The number of top log probabilities to return for each token. Only used when logprobs are requested via the include parameter.
        :returns: An OpenAIResponseObject.
        """
        ...
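For reference, a minimal usage sketch of the new parameter (not part of the diff). Here `agents_impl` stands in for a constructed meta-reference agents provider, the model ID is a placeholder, and the import path for ResponseItemInclude is an assumption; per the gating added in streaming.py, top_logprobs is only forwarded to the chat completion when logprobs are requested via `include`.

# Hypothetical sketch only -- names below are placeholders, not part of this change.
from llama_stack_api.openai_responses import ResponseItemInclude  # import path assumed

response = await agents_impl.create_openai_response(
    input="What is the capital of France?",
    model="example-model-id",  # placeholder for a model registered with the stack
    # logprobs must be requested via `include`, otherwise top_logprobs is ignored
    # (see the `logprobs is True` gate in streaming.py above).
    include=[ResponseItemInclude.message_output_text_logprobs],
    top_logprobs=3,
)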