Skip to content

Commit 55e9959

Browse files
fix: fix openai_embeddings for asymmetric embedding NIMs (#3205)
# What does this PR do? NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. This PR adds the `input_type="query"` as default and updates the documentation to suggest using the `embeddings` API for passage embeddings. <!-- If resolving an issue, uncomment and update the line below --> Resolves #2892 ## Test Plan ``` pytest -s -v tests/integration/inference/test_openai_embeddings.py --stack-config="inference=nvidia" --embedding-model="nvidia/llama-3.2-nv-embedqa-1b-v2" --env NVIDIA_API_KEY={nvidia_api_key} --env NVIDIA_BASE_URL="https://integrate.api.nvidia.com" ```
1 parent 3f8df16 commit 55e9959

File tree

2 files changed

+59
-1
lines changed

2 files changed

+59
-1
lines changed

llama_stack/providers/remote/inference/nvidia/NVIDIA.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,10 @@ print(f"Response: {response.completion_message.content}")
7777
```
7878

7979
### Create Embeddings
80+
> **Note on OpenAI embeddings compatibility**
>
> NVIDIA asymmetric embedding models (e.g., `nvidia/llama-3.2-nv-embedqa-1b-v2`) require an `input_type` parameter not present in the standard OpenAI embeddings API. The NVIDIA Inference Adapter automatically sets `input_type="query"` when using the OpenAI-compatible embeddings endpoint for NVIDIA. For passage embeddings, use the `embeddings` API with `task_type="document"`.
8084
```python
8185
response = client.inference.embeddings(
8286
model_id="nvidia/llama-3.2-nv-embedqa-1b-v2",

llama_stack/providers/remote/inference/nvidia/nvidia.py

Lines changed: 55 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
import warnings
88
from collections.abc import AsyncIterator
99

10-
from openai import APIConnectionError, BadRequestError
10+
from openai import NOT_GIVEN, APIConnectionError, BadRequestError
1111

1212
from llama_stack.apis.common.content_types import (
1313
InterleavedContent,
@@ -26,6 +26,9 @@
2626
Inference,
2727
LogProbConfig,
2828
Message,
29+
OpenAIEmbeddingData,
30+
OpenAIEmbeddingsResponse,
31+
OpenAIEmbeddingUsage,
2932
ResponseFormat,
3033
SamplingParams,
3134
TextTruncation,
@@ -210,6 +213,57 @@ async def embeddings(
210213
#
211214
return EmbeddingsResponse(embeddings=[embedding.embedding for embedding in response.data])
212215

216+
async def openai_embeddings(
217+
self,
218+
model: str,
219+
input: str | list[str],
220+
encoding_format: str | None = "float",
221+
dimensions: int | None = None,
222+
user: str | None = None,
223+
) -> OpenAIEmbeddingsResponse:
224+
"""
225+
OpenAI-compatible embeddings for NVIDIA NIM.
226+
227+
Note: NVIDIA NIM asymmetric embedding models require an "input_type" field not present in the standard OpenAI embeddings API.
228+
We default this to "query" to ensure requests succeed when using the
229+
OpenAI-compatible endpoint. For passage embeddings, use the embeddings API with
230+
`task_type='document'`.
231+
"""
232+
extra_body: dict[str, object] = {"input_type": "query"}
233+
logger.warning(
234+
"NVIDIA OpenAI-compatible embeddings: defaulting to input_type='query'. "
235+
"For passage embeddings, use the embeddings API with task_type='document'."
236+
)
237+
238+
response = await self.client.embeddings.create(
239+
model=await self._get_provider_model_id(model),
240+
input=input,
241+
encoding_format=encoding_format if encoding_format is not None else NOT_GIVEN,
242+
dimensions=dimensions if dimensions is not None else NOT_GIVEN,
243+
user=user if user is not None else NOT_GIVEN,
244+
extra_body=extra_body,
245+
)
246+
247+
data = []
248+
for i, embedding_data in enumerate(response.data):
249+
data.append(
250+
OpenAIEmbeddingData(
251+
embedding=embedding_data.embedding,
252+
index=i,
253+
)
254+
)
255+
256+
usage = OpenAIEmbeddingUsage(
257+
prompt_tokens=response.usage.prompt_tokens,
258+
total_tokens=response.usage.total_tokens,
259+
)
260+
261+
return OpenAIEmbeddingsResponse(
262+
data=data,
263+
model=response.model,
264+
usage=usage,
265+
)
266+
213267
async def chat_completion(
214268
self,
215269
model_id: str,

0 commit comments

Comments
 (0)