Commit dd2f141

modify embedding sentence_transformers
1 parent f52824a commit dd2f141

File tree: 2 files changed (+85 -2 lines)

xinference/model/embedding/vllm/core.py

Lines changed: 45 additions & 2 deletions
@@ -89,6 +89,34 @@ def load(self):
                 is_matryoshka=True,
             )
 
+        # Set appropriate VLLM configuration parameters based on model capabilities
+        model_max_tokens = getattr(self.model_family, "max_tokens", 512)
+
+        # Set max_model_len based on model family capabilities with reasonable limits
+        max_model_len = min(model_max_tokens, 8192)
+        if "max_model_len" not in self._kwargs:
+            self._kwargs["max_model_len"] = max_model_len
+
+        # Ensure max_num_batched_tokens is sufficient for large models
+        if "max_num_batched_tokens" not in self._kwargs:
+            # max_num_batched_tokens should be at least max_model_len
+            # Set to a reasonable minimum that satisfies the constraint
+            self._kwargs["max_num_batched_tokens"] = max(4096, max_model_len)
+
+        # Configure other reasonable defaults for embedding models
+        if "gpu_memory_utilization" not in self._kwargs:
+            self._kwargs["gpu_memory_utilization"] = 0.7
+
+        # Use a smaller block size for better compatibility
+        if "block_size" not in self._kwargs:
+            self._kwargs["block_size"] = 16
+
+        logger.debug(
+            f"VLLM configuration for {self.model_family.model_name}: "
+            f"max_model_len={self._kwargs.get('max_model_len')}, "
+            f"max_num_batched_tokens={self._kwargs.get('max_num_batched_tokens')}"
+        )
+
         self._model = LLM(model=self._model_path, task="embed", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()
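Note (not part of the commit): a minimal, self-contained sketch of the defaulting pattern in the hunk above, using a hypothetical helper name apply_vllm_defaults and sample values. It illustrates that user-supplied kwargs are never overwritten and that max_num_batched_tokens is kept at or above max_model_len, as the commit's own comments require.

# Hypothetical helper mirroring the defaulting logic above (illustration only).
def apply_vllm_defaults(kwargs: dict, model_max_tokens: int = 512) -> dict:
    max_model_len = min(model_max_tokens, 8192)
    kwargs.setdefault("max_model_len", max_model_len)
    # Keep max_num_batched_tokens >= max_model_len, as noted in the commit.
    kwargs.setdefault("max_num_batched_tokens", max(4096, max_model_len))
    kwargs.setdefault("gpu_memory_utilization", 0.7)
    kwargs.setdefault("block_size", 16)
    return kwargs

# setdefault never overwrites existing keys, so explicit user settings win.
print(apply_vllm_defaults({}, model_max_tokens=32768))
# {'max_model_len': 8192, 'max_num_batched_tokens': 8192, ...}
print(apply_vllm_defaults({"max_model_len": 1024}))
# {'max_model_len': 1024, 'max_num_batched_tokens': 4096, ...}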

@@ -246,6 +274,21 @@ def _set_context_length(self):
                 self._model.llm_engine.vllm_config.model_config.max_model_len
             )
         else:
-            # v1
-            logger.warning("vLLM v1 is not supported, ignore context length setting")
+            # v1 - Get max_model_len from the v1 engine configuration
+            try:
+                # For v1, access the config differently
+                if hasattr(self._model.llm_engine, "vllm_config"):
+                    self._context_length = (
+                        self._model.llm_engine.vllm_config.model_config.max_model_len
+                    )
+                elif hasattr(self._model.llm_engine, "model_config"):
+                    self._context_length = (
+                        self._model.llm_engine.model_config.max_model_len
+                    )
+                else:
+                    # Fallback to the configured value
+                    self._context_length = self._kwargs.get("max_model_len", 512)
+            except Exception as e:
+                logger.warning(f"Failed to get context length from vLLM v1 engine: {e}")
+                self._context_length = self._kwargs.get("max_model_len", 512)
         logger.debug("Model context length: %s", self._context_length)
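Note (not part of the commit): an illustrative sketch of how the hasattr() fallback chain added above resolves the context length across the two engine layouts. The SimpleNamespace objects are hypothetical stand-ins for vLLM engine internals, not real vLLM classes.

from types import SimpleNamespace

# Same resolution order as the diff: vllm_config -> model_config -> kwargs default.
def resolve_context_length(llm_engine, kwargs):
    if hasattr(llm_engine, "vllm_config"):
        return llm_engine.vllm_config.model_config.max_model_len
    if hasattr(llm_engine, "model_config"):
        return llm_engine.model_config.max_model_len
    return kwargs.get("max_model_len", 512)

model_config = SimpleNamespace(max_model_len=8192)
new_layout = SimpleNamespace(vllm_config=SimpleNamespace(model_config=model_config))
old_layout = SimpleNamespace(model_config=model_config)
bare_engine = SimpleNamespace()

print(resolve_context_length(new_layout, {}))   # 8192
print(resolve_context_length(old_layout, {}))   # 8192
print(resolve_context_length(bare_engine, {}))  # 512 (kwargs fallback)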

xinference/model/rerank/vllm/core.py

Lines changed: 40 additions & 0 deletions
@@ -1,11 +1,15 @@
 import importlib.util
+import json
+import logging
 import uuid
 from typing import List, Optional, Union
 
 from ....types import Document, DocumentObj, Meta, Rerank, RerankTokens
 from ...utils import cache_clean
 from ..core import RerankModel, RerankModelFamilyV2, RerankSpecV1
 
+logger = logging.getLogger(__name__)
+
 SUPPORTED_MODELS_PREFIXES = ["bge", "gte", "text2vec", "m3e", "gte", "qwen3"]
 
 
@@ -67,6 +71,42 @@ def load(self):
                     classifier_from_token=["no", "yes"],
                     is_original_qwen3_reranker=True,
                 )
+            elif isinstance(self._kwargs["hf_overrides"], str):
+                self._kwargs["hf_overrides"] = json.loads(self._kwargs["hf_overrides"])
+                self._kwargs["hf_overrides"].update(
+                    architectures=["Qwen3ForSequenceClassification"],
+                    classifier_from_token=["no", "yes"],
+                    is_original_qwen3_reranker=True,
+                )
+
+        # Set appropriate VLLM configuration parameters based on model capabilities
+        model_max_tokens = getattr(self.model_family, "max_tokens", 512)
+
+        # Set max_model_len based on model family capabilities with reasonable limits
+        max_model_len = min(model_max_tokens, 8192)
+        if "max_model_len" not in self._kwargs:
+            self._kwargs["max_model_len"] = max_model_len
+
+        # Ensure max_num_batched_tokens is sufficient for large models
+        if "max_num_batched_tokens" not in self._kwargs:
+            # max_num_batched_tokens should be at least max_model_len
+            # Set to a reasonable minimum that satisfies the constraint
+            self._kwargs["max_num_batched_tokens"] = max(4096, max_model_len)
+
+        # Configure other reasonable defaults for reranking models
+        if "gpu_memory_utilization" not in self._kwargs:
+            self._kwargs["gpu_memory_utilization"] = 0.7
+
+        # Use a smaller block size for better compatibility
+        if "block_size" not in self._kwargs:
+            self._kwargs["block_size"] = 16
+
+        logger.debug(
+            f"VLLM configuration for rerank model {self.model_family.model_name}: "
+            f"max_model_len={self._kwargs.get('max_model_len')}, "
+            f"max_num_batched_tokens={self._kwargs.get('max_num_batched_tokens')}"
+        )
+
         self._model = LLM(model=self._model_path, task="score", **self._kwargs)
         self._tokenizer = self._model.get_tokenizer()
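Note (not part of the commit): an illustrative sketch of the hf_overrides normalization added above. A JSON string (for example one passed on the command line) is parsed into a dict before the Qwen3 reranker overrides are merged in; the max_position_embeddings entry is purely example data.

import json

# Example: hf_overrides arrives as a JSON string rather than a dict.
hf_overrides = '{"max_position_embeddings": 32768}'  # illustrative value
if isinstance(hf_overrides, str):
    hf_overrides = json.loads(hf_overrides)

# Merge in the Qwen3 reranker overrides, mirroring the keys in the diff.
hf_overrides.update(
    architectures=["Qwen3ForSequenceClassification"],
    classifier_from_token=["no", "yes"],
    is_original_qwen3_reranker=True,
)
print(hf_overrides)
# {'max_position_embeddings': 32768, 'architectures': ['Qwen3ForSequenceClassification'], ...}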
