````diff
@@ -997,8 +997,8 @@ The default save path for the configuration file is `PaddleOCR-VL.yaml`. Modify
 VLRecognition:
   ...
   genai_config:
-    backend: vllm
-    server_url: http://127.0.0.1:8118
+    backend: vllm-server
+    server_url: http://127.0.0.1:8118/v1
 ```
 
 After that, the modified configuration file can be used for pipeline invocation. For example, invoke it through the CLI:
````
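The CLI example itself sits below this hunk. For reference, the same modified configuration can also be driven from the Python API; a minimal sketch, assuming the standard `paddlex.create_pipeline` entry point and a hypothetical input image `doc.png` (neither appears in this diff):

```python
# Minimal sketch: load the pipeline from the modified config file and run it.
# "PaddleOCR-VL.yaml" is the config saved above; "doc.png" is a hypothetical input.
from paddlex import create_pipeline

pipeline = create_pipeline(pipeline="PaddleOCR-VL.yaml")
for res in pipeline.predict("doc.png"):
    res.print()  # assumed result-printing helper on PaddleX result objects
```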
```diff
@@ -1290,7 +1290,7 @@ Below are the API references for basic service-based deployment and examples of
 </tr>
 <tr>
 <td><code>promptLabel</code></td>
-<td><code>string</code>|<code>object</code>|<code>null</code></td>
+<td><code>string</code>|<code>null</code></td>
 <td>Please refer to the description of the <code>prompt_label</code> parameter in the <code>predict</code> method of the PaddleOCR-VL object.</td>
 <td>No</td>
 </tr>
```
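With the type narrowed to `string` | `null`, a service request now passes `promptLabel` as a plain string. A hypothetical client sketch; the endpoint path and the `file` field are assumptions, and only `promptLabel` itself comes from the table above (the port matches the Uvicorn log quoted later in this diff):

```python
# Hypothetical request sketch against the service-based deployment.
# Endpoint path and input field name are assumptions, not from this diff.
import requests

payload = {
    "file": "https://example.com/sample.png",  # assumed input field
    "promptLabel": "ocr",  # plain string now; an object is no longer accepted
}
resp = requests.post("http://127.0.0.1:8080/layout-parsing", json=payload)
resp.raise_for_status()
print(resp.json())
```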
docs/pipeline_usage/tutorials/ocr_pipelines/PaddleOCR-VL.md (6 changes: 3 additions & 3 deletions)

````diff
@@ -1038,8 +1038,8 @@ paddlex --get_pipeline_config PaddleOCR-VL
 VLRecognition:
   ...
   genai_config:
-    backend: vllm
-    server_url: http://127.0.0.1:8118
+    backend: vllm-server
+    server_url: http://127.0.0.1:8118/v1
 ```
 
 After that, the modified configuration file can be used for pipeline invocation. For example, invoke it through the CLI:
````
```diff
@@ -1334,7 +1334,7 @@ INFO: Uvicorn running on http://0.0.0.0:8080 (Press CTRL+C to quit)
 </tr>
 <tr>
 <td><code>promptLabel</code></td>
-<td><code>string</code> | <code>object</code> | <code>null</code></td>
+<td><code>string</code> | <code>null</code></td>
 <td>Please refer to the description of the <code>prompt_label</code> parameter in the <code>predict</code> method of the PaddleOCR-VL object.</td>
 <td>No</td>
 </tr>
```
paddlex/inference/models/base/predictor/base_predictor.py (3 changes: 1 addition & 2 deletions)

```diff
@@ -35,7 +35,6 @@
 from ....utils.benchmark import ENTRY_POINT_NAME, benchmark
 from ....utils.hpi import HPIConfig, HPIInfo
 from ....utils.io import YAMLReader
-from ....utils.model_paths import get_model_paths
 from ....utils.pp_option import PaddlePredictorOption
 from ...common import HPInfer, PaddleInfer
 from ...common.genai import GenAIClient, GenAIConfig, need_local_model
```
```diff
@@ -156,7 +155,7 @@ def __init__(
 
         self.batch_sampler.batch_size = batch_size
 
-        if self.model_dir and get_model_paths(self.model_dir, self.MODEL_FILE_PREFIX):
+        if self._use_local_model:
             self._use_hpip = use_hpip
             if not use_hpip:
                 self._pp_option = self._prepare_pp_option(pp_option, device)
```
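The refactor replaces an inline filesystem probe with a precomputed `_use_local_model` flag; how that flag is set is outside this hunk. A hedged sketch of plausible wiring, using the `need_local_model` helper imported above (the exact logic is an assumption):

```python
# Hedged sketch of how _use_local_model might be derived; the real assignment
# is not shown in this diff. need_local_model is imported in the hunk above.
from paddlex.inference.models.common.genai import need_local_model


def compute_use_local_model(genai_config, model_dir):
    # A remote backend such as "vllm-server" serves the model over HTTP,
    # so no local model files are required in that case.
    if genai_config is not None and not need_local_model(genai_config):
        return False
    return model_dir is not None
```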
paddlex/inference/models/common/genai.py (12 changes: 4 additions & 8 deletions)

```diff
@@ -88,7 +88,7 @@ def stop(self):
         self.loop = None
         self.thread = None
 
-    def run_async(self, coro, return_future=False):
+    def run_async(self, coro):
         if not self.is_running():
             raise RuntimeError("Event loop is not running")
 
```
```diff
@@ -164,6 +164,8 @@ def __init__(
 
         self.backend = backend
         self._max_concurrency = max_concurrency
+        if model_name is None:
+            model_name = run_async(self._get_model_name(), timeout=10)
         self._model_name = model_name
 
         if "api_key" not in kwargs:
```
```diff
@@ -177,12 +179,6 @@ def openai_client(self):
         return self._client
 
     def create_chat_completion(self, messages, *, return_future=False, **kwargs):
-        if self._model_name is not None:
-            model_name = self._model_name
-        else:
-            model_name = run_async(self._get_model_name(), timeout=10)
-        self._model_name = model_name
-
         async def _create_chat_completion_with_semaphore(*args, **kwargs):
             async with self._semaphore:
                 return await self._client.chat.completions.create(
```
```diff
@@ -192,7 +188,7 @@ async def _create_chat_completion_with_semaphore(*args, **kwargs):
 
         return run_async(
             _create_chat_completion_with_semaphore(
-                model=model_name,
+                model=self._model_name,
                 messages=messages,
                 **kwargs,
             ),
```
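Net effect: the model name is resolved once, eagerly, in `__init__`, rather than lazily on the first `create_chat_completion` call, which also closes the window where concurrent first calls could each trigger a discovery request. A minimal self-contained sketch of the resolve-once-at-construction pattern (illustrative names only, not the PaddleX API):

```python
import asyncio


class ResolveOnceClient:
    """Resolve a remote model name once, at construction time."""

    def __init__(self, model_name=None):
        if model_name is None:
            # One discovery round-trip at construction instead of one per call.
            model_name = asyncio.run(self._get_model_name())
        self._model_name = model_name

    async def _get_model_name(self):
        # Stand-in for querying the server's model list endpoint.
        await asyncio.sleep(0)
        return "paddleocr-vl"

    def chat(self, messages):
        # Every call reuses the cached name; no per-call discovery.
        return {"model": self._model_name, "messages": messages}


client = ResolveOnceClient()
print(client.chat([{"role": "user", "content": "hi"}])["model"])
```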
paddlex/inference/utils/misc.py (2 changes: 1 addition & 1 deletion)

```diff
@@ -31,4 +31,4 @@ def is_bfloat16_available(device):
     device_type, _ = parse_device(device)
     return (
         "npu" in get_device_type() or paddle.amp.is_bfloat16_supported()
-    ) and device_type in ("gpu", "npu", "xpu", "mlu", "dcu")
+    ) and device_type in ("gpu", "npu", "xpu", "mlu")
```
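With `dcu` dropped from the allow-list, `is_bfloat16_available` now reports `False` for DCU devices regardless of what the runtime claims. A hedged usage sketch; the import path is inferred from the file location and assumes a working PaddlePaddle install:

```python
# Import path inferred from paddlex/inference/utils/misc.py; requires paddlex
# and a PaddlePaddle build to be installed.
from paddlex.inference.utils.misc import is_bfloat16_available

for dev in ("gpu:0", "dcu:0", "cpu"):
    # After this change, "dcu:0" is always reported as unsupported.
    print(dev, is_bfloat16_available(dev))
```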