Commit 7026803

Use backend fixture for llmapi
1 parent c84d215 commit 7026803

6 files changed: +47 −38 lines changed


python/openai/tests/conftest.py

Lines changed: 17 additions & 9 deletions
@@ -31,9 +31,6 @@
 from fastapi.testclient import TestClient
 from tests.utils import OpenAIServer, setup_fastapi_app, setup_server
 
-### TEST ENVIRONMENT SETUP ###
-LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0)
-
 
 def infer_test_environment():
     # Infer the test environment for simplicity in local dev/testing.
@@ -49,10 +46,14 @@ def infer_test_environment():
     try:
         import tensorrt_llm as _
 
-        backend = "tensorrtllm"
+        # TODO: Refactor away from environment variables
+        LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0)
+
         if LLMAPI_SETUP:
+            backend = "llmapi"
             model = "tensorrt_llm"
         else:
+            backend = "tensorrtllm"
             model = "tensorrt_llm_bls"
         return backend, model
     except ImportError:
@@ -62,10 +63,7 @@ def infer_test_environment():
 
 
 def infer_test_model_repository(backend):
-    if LLMAPI_SETUP:
-        model_repository = str(Path(__file__).parent / f"{backend}_llmapi_models")
-    else:
-        model_repository = str(Path(__file__).parent / f"{backend}_models")
+    model_repository = str(Path(__file__).parent / f"{backend}_models")
     return model_repository
 
 
@@ -92,13 +90,23 @@ def infer_test_model_repository(backend):
 # only once for all the tests below.
 @pytest.fixture(scope="module")
 def server():
+    # TODO: tensorrtllm and llmapi backends both use "tensorrtllm" as the backend flag for OpenAI server.
+    # In the future, if the backends are consolidated, this check can be updated or removed.
+    # key: the TEST_BACKEND value
+    # value: the corresponding backend flag for OpenAI server
+    backend_map = {
+        "tensorrtllm": "tensorrtllm",
+        "llmapi": "tensorrtllm",
+        "vllm": "vllm",
+    }
+
     args = [
         "--model-repository",
         TEST_MODEL_REPOSITORY,
         "--tokenizer",
         TEST_TOKENIZER,
         "--backend",
-        TEST_BACKEND,
+        backend_map[TEST_BACKEND],
     ]
     # TODO: Incorporate kserve frontend binding smoke tests to catch any
     # breakage with default values or slight cli arg variations
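For context, the test files below receive `backend` and `model` as pytest fixture arguments rather than reading `LLMAPI_SETUP` themselves (hence the commit title). The fixture bodies are not part of this diff, so the sketch below is only an assumption of how such fixtures could expose the inferred values, with `infer_test_environment` simplified to the environment-variable branch alone:

import os

import pytest


def infer_test_environment():
    # Simplified stand-in for the conftest.py helper in the diff above:
    # choose the backend/model pair from the LLMAPI_SETUP environment variable.
    if os.environ.get("LLMAPI_SETUP", 0):
        return "llmapi", "tensorrt_llm"
    return "tensorrtllm", "tensorrt_llm_bls"


TEST_BACKEND, TEST_MODEL = infer_test_environment()


@pytest.fixture(scope="module")
def backend() -> str:
    # Tests declare `backend: str` as a parameter and branch on this value,
    # e.g. skipping unless it is "tensorrtllm" or "llmapi".
    return TEST_BACKEND


@pytest.fixture(scope="module")
def model() -> str:
    return TEST_MODEL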

python/openai/tests/test_chat_completions.py

Lines changed: 2 additions & 2 deletions
@@ -311,7 +311,7 @@ def test_chat_completions_temperature_vllm(
     def test_chat_completions_temperature_tensorrtllm(
         self, client, backend: str, model: str, messages: List[dict]
     ):
-        if backend != "tensorrtllm":
+        if backend != "tensorrtllm" and backend != "llmapi":
             pytest.skip(
                 reason="Only used to test TRT-LLM-specific temperature behavior"
             )
@@ -371,7 +371,7 @@ def test_chat_completions_temperature_tensorrtllm(
 
     # TODO: Remove xfail for LLM API when it's verified.
     @pytest.mark.xfail(
-        condition=os.getenv("LLMAPI_SETUP") == "1",
+        condition=lambda backend: backend == "llmapi",
         reason="Seed parameter support to be verified for LLM API",
     )
     # Simple tests to verify random seed roughly behaves as expected

python/openai/tests/test_completions.py

Lines changed: 3 additions & 3 deletions
@@ -192,8 +192,8 @@ def test_completions_temperature_vllm(
     def test_completions_temperature_tensorrtllm(
         self, client, backend: str, model: str, prompt: str
     ):
-        if backend != "tensorrtllm":
-            pytest.skip(reason="Only used to test vLLM-specific temperature behavior")
+        if backend != "tensorrtllm" and backend != "llmapi":
+            pytest.skip(reason="Only used to test TRTLLM-specific temperature behavior")
 
         responses = []
         payload1 = {
@@ -241,7 +241,7 @@ def test_completions_temperature_tensorrtllm(
 
     # TODO: Remove xfail for LLM API when it's verified.
     @pytest.mark.xfail(
-        condition=os.getenv("LLMAPI_SETUP") == "1",
+        condition=lambda backend: backend == "llmapi",
        reason="Seed parameter support to be verified for LLM API",
     )
     # Simple tests to verify seed roughly behaves as expected

python/openai/tests/test_openai_client.py

Lines changed: 13 additions & 21 deletions
@@ -39,16 +39,12 @@ def test_openai_client_models(self, client: openai.OpenAI, backend: str):
         models = list(client.models.list())
         print(f"Models: {models}")
         if backend == "tensorrtllm":
-            import os
-
-            LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0)
-            if LLMAPI_SETUP:
-                # LLM API setup only has the tensorrt_llm model
-                assert len(models) == 1
-            else:
-                # tensorrt_llm_bls +
-                # preprocess -> tensorrt_llm -> postprocess
-                assert len(models) == 4
+            # tensorrt_llm_bls +
+            # preprocess -> tensorrt_llm -> postprocess
+            assert len(models) == 4
+        elif backend == "llmapi":
+            # Only has one tensorrt_llm model.
+            assert len(models) == 1
         elif backend == "vllm":
             assert len(models) == 1
         else:
@@ -82,7 +78,7 @@ def test_openai_client_chat_completion(
     def test_openai_client_completion_echo(
         self, client: openai.OpenAI, echo: bool, backend: str, model: str, prompt: str
     ):
-        if backend == "tensorrtllm":
+        if backend == "tensorrtllm" or backend == "llmapi":
             pytest.skip(
                 reason="TRT-LLM backend currently only supports setting this parameter at model load time",
             )
@@ -112,16 +108,12 @@ async def test_openai_client_models(self, client: openai.AsyncOpenAI, backend: str):
         models = [model async for model in async_models]
         print(f"Models: {models}")
         if backend == "tensorrtllm":
-            import os
-
-            LLMAPI_SETUP = os.environ.get("LLMAPI_SETUP", 0)
-            if LLMAPI_SETUP:
-                # LLM API setup only has the tensorrt_llm model
-                assert len(models) == 1
-            else:
-                # tensorrt_llm_bls +
-                # preprocess -> tensorrt_llm -> postprocess
-                assert len(models) == 4
+            # tensorrt_llm_bls +
+            # preprocess -> tensorrt_llm -> postprocess
+            assert len(models) == 4
+        elif backend == "llmapi":
+            # Only has one tensorrt_llm model.
+            assert len(models) == 1
         elif backend == "vllm":
             assert len(models) == 1
         else:

python/openai/tests/utils.py

Lines changed: 11 additions & 2 deletions
@@ -1,4 +1,4 @@
-# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# Copyright 2024-2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 #
 # Redistribution and use in source and binary forms, with or without
 # modification, are permitted provided that the following conditions
@@ -53,8 +53,17 @@ def setup_server(model_repository: str):
 
 
 def setup_fastapi_app(tokenizer: str, server: tritonserver.Server, backend: str):
+    # TODO: tensorrtllm and llmapi backends both use "tensorrtllm" as the backend flag for OpenAI server.
+    # In the future, if the backends are consolidated, this check can be updated or removed.
+    # key: the backend value
+    # value: the corresponding backend flag for OpenAI server
+    backend_map = {
+        "tensorrtllm": "tensorrtllm",
+        "llmapi": "tensorrtllm",
+        "vllm": "vllm",
+    }
     engine: TritonLLMEngine = TritonLLMEngine(
-        server=server, tokenizer=tokenizer, backend=backend
+        server=server, tokenizer=tokenizer, backend=backend_map[backend]
     )
     frontend: FastApiFrontend = FastApiFrontend(engine=engine)
     return frontend.app
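For illustration, a hypothetical usage sketch (not part of this commit) of the updated `setup_fastapi_app`: both the "tensorrtllm" and "llmapi" test values map onto the same "tensorrtllm" engine flag, while "vllm" passes through unchanged. The model repository path and tokenizer name are taken from the test.sh changes below and are example assumptions, not values fixed by `setup_fastapi_app` itself:

from fastapi.testclient import TestClient

from tests.utils import setup_fastapi_app, setup_server

# Hypothetical example values; see qa/L0_openai/test.sh for where they come from.
server = setup_server("tests/llmapi_models")
app = setup_fastapi_app(
    tokenizer="meta-llama/Meta-Llama-3.1-8B-Instruct",
    server=server,
    backend="llmapi",  # mapped internally to the "tensorrtllm" flag
)
client = TestClient(app)
print(client.get("/v1/models").json())  # assumes the standard OpenAI models route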

qa/L0_openai/test.sh

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ function prepare_tensorrtllm() {
     python3 ${FILL_TEMPLATE} -i ${MODEL_REPO}/tensorrt_llm/config.pbtxt triton_backend:tensorrtllm,triton_max_batch_size:64,decoupled_mode:True,max_beam_width:1,engine_dir:${ENGINE_PATH},batching_strategy:inflight_fused_batching,max_queue_size:0,max_queue_delay_microseconds:1000,encoder_input_features_data_type:TYPE_FP16,logits_datatype:TYPE_FP32,exclude_input_in_output:True
 
     # Prepare LLM API setup
-    LLMAPI_MODEL_REPO="tests/tensorrtllm_llmapi_models"
+    LLMAPI_MODEL_REPO="tests/llmapi_models"
     mkdir -p ${LLMAPI_MODEL_REPO}
     cp /app/all_models/llmapi/* "${LLMAPI_MODEL_REPO}" -r
     sed -i 's#"model":"TinyLlama/TinyLlama-1.1B-Chat-v1.0"#"model":"meta-llama/Meta-Llama-3.1-8B-Instruct"#g' ${LLMAPI_MODEL_REPO}/tensorrt_llm/1/model.json
