From 227971cdcc9c535a62e23d65f2b710617e231e4b Mon Sep 17 00:00:00 2001 From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Date: Thu, 14 Aug 2025 14:29:28 +0800 Subject: [PATCH 1/2] update skip config Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> --- .../accuracy/test_disaggregated_serving.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 8fd7508b075..9da5d279f98 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -302,6 +302,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct" @pytest.mark.skip_less_device_memory(32000) + @pytest.mark.skip_less_device(2) @pytest.mark.parametrize("disable_overlap_scheduler", [False, True]) def test_auto_dtype(self, disable_overlap_scheduler): ctx_server_config = {"disable_overlap_scheduler": True} @@ -331,6 +332,8 @@ def test_auto_dtype(self, disable_overlap_scheduler): task = GSM8K(self.MODEL_NAME) task.evaluate(llm) + @pytest.mark.skip_less_device(2) + @skip_pre_hopper def test_ngram(self): speculative_decoding_config = { "decoding_type": "NGram", @@ -381,6 +384,7 @@ def test_ngram(self): @skip_pre_hopper @parametrize_with_ids("overlap_scheduler", [True, False]) @parametrize_with_ids("eagle3_one_model", [True, False]) + @pytest.mark.skip_less_device(2) def test_eagle3(self, overlap_scheduler, eagle3_one_model): speculative_decoding_config = { "decoding_type": "Eagle", @@ -437,7 +441,6 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model): task = GSM8K(self.MODEL_NAME) task.evaluate(llm) - @pytest.mark.skip_less_device(2) @pytest.mark.parametrize("tp,pp", [(1, 2), (2, 1), (2, 2)], ids=["tp1pp2", "tp2pp1", "tp2pp2"]) @pytest.mark.parametrize("testset", ["GSM8K", "MMLU"]) @@ -445,7 +448,6 @@ def test_tp_pp_symmetric(self, tp, pp, testset): return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, pp, tp, pp, tp, 1, 1, get_accuracy_task(testset)) - @pytest.mark.skip_less_device(4) @parametrize_with_ids("ctx_pp", [2, 4]) @parametrize_with_ids("gen_tp", [1, 2]) @pytest.mark.parametrize("testset", ["GSM8K", "MMLU"]) @@ -453,20 +455,19 @@ def test_ctx_pp_gen_tp_asymmetric(self, ctx_pp, gen_tp, testset): return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, ctx_pp, 1, 1, gen_tp, 1, 1, get_accuracy_task(testset)) - @pytest.mark.skip_less_device(4) @pytest.mark.parametrize("testset", ["GSM8K", "MMLU"]) def test_multi_instance(self, testset): return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, 1, 1, 1, 1, 2, 2, get_accuracy_task(testset)) -@pytest.mark.skip_less_device_memory(140000) -@pytest.mark.timeout(3600) -@pytest.mark.skip_less_device(4) class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct" MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct" + @pytest.mark.skip_less_device_memory(140000) + @pytest.mark.timeout(3600) + @pytest.mark.skip_less_device(8) @pytest.mark.parametrize("overlap_scheduler", [False, True]) def test_auto_dtype(self, overlap_scheduler): ctx_server_config = {"disable_overlap_scheduler": True} @@ -505,6 +506,8 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness): MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite" MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16" + @pytest.mark.skip_less_device(2) + @pytest.mark.skip_less_device_memory(60000) def test_nixl_backend(self): ctx_server_config = { "disable_overlap_scheduler": True, @@ -542,7 +545,7 @@ def test_nixl_backend(self): @parametrize_with_ids("overlap_scheduler", [True, False]) @parametrize_with_ids("mtp_nextn", [0, pytest.param(2, marks=skip_pre_hopper)]) - @pytest.mark.skip_less_device(4) + @pytest.mark.skip_less_device(8) def test_auto_dtype(self, overlap_scheduler, mtp_nextn): ctx_server_config = {"disable_overlap_scheduler": True} gen_server_config = {"disable_overlap_scheduler": not overlap_scheduler} @@ -586,6 +589,7 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "google/gemma-3-1b-it" MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-1b-it/" + @pytest.mark.skip_less_device(2) @pytest.mark.parametrize("overlap_scheduler", [False, True]) def test_auto_dtype(self, overlap_scheduler): ctx_server_config = { @@ -637,6 +641,7 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness): MODEL_NAME = "Qwen3/Qwen3-8B" MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8" + @pytest.mark.skip_less_device(2) def test_nixl_backend(self): ctx_server_config = { "disable_overlap_scheduler": True, @@ -673,8 +678,9 @@ def test_nixl_backend(self): task = GSM8K(self.MODEL_NAME) task.evaluate(llm) - @pytest.mark.parametrize("overlap_scheduler", [False, True]) @skip_pre_hopper + @pytest.mark.skip_less_device(2) + @pytest.mark.parametrize("overlap_scheduler", [False, True]) def test_auto_dtype(self, overlap_scheduler): ctx_server_config = { "disable_overlap_scheduler": True, From 569a0253a7c61647ceba2992e87f7da8b9f588a9 Mon Sep 17 00:00:00 2001 From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Date: Mon, 18 Aug 2025 12:53:31 +0800 Subject: [PATCH 2/2] fix invalid test name Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> --- .../defs/accuracy/test_llm_api_pytorch.py | 31 ++++++++++--------- .../test_lists/qa/llm_function_full.txt | 4 +-- .../test_lists/qa/llm_function_sanity.txt | 2 +- .../test_lists/test-db/l0_h100.yml | 4 +-- 4 files changed, 21 insertions(+), 20 deletions(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 2597c8323ae..22d04b26145 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -695,25 +695,26 @@ def test_auto_dtype(self): class TestMistralSmall24B(LlmapiAccuracyTestHarness): MODEL_NAME = "mistralai/Mistral-Small-3.1-24B-Instruct-2503" + MODEL_PATH = f"{llm_models_root()}/Mistral-Small-3.1-24B-Instruct-2503" @pytest.mark.skip_less_device_memory(80000) - @pytest.mark.parametrize( - "model_path, expected_quant_algo", - [ - # Original bfloat16 model. - (f"{llm_models_root()}/Mistral-Small-3.1-24B-Instruct-2503", None), - # FP8 model. - pytest.param( - f"{llm_models_root()}/Mistral-Small-3.1-24B-Instruct-2503-fp8", - QuantAlgo.FP8, - marks=skip_pre_ada, - ), - ], - ) - def test_auto_dtype(self, model_path, expected_quant_algo): + def test_auto_dtype(self): kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) + with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm: + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + + @skip_pre_ada + @pytest.mark.skip_less_device_memory(80000) + def test_fp8(self): + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) + model_path = f"{llm_models_root()}/Mistral-Small-3.1-24B-Instruct-2503-fp8" with LLM(model_path, kv_cache_config=kv_cache_config) as llm: - assert llm.args.quant_config.quant_algo == expected_quant_algo + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 task = CnnDailymail(self.MODEL_NAME) task.evaluate(llm) task = MMLU(self.MODEL_NAME) diff --git a/tests/integration/test_lists/qa/llm_function_full.txt b/tests/integration/test_lists/qa/llm_function_full.txt index 66b110fdc56..184b91ebafc 100644 --- a/tests/integration/test_lists/qa/llm_function_full.txt +++ b/tests/integration/test_lists/qa/llm_function_full.txt @@ -454,8 +454,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[eagle3_one_model=False] accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype[/scratch.trt_llm_data/llm-models/Mistral-Small-3.1-24B-Instruct-2503-None] -accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype[/scratch.trt_llm_data/llm-models/Mistral-Small-3.1-24B-Instruct-2503-fp8-FP8] +accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype +accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8 accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True] accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True] diff --git a/tests/integration/test_lists/qa/llm_function_sanity.txt b/tests/integration/test_lists/qa/llm_function_sanity.txt index 14f04cdec9d..8be15cf469f 100644 --- a/tests/integration/test_lists/qa/llm_function_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_sanity.txt @@ -46,7 +46,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=False] accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype -accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype[/scratch.trt_llm_data/llm-models/Mistral-Small-3.1-24B-Instruct-2503-None] +accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 25ab9ccfdac..5c37c2a6553 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -205,8 +205,8 @@ l0_h100: - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype - - accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype[/scratch.trt_llm_data/llm-models/Mistral-Small-3.1-24B-Instruct-2503-None] - - accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype[/scratch.trt_llm_data/llm-models/Mistral-Small-3.1-24B-Instruct-2503-fp8-FP8] + - accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype + - accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_fp8 - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]