From c2c9b1844c3a9e3e34dc79e89c43edf05d4801e7 Mon Sep 17 00:00:00 2001 From: Jhao-Ting Chen Date: Thu, 31 Jul 2025 12:39:43 -0700 Subject: [PATCH 1/2] add eagle3 one model disagg/agg accuracy tests Signed-off-by: Jhao-Ting Chen --- .../defs/accuracy/references/gsm8k.yaml | 2 ++ .../accuracy/test_disaggregated_serving.py | 28 +++++++++++-------- .../defs/accuracy/test_llm_api_pytorch.py | 20 +++++++++---- .../test_lists/qa/examples_test_list.txt | 3 +- .../test_lists/qa/llm_release_digits_func.txt | 3 +- .../test_lists/qa/llm_sanity_test.txt | 3 +- .../test_lists/test-db/l0_dgx_h100.yml | 3 +- .../test_lists/test-db/l0_h100.yml | 2 ++ tests/integration/test_lists/waives.txt | 1 - 9 files changed, 44 insertions(+), 21 deletions(-) diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index 5c942ed41b0..0bc8f96446e 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -2,6 +2,8 @@ meta-llama/Llama-3.1-8B-Instruct: - accuracy: 74.20 - spec_dec_algo: NGram accuracy: 74.20 + - spec_dec_algo: Eagle + accuracy: 74.20 - quant_algo: FP8 accuracy: 74.30 - quant_algo: FP8 diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index fa9ab908976..0ff2eea2d7c 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -336,24 +336,26 @@ def test_ngram(self): task = GSM8K(self.MODEL_NAME) task.evaluate(llm) - @pytest.mark.parametrize("overlap_scheduler", [False]) - def test_eagle3(self, overlap_scheduler): + @parametrize_with_ids("overlap_scheduler", [True, False]) + @parametrize_with_ids("eagle3_one_model", [True, False]) + def test_eagle3(self, overlap_scheduler, eagle3_one_model): speculative_decoding_config = { "decoding_type": "Eagle", "max_draft_len": 4, "speculative_model_dir": f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B", - "eagle3_one_model": False - } - kv_cache_config = { - "free_gpu_memory_fraction": 0.5, - "enable_block_reuse": False + "eagle3_one_model": eagle3_one_model } ctx_server_config = { - "disable_overlap_scheduler": True, + "disable_overlap_scheduler": + True, # BS=1 does not need overlap scheduling "speculative_config": speculative_decoding_config, - "kv_cache_config": kv_cache_config, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.5, + "enable_block_reuse": True # reuse on context requests + }, "max_num_tokens": 13393 * 2, + "max_batch_size": 1, "cache_transceiver_config": { "backend": "default" } @@ -361,8 +363,12 @@ def test_eagle3(self, overlap_scheduler): gen_server_config = { "disable_overlap_scheduler": not overlap_scheduler, "speculative_config": speculative_decoding_config, - "kv_cache_config": kv_cache_config, - "max_num_tokens": 13393 * 2, + "kv_cache_config": { + "free_gpu_memory_fraction": 0.5, + "enable_block_reuse": False + }, + "max_num_tokens": 20, # BS * (draft token + 1) + "max_batch_size": 4, "cache_transceiver_config": { "backend": "default" } diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index b58d0017fe2..d367b30c20d 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -238,19 +238,27 @@ def test_fp8_beam_search(self): extra_acc_spec="beam_width=4") @skip_pre_hopper - def test_eagle3(self): + @parametrize_with_ids("overlap_scheduler", [True, False]) + @parametrize_with_ids("eagle3_one_model", [True, False]) + def test_eagle3(self, overlap_scheduler, eagle3_one_model): pytorch_config = dict( - disable_overlap_scheduler=True, - cuda_graph_config=CudaGraphConfig(batch_sizes=[1]), + max_batch_size= + 1, # add max_batch_size to avoid error in overlap scheduler + disable_overlap_scheduler=not overlap_scheduler, + cuda_graph_config=CudaGraphConfig(max_batch_size=1, + enable_padding=True), ) - kv_cache_config = KvCacheConfig(enable_block_reuse=False) + kv_cache_config = KvCacheConfig( + enable_block_reuse=True + ) # both one-model and two-model supports this feature eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B" target_model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct" draft_len = 4 spec_config = EagleDecodingConfig(max_draft_len=draft_len, - speculative_model_dir=eagle_model_dir) + speculative_model_dir=eagle_model_dir, + eagle3_one_model=eagle3_one_model) with LLM(model=target_model_dir, **pytorch_config, @@ -259,6 +267,8 @@ def test_eagle3(self): build_config=None) as llm: task = MMLU(self.MODEL_NAME) task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) @skip_pre_hopper def test_ngram(self): diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt index 6c79e873b74..993d9445cf6 100644 --- a/tests/integration/test_lists/qa/examples_test_list.txt +++ b/tests/integration/test_lists/qa/examples_test_list.txt @@ -441,7 +441,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3 +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance] diff --git a/tests/integration/test_lists/qa/llm_release_digits_func.txt b/tests/integration/test_lists/qa/llm_release_digits_func.txt index 4be82d4925c..fcf0b2bdc40 100644 --- a/tests/integration/test_lists/qa/llm_release_digits_func.txt +++ b/tests/integration/test_lists/qa/llm_release_digits_func.txt @@ -19,5 +19,6 @@ test_e2e.py::test_ptp_quickstart_multimodal[NVILA-15B-FP16-NVILA-15B-image-False test_e2e.py::test_ptp_quickstart_multimodal[NVILA-15B-FP16-NVILA-15B-video-False] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3 +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram diff --git a/tests/integration/test_lists/qa/llm_sanity_test.txt b/tests/integration/test_lists/qa/llm_sanity_test.txt index 4d3701166ea..59fdacb96cd 100644 --- a/tests/integration/test_lists/qa/llm_sanity_test.txt +++ b/tests/integration/test_lists/qa/llm_sanity_test.txt @@ -21,7 +21,8 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER] -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3 +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] +accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance] diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index fd6b7387235..99fa084bda5 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -38,9 +38,10 @@ l0_dgx_h100: - accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False] - accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram - - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[False] - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] + - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] + - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2] - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1] diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index c8049a689be..9100418517c 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -34,6 +34,8 @@ l0_h100: - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=False-attn_backend=TRTLLM-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=TRTLLM-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=TRTLLM-torch_compile=True] + - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] + - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=eagle-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True] diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 2e1ae548762..7849266fe09 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -251,7 +251,6 @@ test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it- test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-False] SKIP (https://nvbugs/5401114) examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5401233) examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233) -accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3 SKIP (https://nvbugs/5409414) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search SKIP (https://nvbugs/5409415) accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5409414) test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5409416) From c977d2a1e8a449eb7f7e92c3a3d24bcd5b90213a Mon Sep 17 00:00:00 2001 From: Jhao-Ting Chen Date: Fri, 1 Aug 2025 15:13:12 -0700 Subject: [PATCH 2/2] fix disagg test Signed-off-by: Jhao-Ting Chen --- .../defs/accuracy/test_disaggregated_serving.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py index 0ff2eea2d7c..788cf581802 100644 --- a/tests/integration/defs/accuracy/test_disaggregated_serving.py +++ b/tests/integration/defs/accuracy/test_disaggregated_serving.py @@ -358,7 +358,8 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model): "max_batch_size": 1, "cache_transceiver_config": { "backend": "default" - } + }, + "cuda_graph_config": None, } gen_server_config = { "disable_overlap_scheduler": not overlap_scheduler, @@ -367,11 +368,12 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model): "free_gpu_memory_fraction": 0.5, "enable_block_reuse": False }, - "max_num_tokens": 20, # BS * (draft token + 1) - "max_batch_size": 4, + "max_num_tokens": 13393 * 2, + "max_batch_size": 16, "cache_transceiver_config": { "backend": "default" - } + }, + "cuda_graph_config": None, } disaggregated_server_config = { "hostname": "localhost",