Commit e82d393

jhaotingc authored and lancelly committed
[None][infra] add eagle3 one model accuracy tests (NVIDIA#6264)
Signed-off-by: Jhao-Ting Chen <[email protected]>
Signed-off-by: Lanyu Liao <[email protected]>
1 parent 64e0377 commit e82d393

File tree

9 files changed (+47, -22 lines)


tests/integration/defs/accuracy/references/gsm8k.yaml
Lines changed: 2 additions & 0 deletions

@@ -2,6 +2,8 @@ meta-llama/Llama-3.1-8B-Instruct:
 - accuracy: 74.20
 - spec_dec_algo: NGram
   accuracy: 74.20
+- spec_dec_algo: Eagle
+  accuracy: 74.20
 - quant_algo: FP8
   accuracy: 74.30
 - quant_algo: FP8
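
For orientation: entries under a model key in gsm8k.yaml pair an expected accuracy with optional selector keys (spec_dec_algo, quant_algo), and the accuracy harness picks the entry whose selectors match the run. The helper below is a hypothetical, minimal re-implementation of that lookup for illustration only; the real code lives in tests/integration/defs/accuracy and may match entries differently.

import yaml

# Hypothetical stand-in for the accuracy harness's reference lookup.
REFERENCES = yaml.safe_load("""
meta-llama/Llama-3.1-8B-Instruct:
- accuracy: 74.20
- spec_dec_algo: Eagle
  accuracy: 74.20
""")


def lookup_reference(model, **specs):
    """Return the reference accuracy whose selector keys equal `specs`."""
    for entry in REFERENCES[model]:
        selectors = {k: v for k, v in entry.items() if k != "accuracy"}
        if selectors == specs:
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with {specs}")


# The new Eagle row added above resolves like this:
assert lookup_reference("meta-llama/Llama-3.1-8B-Instruct",
                        spec_dec_algo="Eagle") == 74.20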

tests/integration/defs/accuracy/test_disaggregated_serving.py
Lines changed: 20 additions & 12 deletions

@@ -336,36 +336,44 @@ def test_ngram(self):
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)

-    @pytest.mark.parametrize("overlap_scheduler", [False])
-    def test_eagle3(self, overlap_scheduler):
+    @parametrize_with_ids("overlap_scheduler", [True, False])
+    @parametrize_with_ids("eagle3_one_model", [True, False])
+    def test_eagle3(self, overlap_scheduler, eagle3_one_model):
         speculative_decoding_config = {
             "decoding_type": "Eagle",
             "max_draft_len": 4,
             "speculative_model_dir":
             f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B",
-            "eagle3_one_model": False
-        }
-        kv_cache_config = {
-            "free_gpu_memory_fraction": 0.5,
-            "enable_block_reuse": False
+            "eagle3_one_model": eagle3_one_model
         }
         ctx_server_config = {
-            "disable_overlap_scheduler": True,
+            "disable_overlap_scheduler":
+            True,  # BS=1 does not need overlap scheduling
             "speculative_config": speculative_decoding_config,
-            "kv_cache_config": kv_cache_config,
+            "kv_cache_config": {
+                "free_gpu_memory_fraction": 0.5,
+                "enable_block_reuse": True  # reuse on context requests
+            },
             "max_num_tokens": 13393 * 2,
+            "max_batch_size": 1,
             "cache_transceiver_config": {
                 "backend": "default"
-            }
+            },
+            "cuda_graph_config": None,
         }
         gen_server_config = {
             "disable_overlap_scheduler": not overlap_scheduler,
             "speculative_config": speculative_decoding_config,
-            "kv_cache_config": kv_cache_config,
+            "kv_cache_config": {
+                "free_gpu_memory_fraction": 0.5,
+                "enable_block_reuse": False
+            },
             "max_num_tokens": 13393 * 2,
+            "max_batch_size": 16,
             "cache_transceiver_config": {
                 "backend": "default"
-            }
+            },
+            "cuda_graph_config": None,
         }
         disaggregated_server_config = {
             "hostname": "localhost",

tests/integration/defs/accuracy/test_llm_api_pytorch.py
Lines changed: 15 additions & 5 deletions

@@ -238,19 +238,27 @@ def test_fp8_beam_search(self):
                       extra_acc_spec="beam_width=4")

     @skip_pre_hopper
-    def test_eagle3(self):
+    @parametrize_with_ids("overlap_scheduler", [True, False])
+    @parametrize_with_ids("eagle3_one_model", [True, False])
+    def test_eagle3(self, overlap_scheduler, eagle3_one_model):
         pytorch_config = dict(
-            disable_overlap_scheduler=True,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
+            max_batch_size=
+            1,  # add max_batch_size to avoid error in overlap scheduler
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig(max_batch_size=1,
+                                              enable_padding=True),
         )
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+        kv_cache_config = KvCacheConfig(
+            enable_block_reuse=True
+        )  # both one-model and two-model support this feature

         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B"
         target_model_dir = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"

         draft_len = 4
         spec_config = EagleDecodingConfig(max_draft_len=draft_len,
-                                          speculative_model_dir=eagle_model_dir)
+                                          speculative_model_dir=eagle_model_dir,
+                                          eagle3_one_model=eagle3_one_model)

         with LLM(model=target_model_dir,
                  **pytorch_config,
@@ -259,6 +267,8 @@ def test_eagle3(self):
                  build_config=None) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)

     @skip_pre_hopper
     def test_ngram(self):
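
The same one-model/two-model toggle can be exercised outside the harness through the LLM API. Below is a hedged sketch mirroring the test's configuration; the model paths are placeholders, and the import locations follow recent TensorRT-LLM releases and may differ in other versions:

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

spec_config = EagleDecodingConfig(
    max_draft_len=4,
    speculative_model_dir="/models/EAGLE3-LLaMA3.1-Instruct-8B",  # placeholder
    eagle3_one_model=True,  # False selects the separate draft/target path
)

llm = LLM(
    model="/models/Llama-3.1-8B-Instruct",  # placeholder
    max_batch_size=1,
    disable_overlap_scheduler=False,  # overlap scheduling on, as parametrized
    cuda_graph_config=CudaGraphConfig(max_batch_size=1, enable_padding=True),
    kv_cache_config=KvCacheConfig(enable_block_reuse=True),
    speculative_config=spec_config,
)

for output in llm.generate(["The capital of France is"]):
    print(output.outputs[0].text)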

tests/integration/test_lists/qa/examples_test_list.txt
Lines changed: 2 additions & 1 deletion

@@ -441,7 +441,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]

tests/integration/test_lists/qa/llm_release_digits_func.txt
Lines changed: 2 additions & 1 deletion

@@ -19,5 +19,6 @@ test_e2e.py::test_ptp_quickstart_multimodal[NVILA-15B-FP16-NVILA-15B-image-False
 test_e2e.py::test_ptp_quickstart_multimodal[NVILA-15B-FP16-NVILA-15B-video-False]

 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram

tests/integration/test_lists/qa/llm_sanity_test.txt
Lines changed: 2 additions & 1 deletion

@@ -21,7 +21,8 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]

tests/integration/test_lists/test-db/l0_dgx_h100.yml
Lines changed: 2 additions & 1 deletion

@@ -38,9 +38,10 @@ l0_dgx_h100:
   - accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
   - accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
-  - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[False]
   - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False]
   - accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True]
+  - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
+  - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp1pp2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[MMLU-tp1pp2]
   - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_tp_pp_symmetric[GSM8K-tp2pp1]

tests/integration/test_lists/test-db/l0_h100.yml
Lines changed: 2 additions & 0 deletions

@@ -34,6 +34,8 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=False-attn_backend=TRTLLM-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=TRTLLM-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
+  - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=eagle-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]

tests/integration/test_lists/waives.txt
Lines changed: 0 additions & 1 deletion

@@ -251,7 +251,6 @@ test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-
 test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-False] SKIP (https://nvbugs/5401114)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5401233)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_2gpu[recurrentgemma-2b] SKIP (https://nvbugs/5401233)
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3 SKIP (https://nvbugs/5409414)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search SKIP (https://nvbugs/5409415)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram SKIP (https://nvbugs/5409414)
 test_e2e.py::test_openai_multi_chat_example SKIP (https://nvbugs/5409416)
