Skip to content

Commit 0df32bb

Browse files
committed
Add EAGLE3 one-model accuracy tests
Signed-off-by: Jhao-Ting Chen <[email protected]>
1 parent bc2fb29 commit 0df32bb

File tree

6 files changed

+23
-11
lines changed

6 files changed

+23
-11
lines changed

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ meta-llama/Llama-3.1-8B-Instruct:
22
- accuracy: 74.20
33
- spec_dec_algo: NGRAM
44
accuracy: 74.20
5+
- spec_dec_algo: Eagle
6+
accuracy: 74.20
57
- quant_algo: FP8
68
accuracy: 74.30
79
- quant_algo: FP8

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ meta-llama/Llama-3.1-8B:
2020
accuracy: 64.99
2121
meta-llama/Llama-3.1-8B-Instruct:
2222
- accuracy: 68.17
23-
- spec_dec_algo: EAGLE3
23+
- spec_dec_algo: Eagle
2424
accuracy: 68.20
2525
- spec_dec_algo: NGRAM
2626
accuracy: 68.17

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -265,14 +265,15 @@ def test_ngram(self):
265265
task = GSM8K(self.MODEL_NAME)
266266
task.evaluate(llm)
267267

268-
@pytest.mark.parametrize("overlap_scheduler", [False])
269-
def test_eagle3(self, overlap_scheduler):
268+
@pytest.mark.parametrize(("overlap_scheduler", "eagle3_one_model"),
269+
[(False, True), (False, False)])
270+
def test_eagle3(self, overlap_scheduler, eagle3_one_model):
270271
speculative_decoding_config = {
271272
"decoding_type": "Eagle",
272273
"max_draft_len": 4,
273274
"speculative_model_dir":
274275
f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B",
275-
"eagle3_one_model": False
276+
"eagle3_one_model": eagle3_one_model
276277
}
277278
kv_cache_config = {
278279
"free_gpu_memory_fraction": 0.5,

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -228,9 +228,11 @@ def test_fp8_beam_search(self):
228228
sampling_params=sampling_params,
229229
extra_acc_spec="beam_width=4")
230230

231-
def test_eagle3(self):
231+
@pytest.mark.parametrize(("overlap_scheduler", "eagle3_one_model"),
232+
[(False, True), (False, False)])
233+
def test_eagle3(self, overlap_scheduler, eagle3_one_model):
232234
pytorch_config = dict(
233-
disable_overlap_scheduler=True,
235+
disable_overlap_scheduler=not overlap_scheduler,
234236
cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
235237
)
236238
kv_cache_config = KvCacheConfig(enable_block_reuse=False)
@@ -240,7 +242,8 @@ def test_eagle3(self):
240242

241243
draft_len = 4
242244
spec_config = EagleDecodingConfig(max_draft_len=draft_len,
243-
speculative_model_dir=eagle_model_dir)
245+
speculative_model_dir=eagle_model_dir,
246+
eagle3_one_model=eagle3_one_model)
244247

245248
with LLM(model=target_model_dir,
246249
**pytorch_config,
@@ -249,6 +252,8 @@ def test_eagle3(self):
249252
build_config=None) as llm:
250253
task = MMLU(self.MODEL_NAME)
251254
task.evaluate(llm)
255+
task = GSM8K(self.MODEL_NAME)
256+
task.evaluate(llm)
252257

253258
def test_ngram(self):
254259
pytorch_config = dict(disable_overlap_scheduler=True)
@@ -1641,9 +1646,11 @@ def test_bf16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
16411646
task = MMLU(self.MODEL_NAME)
16421647
task.evaluate(llm)
16431648

1644-
def test_eagle3(self):
1649+
@pytest.mark.parametrize(("overlap_scheduler", "eagle3_one_model"),
1650+
[(False, True), (False, False)])
1651+
def test_eagle3(self, overlap_scheduler, eagle3_one_model):
16451652
pytorch_config = dict(
1646-
disable_overlap_scheduler=True,
1653+
disable_overlap_scheduler=not overlap_scheduler,
16471654
cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
16481655
)
16491656
kv_cache_config = KvCacheConfig(enable_block_reuse=False)
@@ -1653,7 +1660,8 @@ def test_eagle3(self):
16531660

16541661
draft_len = 4
16551662
spec_config = EagleDecodingConfig(max_draft_len=draft_len,
1656-
speculative_model_dir=eagle_model_dir)
1663+
speculative_model_dir=eagle_model_dir,
1664+
eagle3_one_model=eagle3_one_model)
16571665

16581666
llm = LLM(model=target_model_dir,
16591667
**pytorch_config,

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ l0_dgx_h100:
3535
- accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
3636
- accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
3737
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
38-
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[False]
38+
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3
3939
- test_e2e.py::test_ptp_quickstart_advanced_bs1
4040
- condition:
4141
ranges:

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ l0_h100:
3434
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=False-attn_backend=TRTLLM-torch_compile=True]
3535
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
3636
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=TRTLLM-torch_compile=True]
37+
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
3738
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
3839
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=eagle-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
3940
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]

0 commit comments

Comments (0)