Skip to content

Commit 0df32bb

Browse files
committed
Add EAGLE3 one-model accuracy tests
Signed-off-by: Jhao-Ting Chen <[email protected]>
1 parent bc2fb29 commit 0df32bb

File tree

6 files changed

+23
-11
lines changed

6 files changed

+23
-11
lines changed

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@ meta-llama/Llama-3.1-8B-Instruct:
22
- accuracy: 74.20
33
- spec_dec_algo: NGRAM
44
accuracy: 74.20
5+
- spec_dec_algo: Eagle
6+
accuracy: 74.20
57
- quant_algo: FP8
68
accuracy: 74.30
79
- quant_algo: FP8

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ meta-llama/Llama-3.1-8B:
2020
accuracy: 64.99
2121
meta-llama/Llama-3.1-8B-Instruct:
2222
- accuracy: 68.17
23-
- spec_dec_algo: EAGLE3
23+
- spec_dec_algo: Eagle
2424
accuracy: 68.20
2525
- spec_dec_algo: NGRAM
2626
accuracy: 68.17

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -265,14 +265,15 @@ def test_ngram(self):
265265
task = GSM8K(self.MODEL_NAME)
266266
task.evaluate(llm)
267267

268-
@pytest.mark.parametrize("overlap_scheduler", [False])
269-
def test_eagle3(self, overlap_scheduler):
268+
@pytest.mark.parametrize(("overlap_scheduler", "eagle3_one_model"),
269+
[(False, True), (False, False)])
270+
def test_eagle3(self, overlap_scheduler, eagle3_one_model):
270271
speculative_decoding_config = {
271272
"decoding_type": "Eagle",
272273
"max_draft_len": 4,
273274
"speculative_model_dir":
274275
f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B",
275-
"eagle3_one_model": False
276+
"eagle3_one_model": eagle3_one_model
276277
}
277278
kv_cache_config = {
278279
"free_gpu_memory_fraction": 0.5,

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -228,9 +228,11 @@ def test_fp8_beam_search(self):
228228
sampling_params=sampling_params,
229229
extra_acc_spec="beam_width=4")
230230

231-
def test_eagle3(self):
231+
@pytest.mark.parametrize(("overlap_scheduler", "eagle3_one_model"),
232+
[(False, True), (False, False)])
233+
def test_eagle3(self, overlap_scheduler, eagle3_one_model):
232234
pytorch_config = dict(
233-
disable_overlap_scheduler=True,
235+
disable_overlap_scheduler=not overlap_scheduler,
234236
cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
235237
)
236238
kv_cache_config = KvCacheConfig(enable_block_reuse=False)
@@ -240,7 +242,8 @@ def test_eagle3(self):
240242

241243
draft_len = 4
242244
spec_config = EagleDecodingConfig(max_draft_len=draft_len,
243-
speculative_model_dir=eagle_model_dir)
245+
speculative_model_dir=eagle_model_dir,
246+
eagle3_one_model=eagle3_one_model)
244247

245248
with LLM(model=target_model_dir,
246249
**pytorch_config,
@@ -249,6 +252,8 @@ def test_eagle3(self):
249252
build_config=None) as llm:
250253
task = MMLU(self.MODEL_NAME)
251254
task.evaluate(llm)
255+
task = GSM8K(self.MODEL_NAME)
256+
task.evaluate(llm)
252257

253258
def test_ngram(self):
254259
pytorch_config = dict(disable_overlap_scheduler=True)
@@ -1641,9 +1646,11 @@ def test_bf16(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
16411646
task = MMLU(self.MODEL_NAME)
16421647
task.evaluate(llm)
16431648

1644-
def test_eagle3(self):
1649+
@pytest.mark.parametrize(("overlap_scheduler", "eagle3_one_model"),
1650+
[(False, True), (False, False)])
1651+
def test_eagle3(self, overlap_scheduler, eagle3_one_model):
16451652
pytorch_config = dict(
1646-
disable_overlap_scheduler=True,
1653+
disable_overlap_scheduler=not overlap_scheduler,
16471654
cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
16481655
)
16491656
kv_cache_config = KvCacheConfig(enable_block_reuse=False)
@@ -1653,7 +1660,8 @@ def test_eagle3(self):
16531660

16541661
draft_len = 4
16551662
spec_config = EagleDecodingConfig(max_draft_len=draft_len,
1656-
speculative_model_dir=eagle_model_dir)
1663+
speculative_model_dir=eagle_model_dir,
1664+
eagle3_one_model=eagle3_one_model)
16571665

16581666
llm = LLM(model=target_model_dir,
16591667
**pytorch_config,

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ l0_dgx_h100:
3535
- accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[False]
3636
- accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True]
3737
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
38-
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[False]
38+
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3
3939
- test_e2e.py::test_ptp_quickstart_advanced_bs1
4040
- condition:
4141
ranges:

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ l0_h100:
3434
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=False-attn_backend=TRTLLM-torch_compile=True]
3535
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
3636
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=TRTLLM-torch_compile=True]
37+
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
3738
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
3839
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=eagle-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
3940
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]

0 commit comments

Comments (0)