@@ -336,36 +336,44 @@ def test_ngram(self):
336
336
task = GSM8K (self .MODEL_NAME )
337
337
task .evaluate (llm )
338
338
339
- @pytest .mark .parametrize ("overlap_scheduler" , [False ])
340
- def test_eagle3 (self , overlap_scheduler ):
339
+ @parametrize_with_ids ("overlap_scheduler" , [True , False ])
340
+ @parametrize_with_ids ("eagle3_one_model" , [True , False ])
341
+ def test_eagle3 (self , overlap_scheduler , eagle3_one_model ):
341
342
speculative_decoding_config = {
342
343
"decoding_type" : "Eagle" ,
343
344
"max_draft_len" : 4 ,
344
345
"speculative_model_dir" :
345
346
f"{ llm_models_root ()} /EAGLE3-LLaMA3.1-Instruct-8B" ,
346
- "eagle3_one_model" : False
347
- }
348
- kv_cache_config = {
349
- "free_gpu_memory_fraction" : 0.5 ,
350
- "enable_block_reuse" : False
347
+ "eagle3_one_model" : eagle3_one_model
351
348
}
352
349
ctx_server_config = {
353
- "disable_overlap_scheduler" : True ,
350
+ "disable_overlap_scheduler" :
351
+ True , # BS=1 does not need overlap scheduling
354
352
"speculative_config" : speculative_decoding_config ,
355
- "kv_cache_config" : kv_cache_config ,
353
+ "kv_cache_config" : {
354
+ "free_gpu_memory_fraction" : 0.5 ,
355
+ "enable_block_reuse" : True # reuse on context requests
356
+ },
356
357
"max_num_tokens" : 13393 * 2 ,
358
+ "max_batch_size" : 1 ,
357
359
"cache_transceiver_config" : {
358
360
"backend" : "default"
359
- }
361
+ },
362
+ "cuda_graph_config" : None ,
360
363
}
361
364
gen_server_config = {
362
365
"disable_overlap_scheduler" : not overlap_scheduler ,
363
366
"speculative_config" : speculative_decoding_config ,
364
- "kv_cache_config" : kv_cache_config ,
367
+ "kv_cache_config" : {
368
+ "free_gpu_memory_fraction" : 0.5 ,
369
+ "enable_block_reuse" : False
370
+ },
365
371
"max_num_tokens" : 13393 * 2 ,
372
+ "max_batch_size" : 16 ,
366
373
"cache_transceiver_config" : {
367
374
"backend" : "default"
368
- }
375
+ },
376
+ "cuda_graph_config" : None ,
369
377
}
370
378
disaggregated_server_config = {
371
379
"hostname" : "localhost" ,
0 commit comments