@@ -480,25 +480,27 @@ def test_auto_dtype_tp8(self):
         task.evaluate(llm,
                       extra_evaluator_kwargs=dict(apply_chat_template=True))
 
+    @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("eagle3_one_model", [True, False])
-    def test_eagle3_tp8(self, eagle3_one_model):
-        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
+    def test_fp8_eagle3_tp8(self, eagle3_one_model):
+        model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B"
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         spec_config = EagleDecodingConfig(max_draft_len=4,
                                           speculative_model_dir=eagle_model_dir,
                                           eagle3_one_model=eagle3_one_model)
-        pytorch_config = dict(disable_overlap_scheduler=True, )
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(max_batch_size=1))
         with LLM(model_path,
+                 max_batch_size=16,
                  tensor_parallel_size=8,
                  speculative_config=spec_config,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
     @skip_pre_hopper