@@ -476,25 +476,27 @@ def test_auto_dtype_tp8(self):
         task.evaluate(llm,
                       extra_evaluator_kwargs=dict(apply_chat_template=True))
 
+    @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("eagle3_one_model", [True, False])
-    def test_eagle3_tp8(self, eagle3_one_model):
-        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
+    def test_fp8_eagle3_tp8(self, eagle3_one_model):
+        model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B"
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         spec_config = EagleDecodingConfig(max_draft_len=4,
                                           speculative_model_dir=eagle_model_dir,
                                           eagle3_one_model=eagle3_one_model)
-        pytorch_config = dict(disable_overlap_scheduler=True, )
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(max_batch_size=1))
         with LLM(model_path,
+                 max_batch_size=16,
                  tensor_parallel_size=8,
                  speculative_config=spec_config,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
     @skip_pre_hopper
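
For context, the renamed test drives the FP8 Llama-3.3-70B checkpoint with an EAGLE3 draft model through the LLM API. Below is a minimal standalone sketch of the same configuration; the import paths, the models_root placeholder, and the prompt are assumptions and not part of this diff, while the config values mirror the test above.

# Minimal sketch of the setup exercised by test_fp8_eagle3_tp8.
# Import paths and the models_root placeholder are assumptions; adjust
# them to the local tree if they differ.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

models_root = "/path/to/models"  # stand-in for llm_models_root()

# EAGLE3 draft model layered on top of the FP8 target checkpoint.
spec_config = EagleDecodingConfig(
    max_draft_len=4,
    speculative_model_dir=f"{models_root}/EAGLE3-LLaMA3.3-Instruct-70B",
    eagle3_one_model=True)

with LLM(f"{models_root}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
         max_batch_size=16,
         tensor_parallel_size=8,
         speculative_config=spec_config,
         kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.6),
         disable_overlap_scheduler=True,
         cuda_graph_config=CudaGraphConfig(max_batch_size=1)) as llm:
    # Any generation call works here; the test runs the CnnDailymail
    # accuracy harness instead of a single prompt.
    output = llm.generate("Summarize: speculative decoding uses a small "
                          "draft model to propose tokens.")
    print(output.outputs[0].text)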