@@ -543,25 +543,27 @@ def test_auto_dtype_tp8(self):
         task.evaluate(llm,
                       extra_evaluator_kwargs=dict(apply_chat_template=True))
 
+    @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
     @parametrize_with_ids("eagle3_one_model", [True, False])
-    def test_eagle3_tp8(self, eagle3_one_model):
-        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct"
+    def test_fp8_eagle3_tp8(self, eagle3_one_model):
+        model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.3-Instruct-70B"
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
         spec_config = EagleDecodingConfig(max_draft_len=4,
                                           speculative_model_dir=eagle_model_dir,
                                           eagle3_one_model=eagle3_one_model)
-        pytorch_config = dict(disable_overlap_scheduler=True, )
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(max_batch_size=1))
         with LLM(model_path,
+                 max_batch_size=16,
                  tensor_parallel_size=8,
                  speculative_config=spec_config,
                  kv_cache_config=kv_cache_config,
                  **pytorch_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
 
     @pytest.mark.skip_less_device(4)
     @skip_pre_hopper
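For reference, a minimal standalone sketch of the configuration the renamed test now exercises: an FP8-quantized Llama 3.3 70B target with an EAGLE3 draft model, overlap scheduling disabled, and CUDA graphs capped at batch size 1. The model paths are placeholders, and the imports assume the tensorrt_llm.llmapi names already used by this test file; the prompt loop stands in for the accuracy-task evaluation.

# Illustrative sketch only; paths are placeholders, not real model locations.
from tensorrt_llm.llmapi import (LLM, CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

model_path = "<models-root>/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
eagle_model_dir = "<models-root>/EAGLE3-LLaMA3.3-Instruct-70B"

# Cap the KV cache at 60% of free GPU memory so the FP8 target weights,
# the EAGLE3 draft model, and CUDA graph pools fit alongside it.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)

# EAGLE3 speculative decoding: draft up to 4 tokens per step;
# eagle3_one_model toggles the one-model vs. two-model variant,
# which the test sweeps via parametrize_with_ids.
spec_config = EagleDecodingConfig(max_draft_len=4,
                                  speculative_model_dir=eagle_model_dir,
                                  eagle3_one_model=True)

with LLM(model_path,
         max_batch_size=16,
         tensor_parallel_size=8,
         speculative_config=spec_config,
         kv_cache_config=kv_cache_config,
         disable_overlap_scheduler=True,
         cuda_graph_config=CudaGraphConfig(max_batch_size=1)) as llm:
    for output in llm.generate(["The capital of France is"]):
        print(output.outputs[0].text)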