@@ -463,7 +463,7 @@ def test_eagle3_tp8(self, eagle3_one_model):
463
463
@pytest .mark .skip_less_device (4 )
464
464
@skip_pre_hopper
465
465
def test_fp8_tp4 (self ):
466
- model_path = f"{ llm_models_root ()} /modelopt-hf-model-hub /Llama-3.3-70B-Instruct-fp8 "
466
+ model_path = f"{ llm_models_root ()} /llama-3.3-models /Llama-3.3-70B-Instruct-FP8 "
467
467
kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.5 )
468
468
with LLM (model_path ,
469
469
tensor_parallel_size = 4 ,
@@ -472,6 +472,7 @@ def test_fp8_tp4(self):
472
472
kv_cache_config = kv_cache_config ) as llm :
473
473
assert llm .args .quant_config .quant_algo == QuantAlgo .FP8
474
474
sampling_params = SamplingParams (
475
+ max_tokens = 256 ,
475
476
temperature = 0.0 ,
476
477
add_special_tokens = False ,
477
478
)
@@ -481,16 +482,21 @@ def test_fp8_tp4(self):
481
482
task .evaluate (llm , sampling_params = sampling_params )
482
483
task = GPQADiamond (self .MODEL_NAME )
483
484
task .evaluate (llm ,
484
- sampling_params = sampling_params ,
485
485
extra_evaluator_kwargs = dict (apply_chat_template = True ))
486
486
487
487
@pytest .mark .skip_less_device (4 )
488
488
@skip_pre_blackwell
489
489
def test_nvfp4_tp4 (self ):
490
- model_path = f"{ llm_models_root ()} /modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4"
491
- with LLM (model_path , tensor_parallel_size = 4 ) as llm :
490
+ model_path = f"{ llm_models_root ()} /llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
491
+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.5 )
492
+ with LLM (model_path ,
493
+ tensor_parallel_size = 4 ,
494
+ max_seq_len = 8192 ,
495
+ max_batch_size = 32 ,
496
+ kv_cache_config = kv_cache_config ) as llm :
492
497
assert llm .args .quant_config .quant_algo == QuantAlgo .NVFP4
493
498
sampling_params = SamplingParams (
499
+ max_tokens = 256 ,
494
500
temperature = 0.0 ,
495
501
add_special_tokens = False ,
496
502
)
@@ -500,7 +506,6 @@ def test_nvfp4_tp4(self):
500
506
task .evaluate (llm , sampling_params = sampling_params )
501
507
task = GPQADiamond (self .MODEL_NAME )
502
508
task .evaluate (llm ,
503
- sampling_params = sampling_params ,
504
509
extra_evaluator_kwargs = dict (apply_chat_template = True ))
505
510
506
511
0 commit comments