@@ -408,7 +408,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model):
408
408
@pytest .mark .skip_less_device (4 )
409
409
@skip_pre_hopper
410
410
def test_fp8_tp4 (self ):
411
- model_path = f"{ llm_models_root ()} /modelopt-hf-model-hub /Llama-3.3-70B-Instruct-fp8 "
411
+ model_path = f"{ llm_models_root ()} /llama-3.3-models /Llama-3.3-70B-Instruct-FP8 "
412
412
kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.5 )
413
413
with LLM (model_path ,
414
414
tensor_parallel_size = 4 ,
@@ -417,6 +417,7 @@ def test_fp8_tp4(self):
417
417
kv_cache_config = kv_cache_config ) as llm :
418
418
assert llm .args .quant_config .quant_algo == QuantAlgo .FP8
419
419
sampling_params = SamplingParams (
420
+ max_tokens = 256 ,
420
421
temperature = 0.0 ,
421
422
add_special_tokens = False ,
422
423
)
@@ -426,16 +427,20 @@ def test_fp8_tp4(self):
426
427
task .evaluate (llm , sampling_params = sampling_params )
427
428
task = GPQADiamond (self .MODEL_NAME )
428
429
task .evaluate (llm ,
429
- sampling_params = sampling_params ,
430
430
extra_evaluator_kwargs = dict (apply_chat_template = True ))
431
431
432
432
@pytest .mark .skip_less_device (4 )
433
433
@skip_pre_blackwell
434
434
def test_nvfp4_tp4 (self ):
435
- model_path = f"{ llm_models_root ()} /modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4"
436
- with LLM (model_path , tensor_parallel_size = 4 ) as llm :
435
+ model_path = f"{ llm_models_root ()} /llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
436
+ kv_cache_config = KvCacheConfig (free_gpu_memory_fraction = 0.5 )
437
+ with LLM (model_path ,
438
+ tensor_parallel_size = 4 ,
439
+ max_batch_size = 32 ,
440
+ kv_cache_config = kv_cache_config ) as llm :
437
441
assert llm .args .quant_config .quant_algo == QuantAlgo .NVFP4
438
442
sampling_params = SamplingParams (
443
+ max_tokens = 256 ,
439
444
temperature = 0.0 ,
440
445
add_special_tokens = False ,
441
446
)
@@ -445,7 +450,6 @@ def test_nvfp4_tp4(self):
445
450
task .evaluate (llm , sampling_params = sampling_params )
446
451
task = GPQADiamond (self .MODEL_NAME )
447
452
task .evaluate (llm ,
448
- sampling_params = sampling_params ,
449
453
extra_evaluator_kwargs = dict (apply_chat_template = True ))
450
454
451
455
0 commit comments