Skip to content

Commit 0c7967b

Browse files
chenfeiz0326 and yuanjingx87
authored and committed
[https://nvbugs/5440241][fix] Fix 70B GSM8K Accuracy drop (#7075)
Signed-off-by: Chenfei Zhang <[email protected]>
1 parent 2f6e704 commit 0c7967b

File tree

3 files changed

+17
-9
lines changed

3 files changed

+17
-9
lines changed

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,12 @@ meta-llama/Llama-3.3-70B-Instruct:
1313
- accuracy: 83.78
1414
- quant_algo: NVFP4
1515
kv_cache_quant_algo: FP8
16-
accuracy: 88.70
16+
accuracy: 87.33
1717
- quant_algo: FP8
1818
kv_cache_quant_algo: FP8
19-
accuracy: 84.08
19+
accuracy: 90.30
20+
- quant_algo: FP8
21+
accuracy: 90.30
2022
meta-llama/Llama-4-Maverick-17B-128E-Instruct:
2123
- accuracy: 92.20
2224
- quant_algo: FP8

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,12 @@ meta-llama/Llama-3.3-70B-Instruct:
6464
accuracy: 81.31
6565
- quant_algo: NVFP4
6666
kv_cache_quant_algo: FP8
67-
accuracy: 79.31
67+
accuracy: 78.78
6868
- quant_algo: FP8
6969
kv_cache_quant_algo: FP8
70-
accuracy: 81.02
70+
accuracy: 80.40
71+
- quant_algo: FP8
72+
accuracy: 80.40
7173
meta-llama/Llama-4-Maverick-17B-128E-Instruct:
7274
- accuracy: 86.40
7375
- quant_algo: FP8

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model):
408408
@pytest.mark.skip_less_device(4)
409409
@skip_pre_hopper
410410
def test_fp8_tp4(self):
411-
model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
411+
model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8"
412412
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
413413
with LLM(model_path,
414414
tensor_parallel_size=4,
@@ -417,6 +417,7 @@ def test_fp8_tp4(self):
417417
kv_cache_config=kv_cache_config) as llm:
418418
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
419419
sampling_params = SamplingParams(
420+
max_tokens=256,
420421
temperature=0.0,
421422
add_special_tokens=False,
422423
)
@@ -426,16 +427,20 @@ def test_fp8_tp4(self):
426427
task.evaluate(llm, sampling_params=sampling_params)
427428
task = GPQADiamond(self.MODEL_NAME)
428429
task.evaluate(llm,
429-
sampling_params=sampling_params,
430430
extra_evaluator_kwargs=dict(apply_chat_template=True))
431431

432432
@pytest.mark.skip_less_device(4)
433433
@skip_pre_blackwell
434434
def test_nvfp4_tp4(self):
435-
model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4"
436-
with LLM(model_path, tensor_parallel_size=4) as llm:
435+
model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
436+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
437+
with LLM(model_path,
438+
tensor_parallel_size=4,
439+
max_batch_size=32,
440+
kv_cache_config=kv_cache_config) as llm:
437441
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
438442
sampling_params = SamplingParams(
443+
max_tokens=256,
439444
temperature=0.0,
440445
add_special_tokens=False,
441446
)
@@ -445,7 +450,6 @@ def test_nvfp4_tp4(self):
445450
task.evaluate(llm, sampling_params=sampling_params)
446451
task = GPQADiamond(self.MODEL_NAME)
447452
task.evaluate(llm,
448-
sampling_params=sampling_params,
449453
extra_evaluator_kwargs=dict(apply_chat_template=True))
450454

451455

0 commit comments

Comments
 (0)