Skip to content

Commit 0c7967b

Browse files
chenfeiz0326 and yuanjingx87
authored and committed
[https://nvbugs/5440241][fix] Fix 70B GSM8K Accuracy drop (#7075)
Signed-off-by: Chenfei Zhang <[email protected]>
1 parent 2f6e704 commit 0c7967b

File tree

3 files changed

+17
-9
lines changed

3 files changed

+17
-9
lines changed

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,10 +13,12 @@ meta-llama/Llama-3.3-70B-Instruct:
1313
- accuracy: 83.78
1414
- quant_algo: NVFP4
1515
kv_cache_quant_algo: FP8
16-
accuracy: 88.70
16+
accuracy: 87.33
1717
- quant_algo: FP8
1818
kv_cache_quant_algo: FP8
19-
accuracy: 84.08
19+
accuracy: 90.30
20+
- quant_algo: FP8
21+
accuracy: 90.30
2022
meta-llama/Llama-4-Maverick-17B-128E-Instruct:
2123
- accuracy: 92.20
2224
- quant_algo: FP8

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,10 +64,12 @@ meta-llama/Llama-3.3-70B-Instruct:
6464
accuracy: 81.31
6565
- quant_algo: NVFP4
6666
kv_cache_quant_algo: FP8
67-
accuracy: 79.31
67+
accuracy: 78.78
6868
- quant_algo: FP8
6969
kv_cache_quant_algo: FP8
70-
accuracy: 81.02
70+
accuracy: 80.40
71+
- quant_algo: FP8
72+
accuracy: 80.40
7173
meta-llama/Llama-4-Maverick-17B-128E-Instruct:
7274
- accuracy: 86.40
7375
- quant_algo: FP8

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model):
408408
@pytest.mark.skip_less_device(4)
409409
@skip_pre_hopper
410410
def test_fp8_tp4(self):
411-
model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
411+
model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8"
412412
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
413413
with LLM(model_path,
414414
tensor_parallel_size=4,
@@ -417,6 +417,7 @@ def test_fp8_tp4(self):
417417
kv_cache_config=kv_cache_config) as llm:
418418
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
419419
sampling_params = SamplingParams(
420+
max_tokens=256,
420421
temperature=0.0,
421422
add_special_tokens=False,
422423
)
@@ -426,16 +427,20 @@ def test_fp8_tp4(self):
426427
task.evaluate(llm, sampling_params=sampling_params)
427428
task = GPQADiamond(self.MODEL_NAME)
428429
task.evaluate(llm,
429-
sampling_params=sampling_params,
430430
extra_evaluator_kwargs=dict(apply_chat_template=True))
431431

432432
@pytest.mark.skip_less_device(4)
433433
@skip_pre_blackwell
434434
def test_nvfp4_tp4(self):
435-
model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4"
436-
with LLM(model_path, tensor_parallel_size=4) as llm:
435+
model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
436+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
437+
with LLM(model_path,
438+
tensor_parallel_size=4,
439+
max_batch_size=32,
440+
kv_cache_config=kv_cache_config) as llm:
437441
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
438442
sampling_params = SamplingParams(
443+
max_tokens=256,
439444
temperature=0.0,
440445
add_special_tokens=False,
441446
)
@@ -445,7 +450,6 @@ def test_nvfp4_tp4(self):
445450
task.evaluate(llm, sampling_params=sampling_params)
446451
task = GPQADiamond(self.MODEL_NAME)
447452
task.evaluate(llm,
448-
sampling_params=sampling_params,
449453
extra_evaluator_kwargs=dict(apply_chat_template=True))
450454

451455

0 commit comments

Comments
 (0)