Commit bb15eb9

Fix accuracy drop of gsm8k
Signed-off-by: Chenfei Zhang <[email protected]>
1 parent 55f4f2d commit bb15eb9

File tree

4 files changed: +19 -11 lines changed

tensorrt_llm/executor/executor.py

Lines changed: 3 additions & 0 deletions
@@ -134,6 +134,9 @@ def generate_async(
         if postproc_params:
             postproc_params.postproc_args.num_prompt_tokens = len(
                 prompt_token_ids)
+
+        print(f"[CF][generate_async] sampling_params is ")
+        print(sampling_params)
         request = GenerationRequest(
             prompt_token_ids,
             sampling_params=sampling_params,
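
The two added print calls are debug instrumentation: they dump the SamplingParams object a request carries just before the GenerationRequest is built. A minimal sketch of what they surface, assuming a local tensorrt_llm install (the field values mirror the test changes below):

from tensorrt_llm import SamplingParams

# The greedy-decoding setup the accuracy tests use; printing the object
# shows every field that generate_async forwards into the GenerationRequest.
sampling_params = SamplingParams(
    max_tokens=256,          # cap on generated tokens per request
    temperature=0.0,         # greedy decoding for reproducible accuracy runs
    add_special_tokens=False,
)
print("[CF][generate_async] sampling_params is")
print(sampling_params)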

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 3 additions & 3 deletions
@@ -13,12 +13,12 @@ meta-llama/Llama-3.3-70B-Instruct:
   - accuracy: 83.78
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
-    accuracy: 88.70
+    accuracy: 87.33
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 84.08
+    accuracy: 90.30
   - quant_algo: FP8
-    accuracy: 84.08
+    accuracy: 90.30
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 92.20
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
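
Each reference file is keyed by model name, with one list entry per quantization configuration. A hypothetical sketch of querying such a file (lookup_reference is illustrative, not the repo's actual harness; requires PyYAML):

import yaml

# Excerpt mirroring the updated gsm8k.yaml references.
REFERENCE_YAML = """
meta-llama/Llama-3.3-70B-Instruct:
- accuracy: 83.78
- quant_algo: NVFP4
  kv_cache_quant_algo: FP8
  accuracy: 87.33
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 90.30
- quant_algo: FP8
  accuracy: 90.30
"""

def lookup_reference(model, quant_algo=None, kv_cache_quant_algo=None):
    # Return the reference accuracy for a (model, quantization) pair;
    # an entry with no quant keys is the unquantized baseline.
    for entry in yaml.safe_load(REFERENCE_YAML)[model]:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with {quant_algo}")

assert lookup_reference("meta-llama/Llama-3.3-70B-Instruct",
                        quant_algo="FP8",
                        kv_cache_quant_algo="FP8") == 90.30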

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 3 additions & 3 deletions
@@ -63,12 +63,12 @@ meta-llama/Llama-3.3-70B-Instruct:
     accuracy: 81.31
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
-    accuracy: 79.31
+    accuracy: 78.78
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 81.02
+    accuracy: 80.40
   - quant_algo: FP8
-    accuracy: 80.34
+    accuracy: 80.40
 meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 86.40
   - quant_algo: FP8
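
These references act as pass/fail thresholds for the accuracy suite, so stale values fail correct runs. A hypothetical sketch of the kind of comparison they feed (the tolerance logic is illustrative, not the repo's actual check):

def passes_reference(measured: float, reference: float,
                     rel_tol: float = 0.01) -> bool:
    # Pass when the measured score is at most rel_tol below the
    # reference; scores above the reference always pass.
    return measured >= reference * (1.0 - rel_tol)

# Against the old GSM8K NVFP4 reference of 88.70, a run scoring 87.33 fails;
# against the corrected reference of 87.33 it passes.
assert not passes_reference(87.33, 88.70)
assert passes_reference(87.33, 87.33)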

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 10 additions & 5 deletions
@@ -463,7 +463,7 @@ def test_eagle3_tp8(self, eagle3_one_model):
     @pytest.mark.skip_less_device(4)
     @skip_pre_hopper
     def test_fp8_tp4(self):
-        model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8"
+        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP8"
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
         with LLM(model_path,
                  tensor_parallel_size=4,
@@ -472,6 +472,7 @@ def test_fp8_tp4(self):
                  kv_cache_config=kv_cache_config) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
             sampling_params = SamplingParams(
+                max_tokens=256,
                 temperature=0.0,
                 add_special_tokens=False,
             )
@@ -481,16 +482,21 @@ def test_fp8_tp4(self):
             task.evaluate(llm, sampling_params=sampling_params)
             task = GPQADiamond(self.MODEL_NAME)
             task.evaluate(llm,
-                          sampling_params=sampling_params,
                           extra_evaluator_kwargs=dict(apply_chat_template=True))

     @pytest.mark.skip_less_device(4)
     @skip_pre_blackwell
     def test_nvfp4_tp4(self):
-        model_path = f"{llm_models_root()}/modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4"
-        with LLM(model_path, tensor_parallel_size=4) as llm:
+        model_path = f"{llm_models_root()}/llama-3.3-models/Llama-3.3-70B-Instruct-FP4"
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        with LLM(model_path,
+                 tensor_parallel_size=4,
+                 max_seq_len=8192,
+                 max_batch_size=32,
+                 kv_cache_config=kv_cache_config) as llm:
             assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
             sampling_params = SamplingParams(
+                max_tokens=256,
                 temperature=0.0,
                 add_special_tokens=False,
             )
@@ -500,7 +506,6 @@ def test_nvfp4_tp4(self):
             task.evaluate(llm, sampling_params=sampling_params)
             task = GPQADiamond(self.MODEL_NAME)
             task.evaluate(llm,
-                          sampling_params=sampling_params,
                           extra_evaluator_kwargs=dict(apply_chat_template=True))
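
Taken together, the test changes pin down the evaluation setup: corrected checkpoint paths, a bounded KV cache, explicit max_seq_len and max_batch_size, and a max_tokens cap so generation cannot run unbounded. A minimal end-to-end sketch of that pattern, assuming a TensorRT-LLM install (the checkpoint path is illustrative; the tests resolve theirs via llm_models_root()):

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig

# Reserve half of free GPU memory for the KV cache, as the tests now do.
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)

sampling_params = SamplingParams(
    max_tokens=256,          # bound generated tokens per request
    temperature=0.0,         # greedy decoding for reproducible scores
    add_special_tokens=False,
)

with LLM("/models/Llama-3.3-70B-Instruct-FP4",  # illustrative path
         tensor_parallel_size=4,
         max_seq_len=8192,
         max_batch_size=32,
         kv_cache_config=kv_cache_config) as llm:
    outputs = llm.generate(["Q: 2 + 2 = ?\nA:"], sampling_params)
    print(outputs[0].outputs[0].text)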
