[https://nvbugs/5448525][fix] Mistral Small 3.1 accuracy tests (#6909)

2ez4bz · web-flow · commit c16aff5e3fea · 2025-08-18T11:17:37.000+08:00
This commit lowers the GPU memory allocated for KV cache in accuracy
tests, and adjusts a threshold for Mistral Small 3.1 24B for FP8.

Signed-off-by: William Zhang &lt;133824995+2ez4bz@users.noreply.github.com&gt;
diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -205,7 +205,7 @@ mistralai/Mistral-Small-3.1-24B-Instruct-2503:
   - accuracy: 29.20
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 29.0
+    accuracy: 27.0
 mistralai/Mistral-Nemo-Base-2407:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -711,7 +711,8 @@ class TestMistralSmall24B(LlmapiAccuracyTestHarness):
         ],
     )
     def test_auto_dtype(self, model_path, expected_quant_algo):
-        with LLM(model_path) as llm:
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
+        with LLM(model_path, kv_cache_config=kv_cache_config) as llm:
             assert llm.args.quant_config.quant_algo == expected_quant_algo
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)