
Commit 2573bb7

feat: Add Phi-4-Mini-Instruct in PyTorch backend for LLM API accuracy tests (#6303)
Signed-off-by: moraxu <[email protected]>
1 parent 738ab61 commit 2573bb7

File tree

5 files changed: +6 lines, -7 lines

tests/integration/defs/accuracy/references/cnn_dailymail.yaml

Lines changed: 2 additions & 0 deletions
@@ -40,6 +40,8 @@ microsoft/Phi-3-small-128k-instruct:
   - accuracy: 27.208
 microsoft/Phi-3.5-mini-instruct:
   - accuracy: 31.354
+microsoft/Phi-4-mini-instruct:
+  - accuracy: 32.921
 state-spaces/mamba-130m-hf:
   - accuracy: 19.470
 lmsys/vicuna-7b-v1.3:

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 2 additions & 0 deletions
@@ -126,3 +126,5 @@ mistralai/Mistral-Small-3.1-24B-Instruct-2503:
   - accuracy: 89.23
 microsoft/Phi-4-multimodal-instruct:
   - accuracy: 81.19
+microsoft/Phi-4-mini-instruct:
+  - accuracy: 82.30

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 0 additions & 7 deletions
@@ -1920,10 +1920,6 @@ class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-4-mini-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct"
 
-    @pytest.mark.skip(
-        reason=
-        "Temporarily skipping test_auto_dtype while resolving Phi-4's architecture issue."
-    )
     def test_auto_dtype(self):
         with LLM(self.MODEL_PATH) as llm:
             task = CnnDailymail(self.MODEL_NAME)
@@ -1932,9 +1928,6 @@ def test_auto_dtype(self):
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
-            task = GPQADiamond(self.MODEL_NAME)
-            task.evaluate(llm,
-                          extra_evaluator_kwargs=dict(apply_chat_template=True))
 
 
 class TestKanana_Instruct(LlmapiAccuracyTestHarness):
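
For context, the test class reads roughly as follows after this change. This is a sketch reconstructed from the two hunks above, not the verbatim file: the imports, the LlmapiAccuracyTestHarness base class, LLM, llm_models_root(), and the CnnDailymail/GSM8K task classes come from the existing test module, and a couple of unchanged lines that fall between the two hunks are not visible in this diff and are omitted here.

class TestPhi4MiniInstruct(LlmapiAccuracyTestHarness):
    MODEL_NAME = "microsoft/Phi-4-mini-instruct"
    MODEL_PATH = f"{llm_models_root()}/Phi-4-mini-instruct"

    def test_auto_dtype(self):
        # The skip marker and the GPQADiamond evaluation are removed; the test
        # now exercises CNN/DailyMail and GSM8K and compares against the
        # reference accuracies added in this commit (32.921 and 82.30).
        with LLM(self.MODEL_PATH) as llm:
            task = CnnDailymail(self.MODEL_NAME)
            task.evaluate(llm)
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)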

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 1 addition & 0 deletions
@@ -495,6 +495,7 @@ accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 
 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
 test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]

tests/integration/test_lists/qa/llm_sanity_test.txt

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[laten
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
+accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8]
