Commit ea2f804

Merge branch 'main' into 2-model-perf

2 parents bbad89a + 15ec2b8
File tree

5 files changed: +11, -1 lines

tests/integration/test_lists/qa/llm_function_l20.txt

Lines changed: 0 additions & 1 deletion
@@ -19,7 +19,6 @@ accuracy/test_llm_api.py::TestMistralNemo12B::test_fp8
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=True-overlap_scheduler=True]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False]
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
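
These QA lists are plain text, one pytest node ID per line; the hunk above removes test_fp8_beam_search from the L20 function list. As a minimal sketch of how such a list can be fed to pytest (an assumed stand-alone runner, not the repo's actual test driver):

import sys

import pytest


def run_test_list(list_path: str) -> int:
    """Run every pytest node ID listed in list_path, one per line."""
    with open(list_path) as f:
        node_ids = [line.strip() for line in f if line.strip()]
    # pytest.main accepts node IDs exactly as they would appear on the CLI.
    return pytest.main(node_ids)


if __name__ == "__main__":
    sys.exit(run_test_list(sys.argv[1]))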

tests/integration/test_lists/waives.txt

Lines changed: 3 additions & 0 deletions
@@ -346,3 +346,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=False-
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8[fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5485102)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5485109)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5485116)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus_online_eplb[mtp_nextn=2] SKIP (https://nvbugs/5444687)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_eplb[fp8kv=True] SKIP (https://nvbugs/5444687)
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=True-attn_backend=FLASHINFER-torch_compile=True] SKIP (https://nvbugs/5488580)
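
Each waive entry pairs a pytest node ID with a SKIP marker and the tracking bug URL. A minimal parsing sketch (format inferred from the entries above; node IDs contain no whitespace):

import re

WAIVE_RE = re.compile(r"^(?P<test>\S+)\s+SKIP\s+\((?P<bug>[^)]+)\)$")


def parse_waive(line: str):
    """Return (test_id, bug_url) for a waive entry, or None if it does not match."""
    match = WAIVE_RE.match(line.strip())
    return (match.group("test"), match.group("bug")) if match else None


print(parse_waive(
    "accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::"
    "test_nvfp4_4gpus_online_eplb[fp8kv=True] SKIP (https://nvbugs/5444687)"))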

tests/unittest/llmapi/test_executor.py

Lines changed: 2 additions & 0 deletions
@@ -78,6 +78,7 @@ def llama_7b_tp2_path(engine_path: Path) -> Path:
     return path


+@pytest.mark.skip(reason="https://nvbugs/5488280")
 @pytest.mark.skipif(WORLD_SIZE != 1, reason="Must run on single MPI rank")
 def test_generation_bs2(llama_7b_bs2_path: Path):
     tokenizer = TransformersTokenizer.from_pretrained(llama_7b_bs2_path)
@@ -99,6 +100,7 @@ def test_generation_bs2(llama_7b_bs2_path: Path):
                      'E F G H I K L M')


+@pytest.mark.skip(reason="https://nvbugs/5488280")
 @pytest.mark.skipif(WORLD_SIZE != 1, reason="Must run on single MPI rank")
 def test_sync_generation(llama_7b_path: Path):
     tokenizer = TransformersTokenizer.from_pretrained(llama_7b_path)
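
Both hunks stack an unconditional skip on top of the existing conditional one; pytest evaluates the markers independently, so the unconditional skip wins. A self-contained illustration (WORLD_SIZE is a stand-in value here, not the module's real constant):

import pytest

WORLD_SIZE = 1  # stand-in; the real module derives this from the MPI world size


@pytest.mark.skip(reason="https://nvbugs/5488280")  # always skips
@pytest.mark.skipif(WORLD_SIZE != 1, reason="Must run on single MPI rank")
def test_generation_stub():
    assert True  # never reached while the unconditional skip marker is present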

tests/unittest/trt/model_api/test_model_level_api.py

Lines changed: 3 additions & 0 deletions
@@ -3,6 +3,7 @@
 import tempfile
 from contextlib import contextmanager

+import pytest
 from profile_utils import profile
 from transformers import AutoTokenizer
 from utils.llm_data import llm_models_root
@@ -42,6 +43,7 @@ def workspace(suffix, prefix="./trtllm_workspace"):
 # 233s on ipp1-1197: loading weights 37s, network/engine 27s, save engine: 35s, load engine (14GB) about 100s
 @profile("save-and-load")
 @force_ampere
+@pytest.mark.skip(reason="https://nvbugs/5488280")
 def test_save_load():
     '''When the engine_dir parameter of to_trt and generate is not None
     to_trt() saves the engine to disk.
@@ -102,6 +104,7 @@ def test_high_level_fake_weights():


 @force_ampere
+@pytest.mark.skip(reason="https://nvbugs/5488280")
 def test_async_io():
     max_batch_size, max_isl, max_osl = 8, 256, 256
     hf_model_dir = str(llm_models_root() / "llama-models/llama-7b-hf")

tests/unittest/trt/model_api/test_model_quantization.py

Lines changed: 3 additions & 0 deletions
@@ -1,5 +1,6 @@
 import tempfile

+import pytest
 from transformers import AutoTokenizer
 from utils.llm_data import llm_models_root
 from utils.util import force_ampere, skip_no_modelopt, skip_pre_ada
@@ -20,6 +21,7 @@
 ]


+@pytest.mark.skip(reason="https://nvbugs/5488280")
 @force_ampere
 @skip_no_modelopt
 def test_int4_awq_quantization():
@@ -63,6 +65,7 @@ def test_int4_awq_quantization():
 # TODO: TRTLLM-185, check the score when the test infra is ready, hard coded value is not stable, cause flaky tests in L0


+@pytest.mark.skip(reason="https://nvbugs/5488280")
 @skip_pre_ada
 @skip_no_modelopt
 def test_fp8_quantization():
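
When every test in a module is waived under the same bug, a module-level mark is a more compact alternative to per-test decorators (standard pytest usage, shown as a sketch; not what this commit does):

import pytest

# Applying the mark once at module scope skips every test in the file.
pytestmark = pytest.mark.skip(reason="https://nvbugs/5488280")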
