
Commit 5c616da

[TRTLLM-5877][infra] Add fmha tests and auto trigger rules (#6050)

Authored by yiqingy0 and chzblych
Signed-off-by: Yiqing Yan <[email protected]>
Co-authored-by: Yanchao Lu <[email protected]>
Parent: 1e0669d

8 files changed (+106 −3 lines)

cpp/kernels/fmha_v2/fmha_test.py

Lines changed: 2 additions & 0 deletions
@@ -61,6 +61,8 @@ def getSMVersion():
 def test_trtllm_flash_attention_fmha(d, s, dtype, flag, tiled_kernel):
     verbose = 0
     sm_version = getSMVersion()
+    if flag == "-use-attention-sinks" and sm_version != 90:
+        pytest.skip("use-attention-sinks is only supported on sm90 currently.")
     if sm_version == 90 and tiled_kernel == "-force-non-tiled":
         pytest.skip(
             "Tiled/non-tiled flags only make a difference to ampere-style kernels."

jenkins/L0_MergeRequest.groovy

Lines changed: 1 addition & 0 deletions
@@ -629,6 +629,7 @@ def getAutoTriggerTagList(pipeline, testFilter, globalVars) {
     }
     def specialFileToTagMap = [
         "tensorrt_llm/_torch/models/modeling_deepseekv3.py": ["-DeepSeek-"],
+        "cpp/kernels/fmha_v2/": ["-FMHA-"],
     ]
     for (file in changedFileList) {
         for (String key : specialFileToTagMap.keySet()) {
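
This map drives the auto-trigger rules named in the commit title: a merge request touching a path that matches a key picks up the corresponding stage tag, so changes under cpp/kernels/fmha_v2/ now auto-trigger the -FMHA- stages. A minimal Python stand-in for the lookup (the matching rule here is an assumption; the authoritative logic is the Groovy loop above):

# Python sketch of the auto-trigger lookup; prefix matching is an
# assumption, since the loop body's comparison is not shown in this hunk.
SPECIAL_FILE_TO_TAG_MAP = {
    "tensorrt_llm/_torch/models/modeling_deepseekv3.py": ["-DeepSeek-"],
    "cpp/kernels/fmha_v2/": ["-FMHA-"],
}

def auto_trigger_tags(changed_files):
    tags = []
    for path in changed_files:
        for key, stage_tags in SPECIAL_FILE_TO_TAG_MAP.items():
            if path.startswith(key):
                tags.extend(stage_tags)
    return sorted(set(tags))

print(auto_trigger_tags(["cpp/kernels/fmha_v2/setup.py"]))  # ['-FMHA-']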

jenkins/L0_Test.groovy

Lines changed: 13 additions & 3 deletions
@@ -1215,8 +1215,12 @@ def getMakoArgsFromStageName(stageName, parseSysinfo=false) {
         // If stageName contains "-Triton-", add "backend=triton" to makoArgs
         // At this point, only tests with backend=triton or unspecified backend will be run
         makoArgs += ["backend=triton"]
+    } else if (stageName.contains("-FMHA-")) {
+        // If stageName contains "-FMHA-", add "backend=fmha" to makoArgs
+        // At this point, only tests with backend=fmha or unspecified backend will be run
+        makoArgs += ["backend=fmha"]
     } else {
-        // If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", or "-Triton-", do not add any backend
+        // If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", or "-FMHA-", do not add any backend
         // At this point, all tests will be run
         // For cases where backend is not specified in makoArgs, we will match all types of backends and tests without specified backend
     }
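
In effect, the stage name now selects the mako backend filter: an -FMHA- stage runs only tests whose test-db entry declares backend: fmha, plus tests with no backend at all. A hedged Python rendering of the branch chain (the Groovy above is authoritative; the check order is as shown in the file):

# Python sketch of the stage-name -> mako backend mapping.
BACKEND_MARKERS = [
    ("-PyTorch-", "pytorch"),
    ("-TensorRT-", "tensorrt"),
    ("-CPP-", "cpp"),
    ("-Triton-", "triton"),
    ("-FMHA-", "fmha"),
]

def mako_backend_args(stage_name: str) -> list:
    for marker, backend in BACKEND_MARKERS:
        if marker in stage_name:
            return [f"backend={backend}"]
    return []  # no backend arg: all tests match

print(mako_backend_args("A10-FMHA-Post-Merge-1"))  # ['backend=fmha']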
@@ -2000,6 +2004,7 @@ def launchTestJobs(pipeline, testFilter)
         "A10-PyTorch-Post-Merge-1": ["a10", "l0_a10", 1, 1],
         "A10-TensorRT-Post-Merge-1": ["a10", "l0_a10", 1, 2],
         "A10-TensorRT-Post-Merge-2": ["a10", "l0_a10", 2, 2],
+        "A10-FMHA-Post-Merge-1": ["a10", "l0_a10", 1, 1],
         "A30-TensorRT-Post-Merge-1": ["a30", "l0_a30", 1, 6],
         "A30-TensorRT-Post-Merge-2": ["a30", "l0_a30", 2, 6],
         "A30-TensorRT-Post-Merge-3": ["a30", "l0_a30", 3, 6],

@@ -2017,18 +2022,21 @@ def launchTestJobs(pipeline, testFilter)
         "A100X-TensorRT-Post-Merge-6": ["a100x", "l0_a100", 6, 6],
         "A100X-Triton-Post-Merge-1": ["a100x", "l0_a100", 1, 2],
         "A100X-Triton-Post-Merge-2": ["a100x", "l0_a100", 2, 2],
+        "A100X-FMHA-Post-Merge-1": ["a100x", "l0_a100", 1, 1],
         "L40S-TensorRT-Post-Merge-1": ["l40s", "l0_l40s", 1, 5],
         "L40S-TensorRT-Post-Merge-2": ["l40s", "l0_l40s", 2, 5],
         "L40S-TensorRT-Post-Merge-3": ["l40s", "l0_l40s", 3, 5],
         "L40S-TensorRT-Post-Merge-4": ["l40s", "l0_l40s", 4, 5],
         "L40S-TensorRT-Post-Merge-5": ["l40s", "l0_l40s", 5, 5],
+        "L40S-FMHA-Post-Merge-1": ["l40s", "l0_l40s", 1, 1],
         "H100_PCIe-PyTorch-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-CPP-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-TensorRT-Post-Merge-1": ["h100-cr", "l0_h100", 1, 5],
         "H100_PCIe-TensorRT-Post-Merge-2": ["h100-cr", "l0_h100", 2, 5],
         "H100_PCIe-TensorRT-Post-Merge-3": ["h100-cr", "l0_h100", 3, 5],
         "H100_PCIe-TensorRT-Post-Merge-4": ["h100-cr", "l0_h100", 4, 5],
         "H100_PCIe-TensorRT-Post-Merge-5": ["h100-cr", "l0_h100", 5, 5],
+        "H100_PCIe-FMHA-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1],
         "B200_PCIe-Triton-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
         "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
         "B200_PCIe-TensorRT-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],

@@ -2422,6 +2430,7 @@ def launchTestJobs(pipeline, testFilter)
         "pytorch": "-PyTorch-",
         "tensorrt": "-TensorRT-",
         "cpp": "-CPP-",
+        "fmha": "-FMHA-",
     ]
     def backendModeList = backendMode.collect { changeMap.get(it) }.flatten()
     def parallelJobsNoBackend = parallelJobsFiltered.findAll { key, _ ->

@@ -2445,8 +2454,9 @@ def launchTestJobs(pipeline, testFilter)
     } else {
         echo "ONLY_ONE_GROUP_CHANGED mode is true. The group is: ${testFilter[(ONLY_ONE_GROUP_CHANGED)]}."
         def excludedBackends = new HashMap()
-        excludedBackends["PyTorch"] = ["-CPP-", "-TensorRT-", "-Triton-"]
-        excludedBackends["Triton"] = ["-PyTorch-", "-CPP-", "-TensorRT-"]
+        excludedBackends["PyTorch"] = ["-CPP-", "-TensorRT-", "-Triton-", "-FMHA-"]
+        excludedBackends["Triton"] = ["-PyTorch-", "-CPP-", "-TensorRT-", "-FMHA-"]
+        excludedBackends["FMHA"] = ["-PyTorch-", "-CPP-", "-TensorRT-", "-Triton-"]
         def group = testFilter[(ONLY_ONE_GROUP_CHANGED)]
         if (excludedBackends.containsKey(group)) {
             parallelJobsFiltered = parallelJobsFiltered.findAll { key, value ->
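
The excludedBackends table is the complementary filter for ONLY_ONE_GROUP_CHANGED mode: when only one backend group changed, stages tagged with any other backend are dropped, so the new FMHA entry makes FMHA-only changes skip PyTorch, CPP, TensorRT, and Triton stages. A small Python sketch of that filtering, assuming stage names carry the tags as in the stage map above:

# Python sketch of the ONLY_ONE_GROUP_CHANGED stage filter.
EXCLUDED_BACKENDS = {
    "PyTorch": ["-CPP-", "-TensorRT-", "-Triton-", "-FMHA-"],
    "Triton": ["-PyTorch-", "-CPP-", "-TensorRT-", "-FMHA-"],
    "FMHA": ["-PyTorch-", "-CPP-", "-TensorRT-", "-Triton-"],
}

def filter_stages(stage_names, group):
    excluded = EXCLUDED_BACKENDS.get(group, [])
    return [s for s in stage_names
            if not any(tag in s for tag in excluded)]

print(filter_stages(
    ["A10-FMHA-Post-Merge-1", "A10-TensorRT-Post-Merge-1"], "FMHA"))
# ['A10-FMHA-Post-Merge-1']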

tests/integration/defs/test_fmha.py (new file)

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+import os
+from functools import partial
+from pathlib import Path
+from subprocess import run
+
+
+def test_fmha():
+    build_run = partial(run, shell=True, check=True)
+
+    current_dir = Path.cwd()
+    project_dir = Path(__file__).parent.resolve().parent.parent.parent
+    fmha_v2_dir = project_dir / "cpp/kernels/fmha_v2"
+
+    try:
+        os.chdir(fmha_v2_dir)
+
+        env = os.environ.copy()
+        env.update({
+            "TORCH_CUDA_ARCH_LIST": "9.0",
+            "ENABLE_SM89_QMMA": "1",
+            "ENABLE_HMMA_FP32": "1",
+            "SCHEDULING_MODE": "1",
+            "ENABLE_SM100": "1",
+            "ENABLE_SM120": "1",
+        })
+
+        build_run(
+            "rm -rf generated temp obj .pytest_cache __pycache__ bin cubin")
+        build_run("python3 setup.py", env=env)
+        build_run("make -j 16", env=env)
+        build_run("pytest fmha_test.py", env=env)
+
+    finally:
+        os.chdir(current_dir)
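
The test rebuilds fmha_v2 from a clean tree, compiles the kernels, and then runs the existing fmha_test.py suite, restoring the caller's working directory even on failure. As a hedged aside (not the committed code): subprocess.run accepts a cwd= argument, so the same flow can avoid the os.chdir/try/finally dance entirely:

# Alternative sketch: pass cwd= to subprocess.run instead of chdir'ing.
from functools import partial
from pathlib import Path
from subprocess import run

def build_and_test(fmha_v2_dir: Path, env: dict) -> None:
    # Every command executes inside fmha_v2_dir; the caller's cwd is untouched.
    build_run = partial(run, shell=True, check=True, cwd=fmha_v2_dir, env=env)
    build_run("python3 setup.py")
    build_run("make -j 16")
    build_run("pytest fmha_test.py")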

tests/integration/test_lists/test-db/l0_a10.yml

Lines changed: 14 additions & 0 deletions
@@ -215,6 +215,20 @@ l0_a10:
   tests:
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*a10*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: fmha
+  tests:
+  - test_fmha.py::test_fmha TIMEOUT (90)
 l0_a10_pybind:
 - condition:
     ranges:
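
Each of the four YAML additions follows the same test-db schema: the entry fires only on single-GPU runners whose GPU name matches the wildcard, on Ubuntu, in post-merge stages whose mako args carry backend=fmha; the TIMEOUT (90) suffix on the A10 entry caps that run's allowed duration at 90 (conventionally minutes in these lists). A hypothetical evaluator for such a condition block, with field semantics read off the YAML (the real matcher lives in the test-db tooling and may differ):

# Hypothetical test-db condition evaluator; field semantics are assumptions.
from fnmatch import fnmatch

def condition_matches(cond: dict, ctx: dict) -> bool:
    rng = cond.get("ranges", {}).get("system_gpu_count", {})
    n = ctx["system_gpu_count"]
    if not (rng.get("gte", n) <= n <= rng.get("lte", n)):
        return False
    wc = cond.get("wildcards", {})
    if not any(fnmatch(ctx["gpu"], pat) for pat in wc.get("gpu", ["*"])):
        return False
    if not fnmatch(ctx["linux_distribution_name"],
                   wc.get("linux_distribution_name", "*")):
        return False
    # Every term must match the execution context exactly.
    return all(ctx.get(k) == v for k, v in cond.get("terms", {}).items())

cond = {"ranges": {"system_gpu_count": {"gte": 1, "lte": 1}},
        "wildcards": {"gpu": ["*a10*"],
                      "linux_distribution_name": "ubuntu*"},
        "terms": {"stage": "post_merge", "backend": "fmha"}}
ctx = {"system_gpu_count": 1, "gpu": "a10g",
       "linux_distribution_name": "ubuntu22.04",
       "stage": "post_merge", "backend": "fmha"}
print(condition_matches(cond, ctx))  # True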

tests/integration/test_lists/test-db/l0_a100.yml

Lines changed: 14 additions & 0 deletions
@@ -107,3 +107,17 @@ l0_a100:
   - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-max_utilization---1-1-1-False-tensorrt_llm_bls]
   - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-guaranteed_no_evict---1-1-1-False-ensemble]
   - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*a100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: fmha
+  tests:
+  - test_fmha.py::test_fmha

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 14 additions & 0 deletions
@@ -356,3 +356,17 @@ l0_h100:
   - examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16]
   - unittest/trt/model/test_gpt_e2e.py # 3 mins / 6 mins on H100
   - unittest/trt/attention/test_gpt_attention_no_cache.py
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: fmha
+  tests:
+  - test_fmha.py::test_fmha

tests/integration/test_lists/test-db/l0_l40s.yml

Lines changed: 14 additions & 0 deletions
@@ -114,3 +114,17 @@ l0_l40s:
   - examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] TIMEOUT (90)
   - examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-v3-8b-instruct-hf]
   - examples/test_qwen.py::test_llm_qwen1_5_7b_single_gpu_lora[qwen1.5_7b_chat-Qwen1.5-7B-Chat-750Mb-lora]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*l40s*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: fmha
+  tests:
+  - test_fmha.py::test_fmha
