
Commit 5c616da

[TRTLLM-5877][infra] Add fmha tests and auto trigger rules (#6050)

Authored by yiqingy0 and chzblych
Signed-off-by: Yiqing Yan <[email protected]>
Co-authored-by: Yanchao Lu <[email protected]>
Parent: 1e0669d

8 files changed (+106 −3 lines)

cpp/kernels/fmha_v2/fmha_test.py

Lines changed: 2 additions & 0 deletions
@@ -61,6 +61,8 @@ def getSMVersion():
 def test_trtllm_flash_attention_fmha(d, s, dtype, flag, tiled_kernel):
     verbose = 0
     sm_version = getSMVersion()
+    if flag == "-use-attention-sinks" and sm_version != 90:
+        pytest.skip("use-attention-sinks is only supported on sm90 currently.")
     if sm_version == 90 and tiled_kernel == "-force-non-tiled":
         pytest.skip(
             "Tiled/non-tiled flags only make a difference to ampere-style kernels."

jenkins/L0_MergeRequest.groovy

Lines changed: 1 addition & 0 deletions
@@ -629,6 +629,7 @@ def getAutoTriggerTagList(pipeline, testFilter, globalVars) {
     }
     def specialFileToTagMap = [
         "tensorrt_llm/_torch/models/modeling_deepseekv3.py": ["-DeepSeek-"],
+        "cpp/kernels/fmha_v2/": ["-FMHA-"],
     ]
     for (file in changedFileList) {
         for (String key : specialFileToTagMap.keySet()) {
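
This map drives the auto-trigger rules named in the commit title: a merge request touching a path that matches a key picks up the corresponding stage tag, so changes under cpp/kernels/fmha_v2/ now auto-trigger the -FMHA- stages. A minimal Python stand-in for the lookup (the matching rule here is an assumption; the authoritative logic is the Groovy loop above):

# Python sketch of the auto-trigger lookup; prefix matching is an
# assumption, since the loop body's comparison is not shown in this hunk.
SPECIAL_FILE_TO_TAG_MAP = {
    "tensorrt_llm/_torch/models/modeling_deepseekv3.py": ["-DeepSeek-"],
    "cpp/kernels/fmha_v2/": ["-FMHA-"],
}

def auto_trigger_tags(changed_files):
    tags = []
    for path in changed_files:
        for key, stage_tags in SPECIAL_FILE_TO_TAG_MAP.items():
            if path.startswith(key):
                tags.extend(stage_tags)
    return sorted(set(tags))

print(auto_trigger_tags(["cpp/kernels/fmha_v2/setup.py"]))  # ['-FMHA-']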

jenkins/L0_Test.groovy

Lines changed: 13 additions & 3 deletions
@@ -1215,8 +1215,12 @@ def getMakoArgsFromStageName(stageName, parseSysinfo=false) {
         // If stageName contains "-Triton-", add "backend=triton" to makoArgs
         // At this point, only tests with backend=triton or unspecified backend will be run
         makoArgs += ["backend=triton"]
+    } else if (stageName.contains("-FMHA-")) {
+        // If stageName contains "-FMHA-", add "backend=fmha" to makoArgs
+        // At this point, only tests with backend=fmha or unspecified backend will be run
+        makoArgs += ["backend=fmha"]
     } else {
-        // If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", or "-Triton-", do not add any backend
+        // If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", or "-FMHA-", do not add any backend
         // At this point, all tests will be run
         // For cases where backend is not specified in makoArgs, we will match all types of backends and tests without specified backend
     }
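
In effect, the stage name now selects the mako backend filter: an -FMHA- stage runs only tests whose test-db entry declares backend: fmha, plus tests with no backend at all. A hedged Python rendering of the branch chain (the Groovy above is authoritative; the check order is as shown in the file):

# Python sketch of the stage-name -> mako backend mapping.
BACKEND_MARKERS = [
    ("-PyTorch-", "pytorch"),
    ("-TensorRT-", "tensorrt"),
    ("-CPP-", "cpp"),
    ("-Triton-", "triton"),
    ("-FMHA-", "fmha"),
]

def mako_backend_args(stage_name: str) -> list:
    for marker, backend in BACKEND_MARKERS:
        if marker in stage_name:
            return [f"backend={backend}"]
    return []  # no backend arg: all tests match

print(mako_backend_args("A10-FMHA-Post-Merge-1"))  # ['backend=fmha']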
@@ -2000,6 +2004,7 @@ def launchTestJobs(pipeline, testFilter)
         "A10-PyTorch-Post-Merge-1": ["a10", "l0_a10", 1, 1],
         "A10-TensorRT-Post-Merge-1": ["a10", "l0_a10", 1, 2],
         "A10-TensorRT-Post-Merge-2": ["a10", "l0_a10", 2, 2],
+        "A10-FMHA-Post-Merge-1": ["a10", "l0_a10", 1, 1],
         "A30-TensorRT-Post-Merge-1": ["a30", "l0_a30", 1, 6],
         "A30-TensorRT-Post-Merge-2": ["a30", "l0_a30", 2, 6],
         "A30-TensorRT-Post-Merge-3": ["a30", "l0_a30", 3, 6],

@@ -2017,18 +2022,21 @@ def launchTestJobs(pipeline, testFilter)
         "A100X-TensorRT-Post-Merge-6": ["a100x", "l0_a100", 6, 6],
         "A100X-Triton-Post-Merge-1": ["a100x", "l0_a100", 1, 2],
         "A100X-Triton-Post-Merge-2": ["a100x", "l0_a100", 2, 2],
+        "A100X-FMHA-Post-Merge-1": ["a100x", "l0_a100", 1, 1],
         "L40S-TensorRT-Post-Merge-1": ["l40s", "l0_l40s", 1, 5],
         "L40S-TensorRT-Post-Merge-2": ["l40s", "l0_l40s", 2, 5],
         "L40S-TensorRT-Post-Merge-3": ["l40s", "l0_l40s", 3, 5],
         "L40S-TensorRT-Post-Merge-4": ["l40s", "l0_l40s", 4, 5],
         "L40S-TensorRT-Post-Merge-5": ["l40s", "l0_l40s", 5, 5],
+        "L40S-FMHA-Post-Merge-1": ["l40s", "l0_l40s", 1, 1],
         "H100_PCIe-PyTorch-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-CPP-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-TensorRT-Post-Merge-1": ["h100-cr", "l0_h100", 1, 5],
         "H100_PCIe-TensorRT-Post-Merge-2": ["h100-cr", "l0_h100", 2, 5],
         "H100_PCIe-TensorRT-Post-Merge-3": ["h100-cr", "l0_h100", 3, 5],
         "H100_PCIe-TensorRT-Post-Merge-4": ["h100-cr", "l0_h100", 4, 5],
         "H100_PCIe-TensorRT-Post-Merge-5": ["h100-cr", "l0_h100", 5, 5],
+        "H100_PCIe-FMHA-Post-Merge-1": ["h100-cr", "l0_h100", 1, 1],
         "B200_PCIe-Triton-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
         "B200_PCIe-PyTorch-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 1],
         "B200_PCIe-TensorRT-Post-Merge-1": ["b100-ts2", "l0_b200", 1, 2],

@@ -2422,6 +2430,7 @@ def launchTestJobs(pipeline, testFilter)
         "pytorch": "-PyTorch-",
         "tensorrt": "-TensorRT-",
         "cpp": "-CPP-",
+        "fmha": "-FMHA-",
     ]
     def backendModeList = backendMode.collect { changeMap.get(it) }.flatten()
     def parallelJobsNoBackend = parallelJobsFiltered.findAll { key, _ ->

@@ -2445,8 +2454,9 @@ def launchTestJobs(pipeline, testFilter)
     } else {
         echo "ONLY_ONE_GROUP_CHANGED mode is true. The group is: ${testFilter[(ONLY_ONE_GROUP_CHANGED)]}."
         def excludedBackends = new HashMap()
-        excludedBackends["PyTorch"] = ["-CPP-", "-TensorRT-", "-Triton-"]
-        excludedBackends["Triton"] = ["-PyTorch-", "-CPP-", "-TensorRT-"]
+        excludedBackends["PyTorch"] = ["-CPP-", "-TensorRT-", "-Triton-", "-FMHA-"]
+        excludedBackends["Triton"] = ["-PyTorch-", "-CPP-", "-TensorRT-", "-FMHA-"]
+        excludedBackends["FMHA"] = ["-PyTorch-", "-CPP-", "-TensorRT-", "-Triton-"]
         def group = testFilter[(ONLY_ONE_GROUP_CHANGED)]
         if (excludedBackends.containsKey(group)) {
             parallelJobsFiltered = parallelJobsFiltered.findAll { key, value ->
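
The excludedBackends table is the complementary filter for ONLY_ONE_GROUP_CHANGED mode: when only one backend group changed, stages tagged with any other backend are dropped, so the new FMHA entry makes FMHA-only changes skip PyTorch, CPP, TensorRT, and Triton stages. A small Python sketch of that filtering, assuming stage names carry the tags as in the stage map above:

# Python sketch of the ONLY_ONE_GROUP_CHANGED stage filter.
EXCLUDED_BACKENDS = {
    "PyTorch": ["-CPP-", "-TensorRT-", "-Triton-", "-FMHA-"],
    "Triton": ["-PyTorch-", "-CPP-", "-TensorRT-", "-FMHA-"],
    "FMHA": ["-PyTorch-", "-CPP-", "-TensorRT-", "-Triton-"],
}

def filter_stages(stage_names, group):
    excluded = EXCLUDED_BACKENDS.get(group, [])
    return [s for s in stage_names
            if not any(tag in s for tag in excluded)]

print(filter_stages(
    ["A10-FMHA-Post-Merge-1", "A10-TensorRT-Post-Merge-1"], "FMHA"))
# ['A10-FMHA-Post-Merge-1']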

tests/integration/defs/test_fmha.py (new file)

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+import os
+from functools import partial
+from pathlib import Path
+from subprocess import run
+
+
+def test_fmha():
+    build_run = partial(run, shell=True, check=True)
+
+    current_dir = Path.cwd()
+    project_dir = Path(__file__).parent.resolve().parent.parent.parent
+    fmha_v2_dir = project_dir / "cpp/kernels/fmha_v2"
+
+    try:
+        os.chdir(fmha_v2_dir)
+
+        env = os.environ.copy()
+        env.update({
+            "TORCH_CUDA_ARCH_LIST": "9.0",
+            "ENABLE_SM89_QMMA": "1",
+            "ENABLE_HMMA_FP32": "1",
+            "SCHEDULING_MODE": "1",
+            "ENABLE_SM100": "1",
+            "ENABLE_SM120": "1",
+        })
+
+        build_run(
+            "rm -rf generated temp obj .pytest_cache __pycache__ bin cubin")
+        build_run("python3 setup.py", env=env)
+        build_run("make -j 16", env=env)
+        build_run("pytest fmha_test.py", env=env)
+
+    finally:
+        os.chdir(current_dir)
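
The test rebuilds fmha_v2 from a clean tree, compiles the kernels, and then runs the existing fmha_test.py suite, restoring the caller's working directory even on failure. As a hedged aside (not the committed code): subprocess.run accepts a cwd= argument, so the same flow can avoid the os.chdir/try/finally dance entirely:

# Alternative sketch: pass cwd= to subprocess.run instead of chdir'ing.
from functools import partial
from pathlib import Path
from subprocess import run

def build_and_test(fmha_v2_dir: Path, env: dict) -> None:
    # Every command executes inside fmha_v2_dir; the caller's cwd is untouched.
    build_run = partial(run, shell=True, check=True, cwd=fmha_v2_dir, env=env)
    build_run("python3 setup.py")
    build_run("make -j 16")
    build_run("pytest fmha_test.py")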

tests/integration/test_lists/test-db/l0_a10.yml

Lines changed: 14 additions & 0 deletions
@@ -215,6 +215,20 @@ l0_a10:
   tests:
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-MAX_UTILIZATION-pytorch-stress-test]
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*a10*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: fmha
+  tests:
+  - test_fmha.py::test_fmha TIMEOUT (90)
 l0_a10_pybind:
 - condition:
     ranges:
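
Each of the four YAML additions follows the same test-db schema: the entry fires only on single-GPU runners whose GPU name matches the wildcard, on Ubuntu, in post-merge stages whose mako args carry backend=fmha; the TIMEOUT (90) suffix on the A10 entry caps that run's allowed duration at 90 (conventionally minutes in these lists). A hypothetical evaluator for such a condition block, with field semantics read off the YAML (the real matcher lives in the test-db tooling and may differ):

# Hypothetical test-db condition evaluator; field semantics are assumptions.
from fnmatch import fnmatch

def condition_matches(cond: dict, ctx: dict) -> bool:
    rng = cond.get("ranges", {}).get("system_gpu_count", {})
    n = ctx["system_gpu_count"]
    if not (rng.get("gte", n) <= n <= rng.get("lte", n)):
        return False
    wc = cond.get("wildcards", {})
    if not any(fnmatch(ctx["gpu"], pat) for pat in wc.get("gpu", ["*"])):
        return False
    if not fnmatch(ctx["linux_distribution_name"],
                   wc.get("linux_distribution_name", "*")):
        return False
    # Every term must match the execution context exactly.
    return all(ctx.get(k) == v for k, v in cond.get("terms", {}).items())

cond = {"ranges": {"system_gpu_count": {"gte": 1, "lte": 1}},
        "wildcards": {"gpu": ["*a10*"],
                      "linux_distribution_name": "ubuntu*"},
        "terms": {"stage": "post_merge", "backend": "fmha"}}
ctx = {"system_gpu_count": 1, "gpu": "a10g",
       "linux_distribution_name": "ubuntu22.04",
       "stage": "post_merge", "backend": "fmha"}
print(condition_matches(cond, ctx))  # True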

tests/integration/test_lists/test-db/l0_a100.yml

Lines changed: 14 additions & 0 deletions
@@ -107,3 +107,17 @@ l0_a100:
   - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-max_utilization---1-1-1-False-tensorrt_llm_bls]
   - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-guaranteed_no_evict---1-1-1-False-ensemble]
   - triton_server/test_triton_llm.py::test_mistral_small_3_1_24b_pixtral[TYPE_FP16-TYPE_BF16-False-1---False-True-False-0-1-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--0.7-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*a100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: fmha
+  tests:
+  - test_fmha.py::test_fmha

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 14 additions & 0 deletions
@@ -356,3 +356,17 @@ l0_h100:
   - examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16]
   - unittest/trt/model/test_gpt_e2e.py # 3 mins / 6 mins on H100
   - unittest/trt/attention/test_gpt_attention_no_cache.py
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: fmha
+  tests:
+  - test_fmha.py::test_fmha

tests/integration/test_lists/test-db/l0_l40s.yml

Lines changed: 14 additions & 0 deletions
@@ -114,3 +114,17 @@ l0_l40s:
   - examples/test_granite.py::test_granite_bf16_lora[granite-3.0-1b-a400m-instruct] TIMEOUT (90)
   - examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-v3-8b-instruct-hf]
   - examples/test_qwen.py::test_llm_qwen1_5_7b_single_gpu_lora[qwen1.5_7b_chat-Qwen1.5-7B-Chat-750Mb-lora]
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*l40s*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: fmha
+  tests:
+  - test_fmha.py::test_fmha
