
Commit d8acca4

[TRTLLM-6675][infra] Cherry-pick #6623 (#6735)

Signed-off-by: Bo Deng <[email protected]>

1 parent 4200fa4 commit d8acca4

7 files changed: +315 -3 lines

tensorrt_llm/serve/scripts/benchmark_serving.py

Lines changed: 1 addition & 1 deletion

@@ -591,7 +591,7 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={k: [results[k]]
-                 for k in metrics},
+                 for k in metrics if k in results},
         extra_info={
             k: results[k]
             for k in results if k not in metrics and k not in ignored_metrics
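
This guard matters when `metrics` requests a metric the run never recorded (for example, when `--percentile-metrics` selects only a subset); indexing `results[k]` unconditionally raises KeyError. A minimal sketch of the before/after behavior, with illustrative metric names not taken from the commit:

    results = {"median_ttft_ms": 41.3}                # run recorded one metric
    metrics = ["median_ttft_ms", "median_e2el_ms"]    # two were requested

    # Old comprehension: {k: [results[k]] for k in metrics}
    # -> KeyError: 'median_e2el_ms'

    # Patched comprehension silently skips metrics absent from results:
    safe = {k: [results[k]] for k in metrics if k in results}
    assert safe == {"median_ttft_ms": [41.3]}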

tests/integration/defs/accuracy/test_disaggregated_serving.py

Lines changed: 68 additions & 0 deletions

@@ -570,6 +570,40 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"

+    def test_nixl_backend(self):
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "port": 8000,
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8001"]
+            },
+            "generation_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8002"]
+            }
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_device(8)
     @parametrize_with_ids("overlap_scheduler", [True, False])
     @parametrize_with_ids("mtp_nextn",

@@ -672,6 +706,40 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen3/Qwen3-8B"
     MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8"

+    def test_nixl_backend(self):
+        ctx_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        gen_server_config = {
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": "nixl"
+            }
+        }
+        disaggregated_server_config = {
+            "hostname": "localhost",
+            "port": 8000,
+            "backend": "pytorch",
+            "context_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8001"]
+            },
+            "generation_servers": {
+                "num_instances": 1,
+                "urls": ["localhost:8002"]
+            }
+        }
+        with launch_disaggregated_llm(disaggregated_server_config,
+                                      ctx_server_config, gen_server_config,
+                                      self.MODEL_PATH) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.parametrize("overlap_scheduler", [False, True])
     def test_auto_dtype(self, overlap_scheduler):
         ctx_server_config = {
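
The `test_nixl_backend` bodies above are identical across the two classes; each server dict is ultimately serialized to the YAML config consumed by `trtllm-serve disaggregated -c`, as the benchmark test in the next file does explicitly. A minimal sketch of that serialization step, assuming PyYAML; the helper name `write_disagg_config` is hypothetical:

    import yaml

    def write_disagg_config(config: dict, path: str) -> None:
        # Dump a disaggregated-server dict to YAML for
        # `trtllm-serve disaggregated -c <path>`.
        with open(path, "w", encoding="utf-8") as f:
            yaml.dump(config, f)

    write_disagg_config(
        {
            "hostname": "localhost",
            "port": 8000,
            "backend": "pytorch",
            "context_servers": {"num_instances": 1, "urls": ["localhost:8001"]},
            "generation_servers": {"num_instances": 1, "urls": ["localhost:8002"]},
        },
        "disagg_config.yaml",
    )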

tests/integration/defs/disaggregated/test_disaggregated.py

Lines changed: 229 additions & 2 deletions

@@ -14,11 +14,14 @@
 # limitations under the License.

 import os
+import re
 import subprocess
+import tempfile

 import pytest
-from defs.conftest import skip_arm, skip_no_hopper
-from defs.trt_test_alternative import check_call, popen
+import yaml
+from defs.conftest import llm_models_root, skip_arm, skip_no_hopper
+from defs.trt_test_alternative import check_call, check_output, popen

 from tensorrt_llm.logger import logger

@@ -1051,3 +1054,227 @@ def test_disaggregated_deepseek_v3_lite_fp8_tp1_two_mtp(
         "deepseek_v3_lite_fp8_tp1_two_mtp",
         env=llm_venv._new_env,
         cwd=llm_venv.get_working_directory())
+
+
+@pytest.fixture(scope="module")
+def benchmark_root():
+    llm_root = os.getenv("LLM_ROOT")
+    return os.path.join(llm_root, "tensorrt_llm", "serve", "scripts")
+
+
+@pytest.fixture(scope="module")
+def shared_gpt_path():
+    DEFAULT_LLM_MODEL_ROOT = os.path.join("/scratch.trt_llm_data", "llm-models")
+    LLM_MODELS_ROOT = os.environ.get("LLM_MODELS_ROOT", DEFAULT_LLM_MODEL_ROOT)
+    return os.path.join(LLM_MODELS_ROOT, "datasets",
+                        "ShareGPT_V3_unfiltered_cleaned_split.json")
+
+
+@pytest.fixture(scope="function")
+def benchmark_model_root(request):
+    models_root = llm_models_root()
+    if request.param == "DeepSeek-V3-Lite-fp8":
+        model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "fp8")
+    elif request.param == "DeepSeek-V3-Lite-bf16":
+        model_path = os.path.join(models_root, "DeepSeek-V3-Lite", "bf16")
+    elif request.param == "llama-v3-8b-hf":
+        model_path = os.path.join(models_root, "llama-models-v3", "8B")
+    elif request.param == "llama-3.1-8b-instruct-hf-fp8":
+        model_path = os.path.join(models_root, "llama-3.1-model",
+                                  "Llama-3.1-8B-Instruct-FP8")
+    else:
+        raise ValueError(f"Failed to find the model: {request.param}")
+    return model_path
+
+
+def run_disaggregated_benchmark(example_dir,
+                                config_file,
+                                benchmark_root,
+                                benchmark_model_root,
+                                shared_gpt_path,
+                                env=None,
+                                cwd=None):
+    """Run the disaggregated benchmark with the given configuration."""
+    run_env = env.copy()
+    run_env["UCX_TLS"] = "^ib"
+    num_rank = 2
+    workers_cmd = [
+        'mpirun', '--allow-run-as-root', '--oversubscribe', '-n',
+        str(num_rank), 'trtllm-serve', 'disaggregated_mpi_worker', '-c',
+        config_file
+    ]
+
+    server_start_timeout = 900
+    server_cmd = [
+        'trtllm-serve', 'disaggregated', '--server_start_timeout',
+        str(server_start_timeout), '-c', config_file
+    ]
+    try:
+        with (  # Start workers
+                open('output_workers.log', 'w') as output_workers,
+                popen(workers_cmd,
+                      stdout=output_workers,
+                      stderr=subprocess.STDOUT,
+                      env=run_env,
+                      cwd=cwd) as workers_proc,
+                # Start server
+                open('output_disagg.log', 'w') as output_disagg,
+                popen(server_cmd,
+                      stdout=output_disagg,
+                      stderr=subprocess.STDOUT,
+                      env=run_env,
+                      cwd=cwd) as server_proc):
+            # Ensure the server has started
+            client_dir = f"{example_dir}/clients"
+            client_cmd = [
+                'python3', f'{client_dir}/disagg_client.py', '-c',
+                f'{example_dir}/disagg_config.yaml', '-p',
+                f'{client_dir}/prompts.json', '--ignore-eos',
+                '--server-start-timeout',
+                str(server_start_timeout)
+            ]
+            # Warm up
+            check_call(client_cmd,
+                       env=env,
+                       poll_procs=[workers_proc, server_proc])
+            # Start benchmark
+            benchmark_script = os.path.join(benchmark_root,
+                                            "benchmark_serving.py")
+            benchmark_cmd = [
+                'python3',
+                benchmark_script,
+                '--model',
+                benchmark_model_root,
+                '--tokenizer',
+                benchmark_model_root,
+                '--dataset-name',
+                'random',
+                '--dataset-path',
+                shared_gpt_path,
+                '--random-input-len',
+                '256',
+                '--random-output-len',
+                '64',
+                '--random-prefix-len',
+                '0',
+                '--num-prompts',
+                '320',
+                '--max-concurrency',
+                '32',
+                '--host',
+                'localhost',
+                '--port',
+                '8000',
+                '--ignore-eos',
+                '--no-test-input',
+                '--percentile-metrics',
+                'e2el,ttft',
+            ]
+            # Warm-up run; the second invocation below is the measured one
+            check_call(benchmark_cmd, env=env)
+            output = check_output(benchmark_cmd, env=env)
+            e2el_pattern = r"Median E2EL \(ms\):\s*(\d+\.?\d*)"
+            ttft_pattern = r"Median TTFT \(ms\):\s*(\d+\.?\d*)"
+            e2el_match = re.search(e2el_pattern, output)
+            ttft_match = re.search(ttft_pattern, output)
+            if e2el_match and ttft_match:
+                median_e2el = float(e2el_match.group(1))
+                median_ttft = float(ttft_match.group(1))
+                return median_e2el, median_ttft
+            else:
+                raise ValueError("No benchmark result found")
+
+    except Exception:
+        # Print outputs on error
+        logger.error("-------- Workers output --------")
+        with open('output_workers.log', 'r') as f:
+            logger.error(f.read())
+
+        logger.error("-------- Disagg server output --------")
+        with open('output_disagg.log', 'r') as f:
+            logger.error(f.read())
+        raise
+    finally:
+        server_proc.terminate()
+        workers_proc.terminate()
+        server_proc.wait()
+        workers_proc.wait()
+
+
+def get_config_for_benchmark(model_root, backend):
+    serve_config = {
+        "model": model_root,
+        "hostname": "localhost",
+        "port": 8000,
+        "backend": "pytorch",
+        "context_servers": {
+            "num_instances": 1,
+            "max_batch_size": 2,
+            "max_num_tokens": 384,
+            "max_seq_len": 384,
+            "tensor_parallel_size": 1,
+            "pipeline_parallel_size": 1,
+            "disable_overlap_scheduler": True,
+            "cache_transceiver_config": {
+                "backend": backend,
+                "max_tokens_in_buffer": 512,
+            },
+            "urls": ["localhost:8001"]
+        },
+        "generation_servers": {
+            "num_instances": 1,
+            "tensor_parallel_size": 1,
+            "pipeline_parallel_size": 1,
+            "max_batch_size": 2,
+            "max_num_tokens": 384,
+            "max_seq_len": 384,
+            "cache_transceiver_config": {
+                "backend": backend,
+                "max_tokens_in_buffer": 512,
+            },
+            "urls": ["localhost:8002"]
+        }
+    }
+    return serve_config
+
+
+@pytest.mark.parametrize("benchmark_model_root", [
+    'DeepSeek-V3-Lite-fp8', 'DeepSeek-V3-Lite-bf16', 'llama-v3-8b-hf',
+    'llama-3.1-8b-instruct-hf-fp8'
+],
+                         indirect=True)
+def test_disaggregated_benchmark_on_diff_backends(
+        disaggregated_test_root, disaggregated_example_root, llm_venv,
+        benchmark_model_root, benchmark_root, shared_gpt_path):
+    nixl_config = get_config_for_benchmark(benchmark_model_root, "nixl")
+    ucx_config = get_config_for_benchmark(benchmark_model_root, "ucx")
+    temp_dir = tempfile.TemporaryDirectory()
+    nixl_config_path = os.path.join(temp_dir.name, "nixl_config.yaml")
+    ucx_config_path = os.path.join(temp_dir.name, "ucx_config.yaml")
+    with open(nixl_config_path, 'w', encoding='utf-8') as f:
+        yaml.dump(nixl_config, f)
+    with open(ucx_config_path, 'w', encoding='utf-8') as f:
+        yaml.dump(ucx_config, f)
+
+    env = llm_venv._new_env.copy()
+    nixl_e2el, nixl_ttft = run_disaggregated_benchmark(
+        disaggregated_example_root,
+        nixl_config_path,
+        benchmark_root,
+        benchmark_model_root,
+        shared_gpt_path,
+        env=env,
+        cwd=llm_venv.get_working_directory())
+    ucx_e2el, ucx_ttft = run_disaggregated_benchmark(
+        disaggregated_example_root,
+        ucx_config_path,
+        benchmark_root,
+        benchmark_model_root,
+        shared_gpt_path,
+        env=env,
+        cwd=llm_venv.get_working_directory())
+    print(f"Nixl E2EL: {nixl_e2el} ms, UCX E2EL: {ucx_e2el} ms")
+    print(f"Nixl TTFT: {nixl_ttft} ms, UCX TTFT: {ucx_ttft} ms")
+
+    # NIXL must be within 5% of UCX on median E2E latency and TTFT
+    assert ucx_e2el > 0 and nixl_e2el > 0 and nixl_e2el < 1.05 * ucx_e2el
+    assert ucx_ttft > 0 and nixl_ttft > 0 and nixl_ttft < 1.05 * ucx_ttft
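
`run_disaggregated_benchmark` scrapes the two medians from the benchmark's stdout with the regexes above. A quick self-contained check against a made-up output fragment (the sample text is illustrative, not captured benchmark output):

    import re

    sample = "Median TTFT (ms):   41.27\nMedian E2EL (ms):   512.90\n"

    e2el = re.search(r"Median E2EL \(ms\):\s*(\d+\.?\d*)", sample)
    ttft = re.search(r"Median TTFT \(ms\):\s*(\d+\.?\d*)", sample)
    assert e2el is not None and ttft is not None
    print(float(e2el.group(1)), float(ttft.group(1)))  # 512.9 41.27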

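`indirect=True` in `test_disaggregated_benchmark_on_diff_backends` routes each parametrized string through the `benchmark_model_root` fixture via `request.param`, rather than passing it straight to the test; that is why the fixture, not the test, maps short model labels to filesystem paths. A minimal sketch of the pattern with hypothetical names:

    import pytest

    @pytest.fixture
    def model_root(request):
        # With indirect=True, request.param carries the parametrized value.
        return f"/models/{request.param}"

    @pytest.mark.parametrize("model_root", ["llama-v3-8b-hf"], indirect=True)
    def test_fixture_resolves_param(model_root):
        assert model_root == "/models/llama-v3-8b-hf"
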
tests/integration/test_lists/qa/llm_function_full.txt

Lines changed: 2 additions & 0 deletions

@@ -573,6 +573,8 @@ accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestEXAONE4::test_auto_dtype
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend

 test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
 test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]

tests/integration/test_lists/qa/llm_function_sanity.txt

Lines changed: 2 additions & 0 deletions

@@ -110,6 +110,8 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency]
+accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
+accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
 disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
 disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8]

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 7 additions & 0 deletions

@@ -79,3 +79,10 @@ l0_dgx_b200:
 - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRTLLM]
 - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-TRITON]
 - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4]
+- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
+- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
+- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
+- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
+- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8]
+- accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
+- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 6 additions & 0 deletions

@@ -52,6 +52,8 @@ l0_dgx_h100:
 - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[GSM8K-gen_tp=2-ctx_pp=2]
 - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2]
 - accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2]
+- accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_nixl_backend
+- accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend
 - test_e2e.py::test_ptp_quickstart_advanced_bs1
 - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8]
 - unittest/_torch/modeling/test_modeling_pixtral.py::test_tensor_parallelism

@@ -114,6 +116,10 @@ l0_dgx_h100:
 - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_mpi[DeepSeek-V3-Lite-fp8]
 - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
 - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
+- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
+- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
+- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
+- disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-fp8]
 - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8]
 - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_overlap[DeepSeek-V3-Lite-fp8]
 - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8]
