Skip to content

Commit 1095b8f

Browse files
committed
adds FP8 agg recipe
1 parent d242f78 commit 1095b8f

File tree

2 files changed

+267
-0
lines changed

2 files changed

+267
-0
lines changed
Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,114 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Engine configuration for the aggregated (prefill+decode in one worker) TRT-LLM
# deployment of Qwen3-235B-A22B-FP8, mounted into the worker at /engine_configs.
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: agg-config
data:
  agg.yaml: |
    backend: pytorch
    trust_remote_code: true
    # 4-way tensor parallel; MoE experts sharded 4-way with no extra MoE TP.
    tensor_parallel_size: 4
    moe_expert_parallel_size: 4
    moe_tensor_parallel_size: 1
    enable_attention_dp: false
    enable_chunked_prefill: true
    build_config:
      max_batch_size: 128
      max_num_tokens: 8192
      max_seq_len: 8192
    kv_cache_config:
      # Block reuse disabled for reproducible benchmarking.
      enable_block_reuse: false
      free_gpu_memory_fraction: 0.8
      dtype: auto
    cache_transceiver_config:
      backend: DEFAULT
    cuda_graph_config:
      enable_padding: true
      # Keep in sync with build_config.max_batch_size above.
      max_batch_size: 128
    disable_overlap_scheduler: false
    print_iter_log: false
---
apiVersion: nvidia.com/v1alpha1
kind: DynamoGraphDeployment
metadata:
  name: qwen3-235b-a22b-agg
spec:
  backendFramework: trtllm
  pvcs:
    # Pre-existing PVC holding the HF model cache; not created by this deployment.
    - name: model-cache
      create: false
  services:
    Frontend:
      componentType: frontend
      dynamoNamespace: qwen3-235b-a22b-agg
      replicas: 1
      extraPodSpec:
        affinity:
          podAntiAffinity:
            # Spread frontend replicas across nodes.
            requiredDuringSchedulingIgnoredDuringExecution:
              - labelSelector:
                  matchExpressions:
                    - key: nvidia.com/dynamo-graph-deployment-name
                      operator: In
                      values:
                        - qwen3-235b-a22b-agg-frontend
                topologyKey: kubernetes.io/hostname
        mainContainer:
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
          command:
            - /bin/sh
            - -c
          args:
            - python3 -m dynamo.frontend --router-mode kv --http-port 8000
    TrtllmWorker:
      componentType: main
      dynamoNamespace: qwen3-235b-a22b-agg
      envFromSecret: hf-token-secret
      sharedMemory:
        size: 256Gi
      extraPodSpec:
        affinity:
          nodeAffinity:
            # Schedule only on GPU nodes.
            requiredDuringSchedulingIgnoredDuringExecution:
              nodeSelectorTerms:
                - matchExpressions:
                    - key: nvidia.com/gpu.present
                      operator: In
                      values:
                        - "true"
        mainContainer:
          env:
            # Snapshot path inside the mounted model-cache PVC.
            - name: MODEL_PATH
              value: /mnt/model-cache/hub/models--Qwen--Qwen3-235B-A22B-FP8/snapshots/39eb2b067ea6b8e3e1dd97d3cd0c7ffeaf3e1a35
            - name: ENGINE_ARGS
              value: /engine_configs/agg.yaml
          command:
            - /bin/sh
            - -c
          args:
            - |
              python3 -m dynamo.trtllm \
                --model-path "${MODEL_PATH}" \
                --served-model-name "Qwen/Qwen3-235B-A22B-FP8" \
                --extra-engine-args "${ENGINE_ARGS}"
          image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:my-tag
          workingDir: /workspace/components/backends/trtllm
          volumeMounts:
            - name: agg-config
              mountPath: /engine_configs
            - name: model-cache
              mountPath: /mnt/model-cache
        volumes:
          - name: agg-config
            configMap:
              name: agg-config
          - name: model-cache
            persistentVolumeClaim:
              claimName: model-cache
      replicas: 1
      resources:
        # 4 GPUs per worker, matching tensor_parallel_size above.
        limits:
          gpu: "4"
        requests:
          gpu: "4"
Lines changed: 153 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,153 @@
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# One-shot benchmark Job: waits for the aggregated deployment to serve the
# target model, then runs an aiperf profile sweep and writes artifacts to the
# shared model-cache PVC under ${ROOT_ARTIFACT_DIR}/<epoch>_<job-name>/.
---
apiVersion: batch/v1
kind: Job
metadata:
  name: qwen3-235b-a22b-bench
spec:
  backoffLimit: 1
  completions: 1
  parallelism: 1
  template:
    metadata:
      labels:
        app: qwen3-235b-a22b-bench
    spec:
      affinity:
        podAntiAffinity:
          # Keep the load generator off the nodes running the deployment under test.
          requiredDuringSchedulingIgnoredDuringExecution:
            - labelSelector:
                matchExpressions:
                  - key: nvidia.com/dynamo-graph-deployment-name
                    operator: In
                    values:
                      - qwen3-235b-a22b-agg
              topologyKey: kubernetes.io/hostname
      containers:
        - command:
            - /bin/sh
            - -c
            - |
              apt-get update && apt-get install -y curl jq procps git && apt-get clean
              pip install git+https://github.com/ai-dynamo/aiperf.git@70af59489df24a601dba57604a7341966150b366
              echo "aiperf installation completed"
              # Widen the ephemeral port range so high concurrency doesn't exhaust ports.
              sysctl -w net.ipv4.ip_local_port_range="1024 65000"
              cat /proc/sys/net/ipv4/ip_local_port_range
              export COLUMNS=200
              EPOCH=$(date +%s)

              ## utility functions -- can be moved to a bash script / configmap

              # Poll /v1/models every 5s until the target model id appears.
              wait_for_model_ready() {
                echo "Waiting for model '$TARGET_MODEL' at $ENDPOINT/v1/models (checking every 5s)..."
                while ! curl -s "http://$ENDPOINT/v1/models" | jq -e --arg model "$TARGET_MODEL" '.data[]? | select(.id == $model)' >/dev/null 2>&1; do
                  echo "[$(date '+%H:%M:%S')] Model not ready yet, sleeping 5s before checking again http://$ENDPOINT/v1/models"
                  sleep 5
                done
                echo "Model '$TARGET_MODEL' is now available!"
                curl -s "http://$ENDPOINT/v1/models" | jq .
              }

              # run_perf <concurrency> <isl> <osl>: one aiperf profile run with
              # fixed-length synthetic prompts; artifacts go to a per-concurrency dir.
              run_perf() {
                local concurrency=$1
                local isl=$2
                local osl=$3
                key=concurrency_${concurrency}
                export ARTIFACT_DIR="${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/${key}"
                mkdir -p "$ARTIFACT_DIR"
                echo "ARTIFACT_DIR: $ARTIFACT_DIR"
                aiperf profile --artifact-dir $ARTIFACT_DIR \
                  --model $TARGET_MODEL \
                  --tokenizer $TARGET_MODEL \
                  --endpoint-type chat \
                  --endpoint /v1/chat/completions \
                  --streaming \
                  --url http://$ENDPOINT \
                  --synthetic-input-tokens-mean $isl \
                  --synthetic-input-tokens-stddev 0 \
                  --output-tokens-mean $osl \
                  --output-tokens-stddev 0 \
                  --extra-inputs "max_tokens:$osl" \
                  --extra-inputs "min_tokens:$osl" \
                  --extra-inputs "ignore_eos:true" \
                  --extra-inputs "{\"nvext\":{\"ignore_eos\":true}}" \
                  --extra-inputs "repetition_penalty:1.0" \
                  --extra-inputs "temperature:0.0" \
                  --concurrency $concurrency \
                  --request-count $((10*concurrency)) \
                  --warmup-request-count $concurrency \
                  --conversation-num 12800 \
                  --random-seed 100 \
                  --workers-max 252 \
                  -H 'Authorization: Bearer NOT USED' \
                  -H 'Accept: text/event-stream' \
                  --record-processors 32 \
                  --ui simple
                echo "ARTIFACT_DIR: $ARTIFACT_DIR"
                ls -la $ARTIFACT_DIR
              }

              #### Actual execution ####
              wait_for_model_ready
              mkdir -p "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}"
              # Calculate total concurrency based on per-GPU concurrency and GPU count
              TOTAL_CONCURRENCY=$((CONCURRENCY_PER_GPU * DEPLOYMENT_GPU_COUNT))
              echo "Calculated total concurrency: $TOTAL_CONCURRENCY (${CONCURRENCY_PER_GPU} per GPU × ${DEPLOYMENT_GPU_COUNT} GPUs)"
              # Write input_config.json recording the sweep parameters next to the artifacts.
              cat > "${ROOT_ARTIFACT_DIR}/${EPOCH}_${JOB_NAME}/input_config.json" <<EOF
              {
                "gpu_count": $DEPLOYMENT_GPU_COUNT,
                "concurrency_per_gpu": $CONCURRENCY_PER_GPU,
                "total_concurrency": $TOTAL_CONCURRENCY,
                "mode": "$DEPLOYMENT_MODE",
                "isl": $ISL,
                "osl": $OSL,
                "endpoint": "$ENDPOINT",
                "model": "$TARGET_MODEL"
              }
              EOF

              # Run perf with calculated total concurrency
              run_perf $TOTAL_CONCURRENCY $ISL $OSL
              echo "done with concurrency $TOTAL_CONCURRENCY"
          env:
            - name: TARGET_MODEL
              value: Qwen/Qwen3-235B-A22B-FP8
            - name: ENDPOINT
              value: qwen3-235b-a22b-agg-frontend:8000
            - name: CONCURRENCY_PER_GPU
              value: "2"
            # Must match the GPU count of the deployment under test
            # (the agg worker requests 4 GPUs with tensor_parallel_size=4).
            - name: DEPLOYMENT_GPU_COUNT
              value: "4"
            - name: ISL
              value: "4000"
            - name: OSL
              value: "500"
            - name: DEPLOYMENT_MODE
              value: agg
            - name: AIPERF_HTTP_CONNECTION_LIMIT
              value: "200"
            - name: JOB_NAME
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: metadata.labels['job-name']
            - name: ROOT_ARTIFACT_DIR
              value: /model-cache/perf
            - name: HF_HOME
              value: /model-cache
            - name: PYTHONUNBUFFERED
              value: "1"
          image: python:3.12-slim
          imagePullPolicy: IfNotPresent
          name: perf
          securityContext:
            # Required for the sysctl call above; drop if the sysctl is removed.
            privileged: true
          volumeMounts:
            - name: model-cache
              mountPath: /model-cache
          workingDir: /workspace
      imagePullSecrets:
        - name: nvcrimagepullsecret
      restartPolicy: Never
      volumes:
        - name: model-cache
          persistentVolumeClaim:
            claimName: model-cache

0 commit comments

Comments
 (0)