
Add Acceptance Rate calculation to benchmark_serving #6240
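This change adds a per-request acceptance-rate (AR) metric to the serving benchmark: each request's AR is its number of generated tokens divided by (decode_iteration + 1), where the streaming backends count one decode iteration per chunk that carries text and the non-streaming paths fall back to an estimate equal to the output-token count. The benchmark then reports mean, median, std, and percentile AR. A minimal sketch of the per-request formula, with made-up numbers (the real values come from the loops in backend_request_func.py and from calculate_metrics() below):

# Minimal sketch of the per-request acceptance-rate (AR) formula added in this PR.
# The numbers are illustrative only; the diff also excludes the first (prefill)
# token from the generated-token count, which this sketch skips for brevity.

def request_acceptance_rate(num_generated_tokens: int, decode_iteration: int) -> float:
    """AR = generated tokens / (decode_iteration + 1); 0.0 if nothing was decoded."""
    if decode_iteration < 0:
        return 0.0
    return num_generated_tokens / (decode_iteration + 1)

# Plain autoregressive decoding: one token per iteration -> AR close to 1.
print(request_acceptance_rate(num_generated_tokens=128, decode_iteration=127))  # 1.0

# Speculative decoding: several accepted draft tokens per iteration -> AR > 1.
print(request_acceptance_rate(num_generated_tokens=128, decode_iteration=63))   # 2.0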


Merged (1 commit, Jul 28, 2025)
22 changes: 22 additions & 0 deletions tensorrt_llm/serve/scripts/backend_request_func.py
@@ -44,6 +44,7 @@ class RequestFuncOutput:
tpot: float = 0.0 # avg next-token latencies
prompt_len: int = 0
error: str = ""
decode_iteration: int = 0 # Number of decoding iterations


async def async_request_trt_llm(
@@ -77,6 +78,7 @@ async def async_request_trt_llm(
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
decode_iteration_count = 0 # Track decoding iterations
try:
async with request_session.post(url=api_url, json=payload) as response:
if response.status == 200:
@@ -102,16 +104,22 @@
else:
output.itl.append(timestamp - most_recent_timestamp)

# Increment decode iteration for each chunk
decode_iteration_count += 1
most_recent_timestamp = timestamp

output.latency = most_recent_timestamp - st
output.decode_iteration = decode_iteration_count
else:
content = await response.content.read()
data = json.loads(content.decode())
output.ttft = -1
output.itl = []
output.generated_text = data["text_output"]
output.latency = time.perf_counter() - st
# For non-streaming, estimate decode_iteration as number of output tokens
output.decode_iteration = len(output.generated_text.split(
)) if output.generated_text else 1

else:
output.error = response.reason or ""
@@ -170,6 +178,7 @@ async def async_request_openai_completions(
generated_text = ""
st = time.perf_counter()
most_recent_timestamp = st
decode_iteration_count = 0 # Track decoding iterations
try:
async with request_session.post(url=api_url,
json=payload,
@@ -206,6 +215,9 @@ async def async_request_openai_completions(
output.itl.append(timestamp -
most_recent_timestamp)

# Increment decode iteration for each chunk with text
if text is not None:
decode_iteration_count += 1
most_recent_timestamp = timestamp
generated_text += text or ""
elif usage := data.get("usage"):
@@ -220,6 +232,7 @@
"This response will be marked as failed!")
output.generated_text = generated_text
output.latency = most_recent_timestamp - st
output.decode_iteration = decode_iteration_count
else:
content = await response.content.read()
data = json.loads(content.decode())
@@ -230,6 +243,8 @@
output.ttft = -1
output.itl = []
output.output_tokens = data["usage"]["completion_tokens"]
# For non-streaming, estimate decode_iteration as number of output tokens
output.decode_iteration = output.output_tokens if output.output_tokens > 0 else 1
else:
output.error = response.reason or ""
output.success = False
@@ -306,6 +321,7 @@ async def async_request_openai_chat_completions(
ttft = 0.0
st = time.perf_counter()
most_recent_timestamp = st
decode_iteration_count = 0 # Track decoding iterations
try:
async with request_session.post(url=api_url,
json=payload,
@@ -336,6 +352,9 @@ async def async_request_openai_chat_completions(
output.itl.append(timestamp -
most_recent_timestamp)

# Increment decode iteration for each chunk with content
if content is not None:
decode_iteration_count += 1
generated_text += content or ""
elif usage := data.get("usage"):
output.output_tokens = usage.get(
@@ -345,6 +364,7 @@

output.generated_text = generated_text
output.latency = most_recent_timestamp - st
output.decode_iteration = decode_iteration_count
else:
content = await response.content.read()
data = json.loads(content.decode())
@@ -354,6 +374,8 @@
output.itl = []
output.latency = time.perf_counter() - st
output.ttft = -1
# For non-streaming, estimate decode_iteration as number of output tokens
output.decode_iteration = output.output_tokens if output.output_tokens > 0 else 1

else:
output.error = response.reason or ""
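How decode_iteration is filled in (a reading of the diff above, not wording from the PR): each streaming branch counts one iteration per chunk that actually carries text, while the non-streaming branches estimate it as the number of output tokens, which makes the resulting AR roughly 1.0 there. A rough sketch of the streaming-side counting, with fabricated chunk payloads:

# Rough sketch of the chunk-counting logic in the streaming branches above.
# The chunk payloads are fabricated; real chunks come from the server's SSE
# stream, and one chunk may carry several tokens when speculative decoding
# is enabled.

fake_chunks = [
    {"text": "The quick"},   # two tokens accepted in one iteration
    {"text": " brown fox"},  # two more tokens
    {"text": " jumps"},      # one token
]

decode_iteration_count = 0
generated_text = ""
for chunk in fake_chunks:
    text = chunk.get("text")
    if text is not None:          # mirror the diff: only count chunks with text
        decode_iteration_count += 1
    generated_text += text or ""

num_generated_tokens = len(generated_text.split())   # crude whitespace estimate
ar = num_generated_tokens / (decode_iteration_count + 1)
print(decode_iteration_count, num_generated_tokens, round(ar, 2))  # 3 5 1.25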
57 changes: 48 additions & 9 deletions tensorrt_llm/serve/scripts/benchmark_serving.py
@@ -79,6 +79,11 @@ class BenchmarkMetrics:
std_e2el_ms: float
percentiles_e2el_ms: list[tuple[float, float]]
tput_user: list[float]
# Request acceptance rate (AR) metrics
mean_request_ar: float
median_request_ar: float
std_request_ar: float
percentiles_request_ar: list[tuple[float, float]]


async def get_request(
@@ -131,7 +136,7 @@ def calculate_metrics(
selected_percentile_metrics: list[str],
selected_percentiles: list[float],
goodput_config_dict: dict[str, float],
) -> tuple[BenchmarkMetrics, list[int]]:
) -> tuple[BenchmarkMetrics, list[int], list[float]]:
actual_output_lens: list[int] = []
total_input = 0
completed = 0
Expand All @@ -142,6 +147,7 @@ def calculate_metrics(
ttfts: list[float] = []
e2els: list[float] = []
tput_user: list[float] = []
request_ars: list[float] = []  # Request acceptance rates
for i in range(len(outputs)):
if outputs[i].success:
output_len = outputs[i].output_tokens
@@ -167,9 +173,24 @@
ttfts.append(outputs[i].ttft)
e2els.append(outputs[i].latency)
tput_user.append(output_len / (outputs[i].latency))

# Request acceptance rate: num_generated_tokens / (decode_iteration + 1)
decode_iter = outputs[i].decode_iteration
if decode_iter >= 0:
# Exclude the first (prefill) token from the generated-token
# count when more than one token was produced.
num_generated_tokens = max(0, output_len -
1) if output_len > 1 else output_len
request_ar = num_generated_tokens / (decode_iter + 1)
request_ars.append(request_ar)
else:
request_ars.append(0.0)

completed += 1
else:
actual_output_lens.append(0)
request_ars.append(0.0)

if goodput_config_dict:
valid_metrics = []
@@ -228,8 +249,13 @@ def calculate_metrics(
percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
for p in selected_percentiles],
tput_user=np.mean(tput_user or 0),
mean_request_ar=np.mean(request_ars or 0),
median_request_ar=np.median(request_ars or 0),
std_request_ar=np.std(request_ars or 0),
percentiles_request_ar=[(p, np.percentile(request_ars or 0, p))
for p in selected_percentiles],
)
return metrics, actual_output_lens
return metrics, actual_output_lens, request_ars


async def benchmark(
@@ -403,7 +429,7 @@ async def limited_request_func(request_func_input, streaming, pbar,
# Close the session
await session.close()

metrics, actual_output_lens = calculate_metrics(
metrics, actual_output_lens, request_ars = calculate_metrics(
input_requests=input_requests,
outputs=outputs,
dur_s=benchmark_duration,
@@ -431,6 +457,10 @@ async def limited_request_func(request_func_input, streaming, pbar,
metrics.total_token_throughput))
print("{:<40} {:<10.2f}".format("User throughput (tok/s):",
metrics.tput_user))
print("{:<40} {:<10.4f}".format("Mean Request AR:",
metrics.mean_request_ar))
print("{:<40} {:<10.4f}".format("Median Request AR:",
metrics.median_request_ar))

result = {
"duration": benchmark_duration,
Expand All @@ -443,12 +473,16 @@ async def limited_request_func(request_func_input, streaming, pbar,
"output_throughput": metrics.output_throughput,
"total_token_throughput": metrics.total_token_throughput,
"user_throughput": metrics.tput_user,
"mean_request_ar": metrics.mean_request_ar,
"median_request_ar": metrics.median_request_ar,
"input_lens": [output.prompt_len for output in outputs],
"output_lens": actual_output_lens,
"ttfts": [output.ttft for output in outputs],
"itls": [output.itl for output in outputs],
"generated_texts": [output.generated_text for output in outputs],
"errors": [output.error for output in outputs],
"request_ars": request_ars,
"decode_iterations": [output.decode_iteration for output in outputs],
}

def process_one_metric(
@@ -534,11 +568,15 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
metrics = [
"median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
"mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
"median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
"median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms",
"mean_request_ar", "median_request_ar", "std_request_ar"
]
# These raw data might be useful, but they are rather big. They can be added
# later if needed
ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
ignored_metrics = [
"ttfts", "itls", "generated_texts", "errors", "request_ars",
"decode_iterations"
]
pt_records = convert_to_pytorch_benchmark_format(
args=args,
metrics={k: [results[k]]
@@ -762,7 +800,8 @@ def main(args: argparse.Namespace):
# Remove fields with too many data points
for field in [
"input_lens", "output_lens", "ttfts", "itls",
"generated_texts", "errors"
"generated_texts", "errors", "request_ars",
"decode_iterations"
]:
if field in result_json:
del result_json[field]
@@ -963,11 +1002,11 @@ def main(args: argparse.Namespace):
parser.add_argument(
"--percentile-metrics",
type=str,
default="ttft,tpot,itl",
default="ttft,tpot,itl,request_ar",
help="Comma-separated list of selected metrics to report percentils. "
"This argument specifies the metrics to report percentiles. "
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
"Default value is \"ttft,tpot,itl\".")
"Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\", \"request_ar\". "
"Default value is \"ttft,tpot,itl,request_ar\".")
parser.add_argument(
"--metric-percentiles",
type=str,
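One possible way to consume the new aggregate values afterwards (a hedged sketch: the output file name is an assumption, and the per-request request_ars / decode_iterations lists are stripped from the saved JSON as shown above, so only the aggregates remain):

# Hypothetical post-processing sketch: read the aggregate AR values that
# benchmark_serving.py writes into its result JSON. The file name below is an
# assumption; only the aggregate keys are kept in the saved file.
import json

with open("benchmark_result.json") as f:   # hypothetical output path
    result = json.load(f)

print(f"Mean Request AR:   {result['mean_request_ar']:.4f}")
print(f"Median Request AR: {result['median_request_ar']:.4f}")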