Commit 34cb737

Revert "fix: Fix missing key (NVIDIA#6471)"
This reverts commit 48768fd. Revert "Add Acceptance Rate calculation to benchmark_serving (NVIDIA#6240)" This reverts commit c9b8b61. Signed-off-by: Zero Zeng <[email protected]>
1 parent 1ce2354 commit 34cb737

File tree

  tensorrt_llm/serve/scripts/backend_request_func.py
  tensorrt_llm/serve/scripts/benchmark_serving.py

2 files changed: +9 −71 lines changed

tensorrt_llm/serve/scripts/backend_request_func.py

Lines changed: 0 additions & 22 deletions

@@ -45,7 +45,6 @@ class RequestFuncOutput:
     tpot: float = 0.0  # avg next-token latencies
     prompt_len: int = 0
     error: str = ""
-    decode_iteration: int = 0  # Number of decoding iterations


 async def async_request_trt_llm(
@@ -82,7 +81,6 @@ async def async_request_trt_llm(
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
-    decode_iteration_count = 0  # Track decoding iterations
     try:
         async with request_session.post(url=api_url, json=payload) as response:
             if response.status == 200:
@@ -108,22 +106,16 @@ async def async_request_trt_llm(
                     else:
                         output.itl.append(timestamp - most_recent_timestamp)

-                        # Increment decode iteration for each chunk
-                        decode_iteration_count += 1
                     most_recent_timestamp = timestamp

                 output.latency = most_recent_timestamp - st
-                output.decode_iteration = decode_iteration_count
             else:
                 content = await response.content.read()
                 data = json.loads(content.decode())
                 output.ttft = -1
                 output.itl = []
                 output.generated_text = data["text_output"]
                 output.latency = time.perf_counter() - st
-                # For non-streaming, estimate decode_iteration as number of output tokens
-                output.decode_iteration = len(output.generated_text.split(
-                )) if output.generated_text else 1

         else:
             output.error = response.reason or ""
@@ -183,7 +175,6 @@ async def async_request_openai_completions(
     generated_text = ""
     st = time.perf_counter()
     most_recent_timestamp = st
-    decode_iteration_count = 0  # Track decoding iterations
     try:
         async with request_session.post(url=api_url,
                                         json=payload,
@@ -220,9 +211,6 @@ async def async_request_openai_completions(
                             output.itl.append(timestamp -
                                               most_recent_timestamp)

-                            # Increment decode iteration for each chunk with text
-                            if text is not None:
-                                decode_iteration_count += 1
                         most_recent_timestamp = timestamp
                         generated_text += text or ""
                     elif usage := data.get("usage"):
@@ -237,7 +225,6 @@ async def async_request_openai_completions(
                         "This response will be marked as failed!")
             output.generated_text = generated_text
             output.latency = most_recent_timestamp - st
-            output.decode_iteration = decode_iteration_count
         else:
             content = await response.content.read()
             data = json.loads(content.decode())
@@ -248,8 +235,6 @@ async def async_request_openai_completions(
             output.ttft = -1
             output.itl = []
             output.output_tokens = data["usage"]["completion_tokens"]
-            # For non-streaming, estimate decode_iteration as number of output tokens
-            output.decode_iteration = output.output_tokens if output.output_tokens > 0 else 1
         else:
             output.error = response.reason or ""
             output.success = False
@@ -322,7 +307,6 @@ async def async_request_openai_chat_completions(
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
-    decode_iteration_count = 0  # Track decoding iterations
     try:
         async with request_session.post(url=api_url,
                                         json=payload,
@@ -353,9 +337,6 @@ async def async_request_openai_chat_completions(
                             output.itl.append(timestamp -
                                               most_recent_timestamp)

-                            # Increment decode iteration for each chunk with content
-                            if content is not None:
-                                decode_iteration_count += 1
                         generated_text += content or ""
                     elif usage := data.get("usage"):
                         output.output_tokens = usage.get(
@@ -365,7 +346,6 @@ async def async_request_openai_chat_completions(

             output.generated_text = generated_text
             output.latency = most_recent_timestamp - st
-            output.decode_iteration = decode_iteration_count
         else:
             content = await response.content.read()
             data = json.loads(content.decode())
@@ -375,8 +355,6 @@ async def async_request_openai_chat_completions(
             output.itl = []
             output.latency = time.perf_counter() - st
             output.ttft = -1
-            # For non-streaming, estimate decode_iteration as number of output tokens
-            output.decode_iteration = output.output_tokens if output.output_tokens > 0 else 1

         else:
             output.error = response.reason or ""
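
For context on what this revert strips out of the streaming paths above: the removed logic counted one decode iteration per streamed chunk whose text was not None and stored the total on the output object. Below is a minimal sketch of that pattern, not the actual client code; RequestFuncOutput is a simplified stand-in and a plain list stands in for the SSE chunk stream.

```python
from dataclasses import dataclass


@dataclass
class RequestFuncOutput:
    # Simplified stand-in for the dataclass in backend_request_func.py.
    generated_text: str = ""
    decode_iteration: int = 0  # field removed by this revert


def consume_stream(chunks, output: RequestFuncOutput) -> None:
    """Mirror the removed pattern: count one decode iteration per non-None chunk."""
    decode_iteration_count = 0
    for text in chunks:
        if text is not None:
            decode_iteration_count += 1
        output.generated_text += text or ""
    output.decode_iteration = decode_iteration_count


out = RequestFuncOutput()
consume_stream(["Hello", ",", " world"], out)
assert out.decode_iteration == 3 and out.generated_text == "Hello, world"
```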

tensorrt_llm/serve/scripts/benchmark_serving.py

Lines changed: 9 additions & 49 deletions

@@ -79,11 +79,6 @@ class BenchmarkMetrics:
     std_e2el_ms: float
     percentiles_e2el_ms: list[tuple[float, float]]
     tput_user: list[float]
-    # Request accuracy rate metrics
-    mean_request_ar: float
-    median_request_ar: float
-    std_request_ar: float
-    percentiles_request_ar: list[tuple[float, float]]


 async def get_request(
@@ -136,7 +131,7 @@ def calculate_metrics(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
-) -> tuple[BenchmarkMetrics, list[int], list[float]]:
+) -> tuple[BenchmarkMetrics, list[int]]:
     actual_output_lens: list[int] = []
     total_input = 0
     completed = 0
@@ -147,7 +142,6 @@ def calculate_metrics(
     ttfts: list[float] = []
     e2els: list[float] = []
     tput_user: list[float] = []
-    request_ars: list[float] = []  # Request accuracy rates
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_tokens
@@ -173,24 +167,9 @@ def calculate_metrics(
             ttfts.append(outputs[i].ttft)
             e2els.append(outputs[i].latency)
             tput_user.append(output_len / (outputs[i].latency))
-
-            # Calculate request accuracy rate (num_generated_tokens / (decode_iteration + 1))
-            decode_iter = outputs[i].decode_iteration
-            if decode_iter >= 0:
-                # For generated tokens, we use output_len - 1 (excluding the first token if needed)
-                # But according to the reference, it should be num_generated_tokens
-                num_generated_tokens = max(0, output_len -
-                                           1) if output_len > 1 else output_len
-                request_ar = num_generated_tokens / (
-                    decode_iter + 1) if decode_iter >= 0 else 0.0
-                request_ars.append(request_ar)
-            else:
-                request_ars.append(0.0)
-
             completed += 1
         else:
             actual_output_lens.append(0)
-            request_ars.append(0.0)

     if goodput_config_dict:
         valid_metrics = []
@@ -249,13 +228,8 @@ def calculate_metrics(
         percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                              for p in selected_percentiles],
         tput_user=np.mean(tput_user or 0),
-        mean_request_ar=np.mean(request_ars or 0),
-        median_request_ar=np.median(request_ars or 0),
-        std_request_ar=np.std(request_ars or 0),
-        percentiles_request_ar=[(p, np.percentile(request_ars or 0, p))
-                                for p in selected_percentiles],
     )
-    return metrics, actual_output_lens, request_ars
+    return metrics, actual_output_lens


 async def benchmark(
@@ -439,7 +413,7 @@ async def limited_request_func(request_func_input, streaming, pbar,
     # Close the session
     await session.close()

-    metrics, actual_output_lens, request_ars = calculate_metrics(
+    metrics, actual_output_lens = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
@@ -467,10 +441,6 @@ async def limited_request_func(request_func_input, streaming, pbar,
                                     metrics.total_token_throughput))
     print("{:<40} {:<10.2f}".format("User throughput (tok/s):",
                                     metrics.tput_user))
-    print("{:<40} {:<10.4f}".format("Mean Request AR:",
-                                    metrics.mean_request_ar))
-    print("{:<40} {:<10.4f}".format("Median Request AR:",
-                                    metrics.median_request_ar))

     result = {
         "duration": benchmark_duration,
@@ -483,17 +453,12 @@ async def limited_request_func(request_func_input, streaming, pbar,
         "output_throughput": metrics.output_throughput,
         "total_token_throughput": metrics.total_token_throughput,
         "user_throughput": metrics.tput_user,
-        "mean_request_ar": metrics.mean_request_ar,
-        "median_request_ar": metrics.median_request_ar,
-        "std_request_ar": metrics.std_request_ar,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
         "ttfts": [output.ttft for output in outputs],
         "itls": [output.itl for output in outputs],
         "generated_texts": [output.generated_text for output in outputs],
         "errors": [output.error for output in outputs],
-        "request_ars": request_ars,
-        "decode_iterations": [output.decode_iteration for output in outputs],
     }

     def process_one_metric(
@@ -579,15 +544,11 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
     metrics = [
         "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
         "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
-        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms",
-        "mean_request_ar", "median_request_ar", "std_request_ar"
+        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
     ]
     # These raw data might be useful, but they are rather big. They can be added
     # later if needed
-    ignored_metrics = [
-        "ttfts", "itls", "generated_texts", "errors", "request_ars",
-        "decode_iterations"
-    ]
+    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={k: [results[k]]
@@ -858,8 +819,7 @@ def create_dataset_and_sample(dataset_name: str):
     # Remove fields with too many data points
     for field in [
             "input_lens", "output_lens", "ttfts", "itls",
-            "generated_texts", "errors", "request_ars",
-            "decode_iterations"
+            "generated_texts", "errors"
     ]:
         if field in result_json:
             del result_json[field]
@@ -1061,11 +1021,11 @@ def create_dataset_and_sample(dataset_name: str):
     parser.add_argument(
         "--percentile-metrics",
         type=str,
-        default="ttft,tpot,itl,request_ar",
+        default="ttft,tpot,itl",
         help="Comma-separated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
-        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\", \"request_ar\". "
-        "Default value is \"ttft,tpot,itl,request_ar\".")
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
+        "Default value is \"ttft,tpot,itl\".")
     parser.add_argument(
         "--metric-percentiles",
         type=str,
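
For reference, the reverted metric computed a per-request acceptance rate as num_generated_tokens / (decode_iteration + 1) and then aggregated it with NumPy mean/median/std/percentile, as the removed lines in calculate_metrics show. Below is a minimal sketch of that formula only; the example inputs are illustrative, not benchmark output.

```python
import numpy as np


def request_acceptance_rate(output_len: int, decode_iteration: int) -> float:
    # Mirrors the removed calculate_metrics() logic:
    # AR = num_generated_tokens / (decode_iteration + 1)
    if decode_iteration < 0:
        return 0.0
    num_generated_tokens = max(0, output_len - 1) if output_len > 1 else output_len
    return num_generated_tokens / (decode_iteration + 1)


# Illustrative inputs: 128 tokens over 40 decode iterations, 64 tokens over 64.
request_ars = [request_acceptance_rate(128, 39), request_acceptance_rate(64, 63)]
print(np.mean(request_ars), np.median(request_ars), np.std(request_ars))
```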
