Skip to content

Commit 5064921

Browse files
dharaneeshvrdsallyom
authored and committed
Record EPP NormalizedTimePerOutputToken metric on streaming mode (kubernetes-sigs#1706)
Update e2e/epp/e2e_test & integration/epp/hermetic_test to validate inference_objective_normalized_time_per_output_token_seconds metric Signed-off-by: Dharaneeshwaran Ravichandran <[email protected]>
1 parent 47a241f commit 5064921

File tree

4 files changed

+24
-4
lines changed

4 files changed

+24
-4
lines changed

pkg/epp/handlers/server.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,7 @@ func (s *StreamingServer) Process(srv extProcPb.ExternalProcessor_ProcessServer)
278278
reqCtx.ResponseCompleteTimestamp = time.Now()
279279
metrics.RecordRequestLatencies(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp)
280280
metrics.RecordResponseSizes(reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.ResponseSize)
281+
metrics.RecordNormalizedTimePerOutputToken(ctx, reqCtx.IncomingModelName, reqCtx.TargetModelName, reqCtx.RequestReceivedTimestamp, reqCtx.ResponseCompleteTimestamp, reqCtx.Usage.CompletionTokens)
281282
}
282283

283284
reqCtx.respBodyResp = generateResponseBodyResponses(v.ResponseBody.Body, v.ResponseBody.EndOfStream)

site-src/guides/metrics-and-observability.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ This guide describes the current state of exposed metrics and how to scrape them
3535
| inference_objective_request_total | Counter | The counter of requests broken out for each model. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
3636
| inference_objective_request_error_total | Counter | The counter of requests errors broken out for each model. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
3737
| inference_objective_request_duration_seconds | Distribution | Distribution of response latency. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
38-
| normalized_time_per_output_token_seconds | Distribution | Distribution of ntpot (response latency per output token) | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
38+
| inference_objective_normalized_time_per_output_token_seconds | Distribution | Distribution of ntpot (response latency per output token) | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
3939
| inference_objective_request_sizes | Distribution | Distribution of request size in bytes. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
4040
| inference_objective_response_sizes | Distribution | Distribution of response size in bytes. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |
4141
| inference_objective_input_tokens | Distribution | Distribution of input token count. | `model_name`=&lt;model-name&gt; <br> `target_model_name`=&lt;target-model-name&gt; | ALPHA |

test/e2e/epp/e2e_test.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -244,8 +244,7 @@ func verifyMetrics() {
244244
"inference_objective_request_total",
245245
"inference_objective_request_error_total",
246246
"inference_objective_request_duration_seconds",
247-
// TODO: normalized_time_per_output_token_seconds is not actually recorded yet
248-
// "normalized_time_per_output_token_seconds",
247+
"inference_objective_normalized_time_per_output_token_seconds",
249248
"inference_objective_request_sizes",
250249
"inference_objective_response_sizes",
251250
"inference_objective_input_tokens",

test/integration/epp/hermetic_test.go

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -727,7 +727,27 @@ func TestFullDuplexStreamed_KubeInferenceObjectiveRequest(t *testing.T) {
727727
inference_objective_input_tokens_bucket{model_name="",target_model_name="",le="+Inf"} 1
728728
inference_objective_input_tokens_sum{model_name="",target_model_name=""} 7
729729
inference_objective_input_tokens_count{model_name="",target_model_name=""} 1
730-
`},
730+
`,
731+
`inference_objective_normalized_time_per_output_token_seconds`: `
732+
# HELP inference_objective_normalized_time_per_output_token_seconds [ALPHA] Inference objective latency divided by number of output tokens in seconds for each model and target model.
733+
# TYPE inference_objective_normalized_time_per_output_token_seconds histogram
734+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.001"} 0
735+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.002"} 0
736+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.005"} 0
737+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.01"} 0
738+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.02"} 0
739+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.05"} 0
740+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.1"} 0
741+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.2"} 0
742+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="0.5"} 0
743+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="1"} 0
744+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="2"} 0
745+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="5"} 0
746+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="10"} 0
747+
inference_objective_normalized_time_per_output_token_seconds_bucket{model_name="",target_model_name="",le="+Inf"} 1
748+
inference_objective_normalized_time_per_output_token_seconds_sum{model_name="",target_model_name=""} 9.223372036854776e+08
749+
inference_objective_normalized_time_per_output_token_seconds_count{model_name="",target_model_name=""} 1
750+
`},
731751
wantResponses: []*extProcPb.ProcessingResponse{
732752
integrationutils.NewResponseHeaders(
733753
&configPb.HeaderValueOption{

0 commit comments

Comments (0)