@@ -161,6 +161,7 @@ pub struct Metrics {
161161 request_duration : HistogramVec ,
162162 input_sequence_length : HistogramVec ,
163163 output_sequence_length : HistogramVec ,
164+ output_tokens_counter : IntCounterVec ,
164165 time_to_first_token : HistogramVec ,
165166 inter_token_latency : HistogramVec ,
166167
@@ -266,6 +267,7 @@ impl Metrics {
266267 /// - `{prefix}_request_duration_seconds` - HistogramVec for the duration of requests
267268 /// - `{prefix}_input_sequence_tokens` - HistogramVec for input sequence length in tokens
268269 /// - `{prefix}_output_sequence_tokens` - HistogramVec for output sequence length in tokens
270+ /// - `{prefix}_output_tokens_total` - IntCounterVec for total output tokens generated (real-time updates)
269271 /// - `{prefix}_time_to_first_token_seconds` - HistogramVec for time to first token in seconds
270272 /// - `{prefix}_inter_token_latency_seconds` - HistogramVec for inter-token latency in seconds
271273 ///
@@ -392,6 +394,15 @@ impl Metrics {
392394 )
393395 . unwrap ( ) ;
394396
397+ let output_tokens_counter = IntCounterVec :: new (
398+ Opts :: new (
399+ frontend_metric_name ( frontend_service:: OUTPUT_TOKENS_TOTAL ) ,
400+ "Total number of output tokens generated (updates in real-time)" ,
401+ ) ,
402+ & [ "model" ] ,
403+ )
404+ . unwrap ( ) ;
405+
395406 // Time to first token buckets: configurable via DYN_METRICS_TTFT_{MIN,MAX,COUNT}
396407 let ( ttft_min, ttft_max, ttft_count) =
397408 parse_bucket_config ( "DYN_METRICS_TTFT" , 0.001 , 480.0 , 18 ) ;
@@ -487,6 +498,7 @@ impl Metrics {
487498 request_duration,
488499 input_sequence_length,
489500 output_sequence_length,
501+ output_tokens_counter,
490502 time_to_first_token,
491503 inter_token_latency,
492504 model_total_kv_blocks,
@@ -581,6 +593,7 @@ impl Metrics {
581593 registry. register ( Box :: new ( self . request_duration . clone ( ) ) ) ?;
582594 registry. register ( Box :: new ( self . input_sequence_length . clone ( ) ) ) ?;
583595 registry. register ( Box :: new ( self . output_sequence_length . clone ( ) ) ) ?;
596+ registry. register ( Box :: new ( self . output_tokens_counter . clone ( ) ) ) ?;
584597 registry. register ( Box :: new ( self . time_to_first_token . clone ( ) ) ) ?;
585598 registry. register ( Box :: new ( self . inter_token_latency . clone ( ) ) ) ?;
586599
@@ -832,6 +845,12 @@ impl ResponseMetricCollector {
832845 return ;
833846 }
834847
848+ // Increment the real-time output tokens counter
849+ self . metrics
850+ . output_tokens_counter
851+ . with_label_values ( & [ & self . model ] )
852+ . inc_by ( num_tokens as u64 ) ;
853+
835854 if self . is_first_token {
836855 // NOTE: when there are multiple tokens in the first response,
837856 // we use the full response time as TTFT and ignore the ITL
@@ -1187,4 +1206,151 @@ mod tests {
11871206 ) ;
11881207 }
11891208 }
1209+
/// Feeds three chunks through a single response collector and checks that the
/// output token counter for the model holds the running total after each one.
#[test]
fn test_output_tokens_counter_increments() {
    let metrics = Arc::new(Metrics::new());
    let registry = prometheus::Registry::new();
    metrics.register(&registry).unwrap();

    let model = "test-model";
    let mut collector = metrics.clone().create_response_collector(model);

    // Each entry: (tokens reported in this chunk, expected cumulative counter value).
    for (chunk_tokens, expected_total) in [(5, 5), (10, 15), (7, 22)] {
        collector.observe_response(100, chunk_tokens);

        let observed = metrics
            .output_tokens_counter
            .with_label_values(&[model])
            .get();
        assert_eq!(observed, expected_total);
    }
}
1251+
/// Checks that reporting a zero-token chunk never moves the output token
/// counter, both before and after real tokens have been recorded.
#[test]
fn test_output_tokens_counter_zero_tokens() {
    let metrics = Arc::new(Metrics::new());
    let registry = prometheus::Registry::new();
    metrics.register(&registry).unwrap();

    let model = "test-model";
    let mut collector = metrics.clone().create_response_collector(model);

    // Reads the current counter value for `model`.
    let total = || {
        metrics
            .output_tokens_counter
            .with_label_values(&[model])
            .get()
    };

    // A zero-token chunk on a fresh collector: counter is expected to stay at 0.
    collector.observe_response(100, 0);
    assert_eq!(total(), 0);

    // A real chunk moves the counter.
    collector.observe_response(100, 5);
    assert_eq!(total(), 5);

    // Another zero-token chunk: counter is expected to stay where it was.
    collector.observe_response(100, 0);
    assert_eq!(total(), 5);
}
1291+
/// Checks that the output token counter is tracked independently per model
/// label: updates through one model's collector must not affect the other's.
#[test]
fn test_output_tokens_counter_multiple_models() {
    let metrics = Arc::new(Metrics::new());
    let registry = prometheus::Registry::new();
    metrics.register(&registry).unwrap();

    let model1 = "model-1";
    let model2 = "model-2";

    // One collector per model, both backed by the same `Metrics` instance.
    let mut collector1 = metrics.clone().create_response_collector(model1);
    let mut collector2 = metrics.clone().create_response_collector(model2);

    // Reads the current counter value for the given model label.
    let tokens_for = |name: &str| {
        metrics
            .output_tokens_counter
            .with_label_values(&[name])
            .get()
    };

    // Only model1 has produced tokens so far.
    collector1.observe_response(100, 10);
    assert_eq!(tokens_for(model1), 10);
    assert_eq!(tokens_for(model2), 0);

    // model2 produces tokens; model1 is unchanged.
    collector2.observe_response(200, 20);
    assert_eq!(tokens_for(model1), 10);
    assert_eq!(tokens_for(model2), 20);

    // model1 produces more tokens; model2 is unchanged.
    collector1.observe_response(100, 5);
    assert_eq!(tokens_for(model1), 15);
    assert_eq!(tokens_for(model2), 20);
}
11901356}
0 commit comments