@@ -516,43 +516,45 @@ def _set_completions(span, choices):
 def _set_streaming_token_metrics(
     request_kwargs, complete_response, span, token_counter, shared_attributes
 ):
-    # use tiktoken calculate token usage
     if not should_record_stream_token_usage():
         return
 
-    # kwargs={'model': 'gpt-3.5', 'messages': [{'role': 'user', 'content': '...'}], 'stream': True}
     prompt_usage = -1
     completion_usage = -1
 
-    # prompt_usage
-    if request_kwargs and request_kwargs.get("messages"):
-        prompt_content = ""
-        # setting the default model_name as gpt-4. As this uses the embedding "cl100k_base" that
-        # is used by most of the other model.
+    # First, try to get usage from API response
+    if complete_response.get("usage"):
+        usage = complete_response["usage"]
+        if usage.get("prompt_tokens"):
+            prompt_usage = usage["prompt_tokens"]
+        if usage.get("completion_tokens"):
+            completion_usage = usage["completion_tokens"]
+
+    # If API response doesn't have usage, fallback to tiktoken calculation
+    if prompt_usage == -1 or completion_usage == -1:
         model_name = (
             complete_response.get("model") or request_kwargs.get("model") or "gpt-4"
         )
-        for msg in request_kwargs.get("messages"):
-            if msg.get("content"):
-                prompt_content += msg.get("content")
-        if model_name:
-            prompt_usage = get_token_count_from_string(prompt_content, model_name)
-
-    # completion_usage
-    if complete_response.get("choices"):
-        completion_content = ""
-        # setting the default model_name as gpt-4. As this uses the embedding "cl100k_base" that
-        # is used by most of the other model.
-        model_name = complete_response.get("model") or "gpt-4"
-
-        for choice in complete_response.get("choices"):
-            if choice.get("message") and choice.get("message").get("content"):
-                completion_content += choice["message"]["content"]
-
-        if model_name:
-            completion_usage = get_token_count_from_string(
-                completion_content, model_name
-            )
+
+        # Calculate prompt tokens if not available from API
+        if prompt_usage == -1 and request_kwargs and request_kwargs.get("messages"):
+            prompt_content = ""
+            for msg in request_kwargs.get("messages"):
+                if msg.get("content"):
+                    prompt_content += msg.get("content")
+            if model_name and should_record_stream_token_usage():
+                prompt_usage = get_token_count_from_string(prompt_content, model_name)
+
+        # Calculate completion tokens if not available from API
+        if completion_usage == -1 and complete_response.get("choices"):
+            completion_content = ""
+            for choice in complete_response.get("choices"):
+                if choice.get("message") and choice.get("message").get("content"):
+                    completion_content += choice["message"]["content"]
+            if model_name and should_record_stream_token_usage():
+                completion_usage = get_token_count_from_string(
+                    completion_content, model_name
+                )
 
     # span record
     _set_span_stream_usage(span, prompt_usage, completion_usage)
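
In other words, this hunk turns token accounting into a two-tier lookup: prefer the usage block the API reports for the stream, and only fall back to a tiktoken estimate for whichever count is still missing. The sketch below shows that logic in isolation; estimate_tokens and stream_token_usage are hypothetical helper names standing in for get_token_count_from_string and the instrumented function, and the cl100k_base default mirrors the "gpt-4" default in the diff.

# Minimal, standalone sketch of the fallback logic (assumed helper names, not part of the package).
import tiktoken


def estimate_tokens(text, model_name):
    # Stand-in for get_token_count_from_string: count tokens with tiktoken,
    # defaulting to the cl100k_base encoding when the model is unknown.
    try:
        encoding = tiktoken.encoding_for_model(model_name)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    return len(encoding.encode(text))


def stream_token_usage(request_kwargs, complete_response):
    # 1. Prefer token counts reported by the API in the accumulated stream response.
    usage = complete_response.get("usage") or {}
    prompt_usage = usage.get("prompt_tokens", -1)
    completion_usage = usage.get("completion_tokens", -1)

    # 2. Fall back to a tiktoken estimate for whichever count is still missing.
    model_name = complete_response.get("model") or request_kwargs.get("model") or "gpt-4"
    if prompt_usage == -1:
        prompt_text = "".join(
            msg.get("content")
            for msg in request_kwargs.get("messages", [])
            if isinstance(msg.get("content"), str)
        )
        prompt_usage = estimate_tokens(prompt_text, model_name)
    if completion_usage == -1:
        completion_text = "".join(
            choice.get("message", {}).get("content") or ""
            for choice in complete_response.get("choices", [])
        )
        completion_usage = estimate_tokens(completion_text, model_name)
    return prompt_usage, completion_usage
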
@@ -971,6 +973,13 @@ def _accumulate_stream_items(item, complete_response):
     complete_response["model"] = item.get("model")
     complete_response["id"] = item.get("id")
 
+    # capture usage information from the last stream chunks
+    if item.get("usage"):
+        complete_response["usage"] = item.get("usage")
+    elif item.get("choices") and item["choices"][0].get("usage"):
+        # Some LLM providers like moonshot mistakenly place token usage information within choices[0], handle this.
+        complete_response["usage"] = item["choices"][0].get("usage")
+
     # prompt filter results
     if item.get("prompt_filter_results"):
         complete_response["prompt_filter_results"] = item.get("prompt_filter_results")