@@ -6,23 +6,23 @@
 import time
 import traceback
 import uuid
-from typing import Any, AsyncGenerator, Literal
+from typing import Any, List, Literal
 
 from openai_harmony import (Author, Conversation, DeveloperContent,
                             HarmonyEncodingName, HarmonyError, Message,
                             ReasoningEffort, Role, StreamableParser,
                             SystemContent, TextContent, ToolDescription,
                             load_harmony_encoding)
 
-from tensorrt_llm.llmapi import RequestOutput
 from tensorrt_llm.logger import logger
 
 # yapf: disable
-from .openai_protocol import (ChatCompletionMessageParam, ChatCompletionRequest,
+from .openai_protocol import (ChatCompletionMessageParam,
                               ChatCompletionResponse,
                               ChatCompletionResponseChoice,
                               ChatCompletionResponseStreamChoice,
-                              ChatCompletionStreamResponse, ChatMessage,
+                              ChatCompletionStreamResponse,
+                              ChatCompletionToolsParam, ChatMessage,
                               DeltaFunctionCall, DeltaMessage, DeltaToolCall,
                               UsageInfo)
 
@@ -1485,36 +1485,72 @@ def _is_tool_call_allowed(self, tool_call: dict[str, Any],
         return True
 
 
-async def handle_streaming_response(
-    harmony_adapter: HarmonyAdapter,
-    generator: RequestOutput,
-    request_id: str,
-    request: ChatCompletionRequest,
-) -> AsyncGenerator[str, None]:
-    """Handle streaming response with harmony format."""
+_SERVE_HARMONY_ADAPTER: HarmonyAdapter = None
+
+
+def get_harmony_adapter():
+    global _SERVE_HARMONY_ADAPTER
+    if _SERVE_HARMONY_ADAPTER is None:
+        _SERVE_HARMONY_ADAPTER = HarmonyAdapter()
+
+    return _SERVE_HARMONY_ADAPTER
+
+
+def handle_streaming_response(tools: List[ChatCompletionToolsParam],
+                              tool_choice: str, outputs: List, model: str,
+                              request_id: str, done: bool,
+                              num_prompt_tokens: int):
     first_iteration = True
-    async for res in generator:
-        output = res.outputs[0]
+    output = outputs[0]
 
-        # Convert tools to dictionary format for harmony adapter (standard pattern)
-        tools_dict = None
-        if request.tools:
-            tools_dict = [tool.model_dump() for tool in request.tools]
+    # Convert tools to dictionary format for harmony adapter (standard pattern)
+    tools_dict = None
+    harmony_adapter = get_harmony_adapter()
+    if tools:
+        tools_dict = [tool.model_dump() for tool in tools]
 
-        # Get tool_choice from request - if "none", don't pass tools to parser
-        tool_choice = getattr(request, 'tool_choice', None)
-        if tool_choice == "none":
-            tools_for_parser = None
-        else:
-            tools_for_parser = tools_dict
+    # Get tool_choice from request - if "none", don't pass tools to parser
+    if tool_choice == "none":
+        tools_for_parser = None
+    else:
+        tools_for_parser = tools_dict
 
-        # Create OpenAI streaming responses
-        try:
+    # Create OpenAI streaming responses
+    try:
+        res = []
+        if done:
+            # Clean up state
+            harmony_adapter.cleanup_stream_state(request_id)
+
+            usage_info = _create_usage_info(num_prompt_tokens, outputs)
+
+            # Send final message with finish_reason
+            final_response = ChatCompletionStreamResponse(
+                model=model,
+                choices=[
+                    ChatCompletionResponseStreamChoice(
+                        index=0,
+                        delta=DeltaMessage(),
+                        finish_reason=output.finish_reason,
+                        stop_reason=output.stop_reason)
+                ],
+            )
+
+            final_response_json = final_response.model_dump_json(
+                exclude_none=True)
+            final_usage_chunk = ChatCompletionStreamResponse(choices=[],
+                                                             model=model,
+                                                             usage=usage_info)
+            final_usage_json = final_usage_chunk.model_dump_json(
+                exclude_none=True)
+            res.append(f"data: {final_response_json}\n\n")
+            res.append(f"data: {final_usage_json}\n\n")
+        else:
             responses = harmony_adapter.create_openai_streaming_response(
                 request_id=request_id,
                 tokens=output.token_ids_diff,
                 available_tools=tools_for_parser,
-                model_name=request.model,
+                model_name=model,
                 tool_choice=tool_choice)
             # Send first response after receiving the first output
             if first_iteration:
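
Note on the new `get_harmony_adapter()` helper above: it caches a single `HarmonyAdapter` at module scope, built on first use. As written it is not guarded against two threads racing through the first call. A minimal sketch of a lock-guarded variant (the lock, helper name, and double-check are illustrative additions, not part of this change):

```python
import threading

_ADAPTER = None  # hypothetical module-level cache, mirroring _SERVE_HARMONY_ADAPTER
_ADAPTER_LOCK = threading.Lock()


def get_adapter():
    """Lazily build and cache one HarmonyAdapter (double-checked locking)."""
    global _ADAPTER
    if _ADAPTER is None:
        with _ADAPTER_LOCK:
            if _ADAPTER is None:  # re-check after acquiring the lock
                _ADAPTER = HarmonyAdapter()
    return _ADAPTER
```
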
@@ -1525,64 +1561,44 @@ async def handle_streaming_response(
                     delta=first_delta)
 
                 first_response = ChatCompletionStreamResponse(
-                    model=request.model,
+                    model=model,
                     choices=[choice],
                 )
 
                 response_json = first_response.model_dump_json(
                     exclude_none=True)
-                yield f"data: {response_json}\n\n"
+                res.append(f"data: {response_json}\n\n")
 
-            for response in responses:
-                yield response
+            res.extend(responses)
 
-        except Exception as e:
-            logger.error(f"Failed to create OpenAI streaming response: {e}")
-            logger.debug(f"Streaming error details: {traceback.format_exc()}")
-            # Clean up state
-            harmony_adapter.cleanup_stream_state(request_id)
-            raise e
-
-        # Clean up state
-        harmony_adapter.cleanup_stream_state(request_id)
+        return res
 
-    # Send final message with finish_reason
-    output = generator.outputs[0]
-    final_response = ChatCompletionStreamResponse(
-        model=request.model,
-        choices=[
-            ChatCompletionResponseStreamChoice(
-                index=0,
-                delta=DeltaMessage(),
-                finish_reason=output.finish_reason,
-                stop_reason=output.stop_reason)
-        ])
+    except Exception as e:
+        logger.error(f"Failed to create OpenAI streaming response: {e}")
+        logger.debug(f"Streaming error details: {traceback.format_exc()}")
+        # Clean up state
+        harmony_adapter.cleanup_stream_state(request_id)
+        raise e
 
-    yield f"data: {final_response.model_dump_json(exclude_unset=True)}\n\n"
-    yield "data: [DONE]\n\n"
 
-
-async def handle_non_streaming_response(
-        harmony_adapter: HarmonyAdapter, promise: RequestOutput,
-        request: ChatCompletionRequest) -> ChatCompletionResponse:
+def handle_non_streaming_response(tools: List[ChatCompletionToolsParam],
+                                  tool_choice: str, outputs: List, model: str,
+                                  num_prompt_tokens: int):
     """Handle non-streaming response with harmony format."""
-    # Get final result
-    await promise
-
     # Parse harmony output to OpenAI format
     # Convert tools to dictionary format for harmony adapter (standard pattern)
     tools_dict = None
-    if request.tools:
-        tools_dict = [tool.model_dump() for tool in request.tools]
+    harmony_adapter = get_harmony_adapter()
+    if tools:
+        tools_dict = [tool.model_dump() for tool in tools]
 
     # Get tool_choice from request - if "none", don't pass tools to parser
-    tool_choice = getattr(request, 'tool_choice', None)
     if tool_choice == "none":
        tools_for_parser = None
     else:
         tools_for_parser = tools_dict
 
-    output = promise.outputs[0]
+    output = outputs[0]
     parsed_output = harmony_adapter.harmony_output_to_openai(
         output.token_ids, tools_for_parser, tool_choice)
 
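
With this hunk, `handle_streaming_response` no longer drives the generator itself: it is now a plain function that turns one batch of outputs into a list of SSE `data:` strings, and the caller invokes it one more time with `done=True` to flush the finish_reason and usage chunks. A hedged sketch of such a caller (the `request`/`generator` objects and their field names are assumptions inferred from the removed code, not part of this diff):

```python
async def stream_chat(request, generator, request_id, num_prompt_tokens):
    """Hypothetical async wrapper around the new synchronous handler."""
    res = None  # assumes the generator yields at least one batch
    async for res in generator:
        for chunk in handle_streaming_response(request.tools,
                                               request.tool_choice,
                                               res.outputs, request.model,
                                               request_id, done=False,
                                               num_prompt_tokens=num_prompt_tokens):
            yield chunk
    # One final call emits the finish_reason chunk and the usage chunk.
    for chunk in handle_streaming_response(request.tools, request.tool_choice,
                                           res.outputs, request.model,
                                           request_id, done=True,
                                           num_prompt_tokens=num_prompt_tokens):
        yield chunk
    yield "data: [DONE]\n\n"  # the [DONE] sentinel the old code yielded itself
```
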
@@ -1597,11 +1613,11 @@ async def handle_non_streaming_response(
                                              output.finish_reason)
 
     # Create usage info from metrics (RequestOutput doesn't have usage in v1)
-    usage_info = _create_usage_info(promise)
+    usage_info = _create_usage_info(num_prompt_tokens, outputs)
 
     # Create response
     response = ChatCompletionResponse(
-        model=request.model,
+        model=model,
         choices=[
             ChatCompletionResponseChoice(
                 index=0,
@@ -1613,7 +1629,6 @@ async def handle_non_streaming_response(
     # Optional: Log if harmony parsing failed (for debugging)
     if parsed_output.get('_harmony_parsing_failed'):
         logger.warning("⚠️ Harmony parsing fell back to raw text decoding")
-        logger.debug(f"request\n\n{request}")
         logger.debug(f"response\n\n{response}\n")
 
     return response
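
`handle_non_streaming_response` follows the same pattern: the `await promise` moved out to the caller, which now passes plain values instead of the request and promise objects. A sketch of the assumed call site (names on `request` and `promise` follow the removed code):

```python
async def complete_chat(request, promise):
    """Hypothetical caller; `promise` stands in for the awaited RequestOutput."""
    await promise  # the await this diff removes from the handler itself
    return handle_non_streaming_response(
        tools=request.tools,
        tool_choice=request.tool_choice,
        outputs=promise.outputs,
        model=request.model,
        num_prompt_tokens=len(promise.prompt_token_ids))
```
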
@@ -1646,15 +1661,10 @@ def _determine_finish_reason(parsed_output: dict[str, Any],
     return reason
 
 
-def _create_usage_info(final_res: RequestOutput) -> UsageInfo:
+def _create_usage_info(num_prompt_tokens, outputs) -> UsageInfo:
     """Create usage info from RequestOutput following serving_chat.py pattern."""
-    # Calculate prompt tokens from prompt_token_ids and encoder_prompt_token_ids
-    assert final_res.prompt_token_ids is not None
-    num_prompt_tokens = len(final_res.prompt_token_ids)
-
     # Calculate completion tokens from all outputs
-    num_generated_tokens = sum(
-        len(output.token_ids) for output in final_res.outputs)
+    num_generated_tokens = sum(len(output.token_ids) for output in outputs)
 
     # Create usage info
     usage = UsageInfo(prompt_tokens=num_prompt_tokens,
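
The reworked `_create_usage_info` no longer reaches into a `RequestOutput`; it just sums token counts over whatever outputs it is given. A tiny worked example with made-up values (assuming `UsageInfo` totals prompt plus completion tokens, as in the standard OpenAI schema):

```python
from types import SimpleNamespace

outputs = [SimpleNamespace(token_ids=[11, 12, 13]),
           SimpleNamespace(token_ids=[14, 15])]
num_prompt_tokens = 7
num_generated_tokens = sum(len(o.token_ids) for o in outputs)  # 3 + 2 = 5
# Expected result: UsageInfo(prompt_tokens=7, completion_tokens=5, total_tokens=12)
```
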