Skip to content

Commit a32a2e4

Browse files
authored
[https://nvbugs/5383702][fix] error propagation in GenerationExecutor (#6793)
Signed-off-by: Superjomn <[email protected]>
1 parent c39454c commit a32a2e4

File tree

3 files changed

+42
-3
lines changed

3 files changed

+42
-3
lines changed

tensorrt_llm/executor/proxy.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,14 +317,15 @@ def mpi_done_callback(future: concurrent.futures.Future):
317317

318318
while True:
319319
if self.worker_init_status_queue.poll(1):
320-
ready_signal = self.worker_init_status_queue.get()
320+
ready_signal, error_trace = self.worker_init_status_queue.get()
321321
break
322322
if any(fut.done() for fut in self.mpi_futures):
323323
logger.error("Executor worker died during initialization.")
324324
raise RuntimeError("Executor worker died during initialization")
325325
self._handle_background_error()
326326

327327
if ready_signal != GenerationExecutorProxy.READY_SIGNAL:
328+
logger.error(f"Executor worker initialization error: {error_trace}")
328329
self.mpi_session.shutdown_abort(reason=ready_signal)
329330
raise RuntimeError(
330331
"Executor worker returned error") from ready_signal

tensorrt_llm/executor/worker.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -781,7 +781,7 @@ def notify_proxy_threads_to_quit():
781781
logger.error(traceback.format_exc())
782782
print_colored_debug(f"error: {traceback.format_exc()}", "red")
783783
if is_leader:
784-
worker_init_status_queue.put(e)
784+
worker_init_status_queue.put((e, traceback.format_exc()))
785785
return
786786

787787
with worker:
@@ -799,7 +799,7 @@ def notify_proxy_threads_to_quit():
799799
mp_stats_queue)
800800
worker._set_iteration_result_queue(worker.kv_events_queues,
801801
kv_cache_events_queue)
802-
worker_init_status_queue.put(ready_signal)
802+
worker_init_status_queue.put((ready_signal, None))
803803
while (req := request_queue.get()) is not None:
804804
if isinstance(req, CancellingRequest):
805805
worker.abort_request(req.id)

tests/unittest/llmapi/test_llm_pytorch.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pytest
44

55
from tensorrt_llm import LLM
6+
from tensorrt_llm.executor import GenerationExecutorWorker
67
from tensorrt_llm.llmapi import KvCacheConfig
78
from tensorrt_llm.llmapi.llm_args import PeftCacheConfig
89
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
@@ -783,3 +784,40 @@ def test_gqa_nemo_lora(tmp_path):
783784
f"got: {base_outputs[0].outputs[0].text}"
784785
finally:
785786
llm.shutdown()
787+
788+
789+
class FailingExecutorWorker(GenerationExecutorWorker):
    """Stand-in worker whose constructor always fails.

    Used to exercise the executor's error-propagation path when worker
    initialization raises before any real setup has happened.
    """

    def __init__(self, *args, **kwargs):
        # Fail immediately; tests match on this exact message.
        failure = RuntimeError(
            "Mock GenerationExecutorWorker initialization failed")
        raise failure
797+
798+
class FailingExecutor:
    """Factory mirroring ``GenerationExecutor``'s ``create`` entry point.

    ``create`` constructs a ``FailingExecutorWorker``, whose ``__init__``
    raises, so callers of ``FailingExecutor.create(...)`` observe the same
    initialization failure the real executor path would propagate.

    NOTE(review): nothing in the visible test below uses this class — the
    test patches ``GenerationExecutor.create`` with ``FailingExecutorWorker``
    directly. Confirm this factory is still needed, or delete it.
    """

    @classmethod
    def create(cls, *args, **kwargs):
        # Delegates to the always-failing worker; raises RuntimeError.
        return FailingExecutorWorker(*args, **kwargs)
804+
805+
806+
def test_llm_with_proxy_error():
    """Test that LLM properly handles GenerationExecutorWorker constructor failures.

    This test mocks the GenerationExecutorWorker to fail during __init__ and
    verifies that the LLM class properly catches and re-raises the error.
    """
    from unittest.mock import patch

    # Patch GenerationExecutor.create so executor construction hits the
    # failing worker. A class is a callable, so it can be the side_effect
    # directly — mock calls FailingExecutorWorker(*args, **kwargs), whose
    # __init__ raises; no lambda wrapper is needed.
    with patch('tensorrt_llm.executor.executor.GenerationExecutor.create',
               side_effect=FailingExecutorWorker):
        with pytest.raises(
                RuntimeError,
                match="Mock GenerationExecutorWorker initialization failed"):
            # No binding kept: construction itself must raise (the previous
            # `llm = ...` assignment was never read — flake8 F841).
            LLM(model=llama_model_path,
                kv_cache_config=global_kvcache_config)

0 commit comments

Comments (0)