Skip to content

Commit e4cafa1

Browse files
Superjomn and dominicshanshan
authored and committed
[https://nvbugs/5383702][fix] error propagation in GenerationExecutor (#6793)
Signed-off-by: Superjomn <[email protected]>
Signed-off-by: Wangshanshan <[email protected]>
1 parent 3497837 commit e4cafa1

File tree

3 files changed

+42
-3
lines changed

3 files changed

+42
-3
lines changed

tensorrt_llm/executor/proxy.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,14 +317,15 @@ def mpi_done_callback(future: concurrent.futures.Future):
317317

318318
while True:
319319
if self.worker_init_status_queue.poll(1):
320-
ready_signal = self.worker_init_status_queue.get()
320+
ready_signal, error_trace = self.worker_init_status_queue.get()
321321
break
322322
if any(fut.done() for fut in self.mpi_futures):
323323
logger.error("Executor worker died during initialization.")
324324
raise RuntimeError("Executor worker died during initialization")
325325
self._handle_background_error()
326326

327327
if ready_signal != GenerationExecutorProxy.READY_SIGNAL:
328+
logger.error(f"Executor worker initialization error: {error_trace}")
328329
self.mpi_session.shutdown_abort(reason=ready_signal)
329330
raise RuntimeError(
330331
"Executor worker returned error") from ready_signal

tensorrt_llm/executor/worker.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -774,7 +774,7 @@ def notify_proxy_threads_to_quit():
774774
logger.error(traceback.format_exc())
775775
print_colored_debug(f"error: {traceback.format_exc()}", "red")
776776
if is_leader:
777-
worker_init_status_queue.put(e)
777+
worker_init_status_queue.put((e, traceback.format_exc()))
778778
return
779779

780780
with worker:
@@ -792,7 +792,7 @@ def notify_proxy_threads_to_quit():
792792
mp_stats_queue)
793793
worker._set_iteration_result_queue(worker.kv_events_queues,
794794
kv_cache_events_queue)
795-
worker_init_status_queue.put(ready_signal)
795+
worker_init_status_queue.put((ready_signal, None))
796796
while (req := request_queue.get()) is not None:
797797
if isinstance(req, CancellingRequest):
798798
worker.abort_request(req.id)

tests/unittest/llmapi/test_llm_pytorch.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pytest
55

66
from tensorrt_llm import LLM
7+
from tensorrt_llm.executor import GenerationExecutorWorker
78
from tensorrt_llm.llmapi import KvCacheConfig
89
from tensorrt_llm.llmapi.llm_args import PeftCacheConfig
910
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
@@ -818,3 +819,40 @@ def test_max_num_token_check(self):
818819
match="should not exceed max_num_tokens"):
819820
ids = [random.randint(10, 100) for _ in range(101)]
820821
llm.generate([ids])
822+
823+
824+
class FailingExecutorWorker(GenerationExecutorWorker):
825+
"""Mock worker that fails during initialization to test error handling."""
826+
827+
def __init__(self, *args, **kwargs):
828+
# Simulate a constructor failure
829+
raise RuntimeError(
830+
"Mock GenerationExecutorWorker initialization failed")
831+
832+
833+
FailingExecutor = type(
834+
"FailingExecutor", (), {
835+
"create":
836+
classmethod(
837+
lambda cls, *args, **kwargs: FailingExecutorWorker(*args, **kwargs))
838+
})
839+
840+
841+
def test_llm_with_proxy_error():
842+
"""Test that LLM properly handles GenerationExecutorWorker constructor failures.
843+
844+
This test mocks the GenerationExecutorWorker to fail during __init__ and
845+
verifies that the LLM class properly catches and re-raises the error.
846+
"""
847+
from unittest.mock import patch
848+
849+
# Test that the error is properly caught and re-raised by LLM
850+
# We patch GenerationExecutor.create directly to return our failing worker
851+
with patch('tensorrt_llm.executor.executor.GenerationExecutor.create',
852+
side_effect=lambda *args, **kwargs: FailingExecutorWorker(
853+
*args, **kwargs)):
854+
with pytest.raises(
855+
RuntimeError,
856+
match="Mock GenerationExecutorWorker initialization failed"):
857+
llm = LLM(model=llama_model_path,
858+
kv_cache_config=global_kvcache_config)

0 commit comments

Comments (0)