Skip to content

Commit a32a2e4

Browse files
authored
[https://nvbugs/5383702][fix] error propagation in GenerationExecutor (#6793)
Signed-off-by: Superjomn <[email protected]>
1 parent c39454c commit a32a2e4

File tree

3 files changed

+42
-3
lines changed

3 files changed

+42
-3
lines changed

tensorrt_llm/executor/proxy.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -317,14 +317,15 @@ def mpi_done_callback(future: concurrent.futures.Future):
317317

318318
while True:
319319
if self.worker_init_status_queue.poll(1):
320-
ready_signal = self.worker_init_status_queue.get()
320+
ready_signal, error_trace = self.worker_init_status_queue.get()
321321
break
322322
if any(fut.done() for fut in self.mpi_futures):
323323
logger.error("Executor worker died during initialization.")
324324
raise RuntimeError("Executor worker died during initialization")
325325
self._handle_background_error()
326326

327327
if ready_signal != GenerationExecutorProxy.READY_SIGNAL:
328+
logger.error(f"Executor worker initialization error: {error_trace}")
328329
self.mpi_session.shutdown_abort(reason=ready_signal)
329330
raise RuntimeError(
330331
"Executor worker returned error") from ready_signal

tensorrt_llm/executor/worker.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -781,7 +781,7 @@ def notify_proxy_threads_to_quit():
781781
logger.error(traceback.format_exc())
782782
print_colored_debug(f"error: {traceback.format_exc()}", "red")
783783
if is_leader:
784-
worker_init_status_queue.put(e)
784+
worker_init_status_queue.put((e, traceback.format_exc()))
785785
return
786786

787787
with worker:
@@ -799,7 +799,7 @@ def notify_proxy_threads_to_quit():
799799
mp_stats_queue)
800800
worker._set_iteration_result_queue(worker.kv_events_queues,
801801
kv_cache_events_queue)
802-
worker_init_status_queue.put(ready_signal)
802+
worker_init_status_queue.put((ready_signal, None))
803803
while (req := request_queue.get()) is not None:
804804
if isinstance(req, CancellingRequest):
805805
worker.abort_request(req.id)

tests/unittest/llmapi/test_llm_pytorch.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import pytest
44

55
from tensorrt_llm import LLM
6+
from tensorrt_llm.executor import GenerationExecutorWorker
67
from tensorrt_llm.llmapi import KvCacheConfig
78
from tensorrt_llm.llmapi.llm_args import PeftCacheConfig
89
from tensorrt_llm.llmapi.tokenizer import TransformersTokenizer
@@ -783,3 +784,40 @@ def test_gqa_nemo_lora(tmp_path):
783784
f"got: {base_outputs[0].outputs[0].text}"
784785
finally:
785786
llm.shutdown()
787+
788+
789+
class FailingExecutorWorker(GenerationExecutorWorker):
    """Stand-in worker whose constructor always fails.

    Used to exercise the executor's error-propagation path when worker
    initialization raises before any real setup has happened.
    """

    def __init__(self, *args, **kwargs):
        # Fail immediately; tests match on this exact message.
        failure = RuntimeError(
            "Mock GenerationExecutorWorker initialization failed")
        raise failure
797+
798+
class FailingExecutor:
    """Factory mirroring ``GenerationExecutor``'s ``create`` entry point.

    ``create`` constructs a ``FailingExecutorWorker``, whose ``__init__``
    raises, so callers of ``FailingExecutor.create(...)`` observe the same
    initialization failure the real executor path would propagate.

    NOTE(review): nothing in the visible test below uses this class — the
    test patches ``GenerationExecutor.create`` with ``FailingExecutorWorker``
    directly. Confirm this factory is still needed, or delete it.
    """

    @classmethod
    def create(cls, *args, **kwargs):
        # Delegates to the always-failing worker; raises RuntimeError.
        return FailingExecutorWorker(*args, **kwargs)
804+
805+
806+
def test_llm_with_proxy_error():
    """Test that LLM properly handles GenerationExecutorWorker constructor failures.

    This test mocks the GenerationExecutorWorker to fail during __init__ and
    verifies that the LLM class properly catches and re-raises the error.
    """
    from unittest.mock import patch

    # Patch GenerationExecutor.create so executor construction hits the
    # failing worker. A class is a callable, so it can be the side_effect
    # directly — mock calls FailingExecutorWorker(*args, **kwargs), whose
    # __init__ raises; no lambda wrapper is needed.
    with patch('tensorrt_llm.executor.executor.GenerationExecutor.create',
               side_effect=FailingExecutorWorker):
        with pytest.raises(
                RuntimeError,
                match="Mock GenerationExecutorWorker initialization failed"):
            # No binding kept: construction itself must raise (the previous
            # `llm = ...` assignment was never read — flake8 F841).
            LLM(model=llama_model_path,
                kv_cache_config=global_kvcache_config)

0 commit comments

Comments (0)