|
@@ -11,6 +11,7 @@
 from typing import Dict, List, Optional, Union
 
 import torch
+from cuda import cudart
 
 from tensorrt_llm._torch.pyexecutor.resource_manager import ResourceManagerType
 from tensorrt_llm._torch.pyexecutor.seq_slot_manager import SeqSlotManager
@@ -25,6 +26,7 @@
 from tensorrt_llm.bindings.internal.batch_manager import (LlmRequestType,
                                                           ReqIdsSet)
 from tensorrt_llm.logger import logger
+from tensorrt_llm.runtime.generation import CUASSERT
 
 from ..distributed import Distributed
 from ..speculative.drafter import Drafter
@@ -644,6 +646,8 @@ def _need_return_log_probs(self, scheduled_requests: ScheduledRequests):
     def _executor_loop_pp(self):
         logger.debug(f"Starting executor loop for pp_rank {self.dist.pp_rank}")
         torch.cuda.set_device(self.device_id)
+        # ensure the CUDA context is created; otherwise, some MPI calls will fail.
+        CUASSERT(cudart.cudaSetDevice(self.device_id))
         microbatch_id = 0
         with self._profiler() as profile_step:
             iter_start_time = time.time()
@@ -897,6 +901,8 @@ def _execute_guided_decoder(self, scheduled_batch, logits):
 
     def _executor_loop(self):
         torch.cuda.set_device(self.device_id)
+        # ensure the CUDA context is created; otherwise, some MPI calls will fail.
+        CUASSERT(cudart.cudaSetDevice(self.device_id))
         with self._profiler() as profile_step:
             sample_state = None
             iter_start_time = time.time()
@@ -1014,6 +1020,8 @@ def _prepare_draft_requests(self):
 
     def _executor_loop_overlap(self):
         torch.cuda.set_device(self.device_id)
+        # ensure the CUDA context is created; otherwise, some MPI calls will fail.
+        CUASSERT(cudart.cudaSetDevice(self.device_id))
         if self.dist.rank == 0 and not self.is_warmup and self.benchmark_req_queues_size > 0 and self.kv_cache_transceiver:
             while self.executor_request_queue.get_request_queue_size(
             ) < self.benchmark_req_queues_size:
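
Why the extra call: `torch.cuda.set_device` only records the target device on the PyTorch side and may defer creation of the CUDA primary context until the first allocation or kernel launch, whereas CUDA-aware MPI calls need an already-initialized context on the device. As of CUDA 12, `cudaSetDevice` through the runtime API eagerly initializes the primary context, which is what the added `CUASSERT(cudart.cudaSetDevice(...))` line relies on. Below is a minimal, hypothetical sketch of the same pattern; `cuassert` and `bind_device` are illustrative names, not the actual `CUASSERT` helper from `tensorrt_llm.runtime.generation`, and the sketch assumes cuda-python's convention that every `cudart` call returns a tuple whose first element is a `cudaError_t`.

```python
# A minimal sketch of the device-binding pattern used in this PR.
# Assumption: cuda-python returns (err, *results) from every cudart call.
import torch
from cuda import cudart


def cuassert(cuda_ret):
    """Raise on a failed cudart call; return any trailing results."""
    err = cuda_ret[0]
    if err != cudart.cudaError_t.cudaSuccess:
        raise RuntimeError(f"CUDA runtime API error: {err}")
    return cuda_ret[1:] if len(cuda_ret) > 1 else None


def bind_device(device_id: int) -> None:
    # Point PyTorch at the device; the primary context may still be
    # created lazily, so this alone is not enough for CUDA-aware MPI.
    torch.cuda.set_device(device_id)
    # cudaSetDevice via the runtime API (CUDA 12+) eagerly initializes
    # the primary context, so later MPI calls find a valid context.
    cuassert(cudart.cudaSetDevice(device_id))
```

Placing this at the top of each executor loop means every rank has a live context before its first communication, which is why the same two lines are repeated in `_executor_loop_pp`, `_executor_loop`, and `_executor_loop_overlap`.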
|