Commit 9392c25

[core][gpu-objects] Garbage collection (#53911)
* Add a callback that sends a `FreeActorObject` RPC to the sender actor to remove the GPU object once it goes out of scope.
* Avoid updating the reference count immediately when GPU object references are inlined. Without this change, the following sequence may occur for small objects:
  1. The sender actor stores tensors in the GPUObjectManager.
  2. The reference count is decremented when the argument is passed to the destination actor task, and the driver sends a `FreeActorObject` RPC to the sender actor.
  3. The driver submits `__ray_send__` to the sender actor, but by then, the tensors have already been removed.
* Instead, we update the reference count when the task finishes.

## Related issue number

Closes #51262
Closes #51273

---------

Signed-off-by: Kai-Hsun Chen <[email protected]>
1 parent: 3a2b6d8

18 files changed (+309, −53 lines)
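Before the per-file diffs, here is a minimal, self-contained Python sketch of the deferred-free behavior described in the commit message. The names below (GPUObjectStore, Owner, add_ref, on_task_finished) are illustrative stand-ins, not Ray's internal APIs; the sketch only models the ordering this change establishes: the sender actor's copy of the tensors is dropped via the FreeActorObject path only after the object goes out of scope and the consuming task has finished, never while `__ray_send__` may still need them.

# Illustrative sketch only; these classes and names are not Ray internals.

class GPUObjectStore:
    """Per-actor store that owns the tensors backing a GPU object."""

    def __init__(self):
        self._objects = {}

    def add_gpu_object(self, obj_id, tensors):
        self._objects[obj_id] = tensors

    def remove_gpu_object(self, obj_id):
        # Stands in for the actor-side handling of the FreeActorObject RPC.
        self._objects.pop(obj_id, None)

    def __len__(self):
        return len(self._objects)


class Owner:
    """Models the owner-side reference counting for a GPU object."""

    def __init__(self, sender_store):
        self._refcounts = {}
        self._sender_store = sender_store

    def add_ref(self, obj_id):
        self._refcounts[obj_id] = self._refcounts.get(obj_id, 0) + 1

    def on_task_finished(self, obj_id):
        # Key point of the commit: decrement when the consuming task finishes,
        # not when the inlined reference is passed as an argument, so the
        # tensors still exist while __ray_send__ runs on the sender.
        self._refcounts[obj_id] -= 1
        if self._refcounts[obj_id] == 0:
            # Out of scope: "send" FreeActorObject to the sender actor.
            self._sender_store.remove_gpu_object(obj_id)


sender_store = GPUObjectStore()
sender_store.add_gpu_object("obj1", ["tensor"])
owner = Owner(sender_store)
owner.add_ref("obj1")            # the receiver task takes a reference
assert len(sender_store) == 1    # tensors survive until the task is done
owner.on_task_finished("obj1")   # task finished; the sender's copy is freed
assert len(sender_store) == 0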

python/ray/_private/serialization.py

Lines changed: 0 additions & 2 deletions
@@ -292,8 +292,6 @@ def _deserialize_pickle5_data(
             gpu_object_manager.fetch_gpu_object(object_id)
             tensors = gpu_object_manager.gpu_object_store.get_gpu_object(object_id)
             ctx.reset_out_of_band_tensors(tensors)
-            # TODO(kevin85421): The current garbage collection implementation for the in-actor object store
-            # is naive. We garbage collect each object after it is consumed once.
             gpu_object_manager.gpu_object_store.remove_gpu_object(object_id)

         try:

python/ray/_raylet.pyx

Lines changed: 7 additions & 0 deletions
@@ -2287,6 +2287,12 @@ cdef execute_task_with_cancellation_handler(
             f"Exited because worker reached max_calls={execution_info.max_calls}"
             " for this method.")

+cdef void free_actor_object_callback(const CObjectID &c_object_id) nogil:
+    with gil:
+        object_id = c_object_id.Hex().decode()
+        gpu_object_manager = ray._private.worker.global_worker.gpu_object_manager
+        gpu_object_manager.gpu_object_store.remove_gpu_object(object_id)
+
 cdef shared_ptr[LocalMemoryBuffer] ray_error_to_memory_buf(ray_error):
     cdef bytes py_bytes = ray_error.to_bytes()
     return make_shared[LocalMemoryBuffer](

@@ -2998,6 +3004,7 @@ cdef class CoreWorker:
         options.driver_name = driver_name
         options.initialize_thread_callback = initialize_pygilstate_for_thread
         options.task_execution_callback = task_execution_handler
+        options.free_actor_object_callback = free_actor_object_callback
         options.check_signals = check_signals
         options.gc_collect = gc_collect
         options.spill_objects = spill_objects_handler

python/ray/experimental/gpu_object_manager/gpu_object_manager.py

Lines changed: 0 additions & 4 deletions
@@ -44,10 +44,6 @@ def __ray_fetch_gpu_object__(self, obj_id: str):
         obj_id
     ), f"obj_id={obj_id} not found in GPU object store"
     tensors = gpu_object_store.get_gpu_object(obj_id)
-    # TODO(kevin85421): The current garbage collection implementation for the
-    # in-actor object store is naive. We garbage collect each object after it
-    # is consumed once.
-    gpu_object_store.remove_gpu_object(obj_id)
     return tensors


python/ray/experimental/gpu_object_manager/gpu_object_store.py

Lines changed: 0 additions & 8 deletions
@@ -56,10 +56,6 @@ def __ray_send__(self, communicator_name: str, obj_id: str, dst_rank: int):
            f"tensor device {tensor.device} does not match device {device}"
        )
        collective.send(tensor, dst_rank, group_name=communicator_name)
-    # TODO(kevin85421): The current garbage collection implementation for the
-    # in-actor object store is naive. We garbage collect each object after it
-    # is consumed once.
-    gpu_object_store.remove_gpu_object(obj_id)


 def __ray_recv__(

@@ -94,10 +90,6 @@ def __ray_fetch_gpu_object__(self, obj_id: str):
        obj_id
    ), f"obj_id={obj_id} not found in GPU object store"
    tensors = gpu_object_store.get_gpu_object(obj_id)
-    # TODO(kevin85421): The current garbage collection implementation for the
-    # in-actor object store is naive. We garbage collect each object after it
-    # is consumed once.
-    gpu_object_store.remove_gpu_object(obj_id)
    return tensors


python/ray/includes/libcoreworker.pxd

Lines changed: 1 addition & 0 deletions
@@ -406,6 +406,7 @@ cdef extern from "ray/core_worker/core_worker.h" nogil:
            int64_t generator_backpressure_num_objects,
            CTensorTransport tensor_transport
        ) nogil) task_execution_callback
+       (void(const CObjectID &) nogil) free_actor_object_callback
        (function[void()]() nogil) initialize_thread_callback
        (CRayStatus() nogil) check_signals
        (void(c_bool) nogil) gc_collect

python/ray/tests/test_gpu_objects_gloo.py

Lines changed: 92 additions & 7 deletions
@@ -5,6 +5,7 @@
 import ray
 from ray.experimental.collective import create_collective_group
 from ray._private.custom_types import TensorTransportEnum
+from ray._common.test_utils import wait_for_condition

 # tensordict is not supported on macos ci, so we skip the tests
 support_tensordict = sys.platform != "darwin"

@@ -32,10 +33,97 @@ def get_gpu_object(self, obj_id: str):
         )
         if gpu_object_store.has_gpu_object(obj_id):
             gpu_object = gpu_object_store.get_gpu_object(obj_id)
-            print(f"gpu_object: {gpu_object}")
             return gpu_object
         return None

+    def get_num_gpu_objects(self):
+        gpu_object_manager = ray._private.worker.global_worker.gpu_object_manager
+        return len(gpu_object_manager.gpu_object_store.gpu_object_store)
+
+
+@pytest.mark.parametrize("data_size_bytes", [100])
+def test_gc_gpu_object(ray_start_regular, data_size_bytes):
+    """
+    For small data, GPU objects are inlined, but the actual data lives
+    on the remote actor. Therefore, if we decrement the reference count
+    upon inlining, we may cause the tensors on the sender actor to be
+    freed before transferring to the receiver actor.
+
+    # TODO(kevin85421): Add a test for large CPU data that is not inlined
+    # after https://github.com/ray-project/ray/issues/54281 is fixed.
+    """
+    world_size = 2
+    actors = [GPUTestActor.remote() for _ in range(world_size)]
+    create_collective_group(actors, backend="torch_gloo")
+
+    small_tensor = torch.randn((1,))
+    cpu_data = b"1" * data_size_bytes
+    data = [small_tensor, cpu_data]
+    sender = actors[0]
+    receiver = actors[1]
+
+    ref1 = sender.echo.remote(data)
+    ref2 = receiver.double.remote(ref1)
+    ref3 = receiver.double.remote(ref1)
+
+    result = ray.get(ref2)
+    assert result[0] == pytest.approx(small_tensor * 2)
+    assert result[1] == cpu_data * 2
+    result = ray.get(ref3)
+    assert result[0] == pytest.approx(small_tensor * 2)
+    assert result[1] == cpu_data * 2
+
+    wait_for_condition(
+        lambda: ray.get(receiver.get_num_gpu_objects.remote()) == 0,
+        timeout=10,
+        retry_interval_ms=100,
+    )
+
+    del ref1
+
+    wait_for_condition(
+        lambda: ray.get(sender.get_num_gpu_objects.remote()) == 0,
+        timeout=10,
+        retry_interval_ms=100,
+    )
+
+
+@pytest.mark.parametrize("data_size_bytes", [100])
+def test_gc_del_ref_before_recv_finish(ray_start_regular, data_size_bytes):
+    """
+    This test deletes the ObjectRef of the GPU object before calling
+    `ray.get` to ensure the receiver finishes receiving the GPU object.
+    """
+    world_size = 2
+    actors = [GPUTestActor.remote() for _ in range(world_size)]
+    create_collective_group(actors, backend="torch_gloo")
+
+    small_tensor = torch.randn((1,))
+    cpu_data = b"1" * data_size_bytes
+    data = [small_tensor, cpu_data]
+    sender = actors[0]
+    receiver = actors[1]
+
+    ref1 = sender.echo.remote(data)
+    ref2 = receiver.double.remote(ref1)
+
+    del ref1
+
+    result = ray.get(ref2)
+    assert result[0] == pytest.approx(small_tensor * 2)
+    assert result[1] == cpu_data * 2
+
+    wait_for_condition(
+        lambda: ray.get(receiver.get_num_gpu_objects.remote()) == 0,
+        timeout=10,
+        retry_interval_ms=100,
+    )
+    wait_for_condition(
+        lambda: ray.get(sender.get_num_gpu_objects.remote()) == 0,
+        timeout=10,
+        retry_interval_ms=100,
+    )
+

 def test_p2p(ray_start_regular):
     world_size = 2

@@ -149,9 +237,10 @@ def test_trigger_out_of_band_tensor_transfer(ray_start_regular):

     tensor = torch.tensor([1, 2, 3])
     gpu_ref = src_actor.echo.remote(tensor)
+    gpu_obj_id = gpu_ref.hex()

     # Check src_actor has the GPU object
-    ret_val_src = ray.get(src_actor.get_gpu_object.remote(gpu_ref.hex()))
+    ret_val_src = ray.get(src_actor.get_gpu_object.remote(gpu_obj_id))
     assert ret_val_src is not None
     assert len(ret_val_src) == 1
     assert torch.equal(ret_val_src[0], tensor)

@@ -160,15 +249,11 @@ def test_trigger_out_of_band_tensor_transfer(ray_start_regular):
     gpu_object_manager.add_gpu_object_ref(gpu_ref, src_actor, TensorTransportEnum.GLOO)

     # Trigger out-of-band tensor transfer from src_actor to dst_actor.
-    # The GPU object will be removed from src_actor's GPU object store
-    # because the current GC implementation garbage collects GPU objects
-    # whenever they are consumed once.
     task_args = (gpu_ref,)
     gpu_object_manager.trigger_out_of_band_tensor_transfer(dst_actor, task_args)
-    assert ray.get(src_actor.get_gpu_object.remote(gpu_ref.hex())) is None

     # Check dst_actor has the GPU object
-    ret_val_dst = ray.get(dst_actor.get_gpu_object.remote(gpu_ref.hex()))
+    ret_val_dst = ray.get(dst_actor.get_gpu_object.remote(gpu_obj_id))
     assert ret_val_dst is not None
     assert len(ret_val_dst) == 1
     assert torch.equal(ret_val_dst[0], tensor)

src/ray/core_worker/core_worker.cc

Lines changed: 15 additions & 1 deletion
@@ -734,7 +734,13 @@ CoreWorker::CoreWorker(CoreWorkerOptions options, const WorkerID &worker_id)
       },
       push_error_callback,
       RayConfig::instance().max_lineage_bytes(),
-      *task_event_buffer_);
+      *task_event_buffer_,
+      /*get_actor_rpc_client_callback=*/
+      [this](const ActorID &actor_id) {
+        auto addr = actor_task_submitter_->GetActorAddress(actor_id);
+        RAY_CHECK(addr.has_value()) << "Actor address not found for actor " << actor_id;
+        return core_worker_client_pool_->GetOrConnect(addr.value());
+      });

   // Create an entry for the driver task in the task table. This task is
   // added immediately with status RUNNING. This allows us to push errors

@@ -4952,6 +4958,14 @@ void CoreWorker::HandlePlasmaObjectReady(rpc::PlasmaObjectReadyRequest request,
   send_reply_callback(Status::OK(), nullptr, nullptr);
 }

+void CoreWorker::HandleFreeActorObject(rpc::FreeActorObjectRequest request,
+                                       rpc::FreeActorObjectReply *reply,
+                                       rpc::SendReplyCallback send_reply_callback) {
+  ObjectID object_id = ObjectID::FromBinary(request.object_id());
+  options_.free_actor_object_callback(object_id);
+  send_reply_callback(Status::OK(), nullptr, nullptr);
+}
+
 void CoreWorker::SetActorId(const ActorID &actor_id) {
   absl::MutexLock lock(&mutex_);
   if (!options_.is_local_mode) {

src/ray/core_worker/core_worker.h

Lines changed: 6 additions & 0 deletions
@@ -1268,6 +1268,12 @@ class CoreWorker : public rpc::CoreWorkerServiceHandler {
   void HandleNumPendingTasks(rpc::NumPendingTasksRequest request,
                              rpc::NumPendingTasksReply *reply,
                              rpc::SendReplyCallback send_reply_callback) override;
+
+  // Free GPU objects from the in-actor GPU object store.
+  void HandleFreeActorObject(rpc::FreeActorObjectRequest request,
+                             rpc::FreeActorObjectReply *reply,
+                             rpc::SendReplyCallback send_reply_callback) override;
+
   ///
   /// Public methods related to async actor call. This should only be used when
   /// the actor is (1) direct actor and (2) using async mode.

src/ray/core_worker/core_worker_options.h

Lines changed: 3 additions & 0 deletions
@@ -87,6 +87,7 @@ struct CoreWorkerOptions {
         raylet_ip_address(""),
         driver_name(""),
         task_execution_callback(nullptr),
+        free_actor_object_callback(nullptr),
         check_signals(nullptr),
         initialize_thread_callback(nullptr),
         gc_collect(nullptr),

@@ -146,6 +147,8 @@ struct CoreWorkerOptions {
   std::string driver_name;
   /// Application-language worker callback to execute tasks.
   TaskExecutionCallback task_execution_callback;
+  /// Callback to free GPU object from the in-actor object store.
+  std::function<void(const ObjectID &)> free_actor_object_callback;
   /// Application-language callback to check for signals that have been received
   /// since calling into C++. This will be called periodically (at least every
   /// 1s) during long-running operations. If the function returns anything but StatusOK,

src/ray/core_worker/reference_count.h

Lines changed: 2 additions & 0 deletions
@@ -771,6 +771,8 @@ class ReferenceCounter : public ReferenceCounterInterface,
   /// counting is enabled, then some raylet must be pinning the object value.
   /// This is the address of that raylet.
   std::optional<NodeID> pinned_at_raylet_id;
+  /// TODO(kevin85421): Make tensor_transport a required field for all constructors.
+  ///
   /// The transport used for the object.
   rpc::TensorTransport tensor_transport = rpc::TensorTransport::OBJECT_STORE;
   /// Whether we own the object. If we own the object, then we are

0 commit comments
