import ray
from ray.experimental.collective import create_collective_group
from ray._private.custom_types import TensorTransportEnum
+from ray._common.test_utils import wait_for_condition

# tensordict is not supported on macos ci, so we skip the tests
support_tensordict = sys.platform != "darwin"
@@ -32,10 +33,97 @@ def get_gpu_object(self, obj_id: str):
        )
        if gpu_object_store.has_gpu_object(obj_id):
            gpu_object = gpu_object_store.get_gpu_object(obj_id)
-            print(f"gpu_object: {gpu_object}")
            return gpu_object
        return None

+    def get_num_gpu_objects(self):
+        gpu_object_manager = ray._private.worker.global_worker.gpu_object_manager
+        return len(gpu_object_manager.gpu_object_store.gpu_object_store)
+
+
+@pytest.mark.parametrize("data_size_bytes", [100])
+def test_gc_gpu_object(ray_start_regular, data_size_bytes):
+    """
+    For small data, GPU objects are inlined, but the actual data lives
+    on the remote actor. Therefore, if we decrement the reference count
+    upon inlining, we may cause the tensors on the sender actor to be
+    freed before transferring to the receiver actor.
+
+    # TODO(kevin85421): Add a test for large CPU data that is not inlined
+    # after https://github.com/ray-project/ray/issues/54281 is fixed.
+    """
+    world_size = 2
+    actors = [GPUTestActor.remote() for _ in range(world_size)]
+    create_collective_group(actors, backend="torch_gloo")
+
+    small_tensor = torch.randn((1,))
+    cpu_data = b"1" * data_size_bytes
+    data = [small_tensor, cpu_data]
+    sender = actors[0]
+    receiver = actors[1]
+
+    ref1 = sender.echo.remote(data)
+    ref2 = receiver.double.remote(ref1)
+    ref3 = receiver.double.remote(ref1)
+
+    result = ray.get(ref2)
+    assert result[0] == pytest.approx(small_tensor * 2)
+    assert result[1] == cpu_data * 2
+    result = ray.get(ref3)
+    assert result[0] == pytest.approx(small_tensor * 2)
+    assert result[1] == cpu_data * 2
+
+    wait_for_condition(
+        lambda: ray.get(receiver.get_num_gpu_objects.remote()) == 0,
+        timeout=10,
+        retry_interval_ms=100,
+    )
+
+    del ref1
+
+    wait_for_condition(
+        lambda: ray.get(sender.get_num_gpu_objects.remote()) == 0,
+        timeout=10,
+        retry_interval_ms=100,
+    )
+
+
+ @pytest .mark .parametrize ("data_size_bytes" , [100 ])
92
+ def test_gc_del_ref_before_recv_finish (ray_start_regular , data_size_bytes ):
93
+ """
94
+ This test deletes the ObjectRef of the GPU object before calling
95
+ `ray.get` to ensure the receiver finishes receiving the GPU object.
96
+ """
97
+ world_size = 2
98
+ actors = [GPUTestActor .remote () for _ in range (world_size )]
99
+ create_collective_group (actors , backend = "torch_gloo" )
100
+
101
+ small_tensor = torch .randn ((1 ,))
102
+ cpu_data = b"1" * data_size_bytes
103
+ data = [small_tensor , cpu_data ]
104
+ sender = actors [0 ]
105
+ receiver = actors [1 ]
106
+
107
+ ref1 = sender .echo .remote (data )
108
+ ref2 = receiver .double .remote (ref1 )
109
+
110
+ del ref1
111
+
112
+ result = ray .get (ref2 )
113
+ assert result [0 ] == pytest .approx (small_tensor * 2 )
114
+ assert result [1 ] == cpu_data * 2
115
+
116
+ wait_for_condition (
117
+ lambda : ray .get (receiver .get_num_gpu_objects .remote ()) == 0 ,
118
+ timeout = 10 ,
119
+ retry_interval_ms = 100 ,
120
+ )
121
+ wait_for_condition (
122
+ lambda : ray .get (sender .get_num_gpu_objects .remote ()) == 0 ,
123
+ timeout = 10 ,
124
+ retry_interval_ms = 100 ,
125
+ )
126
+
39
127
40
128
def test_p2p (ray_start_regular ):
41
129
world_size = 2
@@ -149,9 +237,10 @@ def test_trigger_out_of_band_tensor_transfer(ray_start_regular):

    tensor = torch.tensor([1, 2, 3])
    gpu_ref = src_actor.echo.remote(tensor)
+    gpu_obj_id = gpu_ref.hex()

    # Check src_actor has the GPU object
-    ret_val_src = ray.get(src_actor.get_gpu_object.remote(gpu_ref.hex()))
+    ret_val_src = ray.get(src_actor.get_gpu_object.remote(gpu_obj_id))
    assert ret_val_src is not None
    assert len(ret_val_src) == 1
    assert torch.equal(ret_val_src[0], tensor)
@@ -160,15 +249,11 @@ def test_trigger_out_of_band_tensor_transfer(ray_start_regular):
    gpu_object_manager.add_gpu_object_ref(gpu_ref, src_actor, TensorTransportEnum.GLOO)

    # Trigger out-of-band tensor transfer from src_actor to dst_actor.
-    # The GPU object will be removed from src_actor's GPU object store
-    # because the current GC implementation garbage collects GPU objects
-    # whenever they are consumed once.
    task_args = (gpu_ref,)
    gpu_object_manager.trigger_out_of_band_tensor_transfer(dst_actor, task_args)
-    assert ray.get(src_actor.get_gpu_object.remote(gpu_ref.hex())) is None

    # Check dst_actor has the GPU object
-    ret_val_dst = ray.get(dst_actor.get_gpu_object.remote(gpu_ref.hex()))
+    ret_val_dst = ray.get(dst_actor.get_gpu_object.remote(gpu_obj_id))
    assert ret_val_dst is not None
    assert len(ret_val_dst) == 1
    assert torch.equal(ret_val_dst[0], tensor)
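
# --- Hedged sketch (not part of the diff above) ----------------------------
# The new tests call GPUTestActor.echo and GPUTestActor.double, which are
# defined earlier in this test file and not shown in these hunks. Below is a
# minimal sketch of what such an actor could look like, assuming `echo`
# returns its argument unchanged and `double` multiplies each element by two,
# as implied by the assertions in the tests. The name `_SketchGPUTestActor`
# is hypothetical, and the real GPUTestActor may differ (e.g., it is
# configured for out-of-band tensor transport, which is omitted here).
import ray
import torch


@ray.remote
class _SketchGPUTestActor:
    def echo(self, data):
        # Return the input as-is; in the real test, the tensors inside `data`
        # stay in the actor's GPU object store and travel out-of-band.
        return data

    def double(self, data):
        # Multiply each entry by two. This works for both torch tensors and
        # bytes (bytes * 2 concatenates the payload with itself).
        return [d * 2 for d in data]


# Note: wait_for_condition (from ray._common.test_utils) polls the given
# predicate every retry_interval_ms until it returns True, raising if the
# timeout expires first, which is why the tests can assert that the GPU
# object stores eventually drain to zero.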