-
Notifications
You must be signed in to change notification settings - Fork 6.9k
[data] Allocate GPU resources in ResourceManager #54445
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
ba4ab09
eefe7e6
f1ca9bd
aee6b6d
647e1a5
1c41ada
2e6c979
0ca6058
6492f43
5d1c4fb
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -451,18 +451,14 @@ def process_completed_tasks( | |
|
||
max_bytes_to_read_per_op: Dict[OpState, int] = {} | ||
for op, state in topology.items(): | ||
# Check all backpressure policies for max_task_output_bytes_to_read | ||
# Use the minimum limit from all policies (most restrictive) | ||
max_bytes_to_read = None | ||
for policy in backpressure_policies: | ||
policy_limit = policy.max_task_output_bytes_to_read(op) | ||
if policy_limit is not None: | ||
if max_bytes_to_read is None: | ||
max_bytes_to_read = policy_limit | ||
else: | ||
max_bytes_to_read = min(max_bytes_to_read, policy_limit) | ||
|
||
# If no policy provides a limit, there's no limit | ||
max_bytes_to_read = min( | ||
( | ||
limit | ||
for policy in backpressure_policies | ||
if (limit := policy.max_task_output_bytes_to_read(op)) is not None | ||
), | ||
default=None, | ||
) | ||
Comment on lines
+454
to
+461
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nice. |
||
op.notify_in_task_output_backpressure(max_bytes_to_read == 0) | ||
if max_bytes_to_read is not None: | ||
max_bytes_to_read_per_op[state] = max_bytes_to_read | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -395,8 +395,8 @@ def can_submit_new_task(allocator, op): | |
# 50% of the global limits are shared. | ||
assert allocator._total_shared == ExecutionResources(8, 0, 500) | ||
# Test budgets. | ||
assert allocator._op_budgets[o2] == ExecutionResources(8, float("inf"), 375) | ||
assert allocator._op_budgets[o3] == ExecutionResources(8, float("inf"), 375) | ||
assert allocator._op_budgets[o2] == ExecutionResources(8, 0, 375) | ||
assert allocator._op_budgets[o3] == ExecutionResources(8, 0, 375) | ||
# Test can_submit_new_task and max_task_output_bytes_to_read. | ||
assert can_submit_new_task(allocator, o2) | ||
assert can_submit_new_task(allocator, o3) | ||
|
@@ -425,9 +425,9 @@ def can_submit_new_task(allocator, op): | |
# remaining shared = 1000/2 - 275 = 225 | ||
# Test budgets. | ||
# memory_budget[o2] = 0 + 225/2 = 112.5 | ||
assert allocator._op_budgets[o2] == ExecutionResources(3, float("inf"), 112.5) | ||
assert allocator._op_budgets[o2] == ExecutionResources(3, 0, 112.5) | ||
# memory_budget[o3] = 95 + 225/2 = 207.5 | ||
assert allocator._op_budgets[o3] == ExecutionResources(5, float("inf"), 207.5) | ||
assert allocator._op_budgets[o3] == ExecutionResources(5, 0, 207.5) | ||
# Test can_submit_new_task and max_task_output_bytes_to_read. | ||
assert can_submit_new_task(allocator, o2) | ||
assert can_submit_new_task(allocator, o3) | ||
|
@@ -461,9 +461,9 @@ def can_submit_new_task(allocator, op): | |
|
||
# Test budgets. | ||
# memory_budget[o2] = 0 + 100/2 = 50 | ||
assert allocator._op_budgets[o2] == ExecutionResources(1.5, float("inf"), 50) | ||
assert allocator._op_budgets[o2] == ExecutionResources(1.5, 0, 50) | ||
# memory_budget[o3] = 70 + 100/2 = 120 | ||
assert allocator._op_budgets[o3] == ExecutionResources(2.5, float("inf"), 120) | ||
assert allocator._op_budgets[o3] == ExecutionResources(2.5, 0, 120) | ||
# Test can_submit_new_task and max_task_output_bytes_to_read. | ||
assert can_submit_new_task(allocator, o2) | ||
assert can_submit_new_task(allocator, o3) | ||
|
@@ -624,6 +624,93 @@ def test_only_handle_eligible_ops(self, restore_data_context): | |
allocator.update_usages() | ||
assert o2 not in allocator._op_budgets | ||
|
||
def test_gpu_allocation(self, restore_data_context):
    """Test GPU budget allocation for a GPU vs. a non-GPU operator.

    With 4 GPUs globally and the GPU operator already using 1, the non-GPU
    operator should be budgeted 0 GPUs and the GPU operator the remaining 3.
    """
    # Set reservation config explicitly so the test does not depend on
    # whatever the current defaults happen to be.
    DataContext.get_current().op_resource_reservation_enabled = True
    DataContext.get_current().op_resource_reservation_ratio = 0.5

    o1 = InputDataBuffer(DataContext.get_current(), [])

    # Non-GPU operator: requires no resources at all.
    o2 = mock_map_op(o1)
    o2.min_max_resource_requirements = MagicMock(
        return_value=(ExecutionResources(0, 0, 0), ExecutionResources(0, 0, 0))
    )

    # GPU operator: requires exactly 1 GPU.
    o3 = mock_map_op(o2, ray_remote_args={"num_gpus": 1})
    o3.min_max_resource_requirements = MagicMock(
        return_value=(ExecutionResources(0, 1, 0), ExecutionResources(0, 1, 0))
    )

    topo, _ = build_streaming_topology(o3, ExecutionOptions())

    global_limits = ExecutionResources(gpu=4)
    op_usages = {
        o1: ExecutionResources.zero(),
        o2: ExecutionResources.zero(),
        o3: ExecutionResources(gpu=1),  # GPU op currently using 1 GPU.
    }

    resource_manager = ResourceManager(
        topo, ExecutionOptions(), MagicMock(), DataContext.get_current()
    )
    resource_manager.get_op_usage = MagicMock(side_effect=lambda op: op_usages[op])
    # NOTE: configuring _mem_op_internal/_mem_op_outputs is not needed here
    # (confirmed in review); memory bookkeeping is irrelevant to GPU budgets.
    resource_manager.get_global_limits = MagicMock(return_value=global_limits)

    allocator = resource_manager._op_resource_allocator
    allocator.update_usages()

    # Non-GPU operator should get 0 GPU.
    assert allocator._op_budgets[o2].gpu == 0

    # GPU operator should get remaining GPUs (4 total - 1 used = 3 available).
    assert allocator._op_budgets[o3].gpu == 3
|
||
def test_multiple_gpu_operators(self, restore_data_context):
    """Test GPU allocation for multiple GPU operators.

    Both downstream operators require one GPU each; budgets should reflect
    the 4 global GPUs minus each operator's current usage.
    """
    ctx = DataContext.get_current()
    ctx.op_resource_reservation_enabled = True
    ctx.op_resource_reservation_ratio = 0.5

    o1 = InputDataBuffer(ctx, [])

    # First GPU operator (needs exactly one GPU).
    o2 = mock_map_op(o1, ray_remote_args={"num_gpus": 1})
    o2.min_max_resource_requirements = MagicMock(
        return_value=(ExecutionResources(0, 1, 0), ExecutionResources(0, 1, 0))
    )

    # Second GPU operator (also needs exactly one GPU).
    o3 = mock_map_op(o2, ray_remote_args={"num_gpus": 1})
    o3.min_max_resource_requirements = MagicMock(
        return_value=(ExecutionResources(0, 1, 0), ExecutionResources(0, 1, 0))
    )

    topo, _ = build_streaming_topology(o3, ExecutionOptions())

    usage_by_op = {
        o1: ExecutionResources.zero(),
        o2: ExecutionResources(gpu=1),  # Already holding one GPU.
        o3: ExecutionResources(gpu=0),  # Has not acquired a GPU yet.
    }

    resource_manager = ResourceManager(
        topo, ExecutionOptions(), MagicMock(), ctx
    )
    resource_manager.get_op_usage = MagicMock(
        side_effect=lambda op: usage_by_op[op]
    )
    resource_manager.get_global_limits = MagicMock(
        return_value=ExecutionResources(gpu=4)
    )

    allocator = resource_manager._op_resource_allocator
    allocator.update_usages()

    # Budget is the global total minus what each op currently uses.
    assert allocator._op_budgets[o2].gpu == 3  # 4 total - 1 used
    assert allocator._op_budgets[o3].gpu == 4  # 4 total - 0 used
|
||
|
||
if __name__ == "__main__": | ||
import sys | ||
|
Uh oh!
There was an error while loading. Please reload this page.