-
Notifications
You must be signed in to change notification settings - Fork 6.7k
[data] Extract backpressure-related code from ResourceManager as a policy #54376
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
cbd3215
aa67471
96ca20d
1debe47
e8256e5
47ee243
4c84ff4
846c5fd
d7b4402
d0c5f19
6621c9d
7379a08
770212d
a446936
cb780ef
f1298db
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
import logging | ||
from typing import TYPE_CHECKING, Optional | ||
|
||
from .backpressure_policy import BackpressurePolicy | ||
|
||
if TYPE_CHECKING: | ||
from ray.data._internal.execution.interfaces.physical_operator import ( | ||
PhysicalOperator, | ||
) | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class ResourceBudgetBackpressurePolicy(BackpressurePolicy): | ||
"""A backpressure policy based on resource budgets in ResourceManager.""" | ||
|
||
def can_add_input(self, op: "PhysicalOperator") -> bool: | ||
budget = self._resource_manager.get_budget(op) | ||
if budget is None: | ||
return True | ||
return op.incremental_resource_usage().satisfies_limit(budget) | ||
|
||
def max_task_output_bytes_to_read(self, op: "PhysicalOperator") -> Optional[int]: | ||
"""Determine maximum bytes to read based on the resource budgets. | ||
|
||
Args: | ||
op: The operator to get the limit for. | ||
|
||
Returns: | ||
The maximum bytes that can be read, or None if no limit. | ||
""" | ||
return self._resource_manager.max_task_output_bytes_to_read(op) | ||
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -358,6 +358,11 @@ class OpRuntimeMetrics(metaclass=OpRuntimesMetricsMeta): | |
description="Time spent in task submission backpressure.", | ||
metrics_group=MetricsGroup.TASKS, | ||
) | ||
task_output_backpressure_time: float = metric_field( | ||
default=0, | ||
description="Time spent in task output backpressure.", | ||
metrics_group=MetricsGroup.TASKS, | ||
) | ||
Comment on lines
+361
to
+365
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we track this as per-task metric? I don't think cumulative one is very useful There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It keeps track of the total time when the op is backpressured. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How can be the cumulative time be useful?
Well, you don't show every line you just show the distribution. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we can handle that later. currently we already report cumulative metric for task submission backpressure time. |
||
histogram_buckets_s = [ | ||
0.1, | ||
0.25, | ||
|
@@ -446,6 +451,8 @@ def __init__(self, op: "PhysicalOperator"): | |
self._extra_metrics: Dict[str, Any] = {} | ||
# Start time of current pause due to task submission backpressure | ||
self._task_submission_backpressure_start_time = -1 | ||
# Start time of current pause due to task output backpressure | ||
self._task_output_backpressure_start_time = -1 | ||
|
||
self._internal_inqueue = create_bundle_queue() | ||
self._internal_outqueue = create_bundle_queue() | ||
|
@@ -667,6 +674,17 @@ def on_toggle_task_submission_backpressure(self, in_backpressure): | |
) | ||
self._task_submission_backpressure_start_time = -1 | ||
|
||
def on_toggle_task_output_backpressure(self, in_backpressure): | ||
if in_backpressure and self._task_output_backpressure_start_time == -1: | ||
# backpressure starting, start timer | ||
self._task_output_backpressure_start_time = time.perf_counter() | ||
elif self._task_output_backpressure_start_time != -1: | ||
# backpressure stopping, stop timer | ||
self.task_output_backpressure_time += ( | ||
time.perf_counter() - self._task_output_backpressure_start_time | ||
) | ||
self._task_output_backpressure_start_time = -1 | ||
|
||
def on_output_taken(self, output: RefBundle): | ||
"""Callback when an output is taken from the operator.""" | ||
self.num_outputs_taken += 1 | ||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -306,6 +306,20 @@ def op_resource_allocator(self) -> "OpResourceAllocator": | |||||
assert self._op_resource_allocator is not None | ||||||
return self._op_resource_allocator | ||||||
|
||||||
def max_task_output_bytes_to_read(self, op: PhysicalOperator) -> Optional[int]: | ||||||
"""Return the maximum bytes of pending task outputs can be read for | ||||||
the given operator. None means no limit.""" | ||||||
if self._op_resource_allocator is None: | ||||||
return None | ||||||
return self._op_resource_allocator.max_task_output_bytes_to_read(op) | ||||||
|
||||||
def get_budget(self, op: PhysicalOperator) -> Optional[ExecutionResources]: | ||||||
"""Return the budget for the given operator, or None if the operator | ||||||
has unlimited budget.""" | ||||||
if self._op_resource_allocator is None: | ||||||
return None | ||||||
return self._op_resource_allocator.get_budget(op) | ||||||
|
||||||
|
||||||
class OpResourceAllocator(ABC): | ||||||
"""An interface for dynamic operator resource allocation. | ||||||
|
@@ -323,20 +337,16 @@ def update_usages(self): | |||||
"""Callback to update resource usages.""" | ||||||
... | ||||||
|
||||||
@abstractmethod | ||||||
def can_submit_new_task(self, op: PhysicalOperator) -> bool: | ||||||
"""Return whether the given operator can submit a new task.""" | ||||||
... | ||||||
|
||||||
@abstractmethod | ||||||
def max_task_output_bytes_to_read(self, op: PhysicalOperator) -> Optional[int]: | ||||||
"""Return the maximum bytes of pending task outputs can be read for | ||||||
the given operator. None means no limit.""" | ||||||
... | ||||||
|
||||||
@abstractmethod | ||||||
def get_budget(self, op: PhysicalOperator) -> ExecutionResources: | ||||||
"""Return the budget for the given operator.""" | ||||||
def get_budget(self, op: PhysicalOperator) -> Optional[ExecutionResources]: | ||||||
"""Return the budget for the given operator, or None if the operator | ||||||
has unlimited budget.""" | ||||||
... | ||||||
|
||||||
|
||||||
|
@@ -542,15 +552,8 @@ def _update_reservation(self): | |||||
|
||||||
self._total_shared = remaining | ||||||
|
||||||
def can_submit_new_task(self, op: PhysicalOperator) -> bool: | ||||||
if op not in self._op_budgets: | ||||||
return True | ||||||
budget = self._op_budgets[op] | ||||||
res = op.incremental_resource_usage().satisfies_limit(budget) | ||||||
return res | ||||||
|
||||||
def get_budget(self, op: PhysicalOperator) -> ExecutionResources: | ||||||
return self._op_budgets[op] | ||||||
def get_budget(self, op: PhysicalOperator) -> Optional[ExecutionResources]: | ||||||
return self._op_budgets.get(op, None) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I intentionally changed this to return an optional. Because ineligible ops don't have budgets. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, i understand that but what's the point of specifying default? It will return none anyways There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. got it. somehow I thought it raises key error by default. |
||||||
|
||||||
def _should_unblock_streaming_output_backpressure( | ||||||
self, op: PhysicalOperator | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's make sure we keep existing coverage of this
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yeah, it's already tested in test_resource_manager.