Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions metaflow_extensions/ray/plugins/ray_decorator.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,6 @@ def _worker_node_heartbeat_monitor(
- Control task fails intermittently.
- The worker/control fails intermittently such as node being wiped off
"""
# TODO : Make heartbeat timeout configurable
_status_notifier = TaskStatusNotifier(datastore)
# Worker task statuses are only for bookkeeping.
# They are not used by the control task in any way.
Expand Down Expand Up @@ -88,6 +87,8 @@ class RayDecorator(ParallelDecorator):
"main_port": None,
"worker_polling_freq": 10, # We DONT use this anymore
"all_nodes_started_timeout": 90,
"heartbeat_timeout": 60 * 10, # 10 minutes
"unreachable_timeout": 60 * 10, # 10 minutes
}
IS_PARALLEL = True

Expand Down Expand Up @@ -235,7 +236,8 @@ def worker_task_function():
_worker_node_heartbeat_monitor(
self.deco_datastore,
current.parallel.node_index,
heartbeat_timeout=10 * 60, # 10 minutes (todo: make this configurable)
heartbeat_timeout=self.attributes["heartbeat_timeout"],
unreachable_timeout=self.attributes["unreachable_timeout"],
)

# A status notifier helps the control node publish heartbeats
Expand Down