
Commit 46ac298

sampan-s-nayak and sampan authored
[core] Reapply aggregator refactoring changes + improvements to match existing memory consumption (#57078)
Signed-off-by: sampan <[email protected]>
Co-authored-by: sampan <[email protected]>
1 parent f975a67 commit 46ac298

12 files changed, +1303 −344 lines changed

python/ray/dashboard/modules/aggregator/aggregator_agent.py

Lines changed: 91 additions & 327 deletions
Large diffs are not rendered by default.
python/ray/dashboard/modules/aggregator/constants.py

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
AGGREGATOR_AGENT_METRIC_PREFIX = "aggregator_agent"
CONSUMER_TAG_KEY = "consumer"
Lines changed: 194 additions & 0 deletions
@@ -0,0 +1,194 @@
import asyncio
import time
from collections import deque
from dataclasses import dataclass
from typing import Dict, List, Optional

from ray._private.telemetry.open_telemetry_metric_recorder import (
    OpenTelemetryMetricRecorder,
)
from ray.core.generated import (
    events_base_event_pb2,
)
from ray.core.generated.events_base_event_pb2 import RayEvent
from ray.dashboard.modules.aggregator.constants import (
    AGGREGATOR_AGENT_METRIC_PREFIX,
    CONSUMER_TAG_KEY,
)


@dataclass
class _ConsumerState:
    # Index of the next event to be consumed by this consumer
    cursor_index: int


class MultiConsumerEventBuffer:
    """A buffer that accepts one event at a time and serves events in batches.
    Supports multiple consumers, each with its own cursor index, and tracks the number of events evicted for each consumer.

    The buffer is not thread-safe but is asyncio-friendly. All operations must be called from within the same event loop.

    Arguments:
        max_size: Maximum number of events to store in the buffer.
        max_batch_size: Maximum number of events to return in a batch when calling wait_for_batch.
        common_metric_tags: Tags to add to all metrics.
    """

    def __init__(
        self,
        max_size: int,
        max_batch_size: int,
        common_metric_tags: Optional[Dict[str, str]] = None,
    ):
        self._buffer = deque(maxlen=max_size)
        self._max_size = max_size
        self._lock = asyncio.Lock()
        self._has_new_events_to_consume = asyncio.Condition(self._lock)
        self._consumers: Dict[str, _ConsumerState] = {}

        self._max_batch_size = max_batch_size

        self._common_metrics_tags = common_metric_tags or {}
        self._metric_recorder = OpenTelemetryMetricRecorder()
        self.evicted_events_metric_name = (
            f"{AGGREGATOR_AGENT_METRIC_PREFIX}_queue_dropped_events"
        )
        self._metric_recorder.register_counter_metric(
            self.evicted_events_metric_name,
            "Total number of events dropped because the publish/buffer queue was full.",
        )

    async def add_event(self, event: events_base_event_pb2.RayEvent) -> None:
        """Add an event to the buffer.

        If the buffer is full, the oldest event is dropped.
        """
        async with self._lock:
            dropped_event = None
            if len(self._buffer) >= self._max_size:
                dropped_event = self._buffer.popleft()
            self._buffer.append(event)

            if dropped_event is not None:
                for consumer_name, consumer_state in self._consumers.items():
                    # Update the consumer cursor index and the evicted-events metric since an event was dropped
                    if consumer_state.cursor_index == 0:
                        # The dropped event was the next event this consumer would have consumed; publish the eviction metric
                        self._metric_recorder.set_metric_value(
                            self.evicted_events_metric_name,
                            {
                                **self._common_metrics_tags,
                                CONSUMER_TAG_KEY: consumer_name,
                                "event_type": RayEvent.EventType.Name(
                                    dropped_event.event_type
                                ),
                            },
                            1,
                        )
                    else:
                        # The dropped event was already consumed by this consumer, so adjust the cursor
                        consumer_state.cursor_index -= 1

            # Signal the consumers that there are new events to consume
            self._has_new_events_to_consume.notify_all()

    def _evict_old_events(self) -> None:
        """Clean the buffer by removing events whose index is lower than the
        cursor indexes of all consumers, then update the cursor index of each
        consumer accordingly.
        """
        if not self._consumers:
            return

        min_cursor_index = min(
            consumer_state.cursor_index for consumer_state in self._consumers.values()
        )
        for _ in range(min_cursor_index):
            self._buffer.popleft()

        # Update the cursor index of all consumers
        for consumer_state in self._consumers.values():
            consumer_state.cursor_index -= min_cursor_index

    async def wait_for_batch(
        self, consumer_name: str, timeout_seconds: float = 1.0
    ) -> List[events_base_event_pb2.RayEvent]:
        """Wait for a batch, respecting the configured max_batch_size and timeout_seconds.

        Returns a batch of up to max_batch_size items. Waits for up to
        timeout_seconds after receiving the first event that will be in
        the next batch. After the timeout, returns as many items as are ready.

        Always returns a batch with at least one item: blocks
        indefinitely until an item comes in.

        Arguments:
            consumer_name: Name of the consumer consuming the batch.
            timeout_seconds: Maximum time to wait for a batch.

        Returns:
            A list of up to max_batch_size events ready for consumption.
            The list always contains at least one event.
        """
        max_batch = self._max_batch_size
        batch = []
        async with self._has_new_events_to_consume:
            consumer_state = self._consumers.get(consumer_name)
            if consumer_state is None:
                raise KeyError(f"unknown consumer '{consumer_name}'")

            # Phase 1: read the first event, waiting indefinitely until there is at least one event to consume
            while consumer_state.cursor_index >= len(self._buffer):
                await self._has_new_events_to_consume.wait()

            # Add the first event to the batch
            event = self._buffer[consumer_state.cursor_index]
            consumer_state.cursor_index += 1
            batch.append(event)

            # Phase 2: add items to the batch up to the timeout or until the batch is full
            deadline = time.monotonic() + max(0.0, float(timeout_seconds))
            while len(batch) < max_batch:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    break

                # Drain whatever is available
                while len(batch) < max_batch and consumer_state.cursor_index < len(
                    self._buffer
                ):
                    batch.append(self._buffer[consumer_state.cursor_index])
                    consumer_state.cursor_index += 1

                if len(batch) >= max_batch:
                    break

                # There is still room in the batch but no new events to consume; wait until notified or until the timeout expires
                try:
                    await asyncio.wait_for(
                        self._has_new_events_to_consume.wait(), remaining
                    )
                except asyncio.TimeoutError:
                    # Timed out; return the current batch
                    break

            self._evict_old_events()
            return batch

    async def register_consumer(self, consumer_name: str) -> None:
        """Register a new consumer with a name.

        Arguments:
            consumer_name: A unique name for the consumer.
        """
        async with self._lock:
            if self._consumers.get(consumer_name) is not None:
                raise ValueError(f"consumer '{consumer_name}' already registered")

            self._consumers[consumer_name] = _ConsumerState(cursor_index=0)

    async def size(self) -> int:
        """Get the total number of events in the buffer. Does not take consumer cursors into account."""
        return len(self._buffer)
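
For reference, the new buffer is meant to be driven from a single asyncio event loop: producers call add_event, and each named consumer registers once and then pulls batches with wait_for_batch. The snippet below is a minimal, illustrative sketch of that flow, not code from this commit; the consumer name ("http_publisher"), the buffer sizes, and the use of empty RayEvent messages are assumptions, and it presumes MultiConsumerEventBuffer is importable from the module added here.

import asyncio

from ray.core.generated import events_base_event_pb2


async def demo() -> None:
    # Hypothetical sizes; real values come from the aggregator agent's configuration.
    buffer = MultiConsumerEventBuffer(max_size=1000, max_batch_size=100)
    await buffer.register_consumer("http_publisher")

    # Producer side: events are added one at a time; once max_size is reached,
    # the oldest event is evicted and the per-consumer drop metric is updated.
    for _ in range(3):
        await buffer.add_event(events_base_event_pb2.RayEvent())

    # Consumer side: blocks until at least one event is available, then keeps
    # filling the batch for up to timeout_seconds or until max_batch_size is hit.
    batch = await buffer.wait_for_batch("http_publisher", timeout_seconds=0.5)
    print(f"got {len(batch)} events")


asyncio.run(demo())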

python/ray/dashboard/modules/aggregator/publisher/__init__.py

Whitespace-only changes.
Lines changed: 127 additions & 0 deletions
@@ -0,0 +1,127 @@
import json
import logging
from abc import ABC, abstractmethod
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass
from typing import Callable

import aiohttp

from ray._common.utils import get_or_create_event_loop
from ray._private.protobuf_compat import message_to_json
from ray.core.generated import events_base_event_pb2
from ray.dashboard.modules.aggregator.publisher.configs import PUBLISHER_TIMEOUT_SECONDS

logger = logging.getLogger(__name__)


@dataclass
class PublishStats:
    """Data class that represents the stats of publishing a batch of events."""

    # Whether the publish was successful
    is_publish_successful: bool
    # Number of events published
    num_events_published: int
    # Number of events filtered out
    num_events_filtered_out: int


@dataclass
class PublishBatch:
    """Data class that represents a batch of events to publish."""

    # The list of events to publish
    events: list[events_base_event_pb2.RayEvent]


class PublisherClientInterface(ABC):
    """Abstract interface for publishing Ray event batches to external destinations.

    Implementations should handle the actual publishing logic, filtering,
    and format conversion appropriate for their specific destination type.
    """

    def count_num_events_in_batch(self, batch: PublishBatch) -> int:
        """Count the number of events in a given batch."""
        return len(batch.events)

    @abstractmethod
    async def publish(self, batch: PublishBatch) -> PublishStats:
        """Publish a batch of events to the destination."""
        pass

    @abstractmethod
    async def close(self) -> None:
        """Clean up any resources used by this client. Should be called when the publisher client is no longer required."""
        pass


class AsyncHttpPublisherClient(PublisherClientInterface):
    """Client for publishing Ray event batches to an external HTTP service."""

    def __init__(
        self,
        endpoint: str,
        executor: ThreadPoolExecutor,
        events_filter_fn: Callable[[object], bool],
        timeout: float = PUBLISHER_TIMEOUT_SECONDS,
    ) -> None:
        self._endpoint = endpoint
        self._executor = executor
        self._events_filter_fn = events_filter_fn
        self._timeout = aiohttp.ClientTimeout(total=timeout)
        self._session = None

    async def publish(self, batch: PublishBatch) -> PublishStats:
        events_batch: list[events_base_event_pb2.RayEvent] = batch.events
        if not events_batch:
            # Nothing to publish -> success but nothing published
            return PublishStats(True, 0, 0)
        filtered = [e for e in events_batch if self._events_filter_fn(e)]
        num_filtered_out = len(events_batch) - len(filtered)
        if not filtered:
            # All filtered out -> success but nothing published
            return PublishStats(True, 0, num_filtered_out)

        # Convert protobuf objects to Python dictionaries for the HTTP POST. Run in the executor to avoid blocking the event loop.
        filtered_json = await get_or_create_event_loop().run_in_executor(
            self._executor,
            lambda: [
                json.loads(
                    message_to_json(e, always_print_fields_with_no_presence=True)
                )
                for e in filtered
            ],
        )

        try:
            # Create the session on first use (lazy initialization)
            if not self._session:
                self._session = aiohttp.ClientSession(timeout=self._timeout)

            return await self._send_http_request(filtered_json, num_filtered_out)
        except Exception as e:
            logger.error("Failed to send events to external service. Error: %s", e)
            return PublishStats(False, 0, 0)

    async def _send_http_request(self, json_data, num_filtered_out) -> PublishStats:
        async with self._session.post(
            self._endpoint,
            json=json_data,
        ) as resp:
            resp.raise_for_status()
            return PublishStats(True, len(json_data), num_filtered_out)

    async def close(self) -> None:
        """Close the HTTP session if one was created. Should be called when the publisher client is no longer required."""
        if self._session:
            await self._session.close()
            self._session = None

    def set_session(self, session) -> None:
        """Inject an HTTP client session.

        If a session is set explicitly, it will be used and managed by close().
        """
        self._session = session
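
As a usage note, AsyncHttpPublisherClient creates its aiohttp session lazily on the first publish call and reports failures through PublishStats rather than raising. The sketch below is illustrative only; the endpoint URL, single-worker executor, and accept-everything filter are placeholders rather than values used by the aggregator agent, and with nothing listening at the placeholder endpoint the call simply returns is_publish_successful=False.

import asyncio
from concurrent.futures import ThreadPoolExecutor

from ray.core.generated import events_base_event_pb2


async def demo() -> None:
    client = AsyncHttpPublisherClient(
        endpoint="http://localhost:8080/events",  # placeholder destination
        executor=ThreadPoolExecutor(max_workers=1),
        events_filter_fn=lambda event: True,  # keep every event
    )
    try:
        batch = PublishBatch(events=[events_base_event_pb2.RayEvent()])
        stats = await client.publish(batch)
        print(stats.is_publish_successful, stats.num_events_published)
    finally:
        await client.close()


asyncio.run(demo())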
python/ray/dashboard/modules/aggregator/publisher/configs.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
# Environment variables for the aggregator agent publisher component.
from ray._private import ray_constants

env_var_prefix = "RAY_DASHBOARD_AGGREGATOR_AGENT_PUBLISHER"
# Timeout for the publisher to publish events to the destination
PUBLISHER_TIMEOUT_SECONDS = ray_constants.env_integer(
    f"{env_var_prefix}_TIMEOUT_SECONDS", 3
)
# Maximum number of retries for publishing events to the destination; if less than 0, retry indefinitely
PUBLISHER_MAX_RETRIES = ray_constants.env_integer(f"{env_var_prefix}_MAX_RETRIES", -1)
# Initial backoff time for publishing events to the destination
PUBLISHER_INITIAL_BACKOFF_SECONDS = ray_constants.env_float(
    f"{env_var_prefix}_INITIAL_BACKOFF_SECONDS", 0.01
)
# Maximum backoff time for publishing events to the destination
PUBLISHER_MAX_BACKOFF_SECONDS = ray_constants.env_float(
    f"{env_var_prefix}_MAX_BACKOFF_SECONDS", 5.0
)
# Jitter ratio for publishing events to the destination
PUBLISHER_JITTER_RATIO = ray_constants.env_float(f"{env_var_prefix}_JITTER_RATIO", 0.1)
# Maximum sleep time between sending batches of events to the destination; should be greater than 0.0 to avoid busy looping
PUBLISHER_MAX_BUFFER_SEND_INTERVAL_SECONDS = ray_constants.env_float(
    f"{env_var_prefix}_MAX_BUFFER_SEND_INTERVAL_SECONDS", 0.1
)
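
The backoff-related constants are the usual ingredients of capped exponential backoff with jitter. The sketch below is only a hypothetical illustration of how they could combine into a per-attempt delay, assuming the constants above are in scope; the actual retry loop lives in the publisher code, which is not rendered in this diff, so this formula should not be read as Ray's implementation.

import random


def backoff_delay(attempt: int) -> float:
    # Exponential growth from the initial backoff, capped at the maximum backoff.
    delay = min(
        PUBLISHER_INITIAL_BACKOFF_SECONDS * (2 ** attempt),
        PUBLISHER_MAX_BACKOFF_SECONDS,
    )
    # Apply up to +/- PUBLISHER_JITTER_RATIO of random jitter.
    jitter = delay * PUBLISHER_JITTER_RATIO * random.uniform(-1.0, 1.0)
    return max(0.0, delay + jitter)


# PUBLISHER_MAX_RETRIES < 0 is documented above as "retry indefinitely";
# a non-negative value would bound the number of attempts.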

0 commit comments
