From b32a043258765ecd59dd9caf1c888eb296c77702 Mon Sep 17 00:00:00 2001 From: Gavin Elder Date: Wed, 10 Dec 2025 14:19:14 +0000 Subject: [PATCH 1/9] feat: add platform monitoring guide --- .../enterprise/advanced-topics/monitoring.md | 217 ++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 platform-enterprise_docs/enterprise/advanced-topics/monitoring.md diff --git a/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md b/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md new file mode 100644 index 000000000..310455dba --- /dev/null +++ b/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md @@ -0,0 +1,217 @@ +--- +title: Seqera Platform Monitoring +headline: "Seqera Platform Monitoring" +description: "A guide on relevant platform metrics" +--- + +# Seqera Platform Monitoring + +## Enabling Observability Metrics + +The Seqera Platform Backend has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). + +Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. + +--- + +## Key Metrics to Monitor + +### JVM Memory Metrics + +| Metric | Description | +| ------------------------------ | -------------------------------------------------------- | +| `jvm_buffer_memory_used_bytes` | Memory used by JVM buffer pools (direct, mapped) | +| `jvm_memory_used_bytes` | Amount of used memory by area (heap/non-heap) and region | +| `jvm_memory_committed_bytes` | Memory committed for JVM use | +| `jvm_memory_max_bytes` | Maximum memory available for memory management | +| `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | +| `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | + +### JVM Garbage Collection + +| Metric | Description | +| ------------------------------------- | ----------------------------------------- | +| `jvm_gc_pause_seconds_sum` | Total time spent in GC pauses | +| `jvm_gc_pause_seconds_count` | Number of GC pause events | +| `jvm_gc_pause_seconds_max` | Maximum GC pause duration | +| `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | +| `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | + +### JVM Threads + +| Metric | Description | +| ---------------------------- | ----------------------------------------------------------------- | +| `jvm_threads_live_threads` | Current number of live threads (daemon + non-daemon) | +| `jvm_threads_daemon_threads` | Current number of daemon threads | +| `jvm_threads_peak_threads` | Peak thread count since JVM start | +| `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | + +### JVM Classes + +| Metric | Description | +| ------------------------------------ | -------------------------------------- | +| `jvm_classes_loaded_classes` | Currently loaded classes | +| `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | + +### HTTP Server Requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| 
`http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +### HTTP Client Requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +### Process Metrics + +| Metric | Description | +| ---------------------------- | ------------------------------------ | +| `process_cpu_usage` | Recent CPU usage for the JVM process | +| `process_cpu_time_ns_total` | Total CPU time used by the JVM | +| `process_files_open_files` | Open file descriptor count | +| `process_files_max_files` | Maximum file descriptor limit | +| `process_uptime_seconds` | JVM uptime | +| `process_start_time_seconds` | Process start time (unix epoch) | + +### System Metrics + +| Metric | Description | +| ------------------------ | ------------------------------------- | +| `system_cpu_usage` | System-wide CPU usage | +| `system_cpu_count` | Number of processors available to JVM | +| `system_load_average_1m` | 1-minute load average | + +### Executor Thread Pools + +| Metric | Description | +| -------------------------------- | ---------------------------------------------------------- | +| `executor_active_threads` | Currently active threads by pool (io, blocking, scheduled) | +| `executor_pool_size_threads` | Current thread pool size | +| `executor_pool_max_threads` | Maximum allowed threads in pool | +| `executor_queued_tasks` | Tasks queued for execution | +| `executor_completed_tasks_total` | Total completed tasks | +| `executor_seconds_sum` | Total execution time | + +### Cache Metrics + +| Metric | Description | +| ----------------------- | ----------------------------------- | +| `cache_size` | Number of entries in cache | +| `cache_gets_total` | Cache hits and misses by cache name | +| `cache_puts_total` | Cache entries added | +| `cache_evictions_total` | Cache eviction count | + +### Hibernate/Database Metrics + +| Metric | Description | +| ---------------------------------------- | ---------------------------------------------------- | +| `hibernate_sessions_open_total` | Total sessions opened | +| `hibernate_sessions_closed_total` | Total sessions closed | +| `hibernate_connections_obtained_total` | Database connections obtained | +| `hibernate_query_executions_total` | Total queries executed | +| `hibernate_query_executions_max_seconds` | Slowest query time | +| `hibernate_entities_inserts_total` | Entity insert operations | +| `hibernate_entities_updates_total` | Entity update operations | +| `hibernate_entities_deletes_total` | Entity delete operations | +| `hibernate_entities_loads_total` | Entity load operations | +| `hibernate_transactions_total` | Transaction count | +| `hibernate_flushes_total` | Session flush count | +| `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | + +### Seqera Platform-Specific Metrics + +#### Workflow Metrics + +| Metric | Description | +| ----------------------------------------- | ------------------- | +| `credits_estimation_workflow_added_total` | Workflows added | +| `credits_estimation_workflow_ended_total` | Workflows completed | +| `credits_estimation_task_started_total` | Tasks started | +| `credits_estimation_task_ended_total` | Tasks ended | + +#### Data 
Studio Metrics + +| Metric | Description | +| ------------------------------------------------ | ------------------------------------ | +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | + +#### Error Tracking + +| Metric | Description | +| ------------------------------ | ------------------------- | +| `tower_logs_errors_10secCount` | Errors in last 10 seconds | +| `tower_logs_errors_1minCount` | Errors in last minute | +| `tower_logs_errors_5minCount` | Errors in last 5 minutes | + +### Logging Metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | + +--- + +## Recommended Alerting Thresholds + +### Critical Alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 + +### Warning Alerts + +- `jvm_gc_pause_seconds_sum` rate increasing significantly +- `executor_queued_tasks` > threshold +- `hibernate_optimistic_failures_total` rate increasing +- `http_server_requests_seconds` p99 > acceptable latency + +--- + +## Example PromQL Queries + +### Request Rate (requests per second) + +```promql +rate(http_server_requests_seconds_count[5m]) +``` + +### Average Request Latency + +```promql +rate(http_server_requests_seconds_sum[5m]) / rate(http_server_requests_seconds_count[5m]) +``` + +### JVM Heap Usage Percentage + +```promql +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +### GC Pause Rate + +```promql +rate(jvm_gc_pause_seconds_sum[5m]) +``` + +### Error Rate + +```promql +rate(logback_events_total{level="error"}[5m]) +``` + +### Thread Pool Utilization + +```promql +executor_active_threads / executor_pool_size_threads * 100 +``` From 977cdce3b1f2fa8ba2a03f1bdbfc0486ae4b3c9a Mon Sep 17 00:00:00 2001 From: Gavin Elder Date: Wed, 17 Dec 2025 16:23:35 +0000 Subject: [PATCH 2/9] feat: Add obs signals --- .../enterprise-sidebar.json | 3 +- .../enterprise/advanced-topics/monitoring.md | 567 +++++++++++++--- .../enterprise/advanced-topics/monitoring.md | 626 ++++++++++++++++++ .../enterprise/advanced-topics/monitoring.md | 626 ++++++++++++++++++ .../enterprise/advanced-topics/monitoring.md | 626 ++++++++++++++++++ .../enterprise/advanced-topics/monitoring.md | 626 ++++++++++++++++++ .../enterprise/advanced-topics/monitoring.md | 626 ++++++++++++++++++ .../version-24.1-sidebars.json | 3 +- .../version-24.2-sidebars.json | 3 +- .../version-25.1-sidebars.json | 3 +- .../version-25.2-sidebars.json | 3 +- .../version-25.3-sidebars.json | 3 +- 12 files changed, 3630 insertions(+), 85 deletions(-) create mode 100644 platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/monitoring.md create mode 100644 platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/monitoring.md create mode 100644 platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/monitoring.md create mode 100644 platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/monitoring.md create mode 100644 platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/monitoring.md diff --git a/platform-enterprise_docs/enterprise-sidebar.json 
b/platform-enterprise_docs/enterprise-sidebar.json index 35f4c920e..daad555fe 100644 --- a/platform-enterprise_docs/enterprise-sidebar.json +++ b/platform-enterprise_docs/enterprise-sidebar.json @@ -59,7 +59,8 @@ "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", "enterprise/advanced-topics/content-security-policy", - "enterprise/advanced-topics/jvm-memory-tuning" + "enterprise/advanced-topics/jvm-memory-tuning", + "enterprise/advanced-topics/monitoring" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md b/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md index 310455dba..c7e53d966 100644 --- a/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md +++ b/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md @@ -2,21 +2,173 @@ title: Seqera Platform Monitoring headline: "Seqera Platform Monitoring" description: "A guide on relevant platform metrics" +date_created: "2025-12-17" --- # Seqera Platform Monitoring -## Enabling Observability Metrics +## Enabling observability metrics -The Seqera Platform Backend has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). +Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. --- -## Key Metrics to Monitor +## Key metrics to monitor -### JVM Memory Metrics +### Seqera Platform-specific metrics + +#### Data Studio metrics + +| Metric | Description | +| ------------------------------------------------ | ------------------------------------ | +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | + +Track Data Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. + +**Average startup time by tool:** + +```shell +sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Failed startup rate:** + +```shell +rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +#### Error tracking + +| Metric | Description | +| ------------------------------ | ------------------------- | +| `tower_logs_errors_10secCount` | Errors in last 10 seconds | +| `tower_logs_errors_1minCount` | Errors in last minute | +| `tower_logs_errors_5minCount` | Errors in last 5 minutes | + +Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. 
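+
+**Sustained versus transient errors** can be distinguished by combining the rolling windows; a sketch that returns a value only when errors are present in both the 5-minute and 10-second windows (assuming both gauges come from the same target so their labels match):
+
+```shell
+tower_logs_errors_5minCount{namespace="$namespace"} > 0
+and
+tower_logs_errors_10secCount{namespace="$namespace"} > 0
+```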
+ +**Recent error counts:** + +```shell +tower_logs_errors_10secCount{namespace="$namespace"} +tower_logs_errors_1minCount{namespace="$namespace"} +tower_logs_errors_5minCount{namespace="$namespace"} +``` + +**Log events by severity level:** + +```shell +rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Infrastructure resources + +#### CPU usage + +Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. + +**Backend CPU usage:** + +```shell +rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Compare against requested resources** to determine if the container is over or under-provisioned: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="cpu"}) +``` + +#### Memory usage + +Track working set memory, committed memory, and limits to prevent OOM conditions. + +**Backend memory working set** shows actual memory in use: + +```shell +container_memory_working_set_bytes{container="backend", namespace="$namespace"} +``` + +**Memory requests and limits** define the bounds for container memory allocation: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="memory"}) +max(kube_pod_container_resource_limits{container="backend", namespace="$namespace", resource="memory"}) +``` + +### HTTP server requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| `http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +HTTP metrics reveal application throughput, error rates, and latency patterns. These are essential for understanding user-facing performance. + +**Total request throughput** shows overall API activity: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Error rate (4xx and 5xx responses)** indicates client errors and server failures: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace", status=~"[45].."}[$__rate_interval])) +``` + +**Average latency per endpoint** helps identify slow API paths: + +```shell +sum by (method, uri) (rate(http_server_requests_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (method, uri) (rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Top 10 endpoints by time spent** highlights where server time is consumed for optimization efforts: + +```shell +topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$namespace", app="backend"}[$__rate_interval]))) +``` + +### HTTP client requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +Monitor external API calls and integrations. 
Slow or failing outbound requests can cascade into application performance issues. + +**Outbound request rate:** + +```shell +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Average outbound request duration:** + +```shell +rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) +/ +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum outbound request duration** identifies slow external dependencies: + +```shell +http_client_requests_seconds_max{namespace="$namespace"} +``` + +### JVM memory metrics | Metric | Description | | ------------------------------ | -------------------------------------------------------- | @@ -27,7 +179,38 @@ Combined with infrastructure monitoring tools such as Node Exporter, you can mon | `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | | `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | -### JVM Garbage Collection +JVM memory metrics are critical for preventing OutOfMemoryErrors and identifying memory leaks. Monitor both heap (Java objects) and non-heap (metaspace, code cache) regions. + +**Heap memory usage** shows memory used for Java objects: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="heap"} +``` + +**Non-heap memory** includes metaspace and code cache: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="nonheap"} +``` + +**Heap usage percentage** provides a quick health indicator. Alert when this exceeds 85%: + +```shell +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +**Direct buffer usage** is important for Netty-based applications. High usage can cause native memory issues: + +```shell +jvm_buffer_memory_used_bytes{namespace="$namespace", app="backend", id="direct"} +jvm_buffer_total_capacity_bytes{namespace="$namespace", app="backend", id="direct"} +``` + +### JVM garbage collection | Metric | Description | | ------------------------------------- | ----------------------------------------- | @@ -37,7 +220,36 @@ Combined with infrastructure monitoring tools such as Node Exporter, you can mon | `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | | `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | -### JVM Threads +Garbage collection metrics reveal memory pressure and its impact on application responsiveness. Long GC pauses cause request latency spikes. + +**Average GC pause duration** should remain low (under 100ms for most applications): + +```shell +rate(jvm_gc_pause_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]) +/ +rate(jvm_gc_pause_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum GC pause** identifies worst-case latency impact. Alert if this exceeds 1 second: + +```shell +jvm_gc_pause_seconds_max{app="backend", namespace="$namespace"} +``` + +**Live data size after GC** shows long-lived objects. 
If this grows over time, you may have a memory leak: + +```shell +jvm_gc_live_data_size_bytes{app="backend", namespace="$namespace"} +``` + +**Memory allocation and promotion rates** indicate object creation patterns. High promotion rates suggest objects are living longer than expected: + +```shell +rate(jvm_gc_memory_allocated_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(jvm_gc_memory_promoted_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### JVM threads | Metric | Description | | ---------------------------- | ----------------------------------------------------------------- | @@ -46,31 +258,44 @@ Combined with infrastructure monitoring tools such as Node Exporter, you can mon | `jvm_threads_peak_threads` | Peak thread count since JVM start | | `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | -### JVM Classes +Thread metrics help identify deadlocks, thread pool exhaustion, and concurrency issues. + +**Thread counts** show overall thread activity: + +```shell +jvm_threads_live_threads{app="backend", namespace="$namespace"} +jvm_threads_daemon_threads{app="backend", namespace="$namespace"} +jvm_threads_peak_threads{app="backend", namespace="$namespace"} +``` + +**Thread states** reveal blocking issues. High blocked thread counts indicate lock contention: + +```shell +jvm_threads_states_threads{app="backend", namespace="$namespace"} +``` + +### JVM classes | Metric | Description | | ------------------------------------ | -------------------------------------- | | `jvm_classes_loaded_classes` | Currently loaded classes | | `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | -### HTTP Server Requests +Class loading metrics help identify class loader leaks or excessive dynamic class generation. -| Metric | Description | -| ------------------------------------------ | ------------------------------------------------- | -| `http_server_requests_seconds_count` | Total request count by method, status, and URI | -| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | -| `http_server_requests_seconds_max` | Maximum request duration | -| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | +**Loaded classes** should stabilize after startup. Continuous growth may indicate a class loader leak: -### HTTP Client Requests +```shell +jvm_classes_loaded_classes{namespace="$namespace", app="backend"} +``` -| Metric | Description | -| ------------------------------------ | --------------------------------- | -| `http_client_requests_seconds_count` | Outbound request count | -| `http_client_requests_seconds_sum` | Total outbound request duration | -| `http_client_requests_seconds_max` | Maximum outbound request duration | +**Class unload rate:** + +```shell +rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) +``` -### Process Metrics +### Process metrics | Metric | Description | | ---------------------------- | ------------------------------------ | @@ -81,7 +306,33 @@ Combined with infrastructure monitoring tools such as Node Exporter, you can mon | `process_uptime_seconds` | JVM uptime | | `process_start_time_seconds` | Process start time (unix epoch) | -### System Metrics +Process-level metrics provide visibility into resource consumption and system limits. 
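+
+**Recent restarts** can be detected from the start-time gauge; a sketch using the standard PromQL `changes()` function, complementing the uptime query below:
+
+```shell
+changes(process_start_time_seconds{namespace="$namespace"}[1h]) > 0
+```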
+ +**JVM process CPU usage:** + +```shell +process_cpu_usage{namespace="$namespace"} +``` + +**Open file descriptors** should be monitored against limits. Exhaustion causes connection failures: + +```shell +process_files_open_files{namespace="$namespace"} +``` + +**File descriptor utilization percentage** - alert when this exceeds 90%: + +```shell +(process_files_open_files{namespace="$namespace"} / process_files_max_files{namespace="$namespace"}) * 100 +``` + +**Process uptime** helps identify restart events. Low uptime may indicate stability issues: + +```shell +process_uptime_seconds{namespace="$namespace"} +``` + +### System metrics | Metric | Description | | ------------------------ | ------------------------------------- | @@ -89,7 +340,27 @@ Combined with infrastructure monitoring tools such as Node Exporter, you can mon | `system_cpu_count` | Number of processors available to JVM | | `system_load_average_1m` | 1-minute load average | -### Executor Thread Pools +System metrics provide host-level context for application performance. + +**System-wide CPU usage:** + +```shell +system_cpu_usage{namespace="$namespace"} +``` + +**System load average** should remain below the CPU count for healthy systems: + +```shell +system_load_average_1m{namespace="$namespace"} +``` + +**Available CPU count:** + +```shell +system_cpu_count{namespace="$namespace"} +``` + +### Executor thread pools | Metric | Description | | -------------------------------- | ---------------------------------------------------------- | @@ -100,7 +371,37 @@ Combined with infrastructure monitoring tools such as Node Exporter, you can mon | `executor_completed_tasks_total` | Total completed tasks | | `executor_seconds_sum` | Total execution time | -### Cache Metrics +Thread pool metrics reveal concurrency bottlenecks. Saturated pools cause request queuing and increased latency. + +**Thread pool utilization percentage** - high utilization indicates the pool is near capacity: + +```shell +executor_active_threads{service="backend", namespace="$namespace", name!="scheduled"} +/ +executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} +``` + +**Cron scheduled executor utilization:** + +```shell +executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} +/ +executor_pool_size_threads{service="cron", namespace="$namespace", name="scheduled"} +``` + +**Queued tasks** indicate backlog. Growing queues suggest the pool cannot keep up with demand: + +```shell +executor_queued_tasks{app="backend", namespace="$namespace"} +``` + +**Task completion rate:** + +```shell +rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Cache metrics | Metric | Description | | ----------------------- | ----------------------------------- | @@ -109,7 +410,31 @@ Combined with infrastructure monitoring tools such as Node Exporter, you can mon | `cache_puts_total` | Cache entries added | | `cache_evictions_total` | Cache eviction count | -### Hibernate/Database Metrics +Cache effectiveness directly impacts database load and response times. Low hit rates indicate caching issues. 
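+
+**In-process cache hit rate by cache name** can be derived from `cache_gets_total`; a sketch assuming the standard Micrometer `cache` and `result` labels are present:
+
+```shell
+sum by (cache) (rate(cache_gets_total{namespace="$namespace", result="hit"}[$__rate_interval]))
+/
+sum by (cache) (rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]))
+```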
+ +**Redis cache hit rate** - should be above 70% for effective caching: + +```shell +avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]) +/ +(irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) +``` + +**Cache size by name:** + +```shell +cache_size{namespace="$namespace"} +``` + +**Cache operation rates:** + +```shell +rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Hibernate/Database metrics | Metric | Description | | ---------------------------------------- | ---------------------------------------------------- | @@ -126,92 +451,176 @@ Combined with infrastructure monitoring tools such as Node Exporter, you can mon | `hibernate_flushes_total` | Session flush count | | `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | -### Seqera Platform-Specific Metrics +Database metrics reveal query performance, connection management, and transaction health. + +**Session operations** - open and closed counts should be roughly equal. A growing gap indicates session leaks: -#### Workflow Metrics +```shell +rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` -| Metric | Description | -| ----------------------------------------- | ------------------- | -| `credits_estimation_workflow_added_total` | Workflows added | -| `credits_estimation_workflow_ended_total` | Workflows completed | -| `credits_estimation_task_started_total` | Tasks started | -| `credits_estimation_task_ended_total` | Tasks ended | +**Connection acquisition rate:** -#### Data Studio Metrics +```shell +rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` -| Metric | Description | -| ------------------------------------------------ | ------------------------------------ | -| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | -| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | +**Query execution rate:** -#### Error Tracking +```shell +rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` -| Metric | Description | -| ------------------------------ | ------------------------- | -| `tower_logs_errors_10secCount` | Errors in last 10 seconds | -| `tower_logs_errors_1minCount` | Errors in last minute | -| `tower_logs_errors_5minCount` | Errors in last 5 minutes | +**Query latency by type** helps identify slow queries for optimization: -### Logging Metrics +```shell +sum by (query) (rate(hibernate_query_execution_total_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (query) (rate(hibernate_query_execution_total_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` -| Metric | Description | -| ---------------------- | ----------------------------------------------------- | -| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | +**Slowest query time** - alert if this exceeds 5 seconds: ---- +```shell +hibernate_query_executions_max_seconds{app="backend", namespace="$namespace"} +``` -## Recommended 
Alerting Thresholds +**Entity operation rates** show database write patterns: -### Critical Alerts +```shell +rate(hibernate_entities_inserts_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_updates_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` -- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` -- `process_files_open_files` > 90% of `process_files_max_files` -- `logback_events_total{level="error"}` rate > threshold -- `tower_logs_errors_1minCount` > 0 +**Transaction success/failure rate:** -### Warning Alerts +```shell +sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` -- `jvm_gc_pause_seconds_sum` rate increasing significantly -- `executor_queued_tasks` > threshold -- `hibernate_optimistic_failures_total` rate increasing -- `http_server_requests_seconds` p99 > acceptable latency +**Optimistic lock failures** indicate concurrent modification conflicts. High rates suggest contention issues: ---- +```shell +rate(hibernate_optimistic_failures_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### Connection pool metrics -## Example PromQL Queries +| Metric | Description | +| ------------------------- | ---------------------------- | +| `jdbc_connections_active` | Active database connections | +| `jdbc_connections_max` | Maximum connection pool size | +| `jdbc_connections_min` | Minimum connection pool size | +| `jdbc_connections_usage` | Connection pool usage | -### Request Rate (requests per second) +Connection pool metrics prevent connection exhaustion during traffic bursts. -```promql -rate(http_server_requests_seconds_count[5m]) +**Active connections vs pool limits** - alert when active connections approach the maximum: + +```shell +sum(jdbc_connections_active{app="backend", namespace="$namespace"}) +sum(jdbc_connections_max{app="backend", namespace="$namespace"}) +sum(jdbc_connections_min{app="backend", namespace="$namespace"}) +sum(jdbc_connections_usage{app="backend", namespace="$namespace"}) ``` -### Average Request Latency +### Hibernate cache metrics + +Hibernate caching reduces database load. Monitor hit rates to ensure caches are effective. 
-```promql -rate(http_server_requests_seconds_sum[5m]) / rate(http_server_requests_seconds_count[5m]) +**Query cache hit rate** - should exceed 60%: + +```shell +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -### JVM Heap Usage Percentage +**Query plan cache hit rate:** -```promql -sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +```shell +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -### GC Pause Rate +**Second level cache hit rate by region:** -```promql -rate(jvm_gc_pause_seconds_sum[5m]) +```shell +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -### Error Rate +### Logging metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | -```promql +Log event metrics provide early warning of application issues. + +**Error rate** - track error log frequency for anomaly detection: + +```shell rate(logback_events_total{level="error"}[5m]) ``` -### Thread Pool Utilization +### Kubernetes health + +Monitor pod health to catch deployment or infrastructure issues early. + +**Pods in unhealthy states:** -```promql -executor_active_threads / executor_pool_size_threads * 100 +```shell +sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 ``` + +--- + +## Alerting recommendations + +### Critical alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 +- HTTP 5xx errors > 5% of total requests +- `jdbc_connections_active` > 90% of `jdbc_connections_max` +- Any pods in Failed/Unknown state for > 5 minutes + +### Warning alerts + +- `jvm_gc_pause_seconds_max` > 1 second +- `jvm_gc_live_data_size_bytes` approaching `jvm_gc_max_data_size_bytes` +- Heap usage > 85% of max heap +- `executor_queued_tasks` > threshold +- Executor utilization > 90% +- `hibernate_optimistic_failures_total` rate increasing +- `hibernate_query_executions_max_seconds` > 5 seconds +- `http_server_requests_seconds` p99 > acceptable latency +- Redis cache hit rate < 70% +- Hibernate query cache hit rate < 60% +- Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` +- `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time + +--- + +## Quick reference: Metrics by troubleshooting scenario + +| Issue | Key Metrics to Check | +| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| **Slow application response** | `http_server_requests_seconds` (latency), `jvm_gc_pause_seconds_max`, `hibernate_query_executions_max_seconds`, 
`executor_active_threads` | +| **Out of memory errors** | `jvm_memory_used_bytes`, `jvm_gc_pause_seconds`, `jvm_gc_live_data_size_bytes`, `jvm_buffer_memory_used_bytes` | +| **Database performance** | `hibernate_query_executions_max_seconds`, `jdbc_connections_active`, `hibernate_transactions_total`, cache hit rates | +| **High CPU usage** | `process_cpu_usage`, `system_cpu_usage`, `jvm_threads_live_threads`, `executor_active_threads` | +| **Connection exhaustion** | `jdbc_connections_active`, `jdbc_connections_max`, `hibernate_sessions_open_total` vs `hibernate_sessions_closed_total` | +| **Cache issues** | Redis hit rate, `hibernate_cache_query_requests_total`, `cache_gets_total`, `cache_evictions_total` | +| **Workflow processing delays** | `credits_estimation_workflow_*`, `credits_estimation_task_*`, `executor_queued_tasks`, `tower_logs_errors_*` | +| **Thread starvation** | `executor_active_threads`, `executor_queued_tasks`, `jvm_threads_states_threads{state="blocked"}` | +| **Memory leaks** | `jvm_memory_used_bytes` trending up, `jvm_gc_live_data_size_bytes` growing, `jvm_classes_loaded_classes` growing | +| **GC pressure** | `jvm_gc_pause_seconds_max`, `jvm_gc_memory_promoted_bytes_total`, time in GC vs application time | diff --git a/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/monitoring.md new file mode 100644 index 000000000..c7e53d966 --- /dev/null +++ b/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/monitoring.md @@ -0,0 +1,626 @@ +--- +title: Seqera Platform Monitoring +headline: "Seqera Platform Monitoring" +description: "A guide on relevant platform metrics" +date_created: "2025-12-17" +--- + +# Seqera Platform Monitoring + +## Enabling observability metrics + +Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). + +Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. + +--- + +## Key metrics to monitor + +### Seqera Platform-specific metrics + +#### Data Studio metrics + +| Metric | Description | +| ------------------------------------------------ | ------------------------------------ | +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | + +Track Data Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. 
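+
+**Startup failure ratio** quantifies how often Data Studio sessions fail to start; a sketch combining the success and failure counters used below:
+
+```shell
+sum(increase(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[1h]))
+/
+(sum(increase(data_studio_startup_time_success_seconds_count{namespace="$namespace"}[1h])) + sum(increase(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[1h])))
+```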
+ +**Average startup time by tool:** + +```shell +sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Failed startup rate:** + +```shell +rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +#### Error tracking + +| Metric | Description | +| ------------------------------ | ------------------------- | +| `tower_logs_errors_10secCount` | Errors in last 10 seconds | +| `tower_logs_errors_1minCount` | Errors in last minute | +| `tower_logs_errors_5minCount` | Errors in last 5 minutes | + +Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. + +**Recent error counts:** + +```shell +tower_logs_errors_10secCount{namespace="$namespace"} +tower_logs_errors_1minCount{namespace="$namespace"} +tower_logs_errors_5minCount{namespace="$namespace"} +``` + +**Log events by severity level:** + +```shell +rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Infrastructure resources + +#### CPU usage + +Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. + +**Backend CPU usage:** + +```shell +rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Compare against requested resources** to determine if the container is over or under-provisioned: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="cpu"}) +``` + +#### Memory usage + +Track working set memory, committed memory, and limits to prevent OOM conditions. + +**Backend memory working set** shows actual memory in use: + +```shell +container_memory_working_set_bytes{container="backend", namespace="$namespace"} +``` + +**Memory requests and limits** define the bounds for container memory allocation: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="memory"}) +max(kube_pod_container_resource_limits{container="backend", namespace="$namespace", resource="memory"}) +``` + +### HTTP server requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| `http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +HTTP metrics reveal application throughput, error rates, and latency patterns. These are essential for understanding user-facing performance. 
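+
+**p99 request latency** is the signal referenced in the alerting recommendations; a sketch assuming percentile gauges are published with a `quantile` label:
+
+```shell
+max by (uri) (http_server_requests_seconds{app="backend", namespace="$namespace", quantile="0.99"})
+```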
+ +**Total request throughput** shows overall API activity: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Error rate (4xx and 5xx responses)** indicates client errors and server failures: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace", status=~"[45].."}[$__rate_interval])) +``` + +**Average latency per endpoint** helps identify slow API paths: + +```shell +sum by (method, uri) (rate(http_server_requests_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (method, uri) (rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Top 10 endpoints by time spent** highlights where server time is consumed for optimization efforts: + +```shell +topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$namespace", app="backend"}[$__rate_interval]))) +``` + +### HTTP client requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. + +**Outbound request rate:** + +```shell +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Average outbound request duration:** + +```shell +rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) +/ +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum outbound request duration** identifies slow external dependencies: + +```shell +http_client_requests_seconds_max{namespace="$namespace"} +``` + +### JVM memory metrics + +| Metric | Description | +| ------------------------------ | -------------------------------------------------------- | +| `jvm_buffer_memory_used_bytes` | Memory used by JVM buffer pools (direct, mapped) | +| `jvm_memory_used_bytes` | Amount of used memory by area (heap/non-heap) and region | +| `jvm_memory_committed_bytes` | Memory committed for JVM use | +| `jvm_memory_max_bytes` | Maximum memory available for memory management | +| `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | +| `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | + +JVM memory metrics are critical for preventing OutOfMemoryErrors and identifying memory leaks. Monitor both heap (Java objects) and non-heap (metaspace, code cache) regions. + +**Heap memory usage** shows memory used for Java objects: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="heap"} +``` + +**Non-heap memory** includes metaspace and code cache: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="nonheap"} +``` + +**Heap usage percentage** provides a quick health indicator. 
Alert when this exceeds 85%: + +```shell +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +**Direct buffer usage** is important for Netty-based applications. High usage can cause native memory issues: + +```shell +jvm_buffer_memory_used_bytes{namespace="$namespace", app="backend", id="direct"} +jvm_buffer_total_capacity_bytes{namespace="$namespace", app="backend", id="direct"} +``` + +### JVM garbage collection + +| Metric | Description | +| ------------------------------------- | ----------------------------------------- | +| `jvm_gc_pause_seconds_sum` | Total time spent in GC pauses | +| `jvm_gc_pause_seconds_count` | Number of GC pause events | +| `jvm_gc_pause_seconds_max` | Maximum GC pause duration | +| `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | +| `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | + +Garbage collection metrics reveal memory pressure and its impact on application responsiveness. Long GC pauses cause request latency spikes. + +**Average GC pause duration** should remain low (under 100ms for most applications): + +```shell +rate(jvm_gc_pause_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]) +/ +rate(jvm_gc_pause_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum GC pause** identifies worst-case latency impact. Alert if this exceeds 1 second: + +```shell +jvm_gc_pause_seconds_max{app="backend", namespace="$namespace"} +``` + +**Live data size after GC** shows long-lived objects. If this grows over time, you may have a memory leak: + +```shell +jvm_gc_live_data_size_bytes{app="backend", namespace="$namespace"} +``` + +**Memory allocation and promotion rates** indicate object creation patterns. High promotion rates suggest objects are living longer than expected: + +```shell +rate(jvm_gc_memory_allocated_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(jvm_gc_memory_promoted_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### JVM threads + +| Metric | Description | +| ---------------------------- | ----------------------------------------------------------------- | +| `jvm_threads_live_threads` | Current number of live threads (daemon + non-daemon) | +| `jvm_threads_daemon_threads` | Current number of daemon threads | +| `jvm_threads_peak_threads` | Peak thread count since JVM start | +| `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | + +Thread metrics help identify deadlocks, thread pool exhaustion, and concurrency issues. + +**Thread counts** show overall thread activity: + +```shell +jvm_threads_live_threads{app="backend", namespace="$namespace"} +jvm_threads_daemon_threads{app="backend", namespace="$namespace"} +jvm_threads_peak_threads{app="backend", namespace="$namespace"} +``` + +**Thread states** reveal blocking issues. High blocked thread counts indicate lock contention: + +```shell +jvm_threads_states_threads{app="backend", namespace="$namespace"} +``` + +### JVM classes + +| Metric | Description | +| ------------------------------------ | -------------------------------------- | +| `jvm_classes_loaded_classes` | Currently loaded classes | +| `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | + +Class loading metrics help identify class loader leaks or excessive dynamic class generation. + +**Loaded classes** should stabilize after startup. 
Continuous growth may indicate a class loader leak: + +```shell +jvm_classes_loaded_classes{namespace="$namespace", app="backend"} +``` + +**Class unload rate:** + +```shell +rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) +``` + +### Process metrics + +| Metric | Description | +| ---------------------------- | ------------------------------------ | +| `process_cpu_usage` | Recent CPU usage for the JVM process | +| `process_cpu_time_ns_total` | Total CPU time used by the JVM | +| `process_files_open_files` | Open file descriptor count | +| `process_files_max_files` | Maximum file descriptor limit | +| `process_uptime_seconds` | JVM uptime | +| `process_start_time_seconds` | Process start time (unix epoch) | + +Process-level metrics provide visibility into resource consumption and system limits. + +**JVM process CPU usage:** + +```shell +process_cpu_usage{namespace="$namespace"} +``` + +**Open file descriptors** should be monitored against limits. Exhaustion causes connection failures: + +```shell +process_files_open_files{namespace="$namespace"} +``` + +**File descriptor utilization percentage** - alert when this exceeds 90%: + +```shell +(process_files_open_files{namespace="$namespace"} / process_files_max_files{namespace="$namespace"}) * 100 +``` + +**Process uptime** helps identify restart events. Low uptime may indicate stability issues: + +```shell +process_uptime_seconds{namespace="$namespace"} +``` + +### System metrics + +| Metric | Description | +| ------------------------ | ------------------------------------- | +| `system_cpu_usage` | System-wide CPU usage | +| `system_cpu_count` | Number of processors available to JVM | +| `system_load_average_1m` | 1-minute load average | + +System metrics provide host-level context for application performance. + +**System-wide CPU usage:** + +```shell +system_cpu_usage{namespace="$namespace"} +``` + +**System load average** should remain below the CPU count for healthy systems: + +```shell +system_load_average_1m{namespace="$namespace"} +``` + +**Available CPU count:** + +```shell +system_cpu_count{namespace="$namespace"} +``` + +### Executor thread pools + +| Metric | Description | +| -------------------------------- | ---------------------------------------------------------- | +| `executor_active_threads` | Currently active threads by pool (io, blocking, scheduled) | +| `executor_pool_size_threads` | Current thread pool size | +| `executor_pool_max_threads` | Maximum allowed threads in pool | +| `executor_queued_tasks` | Tasks queued for execution | +| `executor_completed_tasks_total` | Total completed tasks | +| `executor_seconds_sum` | Total execution time | + +Thread pool metrics reveal concurrency bottlenecks. Saturated pools cause request queuing and increased latency. + +**Thread pool utilization percentage** - high utilization indicates the pool is near capacity: + +```shell +executor_active_threads{service="backend", namespace="$namespace", name!="scheduled"} +/ +executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} +``` + +**Cron scheduled executor utilization:** + +```shell +executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} +/ +executor_pool_size_threads{service="cron", namespace="$namespace", name="scheduled"} +``` + +**Queued tasks** indicate backlog. 
Growing queues suggest the pool cannot keep up with demand: + +```shell +executor_queued_tasks{app="backend", namespace="$namespace"} +``` + +**Task completion rate:** + +```shell +rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Cache metrics + +| Metric | Description | +| ----------------------- | ----------------------------------- | +| `cache_size` | Number of entries in cache | +| `cache_gets_total` | Cache hits and misses by cache name | +| `cache_puts_total` | Cache entries added | +| `cache_evictions_total` | Cache eviction count | + +Cache effectiveness directly impacts database load and response times. Low hit rates indicate caching issues. + +**Redis cache hit rate** - should be above 70% for effective caching: + +```shell +avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]) +/ +(irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) +``` + +**Cache size by name:** + +```shell +cache_size{namespace="$namespace"} +``` + +**Cache operation rates:** + +```shell +rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Hibernate/Database metrics + +| Metric | Description | +| ---------------------------------------- | ---------------------------------------------------- | +| `hibernate_sessions_open_total` | Total sessions opened | +| `hibernate_sessions_closed_total` | Total sessions closed | +| `hibernate_connections_obtained_total` | Database connections obtained | +| `hibernate_query_executions_total` | Total queries executed | +| `hibernate_query_executions_max_seconds` | Slowest query time | +| `hibernate_entities_inserts_total` | Entity insert operations | +| `hibernate_entities_updates_total` | Entity update operations | +| `hibernate_entities_deletes_total` | Entity delete operations | +| `hibernate_entities_loads_total` | Entity load operations | +| `hibernate_transactions_total` | Transaction count | +| `hibernate_flushes_total` | Session flush count | +| `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | + +Database metrics reveal query performance, connection management, and transaction health. + +**Session operations** - open and closed counts should be roughly equal. 
A growing gap indicates session leaks: + +```shell +rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Connection acquisition rate:** + +```shell +rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query execution rate:** + +```shell +rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query latency by type** helps identify slow queries for optimization: + +```shell +sum by (query) (rate(hibernate_query_execution_total_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (query) (rate(hibernate_query_execution_total_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Slowest query time** - alert if this exceeds 5 seconds: + +```shell +hibernate_query_executions_max_seconds{app="backend", namespace="$namespace"} +``` + +**Entity operation rates** show database write patterns: + +```shell +rate(hibernate_entities_inserts_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_updates_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Transaction success/failure rate:** + +```shell +sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Optimistic lock failures** indicate concurrent modification conflicts. High rates suggest contention issues: + +```shell +rate(hibernate_optimistic_failures_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### Connection pool metrics + +| Metric | Description | +| ------------------------- | ---------------------------- | +| `jdbc_connections_active` | Active database connections | +| `jdbc_connections_max` | Maximum connection pool size | +| `jdbc_connections_min` | Minimum connection pool size | +| `jdbc_connections_usage` | Connection pool usage | + +Connection pool metrics prevent connection exhaustion during traffic bursts. + +**Active connections vs pool limits** - alert when active connections approach the maximum: + +```shell +sum(jdbc_connections_active{app="backend", namespace="$namespace"}) +sum(jdbc_connections_max{app="backend", namespace="$namespace"}) +sum(jdbc_connections_min{app="backend", namespace="$namespace"}) +sum(jdbc_connections_usage{app="backend", namespace="$namespace"}) +``` + +### Hibernate cache metrics + +Hibernate caching reduces database load. Monitor hit rates to ensure caches are effective. 
+ +**Query cache hit rate** - should exceed 60%: + +```shell +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Query plan cache hit rate:** + +```shell +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Second level cache hit rate by region:** + +```shell +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +### Logging metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | + +Log event metrics provide early warning of application issues. + +**Error rate** - track error log frequency for anomaly detection: + +```shell +rate(logback_events_total{level="error"}[5m]) +``` + +### Kubernetes health + +Monitor pod health to catch deployment or infrastructure issues early. + +**Pods in unhealthy states:** + +```shell +sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 +``` + +--- + +## Alerting recommendations + +### Critical alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 +- HTTP 5xx errors > 5% of total requests +- `jdbc_connections_active` > 90% of `jdbc_connections_max` +- Any pods in Failed/Unknown state for > 5 minutes + +### Warning alerts + +- `jvm_gc_pause_seconds_max` > 1 second +- `jvm_gc_live_data_size_bytes` approaching `jvm_gc_max_data_size_bytes` +- Heap usage > 85% of max heap +- `executor_queued_tasks` > threshold +- Executor utilization > 90% +- `hibernate_optimistic_failures_total` rate increasing +- `hibernate_query_executions_max_seconds` > 5 seconds +- `http_server_requests_seconds` p99 > acceptable latency +- Redis cache hit rate < 70% +- Hibernate query cache hit rate < 60% +- Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` +- `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time + +--- + +## Quick reference: Metrics by troubleshooting scenario + +| Issue | Key Metrics to Check | +| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| **Slow application response** | `http_server_requests_seconds` (latency), `jvm_gc_pause_seconds_max`, `hibernate_query_executions_max_seconds`, `executor_active_threads` | +| **Out of memory errors** | `jvm_memory_used_bytes`, `jvm_gc_pause_seconds`, `jvm_gc_live_data_size_bytes`, `jvm_buffer_memory_used_bytes` | +| **Database performance** | `hibernate_query_executions_max_seconds`, `jdbc_connections_active`, `hibernate_transactions_total`, cache hit rates | +| **High CPU usage** | `process_cpu_usage`, `system_cpu_usage`, 
`jvm_threads_live_threads`, `executor_active_threads` | +| **Connection exhaustion** | `jdbc_connections_active`, `jdbc_connections_max`, `hibernate_sessions_open_total` vs `hibernate_sessions_closed_total` | +| **Cache issues** | Redis hit rate, `hibernate_cache_query_requests_total`, `cache_gets_total`, `cache_evictions_total` | +| **Workflow processing delays** | `credits_estimation_workflow_*`, `credits_estimation_task_*`, `executor_queued_tasks`, `tower_logs_errors_*` | +| **Thread starvation** | `executor_active_threads`, `executor_queued_tasks`, `jvm_threads_states_threads{state="blocked"}` | +| **Memory leaks** | `jvm_memory_used_bytes` trending up, `jvm_gc_live_data_size_bytes` growing, `jvm_classes_loaded_classes` growing | +| **GC pressure** | `jvm_gc_pause_seconds_max`, `jvm_gc_memory_promoted_bytes_total`, time in GC vs application time | diff --git a/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/monitoring.md new file mode 100644 index 000000000..c7e53d966 --- /dev/null +++ b/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/monitoring.md @@ -0,0 +1,626 @@ +--- +title: Seqera Platform Monitoring +headline: "Seqera Platform Monitoring" +description: "A guide on relevant platform metrics" +date_created: "2025-12-17" +--- + +# Seqera Platform Monitoring + +## Enabling observability metrics + +Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). + +Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. + +--- + +## Key metrics to monitor + +### Seqera Platform-specific metrics + +#### Data Studio metrics + +| Metric | Description | +| ------------------------------------------------ | ------------------------------------ | +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | + +Track Data Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. + +**Average startup time by tool:** + +```shell +sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Failed startup rate:** + +```shell +rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +#### Error tracking + +| Metric | Description | +| ------------------------------ | ------------------------- | +| `tower_logs_errors_10secCount` | Errors in last 10 seconds | +| `tower_logs_errors_1minCount` | Errors in last minute | +| `tower_logs_errors_5minCount` | Errors in last 5 minutes | + +Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. 
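+
+As a rough sketch for separating sustained problems from transient blips, require the one-minute error count to stay non-zero across a longer window (the 15-minute window is an assumption to tune for your environment):
+
+```shell
+min_over_time(tower_logs_errors_1minCount{namespace="$namespace"}[15m]) > 0
+```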
+ +**Recent error counts:** + +```shell +tower_logs_errors_10secCount{namespace="$namespace"} +tower_logs_errors_1minCount{namespace="$namespace"} +tower_logs_errors_5minCount{namespace="$namespace"} +``` + +**Log events by severity level:** + +```shell +rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Infrastructure resources + +#### CPU usage + +Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. + +**Backend CPU usage:** + +```shell +rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Compare against requested resources** to determine if the container is over or under-provisioned: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="cpu"}) +``` + +#### Memory usage + +Track working set memory, committed memory, and limits to prevent OOM conditions. + +**Backend memory working set** shows actual memory in use: + +```shell +container_memory_working_set_bytes{container="backend", namespace="$namespace"} +``` + +**Memory requests and limits** define the bounds for container memory allocation: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="memory"}) +max(kube_pod_container_resource_limits{container="backend", namespace="$namespace", resource="memory"}) +``` + +### HTTP server requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| `http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +HTTP metrics reveal application throughput, error rates, and latency patterns. These are essential for understanding user-facing performance. + +**Total request throughput** shows overall API activity: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Error rate (4xx and 5xx responses)** indicates client errors and server failures: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace", status=~"[45].."}[$__rate_interval])) +``` + +**Average latency per endpoint** helps identify slow API paths: + +```shell +sum by (method, uri) (rate(http_server_requests_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (method, uri) (rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Top 10 endpoints by time spent** highlights where server time is consumed for optimization efforts: + +```shell +topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$namespace", app="backend"}[$__rate_interval]))) +``` + +### HTTP client requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +Monitor external API calls and integrations. 
Slow or failing outbound requests can cascade into application performance issues. + +**Outbound request rate:** + +```shell +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Average outbound request duration:** + +```shell +rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) +/ +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum outbound request duration** identifies slow external dependencies: + +```shell +http_client_requests_seconds_max{namespace="$namespace"} +``` + +### JVM memory metrics + +| Metric | Description | +| ------------------------------ | -------------------------------------------------------- | +| `jvm_buffer_memory_used_bytes` | Memory used by JVM buffer pools (direct, mapped) | +| `jvm_memory_used_bytes` | Amount of used memory by area (heap/non-heap) and region | +| `jvm_memory_committed_bytes` | Memory committed for JVM use | +| `jvm_memory_max_bytes` | Maximum memory available for memory management | +| `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | +| `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | + +JVM memory metrics are critical for preventing OutOfMemoryErrors and identifying memory leaks. Monitor both heap (Java objects) and non-heap (metaspace, code cache) regions. + +**Heap memory usage** shows memory used for Java objects: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="heap"} +``` + +**Non-heap memory** includes metaspace and code cache: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="nonheap"} +``` + +**Heap usage percentage** provides a quick health indicator. Alert when this exceeds 85%: + +```shell +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +**Direct buffer usage** is important for Netty-based applications. High usage can cause native memory issues: + +```shell +jvm_buffer_memory_used_bytes{namespace="$namespace", app="backend", id="direct"} +jvm_buffer_total_capacity_bytes{namespace="$namespace", app="backend", id="direct"} +``` + +### JVM garbage collection + +| Metric | Description | +| ------------------------------------- | ----------------------------------------- | +| `jvm_gc_pause_seconds_sum` | Total time spent in GC pauses | +| `jvm_gc_pause_seconds_count` | Number of GC pause events | +| `jvm_gc_pause_seconds_max` | Maximum GC pause duration | +| `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | +| `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | + +Garbage collection metrics reveal memory pressure and its impact on application responsiveness. Long GC pauses cause request latency spikes. + +**Average GC pause duration** should remain low (under 100ms for most applications): + +```shell +rate(jvm_gc_pause_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]) +/ +rate(jvm_gc_pause_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum GC pause** identifies worst-case latency impact. 
Alert if this exceeds 1 second: + +```shell +jvm_gc_pause_seconds_max{app="backend", namespace="$namespace"} +``` + +**Live data size after GC** shows long-lived objects. If this grows over time, you may have a memory leak: + +```shell +jvm_gc_live_data_size_bytes{app="backend", namespace="$namespace"} +``` + +**Memory allocation and promotion rates** indicate object creation patterns. High promotion rates suggest objects are living longer than expected: + +```shell +rate(jvm_gc_memory_allocated_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(jvm_gc_memory_promoted_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### JVM threads + +| Metric | Description | +| ---------------------------- | ----------------------------------------------------------------- | +| `jvm_threads_live_threads` | Current number of live threads (daemon + non-daemon) | +| `jvm_threads_daemon_threads` | Current number of daemon threads | +| `jvm_threads_peak_threads` | Peak thread count since JVM start | +| `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | + +Thread metrics help identify deadlocks, thread pool exhaustion, and concurrency issues. + +**Thread counts** show overall thread activity: + +```shell +jvm_threads_live_threads{app="backend", namespace="$namespace"} +jvm_threads_daemon_threads{app="backend", namespace="$namespace"} +jvm_threads_peak_threads{app="backend", namespace="$namespace"} +``` + +**Thread states** reveal blocking issues. High blocked thread counts indicate lock contention: + +```shell +jvm_threads_states_threads{app="backend", namespace="$namespace"} +``` + +### JVM classes + +| Metric | Description | +| ------------------------------------ | -------------------------------------- | +| `jvm_classes_loaded_classes` | Currently loaded classes | +| `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | + +Class loading metrics help identify class loader leaks or excessive dynamic class generation. + +**Loaded classes** should stabilize after startup. Continuous growth may indicate a class loader leak: + +```shell +jvm_classes_loaded_classes{namespace="$namespace", app="backend"} +``` + +**Class unload rate:** + +```shell +rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) +``` + +### Process metrics + +| Metric | Description | +| ---------------------------- | ------------------------------------ | +| `process_cpu_usage` | Recent CPU usage for the JVM process | +| `process_cpu_time_ns_total` | Total CPU time used by the JVM | +| `process_files_open_files` | Open file descriptor count | +| `process_files_max_files` | Maximum file descriptor limit | +| `process_uptime_seconds` | JVM uptime | +| `process_start_time_seconds` | Process start time (unix epoch) | + +Process-level metrics provide visibility into resource consumption and system limits. + +**JVM process CPU usage:** + +```shell +process_cpu_usage{namespace="$namespace"} +``` + +**Open file descriptors** should be monitored against limits. Exhaustion causes connection failures: + +```shell +process_files_open_files{namespace="$namespace"} +``` + +**File descriptor utilization percentage** - alert when this exceeds 90%: + +```shell +(process_files_open_files{namespace="$namespace"} / process_files_max_files{namespace="$namespace"}) * 100 +``` + +**Process uptime** helps identify restart events. 
Low uptime may indicate stability issues: + +```shell +process_uptime_seconds{namespace="$namespace"} +``` + +### System metrics + +| Metric | Description | +| ------------------------ | ------------------------------------- | +| `system_cpu_usage` | System-wide CPU usage | +| `system_cpu_count` | Number of processors available to JVM | +| `system_load_average_1m` | 1-minute load average | + +System metrics provide host-level context for application performance. + +**System-wide CPU usage:** + +```shell +system_cpu_usage{namespace="$namespace"} +``` + +**System load average** should remain below the CPU count for healthy systems: + +```shell +system_load_average_1m{namespace="$namespace"} +``` + +**Available CPU count:** + +```shell +system_cpu_count{namespace="$namespace"} +``` + +### Executor thread pools + +| Metric | Description | +| -------------------------------- | ---------------------------------------------------------- | +| `executor_active_threads` | Currently active threads by pool (io, blocking, scheduled) | +| `executor_pool_size_threads` | Current thread pool size | +| `executor_pool_max_threads` | Maximum allowed threads in pool | +| `executor_queued_tasks` | Tasks queued for execution | +| `executor_completed_tasks_total` | Total completed tasks | +| `executor_seconds_sum` | Total execution time | + +Thread pool metrics reveal concurrency bottlenecks. Saturated pools cause request queuing and increased latency. + +**Thread pool utilization percentage** - high utilization indicates the pool is near capacity: + +```shell +executor_active_threads{service="backend", namespace="$namespace", name!="scheduled"} +/ +executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} +``` + +**Cron scheduled executor utilization:** + +```shell +executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} +/ +executor_pool_size_threads{service="cron", namespace="$namespace", name="scheduled"} +``` + +**Queued tasks** indicate backlog. Growing queues suggest the pool cannot keep up with demand: + +```shell +executor_queued_tasks{app="backend", namespace="$namespace"} +``` + +**Task completion rate:** + +```shell +rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Cache metrics + +| Metric | Description | +| ----------------------- | ----------------------------------- | +| `cache_size` | Number of entries in cache | +| `cache_gets_total` | Cache hits and misses by cache name | +| `cache_puts_total` | Cache entries added | +| `cache_evictions_total` | Cache eviction count | + +Cache effectiveness directly impacts database load and response times. Low hit rates indicate caching issues. 
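+
+**Application cache hit rate by cache name** - a sketch that assumes the metric carries Micrometer's `cache` and `result` (hit/miss) labels:
+
+```shell
+sum by (cache) (rate(cache_gets_total{namespace="$namespace", result="hit"}[$__rate_interval]))
+/
+sum by (cache) (rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]))
+```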
+ +**Redis cache hit rate** - should be above 70% for effective caching: + +```shell +avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]) +/ +(irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) +``` + +**Cache size by name:** + +```shell +cache_size{namespace="$namespace"} +``` + +**Cache operation rates:** + +```shell +rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Hibernate/Database metrics + +| Metric | Description | +| ---------------------------------------- | ---------------------------------------------------- | +| `hibernate_sessions_open_total` | Total sessions opened | +| `hibernate_sessions_closed_total` | Total sessions closed | +| `hibernate_connections_obtained_total` | Database connections obtained | +| `hibernate_query_executions_total` | Total queries executed | +| `hibernate_query_executions_max_seconds` | Slowest query time | +| `hibernate_entities_inserts_total` | Entity insert operations | +| `hibernate_entities_updates_total` | Entity update operations | +| `hibernate_entities_deletes_total` | Entity delete operations | +| `hibernate_entities_loads_total` | Entity load operations | +| `hibernate_transactions_total` | Transaction count | +| `hibernate_flushes_total` | Session flush count | +| `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | + +Database metrics reveal query performance, connection management, and transaction health. + +**Session operations** - open and closed counts should be roughly equal. 
A growing gap indicates session leaks: + +```shell +rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Connection acquisition rate:** + +```shell +rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query execution rate:** + +```shell +rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query latency by type** helps identify slow queries for optimization: + +```shell +sum by (query) (rate(hibernate_query_execution_total_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (query) (rate(hibernate_query_execution_total_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Slowest query time** - alert if this exceeds 5 seconds: + +```shell +hibernate_query_executions_max_seconds{app="backend", namespace="$namespace"} +``` + +**Entity operation rates** show database write patterns: + +```shell +rate(hibernate_entities_inserts_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_updates_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Transaction success/failure rate:** + +```shell +sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Optimistic lock failures** indicate concurrent modification conflicts. High rates suggest contention issues: + +```shell +rate(hibernate_optimistic_failures_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### Connection pool metrics + +| Metric | Description | +| ------------------------- | ---------------------------- | +| `jdbc_connections_active` | Active database connections | +| `jdbc_connections_max` | Maximum connection pool size | +| `jdbc_connections_min` | Minimum connection pool size | +| `jdbc_connections_usage` | Connection pool usage | + +Connection pool metrics prevent connection exhaustion during traffic bursts. + +**Active connections vs pool limits** - alert when active connections approach the maximum: + +```shell +sum(jdbc_connections_active{app="backend", namespace="$namespace"}) +sum(jdbc_connections_max{app="backend", namespace="$namespace"}) +sum(jdbc_connections_min{app="backend", namespace="$namespace"}) +sum(jdbc_connections_usage{app="backend", namespace="$namespace"}) +``` + +### Hibernate cache metrics + +Hibernate caching reduces database load. Monitor hit rates to ensure caches are effective. 
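+
+The hit-rate expressions below can also serve as alert conditions. As a sketch, the following fires when the query cache hit rate stays under 60% over a 30-minute window (the window and threshold are assumptions to tune for your deployment):
+
+```shell
+sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[30m]))
+/
+sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[30m]))
+< 0.6
+```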
+ +**Query cache hit rate** - should exceed 60%: + +```shell +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Query plan cache hit rate:** + +```shell +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Second level cache hit rate by region:** + +```shell +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +### Logging metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | + +Log event metrics provide early warning of application issues. + +**Error rate** - track error log frequency for anomaly detection: + +```shell +rate(logback_events_total{level="error"}[5m]) +``` + +### Kubernetes health + +Monitor pod health to catch deployment or infrastructure issues early. + +**Pods in unhealthy states:** + +```shell +sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 +``` + +--- + +## Alerting recommendations + +### Critical alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 +- HTTP 5xx errors > 5% of total requests +- `jdbc_connections_active` > 90% of `jdbc_connections_max` +- Any pods in Failed/Unknown state for > 5 minutes + +### Warning alerts + +- `jvm_gc_pause_seconds_max` > 1 second +- `jvm_gc_live_data_size_bytes` approaching `jvm_gc_max_data_size_bytes` +- Heap usage > 85% of max heap +- `executor_queued_tasks` > threshold +- Executor utilization > 90% +- `hibernate_optimistic_failures_total` rate increasing +- `hibernate_query_executions_max_seconds` > 5 seconds +- `http_server_requests_seconds` p99 > acceptable latency +- Redis cache hit rate < 70% +- Hibernate query cache hit rate < 60% +- Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` +- `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time + +--- + +## Quick reference: Metrics by troubleshooting scenario + +| Issue | Key Metrics to Check | +| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| **Slow application response** | `http_server_requests_seconds` (latency), `jvm_gc_pause_seconds_max`, `hibernate_query_executions_max_seconds`, `executor_active_threads` | +| **Out of memory errors** | `jvm_memory_used_bytes`, `jvm_gc_pause_seconds`, `jvm_gc_live_data_size_bytes`, `jvm_buffer_memory_used_bytes` | +| **Database performance** | `hibernate_query_executions_max_seconds`, `jdbc_connections_active`, `hibernate_transactions_total`, cache hit rates | +| **High CPU usage** | `process_cpu_usage`, `system_cpu_usage`, 
`jvm_threads_live_threads`, `executor_active_threads` | +| **Connection exhaustion** | `jdbc_connections_active`, `jdbc_connections_max`, `hibernate_sessions_open_total` vs `hibernate_sessions_closed_total` | +| **Cache issues** | Redis hit rate, `hibernate_cache_query_requests_total`, `cache_gets_total`, `cache_evictions_total` | +| **Workflow processing delays** | `credits_estimation_workflow_*`, `credits_estimation_task_*`, `executor_queued_tasks`, `tower_logs_errors_*` | +| **Thread starvation** | `executor_active_threads`, `executor_queued_tasks`, `jvm_threads_states_threads{state="blocked"}` | +| **Memory leaks** | `jvm_memory_used_bytes` trending up, `jvm_gc_live_data_size_bytes` growing, `jvm_classes_loaded_classes` growing | +| **GC pressure** | `jvm_gc_pause_seconds_max`, `jvm_gc_memory_promoted_bytes_total`, time in GC vs application time | diff --git a/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/monitoring.md new file mode 100644 index 000000000..c7e53d966 --- /dev/null +++ b/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/monitoring.md @@ -0,0 +1,626 @@ +--- +title: Seqera Platform Monitoring +headline: "Seqera Platform Monitoring" +description: "A guide on relevant platform metrics" +date_created: "2025-12-17" +--- + +# Seqera Platform Monitoring + +## Enabling observability metrics + +Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). + +Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. + +--- + +## Key metrics to monitor + +### Seqera Platform-specific metrics + +#### Data Studio metrics + +| Metric | Description | +| ------------------------------------------------ | ------------------------------------ | +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | + +Track Data Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. + +**Average startup time by tool:** + +```shell +sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Failed startup rate:** + +```shell +rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +#### Error tracking + +| Metric | Description | +| ------------------------------ | ------------------------- | +| `tower_logs_errors_10secCount` | Errors in last 10 seconds | +| `tower_logs_errors_1minCount` | Errors in last minute | +| `tower_logs_errors_5minCount` | Errors in last 5 minutes | + +Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. 
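+
+As a rough sketch for separating sustained problems from transient blips, require the one-minute error count to stay non-zero across a longer window (the 15-minute window is an assumption to tune for your environment):
+
+```shell
+min_over_time(tower_logs_errors_1minCount{namespace="$namespace"}[15m]) > 0
+```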
+ +**Recent error counts:** + +```shell +tower_logs_errors_10secCount{namespace="$namespace"} +tower_logs_errors_1minCount{namespace="$namespace"} +tower_logs_errors_5minCount{namespace="$namespace"} +``` + +**Log events by severity level:** + +```shell +rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Infrastructure resources + +#### CPU usage + +Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. + +**Backend CPU usage:** + +```shell +rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Compare against requested resources** to determine if the container is over or under-provisioned: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="cpu"}) +``` + +#### Memory usage + +Track working set memory, committed memory, and limits to prevent OOM conditions. + +**Backend memory working set** shows actual memory in use: + +```shell +container_memory_working_set_bytes{container="backend", namespace="$namespace"} +``` + +**Memory requests and limits** define the bounds for container memory allocation: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="memory"}) +max(kube_pod_container_resource_limits{container="backend", namespace="$namespace", resource="memory"}) +``` + +### HTTP server requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| `http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +HTTP metrics reveal application throughput, error rates, and latency patterns. These are essential for understanding user-facing performance. + +**Total request throughput** shows overall API activity: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Error rate (4xx and 5xx responses)** indicates client errors and server failures: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace", status=~"[45].."}[$__rate_interval])) +``` + +**Average latency per endpoint** helps identify slow API paths: + +```shell +sum by (method, uri) (rate(http_server_requests_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (method, uri) (rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Top 10 endpoints by time spent** highlights where server time is consumed for optimization efforts: + +```shell +topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$namespace", app="backend"}[$__rate_interval]))) +``` + +### HTTP client requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +Monitor external API calls and integrations. 
Slow or failing outbound requests can cascade into application performance issues. + +**Outbound request rate:** + +```shell +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Average outbound request duration:** + +```shell +rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) +/ +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum outbound request duration** identifies slow external dependencies: + +```shell +http_client_requests_seconds_max{namespace="$namespace"} +``` + +### JVM memory metrics + +| Metric | Description | +| ------------------------------ | -------------------------------------------------------- | +| `jvm_buffer_memory_used_bytes` | Memory used by JVM buffer pools (direct, mapped) | +| `jvm_memory_used_bytes` | Amount of used memory by area (heap/non-heap) and region | +| `jvm_memory_committed_bytes` | Memory committed for JVM use | +| `jvm_memory_max_bytes` | Maximum memory available for memory management | +| `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | +| `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | + +JVM memory metrics are critical for preventing OutOfMemoryErrors and identifying memory leaks. Monitor both heap (Java objects) and non-heap (metaspace, code cache) regions. + +**Heap memory usage** shows memory used for Java objects: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="heap"} +``` + +**Non-heap memory** includes metaspace and code cache: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="nonheap"} +``` + +**Heap usage percentage** provides a quick health indicator. Alert when this exceeds 85%: + +```shell +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +**Direct buffer usage** is important for Netty-based applications. High usage can cause native memory issues: + +```shell +jvm_buffer_memory_used_bytes{namespace="$namespace", app="backend", id="direct"} +jvm_buffer_total_capacity_bytes{namespace="$namespace", app="backend", id="direct"} +``` + +### JVM garbage collection + +| Metric | Description | +| ------------------------------------- | ----------------------------------------- | +| `jvm_gc_pause_seconds_sum` | Total time spent in GC pauses | +| `jvm_gc_pause_seconds_count` | Number of GC pause events | +| `jvm_gc_pause_seconds_max` | Maximum GC pause duration | +| `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | +| `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | + +Garbage collection metrics reveal memory pressure and its impact on application responsiveness. Long GC pauses cause request latency spikes. + +**Average GC pause duration** should remain low (under 100ms for most applications): + +```shell +rate(jvm_gc_pause_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]) +/ +rate(jvm_gc_pause_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum GC pause** identifies worst-case latency impact. 
Alert if this exceeds 1 second: + +```shell +jvm_gc_pause_seconds_max{app="backend", namespace="$namespace"} +``` + +**Live data size after GC** shows long-lived objects. If this grows over time, you may have a memory leak: + +```shell +jvm_gc_live_data_size_bytes{app="backend", namespace="$namespace"} +``` + +**Memory allocation and promotion rates** indicate object creation patterns. High promotion rates suggest objects are living longer than expected: + +```shell +rate(jvm_gc_memory_allocated_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(jvm_gc_memory_promoted_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### JVM threads + +| Metric | Description | +| ---------------------------- | ----------------------------------------------------------------- | +| `jvm_threads_live_threads` | Current number of live threads (daemon + non-daemon) | +| `jvm_threads_daemon_threads` | Current number of daemon threads | +| `jvm_threads_peak_threads` | Peak thread count since JVM start | +| `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | + +Thread metrics help identify deadlocks, thread pool exhaustion, and concurrency issues. + +**Thread counts** show overall thread activity: + +```shell +jvm_threads_live_threads{app="backend", namespace="$namespace"} +jvm_threads_daemon_threads{app="backend", namespace="$namespace"} +jvm_threads_peak_threads{app="backend", namespace="$namespace"} +``` + +**Thread states** reveal blocking issues. High blocked thread counts indicate lock contention: + +```shell +jvm_threads_states_threads{app="backend", namespace="$namespace"} +``` + +### JVM classes + +| Metric | Description | +| ------------------------------------ | -------------------------------------- | +| `jvm_classes_loaded_classes` | Currently loaded classes | +| `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | + +Class loading metrics help identify class loader leaks or excessive dynamic class generation. + +**Loaded classes** should stabilize after startup. Continuous growth may indicate a class loader leak: + +```shell +jvm_classes_loaded_classes{namespace="$namespace", app="backend"} +``` + +**Class unload rate:** + +```shell +rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) +``` + +### Process metrics + +| Metric | Description | +| ---------------------------- | ------------------------------------ | +| `process_cpu_usage` | Recent CPU usage for the JVM process | +| `process_cpu_time_ns_total` | Total CPU time used by the JVM | +| `process_files_open_files` | Open file descriptor count | +| `process_files_max_files` | Maximum file descriptor limit | +| `process_uptime_seconds` | JVM uptime | +| `process_start_time_seconds` | Process start time (unix epoch) | + +Process-level metrics provide visibility into resource consumption and system limits. + +**JVM process CPU usage:** + +```shell +process_cpu_usage{namespace="$namespace"} +``` + +**Open file descriptors** should be monitored against limits. Exhaustion causes connection failures: + +```shell +process_files_open_files{namespace="$namespace"} +``` + +**File descriptor utilization percentage** - alert when this exceeds 90%: + +```shell +(process_files_open_files{namespace="$namespace"} / process_files_max_files{namespace="$namespace"}) * 100 +``` + +**Process uptime** helps identify restart events. 
Low uptime may indicate stability issues: + +```shell +process_uptime_seconds{namespace="$namespace"} +``` + +### System metrics + +| Metric | Description | +| ------------------------ | ------------------------------------- | +| `system_cpu_usage` | System-wide CPU usage | +| `system_cpu_count` | Number of processors available to JVM | +| `system_load_average_1m` | 1-minute load average | + +System metrics provide host-level context for application performance. + +**System-wide CPU usage:** + +```shell +system_cpu_usage{namespace="$namespace"} +``` + +**System load average** should remain below the CPU count for healthy systems: + +```shell +system_load_average_1m{namespace="$namespace"} +``` + +**Available CPU count:** + +```shell +system_cpu_count{namespace="$namespace"} +``` + +### Executor thread pools + +| Metric | Description | +| -------------------------------- | ---------------------------------------------------------- | +| `executor_active_threads` | Currently active threads by pool (io, blocking, scheduled) | +| `executor_pool_size_threads` | Current thread pool size | +| `executor_pool_max_threads` | Maximum allowed threads in pool | +| `executor_queued_tasks` | Tasks queued for execution | +| `executor_completed_tasks_total` | Total completed tasks | +| `executor_seconds_sum` | Total execution time | + +Thread pool metrics reveal concurrency bottlenecks. Saturated pools cause request queuing and increased latency. + +**Thread pool utilization percentage** - high utilization indicates the pool is near capacity: + +```shell +executor_active_threads{service="backend", namespace="$namespace", name!="scheduled"} +/ +executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} +``` + +**Cron scheduled executor utilization:** + +```shell +executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} +/ +executor_pool_size_threads{service="cron", namespace="$namespace", name="scheduled"} +``` + +**Queued tasks** indicate backlog. Growing queues suggest the pool cannot keep up with demand: + +```shell +executor_queued_tasks{app="backend", namespace="$namespace"} +``` + +**Task completion rate:** + +```shell +rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Cache metrics + +| Metric | Description | +| ----------------------- | ----------------------------------- | +| `cache_size` | Number of entries in cache | +| `cache_gets_total` | Cache hits and misses by cache name | +| `cache_puts_total` | Cache entries added | +| `cache_evictions_total` | Cache eviction count | + +Cache effectiveness directly impacts database load and response times. Low hit rates indicate caching issues. 
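+
+**Application cache hit rate by cache name** - a sketch that assumes the metric carries Micrometer's `cache` and `result` (hit/miss) labels:
+
+```shell
+sum by (cache) (rate(cache_gets_total{namespace="$namespace", result="hit"}[$__rate_interval]))
+/
+sum by (cache) (rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]))
+```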
+ +**Redis cache hit rate** - should be above 70% for effective caching: + +```shell +avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]) +/ +(irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) +``` + +**Cache size by name:** + +```shell +cache_size{namespace="$namespace"} +``` + +**Cache operation rates:** + +```shell +rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Hibernate/Database metrics + +| Metric | Description | +| ---------------------------------------- | ---------------------------------------------------- | +| `hibernate_sessions_open_total` | Total sessions opened | +| `hibernate_sessions_closed_total` | Total sessions closed | +| `hibernate_connections_obtained_total` | Database connections obtained | +| `hibernate_query_executions_total` | Total queries executed | +| `hibernate_query_executions_max_seconds` | Slowest query time | +| `hibernate_entities_inserts_total` | Entity insert operations | +| `hibernate_entities_updates_total` | Entity update operations | +| `hibernate_entities_deletes_total` | Entity delete operations | +| `hibernate_entities_loads_total` | Entity load operations | +| `hibernate_transactions_total` | Transaction count | +| `hibernate_flushes_total` | Session flush count | +| `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | + +Database metrics reveal query performance, connection management, and transaction health. + +**Session operations** - open and closed counts should be roughly equal. 
A growing gap indicates session leaks: + +```shell +rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Connection acquisition rate:** + +```shell +rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query execution rate:** + +```shell +rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query latency by type** helps identify slow queries for optimization: + +```shell +sum by (query) (rate(hibernate_query_execution_total_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (query) (rate(hibernate_query_execution_total_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Slowest query time** - alert if this exceeds 5 seconds: + +```shell +hibernate_query_executions_max_seconds{app="backend", namespace="$namespace"} +``` + +**Entity operation rates** show database write patterns: + +```shell +rate(hibernate_entities_inserts_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_updates_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Transaction success/failure rate:** + +```shell +sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Optimistic lock failures** indicate concurrent modification conflicts. High rates suggest contention issues: + +```shell +rate(hibernate_optimistic_failures_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### Connection pool metrics + +| Metric | Description | +| ------------------------- | ---------------------------- | +| `jdbc_connections_active` | Active database connections | +| `jdbc_connections_max` | Maximum connection pool size | +| `jdbc_connections_min` | Minimum connection pool size | +| `jdbc_connections_usage` | Connection pool usage | + +Connection pool metrics prevent connection exhaustion during traffic bursts. + +**Active connections vs pool limits** - alert when active connections approach the maximum: + +```shell +sum(jdbc_connections_active{app="backend", namespace="$namespace"}) +sum(jdbc_connections_max{app="backend", namespace="$namespace"}) +sum(jdbc_connections_min{app="backend", namespace="$namespace"}) +sum(jdbc_connections_usage{app="backend", namespace="$namespace"}) +``` + +### Hibernate cache metrics + +Hibernate caching reduces database load. Monitor hit rates to ensure caches are effective. 
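+
+The hit-rate expressions below can also serve as alert conditions. As a sketch, the following fires when the query cache hit rate stays under 60% over a 30-minute window (the window and threshold are assumptions to tune for your deployment):
+
+```shell
+sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[30m]))
+/
+sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[30m]))
+< 0.6
+```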
+ +**Query cache hit rate** - should exceed 60%: + +```shell +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Query plan cache hit rate:** + +```shell +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Second level cache hit rate by region:** + +```shell +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +### Logging metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | + +Log event metrics provide early warning of application issues. + +**Error rate** - track error log frequency for anomaly detection: + +```shell +rate(logback_events_total{level="error"}[5m]) +``` + +### Kubernetes health + +Monitor pod health to catch deployment or infrastructure issues early. + +**Pods in unhealthy states:** + +```shell +sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 +``` + +--- + +## Alerting recommendations + +### Critical alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 +- HTTP 5xx errors > 5% of total requests +- `jdbc_connections_active` > 90% of `jdbc_connections_max` +- Any pods in Failed/Unknown state for > 5 minutes + +### Warning alerts + +- `jvm_gc_pause_seconds_max` > 1 second +- `jvm_gc_live_data_size_bytes` approaching `jvm_gc_max_data_size_bytes` +- Heap usage > 85% of max heap +- `executor_queued_tasks` > threshold +- Executor utilization > 90% +- `hibernate_optimistic_failures_total` rate increasing +- `hibernate_query_executions_max_seconds` > 5 seconds +- `http_server_requests_seconds` p99 > acceptable latency +- Redis cache hit rate < 70% +- Hibernate query cache hit rate < 60% +- Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` +- `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time + +--- + +## Quick reference: Metrics by troubleshooting scenario + +| Issue | Key Metrics to Check | +| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| **Slow application response** | `http_server_requests_seconds` (latency), `jvm_gc_pause_seconds_max`, `hibernate_query_executions_max_seconds`, `executor_active_threads` | +| **Out of memory errors** | `jvm_memory_used_bytes`, `jvm_gc_pause_seconds`, `jvm_gc_live_data_size_bytes`, `jvm_buffer_memory_used_bytes` | +| **Database performance** | `hibernate_query_executions_max_seconds`, `jdbc_connections_active`, `hibernate_transactions_total`, cache hit rates | +| **High CPU usage** | `process_cpu_usage`, `system_cpu_usage`, 
`jvm_threads_live_threads`, `executor_active_threads` | +| **Connection exhaustion** | `jdbc_connections_active`, `jdbc_connections_max`, `hibernate_sessions_open_total` vs `hibernate_sessions_closed_total` | +| **Cache issues** | Redis hit rate, `hibernate_cache_query_requests_total`, `cache_gets_total`, `cache_evictions_total` | +| **Workflow processing delays** | `credits_estimation_workflow_*`, `credits_estimation_task_*`, `executor_queued_tasks`, `tower_logs_errors_*` | +| **Thread starvation** | `executor_active_threads`, `executor_queued_tasks`, `jvm_threads_states_threads{state="blocked"}` | +| **Memory leaks** | `jvm_memory_used_bytes` trending up, `jvm_gc_live_data_size_bytes` growing, `jvm_classes_loaded_classes` growing | +| **GC pressure** | `jvm_gc_pause_seconds_max`, `jvm_gc_memory_promoted_bytes_total`, time in GC vs application time | diff --git a/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/monitoring.md new file mode 100644 index 000000000..c7e53d966 --- /dev/null +++ b/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/monitoring.md @@ -0,0 +1,626 @@ +--- +title: Seqera Platform Monitoring +headline: "Seqera Platform Monitoring" +description: "A guide on relevant platform metrics" +date_created: "2025-12-17" +--- + +# Seqera Platform Monitoring + +## Enabling observability metrics + +Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). + +Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. + +--- + +## Key metrics to monitor + +### Seqera Platform-specific metrics + +#### Data Studio metrics + +| Metric | Description | +| ------------------------------------------------ | ------------------------------------ | +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | + +Track Data Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. + +**Average startup time by tool:** + +```shell +sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Failed startup rate:** + +```shell +rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +#### Error tracking + +| Metric | Description | +| ------------------------------ | ------------------------- | +| `tower_logs_errors_10secCount` | Errors in last 10 seconds | +| `tower_logs_errors_1minCount` | Errors in last minute | +| `tower_logs_errors_5minCount` | Errors in last 5 minutes | + +Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. 
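+
+As a rough sketch for separating sustained problems from transient blips, require the one-minute error count to stay non-zero across a longer window (the 15-minute window is an assumption to tune for your environment):
+
+```shell
+min_over_time(tower_logs_errors_1minCount{namespace="$namespace"}[15m]) > 0
+```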
+ +**Recent error counts:** + +```shell +tower_logs_errors_10secCount{namespace="$namespace"} +tower_logs_errors_1minCount{namespace="$namespace"} +tower_logs_errors_5minCount{namespace="$namespace"} +``` + +**Log events by severity level:** + +```shell +rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Infrastructure resources + +#### CPU usage + +Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. + +**Backend CPU usage:** + +```shell +rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Compare against requested resources** to determine if the container is over or under-provisioned: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="cpu"}) +``` + +#### Memory usage + +Track working set memory, committed memory, and limits to prevent OOM conditions. + +**Backend memory working set** shows actual memory in use: + +```shell +container_memory_working_set_bytes{container="backend", namespace="$namespace"} +``` + +**Memory requests and limits** define the bounds for container memory allocation: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="memory"}) +max(kube_pod_container_resource_limits{container="backend", namespace="$namespace", resource="memory"}) +``` + +### HTTP server requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| `http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +HTTP metrics reveal application throughput, error rates, and latency patterns. These are essential for understanding user-facing performance. + +**Total request throughput** shows overall API activity: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Error rate (4xx and 5xx responses)** indicates client errors and server failures: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace", status=~"[45].."}[$__rate_interval])) +``` + +**Average latency per endpoint** helps identify slow API paths: + +```shell +sum by (method, uri) (rate(http_server_requests_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (method, uri) (rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Top 10 endpoints by time spent** highlights where server time is consumed for optimization efforts: + +```shell +topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$namespace", app="backend"}[$__rate_interval]))) +``` + +### HTTP client requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +Monitor external API calls and integrations. 
Slow or failing outbound requests can cascade into application performance issues. + +**Outbound request rate:** + +```shell +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Average outbound request duration:** + +```shell +rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) +/ +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum outbound request duration** identifies slow external dependencies: + +```shell +http_client_requests_seconds_max{namespace="$namespace"} +``` + +### JVM memory metrics + +| Metric | Description | +| ------------------------------ | -------------------------------------------------------- | +| `jvm_buffer_memory_used_bytes` | Memory used by JVM buffer pools (direct, mapped) | +| `jvm_memory_used_bytes` | Amount of used memory by area (heap/non-heap) and region | +| `jvm_memory_committed_bytes` | Memory committed for JVM use | +| `jvm_memory_max_bytes` | Maximum memory available for memory management | +| `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | +| `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | + +JVM memory metrics are critical for preventing OutOfMemoryErrors and identifying memory leaks. Monitor both heap (Java objects) and non-heap (metaspace, code cache) regions. + +**Heap memory usage** shows memory used for Java objects: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="heap"} +``` + +**Non-heap memory** includes metaspace and code cache: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="nonheap"} +``` + +**Heap usage percentage** provides a quick health indicator. Alert when this exceeds 85%: + +```shell +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +**Direct buffer usage** is important for Netty-based applications. High usage can cause native memory issues: + +```shell +jvm_buffer_memory_used_bytes{namespace="$namespace", app="backend", id="direct"} +jvm_buffer_total_capacity_bytes{namespace="$namespace", app="backend", id="direct"} +``` + +### JVM garbage collection + +| Metric | Description | +| ------------------------------------- | ----------------------------------------- | +| `jvm_gc_pause_seconds_sum` | Total time spent in GC pauses | +| `jvm_gc_pause_seconds_count` | Number of GC pause events | +| `jvm_gc_pause_seconds_max` | Maximum GC pause duration | +| `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | +| `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | + +Garbage collection metrics reveal memory pressure and its impact on application responsiveness. Long GC pauses cause request latency spikes. + +**Average GC pause duration** should remain low (under 100ms for most applications): + +```shell +rate(jvm_gc_pause_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]) +/ +rate(jvm_gc_pause_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum GC pause** identifies worst-case latency impact. 
Alert if this exceeds 1 second: + +```shell +jvm_gc_pause_seconds_max{app="backend", namespace="$namespace"} +``` + +**Live data size after GC** shows long-lived objects. If this grows over time, you may have a memory leak: + +```shell +jvm_gc_live_data_size_bytes{app="backend", namespace="$namespace"} +``` + +**Memory allocation and promotion rates** indicate object creation patterns. High promotion rates suggest objects are living longer than expected: + +```shell +rate(jvm_gc_memory_allocated_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(jvm_gc_memory_promoted_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### JVM threads + +| Metric | Description | +| ---------------------------- | ----------------------------------------------------------------- | +| `jvm_threads_live_threads` | Current number of live threads (daemon + non-daemon) | +| `jvm_threads_daemon_threads` | Current number of daemon threads | +| `jvm_threads_peak_threads` | Peak thread count since JVM start | +| `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | + +Thread metrics help identify deadlocks, thread pool exhaustion, and concurrency issues. + +**Thread counts** show overall thread activity: + +```shell +jvm_threads_live_threads{app="backend", namespace="$namespace"} +jvm_threads_daemon_threads{app="backend", namespace="$namespace"} +jvm_threads_peak_threads{app="backend", namespace="$namespace"} +``` + +**Thread states** reveal blocking issues. High blocked thread counts indicate lock contention: + +```shell +jvm_threads_states_threads{app="backend", namespace="$namespace"} +``` + +### JVM classes + +| Metric | Description | +| ------------------------------------ | -------------------------------------- | +| `jvm_classes_loaded_classes` | Currently loaded classes | +| `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | + +Class loading metrics help identify class loader leaks or excessive dynamic class generation. + +**Loaded classes** should stabilize after startup. Continuous growth may indicate a class loader leak: + +```shell +jvm_classes_loaded_classes{namespace="$namespace", app="backend"} +``` + +**Class unload rate:** + +```shell +rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) +``` + +### Process metrics + +| Metric | Description | +| ---------------------------- | ------------------------------------ | +| `process_cpu_usage` | Recent CPU usage for the JVM process | +| `process_cpu_time_ns_total` | Total CPU time used by the JVM | +| `process_files_open_files` | Open file descriptor count | +| `process_files_max_files` | Maximum file descriptor limit | +| `process_uptime_seconds` | JVM uptime | +| `process_start_time_seconds` | Process start time (unix epoch) | + +Process-level metrics provide visibility into resource consumption and system limits. + +**JVM process CPU usage:** + +```shell +process_cpu_usage{namespace="$namespace"} +``` + +**Open file descriptors** should be monitored against limits. Exhaustion causes connection failures: + +```shell +process_files_open_files{namespace="$namespace"} +``` + +**File descriptor utilization percentage** - alert when this exceeds 90%: + +```shell +(process_files_open_files{namespace="$namespace"} / process_files_max_files{namespace="$namespace"}) * 100 +``` + +**Process uptime** helps identify restart events. 
Low uptime may indicate stability issues: + +```shell +process_uptime_seconds{namespace="$namespace"} +``` + +### System metrics + +| Metric | Description | +| ------------------------ | ------------------------------------- | +| `system_cpu_usage` | System-wide CPU usage | +| `system_cpu_count` | Number of processors available to JVM | +| `system_load_average_1m` | 1-minute load average | + +System metrics provide host-level context for application performance. + +**System-wide CPU usage:** + +```shell +system_cpu_usage{namespace="$namespace"} +``` + +**System load average** should remain below the CPU count for healthy systems: + +```shell +system_load_average_1m{namespace="$namespace"} +``` + +**Available CPU count:** + +```shell +system_cpu_count{namespace="$namespace"} +``` + +### Executor thread pools + +| Metric | Description | +| -------------------------------- | ---------------------------------------------------------- | +| `executor_active_threads` | Currently active threads by pool (io, blocking, scheduled) | +| `executor_pool_size_threads` | Current thread pool size | +| `executor_pool_max_threads` | Maximum allowed threads in pool | +| `executor_queued_tasks` | Tasks queued for execution | +| `executor_completed_tasks_total` | Total completed tasks | +| `executor_seconds_sum` | Total execution time | + +Thread pool metrics reveal concurrency bottlenecks. Saturated pools cause request queuing and increased latency. + +**Thread pool utilization percentage** - high utilization indicates the pool is near capacity: + +```shell +executor_active_threads{service="backend", namespace="$namespace", name!="scheduled"} +/ +executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} +``` + +**Cron scheduled executor utilization:** + +```shell +executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} +/ +executor_pool_size_threads{service="cron", namespace="$namespace", name="scheduled"} +``` + +**Queued tasks** indicate backlog. Growing queues suggest the pool cannot keep up with demand: + +```shell +executor_queued_tasks{app="backend", namespace="$namespace"} +``` + +**Task completion rate:** + +```shell +rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Cache metrics + +| Metric | Description | +| ----------------------- | ----------------------------------- | +| `cache_size` | Number of entries in cache | +| `cache_gets_total` | Cache hits and misses by cache name | +| `cache_puts_total` | Cache entries added | +| `cache_evictions_total` | Cache eviction count | + +Cache effectiveness directly impacts database load and response times. Low hit rates indicate caching issues. 
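+
+**Local cache hit rate by cache name** - a sketch that assumes `cache_gets_total` carries the standard Micrometer `cache` and `result` (`hit`/`miss`) labels; sustained values well below ~70% suggest the cache is not effective:
+
+```shell
+sum by (cache) (rate(cache_gets_total{namespace="$namespace", result="hit"}[$__rate_interval]))
+/
+sum by (cache) (rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]))
+```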
+ +**Redis cache hit rate** - should be above 70% for effective caching: + +```shell +avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]) +/ +(irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) +``` + +**Cache size by name:** + +```shell +cache_size{namespace="$namespace"} +``` + +**Cache operation rates:** + +```shell +rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Hibernate/Database metrics + +| Metric | Description | +| ---------------------------------------- | ---------------------------------------------------- | +| `hibernate_sessions_open_total` | Total sessions opened | +| `hibernate_sessions_closed_total` | Total sessions closed | +| `hibernate_connections_obtained_total` | Database connections obtained | +| `hibernate_query_executions_total` | Total queries executed | +| `hibernate_query_executions_max_seconds` | Slowest query time | +| `hibernate_entities_inserts_total` | Entity insert operations | +| `hibernate_entities_updates_total` | Entity update operations | +| `hibernate_entities_deletes_total` | Entity delete operations | +| `hibernate_entities_loads_total` | Entity load operations | +| `hibernate_transactions_total` | Transaction count | +| `hibernate_flushes_total` | Session flush count | +| `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | + +Database metrics reveal query performance, connection management, and transaction health. + +**Session operations** - open and closed counts should be roughly equal. 
A growing gap indicates session leaks: + +```shell +rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Connection acquisition rate:** + +```shell +rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query execution rate:** + +```shell +rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query latency by type** helps identify slow queries for optimization: + +```shell +sum by (query) (rate(hibernate_query_execution_total_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (query) (rate(hibernate_query_execution_total_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Slowest query time** - alert if this exceeds 5 seconds: + +```shell +hibernate_query_executions_max_seconds{app="backend", namespace="$namespace"} +``` + +**Entity operation rates** show database write patterns: + +```shell +rate(hibernate_entities_inserts_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_updates_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Transaction success/failure rate:** + +```shell +sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Optimistic lock failures** indicate concurrent modification conflicts. High rates suggest contention issues: + +```shell +rate(hibernate_optimistic_failures_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### Connection pool metrics + +| Metric | Description | +| ------------------------- | ---------------------------- | +| `jdbc_connections_active` | Active database connections | +| `jdbc_connections_max` | Maximum connection pool size | +| `jdbc_connections_min` | Minimum connection pool size | +| `jdbc_connections_usage` | Connection pool usage | + +Connection pool metrics prevent connection exhaustion during traffic bursts. + +**Active connections vs pool limits** - alert when active connections approach the maximum: + +```shell +sum(jdbc_connections_active{app="backend", namespace="$namespace"}) +sum(jdbc_connections_max{app="backend", namespace="$namespace"}) +sum(jdbc_connections_min{app="backend", namespace="$namespace"}) +sum(jdbc_connections_usage{app="backend", namespace="$namespace"}) +``` + +### Hibernate cache metrics + +Hibernate caching reduces database load. Monitor hit rates to ensure caches are effective. 
+ +**Query cache hit rate** - should exceed 60%: + +```shell +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Query plan cache hit rate:** + +```shell +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Second level cache hit rate by region:** + +```shell +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +### Logging metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | + +Log event metrics provide early warning of application issues. + +**Error rate** - track error log frequency for anomaly detection: + +```shell +rate(logback_events_total{level="error"}[5m]) +``` + +### Kubernetes health + +Monitor pod health to catch deployment or infrastructure issues early. + +**Pods in unhealthy states:** + +```shell +sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 +``` + +--- + +## Alerting recommendations + +### Critical alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 +- HTTP 5xx errors > 5% of total requests +- `jdbc_connections_active` > 90% of `jdbc_connections_max` +- Any pods in Failed/Unknown state for > 5 minutes + +### Warning alerts + +- `jvm_gc_pause_seconds_max` > 1 second +- `jvm_gc_live_data_size_bytes` approaching `jvm_gc_max_data_size_bytes` +- Heap usage > 85% of max heap +- `executor_queued_tasks` > threshold +- Executor utilization > 90% +- `hibernate_optimistic_failures_total` rate increasing +- `hibernate_query_executions_max_seconds` > 5 seconds +- `http_server_requests_seconds` p99 > acceptable latency +- Redis cache hit rate < 70% +- Hibernate query cache hit rate < 60% +- Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` +- `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time + +--- + +## Quick reference: Metrics by troubleshooting scenario + +| Issue | Key Metrics to Check | +| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| **Slow application response** | `http_server_requests_seconds` (latency), `jvm_gc_pause_seconds_max`, `hibernate_query_executions_max_seconds`, `executor_active_threads` | +| **Out of memory errors** | `jvm_memory_used_bytes`, `jvm_gc_pause_seconds`, `jvm_gc_live_data_size_bytes`, `jvm_buffer_memory_used_bytes` | +| **Database performance** | `hibernate_query_executions_max_seconds`, `jdbc_connections_active`, `hibernate_transactions_total`, cache hit rates | +| **High CPU usage** | `process_cpu_usage`, `system_cpu_usage`, 
`jvm_threads_live_threads`, `executor_active_threads` | +| **Connection exhaustion** | `jdbc_connections_active`, `jdbc_connections_max`, `hibernate_sessions_open_total` vs `hibernate_sessions_closed_total` | +| **Cache issues** | Redis hit rate, `hibernate_cache_query_requests_total`, `cache_gets_total`, `cache_evictions_total` | +| **Workflow processing delays** | `credits_estimation_workflow_*`, `credits_estimation_task_*`, `executor_queued_tasks`, `tower_logs_errors_*` | +| **Thread starvation** | `executor_active_threads`, `executor_queued_tasks`, `jvm_threads_states_threads{state="blocked"}` | +| **Memory leaks** | `jvm_memory_used_bytes` trending up, `jvm_gc_live_data_size_bytes` growing, `jvm_classes_loaded_classes` growing | +| **GC pressure** | `jvm_gc_pause_seconds_max`, `jvm_gc_memory_promoted_bytes_total`, time in GC vs application time | diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/monitoring.md new file mode 100644 index 000000000..c7e53d966 --- /dev/null +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/monitoring.md @@ -0,0 +1,626 @@ +--- +title: Seqera Platform Monitoring +headline: "Seqera Platform Monitoring" +description: "A guide on relevant platform metrics" +date_created: "2025-12-17" +--- + +# Seqera Platform Monitoring + +## Enabling observability metrics + +Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). + +Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. + +--- + +## Key metrics to monitor + +### Seqera Platform-specific metrics + +#### Data Studio metrics + +| Metric | Description | +| ------------------------------------------------ | ------------------------------------ | +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | + +Track Data Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. + +**Average startup time by tool:** + +```shell +sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Failed startup rate:** + +```shell +rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +#### Error tracking + +| Metric | Description | +| ------------------------------ | ------------------------- | +| `tower_logs_errors_10secCount` | Errors in last 10 seconds | +| `tower_logs_errors_1minCount` | Errors in last minute | +| `tower_logs_errors_5minCount` | Errors in last 5 minutes | + +Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. 
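+
+**Transient error blips** - a rough way to spot errors that occurred within the 5-minute window but have already stopped (a sketch; assumes both gauges share the same label set):
+
+```shell
+# Errors present in the 5-minute window but none in the last 10 seconds
+tower_logs_errors_5minCount{namespace="$namespace"} > 0
+unless
+tower_logs_errors_10secCount{namespace="$namespace"} > 0
+```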
+ +**Recent error counts:** + +```shell +tower_logs_errors_10secCount{namespace="$namespace"} +tower_logs_errors_1minCount{namespace="$namespace"} +tower_logs_errors_5minCount{namespace="$namespace"} +``` + +**Log events by severity level:** + +```shell +rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Infrastructure resources + +#### CPU usage + +Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. + +**Backend CPU usage:** + +```shell +rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Compare against requested resources** to determine if the container is over or under-provisioned: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="cpu"}) +``` + +#### Memory usage + +Track working set memory, committed memory, and limits to prevent OOM conditions. + +**Backend memory working set** shows actual memory in use: + +```shell +container_memory_working_set_bytes{container="backend", namespace="$namespace"} +``` + +**Memory requests and limits** define the bounds for container memory allocation: + +```shell +max(kube_pod_container_resource_requests{container="backend", namespace="$namespace", resource="memory"}) +max(kube_pod_container_resource_limits{container="backend", namespace="$namespace", resource="memory"}) +``` + +### HTTP server requests + +| Metric | Description | +| ------------------------------------------ | ------------------------------------------------- | +| `http_server_requests_seconds_count` | Total request count by method, status, and URI | +| `http_server_requests_seconds_sum` | Total request duration by method, status, and URI | +| `http_server_requests_seconds_max` | Maximum request duration | +| `http_server_requests_seconds` (quantiles) | Request latency percentiles (p50, p95, p99, p999) | + +HTTP metrics reveal application throughput, error rates, and latency patterns. These are essential for understanding user-facing performance. + +**Total request throughput** shows overall API activity: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Error rate (4xx and 5xx responses)** indicates client errors and server failures: + +```shell +sum(rate(http_server_requests_seconds_count{app="backend", namespace="$namespace", status=~"[45].."}[$__rate_interval])) +``` + +**Average latency per endpoint** helps identify slow API paths: + +```shell +sum by (method, uri) (rate(http_server_requests_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (method, uri) (rate(http_server_requests_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Top 10 endpoints by time spent** highlights where server time is consumed for optimization efforts: + +```shell +topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$namespace", app="backend"}[$__rate_interval]))) +``` + +### HTTP client requests + +| Metric | Description | +| ------------------------------------ | --------------------------------- | +| `http_client_requests_seconds_count` | Outbound request count | +| `http_client_requests_seconds_sum` | Total outbound request duration | +| `http_client_requests_seconds_max` | Maximum outbound request duration | + +Monitor external API calls and integrations. 
Slow or failing outbound requests can cascade into application performance issues. + +**Outbound request rate:** + +```shell +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Average outbound request duration:** + +```shell +rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) +/ +rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum outbound request duration** identifies slow external dependencies: + +```shell +http_client_requests_seconds_max{namespace="$namespace"} +``` + +### JVM memory metrics + +| Metric | Description | +| ------------------------------ | -------------------------------------------------------- | +| `jvm_buffer_memory_used_bytes` | Memory used by JVM buffer pools (direct, mapped) | +| `jvm_memory_used_bytes` | Amount of used memory by area (heap/non-heap) and region | +| `jvm_memory_committed_bytes` | Memory committed for JVM use | +| `jvm_memory_max_bytes` | Maximum memory available for memory management | +| `jvm_gc_live_data_size_bytes` | Size of long-lived heap memory pool after reclamation | +| `jvm_gc_max_data_size_bytes` | Max size of long-lived heap memory pool | + +JVM memory metrics are critical for preventing OutOfMemoryErrors and identifying memory leaks. Monitor both heap (Java objects) and non-heap (metaspace, code cache) regions. + +**Heap memory usage** shows memory used for Java objects: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="heap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="heap"} +``` + +**Non-heap memory** includes metaspace and code cache: + +```shell +jvm_memory_used_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_committed_bytes{app="backend", namespace="$namespace", area="nonheap"} +jvm_memory_max_bytes{app="backend", namespace="$namespace", area="nonheap"} +``` + +**Heap usage percentage** provides a quick health indicator. Alert when this exceeds 85%: + +```shell +sum(jvm_memory_used_bytes{area="heap"}) / sum(jvm_memory_max_bytes{area="heap"}) * 100 +``` + +**Direct buffer usage** is important for Netty-based applications. High usage can cause native memory issues: + +```shell +jvm_buffer_memory_used_bytes{namespace="$namespace", app="backend", id="direct"} +jvm_buffer_total_capacity_bytes{namespace="$namespace", app="backend", id="direct"} +``` + +### JVM garbage collection + +| Metric | Description | +| ------------------------------------- | ----------------------------------------- | +| `jvm_gc_pause_seconds_sum` | Total time spent in GC pauses | +| `jvm_gc_pause_seconds_count` | Number of GC pause events | +| `jvm_gc_pause_seconds_max` | Maximum GC pause duration | +| `jvm_gc_memory_allocated_bytes_total` | Total bytes allocated in young generation | +| `jvm_gc_memory_promoted_bytes_total` | Bytes promoted to old generation | + +Garbage collection metrics reveal memory pressure and its impact on application responsiveness. Long GC pauses cause request latency spikes. + +**Average GC pause duration** should remain low (under 100ms for most applications): + +```shell +rate(jvm_gc_pause_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval]) +/ +rate(jvm_gc_pause_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Maximum GC pause** identifies worst-case latency impact. 
Alert if this exceeds 1 second: + +```shell +jvm_gc_pause_seconds_max{app="backend", namespace="$namespace"} +``` + +**Live data size after GC** shows long-lived objects. If this grows over time, you may have a memory leak: + +```shell +jvm_gc_live_data_size_bytes{app="backend", namespace="$namespace"} +``` + +**Memory allocation and promotion rates** indicate object creation patterns. High promotion rates suggest objects are living longer than expected: + +```shell +rate(jvm_gc_memory_allocated_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(jvm_gc_memory_promoted_bytes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### JVM threads + +| Metric | Description | +| ---------------------------- | ----------------------------------------------------------------- | +| `jvm_threads_live_threads` | Current number of live threads (daemon + non-daemon) | +| `jvm_threads_daemon_threads` | Current number of daemon threads | +| `jvm_threads_peak_threads` | Peak thread count since JVM start | +| `jvm_threads_states_threads` | Thread count by state (runnable, blocked, waiting, timed-waiting) | + +Thread metrics help identify deadlocks, thread pool exhaustion, and concurrency issues. + +**Thread counts** show overall thread activity: + +```shell +jvm_threads_live_threads{app="backend", namespace="$namespace"} +jvm_threads_daemon_threads{app="backend", namespace="$namespace"} +jvm_threads_peak_threads{app="backend", namespace="$namespace"} +``` + +**Thread states** reveal blocking issues. High blocked thread counts indicate lock contention: + +```shell +jvm_threads_states_threads{app="backend", namespace="$namespace"} +``` + +### JVM classes + +| Metric | Description | +| ------------------------------------ | -------------------------------------- | +| `jvm_classes_loaded_classes` | Currently loaded classes | +| `jvm_classes_unloaded_classes_total` | Total classes unloaded since JVM start | + +Class loading metrics help identify class loader leaks or excessive dynamic class generation. + +**Loaded classes** should stabilize after startup. Continuous growth may indicate a class loader leak: + +```shell +jvm_classes_loaded_classes{namespace="$namespace", app="backend"} +``` + +**Class unload rate:** + +```shell +rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) +``` + +### Process metrics + +| Metric | Description | +| ---------------------------- | ------------------------------------ | +| `process_cpu_usage` | Recent CPU usage for the JVM process | +| `process_cpu_time_ns_total` | Total CPU time used by the JVM | +| `process_files_open_files` | Open file descriptor count | +| `process_files_max_files` | Maximum file descriptor limit | +| `process_uptime_seconds` | JVM uptime | +| `process_start_time_seconds` | Process start time (unix epoch) | + +Process-level metrics provide visibility into resource consumption and system limits. + +**JVM process CPU usage:** + +```shell +process_cpu_usage{namespace="$namespace"} +``` + +**Open file descriptors** should be monitored against limits. Exhaustion causes connection failures: + +```shell +process_files_open_files{namespace="$namespace"} +``` + +**File descriptor utilization percentage** - alert when this exceeds 90%: + +```shell +(process_files_open_files{namespace="$namespace"} / process_files_max_files{namespace="$namespace"}) * 100 +``` + +**Process uptime** helps identify restart events. 
Low uptime may indicate stability issues: + +```shell +process_uptime_seconds{namespace="$namespace"} +``` + +### System metrics + +| Metric | Description | +| ------------------------ | ------------------------------------- | +| `system_cpu_usage` | System-wide CPU usage | +| `system_cpu_count` | Number of processors available to JVM | +| `system_load_average_1m` | 1-minute load average | + +System metrics provide host-level context for application performance. + +**System-wide CPU usage:** + +```shell +system_cpu_usage{namespace="$namespace"} +``` + +**System load average** should remain below the CPU count for healthy systems: + +```shell +system_load_average_1m{namespace="$namespace"} +``` + +**Available CPU count:** + +```shell +system_cpu_count{namespace="$namespace"} +``` + +### Executor thread pools + +| Metric | Description | +| -------------------------------- | ---------------------------------------------------------- | +| `executor_active_threads` | Currently active threads by pool (io, blocking, scheduled) | +| `executor_pool_size_threads` | Current thread pool size | +| `executor_pool_max_threads` | Maximum allowed threads in pool | +| `executor_queued_tasks` | Tasks queued for execution | +| `executor_completed_tasks_total` | Total completed tasks | +| `executor_seconds_sum` | Total execution time | + +Thread pool metrics reveal concurrency bottlenecks. Saturated pools cause request queuing and increased latency. + +**Thread pool utilization percentage** - high utilization indicates the pool is near capacity: + +```shell +executor_active_threads{service="backend", namespace="$namespace", name!="scheduled"} +/ +executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} +``` + +**Cron scheduled executor utilization:** + +```shell +executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} +/ +executor_pool_size_threads{service="cron", namespace="$namespace", name="scheduled"} +``` + +**Queued tasks** indicate backlog. Growing queues suggest the pool cannot keep up with demand: + +```shell +executor_queued_tasks{app="backend", namespace="$namespace"} +``` + +**Task completion rate:** + +```shell +rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Cache metrics + +| Metric | Description | +| ----------------------- | ----------------------------------- | +| `cache_size` | Number of entries in cache | +| `cache_gets_total` | Cache hits and misses by cache name | +| `cache_puts_total` | Cache entries added | +| `cache_evictions_total` | Cache eviction count | + +Cache effectiveness directly impacts database load and response times. Low hit rates indicate caching issues. 
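+
+**Eviction-to-put ratio** - a rough indicator of cache churn, assuming the standard Micrometer `cache` label; a sustained ratio close to 1 suggests entries are evicted soon after insertion, which may indicate the cache is undersized for the workload:
+
+```shell
+sum by (cache) (rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]))
+/
+sum by (cache) (rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]))
+```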
+ +**Redis cache hit rate** - should be above 70% for effective caching: + +```shell +avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]) +/ +(irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) +``` + +**Cache size by name:** + +```shell +cache_size{namespace="$namespace"} +``` + +**Cache operation rates:** + +```shell +rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_puts_total{namespace="$namespace"}[$__rate_interval]) +rate(cache_evictions_total{namespace="$namespace"}[$__rate_interval]) +``` + +### Hibernate/Database metrics + +| Metric | Description | +| ---------------------------------------- | ---------------------------------------------------- | +| `hibernate_sessions_open_total` | Total sessions opened | +| `hibernate_sessions_closed_total` | Total sessions closed | +| `hibernate_connections_obtained_total` | Database connections obtained | +| `hibernate_query_executions_total` | Total queries executed | +| `hibernate_query_executions_max_seconds` | Slowest query time | +| `hibernate_entities_inserts_total` | Entity insert operations | +| `hibernate_entities_updates_total` | Entity update operations | +| `hibernate_entities_deletes_total` | Entity delete operations | +| `hibernate_entities_loads_total` | Entity load operations | +| `hibernate_transactions_total` | Transaction count | +| `hibernate_flushes_total` | Session flush count | +| `hibernate_optimistic_failures_total` | Optimistic lock failures (StaleObjectStateException) | + +Database metrics reveal query performance, connection management, and transaction health. + +**Session operations** - open and closed counts should be roughly equal. 
A growing gap indicates session leaks: + +```shell +rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Connection acquisition rate:** + +```shell +rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query execution rate:** + +```shell +rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Query latency by type** helps identify slow queries for optimization: + +```shell +sum by (query) (rate(hibernate_query_execution_total_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) +/ +sum by (query) (rate(hibernate_query_execution_total_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Slowest query time** - alert if this exceeds 5 seconds: + +```shell +hibernate_query_executions_max_seconds{app="backend", namespace="$namespace"} +``` + +**Entity operation rates** show database write patterns: + +```shell +rate(hibernate_entities_inserts_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_updates_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__rate_interval]) +rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +**Transaction success/failure rate:** + +```shell +sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Optimistic lock failures** indicate concurrent modification conflicts. High rates suggest contention issues: + +```shell +rate(hibernate_optimistic_failures_total{app="backend", namespace="$namespace"}[$__rate_interval]) +``` + +### Connection pool metrics + +| Metric | Description | +| ------------------------- | ---------------------------- | +| `jdbc_connections_active` | Active database connections | +| `jdbc_connections_max` | Maximum connection pool size | +| `jdbc_connections_min` | Minimum connection pool size | +| `jdbc_connections_usage` | Connection pool usage | + +Connection pool metrics prevent connection exhaustion during traffic bursts. + +**Active connections vs pool limits** - alert when active connections approach the maximum: + +```shell +sum(jdbc_connections_active{app="backend", namespace="$namespace"}) +sum(jdbc_connections_max{app="backend", namespace="$namespace"}) +sum(jdbc_connections_min{app="backend", namespace="$namespace"}) +sum(jdbc_connections_usage{app="backend", namespace="$namespace"}) +``` + +### Hibernate cache metrics + +Hibernate caching reduces database load. Monitor hit rates to ensure caches are effective. 
+ +**Query cache hit rate** - should exceed 60%: + +```shell +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Query plan cache hit rate:** + +```shell +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +**Second level cache hit rate by region:** + +```shell +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) +/ +sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) +``` + +### Logging metrics + +| Metric | Description | +| ---------------------- | ----------------------------------------------------- | +| `logback_events_total` | Log events by level (debug, info, warn, error, trace) | + +Log event metrics provide early warning of application issues. + +**Error rate** - track error log frequency for anomaly detection: + +```shell +rate(logback_events_total{level="error"}[5m]) +``` + +### Kubernetes health + +Monitor pod health to catch deployment or infrastructure issues early. + +**Pods in unhealthy states:** + +```shell +sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 +``` + +--- + +## Alerting recommendations + +### Critical alerts + +- `jvm_memory_used_bytes{area="heap"}` > 90% of `jvm_memory_max_bytes` +- `process_files_open_files` > 90% of `process_files_max_files` +- `logback_events_total{level="error"}` rate > threshold +- `tower_logs_errors_1minCount` > 0 +- HTTP 5xx errors > 5% of total requests +- `jdbc_connections_active` > 90% of `jdbc_connections_max` +- Any pods in Failed/Unknown state for > 5 minutes + +### Warning alerts + +- `jvm_gc_pause_seconds_max` > 1 second +- `jvm_gc_live_data_size_bytes` approaching `jvm_gc_max_data_size_bytes` +- Heap usage > 85% of max heap +- `executor_queued_tasks` > threshold +- Executor utilization > 90% +- `hibernate_optimistic_failures_total` rate increasing +- `hibernate_query_executions_max_seconds` > 5 seconds +- `http_server_requests_seconds` p99 > acceptable latency +- Redis cache hit rate < 70% +- Hibernate query cache hit rate < 60% +- Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` +- `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time + +--- + +## Quick reference: Metrics by troubleshooting scenario + +| Issue | Key Metrics to Check | +| ------------------------------ | ----------------------------------------------------------------------------------------------------------------------------------------- | +| **Slow application response** | `http_server_requests_seconds` (latency), `jvm_gc_pause_seconds_max`, `hibernate_query_executions_max_seconds`, `executor_active_threads` | +| **Out of memory errors** | `jvm_memory_used_bytes`, `jvm_gc_pause_seconds`, `jvm_gc_live_data_size_bytes`, `jvm_buffer_memory_used_bytes` | +| **Database performance** | `hibernate_query_executions_max_seconds`, `jdbc_connections_active`, `hibernate_transactions_total`, cache hit rates | +| **High CPU usage** | `process_cpu_usage`, `system_cpu_usage`, 
`jvm_threads_live_threads`, `executor_active_threads` | +| **Connection exhaustion** | `jdbc_connections_active`, `jdbc_connections_max`, `hibernate_sessions_open_total` vs `hibernate_sessions_closed_total` | +| **Cache issues** | Redis hit rate, `hibernate_cache_query_requests_total`, `cache_gets_total`, `cache_evictions_total` | +| **Workflow processing delays** | `credits_estimation_workflow_*`, `credits_estimation_task_*`, `executor_queued_tasks`, `tower_logs_errors_*` | +| **Thread starvation** | `executor_active_threads`, `executor_queued_tasks`, `jvm_threads_states_threads{state="blocked"}` | +| **Memory leaks** | `jvm_memory_used_bytes` trending up, `jvm_gc_live_data_size_bytes` growing, `jvm_classes_loaded_classes` growing | +| **GC pressure** | `jvm_gc_pause_seconds_max`, `jvm_gc_memory_promoted_bytes_total`, time in GC vs application time | diff --git a/platform-enterprise_versioned_sidebars/version-24.1-sidebars.json b/platform-enterprise_versioned_sidebars/version-24.1-sidebars.json index c6be05750..60796c618 100644 --- a/platform-enterprise_versioned_sidebars/version-24.1-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-24.1-sidebars.json @@ -58,7 +58,8 @@ "enterprise/advanced-topics/use-iam-role", "enterprise/advanced-topics/custom-launch-container", "enterprise/advanced-topics/firewall-configuration", - "enterprise/advanced-topics/jvm-memory-tuning" + "enterprise/advanced-topics/jvm-memory-tuning", + "enterprise/advanced-topics/monitoring" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_versioned_sidebars/version-24.2-sidebars.json b/platform-enterprise_versioned_sidebars/version-24.2-sidebars.json index 09868c874..0cc8d2ae1 100644 --- a/platform-enterprise_versioned_sidebars/version-24.2-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-24.2-sidebars.json @@ -59,7 +59,8 @@ "enterprise/advanced-topics/custom-launch-container", "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", - "enterprise/advanced-topics/jvm-memory-tuning" + "enterprise/advanced-topics/jvm-memory-tuning", + "enterprise/advanced-topics/monitoring" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_versioned_sidebars/version-25.1-sidebars.json b/platform-enterprise_versioned_sidebars/version-25.1-sidebars.json index cae86add7..3ded33fb8 100644 --- a/platform-enterprise_versioned_sidebars/version-25.1-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-25.1-sidebars.json @@ -59,7 +59,8 @@ "enterprise/advanced-topics/custom-launch-container", "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", - "enterprise/advanced-topics/jvm-memory-tuning" + "enterprise/advanced-topics/jvm-memory-tuning", + "enterprise/advanced-topics/monitoring" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_versioned_sidebars/version-25.2-sidebars.json b/platform-enterprise_versioned_sidebars/version-25.2-sidebars.json index 4603f3aec..c2906fb0e 100644 --- a/platform-enterprise_versioned_sidebars/version-25.2-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-25.2-sidebars.json @@ -61,7 +61,8 @@ "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", "enterprise/advanced-topics/content-security-policy", - "enterprise/advanced-topics/jvm-memory-tuning" + "enterprise/advanced-topics/jvm-memory-tuning", + 
"enterprise/advanced-topics/monitoring" ] }, "enterprise/general_troubleshooting" diff --git a/platform-enterprise_versioned_sidebars/version-25.3-sidebars.json b/platform-enterprise_versioned_sidebars/version-25.3-sidebars.json index 5f6512a95..dff85799a 100644 --- a/platform-enterprise_versioned_sidebars/version-25.3-sidebars.json +++ b/platform-enterprise_versioned_sidebars/version-25.3-sidebars.json @@ -60,7 +60,8 @@ "enterprise/advanced-topics/firewall-configuration", "enterprise/advanced-topics/seqera-container-images", "enterprise/advanced-topics/content-security-policy", - "enterprise/advanced-topics/jvm-memory-tuning" + "enterprise/advanced-topics/jvm-memory-tuning", + "enterprise/advanced-topics/monitoring" ] }, "enterprise/general_troubleshooting" From b210df6f4501ba5712bd8013d637f7f2050aca3b Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 21:40:38 +0200 Subject: [PATCH 3/9] Update monitoring.md Signed-off-by: Justine Geffen --- .../enterprise/advanced-topics/monitoring.md | 64 +++++++++---------- 1 file changed, 29 insertions(+), 35 deletions(-) diff --git a/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md b/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md index c7e53d966..9dce9335e 100644 --- a/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md +++ b/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md @@ -1,34 +1,28 @@ --- -title: Seqera Platform Monitoring -headline: "Seqera Platform Monitoring" +title: "Seqera Platform Monitoring" description: "A guide on relevant platform metrics" -date_created: "2025-12-17" +date created: "2025-12-17" +tags: [platform, monitoring] --- -# Seqera Platform Monitoring - -## Enabling observability metrics - Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. ---- - ## Key metrics to monitor ### Seqera Platform-specific metrics -#### Data Studio metrics +#### Studios metrics | Metric | Description | -| ------------------------------------------------ | ------------------------------------ | -| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | -| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | +| ------------------------------------------------ | -------------------------------------| +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Studio startup count | -Track Data Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. +Track Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. 
-**Average startup time by tool:** +**Average startup time by tool** ```shell sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) @@ -36,7 +30,7 @@ sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backen sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Failed startup rate:** +**Failed startup rate** ```shell rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) @@ -52,7 +46,7 @@ rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__r Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. -**Recent error counts:** +**Recent error counts** ```shell tower_logs_errors_10secCount{namespace="$namespace"} @@ -60,7 +54,7 @@ tower_logs_errors_1minCount{namespace="$namespace"} tower_logs_errors_5minCount{namespace="$namespace"} ``` -**Log events by severity level:** +**Log events by severity level** ```shell rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) @@ -72,7 +66,7 @@ rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. -**Backend CPU usage:** +**Backend CPU usage** ```shell rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) @@ -148,13 +142,13 @@ topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$ Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. -**Outbound request rate:** +**Outbound request rate** ```shell rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) ``` -**Average outbound request duration:** +**Average outbound request duration** ```shell rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) @@ -289,7 +283,7 @@ Class loading metrics help identify class loader leaks or excessive dynamic clas jvm_classes_loaded_classes{namespace="$namespace", app="backend"} ``` -**Class unload rate:** +**Class unload rate** ```shell rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) @@ -308,7 +302,7 @@ rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$ Process-level metrics provide visibility into resource consumption and system limits. -**JVM process CPU usage:** +**JVM process CPU usage** ```shell process_cpu_usage{namespace="$namespace"} @@ -342,7 +336,7 @@ process_uptime_seconds{namespace="$namespace"} System metrics provide host-level context for application performance. 
-**System-wide CPU usage:** +**System-wide CPU usage** ```shell system_cpu_usage{namespace="$namespace"} @@ -354,7 +348,7 @@ system_cpu_usage{namespace="$namespace"} system_load_average_1m{namespace="$namespace"} ``` -**Available CPU count:** +**Available CPU count** ```shell system_cpu_count{namespace="$namespace"} @@ -381,7 +375,7 @@ executor_active_threads{service="backend", namespace="$namespace", name!="schedu executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} ``` -**Cron scheduled executor utilization:** +**Cron scheduled executor utilization** ```shell executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} @@ -395,7 +389,7 @@ executor_pool_size_threads{service="cron", namespace="$namespace", name="schedul executor_queued_tasks{app="backend", namespace="$namespace"} ``` -**Task completion rate:** +**Task completion rate** ```shell rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) @@ -420,13 +414,13 @@ avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_inter (irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) ``` -**Cache size by name:** +**Cache size by name** ```shell cache_size{namespace="$namespace"} ``` -**Cache operation rates:** +**Cache operation rates** ```shell rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) @@ -460,13 +454,13 @@ rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rat rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Connection acquisition rate:** +**Connection acquisition rate** ```shell rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Query execution rate:** +**Query execution rate** ```shell rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) @@ -495,7 +489,7 @@ rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__ rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Transaction success/failure rate:** +**Transaction success/failure rate** ```shell sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) @@ -539,7 +533,7 @@ sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$nam sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Query plan cache hit rate:** +**Query plan cache hit rate** ```shell sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) @@ -547,7 +541,7 @@ sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespa sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Second level cache hit rate by region:** +**Second level cache hit rate by region** ```shell sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) @@ -573,7 +567,7 @@ rate(logback_events_total{level="error"}[5m]) Monitor pod health to catch deployment or infrastructure issues early. 
-**Pods in unhealthy states:** +**Pods in unhealthy states** ```shell sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 From ebfd14f02be65caf6e77b759bc7db46cf6db823d Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 21:41:50 +0200 Subject: [PATCH 4/9] Update monitoring.md Signed-off-by: Justine Geffen --- .../enterprise/advanced-topics/monitoring.md | 68 ++++++++----------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/monitoring.md index c7e53d966..c721b3ce3 100644 --- a/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/monitoring.md +++ b/platform-enterprise_versioned_docs/version-24.1/enterprise/advanced-topics/monitoring.md @@ -1,34 +1,28 @@ --- -title: Seqera Platform Monitoring -headline: "Seqera Platform Monitoring" +title: "Seqera Platform Monitoring" description: "A guide on relevant platform metrics" -date_created: "2025-12-17" +date created: "2025-12-17" +tags: [platform, monitoring] --- -# Seqera Platform Monitoring - -## Enabling observability metrics - Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. ---- - ## Key metrics to monitor ### Seqera Platform-specific metrics -#### Data Studio metrics +#### Studios metrics | Metric | Description | -| ------------------------------------------------ | ------------------------------------ | -| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | -| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | +| ------------------------------------------------ | -------------------------------------| +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Studio startup count | -Track Data Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. +Track Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. -**Average startup time by tool:** +**Average startup time by tool** ```shell sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) @@ -36,7 +30,7 @@ sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backen sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Failed startup rate:** +**Failed startup rate** ```shell rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) @@ -52,7 +46,7 @@ rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__r Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. 
-**Recent error counts:** +**Recent error counts** ```shell tower_logs_errors_10secCount{namespace="$namespace"} @@ -60,7 +54,7 @@ tower_logs_errors_1minCount{namespace="$namespace"} tower_logs_errors_5minCount{namespace="$namespace"} ``` -**Log events by severity level:** +**Log events by severity level** ```shell rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) @@ -72,7 +66,7 @@ rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. -**Backend CPU usage:** +**Backend CPU usage** ```shell rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) @@ -148,13 +142,13 @@ topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$ Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. -**Outbound request rate:** +**Outbound request rate** ```shell rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) ``` -**Average outbound request duration:** +**Average outbound request duration** ```shell rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) @@ -289,7 +283,7 @@ Class loading metrics help identify class loader leaks or excessive dynamic clas jvm_classes_loaded_classes{namespace="$namespace", app="backend"} ``` -**Class unload rate:** +**Class unload rate** ```shell rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) @@ -308,7 +302,7 @@ rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$ Process-level metrics provide visibility into resource consumption and system limits. -**JVM process CPU usage:** +**JVM process CPU usage** ```shell process_cpu_usage{namespace="$namespace"} @@ -342,7 +336,7 @@ process_uptime_seconds{namespace="$namespace"} System metrics provide host-level context for application performance. 
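To see which external dependencies are slowest, the rate and duration queries shown below can be combined into an average and ranked. This sketch assumes the client metrics carry a `uri` label, mirroring the server-side request metrics:

```shell
topk(5,
  sum by (uri) (rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]))
  /
  sum by (uri) (rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]))
)
```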
-**System-wide CPU usage:** +**System-wide CPU usage** ```shell system_cpu_usage{namespace="$namespace"} @@ -354,7 +348,7 @@ system_cpu_usage{namespace="$namespace"} system_load_average_1m{namespace="$namespace"} ``` -**Available CPU count:** +**Available CPU count** ```shell system_cpu_count{namespace="$namespace"} @@ -381,7 +375,7 @@ executor_active_threads{service="backend", namespace="$namespace", name!="schedu executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} ``` -**Cron scheduled executor utilization:** +**Cron scheduled executor utilization** ```shell executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} @@ -395,7 +389,7 @@ executor_pool_size_threads{service="cron", namespace="$namespace", name="schedul executor_queued_tasks{app="backend", namespace="$namespace"} ``` -**Task completion rate:** +**Task completion rate** ```shell rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) @@ -420,13 +414,13 @@ avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_inter (irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) ``` -**Cache size by name:** +**Cache size by name** ```shell cache_size{namespace="$namespace"} ``` -**Cache operation rates:** +**Cache operation rates** ```shell rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) @@ -460,13 +454,13 @@ rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rat rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Connection acquisition rate:** +**Connection acquisition rate** ```shell rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Query execution rate:** +**Query execution rate** ```shell rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) @@ -495,7 +489,7 @@ rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__ rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Transaction success/failure rate:** +**Transaction success/failure rate** ```shell sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) @@ -539,7 +533,7 @@ sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$nam sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Query plan cache hit rate:** +**Query plan cache hit rate** ```shell sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) @@ -547,7 +541,7 @@ sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespa sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Second level cache hit rate by region:** +**Second level cache hit rate by region** ```shell sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) @@ -573,14 +567,12 @@ rate(logback_events_total{level="error"}[5m]) Monitor pod health to catch deployment or infrastructure issues early. 
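A quick saturation signal can be derived from the two system metrics shown below by normalizing load against the available cores; sustained values near or above 1 suggest the host is CPU-bound:

```shell
system_load_average_1m{namespace="$namespace"} / system_cpu_count{namespace="$namespace"}
```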
-**Pods in unhealthy states:** +**Pods in unhealthy states** ```shell sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 ``` ---- - ## Alerting recommendations ### Critical alerts @@ -608,8 +600,6 @@ sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", - Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` - `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time ---- - ## Quick reference: Metrics by troubleshooting scenario | Issue | Key Metrics to Check | From fc66603f4dc7e4b22050f4bfb15f2569d73b4c68 Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 21:42:07 +0200 Subject: [PATCH 5/9] Clean up blank lines in monitoring.md Removed unnecessary blank lines in the monitoring documentation. Signed-off-by: Justine Geffen --- .../enterprise/advanced-topics/monitoring.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md b/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md index 9dce9335e..c721b3ce3 100644 --- a/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md +++ b/platform-enterprise_docs/enterprise/advanced-topics/monitoring.md @@ -573,8 +573,6 @@ Monitor pod health to catch deployment or infrastructure issues early. sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 ``` ---- - ## Alerting recommendations ### Critical alerts @@ -602,8 +600,6 @@ sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", - Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` - `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time ---- - ## Quick reference: Metrics by troubleshooting scenario | Issue | Key Metrics to Check | From 1444d1331e55d84501f6f5912f95ba9f8c76d2cb Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 21:43:00 +0200 Subject: [PATCH 6/9] Update monitoring.md Signed-off-by: Justine Geffen --- .../enterprise/advanced-topics/monitoring.md | 68 ++++++++----------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/monitoring.md index c7e53d966..c721b3ce3 100644 --- a/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/monitoring.md +++ b/platform-enterprise_versioned_docs/version-24.2/enterprise/advanced-topics/monitoring.md @@ -1,34 +1,28 @@ --- -title: Seqera Platform Monitoring -headline: "Seqera Platform Monitoring" +title: "Seqera Platform Monitoring" description: "A guide on relevant platform metrics" -date_created: "2025-12-17" +date created: "2025-12-17" +tags: [platform, monitoring] --- -# Seqera Platform Monitoring - -## Enabling observability metrics - Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. 
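As an example of the host-level context Node Exporter can add alongside the application metrics in this guide, a query such as the following reports the fraction of memory still available on each node. `node_memory_MemAvailable_bytes` and `node_memory_MemTotal_bytes` are standard Node Exporter metrics, and the sketch assumes Node Exporter is scraped into the same Prometheus instance:

```shell
node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes
```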
---- - ## Key metrics to monitor ### Seqera Platform-specific metrics -#### Data Studio metrics +#### Studios metrics | Metric | Description | -| ------------------------------------------------ | ------------------------------------ | -| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | -| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | +| ------------------------------------------------ | -------------------------------------| +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Studio startup count | -Track Data Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. +Track Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. -**Average startup time by tool:** +**Average startup time by tool** ```shell sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) @@ -36,7 +30,7 @@ sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backen sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Failed startup rate:** +**Failed startup rate** ```shell rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) @@ -52,7 +46,7 @@ rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__r Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. -**Recent error counts:** +**Recent error counts** ```shell tower_logs_errors_10secCount{namespace="$namespace"} @@ -60,7 +54,7 @@ tower_logs_errors_1minCount{namespace="$namespace"} tower_logs_errors_5minCount{namespace="$namespace"} ``` -**Log events by severity level:** +**Log events by severity level** ```shell rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) @@ -72,7 +66,7 @@ rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. -**Backend CPU usage:** +**Backend CPU usage** ```shell rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) @@ -148,13 +142,13 @@ topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$ Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. 
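Beyond volume and latency, it can be useful to track what fraction of outbound calls fail. This sketch assumes the client metrics expose a `status` label containing numeric HTTP codes; adjust the matcher if your label values differ:

```shell
sum(rate(http_client_requests_seconds_count{namespace="$namespace", status=~"5.."}[$__rate_interval]))
/
sum(rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]))
```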
-**Outbound request rate:** +**Outbound request rate** ```shell rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) ``` -**Average outbound request duration:** +**Average outbound request duration** ```shell rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) @@ -289,7 +283,7 @@ Class loading metrics help identify class loader leaks or excessive dynamic clas jvm_classes_loaded_classes{namespace="$namespace", app="backend"} ``` -**Class unload rate:** +**Class unload rate** ```shell rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) @@ -308,7 +302,7 @@ rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$ Process-level metrics provide visibility into resource consumption and system limits. -**JVM process CPU usage:** +**JVM process CPU usage** ```shell process_cpu_usage{namespace="$namespace"} @@ -342,7 +336,7 @@ process_uptime_seconds{namespace="$namespace"} System metrics provide host-level context for application performance. -**System-wide CPU usage:** +**System-wide CPU usage** ```shell system_cpu_usage{namespace="$namespace"} @@ -354,7 +348,7 @@ system_cpu_usage{namespace="$namespace"} system_load_average_1m{namespace="$namespace"} ``` -**Available CPU count:** +**Available CPU count** ```shell system_cpu_count{namespace="$namespace"} @@ -381,7 +375,7 @@ executor_active_threads{service="backend", namespace="$namespace", name!="schedu executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} ``` -**Cron scheduled executor utilization:** +**Cron scheduled executor utilization** ```shell executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} @@ -395,7 +389,7 @@ executor_pool_size_threads{service="cron", namespace="$namespace", name="schedul executor_queued_tasks{app="backend", namespace="$namespace"} ``` -**Task completion rate:** +**Task completion rate** ```shell rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) @@ -420,13 +414,13 @@ avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_inter (irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) ``` -**Cache size by name:** +**Cache size by name** ```shell cache_size{namespace="$namespace"} ``` -**Cache operation rates:** +**Cache operation rates** ```shell rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) @@ -460,13 +454,13 @@ rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rat rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Connection acquisition rate:** +**Connection acquisition rate** ```shell rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Query execution rate:** +**Query execution rate** ```shell rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) @@ -495,7 +489,7 @@ rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__ rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Transaction success/failure rate:** +**Transaction success/failure rate** ```shell sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) @@ -539,7 +533,7 @@ 
sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$nam sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Query plan cache hit rate:** +**Query plan cache hit rate** ```shell sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) @@ -547,7 +541,7 @@ sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespa sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Second level cache hit rate by region:** +**Second level cache hit rate by region** ```shell sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) @@ -573,14 +567,12 @@ rate(logback_events_total{level="error"}[5m]) Monitor pod health to catch deployment or infrastructure issues early. -**Pods in unhealthy states:** +**Pods in unhealthy states** ```shell sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 ``` ---- - ## Alerting recommendations ### Critical alerts @@ -608,8 +600,6 @@ sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", - Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` - `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time ---- - ## Quick reference: Metrics by troubleshooting scenario | Issue | Key Metrics to Check | From 47138ad960f585cdbf0903bccac3278b984dec90 Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 21:43:21 +0200 Subject: [PATCH 7/9] Refine Seqera Platform monitoring documentation Updated monitoring documentation for Seqera Platform, including corrections to metric names and descriptions. Signed-off-by: Justine Geffen --- .../enterprise/advanced-topics/monitoring.md | 68 ++++++++----------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/monitoring.md index c7e53d966..c721b3ce3 100644 --- a/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/monitoring.md +++ b/platform-enterprise_versioned_docs/version-25.1/enterprise/advanced-topics/monitoring.md @@ -1,34 +1,28 @@ --- -title: Seqera Platform Monitoring -headline: "Seqera Platform Monitoring" +title: "Seqera Platform Monitoring" description: "A guide on relevant platform metrics" -date_created: "2025-12-17" +date created: "2025-12-17" +tags: [platform, monitoring] --- -# Seqera Platform Monitoring - -## Enabling observability metrics - Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. 
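A single service-level signal that is often worth graphing first, built from the HTTP server request metrics documented below, is the fraction of requests that return a 5xx status. The `status=~"5.."` matcher assumes the label values are numeric status codes:

```shell
sum(rate(http_server_requests_seconds_count{namespace="$namespace", status=~"5.."}[$__rate_interval]))
/
sum(rate(http_server_requests_seconds_count{namespace="$namespace"}[$__rate_interval]))
```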
---- - ## Key metrics to monitor ### Seqera Platform-specific metrics -#### Data Studio metrics +#### Studios metrics | Metric | Description | -| ------------------------------------------------ | ------------------------------------ | -| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | -| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | +| ------------------------------------------------ | -------------------------------------| +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Studio startup count | -Track Data Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. +Track Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. -**Average startup time by tool:** +**Average startup time by tool** ```shell sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) @@ -36,7 +30,7 @@ sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backen sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Failed startup rate:** +**Failed startup rate** ```shell rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) @@ -52,7 +46,7 @@ rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__r Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. -**Recent error counts:** +**Recent error counts** ```shell tower_logs_errors_10secCount{namespace="$namespace"} @@ -60,7 +54,7 @@ tower_logs_errors_1minCount{namespace="$namespace"} tower_logs_errors_5minCount{namespace="$namespace"} ``` -**Log events by severity level:** +**Log events by severity level** ```shell rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) @@ -72,7 +66,7 @@ rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. -**Backend CPU usage:** +**Backend CPU usage** ```shell rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) @@ -148,13 +142,13 @@ topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$ Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. 
-**Outbound request rate:** +**Outbound request rate** ```shell rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) ``` -**Average outbound request duration:** +**Average outbound request duration** ```shell rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) @@ -289,7 +283,7 @@ Class loading metrics help identify class loader leaks or excessive dynamic clas jvm_classes_loaded_classes{namespace="$namespace", app="backend"} ``` -**Class unload rate:** +**Class unload rate** ```shell rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) @@ -308,7 +302,7 @@ rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$ Process-level metrics provide visibility into resource consumption and system limits. -**JVM process CPU usage:** +**JVM process CPU usage** ```shell process_cpu_usage{namespace="$namespace"} @@ -342,7 +336,7 @@ process_uptime_seconds{namespace="$namespace"} System metrics provide host-level context for application performance. -**System-wide CPU usage:** +**System-wide CPU usage** ```shell system_cpu_usage{namespace="$namespace"} @@ -354,7 +348,7 @@ system_cpu_usage{namespace="$namespace"} system_load_average_1m{namespace="$namespace"} ``` -**Available CPU count:** +**Available CPU count** ```shell system_cpu_count{namespace="$namespace"} @@ -381,7 +375,7 @@ executor_active_threads{service="backend", namespace="$namespace", name!="schedu executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} ``` -**Cron scheduled executor utilization:** +**Cron scheduled executor utilization** ```shell executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} @@ -395,7 +389,7 @@ executor_pool_size_threads{service="cron", namespace="$namespace", name="schedul executor_queued_tasks{app="backend", namespace="$namespace"} ``` -**Task completion rate:** +**Task completion rate** ```shell rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) @@ -420,13 +414,13 @@ avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_inter (irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) ``` -**Cache size by name:** +**Cache size by name** ```shell cache_size{namespace="$namespace"} ``` -**Cache operation rates:** +**Cache operation rates** ```shell rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) @@ -460,13 +454,13 @@ rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rat rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Connection acquisition rate:** +**Connection acquisition rate** ```shell rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Query execution rate:** +**Query execution rate** ```shell rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) @@ -495,7 +489,7 @@ rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__ rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Transaction success/failure rate:** +**Transaction success/failure rate** ```shell sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) @@ -539,7 +533,7 @@ 
sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$nam sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Query plan cache hit rate:** +**Query plan cache hit rate** ```shell sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) @@ -547,7 +541,7 @@ sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespa sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Second level cache hit rate by region:** +**Second level cache hit rate by region** ```shell sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) @@ -573,14 +567,12 @@ rate(logback_events_total{level="error"}[5m]) Monitor pod health to catch deployment or infrastructure issues early. -**Pods in unhealthy states:** +**Pods in unhealthy states** ```shell sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 ``` ---- - ## Alerting recommendations ### Critical alerts @@ -608,8 +600,6 @@ sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", - Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` - `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time ---- - ## Quick reference: Metrics by troubleshooting scenario | Issue | Key Metrics to Check | From efc34357ebb8e41798f68b263765a85c3bde6753 Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 21:43:51 +0200 Subject: [PATCH 8/9] Revise monitoring.md for terminology updates Updated monitoring documentation for Seqera Platform to reflect changes in terminology and improve clarity. Signed-off-by: Justine Geffen --- .../enterprise/advanced-topics/monitoring.md | 68 ++++++++----------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/monitoring.md index c7e53d966..c721b3ce3 100644 --- a/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/monitoring.md +++ b/platform-enterprise_versioned_docs/version-25.2/enterprise/advanced-topics/monitoring.md @@ -1,34 +1,28 @@ --- -title: Seqera Platform Monitoring -headline: "Seqera Platform Monitoring" +title: "Seqera Platform Monitoring" description: "A guide on relevant platform metrics" -date_created: "2025-12-17" +date created: "2025-12-17" +tags: [platform, monitoring] --- -# Seqera Platform Monitoring - -## Enabling observability metrics - Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. 
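As a rough sketch of overall heap pressure built from the JVM memory metrics documented below, used heap can be compared against the configured maximum. The `area="heap"` label value and the `pod` grouping label are assumptions based on standard Micrometer and Kubernetes labelling, and note that some memory pools report a max of -1, which can distort the denominator:

```shell
sum by (pod) (jvm_memory_used_bytes{namespace="$namespace", app="backend", area="heap"})
/
sum by (pod) (jvm_memory_max_bytes{namespace="$namespace", app="backend", area="heap"})
```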
---- - ## Key metrics to monitor ### Seqera Platform-specific metrics -#### Data Studio metrics +#### Studios metrics | Metric | Description | -| ------------------------------------------------ | ------------------------------------ | -| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | -| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | +| ------------------------------------------------ | -------------------------------------| +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Studio startup count | -Track Data Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. +Track Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. -**Average startup time by tool:** +**Average startup time by tool** ```shell sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) @@ -36,7 +30,7 @@ sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backen sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Failed startup rate:** +**Failed startup rate** ```shell rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) @@ -52,7 +46,7 @@ rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__r Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. -**Recent error counts:** +**Recent error counts** ```shell tower_logs_errors_10secCount{namespace="$namespace"} @@ -60,7 +54,7 @@ tower_logs_errors_1minCount{namespace="$namespace"} tower_logs_errors_5minCount{namespace="$namespace"} ``` -**Log events by severity level:** +**Log events by severity level** ```shell rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) @@ -72,7 +66,7 @@ rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. -**Backend CPU usage:** +**Backend CPU usage** ```shell rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) @@ -148,13 +142,13 @@ topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$ Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. 
-**Outbound request rate:** +**Outbound request rate** ```shell rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) ``` -**Average outbound request duration:** +**Average outbound request duration** ```shell rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) @@ -289,7 +283,7 @@ Class loading metrics help identify class loader leaks or excessive dynamic clas jvm_classes_loaded_classes{namespace="$namespace", app="backend"} ``` -**Class unload rate:** +**Class unload rate** ```shell rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) @@ -308,7 +302,7 @@ rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$ Process-level metrics provide visibility into resource consumption and system limits. -**JVM process CPU usage:** +**JVM process CPU usage** ```shell process_cpu_usage{namespace="$namespace"} @@ -342,7 +336,7 @@ process_uptime_seconds{namespace="$namespace"} System metrics provide host-level context for application performance. -**System-wide CPU usage:** +**System-wide CPU usage** ```shell system_cpu_usage{namespace="$namespace"} @@ -354,7 +348,7 @@ system_cpu_usage{namespace="$namespace"} system_load_average_1m{namespace="$namespace"} ``` -**Available CPU count:** +**Available CPU count** ```shell system_cpu_count{namespace="$namespace"} @@ -381,7 +375,7 @@ executor_active_threads{service="backend", namespace="$namespace", name!="schedu executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} ``` -**Cron scheduled executor utilization:** +**Cron scheduled executor utilization** ```shell executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} @@ -395,7 +389,7 @@ executor_pool_size_threads{service="cron", namespace="$namespace", name="schedul executor_queued_tasks{app="backend", namespace="$namespace"} ``` -**Task completion rate:** +**Task completion rate** ```shell rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) @@ -420,13 +414,13 @@ avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_inter (irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) ``` -**Cache size by name:** +**Cache size by name** ```shell cache_size{namespace="$namespace"} ``` -**Cache operation rates:** +**Cache operation rates** ```shell rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) @@ -460,13 +454,13 @@ rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rat rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Connection acquisition rate:** +**Connection acquisition rate** ```shell rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Query execution rate:** +**Query execution rate** ```shell rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) @@ -495,7 +489,7 @@ rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__ rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Transaction success/failure rate:** +**Transaction success/failure rate** ```shell sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) @@ -539,7 +533,7 @@ 
sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$nam sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Query plan cache hit rate:** +**Query plan cache hit rate** ```shell sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) @@ -547,7 +541,7 @@ sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespa sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Second level cache hit rate by region:** +**Second level cache hit rate by region** ```shell sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) @@ -573,14 +567,12 @@ rate(logback_events_total{level="error"}[5m]) Monitor pod health to catch deployment or infrastructure issues early. -**Pods in unhealthy states:** +**Pods in unhealthy states** ```shell sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 ``` ---- - ## Alerting recommendations ### Critical alerts @@ -608,8 +600,6 @@ sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", - Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` - `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time ---- - ## Quick reference: Metrics by troubleshooting scenario | Issue | Key Metrics to Check | From 817b4ef1db569eb1214561d099e73bc30129eae0 Mon Sep 17 00:00:00 2001 From: Justine Geffen Date: Wed, 17 Dec 2025 21:44:05 +0200 Subject: [PATCH 9/9] Refine Seqera Platform Monitoring documentation Updated monitoring documentation to improve clarity and consistency in terminology. Signed-off-by: Justine Geffen --- .../enterprise/advanced-topics/monitoring.md | 68 ++++++++----------- 1 file changed, 29 insertions(+), 39 deletions(-) diff --git a/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/monitoring.md b/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/monitoring.md index c7e53d966..c721b3ce3 100644 --- a/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/monitoring.md +++ b/platform-enterprise_versioned_docs/version-25.3/enterprise/advanced-topics/monitoring.md @@ -1,34 +1,28 @@ --- -title: Seqera Platform Monitoring -headline: "Seqera Platform Monitoring" +title: "Seqera Platform Monitoring" description: "A guide on relevant platform metrics" -date_created: "2025-12-17" +date created: "2025-12-17" +tags: [platform, monitoring] --- -# Seqera Platform Monitoring - -## Enabling observability metrics - Seqera Platform has built-in observability metrics which can be enabled by adding `prometheus` to the `MICRONAUT_ENVIRONMENTS` environment variable. This exposes a Prometheus endpoint at `/prometheus` on the default listen port (e.g., `http://localhost:8080/prometheus`). Combined with infrastructure monitoring tools such as Node Exporter, you can monitor relevant metrics across your deployment. 
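In addition to raw CPU usage, CPU throttling is often the earlier warning that a container's CPU limit is too low. This assumes cAdvisor/kubelet metrics are available, as they are for the container CPU queries later in this guide; `container_cpu_cfs_throttled_seconds_total` is a standard cAdvisor metric rather than one emitted by the Platform itself:

```shell
rate(container_cpu_cfs_throttled_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval])
```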
---- - ## Key metrics to monitor ### Seqera Platform-specific metrics -#### Data Studio metrics +#### Studios metrics | Metric | Description | -| ------------------------------------------------ | ------------------------------------ | -| `data_studio_startup_time_failure_seconds_sum` | Time for failed Data Studio startups | -| `data_studio_startup_time_failure_seconds_count` | Failed Data Studio startup count | +| ------------------------------------------------ | -------------------------------------| +| `data_studio_startup_time_failure_seconds_sum` | Time for failed Studio startups | +| `data_studio_startup_time_failure_seconds_count` | Failed Studio startup count | -Track Data Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. +Track Studio startup performance to identify environment provisioning issues. Slow or failing startups impact user productivity. -**Average startup time by tool:** +**Average startup time by tool** ```shell sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backend", namespace="$namespace"}[$__rate_interval])) @@ -36,7 +30,7 @@ sum by (tool) (increase(data_studio_startup_time_success_seconds_sum{app="backen sum by (tool) (increase(data_studio_startup_time_success_seconds_count{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Failed startup rate:** +**Failed startup rate** ```shell rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__rate_interval]) @@ -52,7 +46,7 @@ rate(data_studio_startup_time_failure_seconds_count{namespace="$namespace"}[$__r Monitor application errors across different time windows. Rolling error counts help identify transient issues versus sustained problems. -**Recent error counts:** +**Recent error counts** ```shell tower_logs_errors_10secCount{namespace="$namespace"} @@ -60,7 +54,7 @@ tower_logs_errors_1minCount{namespace="$namespace"} tower_logs_errors_5minCount{namespace="$namespace"} ``` -**Log events by severity level:** +**Log events by severity level** ```shell rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) @@ -72,7 +66,7 @@ rate(logback_events_total{namespace="$namespace"}[$__rate_interval]) Monitor container CPU consumption against requested resources to identify capacity issues or inefficient resource allocation. -**Backend CPU usage:** +**Backend CPU usage** ```shell rate(container_cpu_usage_seconds_total{container="backend", namespace="$namespace"}[$__rate_interval]) @@ -148,13 +142,13 @@ topk(10, sum by(method, uri) (rate(http_server_requests_seconds_sum{namespace="$ Monitor external API calls and integrations. Slow or failing outbound requests can cascade into application performance issues. 
-**Outbound request rate:** +**Outbound request rate** ```shell rate(http_client_requests_seconds_count{namespace="$namespace"}[$__rate_interval]) ``` -**Average outbound request duration:** +**Average outbound request duration** ```shell rate(http_client_requests_seconds_sum{namespace="$namespace"}[$__rate_interval]) @@ -289,7 +283,7 @@ Class loading metrics help identify class loader leaks or excessive dynamic clas jvm_classes_loaded_classes{namespace="$namespace", app="backend"} ``` -**Class unload rate:** +**Class unload rate** ```shell rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$__rate_interval]) @@ -308,7 +302,7 @@ rate(jvm_classes_unloaded_classes_total{namespace="$namespace", app="backend"}[$ Process-level metrics provide visibility into resource consumption and system limits. -**JVM process CPU usage:** +**JVM process CPU usage** ```shell process_cpu_usage{namespace="$namespace"} @@ -342,7 +336,7 @@ process_uptime_seconds{namespace="$namespace"} System metrics provide host-level context for application performance. -**System-wide CPU usage:** +**System-wide CPU usage** ```shell system_cpu_usage{namespace="$namespace"} @@ -354,7 +348,7 @@ system_cpu_usage{namespace="$namespace"} system_load_average_1m{namespace="$namespace"} ``` -**Available CPU count:** +**Available CPU count** ```shell system_cpu_count{namespace="$namespace"} @@ -381,7 +375,7 @@ executor_active_threads{service="backend", namespace="$namespace", name!="schedu executor_pool_size_threads{service="backend", namespace="$namespace", name!="scheduled"} ``` -**Cron scheduled executor utilization:** +**Cron scheduled executor utilization** ```shell executor_active_threads{service="cron", namespace="$namespace", name="scheduled"} @@ -395,7 +389,7 @@ executor_pool_size_threads{service="cron", namespace="$namespace", name="schedul executor_queued_tasks{app="backend", namespace="$namespace"} ``` -**Task completion rate:** +**Task completion rate** ```shell rate(executor_completed_tasks_total{namespace="$namespace"}[$__rate_interval]) @@ -420,13 +414,13 @@ avg(irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_inter (irate(redis_keyspace_misses_total{app="platform-redis-exporter"}[$__rate_interval]) + irate(redis_keyspace_hits_total{app="platform-redis-exporter"}[$__rate_interval]))) ``` -**Cache size by name:** +**Cache size by name** ```shell cache_size{namespace="$namespace"} ``` -**Cache operation rates:** +**Cache operation rates** ```shell rate(cache_gets_total{namespace="$namespace"}[$__rate_interval]) @@ -460,13 +454,13 @@ rate(hibernate_sessions_open_total{app="backend", namespace="$namespace"}[$__rat rate(hibernate_sessions_closed_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Connection acquisition rate:** +**Connection acquisition rate** ```shell rate(hibernate_connections_obtained_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Query execution rate:** +**Query execution rate** ```shell rate(hibernate_query_executions_total{app="backend", namespace="$namespace"}[$__rate_interval]) @@ -495,7 +489,7 @@ rate(hibernate_entities_deletes_total{app="backend", namespace="$namespace"}[$__ rate(hibernate_entities_loads_total{app="backend", namespace="$namespace"}[$__rate_interval]) ``` -**Transaction success/failure rate:** +**Transaction success/failure rate** ```shell sum by (result) (rate(hibernate_transactions_total{app="backend", namespace="$namespace"}[$__rate_interval])) @@ -539,7 +533,7 @@ 
sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$nam sum(increase(hibernate_cache_query_requests_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Query plan cache hit rate:** +**Query plan cache hit rate** ```shell sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) @@ -547,7 +541,7 @@ sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespa sum(increase(hibernate_cache_query_plan_total{app="backend", namespace="$namespace"}[$__rate_interval])) ``` -**Second level cache hit rate by region:** +**Second level cache hit rate by region** ```shell sum by (region) (increase(hibernate_second_level_cache_requests_total{app="backend", namespace="$namespace", result="hit"}[$__rate_interval])) @@ -573,14 +567,12 @@ rate(logback_events_total{level="error"}[5m]) Monitor pod health to catch deployment or infrastructure issues early. -**Pods in unhealthy states:** +**Pods in unhealthy states** ```shell sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", namespace!="wave-build"}) > 0 ``` ---- - ## Alerting recommendations ### Critical alerts @@ -608,8 +600,6 @@ sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", - Growing gap between `credits_estimation_workflow_added_total` and `credits_estimation_workflow_ended_total` - `hibernate_sessions_open_total` >> `hibernate_sessions_closed_total` over time ---- - ## Quick reference: Metrics by troubleshooting scenario | Issue | Key Metrics to Check |