Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
* [CHANGE] Target 3M memory series per ingester instead of 1.5M
* [CHANGE] Update jsonnet-libs to Fri Jul 19 12:51:49 2024 #57
* [CHANGE] Increase CortexProvisioningTooManyWrites alert threshold to 160e3
* [CHANGE] Use `timeseriesPanel` instead of `panel` when creating panels #58
* [ENHANCEMENT] Configure `-ingester.client.grpc-compression` to be `snappy-block`
* [ENHANCEMENT] Support Grafana 11 in Cortex Service Scaling Dashboard
* [BUGFIX] Remove deprecated option `max_series_per_query`
Expand Down
42 changes: 21 additions & 21 deletions cortex-mixin/dashboards/alertmanager.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -10,22 +10,22 @@ local utils = import 'mixin-utils/utils.libsonnet';
showTitle: false,
})
.addPanel(
$.panel('Total Alerts') +
$.timeseriesPanel('Total Alerts') +
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short')
)
.addPanel(
$.panel('Total Silences') +
$.timeseriesPanel('Total Silences') +
$.statPanel('sum(cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)], format='short')
)
.addPanel(
$.panel('Tenants') +
$.timeseriesPanel('Tenants') +
$.statPanel('max(cortex_alertmanager_tenants_discovered{%s})' % $.jobMatcher($._config.job_names.alertmanager), format='short')
)
)
.addRow(
$.row('Alerts Received')
.addPanel(
$.panel('APS') +
$.timeseriesPanel('APS') +
$.queryPanel(
[
|||
Expand All @@ -42,7 +42,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Alert Notifications')
.addPanel(
$.panel('NPS') +
$.timeseriesPanel('NPS') +
$.queryPanel(
[
|||
Expand All @@ -56,7 +56,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addPanel(
$.panel('NPS by integration') +
$.timeseriesPanel('NPS by integration') +
$.queryPanel(
[
|||
Expand All @@ -73,18 +73,18 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addPanel(
$.panel('Latency') +
$.timeseriesPanel('Latency', unit='ms') +
$.latencyPanel('cortex_alertmanager_notification_latency_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager))
)
)
.addRow(
$.row('Configuration API (gateway) + Alertmanager UI')
.addPanel(
$.panel('QPS') +
$.timeseriesPanel('QPS') +
$.qpsPanel('cortex_request_duration_seconds_count{%s, route=~"api_v1_alerts|alertmanager"}' % $.jobMatcher($._config.job_names.gateway))
)
.addPanel(
$.panel('Latency') +
$.timeseriesPanel('Latency', unit='ms') +
utils.latencyRecordingRulePanel('cortex_request_duration_seconds', $.jobSelector($._config.job_names.gateway) + [utils.selector.re('route', 'api_v1_alerts|alertmanager')])
)
)
Expand All @@ -94,23 +94,23 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Replication')
.addPanel(
$.panel('Per %s Tenants' % $._config.per_instance_label) +
$.timeseriesPanel('Per %s Tenants' % $._config.per_instance_label) +
$.queryPanel(
'max by(%s) (cortex_alertmanager_tenants_owned{%s})' % [$._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)],
'{{%s}}' % $._config.per_instance_label
) +
$.stack
)
.addPanel(
$.panel('Per %s Alerts' % $._config.per_instance_label) +
$.timeseriesPanel('Per %s Alerts' % $._config.per_instance_label) +
$.queryPanel(
'sum by(%s) (cluster_job_%s:cortex_alertmanager_alerts:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)],
'{{%s}}' % $._config.per_instance_label
) +
$.stack
)
.addPanel(
$.panel('Per %s Silences' % $._config.per_instance_label) +
$.timeseriesPanel('Per %s Silences' % $._config.per_instance_label) +
$.queryPanel(
'sum by(%s) (cluster_job_%s:cortex_alertmanager_silences:sum{%s})' % [$._config.per_instance_label, $._config.per_instance_label, $.jobMatcher($._config.job_names.alertmanager)],
'{{%s}}' % $._config.per_instance_label
Expand All @@ -121,7 +121,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Tenant Configuration Sync')
.addPanel(
$.panel('Syncs/sec') +
$.timeseriesPanel('Syncs/sec') +
$.queryPanel(
[
|||
Expand All @@ -135,14 +135,14 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addPanel(
$.panel('Syncs/sec (By Reason)') +
$.timeseriesPanel('Syncs/sec (By Reason)') +
$.queryPanel(
'sum by(reason) (rate(cortex_alertmanager_sync_configs_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
'{{reason}}'
)
)
.addPanel(
$.panel('Ring Check Errors/sec') +
$.timeseriesPanel('Ring Check Errors/sec') +
$.queryPanel(
'sum (rate(cortex_alertmanager_ring_check_errors_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
'errors'
Expand All @@ -152,7 +152,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Sharding Initial State Sync')
.addPanel(
$.panel('Initial syncs /sec') +
$.timeseriesPanel('Initial syncs /sec') +
$.queryPanel(
'sum by(outcome) (rate(cortex_alertmanager_state_initial_sync_completed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.alertmanager),
'{{outcome}}'
Expand All @@ -166,7 +166,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
}
)
.addPanel(
$.panel('Initial sync duration') +
$.timeseriesPanel('Initial sync duration', unit='s') +
$.latencyPanel('cortex_alertmanager_state_initial_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.alertmanager)) + {
targets: [
target {
Expand All @@ -177,7 +177,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
}
)
.addPanel(
$.panel('Fetch state from other alertmanagers /sec') +
$.timeseriesPanel('Fetch state from other alertmanagers /sec') +
$.queryPanel(
[
|||
Expand All @@ -201,7 +201,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Sharding Runtime State Sync')
.addPanel(
$.panel('Replicate state to other alertmanagers /sec') +
$.timeseriesPanel('Replicate state to other alertmanagers /sec') +
$.queryPanel(
[
|||
Expand All @@ -215,7 +215,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addPanel(
$.panel('Merge state from other alertmanagers /sec') +
$.timeseriesPanel('Merge state from other alertmanagers /sec') +
$.queryPanel(
[
|||
Expand All @@ -229,7 +229,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
)
)
.addPanel(
$.panel('Persist state to remote storage /sec') +
$.timeseriesPanel('Persist state to remote storage /sec') +
$.queryPanel(
[
|||
Expand Down
19 changes: 8 additions & 11 deletions cortex-mixin/dashboards/compactor.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ local utils = import 'mixin-utils/utils.libsonnet';
'sum(rate(cortex_compactor_runs_failed_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor)
) +
$.bars +
{ yaxes: $.yaxes('ops') } +
$.panelDescription(
'Per-instance runs',
|||
Expand All @@ -23,7 +22,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
),
)
.addPanel(
$.panel('Tenants compaction progress') +
$.timeseriesPanel('Tenants compaction progress') +
$.queryPanel(|||
(
cortex_compactor_tenants_processing_succeeded{%s} +
Expand All @@ -44,9 +43,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('')
.addPanel(
$.panel('Compacted blocks / sec') +
$.timeseriesPanel('Compacted blocks / sec', unit='ops') +
$.queryPanel('sum(rate(prometheus_tsdb_compactions_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') +
{ yaxes: $.yaxes('ops') } +
$.panelDescription(
'Compacted blocks / sec',
|||
Expand All @@ -55,7 +53,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
),
)
.addPanel(
$.panel('Per-block compaction duration') +
$.timeseriesPanel('Per-block compaction duration', unit='s') +
$.latencyPanel('prometheus_tsdb_compaction_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)) +
$.panelDescription(
'Per-block compaction duration',
Expand All @@ -68,11 +66,11 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('')
.addPanel(
$.panel('Average blocks / tenant') +
$.timeseriesPanel('Average blocks / tenant') +
$.queryPanel('avg(max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), 'avg'),
)
.addPanel(
$.panel('Tenants with largest number of blocks') +
$.timeseriesPanel('Tenants with largest number of blocks') +
$.queryPanel('topk(10, max by(user) (cortex_bucket_blocks_count{%s}))' % $.jobMatcher($._config.job_names.compactor), '{{user}}') +
$.panelDescription(
'Tenants with largest number of blocks',
Expand All @@ -85,9 +83,8 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Garbage Collector')
.addPanel(
$.panel('Blocks marked for deletion / sec') +
$.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks') +
{ yaxes: $.yaxes('ops') },
$.timeseriesPanel('Blocks marked for deletion / sec', unit='ops') +
$.queryPanel('sum(rate(cortex_compactor_blocks_marked_for_deletion_total{%s}[$__rate_interval]))' % $.jobMatcher($._config.job_names.compactor), 'blocks'),
)
.addPanel(
$.successFailurePanel(
Expand All @@ -111,7 +108,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
) + { yaxes: $.yaxes('ops') }
)
.addPanel(
$.panel('Metadata Sync Duration') +
$.timeseriesPanel('Metadata Sync Duration', unit='ms') +
// This metric tracks the duration of a per-tenant metadata sync.
$.latencyPanel('cortex_compactor_meta_sync_duration_seconds', '{%s}' % $.jobMatcher($._config.job_names.compactor)),
)
Expand Down
10 changes: 4 additions & 6 deletions cortex-mixin/dashboards/config.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,17 @@ local utils = import 'mixin-utils/utils.libsonnet';
.addRow(
$.row('Startup config file')
.addPanel(
$.panel('Startup config file hashes') +
$.timeseriesPanel('Startup config file hashes', unit='instances') +
$.queryPanel('count(cortex_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') +
$.stack +
{ yaxes: $.yaxes('instances') },
$.stack,
)
)
.addRow(
$.row('Runtime config file')
.addPanel(
$.panel('Runtime config file hashes') +
$.timeseriesPanel('Runtime config file hashes', unit='instances') +
$.queryPanel('count(cortex_runtime_config_hash{%s}) by (sha256)' % $.namespaceMatcher(), 'sha256:{{sha256}}') +
$.stack +
{ yaxes: $.yaxes('instances') },
$.stack,
)
),
}
Loading