Commit 306a7bc

Add alerts for confirmation requests rate (#1992)
A global one and a per-member one, as suggested on DACH-NY/canton-network-internal#1307.

Signed-off-by: Martin Florian <[email protected]>
1 parent 1ff2c6f commit 306a7bc

5 files changed (+146, -1 lines)


cluster/deployment/config.yaml

Lines changed: 9 additions & 0 deletions
@@ -73,6 +73,15 @@ monitoring:
   trafficWaste:
     kilobytes: 1
     overMinutes: 5
+  # confirmation requests correspond to new ledger submissions;
+  # we alert if the rate is higher than expected to spot potential overload situations
+  confirmationRequests:
+    total:
+      rate: 10
+      overMinutes: 5
+    perMember:
+      rate: 2
+      overMinutes: 5
   cloudSql:
     maintenance: false
   cometbft:
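
For reference, the two thresholds above drive Grafana alert rules over the sequencer's confirmation-request counter (see the regenerated traffic_alerts.yaml in cluster/expected/infra/expected.json below). The generated rules evaluate PromQL along the following lines, with overMinutes presumably mapping to the range selector ([5m]) and rate to the threshold value:

    # total confirmation-request send rate, per migration ID
    # (alert when this exceeds confirmationRequests.total.rate)
    max by (migration_id) (sum by (namespace, migration_id) (
      rate(daml_sequencer_block_events_total{type="send-confirmation-request"}[5m])))

    # per-member confirmation-request send rate
    # (alert when this exceeds confirmationRequests.perMember.rate)
    max by (member, migration_id) (
      rate(daml_sequencer_block_events_total{type="send-confirmation-request"}[5m]))

The total rule is keyed by migration_id only, while the per-member rule additionally keys by member, so a single busy member can trip its alert without pushing the global rate over the total threshold.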

cluster/expected/infra/expected.json

Lines changed: 1 addition & 1 deletion
@@ -864,7 +864,7 @@
"notification_policies.yaml": "apiVersion: 1\npolicies:\n - orgId: 1\n receiver: cn-ci-channel-notification\n routes:\n - receiver: cn-ci-channel-high-prio-notification\n object_matchers:\n - - priority\n - '='\n - high\n group_wait: 30s\n group_interval: 30m\n repeat_interval: 4h\n continue: true\n - receiver: cn-ci-channel-notification\n group_wait: 30s\n group_interval: 30m\n repeat_interval: 4h\n",
"sv-status-report_alerts.yaml": "apiVersion: 1\ngroups:\n - orgId: 1\n name: sv status reports\n folder: canton-network\n interval: 1m\n rules:\n - uid: adlmhpz5iv4sgc\n title: Report Creation Time Lag\n condition: No recent report\n data:\n - refId: Report time lag\n relativeTimeRange:\n from: 600\n to: 0\n datasourceUid: prometheus\n model:\n datasource:\n type: prometheus\n uid: prometheus\n editorMode: code\n expr: time() - max by (report_publisher) (splice_sv_status_report_creation_time_us{namespace=~\".*\", report_publisher=~\".*\", canton_version=~\".*\"}) / 1000000\n instant: false\n interval: \"\"\n intervalMs: 30000\n legendFormat: '{{report_publisher}}'\n maxDataPoints: 43200\n range: true\n refId: Report time lag\n - refId: Latest report time lag\n relativeTimeRange:\n from: 600\n to: 0\n datasourceUid: __expr__\n model:\n conditions:\n - evaluator:\n params: []\n type: gt\n operator:\n type: and\n query:\n params:\n - B\n reducer:\n params: []\n type: last\n type: query\n datasource:\n type: __expr__\n uid: __expr__\n expression: Report time lag\n intervalMs: 1000\n maxDataPoints: 43200\n reducer: last\n refId: Latest report time lag\n settings:\n mode: dropNN\n type: reduce\n - refId: No recent report\n relativeTimeRange:\n from: 600\n to: 0\n datasourceUid: __expr__\n model:\n conditions:\n - evaluator:\n params:\n - 300\n type: gt\n operator:\n type: and\n query:\n params:\n - C\n reducer:\n params: []\n type: last\n type: query\n datasource:\n type: __expr__\n uid: __expr__\n expression: Latest report time lag\n intervalMs: 1000\n maxDataPoints: 43200\n refId: No recent report\n type: threshold\n dashboardUid: cdlm6c7fn7vuod\n panelId: 18\n noDataState: Alerting\n execErrState: Alerting\n for: 5m\n annotations:\n __dashboardUid__: cdlm6c7fn7vuod\n __panelId__: \"18\"\n severity: critical\n description: The SV {{ $labels.report_publisher }} has not submitted a status report recently\n runbook_url: \"\"\n summary: Status report creation time lag too high\n labels:\n \"team\": \"support\"\n isPaused: false\n",
"templates.yaml": "# config file version\napiVersion: 1\n\n# List of templates to import or update\n# source https://community.grafana.com/t/working-configuration-example-for-alerts-templating-telegram-and-slack/80988\ntemplates:\n - name: slack\n template: |\n {{ define \"slack_title\" }}\n {{ $hasCritical := false }}{{ $hasWarning := false }}{{ $hasInfo := false }}{{ $hasOthers := false }}\n {{- range .Alerts.Firing -}}\n {{- if eq .Annotations.severity \"critical\" -}}\n {{- $hasCritical = true -}}\n {{- else if eq .Annotations.severity \"warning\" -}}\n {{- $hasWarning = true -}}\n {{- else if eq .Annotations.severity \"info\" -}}\n {{- $hasInfo = true -}}\n {{- else -}}\n {{- $hasOthers = true -}}\n {{- end -}}\n {{- end -}}\n\n mock\n {{ if gt (len .Alerts.Firing) 0 }}\n {{- if $hasCritical }}\n 🔥 {{ len .Alerts.Firing }} Alert{{ if gt (len .Alerts.Firing) 1 }}s{{ end }} firing\n {{- else if $hasWarning }}\n ⚠️ {{ len .Alerts.Firing }} Alert{{ if gt (len .Alerts.Firing) 1 }}s{{ end }} firing\n {{- else }}\n :information_source: {{ len .Alerts.Firing }} Alert{{ if gt (len .Alerts.Firing) 1 }}s{{ end }} firing\n {{- end }}\n {{ end }}\n {{ if gt (len .Alerts.Resolved) 0 }} ✅ {{ len .Alerts.Resolved }} alert(s) resolved {{ end }}\n {{ end }}\n\n {{ define \"slack_message\" }}\n {{ $hasCritical := false }}{{ $hasWarning := false }}{{ $hasInfo := false }}{{ $hasOthers := false }}\n {{- range .Alerts.Firing -}}\n {{- if eq .Annotations.severity \"critical\" -}}\n {{- $hasCritical = true -}}\n {{- else if eq .Annotations.severity \"warning\" -}}\n {{- $hasWarning = true -}}\n {{- else if eq .Annotations.severity \"info\" -}}\n {{- $hasInfo = true -}}\n {{- else -}}\n {{- $hasOthers = true -}}\n {{- end -}}\n {{- end -}}\n {{ if $hasCritical }} 🔥Critical alerts {{ range .Alerts.Firing }} {{- if eq .Annotations.severity \"critical\" -}} {{ template \"slack_alert_firing\" .}} {{ end }} {{ end }} {{ end }}\n {{ if $hasWarning }} ⚠️Warning alerts {{ range .Alerts.Firing }} {{- if eq .Annotations.severity \"warning\" -}} {{ template \"slack_alert_firing\" .}} {{ end }} {{ end }} {{ end }}\n {{ if $hasInfo }} :information_source:Info alerts {{ range .Alerts.Firing }} {{- if eq .Annotations.severity \"info\" -}} {{ template \"slack_alert_firing\" .}} {{ end }} {{ end }} {{ end }}\n {{ if $hasOthers }} Other alerts {{ range .Alerts.Firing }} {{- if and (and (ne .Annotations.severity \"info\") (ne .Annotations.severity \"warning\")) (ne .Annotations.severity \"critical\") -}} {{ template \"slack_alert_firing\" . }} {{ end }} {{ end }} {{ end }}\n {{ if gt (len .Alerts.Resolved) 0 }} ✅Resolved Alerts {{ range .Alerts.Resolved }} {{ template \"slack_alert_resolved\" .}} {{ end }} {{ end }}\n {{ end }}\n\n {{ define \"slack_alert_firing\" }}\n *{{ .Labels.alertname }}*\n {{ .Annotations.summary }}\n {{ if .Annotations.description }}{{ .Annotations.description }}{{ end }}\n {{- if .Labels.service }}\n Service: {{ .Labels.service }}\n {{- end }}\n {{ template \"slack_gcloud_log_link\" . 
}}\n {{ end }}\n\n {{ define \"slack_alert_resolved\" }}\n *{{ .Labels.alertname }}*\n {{ if .Annotations.severity }}{{ .Annotations.severity }}{{ end }}\n {{ .Annotations.summary }}\n {{ if .Annotations.description }}{{ .Annotations.description }}{{ end }}\n {{ end }}\n\n {{ define \"slack_gcloud_log_link\" }}<https://console.cloud.google.com/logs/query;startTime={{ (.StartsAt.Add -600000000000).UTC.Format \"2006-01-02T15:04:05Z\" }};endTime={{ (.StartsAt.Add 600000000000).UTC.Format \"2006-01-02T15:04:05Z\" }};query=resource.labels.cluster_name%3D%22cn-mocknet%22%0A{{ .Labels.gcloud_filter }}?project=da-cn-devnet|cloud logs>{{ end }}\n\n {{ define \"slack_color\" -}}\n {{ $hasCritical := false }}{{ $hasWarning := false }}{{ $hasInfo := false }}{{ $hasOthers := false }}\n {{- range .Alerts.Firing -}}\n {{- if eq .Annotations.severity \"critical\" -}}\n {{- $hasCritical = true -}}\n {{- else if eq .Annotations.severity \"warning\" -}}\n {{- $hasWarning = true -}}\n {{- else if eq .Annotations.severity \"info\" -}}\n {{- $hasInfo = true -}}\n {{- else -}}\n {{- $hasOthers = true -}}\n {{- end -}}\n {{- end -}}\n {{ if eq .Status \"firing\" -}}\n {{ if $hasCritical -}}\n danger\n {{- else if $hasWarning -}}\n warning\n {{- else -}}\n #439FE0\n {{- end -}}\n {{ else -}}\n good\n {{- end }}\n {{- end }}\n\n {{ define \"support_email_message\" }}\n [ MAINNET-DA2-SVN-CRITICAL-ALERT 9f2b7e1a-4c3d-58b9-9f1e-df9c4a5b6e7d ]\n {{ if gt (len .Alerts.Firing) 0 }}**Firing**\n {{ template \"__text_alert_list\" .Alerts.Firing }}{{ if gt (len .Alerts.Resolved) 0 }}\n {{ end }}{{ end }}{{ if gt (len .Alerts.Resolved) 0 }}**Resolved**\n {{ template \"__text_alert_list\" .Alerts.Resolved }}{{ end }}{{ end }}\n",
- removed (old line 867):
"traffic_alerts.yaml": "apiVersion: 1\ngroups:\n - orgId: 1\n name: traffic\n folder: canton-network\n interval: 1m\n rules:\n - uid: adw5rd048zf9ca\n title: Wasted Traffic\n condition: wasted_traffic_threshold\n data:\n - refId: wasted_traffic\n relativeTimeRange:\n from: 600\n to: 0\n datasourceUid: prometheus\n model:\n editorMode: code\n expr: max by (member) (delta(daml_sequencer_traffic_control_wasted_traffic_total{member=~\"PAR::.*\",member!~\"PAR::Dummy-SV-1::.*\",member!~\"PAR::Dummy-SV-2::.*\"}[5m]))\n instant: true\n intervalMs: 1000\n legendFormat: __auto\n maxDataPoints: 43200\n range: false\n refId: wasted_traffic\n - refId: wasted_traffic_threshold\n relativeTimeRange:\n from: 600\n to: 0\n datasourceUid: __expr__\n model:\n conditions:\n - evaluator:\n params:\n - 1024\n type: gt\n operator:\n type: and\n query:\n params:\n - C\n reducer:\n params: []\n type: last\n type: query\n datasource:\n type: __expr__\n uid: __expr__\n expression: wasted_traffic\n intervalMs: 1000\n maxDataPoints: 43200\n refId: wasted_traffic_threshold\n type: threshold\n dashboardUid: fdnphvrryfq4gf\n panelId: 6\n noDataState: OK\n execErrState: Alerting\n for: 1m\n annotations:\n __dashboardUid__: fdnphvrryfq4gf\n __panelId__: \"6\"\n description: The rate of traffic wasted by member {{ $labels.member }} exceeded the threshold with a value of {{ humanize1024 $values.wasted_traffic.Value }} in the last 5m\n severity: critical\n summary: Traffic wasted by {{ $labels.member }} exceeded threshold ({{ humanize1024 1024 }}b over 5m)\n labels: {}\n isPaused: false\n"
+ added (new line 867):
"traffic_alerts.yaml": "apiVersion: 1\ngroups:\n - orgId: 1\n name: traffic\n folder: canton-network\n interval: 1m\n rules:\n - uid: adw5rd048zf9ca\n title: Wasted Traffic\n condition: wasted_traffic_threshold\n data:\n - refId: wasted_traffic\n relativeTimeRange:\n from: 600\n to: 0\n datasourceUid: prometheus\n model:\n editorMode: code\n expr: max by (member) (delta(daml_sequencer_traffic_control_wasted_traffic_total{member=~\"PAR::.*\",member!~\"PAR::Dummy-SV-1::.*\",member!~\"PAR::Dummy-SV-2::.*\"}[5m]))\n instant: true\n intervalMs: 1000\n legendFormat: __auto\n maxDataPoints: 43200\n range: false\n refId: wasted_traffic\n - refId: wasted_traffic_threshold\n relativeTimeRange:\n from: 600\n to: 0\n datasourceUid: __expr__\n model:\n conditions:\n - evaluator:\n params:\n - 1024\n type: gt\n operator:\n type: and\n query:\n params:\n - C\n reducer:\n params: []\n type: last\n type: query\n datasource:\n type: __expr__\n uid: __expr__\n expression: wasted_traffic\n intervalMs: 1000\n maxDataPoints: 43200\n refId: wasted_traffic_threshold\n type: threshold\n dashboardUid: fdnphvrryfq4gf\n panelId: 6\n noDataState: OK\n execErrState: Alerting\n for: 1m\n annotations:\n __dashboardUid__: fdnphvrryfq4gf\n __panelId__: \"6\"\n description: The rate of traffic wasted by member {{ $labels.member }} exceeded the threshold with a value of {{ humanize1024 $values.wasted_traffic.Value }} in the last 5m\n severity: critical\n summary: Traffic wasted by {{ $labels.member }} exceeded threshold ({{ humanize1024 1024 }}b over 5m)\n labels: {}\n isPaused: false\n - uid: 5dcddc9a5487\n title: Confirmation Requests Total\n condition: confirmation_requests_total_threshold\n data:\n - refId: confirmation_requests_total\n relativeTimeRange:\n from: 600\n to: 0\n datasourceUid: prometheus\n model:\n editorMode: code\n expr: max by (migration_id) (sum by (namespace, migration_id) (rate(daml_sequencer_block_events_total{type=\"send-confirmation-request\"}[5m])))\n instant: true\n intervalMs: 1000\n legendFormat: __auto\n maxDataPoints: 43200\n range: false\n refId: confirmation_requests_total\n - refId: confirmation_requests_total_threshold\n datasourceUid: __expr__\n model:\n conditions:\n - evaluator:\n params:\n - 10\n type: gt\n operator:\n type: and\n query:\n params: []\n reducer:\n params: []\n type: avg\n type: query\n datasource:\n name: Expression\n type: __expr__\n uid: __expr__\n expression: confirmation_requests_total\n hide: false\n refId: confirmation_requests_total_threshold\n type: threshold\n dashboardUid: fdjrxql2alblsd\n panelId: 1\n noDataState: OK\n execErrState: Alerting\n for: 1m\n annotations:\n __dashboardUid__: fdjrxql2alblsd\n __panelId__: \"1\"\n description: The total confirmation requests send rate on migration ID {{ $labels.migration_id }} exceeded the threshold with a value of {{ 10 }} in the last 5m\n severity: critical\n summary: Total confirmation requests on migration ID {{ $labels.migration_id }} exceeded the threshold (10 in the last 5m)\n labels: {}\n isPaused: false\n - uid: 88b8827c8d09\n title: Confirmation Requests By Member\n condition: confirmation_requests_by_member_threshold\n data:\n - refId: confirmation_requests_by_member\n relativeTimeRange:\n from: 600\n to: 0\n datasourceUid: prometheus\n model:\n editorMode: code\n expr: max by (member, migration_id) (rate(daml_sequencer_block_events_total{type=\"send-confirmation-request\"}[5m]))\n instant: true\n intervalMs: 1000\n legendFormat: __auto\n maxDataPoints: 43200\n range: false\n refId: 
confirmation_requests_by_member\n - refId: confirmation_requests_by_member_threshold\n datasourceUid: __expr__\n model:\n conditions:\n - evaluator:\n params:\n - 10\n type: gt\n operator:\n type: and\n query:\n params: []\n reducer:\n params: []\n type: avg\n type: query\n datasource:\n name: Expression\n type: __expr__\n uid: __expr__\n expression: confirmation_requests_by_member\n hide: false\n refId: confirmation_requests_by_member_threshold\n type: threshold\n dashboardUid: fdjrxql2alblsd\n panelId: 1\n noDataState: OK\n execErrState: Alerting\n for: 1m\n annotations:\n __dashboardUid__: fdjrxql2alblsd\n __panelId__: \"1\"\n description: The confirmation requests send rate of member {{ $labels.member }} on migration ID {{ $labels.migration_id }} exceeded the threshold with a value of {{ 10 }} in the last 5m\n severity: critical\n summary: Confirmation requests by member {{ $labels.member }} on migration ID {{ $labels.migration_id }} exceeded the threshold (10 in the last 5m)\n labels: {}\n isPaused: false\n"
},
"kind": "ConfigMap",
"metadata": {
