# Patch summary (preamble text placed before the first "diff --git" header).
#
# Purpose: unified git diff against four files under
# applications/base/services/kube-prometheus-stack/.
#
# 1. helm-values/alerting-rules-overrides.yaml
#    - Moves the long `expr`, `summary` and `description` scalars of the
#      TooManyContainerRestarts, HighPodRestartRate and KubePodNotReadyCritical
#      alerts onto the line after their key, and switches the `app` label from
#      single to double quotes. This appears to be formatting-only; confirm the
#      rendered values are unchanged.
# 2. helm-values/alertmanager-overrides.yaml
#    - Switches 'null' -> "null" (route receiver and receiver name) and
#      '' -> "" (webhook_url). Quote style only.
# 3. helmrelease.yaml
#    - Drops the commented-out Secret entries for alerting-rules-overrides.yaml
#      and alertmanager-overrides.yaml, and adds an active Secret entry with
#      valuesKey alertmanager-overrides.yaml.
# 4. kustomization.yaml
#    - Drops the two commented-out secretGenerator file entries and adds
#      alertmanager-overrides.yaml=helm-values/alertmanager-overrides.yaml.
#
# NOTE(review): in the alertmanager-overrides.yaml context, the line
#   matchers: [severity =~ "critical']
#   opens with a double quote and closes with a single quote. This patch starts
#   loading that file, so confirm the matcher is valid and fix it in the target
#   file (context lines here must keep matching the file as it is).
# NOTE(review): the KubePodNotReadyCritical selector lists namespace=~".*"
#   twice; presumably one occurrence is redundant -- confirm.
# NOTE(review): HighPodRestartRate compares rate(...[5m]) against > 2; verify
#   that threshold is intended for a rate() result.
# NOTE(review): alerting-rules-overrides.yaml is reformatted here, but no entry
#   for it is added to helmrelease.yaml or kustomization.yaml in this patch --
#   confirm that leaving the rules unloaded is intentional.
# NOTE(review): headers, hunk markers and hunk bodies share physical lines
#   below; the original line breaks look lost and presumably must be restored
#   before this patch can be applied -- TODO confirm.
diff --git a/applications/base/services/kube-prometheus-stack/helm-values/alerting-rules-overrides.yaml b/applications/base/services/kube-prometheus-stack/helm-values/alerting-rules-overrides.yaml index 56a52e6..37f839e 100644 --- a/applications/base/services/kube-prometheus-stack/helm-values/alerting-rules-overrides.yaml +++ b/applications/base/services/kube-prometheus-stack/helm-values/alerting-rules-overrides.yaml @@ -5,17 +5,20 @@ additionalPrometheusRulesMap: - name: pod-state-alerts rules: - alert: TooManyContainerRestarts - expr: sum(increase(kube_pod_container_status_restarts_total{pod_template_hash=""}[15m])) + expr: + sum(increase(kube_pod_container_status_restarts_total{pod_template_hash=""}[15m])) by (pod,namespace,container) > 5 for: 0m labels: severity: critical - app: '{{ $labels.pod }}' + app: "{{ $labels.pod }}" annotations: - summary: Container named {{ $labels.container }} in {{ $labels.pod }} + summary: + Container named {{ $labels.container }} in {{ $labels.pod }} in {{ $labels.namespace }} has restarted too many times in a short period and needs to be investigated. - description: "Namespace: {{$labels.namespace}}\nPod name: {{$labels.pod}}\n\ + description: + "Namespace: {{$labels.namespace}}\nPod name: {{$labels.pod}}\n\ Container name: {{$labels.container}}\n" - alert: HighPodRestartRate expr: rate(kube_pod_container_status_restarts_total[5m]) > 2 @@ -24,10 +27,12 @@ additionalPrometheusRulesMap: severity: warning annotations: summary: High pod restart count detected - description: Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} + description: + Pod {{ $labels.pod }} in namespace {{ $labels.namespace }} is restarting frequently, which may indicate network instability. 
- alert: KubePodNotReadyCritical - expr: sum by(namespace, pod) ( max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",namespace=~".*",phase=~"Pending|Unknown", + expr: + sum by(namespace, pod) ( max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",namespace=~".*",phase=~"Pending|Unknown", namespace=~".*"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace, pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"})) ) > 0 @@ -36,5 +41,6 @@ additionalPrometheusRulesMap: severity: critical annotations: summary: Pod has been in a non-ready state for more than 5 minutes. - description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been + description: + Pod {{ $labels.namespace }}/{{ $labels.pod }} has been in a non-ready state for longer than 5 minutes. diff --git a/applications/base/services/kube-prometheus-stack/helm-values/alertmanager-overrides.yaml b/applications/base/services/kube-prometheus-stack/helm-values/alertmanager-overrides.yaml index b17fdcf..aa86932 100644 --- a/applications/base/services/kube-prometheus-stack/helm-values/alertmanager-overrides.yaml +++ b/applications/base/services/kube-prometheus-stack/helm-values/alertmanager-overrides.yaml @@ -23,7 +23,7 @@ alertmanager: group_interval: 60s repeat_interval: 12h routes: - - receiver: 'null' + - receiver: "null" matchers: [alertname = "Watchdog"] - receiver: warning_alerts_receiver continue: false @@ -32,11 +32,11 @@ alertmanager: continue: false matchers: [severity =~ "critical'] receivers: - - name: 'null' + - name: "null" - name: warning_alerts_receiver msteams_configs: - send_resolved: true - webhook_url: '' + webhook_url: "" - name: alert_proxy_receiver webhook_configs: - url: http://alert-proxy.rackspace.svc.cluster.local/alert/process diff --git a/applications/base/services/kube-prometheus-stack/helmrelease.yaml b/applications/base/services/kube-prometheus-stack/helmrelease.yaml index 61d2f5a..f45d111 100644 --- 
a/applications/base/services/kube-prometheus-stack/helmrelease.yaml +++ b/applications/base/services/kube-prometheus-stack/helmrelease.yaml @@ -31,12 +31,9 @@ spec: - kind: Secret name: kube-prometheus-stack-values-base valuesKey: hardened.yaml -# - kind: Secret -# name: kube-prometheus-stack-values-base -# valuesKey: alerting-rules-overrides.yaml -# - kind: Secret -# name: kube-prometheus-stack-values-base -# valuesKey: alertmanager-overrides.yaml + - kind: Secret + name: kube-prometheus-stack-values-base + valuesKey: alertmanager-overrides.yaml - kind: Secret name: kube-prometheus-stack-values-base valuesKey: prometheus-overrides.yaml diff --git a/applications/base/services/kube-prometheus-stack/kustomization.yaml b/applications/base/services/kube-prometheus-stack/kustomization.yaml index 28082a5..fb16b7c 100644 --- a/applications/base/services/kube-prometheus-stack/kustomization.yaml +++ b/applications/base/services/kube-prometheus-stack/kustomization.yaml @@ -8,8 +8,7 @@ secretGenerator: type: Opaque files: - hardened.yaml=helm-values/hardened-values-v0.0.1.yaml -# - alerting-rules-overrides.yaml=helm-values/alerting-rules-overrides.yaml -# - alertmanager-overrides.yaml=helm-values/alertmanager-overrides.yaml + - alertmanager-overrides.yaml=helm-values/alertmanager-overrides.yaml - prometheus-overrides.yaml=helm-values/prometheus-overrides.yaml options: disableNameSuffixHash: true