Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,20 @@ additionalPrometheusRulesMap:
- name: pod-state-alerts
rules:
- alert: TooManyContainerRestarts
expr: sum(increase(kube_pod_container_status_restarts_total{pod_template_hash=""}[15m]))
expr:
sum(increase(kube_pod_container_status_restarts_total{pod_template_hash=""}[15m]))
by (pod,namespace,container) > 5
for: 0m
labels:
severity: critical
app: '{{ $labels.pod }}'
app: "{{ $labels.pod }}"
annotations:
summary: Container named {{ $labels.container }} in {{ $labels.pod }}
summary:
Container named {{ $labels.container }} in {{ $labels.pod }}
in {{ $labels.namespace }} has restarted too many times in a short
period and needs to be investigated.
description: "Namespace: {{$labels.namespace}}\nPod name: {{$labels.pod}}\n\
description:
"Namespace: {{$labels.namespace}}\nPod name: {{$labels.pod}}\n\
Container name: {{$labels.container}}\n"
- alert: HighPodRestartRate
expr: rate(kube_pod_container_status_restarts_total[5m]) > 2
Expand All @@ -24,10 +27,12 @@ additionalPrometheusRulesMap:
severity: warning
annotations:
summary: High pod restart count detected
description: Pod {{ $labels.pod }} in namespace {{ $labels.namespace }}
description:
Pod {{ $labels.pod }} in namespace {{ $labels.namespace }}
is restarting frequently, which may indicate network instability.
- alert: KubePodNotReadyCritical
expr: sum by(namespace, pod) ( max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",namespace=~".*",phase=~"Pending|Unknown",
expr:
sum by(namespace, pod) ( max by(namespace, pod) (kube_pod_status_phase{job="kube-state-metrics",namespace=~".*",phase=~"Pending|Unknown",
namespace=~".*"}) * on(namespace, pod) group_left(owner_kind) topk by(namespace,
pod) (1, max by(namespace, pod, owner_kind) (kube_pod_owner{owner_kind!="Job"}))
) > 0
Expand All @@ -36,5 +41,6 @@ additionalPrometheusRulesMap:
severity: critical
annotations:
summary: Pod has been in a non-ready state for more than 5 minutes.
description: Pod {{ $labels.namespace }}/{{ $labels.pod }} has been
description:
Pod {{ $labels.namespace }}/{{ $labels.pod }} has been
in a non-ready state for longer than 5 minutes.
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ alertmanager:
group_interval: 60s
repeat_interval: 12h
routes:
- receiver: 'null'
- receiver: "null"
matchers: [alertname = "Watchdog"]
- receiver: warning_alerts_receiver
continue: false
Expand All @@ -32,11 +32,11 @@ alertmanager:
continue: false
matchers: [severity =~ "critical"]
receivers:
- name: 'null'
- name: "null"
- name: warning_alerts_receiver
msteams_configs:
- send_resolved: true
webhook_url: ''
webhook_url: ""
- name: alert_proxy_receiver
webhook_configs:
- url: http://alert-proxy.rackspace.svc.cluster.local/alert/process
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,12 +31,9 @@ spec:
- kind: Secret
name: kube-prometheus-stack-values-base
valuesKey: hardened.yaml
# - kind: Secret
# name: kube-prometheus-stack-values-base
# valuesKey: alerting-rules-overrides.yaml
# - kind: Secret
# name: kube-prometheus-stack-values-base
# valuesKey: alertmanager-overrides.yaml
- kind: Secret
name: kube-prometheus-stack-values-base
valuesKey: alertmanager-overrides.yaml
- kind: Secret
name: kube-prometheus-stack-values-base
valuesKey: prometheus-overrides.yaml
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ secretGenerator:
type: Opaque
files:
- hardened.yaml=helm-values/hardened-values-v0.0.1.yaml
# - alerting-rules-overrides.yaml=helm-values/alerting-rules-overrides.yaml
# - alertmanager-overrides.yaml=helm-values/alertmanager-overrides.yaml
- alertmanager-overrides.yaml=helm-values/alertmanager-overrides.yaml
- prometheus-overrides.yaml=helm-values/prometheus-overrides.yaml
options:
disableNameSuffixHash: true
Loading