Skip to content

Commit 4c368dd

Browse files
committed
upload library charts as well; switch to versioned charts; start decoupling alerts
1 parent 2bd32e6 commit 4c368dd

File tree

10 files changed

+256
-233
lines changed

10 files changed

+256
-233
lines changed

.github/workflows/push-charts.yaml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ jobs:
3131
uses: tj-actions/changed-files@v46
3232
with:
3333
files: |
34+
helm/lib/**/Chart.yaml
3435
helm/bundles/**/Chart.yaml
3536
- name: Push chart to registry
3637
if: steps.changed-chart-yaml-files.outputs.all_changed_files != ''
@@ -43,4 +44,4 @@ jobs:
4344
helm package $CHART_DIR --dependency-update --destination $CHART_DIR
4445
CHART_PACKAGE=$(ls $CHART_DIR/*.tgz)
4546
helm push $CHART_PACKAGE oci://${{ env.REGISTRY }}/${{ github.repository }}/charts/
46-
done
47+
done

helm/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,6 @@
55

66
# Include the dependency symlinks.
77
!bundles/*/charts/**
8+
9+
# Exclude all Helm chart archives.
10+
**/*.tgz

helm/bundles/cortex-manila/Chart.lock

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
dependencies:
2+
- name: cortex-core
3+
repository: oci://ghcr.io/cobaltcore-dev/charts
4+
version: 0.23.4
5+
- name: cortex-postgres
6+
repository: oci://ghcr.io/cobaltcore-dev/charts
7+
version: 0.3.2
8+
- name: cortex-mqtt
9+
repository: oci://ghcr.io/cobaltcore-dev/charts
10+
version: 0.0.4
11+
digest: sha256:bb8101092da2a0c047f1f8ac2d918b8dc897252a2aa93e994d79a68a321cf607
12+
generated: "2025-07-23T12:23:44.795339+02:00"

helm/bundles/cortex-manila/Chart.yaml

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,11 @@ version: 0.0.1
99
appVersion: 0.1.0
1010
dependencies:
1111
- name: cortex-core
12-
repository: file://cortex-core
13-
- name: cortex-mqtt
14-
repository: file://cortex-mqtt
12+
repository: oci://ghcr.io/cobaltcore-dev/charts
13+
version: 0.23.4
1514
- name: cortex-postgres
16-
repository: file://cortex-postgres
15+
repository: oci://ghcr.io/cobaltcore-dev/charts
16+
version: 0.3.2
17+
- name: cortex-mqtt
18+
repository: oci://ghcr.io/cobaltcore-dev/charts
19+
version: 0.0.4
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
groups:
2+
- name: cortex-manila-alerts
3+
rules:
4+
- alert: CortexManilaSchedulerDown
5+
expr: |
6+
up{component="cortex-manila-scheduler"} != 1 or
7+
absent(up{component="cortex-manila-scheduler"})
8+
for: 1m
9+
labels:
10+
context: liveness
11+
dashboard: cortex/cortex
12+
service: cortex
13+
severity: warning
14+
support_group: workload-management
15+
annotations:
16+
summary: "Cortex external scheduler for Manila is down"
17+
description: >
18+
The Cortex scheduler is down. Initial placement requests from Manila will
19+
not be served. This is no immediate problem, since Manila will continue
20+
placing new shares. However, the placement will be less desirable.
21+
22+
- alert: CortexManilaSyncerDown
23+
expr: |
24+
up{component="cortex-manila-syncer"} != 1 or
25+
absent(up{component="cortex-manila-syncer"})
26+
for: 1m
27+
labels:
28+
context: liveness
29+
dashboard: cortex/cortex
30+
service: cortex
31+
severity: warning
32+
support_group: workload-management
33+
annotations:
34+
summary: "Cortex syncer is down"
35+
description: >
36+
The Cortex syncer is down. Cortex requires somewhat recent data from
37+
its datasources (OpenStack, Prometheus, etc.) to make accurate
38+
scheduling decisions. If this issue persists for a longer time, the
39+
database will slowly drift away from the actual state of the
40+
datacenter, which may lead to less desirable placement decisions.
41+
This is no immediate problem, since Manila will continue placing new
42+
shares.
43+
44+
- alert: CortexManilaExtractorDown
45+
expr: |
46+
up{component="cortex-manila-extractor"} != 1 or
47+
absent(up{component="cortex-manila-extractor"})
48+
for: 1m
49+
labels:
50+
context: liveness
51+
dashboard: cortex/cortex
52+
service: cortex
53+
severity: info
54+
support_group: workload-management
55+
annotations:
56+
summary: "Cortex extractor is down"
57+
description: >
58+
The Cortex extractor is down. This means that newly available data
59+
about the datacenter will not be used to extract scheduling knowledge.
60+
This is no immediate problem, since Manila will continue placing new
61+
shares. However, the placement will be less desirable.
62+
63+
- alert: CortexManilaHttpRequest400sTooHigh
64+
expr: rate(cortex_scheduler_api_request_duration_seconds_count{component="cortex-manila-scheduler",status=~"4.+"}[5m]) > 0.1
65+
for: 5m
66+
labels:
67+
context: api
68+
dashboard: cortex/cortex
69+
service: cortex
70+
severity: info
71+
support_group: workload-management
72+
annotations:
73+
summary: "HTTP request 4xx errors too high"
74+
description: >
75+
Cortex is responding to Manila initial placement requests with HTTP 4xx
76+
errors. This is expected when the scheduling request cannot be served
77+
by Cortex. However, it could also indicate that the Manila request
78+
format has changed and Cortex is unable to parse it.
79+
80+
- alert: CortexManilaHttpRequest500sTooHigh
81+
expr: rate(cortex_scheduler_api_request_duration_seconds_count{component="cortex-manila-scheduler",status=~"5.+"}[5m]) > 0.1
82+
for: 5m
83+
labels:
84+
context: api
85+
dashboard: cortex/cortex
86+
service: cortex
87+
severity: info
88+
support_group: workload-management
89+
annotations:
90+
summary: "HTTP request 5xx errors too high"
91+
description: >
92+
Cortex is responding to Manila initial placement requests with HTTP 5xx
93+
errors. This is not expected and indicates that Cortex is having some
94+
internal problem. Manila will continue to place new shares, but the
95+
placement will be less desirable. Thus, no immediate action is needed.
96+
97+
- alert: CortexManilaHighMemoryUsage
98+
expr: process_resident_memory_bytes{component=~"cortex-manila-.*"} > 1000 * 1024 * 1024
99+
for: 5m
100+
labels:
101+
context: memory
102+
dashboard: cortex/cortex
103+
service: cortex
104+
severity: info
105+
support_group: workload-management
106+
annotations:
107+
summary: "Cortex {{`{{$labels.component}}`}} uses too much memory"
108+
description: >
109+
Cortex should not be using more than 1000 MiB of memory. Usually it
110+
should use much less, so there may be a memory leak or other changes
111+
that are causing the memory usage to increase significantly.
112+
113+
- alert: CortexManilaHighCPUUsage
114+
expr: rate(process_cpu_seconds_total{component=~"cortex-manila-.*"}[1m]) > 0.5
115+
for: 5m
116+
labels:
117+
context: cpu
118+
dashboard: cortex/cortex
119+
service: cortex
120+
severity: info
121+
support_group: workload-management
122+
annotations:
123+
summary: "Cortex {{`{{$labels.component}}`}} uses too much CPU"
124+
description: >
125+
Cortex should not be using more than 50% of a single CPU core. Usually
126+
it should use much less, so there may be a CPU leak or other changes
127+
that are causing the CPU usage to increase significantly.
128+
129+
- alert: CortexManilaSyncNotSuccessful
130+
expr: cortex_sync_request_processed_total{component=~"cortex-manila-.*"} - cortex_sync_request_duration_seconds_count{component=~"cortex-manila-.*"} > 0
131+
for: 5m
132+
labels:
133+
context: syncstatus
134+
dashboard: cortex/cortex
135+
service: cortex
136+
severity: info
137+
support_group: workload-management
138+
annotations:
139+
summary: "Sync not successful"
140+
description: >
141+
Cortex experienced an issue syncing data from a datasource. This may
142+
happen when the datasource (OpenStack, Prometheus, etc.) is down or
143+
the sync module is misconfigured. No immediate action is needed, since
144+
the sync module will retry the sync operation and the currently synced
145+
data will be kept. However, when this problem persists for a longer
146+
time the service will have a less recent view of the datacenter.
147+
148+
- alert: CortexManilaSyncObjectsDroppedToZero
149+
expr: cortex_sync_objects{component=~"cortex-manila-.*"} == 0
150+
for: 5m
151+
labels:
152+
context: syncobjects
153+
dashboard: cortex/cortex
154+
service: cortex
155+
severity: info
156+
support_group: workload-management
157+
annotations:
158+
summary: "Cortex is not syncing any new data from {{`{{$labels.datasource}}`}}"
159+
description: >
160+
Cortex is not syncing any objects from a datasource. This may happen
161+
when the datasource (OpenStack, Prometheus, etc.) is down or the sync
162+
module is misconfigured. No immediate action is needed, since the sync
163+
module will retry the sync operation and the currently synced data will
164+
be kept. However, when this problem persists for a longer time the
165+
service will have a less recent view of the datacenter.
166+
167+
- alert: CortexManilaSyncObjectsTooHigh
168+
expr: cortex_sync_objects{component=~"cortex-manila-.*"} > 1000000
169+
for: 5m
170+
labels:
171+
context: syncobjects
172+
dashboard: cortex/cortex
173+
service: cortex
174+
severity: info
175+
support_group: workload-management
176+
annotations:
177+
summary: "Cortex is syncing unexpectedly many objects from {{`{{$labels.datasource}}`}}"
178+
description: >
179+
Cortex is syncing more than 1 million objects from a datasource. This
180+
may happen when the datasource (OpenStack, Prometheus, etc.) returns
181+
unexpectedly many objects, or when the database cannot drop old objects.
182+
No immediate action is needed, but should this condition persist for a
183+
longer time, the database may fill up and crash.
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
groups:
2+
- name: cortex-manila-mqtt-alerts
3+
rules:
4+
- alert: CortexManilaTooManyMQTTConnectionAttempts
5+
expr: rate(cortex_mqtt_connection_attempts_total{component=~"cortex-manila-.*"}[5m]) > 0.1
6+
for: 1m
7+
labels:
8+
context: mqtt
9+
annotations:
10+
summary: "Cortex is trying to connect to MQTT too often"
11+
description: >
12+
Cortex is trying to connect to the MQTT broker too often. This may
13+
happen when the broker is down or the connection parameters are
14+
misconfigured.
15+
16+
- alert: CortexManilaTooManyDBConnectionAttempts
17+
expr: rate(cortex_db_connection_attempts_total{component=~"cortex-manila-.*"}[5m]) > 0.1
18+
for: 5m
19+
labels:
20+
context: db
21+
annotations:
22+
summary: "Cortex is trying to connect to the database too often"
23+
description: >
24+
Cortex is trying to connect to the database too often. This may happen
25+
when the database is down or the connection parameters are misconfigured.

0 commit comments

Comments
 (0)