groups:
  - name: cortex-nova-alerts
    rules:
      - alert: CortexNovaInitialPlacementDown
        expr: |
          up{component="cortex-nova-scheduler"} != 1 or
          absent(up{component="cortex-nova-scheduler"})
        for: 1m
        labels:
          context: liveness
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Cortex initial placement for Nova is down"
          description: >
            The Cortex initial placement service is down. Initial placement requests
            from Nova will not be served. This is not an immediate problem, since Nova
            will continue placing new VMs. However, the placement will be less desirable.

      - alert: CortexNovaSyncerDown
        expr: |
          up{component="cortex-nova-syncer"} != 1 or
          absent(up{component="cortex-nova-syncer"})
        for: 1m
        labels:
          context: liveness
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Cortex syncer is down"
          description: >
            The Cortex syncer is down. Cortex requires reasonably recent data from
            its datasources (OpenStack, Prometheus, etc.) to make accurate
            scheduling decisions. If this issue persists for a longer time, the
            database will slowly drift away from the actual state of the
            datacenter, which may lead to less desirable placement decisions.
            This is not an immediate problem, since Nova will continue placing new VMs.

      - alert: CortexNovaExtractorDown
        expr: |
          up{component="cortex-nova-extractor"} != 1 or
          absent(up{component="cortex-nova-extractor"})
        for: 1m
        labels:
          context: liveness
          dashboard: cortex/cortex
          service: cortex
          severity: warning
          support_group: workload-management
        annotations:
          summary: "Cortex extractor is down"
          description: >
            The Cortex extractor is down. This means that newly available data
            about the datacenter will not be used to extract scheduling knowledge.
            This is not an immediate problem, since Nova will continue placing new VMs.
            However, the placement will be less desirable.

      - alert: CortexNovaHttpRequest400sTooHigh
        expr: rate(cortex_scheduler_api_request_duration_seconds_count{component="cortex-nova-scheduler",status=~"4.+"}[5m]) > 0.1
        for: 5m
        labels:
          context: api
          dashboard: cortex/cortex
          service: cortex
          severity: info
          support_group: workload-management
        annotations:
          summary: "HTTP 4xx error rate too high"
          description: >
            Cortex is responding to Nova initial placement requests with HTTP 4xx
            errors. This is expected when the scheduling request cannot be served
            by Cortex. However, it could also indicate that the Nova request
            format has changed and Cortex is unable to parse it.

      - alert: CortexNovaHttpRequest500sTooHigh
        expr: rate(cortex_scheduler_api_request_duration_seconds_count{component="cortex-nova-scheduler",status=~"5.+"}[5m]) > 0.1
        for: 5m
        labels:
          context: api
          dashboard: cortex/cortex
          service: cortex
          severity: info
          support_group: workload-management
        annotations:
          summary: "HTTP 5xx error rate too high"
          description: >
            Cortex is responding to Nova initial placement requests with HTTP 5xx
            errors. This is not expected and indicates an internal problem in
            Cortex. Nova will continue to place new VMs, but the placement will
            be less desirable. Thus, no immediate action is needed.

      - alert: CortexNovaHighMemoryUsage
        expr: process_resident_memory_bytes{component=~"cortex-nova-.*"} > 1000 * 1024 * 1024
        for: 5m
        labels:
          context: memory
          dashboard: cortex/cortex
          service: cortex
          severity: info
          support_group: workload-management
        annotations:
          summary: "Cortex {{`{{$labels.component}}`}} uses too much memory"
          description: >
            Cortex should not be using more than 1000 MiB of memory. Usually it
            should use much less, so there may be a memory leak or other changes
            that are causing the memory usage to increase significantly.

      - alert: CortexNovaHighCPUUsage
        expr: rate(process_cpu_seconds_total{component=~"cortex-nova-.*"}[1m]) > 0.5
        for: 5m
        labels:
          context: cpu
          dashboard: cortex/cortex
          service: cortex
          severity: info
          support_group: workload-management
        annotations:
          summary: "Cortex {{`{{$labels.component}}`}} uses too much CPU"
          description: >
            Cortex should not be using more than 50% of a single CPU core. Usually
            it should use much less, so there may be a regression or other changes
            that are causing the CPU usage to increase significantly.

      - alert: CortexNovaSyncNotSuccessful
        expr: cortex_sync_request_processed_total{component=~"cortex-nova-.*"} - cortex_sync_request_duration_seconds_count{component=~"cortex-nova-.*"} > 0
        for: 5m
        labels:
          context: syncstatus
          dashboard: cortex/cortex
          service: cortex
          severity: info
          support_group: workload-management
        annotations:
          summary: "Sync not successful"
          description: >
            Cortex experienced an issue syncing data from a datasource. This may
            happen when the datasource (OpenStack, Prometheus, etc.) is down or
            the sync module is misconfigured. No immediate action is needed, since
            the sync module will retry the sync operation and the currently synced
            data will be kept. However, if this problem persists for a longer
            time, the service will have an increasingly stale view of the datacenter.

      - alert: CortexNovaSyncObjectsDroppedToZero
        expr: cortex_sync_objects{component=~"cortex-nova-.*"} == 0
        for: 5m
        labels:
          context: syncobjects
          dashboard: cortex/cortex
          service: cortex
          severity: info
          support_group: workload-management
        annotations:
          summary: "Cortex is not syncing any new data from {{`{{$labels.datasource}}`}}"
          description: >
            Cortex is not syncing any objects from a datasource. This may happen
            when the datasource (OpenStack, Prometheus, etc.) is down or the sync
            module is misconfigured. No immediate action is needed, since the sync
            module will retry the sync operation and the currently synced data will
            be kept. However, if this problem persists for a longer time, the
            service will have an increasingly stale view of the datacenter.

      - alert: CortexNovaSyncObjectsTooHigh
        expr: cortex_sync_objects{component=~"cortex-nova-.*"} > 1000000
        for: 5m
        labels:
          context: syncobjects
          dashboard: cortex/cortex
          service: cortex
          severity: info
          support_group: workload-management
        annotations:
          summary: "Cortex is syncing an unexpectedly large number of objects from {{`{{$labels.datasource}}`}}"
          description: >
            Cortex is syncing more than 1 million objects from a datasource. This
            may happen when the datasource (OpenStack, Prometheus, etc.) returns
            an unexpectedly large number of objects, or when the database cannot
            drop old objects. No immediate action is needed, but should this
            condition persist for a longer time, the database may fill up and crash.

      - alert: CortexNovaTooManyMQTTConnectionAttempts
        expr: rate(cortex_mqtt_connection_attempts_total{component=~"cortex-nova-.*"}[5m]) > 0.1
        for: 1m
        labels:
          context: mqtt
          dashboard: cortex/cortex
          service: cortex
          severity: info
          support_group: workload-management
        annotations:
          summary: "Cortex is trying to connect to MQTT too often"
          description: >
            Cortex is trying to connect to the MQTT broker too often. This may
            happen when the broker is down or the connection parameters are
            misconfigured.

      - alert: CortexNovaTooManyDBConnectionAttempts
        expr: rate(cortex_db_connection_attempts_total{component=~"cortex-nova-.*"}[5m]) > 0.1
        for: 5m
        labels:
          context: db
          dashboard: cortex/cortex
          service: cortex
          severity: info
          support_group: workload-management
        annotations:
          summary: "Cortex is trying to connect to the database too often"
          description: >
            Cortex is trying to connect to the database too often. This may happen
            when the database is down or the connection parameters are misconfigured.

      - alert: CortexNovaHostCPUUtilizationAbove100Percent
        expr: cortex_host_utilization_per_host_pct{component=~"cortex-nova-.*",resource="cpu"} > 100
        for: 5m
        labels:
          context: hostutilization
          dashboard: cortex/cortex
          service: cortex
          severity: info
          support_group: workload-management
        annotations:
          summary: "CPU utilization on host {{`{{$labels.compute_host_name}}`}} is above 100%"
          description: >
            OpenStack Placement reports CPU utilization above 100% for host
            {{`{{$labels.compute_host_name}}`}} in AZ {{`{{$labels.availability_zone}}`}}
            for over 5 minutes.
            This can happen if there are VMs in the SHUTOFF state: these VMs still
            consume resources in Placement, but not in the underlying infrastructure
            (e.g., VMware). As a result, it is possible to manually migrate additional
            VMs onto a host with shut off VMs. The combined resource allocation (from
            running and shut off VMs) can then exceed the host's capacity, causing
            Placement to report utilization above 100%. This is expected behavior, as
            powering on the shut off VMs would overcommit the host.
            Another cause may be shutting down a node without migrating its VMs. The
            total capacity drops, but Placement still accounts for the shut off VMs’
            resource usage.
            This situation should be investigated and resolved to ensure accurate
            resource accounting and avoid operational issues.

      - alert: CortexNovaHostMemoryUtilizationAbove100Percent
        expr: cortex_host_utilization_per_host_pct{component=~"cortex-nova-.*",resource="memory"} > 100
        for: 5m
        labels:
          context: hostutilization
          dashboard: cortex/cortex
          service: cortex
          severity: info
          support_group: workload-management
        annotations:
          summary: "Memory utilization on host {{`{{$labels.compute_host_name}}`}} is above 100%"
          description: >
            OpenStack Placement reports memory utilization above 100% for host
            {{`{{$labels.compute_host_name}}`}} in AZ {{`{{$labels.availability_zone}}`}}
            for over 5 minutes.
            This can happen if there are VMs in the SHUTOFF state: these VMs still
            consume resources in Placement, but not in the underlying infrastructure
            (e.g., VMware). As a result, it is possible to manually migrate additional
            VMs onto a host with shut off VMs. The combined resource allocation (from
            running and shut off VMs) can then exceed the host's capacity, causing
            Placement to report utilization above 100%. This is expected behavior, as
            powering on the shut off VMs would overcommit the host.
            Another cause may be shutting down a node without migrating its VMs. The
            total capacity drops, but Placement still accounts for the shut off VMs’
            resource usage.
            This situation should be investigated and resolved to ensure accurate
            resource accounting and avoid operational issues.

      - alert: CortexNovaHostDiskUtilizationAbove100Percent
        expr: cortex_host_utilization_per_host_pct{component=~"cortex-nova-.*",resource="disk"} > 100
        for: 5m
        labels:
          context: hostutilization
          dashboard: cortex/cortex
          service: cortex
          severity: info
          support_group: workload-management
        annotations:
          summary: "Disk utilization on host {{`{{$labels.compute_host_name}}`}} is above 100%"
          description: >
            OpenStack Placement reports disk utilization above 100% for host
            {{`{{$labels.compute_host_name}}`}} in AZ {{`{{$labels.availability_zone}}`}}
            for over 5 minutes.
            This can happen if there are VMs in the SHUTOFF state: these VMs still
            consume resources in Placement, but not in the underlying infrastructure
            (e.g., VMware). As a result, it is possible to manually migrate additional
            VMs onto a host with shut off VMs. The combined resource allocation (from
            running and shut off VMs) can then exceed the host's capacity, causing
            Placement to report utilization above 100%. This is expected behavior, as
            powering on the shut off VMs would overcommit the host.
            Another cause may be shutting down a node without migrating its VMs. The
            total capacity drops, but Placement still accounts for the shut off VMs’
            resource usage.
            This situation should be investigated and resolved to ensure accurate
            resource accounting and avoid operational issues.