---
title: Monitoring and alerting on ingest
description: Set up Prometheus alerting rules to detect common ingestion issues before they impact your logging pipeline.
weight: 100
---

# Monitoring and alerting on ingest

Set up Prometheus alerting rules to detect common ingestion issues before they impact your logging pipeline. Create a file named `loki-ingestion-alerts.yml` (or add the rules to your existing Prometheus rules file) with the following alerting rules:

```yaml
# File: loki-ingestion-alerts.yml
# Add this file to your Prometheus rule_files configuration:
# rule_files:
#   - /etc/prometheus/rules/loki-ingestion-alerts.yml

groups:
  - name: loki_ingestion
    rules:
      # Rate limit alerts
      - alert: LokiRequestRateLimited
        expr: sum by (tenant) (rate(loki_discarded_samples_total{reason="rate_limited"}[5m])) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Tenant {{ $labels.tenant }} is being rate limited"
          description: "Tenant {{ $labels.tenant }} has exceeded ingestion rate limits. Consider increasing limits or reducing log volume."

      # Stream limit alerts
      - alert: LokiStreamLimitReached
        expr: sum by (tenant) (rate(loki_discarded_samples_total{reason="stream_limit"}[5m])) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Tenant {{ $labels.tenant }} has reached stream limit"
          description: "Tenant {{ $labels.tenant }} has exceeded max_global_streams_per_user. Reduce label cardinality or increase the limit."

      # WAL alerts
      - alert: LokiWALDiskFull
        expr: increase(loki_ingester_wal_disk_full_failures_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "WAL disk is full on {{ $labels.instance }}"
          description: "Ingester {{ $labels.instance }} cannot write to WAL due to disk space. Data durability is compromised."

      # Validation errors
      - alert: LokiHighValidationErrors
        expr: sum by (reason) (rate(loki_discarded_samples_total{reason=~"invalid_labels|line_too_long|out_of_order"}[5m])) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High rate of {{ $labels.reason }} validation errors"
          description: "Loki is discarding logs due to {{ $labels.reason }}. Check your log shipping configuration."
```

To enable these alerts, add the rules file to your Prometheus configuration:

```yaml
# prometheus.yml
rule_files:
  - /etc/prometheus/rules/loki-ingestion-alerts.yml
```

{{< admonition type="tip" >}}
The [Loki mixin](https://github.com/grafana/loki/tree/main/production/loki-mixin) provides a comprehensive set of pre-built dashboards and alerting rules for monitoring Loki in production.
{{< /admonition >}}