Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 20 additions & 6 deletions docs/selfcheck.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,12 @@ The selfcheck system uses the following environment variables:
- `MESSAGES_SELFCHECK_INTERVAL`: Interval in seconds between self-checks (for instance: `600` - 10 minutes)
- `MESSAGES_SELFCHECK_TIMEOUT`: Timeout in seconds for message reception (for instance: `60` - 60 seconds)

In addition, the following Prometheus-specific environment variables are supported:

- `MESSAGES_SELFCHECK_PROMETHEUS_METRICS_ENABLED`: Enable or disable Prometheus metrics reporting (default: `False`)
- `MESSAGES_SELFCHECK_PROMETHEUS_METRICS_PUSHGATEWAY_URL`: URL of the Prometheus Pushgateway to which metrics are sent (default: `None`)
- `MESSAGES_SELFCHECK_PROMETHEUS_METRICS_PREFIX`: Prefix for all Prometheus metrics names (default: empty string)

## Usage

### Manual Execution
Expand Down Expand Up @@ -64,28 +70,36 @@ If the selfcheck fails, it will return an error message and attempt to clean up
- **Reception timeout**: The message was not received within the timeout period (configurable via `MESSAGES_SELFCHECK_TIMEOUT`)
- **Integrity verification failure**: The received message does not contain the expected secret or has structural issues

## Monitoring
## Logging

The selfcheck system logs all operations with appropriate log levels:

- `INFO`: Normal operation progress
- `WARNING`: Non-critical issues (e.g., parsing errors for individual messages)
- `ERROR`: Critical failures that cause the self-check to fail



## Integration with Monitoring

The selfcheck results can be integrated with monitoring systems by:

1. **Checking the success status** of the selfcheck task
2. **Monitoring timing metrics** to detect performance degradation
3. **Alerting on failures** to quickly identify delivery pipeline issues
4. **Tracking trends** in reception times to identify system bottlenecks

## Monitoring

When `MESSAGES_SELFCHECK_PROMETHEUS_METRICS_ENABLED` is set to `True` and `MESSAGES_SELFCHECK_PROMETHEUS_METRICS_PUSHGATEWAY_URL` is set to the URL of your [Prometheus Pushgateway](https://github.com/prometheus/pushgateway), the job will push the following metrics:

- `selfcheck_start_time`: Start timestamp of the self check
- `selfcheck_end_time`: End timestamp of the self check
- `selfcheck_success`: 1 if the self check succeeded, 0 if it failed
- `selfcheck_send_duration_seconds`: Time taken to send the test message (seconds), only on successful send
- `selfcheck_reception_duration_seconds`: Time taken to receive the test message (seconds), only on successful reception

All metric names can be prefixed using the `MESSAGES_SELFCHECK_PROMETHEUS_METRICS_PREFIX` environment variable.

## Security Considerations

- The selfcheck uses dedicated test mailboxes that are separate from user data
- Test messages are automatically cleaned up after verification
- The secret string is configurable to prevent predictable patterns
- All test data is isolated from production user data
- All test data is isolated from production user data
71 changes: 71 additions & 0 deletions src/backend/core/management/commands/selfcheck.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,70 @@

from django.conf import settings
from django.core.management.base import BaseCommand
from prometheus_client import CollectorRegistry, Gauge, push_to_gateway

from core.mda.selfcheck import run_selfcheck

class SelfCheckMetricsBase:
    """No-op metrics recorder for the selfcheck command.

    Every hook does nothing and returns ``None``. Subclass it and override
    the hooks to report selfcheck results to a concrete metrics backend.
    Using this base class directly lets callers invoke the hooks
    unconditionally, whether or not a backend is configured.
    """

    def mark_start(self):
        """Record that the selfcheck has started (no-op)."""

    def mark_end(self):
        """Record that the selfcheck has finished (no-op)."""

    def mark_failure(self):
        """Record that the selfcheck failed (no-op)."""

    def mark_success(self):
        """Record that the selfcheck succeeded (no-op)."""

    def write_send_time(self, send_time):
        """Record the time taken to send the test message (no-op)."""

    def write_reception_time(self, reception_time):
        """Record the time taken to receive the test message (no-op)."""

    def send_metrics(self):
        """Flush the recorded metrics to the backend (no-op)."""


class SelfCheckPrometheusMetrics(SelfCheckMetricsBase):
    """Prometheus metrics for the selfcheck process.

    Gauges are registered in a dedicated ``CollectorRegistry`` and pushed to
    the Pushgateway configured in
    ``MESSAGES_SELFCHECK_PROMETHEUS_METRICS_PUSHGATEWAY_URL`` when
    ``send_metrics`` is called.
    """

    def __init__(self):
        """Create the registry and the selfcheck gauges.

        Raises:
            ValueError: if the Pushgateway URL setting is unset or empty.
        """
        gateway_url = settings.MESSAGES_SELFCHECK_PROMETHEUS_METRICS_PUSHGATEWAY_URL
        # An empty string is as unusable as None: fail here with a clear
        # message instead of failing later, at push time.
        if not gateway_url:
            raise ValueError("Prometheus push gateway URL is not set.")
        self._gateway_url = gateway_url

        prefix = settings.MESSAGES_SELFCHECK_PROMETHEUS_METRICS_PREFIX
        self.registry = CollectorRegistry()

        def gauge(name, documentation):
            # All gauges share the configured prefix and the private registry.
            return Gauge(f'{prefix}{name}', documentation, registry=self.registry)

        self.start_time = gauge(
            'selfcheck_start_time', 'Start timestamp of the self check'
        )
        self.end_time = gauge(
            'selfcheck_end_time', 'End timestamp of the self check'
        )
        self.success = gauge('selfcheck_success', 'Success of the self check')
        self.send_duration = gauge(
            'selfcheck_send_duration_seconds', 'Send duration of the self check'
        )
        self.reception_duration = gauge(
            'selfcheck_reception_duration_seconds',
            'Receptions duration of the self check',
        )

    def mark_start(self):
        """Set the start-time gauge to the current time."""
        self.start_time.set_to_current_time()

    def mark_end(self):
        """Set the end-time gauge to the current time."""
        self.end_time.set_to_current_time()

    def mark_failure(self):
        """Set the success gauge to 0."""
        self.success.set(0)

    def mark_success(self):
        """Set the success gauge to 1."""
        self.success.set(1)

    def write_send_time(self, send_time):
        """Set the send-duration gauge to ``send_time``."""
        self.send_duration.set(send_time)

    def write_reception_time(self, reception_time):
        """Set the reception-duration gauge to ``reception_time``."""
        self.reception_duration.set(reception_time)

    def send_metrics(self):
        """Push every gauge in the registry to the Pushgateway as job ``selfcheck``."""
        return push_to_gateway(
            self._gateway_url, job='selfcheck', registry=self.registry
        )


class Command(BaseCommand):
"""Run a selfcheck of the mail delivery system."""
Expand All @@ -20,27 +81,37 @@ def add_arguments(self, parser):
def handle(self, *args, **options):
    """Execute the command.

    Runs the selfcheck, prints the outcome, records the outcome and timings
    on the metrics recorder, sends the metrics, and exits the process with
    status 1 when the selfcheck failed.
    """
    # Use the no-op recorder when Prometheus reporting is disabled, so the
    # rest of the method can call the metric hooks unconditionally.
    metrics = (
        SelfCheckPrometheusMetrics()
        if settings.MESSAGES_SELFCHECK_PROMETHEUS_METRICS_ENABLED
        else SelfCheckMetricsBase()
    )

    self.stdout.write("Starting selfcheck...")
    self.stdout.write(f"FROM: {settings.MESSAGES_SELFCHECK_FROM}")
    self.stdout.write(f"TO: {settings.MESSAGES_SELFCHECK_TO}")
    self.stdout.write(f"SECRET: {settings.MESSAGES_SELFCHECK_SECRET}")
    self.stdout.write("")

    metrics.mark_start()

    # Run the selfcheck
    result = run_selfcheck()

    metrics.mark_end()

    # Display results
    if result["success"]:
        # Record the outcome explicitly: without this the success gauge is
        # never set and a passing check is indistinguishable from a failure.
        metrics.mark_success()
        self.stdout.write(self.style.SUCCESS("✓ Selfcheck completed successfully!"))

        self.stdout.write("")
        self.stdout.write("Timings:")
        if result["send_time"] is not None:
            metrics.write_send_time(result["send_time"])
            self.stdout.write(f" Send time: {result['send_time']:.2f}s")
        if result["reception_time"] is not None:
            metrics.write_reception_time(result["reception_time"])
            self.stdout.write(f" Reception time: {result['reception_time']:.2f}s")
        metrics.send_metrics()
    else:
        metrics.mark_failure()
        self.stdout.write(
            self.style.ERROR(f"✗ Selfcheck failed: {result['error']}")
        )
        # Send the metrics before exiting so the failure is reported too.
        metrics.send_metrics()
        sys.exit(1)
12 changes: 12 additions & 0 deletions src/backend/messages/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,18 @@ class Base(Configuration):
environ_prefix=None,
)

# Prometheus reporting for the selfcheck command.
# environ_prefix=None makes each value read exactly the variable named in
# environ_name, matching the neighbouring settings and the documented
# variable names.
MESSAGES_SELFCHECK_PROMETHEUS_METRICS_ENABLED = values.BooleanValue(
    default=False,
    environ_name="MESSAGES_SELFCHECK_PROMETHEUS_METRICS_ENABLED",
    environ_prefix=None,
)

MESSAGES_SELFCHECK_PROMETHEUS_METRICS_PUSHGATEWAY_URL = values.Value(
    None,
    environ_name="MESSAGES_SELFCHECK_PROMETHEUS_METRICS_PUSHGATEWAY_URL",
    environ_prefix=None,
)

MESSAGES_SELFCHECK_PROMETHEUS_METRICS_PREFIX = values.Value(
    "",
    environ_name="MESSAGES_SELFCHECK_PROMETHEUS_METRICS_PREFIX",
    environ_prefix=None,
)

# Blob compression settings
MESSAGES_BLOB_ZSTD_LEVEL = values.PositiveIntegerValue(
default=3, environ_name="MESSAGES_BLOB_ZSTD_LEVEL", environ_prefix=None
Expand Down
13 changes: 6 additions & 7 deletions src/backend/poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions src/backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ dependencies = [
"sentry-sdk[django]==2.27.0",
"url-normalize==1.4.3",
"whitenoise==6.8.2",
"prometheus-client (>=0.22.1,<0.23.0)",
]

[project.urls]
Expand Down