Skip to content

Commit 9065166

Browse files
authored
Merge pull request #283 from NixOS/deploy
deploy.sh: add pre and post-deploy checks + rollbacks
2 parents fe433a5 + 60ea26f commit 9065166

File tree

1 file changed

+315
-5
lines changed

1 file changed

+315
-5
lines changed
Lines changed: 315 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,45 @@
11
#!/usr/bin/env bash
22

3-
set -euxo pipefail
3+
set -euo pipefail
4+
5+
# --- Deployment target and behaviour -----------------------------------------
# Host that is deployed to, the SSH login used for it, the flake output that
# describes it, how often a failed switch is retried, and whether a failed
# health check triggers an automatic rollback.
WIKI_HOST="wiki.nixos.org"
SSH_TARGET="root@$WIKI_HOST"
FLAKE_TARGET=".#nixos-wiki-nixos-org"
MAX_RETRIES=3
ROLLBACK_ON_FAILURE=true

# --- SSH connection sharing ---------------------------------------------------
# nixos-rebuild-ng handles its own SSH ControlMaster, so the options below are
# only meant for the direct ssh calls made by this script; sharing one
# connection reduces authentication prompts.
# The control socket lives in a private temporary directory that is removed
# again when the script exits.
SSH_TMPDIR="$(mktemp -d /tmp/wiki-deploy.XXXXXX)"
trap 'rm -rf "$SSH_TMPDIR"' EXIT

SSH_CONTROL_PATH="$SSH_TMPDIR/ssh-%h"
SSH_OPTS="-o ControlMaster=auto"
SSH_OPTS+=" -o ControlPath=$SSH_CONTROL_PATH"
SSH_OPTS+=" -o ControlPersist=30s"
19+
20+
# Shadow the ssh binary so that every direct ssh call made by this script
# automatically carries the shared-connection options from SSH_OPTS.
# Globals:   SSH_OPTS (read)
# Arguments: anything accepted by ssh(1); passed through unchanged
# Returns:   exit status of the real ssh binary
ssh() {
  # SSH_OPTS is a flat option string, so it is split into words on purpose
  # and placed in front of the caller's own arguments.
  # shellcheck disable=SC2086
  set -- ${SSH_OPTS} "$@"
  command ssh "$@"
}
25+
26+
# ANSI colour escape sequences used by the logging helpers. They are stored
# as literal backslash sequences and only turned into control characters when
# they are printed.
RED='\033[0;31m' \
  GREEN='\033[0;32m' \
  YELLOW='\033[1;33m' \
  NC='\033[0m' # NC = "no colour", resets the terminal attributes
31+
32+
# Print a timestamped informational message to stdout.
# Globals:   GREEN, NC (read) - colour escape sequences
# Arguments: the message; all arguments are joined with spaces
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout
log() {
  # Only the colour variables go through %b (escape expansion); the message
  # itself is printed verbatim with %s so backslashes in it are not mangled.
  printf '%b[%s]%b %s\n' "${GREEN}" "$(date '+%Y-%m-%d %H:%M:%S')" "${NC}" "$*"
}
35+
36+
# Print a timestamped error message to stderr.
# Globals:   RED, NC (read) - colour escape sequences
# Arguments: the message; all arguments are joined with spaces
# Outputs:   "[ERROR YYYY-MM-DD HH:MM:SS] message" on stderr
error() {
  # Only the colour variables go through %b (escape expansion); the message
  # itself is printed verbatim with %s so backslashes in it are not mangled.
  printf '%b[ERROR %s]%b %s\n' "${RED}" "$(date '+%Y-%m-%d %H:%M:%S')" "${NC}" "$*" >&2
}
39+
40+
# Print a timestamped warning message to stdout.
# Globals:   YELLOW, NC (read) - colour escape sequences
# Arguments: the message; all arguments are joined with spaces
# Outputs:   "[WARNING YYYY-MM-DD HH:MM:SS] message" on stdout
warning() {
  # Only the colour variables go through %b (escape expansion); the message
  # itself is printed verbatim with %s so backslashes in it are not mangled.
  printf '%b[WARNING %s]%b %s\n' "${YELLOW}" "$(date '+%Y-%m-%d %H:%M:%S')" "${NC}" "$*"
}
443

544
nixBuild() {
645
if command -v nom -v &>/dev/null; then
@@ -9,7 +48,278 @@ nixBuild() {
948
nix build "$@"
1049
fi
1150
}
12-
nixBuild .#checks.x86_64-linux.test .#nixosConfigurations.nixos-wiki-nixos-org.config.system.build.toplevel -L
13-
if ! nixos-rebuild-ng switch --flake .#nixos-wiki-nixos-org --target-host [email protected]; then
14-
nixos-rebuild-ng switch --flake .#nixos-wiki-nixos-org --target-host [email protected]
15-
fi
51+
52+
# Verify that the target is reachable and record the state needed for a
# later rollback.
# Globals:   WIKI_HOST, SSH_TARGET (read)
#            CURRENT_GENERATION (written) - numeric generation of the system
#              profile, or empty when it could not be determined
#            DISK_USAGE (written) - root filesystem usage in percent, or empty
# Returns:   0 when the target is reachable, 1 when SSH cannot connect
pre_deployment_checks() {
  log "Running pre-deployment checks..."

  # Check SSH connectivity
  log "Checking SSH connectivity to ${WIKI_HOST}..."
  if ! ssh -o ConnectTimeout=10 -o BatchMode=yes "${SSH_TARGET}" "echo 'SSH connection successful'"; then
    error "Cannot establish SSH connection to ${WIKI_HOST}"
    return 1
  fi

  # Get current system generation for potential rollback.
  # The generation number is only encoded in the system profile symlink
  # (system-<N>-link); /run/current-system resolves to a store path that
  # carries no generation number, so the profile link is read instead.
  local profile_link=""
  CURRENT_GENERATION=""
  profile_link=$(ssh "${SSH_TARGET}" "readlink /nix/var/nix/profiles/system") || profile_link=""
  if [[ ${profile_link} =~ -([0-9]+)-link$ ]]; then
    CURRENT_GENERATION="${BASH_REMATCH[1]}"
    log "Current system generation: ${CURRENT_GENERATION}"
  else
    warning "Could not determine current system generation; automatic rollback will not be possible"
  fi

  # Check disk space. 'df -P' keeps each filesystem on a single line, so the
  # second line is always the one describing '/'.
  log "Checking disk space on target..."
  DISK_USAGE=$(ssh "${SSH_TARGET}" "df -P / | awk 'NR==2 {print \$5}' | sed 's/%//'") || DISK_USAGE=""
  if [[ ${DISK_USAGE} =~ ^[0-9]+$ ]]; then
    if ((DISK_USAGE > 85)); then
      warning "Disk usage is high: ${DISK_USAGE}%"
    fi
  else
    # Best effort only: an unreadable value must not abort the deployment.
    warning "Could not determine disk usage on target"
  fi

  return 0
}
76+
77+
# Build the flake's test check and the wiki host's system closure locally
# before anything touches the target.
# Returns: exit status of nixBuild
build_system() {
  # Installables first, followed by the logging flags handed to the builder.
  local -a build_args=(
    .#checks.x86_64-linux.test
    .#nixosConfigurations.nixos-wiki-nixos-org.config.system.build.toplevel
    -L
    --log-format bar-with-logs
  )

  log "Building NixOS configuration..."
  nixBuild "${build_args[@]}"
}
82+
83+
# Switch the target to the new configuration, retrying a failed switch.
# Globals:   WIKI_HOST, FLAKE_TARGET, SSH_TARGET, MAX_RETRIES (read)
# Returns:   0 as soon as one attempt succeeds, 1 when every attempt failed
deploy_system() {
  local attempt

  log "Deploying to ${WIKI_HOST}..."

  for ((attempt = 1; attempt <= MAX_RETRIES; attempt++)); do
    if nixos-rebuild-ng switch --flake "${FLAKE_TARGET}" --target-host "${SSH_TARGET}"; then
      log "Deployment successful"
      return 0
    fi

    # Pause before the next attempt; after the final attempt there is
    # nothing left to wait for.
    if ((attempt < MAX_RETRIES)); then
      warning "Deployment failed, retrying ($attempt/$MAX_RETRIES)..."
      sleep 5
    fi
  done

  error "Deployment failed after $MAX_RETRIES attempts"
  return 1
}
104+
105+
# Health check functions
106+
# Health check for the web frontend: the nginx unit must be active and the
# wiki main page must be served with HTTP 200 and the expected title.
# Globals:   SSH_TARGET, WIKI_HOST (read)
# Outputs:   progress on stdout, diagnostics on stderr
# Returns:   0 when healthy, 1 otherwise
check_nginx() {
  local url="https://${WIKI_HOST}/wiki/Main_Page"

  log "Checking nginx service..."
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet nginx"; then
    error "Nginx service is not active"
    ssh "${SSH_TARGET}" "systemctl status nginx --no-pager | head -20" || true
    return 1
  fi

  # Check if main page loads with wiki content
  local response_code
  local response_body
  # On a connection failure curl itself already prints "000" for
  # %{http_code}; appending another fallback string would produce "000000".
  # Only an entirely empty result (e.g. curl missing) is defaulted.
  response_code=$(curl -sL -o /dev/null -w "%{http_code}" -m 10 "${url}") || true
  response_code="${response_code:-000}"

  if [[ ${response_code} != "200" ]]; then
    error "Main page returned HTTP status code: ${response_code}"
    if [[ ${response_code} == "000" ]]; then
      error "Failed to connect to ${url}"
    fi
    return 1
  fi

  # Check page content (follow redirects)
  local curl_status=0
  response_body=$(curl -sfL -m 10 "${url}" 2>&1) || curl_status=$?
  if ((curl_status != 0)); then
    error "Failed to fetch main page content: ${curl_status}"
    return 1
  fi

  # A here-string is used instead of 'echo | grep -q': with pipefail, grep
  # exiting at the first match can kill the writer with SIGPIPE on a large
  # page and turn a successful match into a failure.
  if ! grep -q "<title>.*NixOS Wiki.*</title>" <<<"${response_body}"; then
    local page_title
    page_title=$(grep -o '<title>[^<]*</title>' <<<"${response_body}" | head -1) || true
    error "Main page does not contain expected title"
    error "Page title: ${page_title:-Could not extract title}"
    error "First 500 chars of response:"
    printf '%s\n' "${response_body:0:500}" >&2
    return 1
  fi

  return 0
}
143+
144+
# Health check for the database: the postgresql unit must be active and the
# 'mediawiki' database must answer a trivial query.
# Globals:   SSH_TARGET (read)
# Returns:   0 when healthy, 1 otherwise
check_postgresql() {
  log "Checking PostgreSQL service..."
  # Report why the check failed, like the other service checks do, instead
  # of returning silently.
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postgresql"; then
    error "PostgreSQL service is not active"
    return 1
  fi

  # Check if database is accessible
  if ! ssh "${SSH_TARGET}" "sudo -u postgres psql -d mediawiki -c 'SELECT 1;' >/dev/null 2>&1"; then
    error "PostgreSQL database 'mediawiki' is not accessible"
    return 1
  fi

  return 0
}
155+
156+
# Health check for mail delivery: the postfix unit must be active; the size
# of the mail queue is reported but never fails the check.
# Globals:   SSH_TARGET (read)
# Returns:   0 when the unit is active, 1 otherwise
check_postfix() {
  log "Checking Postfix service..."
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postfix"; then
    error "Postfix service is not active"
    return 1
  fi

  # Check if postfix queue is processing (not stuck)
  local queue_status
  local queue_count
  # The summary line has the form "-- <N> Kbytes in <M> Request(s)."; the
  # queue length is <M>, not the first number on the line.
  local summary_re='in ([0-9]+) Requests?'
  queue_status=$(ssh "${SSH_TARGET}" "postqueue -p | tail -1" 2>&1) || true

  if [[ ${queue_status} == *"Mail queue is empty"* ]]; then
    log " Postfix queue is empty (good)"
  elif [[ ${queue_status} =~ ${summary_re} ]]; then
    queue_count="${BASH_REMATCH[1]}"
    if ((queue_count > 50)); then
      warning " Postfix has many queued emails: $queue_status"
    else
      log " Postfix has $queue_count queued email(s) (acceptable)"
    fi
  else
    warning " Could not determine postfix queue status"
  fi

  return 0
}
182+
183+
# Report whether the backup timers are active on the target. Inactive timers
# only produce a warning; this check never fails.
# Globals:   SSH_TARGET (read)
# Returns:   0 always
check_backup_services() {
  log "Checking backup services..."

  # Check if backup timers are active
  local backup_services=("wiki-dump.timer" "borgbackup-job-wiki.timer")
  # Declared local so the loop variable does not leak into the caller's scope.
  local service
  for service in "${backup_services[@]}"; do
    # The unit name is expanded locally on purpose before being sent over.
    # shellcheck disable=SC2029
    if ssh "${SSH_TARGET}" "systemctl is-active --quiet '$service'"; then
      log "$service is active"
    else
      warning "$service is not active"
    fi
  done

  return 0
}
198+
199+
# Run every post-deployment health check and report the overall systemd
# state of the target.
# Globals:   SSH_TARGET (read)
# Returns:   0 when every individual check passed, 1 otherwise. The overall
#            systemd state is reported but does not influence the result.
run_health_checks() {
  log "Running post-deployment health checks..."

  local failed_checks=0
  local start_time
  start_time=$(date +%s)

  # Wait for system to stabilize
  log "Waiting for system to stabilize..."
  sleep 10

  # Run individual health checks
  local checks=(
    "check_nginx"
    "check_postgresql"
    "check_postfix"
    "check_backup_services"
  )
  # Declared local so the loop variable does not leak into the caller's scope.
  local check

  for check in "${checks[@]}"; do
    if "$check"; then
      log "$check passed"
    else
      error "$check failed"
      failed_checks=$((failed_checks + 1))
    fi
  done

  # Check overall system status.
  # 'systemctl is-system-running' prints the state itself and exits non-zero
  # for anything but "running", so no fallback text is appended remotely;
  # that would yield two lines and never compare equal to "degraded".
  log "Checking overall system status..."
  local system_status
  system_status=$(ssh "${SSH_TARGET}" "systemctl is-system-running || true") || true
  system_status="${system_status%%$'\n'*}" # keep the first line only
  system_status="${system_status:-unknown}"

  if [[ $system_status == "running" ]]; then
    log "System status: running"
  else
    warning "System status: $system_status"
    if [[ $system_status == "degraded" ]]; then
      log "Failed units:"
      # Purely informational; a failure to list units must not abort.
      ssh "${SSH_TARGET}" "systemctl --failed --no-pager" || true
    fi
  fi

  local elapsed=$(($(date +%s) - start_time))
  log "Health checks completed in ${elapsed}s"

  if ((failed_checks > 0)); then
    error "$failed_checks health checks failed"
    return 1
  fi

  return 0
}
253+
254+
# Switch the target back to the system generation recorded before the
# deployment and activate it.
# Globals:   CURRENT_GENERATION (read) - generation number to return to
#            SSH_TARGET (read)
# Returns:   0 when the rollback succeeded, 1 when no usable generation was
#            recorded or the remote commands failed
rollback() {
  if [ -z "${CURRENT_GENERATION:-}" ]; then
    error "Cannot rollback: no previous generation recorded"
    return 1
  fi

  # The value is interpolated into a remote shell command, so anything that
  # is not a plain generation number is refused up front.
  if [[ ! ${CURRENT_GENERATION} =~ ^[0-9]+$ ]]; then
    error "Cannot rollback: '${CURRENT_GENERATION}' is not a valid generation number"
    return 1
  fi

  error "Rolling back to generation ${CURRENT_GENERATION}..."
  # The generation number is expanded locally on purpose.
  # shellcheck disable=SC2029
  if ssh "${SSH_TARGET}" "nix-env --profile /nix/var/nix/profiles/system --switch-generation '${CURRENT_GENERATION}' && /nix/var/nix/profiles/system/bin/switch-to-configuration switch"; then
    log "Rollback successful"
    return 0
  else
    error "Rollback failed!"
    return 1
  fi
}
271+
272+
# Main deployment flow: build, pre-flight checks, deploy, health checks and,
# when the health checks fail, an optional automatic rollback.
# Globals:   ROLLBACK_ON_FAILURE, WIKI_HOST (read)
# Exit codes (on failure the script exits directly):
#   1 - build, pre-flight checks, deployment or health checks failed
#   2 - health checks failed and the automatic rollback failed as well
# Returns:   0 when the deployment succeeded and the system is healthy
main() {
  # Declared local so the flag does not leak into the global scope.
  local deploy_success=true

  log "Starting NixOS Wiki deployment..."

  # Build
  if ! build_system; then
    error "Build failed, aborting"
    exit 1
  fi

  # Pre-deployment checks
  if ! pre_deployment_checks; then
    error "Pre-deployment checks failed, aborting"
    exit 1
  fi

  # Deploy
  if ! deploy_system; then
    error "Deployment failed"
    deploy_success=false
  fi

  # Always run health checks to see current system state
  if ! run_health_checks; then
    error "Post-deployment health checks failed"

    if [ "$ROLLBACK_ON_FAILURE" = true ]; then
      warning "Attempting automatic rollback..."
      if rollback; then
        log "Rollback completed. Please investigate the deployment failure."
        exit 1
      else
        error "Automatic rollback failed! Manual intervention required!"
        exit 2
      fi
    else
      error "Health checks failed but rollback is disabled"
      exit 1
    fi
  fi

  # If deployment failed but health checks passed, still exit with error
  if [ "$deploy_success" = false ]; then
    error "Deployment failed but system appears healthy"
    exit 1
  fi

  log "Deployment completed successfully! 🚀"
  log "NixOS Wiki is healthy at https://${WIKI_HOST}"
}
323+
324+
# Run main function
325+
main "$@"

0 commit comments

Comments
 (0)