1
1
#! /usr/bin/env bash
2
2
3
- set -euxo pipefail
3
set -euo pipefail

# Deployment target and behaviour settings.
WIKI_HOST="wiki.nixos.org"
SSH_TARGET="root@${WIKI_HOST}"
FLAKE_TARGET=".#nixos-wiki-nixos-org"
MAX_RETRIES=3
ROLLBACK_ON_FAILURE=true

# nixos-rebuild-ng handles its own SSH ControlMaster, so we just set up
# a wrapper for our own SSH calls to reduce authentication prompts
SSH_TMPDIR=$(mktemp -d /tmp/wiki-deploy.XXXXXX)
# Remove the control-socket directory on every exit path.
trap 'rm -rf -- "$SSH_TMPDIR"' EXIT

# SSH options for our direct SSH calls (not nixos-rebuild-ng).
# SSH_OPTS is a flat string that gets word-split where it is used, so none of
# its values may contain whitespace (the mktemp template above guarantees that
# for the control path).
SSH_CONTROL_PATH="${SSH_TMPDIR}/ssh-%h"
SSH_OPTS="-o ControlMaster=auto -o ControlPath=${SSH_CONTROL_PATH} -o ControlPersist=30s"
19
+
20
# Wrapper around the real ssh binary that prepends our connection-sharing
# options to every direct SSH call made by this script.
# Globals:   SSH_OPTS (read) - whitespace-separated ssh options
# Arguments: forwarded to ssh unchanged, one word per argument
# Returns:   exit status of ssh
ssh() {
  # SSH_OPTS is intentionally unquoted: it must split into separate options.
  # "$@" keeps every caller argument as its own word.
  # shellcheck disable=SC2086
  command ssh ${SSH_OPTS} "$@"
}
25
+
26
# Colors for output. These hold backslash escape sequences, so they only turn
# into real ANSI codes when printed through an escape-interpreting command.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color
31
+
32
# Print a timestamped informational message to stdout.
# Globals:   GREEN, NC (read) - color escape sequences
# Arguments: message words; joined with single spaces
log() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  # %b expands the escapes stored in the color variables; the message itself
  # goes through %s so backslashes in it are printed literally.
  printf '%b[%s]%b %s\n' "${GREEN}" "${stamp}" "${NC}" "$*"
}
35
+
36
# Print a timestamped error message to stderr.
# Globals:   RED, NC (read) - color escape sequences
# Arguments: message words; joined with single spaces
error() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  # %b expands the escapes stored in the color variables; the message itself
  # goes through %s so backslashes in it are printed literally.
  printf '%b[ERROR %s]%b %s\n' "${RED}" "${stamp}" "${NC}" "$*" >&2
}
39
+
40
# Print a timestamped warning message to stdout.
# Globals:   YELLOW, NC (read) - color escape sequences
# Arguments: message words; joined with single spaces
warning() {
  local stamp
  stamp=$(date '+%Y-%m-%d %H:%M:%S')
  # %b expands the escapes stored in the color variables; the message itself
  # goes through %s so backslashes in it are printed literally.
  printf '%b[WARNING %s]%b %s\n' "${YELLOW}" "${stamp}" "${NC}" "$*"
}
4
43
5
44
nixBuild () {
6
45
if command -v nom -v & > /dev/null; then
@@ -9,7 +48,278 @@ nixBuild() {
9
48
nix build " $@ "
10
49
fi
11
50
}
12
- nixBuild .# checks.x86_64-linux.test .#nixosConfigurations.nixos-wiki-nixos-org.config.system.build.toplevel -L
13
- if ! nixos-rebuild-ng switch --flake .
# nixos-wiki-nixos-org --target-host [email protected] ; then
14
- nixos-rebuild-ng switch --flake .
# nixos-wiki-nixos-org --target-host [email protected]
15
- fi
51
+
52
# Pre-deployment checks: confirm the target is reachable and record the state
# needed for a later rollback.
# Globals:   WIKI_HOST, SSH_TARGET (read)
#            CURRENT_GENERATION (written; empty when it cannot be determined)
#            DISK_USAGE (written; root filesystem usage in percent, or empty)
# Returns:   0 when the host is reachable, 1 when the SSH connection fails
pre_deployment_checks() {
  log "Running pre-deployment checks..."

  # Check SSH connectivity
  log "Checking SSH connectivity to ${WIKI_HOST}..."
  if ! ssh -o ConnectTimeout=10 -o BatchMode=yes "${SSH_TARGET}" "echo 'SSH connection successful'"; then
    error "Cannot establish SSH connection to ${WIKI_HOST}"
    return 1
  fi

  # Get current system generation for potential rollback.
  # The generation number is encoded in the system profile symlink
  # (system -> system-<N>-link). /run/current-system resolves to a store path
  # and carries no generation number, so it cannot be used for this.
  local profile_link
  local -r generation_re='(^|/)system-([0-9]+)-link$'
  profile_link=$(ssh "${SSH_TARGET}" "readlink /nix/var/nix/profiles/system") || profile_link=""
  if [[ "${profile_link}" =~ ${generation_re} ]]; then
    CURRENT_GENERATION=${BASH_REMATCH[2]}
    log "Current system generation: ${CURRENT_GENERATION}"
  else
    CURRENT_GENERATION=""
    warning "Could not determine current system generation; automatic rollback will be unavailable"
  fi

  # Check disk space. The raw df output is parsed locally so that no awk
  # program has to survive remote-shell quoting.
  log "Checking disk space on target..."
  local df_output
  df_output=$(ssh "${SSH_TARGET}" "df -P /") || df_output=""
  DISK_USAGE=$(awk 'NR==2 { sub(/%/, "", $5); print $5 }' <<<"${df_output}")
  if [[ "${DISK_USAGE}" =~ ^[0-9]+$ ]]; then
    if (( 10#${DISK_USAGE} > 85 )); then
      warning "Disk usage is high: ${DISK_USAGE}%"
    fi
  else
    # Non-numeric output must not abort the checks with a test error.
    warning "Could not determine disk usage on target"
  fi

  return 0
}
76
+
77
# Build the system: runs the flake test check and the wiki host's system
# closure through nixBuild, with build logs printed.
# Returns: exit status of nixBuild
build_system() {
  log "Building NixOS configuration..."

  # Each installable must be a single word; quoting keeps ".#attr" intact.
  local -a build_args=(
    ".#checks.x86_64-linux.test"
    ".#nixosConfigurations.nixos-wiki-nixos-org.config.system.build.toplevel"
    -L
    --log-format bar-with-logs
  )
  nixBuild "${build_args[@]}"
}
82
+
83
# Deploy with retries: runs `nixos-rebuild-ng switch` against the target host
# up to MAX_RETRIES times, pausing 5 seconds between attempts.
# Globals:   WIKI_HOST, FLAKE_TARGET, SSH_TARGET, MAX_RETRIES (read)
# Returns:   0 on the first successful attempt, 1 when every attempt failed
deploy_system() {
  log "Deploying to ${WIKI_HOST}..."

  local attempt
  for (( attempt = 1; attempt <= MAX_RETRIES; attempt++ )); do
    if nixos-rebuild-ng switch --flake "${FLAKE_TARGET}" --target-host "${SSH_TARGET}"; then
      log "Deployment successful"
      return 0
    fi
    # No point in sleeping after the final attempt.
    if (( attempt < MAX_RETRIES )); then
      warning "Deployment failed, retrying (${attempt}/${MAX_RETRIES})..."
      sleep 5
    fi
  done

  error "Deployment failed after ${MAX_RETRIES} attempts"
  return 1
}
104
+
105
+ # Health check functions
106
# Verify that nginx is running on the target and that the wiki main page is
# served with the expected title.
# Globals:   SSH_TARGET, WIKI_HOST (read)
# Returns:   0 when nginx is active and the page looks right, 1 otherwise
check_nginx() {
  log "Checking nginx service..."
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet nginx"; then
    error "Nginx service is not active"
    ssh "${SSH_TARGET}" "systemctl status nginx --no-pager | head -20" || true
    return 1
  fi

  # Check if main page loads with wiki content (follow redirects).
  local url="https://${WIKI_HOST}/wiki/Main_Page"
  local response response_code response_body

  # One request yields both body and status: -w appends the status code on a
  # final line of its own. curl already reports 000 when no response arrived,
  # so its failure status is neutralized here instead of printing a second
  # code.
  response=$(curl -sL -m 10 -w '\n%{http_code}' "${url}") || true
  response_code=${response##*$'\n'}
  response_body=${response%$'\n'*}
  [[ "${response_code}" =~ ^[0-9]{3}$ ]] || response_code="000"

  if [[ "${response_code}" != "200" ]]; then
    error "Main page returned HTTP status code: ${response_code}"
    if [[ "${response_code}" == "000" ]]; then
      error "Failed to connect to ${url}"
    fi
    return 1
  fi

  # A here-string is used rather than `echo | grep -q`: under pipefail the
  # latter can fail with SIGPIPE when grep exits before a large body is
  # written.
  if ! grep -q '<title>.*NixOS Wiki.*</title>' <<<"${response_body}"; then
    local page_title
    page_title=$(grep -o -m 1 '<title>[^<]*</title>' <<<"${response_body}") || true
    page_title=${page_title%%$'\n'*}
    error "Main page does not contain expected title"
    error "Page title: ${page_title:-Could not extract title}"
    error "First 500 chars of response:"
    printf '%s\n' "${response_body:0:500}" >&2
    return 1
  fi

  return 0
}
143
+
144
# Verify that PostgreSQL is running on the target and that the 'mediawiki'
# database answers a trivial query.
# Globals:   SSH_TARGET (read)
# Returns:   0 when both checks pass, 1 otherwise
check_postgresql() {
  log "Checking PostgreSQL service..."
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postgresql"; then
    # Report the reason, like the other service checks do.
    error "PostgreSQL service is not active"
    return 1
  fi

  # Check if database is accessible
  if ! ssh "${SSH_TARGET}" "sudo -u postgres psql -d mediawiki -c 'SELECT 1;' >/dev/null 2>&1"; then
    error "PostgreSQL database 'mediawiki' is not accessible"
    return 1
  fi
  return 0
}
155
+
156
# Verify that Postfix is running on the target and report the size of its
# mail queue. A large or unreadable queue only produces a warning.
# Globals:   SSH_TARGET (read)
# Returns:   0 when the service is active, 1 otherwise
check_postfix() {
  log "Checking Postfix service..."
  if ! ssh "${SSH_TARGET}" "systemctl is-active --quiet postfix"; then
    error "Postfix service is not active"
    return 1
  fi

  # Check if postfix queue is processing (not stuck)
  local queue_status
  # Matches the summary line, e.g. "-- 3 Kbytes in 2 Requests."; the capture
  # is the request count, not the leading Kbytes figure.
  local -r summary_re='in ([0-9]+) Requests?'
  queue_status=$(ssh "${SSH_TARGET}" "postqueue -p | tail -1" 2>&1) || true

  if [[ "${queue_status}" == *"Mail queue is empty"* ]]; then
    log "Postfix queue is empty (good)"
  elif [[ "${queue_status}" =~ ${summary_re} ]]; then
    local queue_count=${BASH_REMATCH[1]}
    if (( 10#${queue_count} > 50 )); then
      warning "Postfix has many queued emails: ${queue_status}"
    else
      log "Postfix has ${queue_count} queued email(s) (acceptable)"
    fi
  else
    warning "Could not determine postfix queue status"
  fi

  return 0
}
182
+
183
# Report whether the backup timers are active on the target. An inactive
# timer produces a warning only; this check never fails.
# Globals:   SSH_TARGET (read)
# Returns:   0 always
check_backup_services() {
  log "Checking backup services..."

  # Check if backup timers are active
  local -a backup_timers=("wiki-dump.timer" "borgbackup-job-wiki.timer")
  local timer # kept local so the loop does not leak a global
  for timer in "${backup_timers[@]}"; do
    # The unit name is expanded locally on purpose.
    # shellcheck disable=SC2029
    if ssh "${SSH_TARGET}" "systemctl is-active --quiet '${timer}'"; then
      log "✓ ${timer} is active"
    else
      warning "✗ ${timer} is not active"
    fi
  done
  return 0
}
198
+
199
# Main health check: waits briefly, runs every individual service check, then
# reports the overall systemd state of the target.
# Globals:   SSH_TARGET (read)
# Returns:   0 when every individual check passed, 1 otherwise. The overall
#            systemd state is reported but does not affect the result.
run_health_checks() {
  log "Running post-deployment health checks..."

  local failed_checks=0
  local start_time
  start_time=$(date +%s)

  # Wait for system to stabilize
  log "Waiting for system to stabilize..."
  sleep 10

  # Run individual health checks
  local -a checks=(
    check_nginx
    check_postgresql
    check_postfix
    check_backup_services
  )
  local check # kept local so the loop does not leak a global
  for check in "${checks[@]}"; do
    if "${check}"; then
      log "✓ ${check} passed"
    else
      error "✗ ${check} failed"
      failed_checks=$(( failed_checks + 1 ))
    fi
  done

  # Check overall system status.
  # is-system-running prints the state itself and exits non-zero for anything
  # other than "running", so only its exit status is neutralized; echoing a
  # fallback would append a second line and break the comparisons below.
  log "Checking overall system status..."
  local system_status
  system_status=$(ssh "${SSH_TARGET}" "systemctl is-system-running || true") || true
  system_status=${system_status%%$'\n'*}
  system_status=${system_status:-unknown}

  if [[ "${system_status}" == "running" ]]; then
    log "System status: running"
  else
    warning "System status: ${system_status}"
    if [[ "${system_status}" == "degraded" ]]; then
      log "Failed units:"
      ssh "${SSH_TARGET}" "systemctl --failed --no-pager" || true
    fi
  fi

  local elapsed
  elapsed=$(( $(date +%s) - start_time ))
  log "Health checks completed in ${elapsed}s"

  if (( failed_checks > 0 )); then
    error "${failed_checks} health checks failed"
    return 1
  fi

  return 0
}
253
+
254
# Rollback function: switches the target's system profile back to the
# generation recorded before the deployment and activates it.
# Globals:   CURRENT_GENERATION, SSH_TARGET (read)
# Returns:   0 when the rollback succeeded, 1 when no usable generation was
#            recorded or the remote commands failed
rollback() {
  if [[ -z "${CURRENT_GENERATION:-}" ]]; then
    error "Cannot rollback: no previous generation recorded"
    return 1
  fi

  # The value is interpolated into a remote shell command, so anything other
  # than a plain generation number is rejected before it reaches the host.
  if [[ ! "${CURRENT_GENERATION}" =~ ^[0-9]+$ ]]; then
    error "Cannot rollback: recorded generation '${CURRENT_GENERATION}' is not a generation number"
    return 1
  fi

  error "Rolling back to generation ${CURRENT_GENERATION}..."
  local remote_cmd="nix-env --profile /nix/var/nix/profiles/system --switch-generation '${CURRENT_GENERATION}' && /nix/var/nix/profiles/system/bin/switch-to-configuration switch"
  # shellcheck disable=SC2029
  if ssh "${SSH_TARGET}" "${remote_cmd}"; then
    log "Rollback successful"
    return 0
  else
    error "Rollback failed!"
    return 1
  fi
}
271
+
272
# Main deployment flow: build, pre-deployment checks, deploy, health checks,
# and an automatic rollback when the health checks fail.
# Globals:   ROLLBACK_ON_FAILURE, WIKI_HOST (read)
# Exits:     1 on build, pre-check, deployment or health-check failure
#            (including after a successful rollback)
#            2 when the automatic rollback itself failed
# Returns:   0 when the deployment succeeded and the system is healthy
main() {
  log "Starting NixOS Wiki deployment..."

  # Build
  if ! build_system; then
    error "Build failed, aborting"
    exit 1
  fi

  # Pre-deployment checks
  if ! pre_deployment_checks; then
    error "Pre-deployment checks failed, aborting"
    exit 1
  fi

  # Deploy. A failure is only recorded here so that the health checks below
  # still run against whatever state the target ended up in.
  local deploy_success=true
  if ! deploy_system; then
    error "Deployment failed"
    deploy_success=false
  fi

  # Always run health checks to see current system state
  if ! run_health_checks; then
    error "Post-deployment health checks failed"

    if [[ "${ROLLBACK_ON_FAILURE}" != true ]]; then
      error "Health checks failed but rollback is disabled"
      exit 1
    fi

    warning "Attempting automatic rollback..."
    if rollback; then
      log "Rollback completed. Please investigate the deployment failure."
      exit 1
    fi
    error "Automatic rollback failed! Manual intervention required!"
    exit 2
  fi

  # If deployment failed but health checks passed, still exit with error
  if [[ "${deploy_success}" == false ]]; then
    error "Deployment failed but system appears healthy"
    exit 1
  fi

  log "Deployment completed successfully! 🚀"
  log "NixOS Wiki is healthy at https://${WIKI_HOST}"
}
323
+
324
# Run main function, forwarding the script's arguments word for word.
main "$@"
0 commit comments