Skip to content

Commit 180205a

Browse files
committed
Remove the healthcheck step
To speed up deployments, we'll remove the healthcheck step. This adds some risk to deployments for non-web roles - if they don't have a Docker healthcheck configured then the only check we do is if the container is running. If there is a bad image we might see the container running before it exits and deploy it. Previously the healthcheck step would have avoided this by ensuring a web container could boot and serve traffic first. To mitigate this, we'll add a deployment barrier. Until one of the primary role containers passes its healthcheck, we'll keep the barrier up and avoid stopping the containers on the non-primary roles. If the primary role container fails its healthcheck, we'll close the barrier and shut down the new containers on the waiting roles. We also have a new integration test to check we correctly handle a broken image. This highlighted that SSHKit's default runner will stop at the first error it encounters. We'll now have a custom runner that waits for all threads to finish, allowing them to clean up.
1 parent 31669d4 commit 180205a

File tree

24 files changed

+270
-329
lines changed

24 files changed

+270
-329
lines changed

lib/kamal/cli/app.rb

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,12 @@ def boot
1414
end
1515
end
1616

17+
barrier = Kamal::Cli::Healthcheck::Barrier.new if KAMAL.roles.many?
18+
1719
on(KAMAL.hosts, **KAMAL.boot_strategy) do |host|
20+
# Ensure primary role is booted first to allow the web barrier to be opened
1821
KAMAL.roles_on(host).each do |role|
19-
Kamal::Cli::App::Boot.new(host, role, version, self).run
22+
Kamal::Cli::App::Boot.new(host, role, self, version, barrier).run
2023
end
2124
end
2225

@@ -282,4 +285,8 @@ def current_running_version(host: KAMAL.primary_host)
282285
def version_or_latest
283286
options[:version] || KAMAL.config.latest_tag
284287
end
288+
289+
# True only when the configured roles are a mix: at least one role reports
# running_traefik? and at least one role does not.
def web_and_non_web_roles?
  return false unless KAMAL.roles.any?(&:running_traefik?)

  KAMAL.roles.any? { |role| !role.running_traefik? }
end
285292
end

lib/kamal/cli/app/boot.rb

Lines changed: 54 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
class Kamal::Cli::App::Boot
2-
attr_reader :host, :role, :version, :sshkit
2+
attr_reader :host, :role, :version, :barrier, :sshkit
33
delegate :execute, :capture_with_info, :info, to: :sshkit
4-
delegate :uses_cord?, :assets?, to: :role
4+
delegate :uses_cord?, :assets?, :running_traefik?, to: :role
55

6-
def initialize(host, role, version, sshkit)
6+
# Stores the collaborators needed to boot one role on one host.
#
# host    - host the role is booted on
# role    - role being booted
# sshkit  - receiver of the delegated execute / capture_with_info / info calls
# version - app version to boot
# barrier - deployment barrier shared between roles (the caller passes nil
#           when no barrier was created)
def initialize(host, role, sshkit, version, barrier)
  @host, @role = host, role
  @version, @barrier = version, barrier
  @sshkit = sshkit
end
1213

@@ -21,18 +22,6 @@ def run
2122
end
2223

2324
private
24-
def app
25-
@app ||= KAMAL.app(role: role)
26-
end
27-
28-
def auditor
29-
@auditor = KAMAL.auditor(role: role)
30-
end
31-
32-
def audit(message)
33-
execute *auditor.record(message), verbosity: :debug
34-
end
35-
3625
def old_version_renamed_if_clashing
3726
if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present?
3827
renamed_version = "#{version}_replaced_#{SecureRandom.hex(8)}"
@@ -46,9 +35,18 @@ def old_version_renamed_if_clashing
4635

4736
# Boots the container for the new version, waits until it reports healthy,
# and then synchronises with the other roles via reach_barrier.
#
# If any step raises a StandardError (failed healthcheck, halted barrier,
# failed command), the barrier is closed when this is the barrier role, the
# new container is stopped, and the original error is re-raised.
def start_new_version
  audit "Booted app version #{version}"

  execute *app.tie_cord(role.cord_host_file) if uses_cord?
  execute *app.run(hostname: "#{host}-#{SecureRandom.hex(6)}")

  Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }

  reach_barrier
rescue
  close_barrier if barrier_role?
  # The stop is issued with raise_on_non_zero_exit: false so that the error
  # re-raised below is the one that triggered the rollback.
  execute *app.stop(version: version), raise_on_non_zero_exit: false

  raise
end
5351

5452
def stop_old_version(version)
@@ -64,4 +62,45 @@ def stop_old_version(version)
6462

6563
execute *app.clean_up_assets if assets?
6664
end
65+
66+
# Synchronisation point reached after the new container is healthy.
#
# Does nothing when no barrier is in use. The barrier role tries to open the
# barrier and logs only if this call was the one that opened it; every other
# role waits at the barrier instead.
def reach_barrier
  return unless barrier
  return wait_for_barrier unless barrier_role?

  info "Opened barrier (#{host})" if barrier.open
end
77+
78+
# Waits on the barrier, logging progress for this host.
#
# Raises Kamal::Cli::Healthcheck::Error (re-raised after logging) when the
# wait is halted by that error.
def wait_for_barrier
  info("Waiting at web barrier (#{host})...")
  barrier.wait
  info("Barrier opened (#{host})")
rescue Kamal::Cli::Healthcheck::Error
  info("Barrier closed, shutting down new container... (#{host})")
  raise
end
86+
87+
# Closes the barrier if one is in use; a nil barrier is silently ignored.
def close_barrier
  barrier.close unless barrier.nil?
end
90+
91+
# Whether this boot is for the primary role, i.e. the role that opens or
# closes the barrier rather than waiting at it.
def barrier_role?
  primary_role = KAMAL.primary_role
  role == primary_role
end
94+
95+
# App handle for this role, obtained from KAMAL.app on first use and then
# reused.
def app
  return @app if @app

  @app = KAMAL.app(role: role)
end
98+
99+
# Auditor for this role, obtained from KAMAL.auditor on first use and then
# reused (memoized with ||=, matching how `app` is memoized).
def auditor
  @auditor ||= KAMAL.auditor(role: role)
end
102+
103+
# Runs the auditor's record command for the given message at debug verbosity.
def audit(message)
  record_command = auditor.record(message)
  execute(*record_command, verbosity: :debug)
end
67106
end

lib/kamal/cli/healthcheck.rb

Lines changed: 0 additions & 21 deletions
This file was deleted.
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Deployment barrier backed by a Concurrent::IVar. The first open/close call
# settles the barrier; later calls are rejected and report false.
class Kamal::Cli::Healthcheck::Barrier
  def initialize
    @ivar = Concurrent::IVar.new
  end

  # Tries to settle the barrier as closed. Returns true if this call settled
  # it, false if it had already been settled.
  def close
    set(false)
  end

  # Tries to settle the barrier as open. Returns true if this call settled
  # it, false if it had already been settled.
  def open
    set(true)
  end

  # Raises Kamal::Cli::Healthcheck::Error unless the barrier was opened.
  # NOTE(review): relies on Concurrent::IVar#value blocking until a value has
  # been set - confirm against the concurrent-ruby documentation.
  def wait
    return if opened?

    raise Kamal::Cli::Healthcheck::Error, "Halted at barrier"
  end

  private
    def opened?
      @ivar.value
    end

    # Assigns the value, translating a repeated assignment into false.
    def set(value)
      @ivar.set(value)
    rescue Concurrent::MultipleAssignmentError
      false
    else
      true
    end
end

lib/kamal/cli/healthcheck/error.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
# Error raised by the healthcheck poller and the deployment barrier.
class Kamal::Cli::Healthcheck::Error < StandardError; end

lib/kamal/cli/healthcheck/poller.rb

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ module Kamal::Cli::Healthcheck::Poller
33

44
TRAEFIK_UPDATE_DELAY = 5
55

6-
class HealthcheckError < StandardError; end
76

87
def wait_for_healthy(pause_after_ready: false, &block)
98
attempt = 1
@@ -16,9 +15,9 @@ def wait_for_healthy(pause_after_ready: false, &block)
1615
when "running" # No health check configured
1716
sleep KAMAL.config.readiness_delay if pause_after_ready
1817
else
19-
raise HealthcheckError, "container not ready (#{status})"
18+
raise Kamal::Cli::Healthcheck::Error, "container not ready (#{status})"
2019
end
21-
rescue HealthcheckError => e
20+
rescue Kamal::Cli::Healthcheck::Error => e
2221
if attempt <= max_attempts
2322
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
2423
sleep attempt
@@ -41,9 +40,9 @@ def wait_for_unhealthy(pause_after_ready: false, &block)
4140
when "unhealthy"
4241
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
4342
else
44-
raise HealthcheckError, "container not unhealthy (#{status})"
43+
raise Kamal::Cli::Healthcheck::Error, "container not unhealthy (#{status})"
4544
end
46-
rescue HealthcheckError => e
45+
rescue Kamal::Cli::Healthcheck::Error => e
4746
if attempt <= max_attempts
4847
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
4948
sleep attempt

lib/kamal/cli/main.rb

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,6 @@ def deploy
4141
say "Ensure Traefik is running...", :magenta
4242
invoke "kamal:cli:traefik:boot", [], invoke_options
4343

44-
if KAMAL.config.role(KAMAL.config.primary_role).running_traefik?
45-
say "Ensure app can pass healthcheck...", :magenta
46-
invoke "kamal:cli:healthcheck:perform", [], invoke_options
47-
end
48-
4944
say "Detect stale containers...", :magenta
5045
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
5146

@@ -76,9 +71,6 @@ def redeploy
7671

7772
run_hook "pre-deploy"
7873

79-
say "Ensure app can pass healthcheck...", :magenta
80-
invoke "kamal:cli:healthcheck:perform", [], invoke_options
81-
8274
say "Detect stale containers...", :magenta
8375
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
8476

@@ -223,9 +215,6 @@ def version
223215
desc "env", "Manage environment files"
224216
subcommand "env", Kamal::Cli::Env
225217

226-
desc "healthcheck", "Healthcheck application"
227-
subcommand "healthcheck", Kamal::Cli::Healthcheck
228-
229218
desc "lock", "Manage the deploy lock"
230219
subcommand "lock", Kamal::Cli::Lock
231220

lib/kamal/commander.rb

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,7 @@ def configure_sshkit_with(config)
150150
sshkit.max_concurrent_starts = config.sshkit.max_concurrent_starts
151151
sshkit.ssh_options = config.ssh.options
152152
end
153+
SSHKit.config.default_runner = SSHKit::Runner::ParallelCompleteAll
153154
SSHKit.config.command_map[:docker] = "docker" # No need to use /usr/bin/env, just clogs up the logs
154155
SSHKit.config.output_verbosity = verbosity
155156
end

lib/kamal/commands/healthcheck.rb

Lines changed: 0 additions & 59 deletions
This file was deleted.

lib/kamal/configuration.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ def sshkit
188188

189189

190190
def healthcheck
191-
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "exposed_port" => 3999, "cord" => "/tmp/kamal-cord", "log_lines" => 50 }.merge(raw_config.healthcheck || {})
191+
{ "path" => "/up", "port" => 3000, "max_attempts" => 7, "cord" => "/tmp/kamal-cord", "log_lines" => 50 }.merge(raw_config.healthcheck || {})
192192
end
193193

194194
def healthcheck_service

0 commit comments

Comments
 (0)