Skip to content

Commit 117d4aa

Browse files
committed
Remove the healthcheck step
To speed up deployments, we'll remove the healthcheck step. This adds some risk to deployments for non-web roles - if they don't have a Docker healthcheck configured then the only check we do is if the container is running. If there is a bad image we might see the container running before it exits and deploy it. Previously the healthcheck step would have avoided this by ensuring a web container could boot and serve traffic first. To mitigate this, we'll add a web barrier. Non-web containers will wait before shutting down the old containers until at least one web container has passed its healthcheck. If the web container fails its healthcheck, we'll close the barrier and shut down the new containers on the non-web roles. We also have a new integration test to check we correctly handle a broken image. This highlighted that SSHKit's default runner will stop at the first error it encounters. We'll now have a custom runner that waits for all threads to finish, allowing them to clean up. Finally, we only tag an image as the latest after we have successfully started the container and passed the web barrier, if applicable. That means that if we have a deployment that completes on some hosts but not others we can run `kamal app version --quiet` to see which version is running on each host.
1 parent 786454f commit 117d4aa

26 files changed

+276
-335
lines changed

lib/kamal/cli/app.rb

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,18 @@ def boot
1010
# Assets are prepared in a separate step to ensure they are on all hosts before booting
1111
on(KAMAL.hosts) do
1212
execute *KAMAL.auditor.record("Tagging #{KAMAL.config.absolute_image} as the latest image"), verbosity: :debug
13-
execute *KAMAL.app.tag_current_image_as_latest
1413

1514
KAMAL.roles_on(host).each do |role|
1615
Kamal::Cli::App::PrepareAssets.new(host, role, self).run
1716
end
1817
end
1918

19+
web_barrier = Kamal::Cli::Healthcheck::Barrier.new if web_and_non_web_roles?
20+
2021
on(KAMAL.hosts, **KAMAL.boot_strategy) do |host|
22+
# Ensure web roles are booted first to allow the web barrier to be opened
2123
KAMAL.roles_on(host).each do |role|
22-
Kamal::Cli::App::Boot.new(host, role, version, self).run
24+
Kamal::Cli::App::Boot.new(host, role, version, web_barrier, self).run
2325
end
2426
end
2527
end
@@ -286,4 +288,8 @@ def stale_versions(host:, role:)
286288
def version_or_latest
287289
options[:version] || "latest"
288290
end
291+
292+
def web_and_non_web_roles?
293+
KAMAL.roles.any?(&:running_traefik?) && !KAMAL.roles.all?(&:running_traefik?)
294+
end
289295
end

lib/kamal/cli/app/boot.rb

Lines changed: 57 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
class Kamal::Cli::App::Boot
2-
attr_reader :host, :role, :version, :sshkit
2+
attr_reader :host, :role, :version, :web_barrier, :sshkit
33
delegate :execute, :capture_with_info, :info, to: :sshkit
4-
delegate :uses_cord?, :assets?, to: :role
4+
delegate :uses_cord?, :assets?, :running_traefik?, to: :role
55

6-
def initialize(host, role, version, sshkit)
6+
def initialize(host, role, version, web_barrier, sshkit)
77
@host = host
88
@role = role
99
@version = version
10+
@web_barrier = web_barrier
1011
@sshkit = sshkit
1112
end
1213

@@ -15,24 +16,14 @@ def run
1516

1617
start_new_version
1718

19+
tag_current_image_as_latest
20+
1821
if old_version
1922
stop_old_version(old_version)
2023
end
2124
end
2225

2326
private
24-
def app
25-
@app ||= KAMAL.app(role: role)
26-
end
27-
28-
def auditor
29-
@auditor = KAMAL.auditor(role: role)
30-
end
31-
32-
def audit(message)
33-
execute *auditor.record(message), verbosity: :debug
34-
end
35-
3627
def old_version_renamed_if_clashing
3728
if capture_with_info(*app.container_id_for_version(version), raise_on_non_zero_exit: false).present?
3829
renamed_version = "#{version}_replaced_#{SecureRandom.hex(8)}"
@@ -46,9 +37,23 @@ def old_version_renamed_if_clashing
4637

4738
def start_new_version
4839
audit "Booted app version #{version}"
40+
4941
execute *app.tie_cord(role.cord_host_file) if uses_cord?
5042
execute *app.run(hostname: "#{host}-#{SecureRandom.hex(6)}")
43+
5144
Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
45+
46+
reach_web_barrier
47+
rescue => e
48+
close_web_barrier if running_traefik?
49+
execute *app.stop(version: version), raise_on_non_zero_exit: false
50+
51+
raise
52+
end
53+
54+
def tag_current_image_as_latest
55+
execute *KAMAL.auditor.record("Tagging #{KAMAL.config.absolute_image} as the latest image"), verbosity: :debug
56+
execute *KAMAL.app.tag_current_image_as_latest
5257
end
5358

5459
def stop_old_version(version)
@@ -64,4 +69,41 @@ def stop_old_version(version)
6469

6570
execute *app.clean_up_assets if assets?
6671
end
72+
73+
def reach_web_barrier
74+
if web_barrier
75+
if running_traefik?
76+
web_barrier.open
77+
else
78+
wait_for_web_barrier
79+
end
80+
end
81+
end
82+
83+
def wait_for_web_barrier
84+
info "Waiting at web barrier (#{host})..."
85+
web_barrier.wait
86+
info "Barrier opened (#{host})"
87+
rescue Kamal::Cli::Healthcheck::Error
88+
info "Barrier closed, shutting down new container... (#{host})"
89+
raise
90+
end
91+
92+
def close_web_barrier
93+
if web_barrier
94+
web_barrier.close
95+
end
96+
end
97+
98+
def app
99+
@app ||= KAMAL.app(role: role)
100+
end
101+
102+
def auditor
103+
@auditor = KAMAL.auditor(role: role)
104+
end
105+
106+
def audit(message)
107+
execute *auditor.record(message), verbosity: :debug
108+
end
67109
end

lib/kamal/cli/healthcheck.rb

Lines changed: 0 additions & 21 deletions
This file was deleted.
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
class Kamal::Cli::Healthcheck::Barrier
2+
def initialize
3+
@ivar = Concurrent::IVar.new
4+
end
5+
6+
def close
7+
set(false)
8+
end
9+
10+
def open
11+
set(true)
12+
end
13+
14+
def wait
15+
unless opened?
16+
raise Kamal::Cli::Healthcheck::Error.new("Halted at barrier")
17+
end
18+
end
19+
20+
private
21+
def opened?
22+
@ivar.value
23+
end
24+
25+
def set(value)
26+
@ivar.set(value)
27+
rescue Concurrent::MultipleAssignmentError
28+
end
29+
end

lib/kamal/cli/healthcheck/error.rb

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
class Kamal::Cli::Healthcheck::Error < StandardError
2+
end

lib/kamal/cli/healthcheck/poller.rb

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@ module Kamal::Cli::Healthcheck::Poller
33

44
TRAEFIK_UPDATE_DELAY = 5
55

6-
class HealthcheckError < StandardError; end
76

87
def wait_for_healthy(pause_after_ready: false, &block)
98
attempt = 1
@@ -16,9 +15,9 @@ def wait_for_healthy(pause_after_ready: false, &block)
1615
when "running" # No health check configured
1716
sleep KAMAL.config.readiness_delay if pause_after_ready
1817
else
19-
raise HealthcheckError, "container not ready (#{status})"
18+
raise Kamal::Cli::Healthcheck::Error, "container not ready (#{status})"
2019
end
21-
rescue HealthcheckError => e
20+
rescue Kamal::Cli::Healthcheck::Error => e
2221
if attempt <= max_attempts
2322
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
2423
sleep attempt
@@ -41,9 +40,9 @@ def wait_for_unhealthy(pause_after_ready: false, &block)
4140
when "unhealthy"
4241
sleep TRAEFIK_UPDATE_DELAY if pause_after_ready
4342
else
44-
raise HealthcheckError, "container not unhealthy (#{status})"
43+
raise Kamal::Cli::Healthcheck::Error, "container not unhealthy (#{status})"
4544
end
46-
rescue HealthcheckError => e
45+
rescue Kamal::Cli::Healthcheck::Error => e
4746
if attempt <= max_attempts
4847
info "#{e.message}, retrying in #{attempt}s (attempt #{attempt}/#{max_attempts})..."
4948
sleep attempt

lib/kamal/cli/main.rb

Lines changed: 0 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -41,11 +41,6 @@ def deploy
4141
say "Ensure Traefik is running...", :magenta
4242
invoke "kamal:cli:traefik:boot", [], invoke_options
4343

44-
if KAMAL.config.role(KAMAL.config.primary_role).running_traefik?
45-
say "Ensure app can pass healthcheck...", :magenta
46-
invoke "kamal:cli:healthcheck:perform", [], invoke_options
47-
end
48-
4944
say "Detect stale containers...", :magenta
5045
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
5146

@@ -76,9 +71,6 @@ def redeploy
7671

7772
run_hook "pre-deploy"
7873

79-
say "Ensure app can pass healthcheck...", :magenta
80-
invoke "kamal:cli:healthcheck:perform", [], invoke_options
81-
8274
say "Detect stale containers...", :magenta
8375
invoke "kamal:cli:app:stale_containers", [], invoke_options.merge(stop: true)
8476

@@ -223,9 +215,6 @@ def version
223215
desc "env", "Manage environment files"
224216
subcommand "env", Kamal::Cli::Env
225217

226-
desc "healthcheck", "Healthcheck application"
227-
subcommand "healthcheck", Kamal::Cli::Healthcheck
228-
229218
desc "lock", "Manage the deploy lock"
230219
subcommand "lock", Kamal::Cli::Lock
231220

lib/kamal/commander.rb

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,9 @@ def primary_role
6161
end
6262

6363
def roles
64-
(specific_roles || config.roles).select do |role|
65-
((specific_hosts || config.all_hosts) & role.hosts).any?
66-
end
64+
(specific_roles || config.roles) \
65+
.select { |role| ((specific_hosts || config.all_hosts) & role.hosts).any? }
66+
.sort_by { |role| role.running_traefik? ? 0 : 1 }
6767
end
6868

6969
def hosts
@@ -178,6 +178,7 @@ def configure_sshkit_with(config)
178178
sshkit.max_concurrent_starts = config.sshkit.max_concurrent_starts
179179
sshkit.ssh_options = config.ssh.options
180180
end
181+
SSHKit.config.default_runner = SSHKit::Runner::ParallelCompleteAll
181182
SSHKit.config.command_map[:docker] = "docker" # No need to use /usr/bin/env, just clogs up the logs
182183
SSHKit.config.output_verbosity = verbosity
183184
end

lib/kamal/commands/app/assets.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ def extract_assets
55
combine \
66
make_directory(role.asset_extracted_path),
77
[ *docker(:stop, "-t 1", asset_container, "2> /dev/null"), "|| true" ],
8-
docker(:run, "--name", asset_container, "--detach", "--rm", config.latest_image, "sleep 1000000"),
8+
docker(:run, "--name", asset_container, "--detach", "--rm", config.absolute_image, "sleep 1000000"),
99
docker(:cp, "-L", "#{asset_container}:#{role.asset_path}/.", role.asset_extracted_path),
1010
docker(:stop, "-t 1", asset_container),
1111
by: "&&"

lib/kamal/commands/healthcheck.rb

Lines changed: 0 additions & 59 deletions
This file was deleted.

0 commit comments

Comments
 (0)