Skip to content

Commit 78c0a0b

Browse files
committed
Don't start other roles until we have a healthy container
If a primary role container is unhealthy, it might take a while for the health check poller to time out. In the meantime, if we have started the other roles, they'll be running two containers. This could be a problem, especially if they run jobs, as that doubles the worker capacity, which could cause excessive load. We'll wait for the first primary role container to boot successfully before starting the containers from other roles.
1 parent ee758d9 commit 78c0a0b

File tree

2 files changed

+25
-20
lines changed

2 files changed

+25
-20
lines changed

lib/kamal/cli/app/boot.rb

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,19 @@ def old_version_renamed_if_clashing
3434
end
3535

3636
def start_new_version
37+
wait_at_barrier if queuer?
38+
3739
audit "Booted app version #{version}"
3840

3941
execute *app.tie_cord(role.cord_host_file) if uses_cord?
4042
hostname = "#{host.to_s[0...51].gsub(/\.+$/, '')}-#{SecureRandom.hex(6)}"
4143
execute *app.run(hostname: hostname)
4244
Kamal::Cli::Healthcheck::Poller.wait_for_healthy(pause_after_ready: true) { capture_with_info(*app.status(version: version)) }
4345

44-
reach_barrier
46+
release_barrier if gatekeeper?
4547
rescue => e
46-
if barrier_role? && barrier&.close
47-
info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})"
48-
error capture_with_info(*app.logs(version: version))
49-
error capture_with_info(*app.container_health_log(version: version))
50-
end
48+
close_barrier if gatekeeper?
49+
5150
execute *app.stop(version: version), raise_on_non_zero_exit: false
5251

5352
raise
@@ -67,19 +66,13 @@ def stop_old_version(version)
6766
execute *app.clean_up_assets if assets?
6867
end
6968

70-
def reach_barrier
71-
if barrier
72-
if barrier_role?
73-
if barrier.open
74-
info "First #{KAMAL.primary_role} container healthy, continuing other roles (#{host})"
75-
end
76-
else
77-
wait_for_barrier
78-
end
69+
def release_barrier
70+
if barrier.open
71+
info "First #{KAMAL.primary_role} container healthy, continuing other roles (#{host})"
7972
end
8073
end
8174

82-
def wait_for_barrier
75+
def wait_at_barrier
8376
info "Waiting for a healthy #{KAMAL.primary_role} container (#{host})..."
8477
barrier.wait
8578
info "First #{KAMAL.primary_role} container is healthy, continuing (#{host})"
@@ -88,6 +81,14 @@ def wait_for_barrier
8881
raise
8982
end
9083

84+
def close_barrier
85+
if barrier.close
86+
info "First #{KAMAL.primary_role} container unhealthy, stopping other roles (#{host})"
87+
error capture_with_info(*app.logs(version: version))
88+
error capture_with_info(*app.container_health_log(version: version))
89+
end
90+
end
91+
9192
def barrier_role?
9293
role == KAMAL.primary_role
9394
end
@@ -103,4 +104,12 @@ def auditor
103104
def audit(message)
104105
execute *auditor.record(message), verbosity: :debug
105106
end
107+
108+
def gatekeeper?
109+
barrier && barrier_role?
110+
end
111+
112+
def queuer?
113+
barrier && !barrier_role?
114+
end
106115
end

test/cli/app_test.rb

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,6 @@ class CliAppTest < CliTestCase
154154
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-web-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
155155
.returns("unhealthy").at_least_once # web health check failing
156156

157-
SSHKit::Backend::Abstract.any_instance.expects(:capture_with_info)
158-
.with(:docker, :container, :ls, "--all", "--filter", "name=^app-workers-latest$", "--quiet", "|", :xargs, :docker, :inspect, "--format", "'{{if .State.Health}}{{.State.Health.Status}}{{else}}{{.State.Status}}{{end}}'")
159-
.returns("running").at_least_once # workers health check passing
160-
161157
stderred do
162158
run_command("boot", config: :with_roles, host: nil, allow_execute_error: true).tap do |output|
163159
assert_match "Waiting for a healthy web container (1.1.1.3)...", output

0 commit comments

Comments
 (0)