Skip to content

Commit 99d36fa

Browse files
committed
OCPEDGE-2213: fix(podman-etcd): prevent learner from starting before cluster is ready
Clear the stale learner_node attribute during stop, and on restart when no active resources exist, so that the learner always waits for peer availability.
1 parent 6cd23a8 commit 99d36fa

File tree

1 file changed

+5
-1
lines changed

1 file changed

+5
-1
lines changed

heartbeat/podman-etcd

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1369,7 +1369,7 @@ container_health_check()
13691369
# Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle
13701370
local time_since_heartbeat
13711371
time_since_heartbeat=$(get_time_since_last_heartbeat)
1372-
ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
1372+
ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago, error code: $rc)"
13731373

13741374
# Check if peer has set force_new_cluster for recovery
13751375
local fnc_holders
@@ -1796,6 +1796,9 @@ podman_start()
17961796
fi
17971797
;;
17981798
0)
1799+
# No active resources: clear any stale learner_node attribute left over from a previous failed session
1800+
ocf_log debug "clearing stale learner_node attribute (safe when active_resources_count=0)"
1801+
attribute_learner_node clear
17991802
# count how many agents are starting now
18001803
local start_resources_count
18011804
start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
@@ -2033,6 +2036,7 @@ podman_stop()
20332036
ocf_log err "could not delete container health check state file"
20342037
fi
20352038

2039+
attribute_learner_node clear
20362040
attribute_node_revision update
20372041
attribute_node_cluster_id update
20382042

0 commit comments

Comments
 (0)