OCPEDGE-2213: fix(podman-etcd): prevent learner from starting before cluster is ready

clobrano · clobrano · commit 99d36fa651ce · 2025-11-10T16:04:34.000+01:00
Clear the stale learner_node attribute during stop, and on restart when
no active resources exist, so that the learner always waits for peer
availability.
diff --git a/heartbeat/podman-etcd b/heartbeat/podman-etcd
@@ -1369,7 +1369,7 @@ container_health_check()
 	# Could not execute monitor check command and state file exists - the container failed, check recovery status in this lifecycle
 	local time_since_heartbeat
 	time_since_heartbeat=$(get_time_since_last_heartbeat)
-	ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago)"
+	ocf_log err "Container ${CONTAINER} failed (last healthy: ${time_since_heartbeat}s ago, error code: $rc)"
 
 	# Check if peer has set force_new_cluster for recovery
 	local fnc_holders
@@ -1796,6 +1796,9 @@ podman_start()
 				fi
 				;;
 			0)
+				# No active resources: clear any stale learner_node attribute from previous failed session
+				ocf_log debug "clearing stale learner_node attribute (safe when active_resources_count=0)"
+				attribute_learner_node clear
 				# count how many agents are starting now
 				local start_resources_count
 				start_resources_count=$(echo "$OCF_RESKEY_CRM_meta_notify_start_resource" | wc -w)
@@ -2033,6 +2036,7 @@ podman_stop()
 		ocf_log err "could not delete container health check state file"
 	fi
 
+	attribute_learner_node clear
 	attribute_node_revision update
 	attribute_node_cluster_id update