Skip to content

Commit 3641911

Browse files
committed
Remove the inner goroutine in cluster.Switchover and resolve the race between processPodEvent and unregisterPodSubscriber
1 parent a77d5df commit 3641911

File tree

4 files changed

+28
-44
lines changed

4 files changed

+28
-44
lines changed

pkg/cluster/cluster.go

Lines changed: 6 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1034,10 +1034,10 @@ func (c *Cluster) processPodEvent(obj interface{}) error {
10341034

10351035
c.podSubscribersMu.RLock()
10361036
subscriber, ok := c.podSubscribers[spec.NamespacedName(event.PodName)]
1037-
c.podSubscribersMu.RUnlock()
10381037
if ok {
10391038
subscriber <- event
10401039
}
1040+
c.podSubscribersMu.RUnlock()
10411041

10421042
return nil
10431043
}
@@ -1501,49 +1501,23 @@ func (c *Cluster) Switchover(curMaster *v1.Pod, candidate spec.NamespacedName) e
15011501
var err error
15021502
c.logger.Debugf("switching over from %q to %q", curMaster.Name, candidate)
15031503
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switching over from %q to %q", curMaster.Name, candidate)
1504-
1505-
var wg sync.WaitGroup
1506-
1507-
podLabelErr := make(chan error)
15081504
stopCh := make(chan struct{})
1509-
1510-
wg.Add(1)
1511-
1512-
go func() {
1513-
defer wg.Done()
1514-
ch := c.registerPodSubscriber(candidate)
1515-
defer c.unregisterPodSubscriber(candidate)
1516-
1517-
role := Master
1518-
1519-
select {
1520-
case <-stopCh:
1521-
case podLabelErr <- func() (err2 error) {
1522-
_, err2 = c.waitForPodLabel(ch, stopCh, &role)
1523-
return
1524-
}():
1525-
}
1526-
}()
1505+
ch := c.registerPodSubscriber(candidate)
1506+
defer c.unregisterPodSubscriber(candidate)
1507+
defer close(stopCh)
15271508

15281509
if err = c.patroni.Switchover(curMaster, candidate.Name); err == nil {
15291510
c.logger.Debugf("successfully switched over from %q to %q", curMaster.Name, candidate)
15301511
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Successfully switched over from %q to %q", curMaster.Name, candidate)
1531-
if err = <-podLabelErr; err != nil {
1512+
_, err = c.waitForPodLabel(ch, stopCh, nil)
1513+
if err != nil {
15321514
err = fmt.Errorf("could not get master pod label: %v", err)
15331515
}
15341516
} else {
15351517
err = fmt.Errorf("could not switch over from %q to %q: %v", curMaster.Name, candidate, err)
15361518
c.eventRecorder.Eventf(c.GetReference(), v1.EventTypeNormal, "Switchover", "Switchover from %q to %q FAILED: %v", curMaster.Name, candidate, err)
15371519
}
15381520

1539-
// signal the role label waiting goroutine to close the shop and go home
1540-
close(stopCh)
1541-
// wait until the goroutine terminates, since unregisterPodSubscriber
1542-
// must be called before the outer return; otherwise we risk subscribing to the same pod twice.
1543-
wg.Wait()
1544-
// close the label waiting channel no sooner than the waiting goroutine terminates.
1545-
close(podLabelErr)
1546-
15471521
return err
15481522
}
15491523

pkg/cluster/pod.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ func (c *Cluster) markRollingUpdateFlagForPod(pod *v1.Pod, msg string) error {
6767
return fmt.Errorf("could not form patch for pod's rolling update flag: %v", err)
6868
}
6969

70-
err = retryutil.Retry(c.OpConfig.PatroniAPICheckInterval, c.OpConfig.PatroniAPICheckTimeout,
70+
err = retryutil.Retry(c.OpConfig.ResourceCheckInterval, c.OpConfig.ResourceCheckTimeout,
7171
func() (bool, error) {
7272
_, err2 := c.KubeClient.Pods(pod.Namespace).Patch(
7373
context.TODO(),
@@ -149,14 +149,14 @@ func (c *Cluster) deletePod(podName spec.NamespacedName) error {
149149
func (c *Cluster) unregisterPodSubscriber(podName spec.NamespacedName) {
150150
c.logger.Debugf("unsubscribing from pod %q events", podName)
151151
c.podSubscribersMu.Lock()
152-
defer c.podSubscribersMu.Unlock()
153-
154-
if _, ok := c.podSubscribers[podName]; !ok {
152+
ch, ok := c.podSubscribers[podName]
153+
if !ok {
155154
panic("subscriber for pod '" + podName.String() + "' is not found")
156155
}
157156

158-
close(c.podSubscribers[podName])
159157
delete(c.podSubscribers, podName)
158+
c.podSubscribersMu.Unlock()
159+
close(ch)
160160
}
161161

162162
func (c *Cluster) registerPodSubscriber(podName spec.NamespacedName) chan PodEvent {
@@ -399,11 +399,12 @@ func (c *Cluster) getPatroniMemberData(pod *v1.Pod) (patroni.MemberData, error)
399399
}
400400

401401
func (c *Cluster) recreatePod(podName spec.NamespacedName) (*v1.Pod, error) {
402+
stopCh := make(chan struct{})
402403
ch := c.registerPodSubscriber(podName)
403404
defer c.unregisterPodSubscriber(podName)
404-
stopChan := make(chan struct{})
405+
defer close(stopCh)
405406

406-
err := retryutil.Retry(c.OpConfig.PatroniAPICheckInterval, c.OpConfig.PatroniAPICheckTimeout,
407+
err := retryutil.Retry(c.OpConfig.ResourceCheckInterval, c.OpConfig.PodDeletionWaitTimeout,
407408
func() (bool, error) {
408409
err2 := c.KubeClient.Pods(podName.Namespace).Delete(
409410
context.TODO(),
@@ -421,7 +422,7 @@ func (c *Cluster) recreatePod(podName spec.NamespacedName) (*v1.Pod, error) {
421422
if err := c.waitForPodDeletion(ch); err != nil {
422423
return nil, err
423424
}
424-
pod, err := c.waitForPodLabel(ch, stopChan, nil)
425+
pod, err := c.waitForPodLabel(ch, stopCh, nil)
425426
if err != nil {
426427
return nil, err
427428
}
@@ -446,7 +447,7 @@ func (c *Cluster) recreatePods(pods []v1.Pod, switchoverCandidates []spec.Namesp
446447
continue
447448
}
448449

449-
podName := util.NameFromMeta(pod.ObjectMeta)
450+
podName := util.NameFromMeta(pods[i].ObjectMeta)
450451
newPod, err := c.recreatePod(podName)
451452
if err != nil {
452453
return fmt.Errorf("could not recreate replica pod %q: %v", util.NameFromMeta(pod.ObjectMeta), err)

pkg/controller/controller.go

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -451,17 +451,18 @@ func (c *Controller) Run(stopCh <-chan struct{}, wg *sync.WaitGroup) {
451451
panic("could not acquire initial list of clusters")
452452
}
453453

454-
wg.Add(5)
454+
wg.Add(5 + util.Bool2Int(c.opConfig.EnablePostgresTeamCRD))
455455
go c.runPodInformer(stopCh, wg)
456456
go c.runPostgresqlInformer(stopCh, wg)
457457
go c.clusterResync(stopCh, wg)
458-
go c.apiserver.Run(stopCh, wg)
459458
go c.kubeNodesInformer(stopCh, wg)
460459

461460
if c.opConfig.EnablePostgresTeamCRD {
462461
go c.runPostgresTeamInformer(stopCh, wg)
463462
}
464463

464+
go c.apiserver.Run(stopCh, wg)
465+
465466
c.logger.Info("started working in background")
466467
}
467468

pkg/util/util.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -324,12 +324,20 @@ func testNil(values ...*int32) bool {
324324
return false
325325
}
326326

327-
// Convert int to IntOrString type
327+
// ToIntStr converts int to IntOrString type
328328
func ToIntStr(val int) *intstr.IntOrString {
329329
b := intstr.FromInt(val)
330330
return &b
331331
}
332332

333+
// Bool2Int converts bool to int
334+
func Bool2Int(flag bool) int {
335+
if flag {
336+
return 1
337+
}
338+
return 0
339+
}
340+
333341
// Get int from IntOrString and return max int if string
334342
func IntFromIntStr(intOrStr intstr.IntOrString) int {
335343
if intOrStr.Type == 1 {

0 commit comments

Comments
 (0)