From fa1ed34d8ee8db4767e9bfc80a965ab1542cb583 Mon Sep 17 00:00:00 2001 From: Vince Prignano Date: Tue, 9 Feb 2021 09:50:39 -0800 Subject: [PATCH 1/2] :bug: Fix a race condition between leader election and recorder This change introduces better syncronization between the leader election code and the event recorder. Running tests with -race flag, we often saw a panic on a closed channel, the channel was the one that the event recorder was using internally. After digging more through the code, it seems that we weren't properly waiting for leader election code to stop completely, but instead we were only calling the cancel() function asking the leader election to stop. With this change, during a shutdown, we now wait for leader election to finish up any internal task before we return and close an internal channel. Only after leader election signals that the channel has been closed, and Run(...) has properly returned, we return execution to the stop procedure, where the event recorder is then stopped. Signed-off-by: Vince Prignano --- pkg/manager/internal.go | 15 ++++++++++++++- pkg/manager/manager.go | 1 + 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pkg/manager/internal.go b/pkg/manager/internal.go index d253b9a6ad..65b4917b70 100644 --- a/pkg/manager/internal.go +++ b/pkg/manager/internal.go @@ -117,6 +117,10 @@ type controllerManager struct { // it must be deferred until after gracefulShutdown is done. leaderElectionCancel context.CancelFunc + // leaderElectionStopped is an internal channel used to signal the stopping procedure that the + // LeaderElection.Run(...) function has returned and the shutdown can proceed. + leaderElectionStopped chan struct{} + // stop procedure engaged. In other words, we should not add anything else to the manager stopProcedureEngaged bool @@ -549,7 +553,12 @@ func (cm *controllerManager) waitForRunnableToEnd(shutdownCancel context.CancelF // Cancel leader election only after we waited. It will os.Exit() the app for safety. defer func() { if cm.leaderElectionCancel != nil { + // After asking the context to be cancelled, make sure + // we wait for the leader stopped channel to be closed, otherwise + // we might encounter race conditions between this code + // and the event recorder, which is used within leader election code. cm.leaderElectionCancel() + <-cm.leaderElectionStopped } }() @@ -652,7 +661,11 @@ func (cm *controllerManager) startLeaderElection() (err error) { } // Start the leader elector process - go l.Run(ctx) + go func() { + l.Run(ctx) + <-ctx.Done() + close(cm.leaderElectionStopped) + }() return nil } diff --git a/pkg/manager/manager.go b/pkg/manager/manager.go index 2f676ad7a0..3fd790d34e 100644 --- a/pkg/manager/manager.go +++ b/pkg/manager/manager.go @@ -350,6 +350,7 @@ func New(config *rest.Config, options Options) (Manager, error) { livenessEndpointName: options.LivenessEndpointName, gracefulShutdownTimeout: *options.GracefulShutdownTimeout, internalProceduresStop: make(chan struct{}), + leaderElectionStopped: make(chan struct{}), }, nil } From faecdc7dc7834e01f7830dee2f8547a8e53e7b73 Mon Sep 17 00:00:00 2001 From: Vince Prignano Date: Wed, 10 Feb 2021 10:10:55 -0800 Subject: [PATCH 2/2] Only cancel leader election if the runnables have shutdown Signed-off-by: Vince Prignano --- pkg/manager/internal.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/manager/internal.go b/pkg/manager/internal.go index 65b4917b70..fd92ea1cb4 100644 --- a/pkg/manager/internal.go +++ b/pkg/manager/internal.go @@ -549,10 +549,10 @@ func (cm *controllerManager) engageStopProcedure(stopComplete <-chan struct{}) e // waitForRunnableToEnd blocks until all runnables ended or the // tearDownTimeout was reached. In the latter case, an error is returned. -func (cm *controllerManager) waitForRunnableToEnd(shutdownCancel context.CancelFunc) error { +func (cm *controllerManager) waitForRunnableToEnd(shutdownCancel context.CancelFunc) (retErr error) { // Cancel leader election only after we waited. It will os.Exit() the app for safety. defer func() { - if cm.leaderElectionCancel != nil { + if retErr == nil && cm.leaderElectionCancel != nil { // After asking the context to be cancelled, make sure // we wait for the leader stopped channel to be closed, otherwise // we might encounter race conditions between this code