Skip to content

Fix controller-manager failures causing CNS to remain incorrectly healthy #3688

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
173 changes: 115 additions & 58 deletions cns/service/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -1257,30 +1257,56 @@ func InitializeMultiTenantController(ctx context.Context, httpRestService cns.HT
return err
}

// Wait for multiTenantController to start.
go func() {
for {
if err := multiTenantController.Start(ctx); err != nil {
logger.Errorf("Failed to start multiTenantController: %v", err)
} else {
logger.Printf("Exiting multiTenantController")
return
}
// Set a timeout for the multiTenantController to start
controllerStartTimeout := 5 * time.Minute

// Create a channel to receive start errors
startErrCh := make(chan error, 1)

// Start the controller in a goroutine
startCtx, startCancel := context.WithTimeout(ctx, controllerStartTimeout)
defer startCancel()

// Retry after 1sec
time.Sleep(time.Second)
go func() {
if err := multiTenantController.Start(ctx); err != nil {
startErrCh <- err
} else {
logger.Printf("MultiTenantController stopped normally")
startErrCh <- nil
}
}()

// Wait for the controller to report it has started
ticker := time.NewTicker(500 * time.Millisecond)
defer ticker.Stop()

for {
if multiTenantController.IsStarted() {
logger.Printf("MultiTenantController is started")
break
select {
case err := <-startErrCh:
if err != nil {
logger.Errorf("Failed to start multiTenantController: %v", err)
return fmt.Errorf("failed to start multiTenantController: %w", err)
}
// Controller exited normally
return nil
case <-ticker.C:
if multiTenantController.IsStarted() {
logger.Printf("MultiTenantController is started")
// Start the sync loop
initSyncHostNCVersion(ctx, httpRestServiceImpl, cnsconfig)
return nil
}
logger.Printf("Waiting for multiTenantController to start...")
case <-startCtx.Done():
logger.Errorf("Timed out waiting for multiTenantController to start after %v", controllerStartTimeout)
multitenantControllerStartTimeouts.Inc()
return fmt.Errorf("timed out waiting for multiTenantController to start")
}

logger.Printf("Waiting for multiTenantController to start...")
time.Sleep(time.Millisecond * 500)
}
}

// initSyncHostNCVersion starts a goroutine to periodically poll and sync NC versions
func initSyncHostNCVersion(ctx context.Context, httpRestServiceImpl *restserver.HTTPRestService, cnsconfig configuration.CNSConfig) {
// TODO: do we need this to be running?
logger.Printf("Starting SyncHostNCVersion")
go func() {
Expand All @@ -1297,8 +1323,6 @@ func InitializeMultiTenantController(ctx context.Context, httpRestService cns.HT
}
}
}()

return nil
}

type nodeNetworkConfigGetter interface {
Expand Down Expand Up @@ -1609,56 +1633,89 @@ func InitializeCRDState(ctx context.Context, httpRestService cns.HTTPService, cn
// Start the Manager which starts the reconcile loop.
// The Reconciler will send an initial NodeNetworkConfig update to the PoolMonitor, starting the
// Monitor's internal loop.
managerStartTimeout := 5 * time.Minute
startManagerCtx, startManagerCancel := context.WithTimeout(ctx, managerStartTimeout)
defer startManagerCancel()
managerErrCh := make(chan error, 1)

go func() {
logger.Printf("Starting controller-manager.")
for {
if err := manager.Start(ctx); err != nil {
logger.Errorf("Failed to start controller-manager: %v", err)
// retry to start the request controller
// inc the managerStartFailures metric for failure tracking
managerStartFailures.Inc()
} else {
logger.Printf("Stopped controller-manager.")
if err := manager.Start(startManagerCtx); err != nil {
if errors.Is(err, context.DeadlineExceeded) || startManagerCtx.Err() == context.DeadlineExceeded {
logger.Errorf("Timed out waiting for controller-manager to start after %v", managerStartTimeout)
managerStartTimeouts.Inc()
managerErrCh <- fmt.Errorf("controller-manager start timed out: %w", err)
return
}
time.Sleep(time.Second) // TODO(rbtr): make this exponential backoff

logger.Errorf("Failed to start controller-manager: %v", err)
managerStartFailures.Inc()
managerErrCh <- fmt.Errorf("controller-manager failed to start: %w", err)
return
}

logger.Printf("Stopped controller-manager.")
managerErrCh <- nil
}()
logger.Printf("Initialized controller-manager.")
for {
logger.Printf("Waiting for NodeNetworkConfig reconciler to start.")
// wait for the Reconciler to run once on a NNC that was made for this Node.
// the nncReadyCtx has a timeout of 15 minutes, after which we will consider
// this false and the NNC Reconciler stuck/failed, log and retry.
nncReadyCtx, cancel := context.WithTimeout(ctx, 15*time.Minute) // nolint // it will time out and not leak
if started, err := nncReconciler.Started(nncReadyCtx); !started {
logger.Errorf("NNC reconciler has not started, does the NNC exist? err: %v", err)
nncReconcilerStartFailures.Inc()
continue
}
logger.Printf("NodeNetworkConfig reconciler has started.")
cancel()
break
}


// Wait for the NodeNetworkConfig reconciler to start or the manager to fail
nncReconcilerStartCtx, nncReconcilerStartCancel := context.WithCancel(ctx)
defer nncReconcilerStartCancel()

go func() {
logger.Printf("Starting SyncHostNCVersion loop.")
// Periodically poll vfp programmed NC version from NMAgent
tickerChannel := time.Tick(time.Duration(cnsconfig.SyncHostNCVersionIntervalMs) * time.Millisecond)
for {
select {
case <-tickerChannel:
timedCtx, cancel := context.WithTimeout(ctx, time.Duration(cnsconfig.SyncHostNCVersionIntervalMs)*time.Millisecond)
httpRestServiceImplementation.SyncHostNCVersion(timedCtx, cnsconfig.ChannelMode)
cancel()
case <-ctx.Done():
logger.Printf("Stopping SyncHostNCVersion loop.")
// Check if we get an error from the manager
select {
case err := <-managerErrCh:
if err != nil {
logger.Errorf("Controller manager error: %v", err)
rootErrCh <- err
return
}
case <-nncReconcilerStartCtx.Done():
// The NNC reconciler started successfully, so we can exit this goroutine
return
}
}()
logger.Printf("Initialized SyncHostNCVersion loop.")
return nil
logger.Printf("Initialized controller-manager.")

// Wait for the Reconciler to run once on a NNC that was made for this Node
nncReadyTimeout := 15 * time.Minute
waitStartTime := time.Now()

for {
select {
case err := <-managerErrCh:
if err != nil {
return fmt.Errorf("controller-manager failed: %w", err)
}
logger.Printf("Controller-manager exited normally")
return nil
default:
// Check if NNC reconciler has started
checkCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
started, err := nncReconciler.Started(checkCtx)
cancel()

if started {
logger.Printf("NodeNetworkConfig reconciler has started.")
nncReconcilerStartCancel() // Signal the manager error watcher that NNC started successfully

// Start the sync host NC version goroutine
initSyncHostNncVersion(ctx, httpRestServiceImplementation, cnsconfig)
return nil
}

// Check if we've waited too long
if time.Since(waitStartTime) > nncReadyTimeout {
logger.Errorf("NNC reconciler has not started after %v, does the NNC exist? err: %v", nncReadyTimeout, err)
nncReconcilerStartFailures.Inc()
return fmt.Errorf("timed out waiting for NNC reconciler to start: %w", err)
}

logger.Printf("Waiting for NodeNetworkConfig reconciler to start.")
time.Sleep(3 * time.Second)
}
}
}

// getPodInfoByIPProvider returns a PodInfoByIPProvider that reads endpoint state from the configured source
Expand Down
24 changes: 24 additions & 0 deletions cns/service/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,28 @@ var managerStartFailures = prometheus.NewCounter(
},
)

// managerStartTimeouts is a monotonic counter which tracks the number of times the controller-runtime
// manager timed out during startup. To drive alerting based on this metric, it is recommended to use the rate
// of increase over a period of time. A positive rate of change indicates that the CNS is actively
// failing with timeouts.
var managerStartTimeouts = prometheus.NewCounter(
	prometheus.CounterOpts{
		Name: "cns_ctrlmanager_start_timeouts_total",
		Help: "Number of times the controller-runtime manager timed out during startup.",
	},
)

// multitenantControllerStartTimeouts is a monotonic counter which tracks the number of times the multitenant
// controller timed out during startup. To drive alerting based on this metric, it is recommended to use the rate
// of increase over a period of time. A positive rate of change indicates that the CNS is actively
// failing with timeouts.
var multitenantControllerStartTimeouts = prometheus.NewCounter(
	prometheus.CounterOpts{
		Name: "cns_multitenantcontroller_start_timeouts_total",
		Help: "Number of times the multitenant controller timed out during startup.",
	},
)

// nncReconcilerStartFailures is a monotonic counter which tracks the number of times the NNC reconciler
// has failed to start within the timeout period. To drive alerting based on this metric, it is
// recommended to use the rate of increase over a period of time. A positive rate of change indicates
Expand All @@ -30,6 +52,8 @@ var nncReconcilerStartFailures = prometheus.NewCounter(
// init registers this package's Prometheus counters with the shared metrics
// registry. MustRegister panics on duplicate registration, which surfaces
// accidental double-registration at startup rather than silently dropping metrics.
func init() {
	metrics.Registry.MustRegister(
		managerStartFailures,
		managerStartTimeouts,
		multitenantControllerStartTimeouts,
		nncReconcilerStartFailures,
	)
}
31 changes: 31 additions & 0 deletions cns/service/synchost.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
package main

import (
"context"
"time"

"github.com/Azure/azure-container-networking/cns/configuration"
"github.com/Azure/azure-container-networking/cns/logger"
"github.com/Azure/azure-container-networking/cns/restserver"
)

// initSyncHostNncVersion starts a background goroutine which periodically calls
// SyncHostNCVersion on the rest service to poll the VFP-programmed NC version
// from NMAgent and sync it into CNS state.
//
// The goroutine runs until ctx is cancelled. The polling period comes from
// cnsconfig.SyncHostNCVersionIntervalMs; each individual sync call is bounded
// by the same duration so a slow poll cannot pile up behind the next tick.
func initSyncHostNncVersion(ctx context.Context, httpRestServiceImplementation *restserver.HTTPRestService, cnsconfig *configuration.CNSConfig) {
	logger.Printf("Starting SyncHostNCVersion loop.")
	interval := time.Duration(cnsconfig.SyncHostNCVersionIntervalMs) * time.Millisecond
	go func() {
		// Use an explicit Ticker instead of time.Tick so the ticker is stopped
		// when the goroutine exits on ctx cancellation (time.Tick's ticker can
		// never be stopped and would leak).
		//
		// time.NewTicker panics on a non-positive duration, whereas time.Tick
		// returns a nil channel (polling disabled); preserve that behavior by
		// leaving tickC nil when the configured interval is not positive.
		var tickC <-chan time.Time
		if interval > 0 {
			ticker := time.NewTicker(interval)
			defer ticker.Stop()
			tickC = ticker.C
		}
		for {
			select {
			case <-tickC:
				timedCtx, cancel := context.WithTimeout(ctx, interval)
				httpRestServiceImplementation.SyncHostNCVersion(timedCtx, cnsconfig.ChannelMode)
				cancel()
			case <-ctx.Done():
				logger.Printf("Stopping SyncHostNCVersion loop.")
				return
			}
		}
	}()
	logger.Printf("Initialized SyncHostNCVersion loop.")
}