fix(agent): Improve membership client stability with reconnection and test fixes #158
```diff
@@ -208,15 +208,84 @@ func RetrieveModuleList(ctx context.Context, config *rest.Config) (modules, eeMo
 func runMembershipClient(lc fx.Lifecycle, debug bool, membershipClient *membershipClient, logger logging.Logger, config *rest.Config) {
 	lc.Append(fx.Hook{
 		OnStart: func(ctx context.Context) error {
-			client, err := membershipClient.connect(logging.ContextWithLogger(ctx, logger))
-			if err != nil {
-				return err
-			}
-			clientWithTrace := grpcclient.NewConnectionWithTrace(client, debug)
+			// Create a background context for the client that won't be cancelled when startup completes
+			clientCtx := logging.ContextWithLogger(context.Background(), logger)
```
**Contributor** commented on lines +211 to 213:

**Make `clientCtx` cancelable; cancel it on `OnStop` to avoid a goroutine leak**

`clientCtx` is based on `context.Background()` and is never cancelled, so the loop can outlive the app. Use `context.WithCancel` and cancel in `OnStop` (assign to the outer `cancel` with `=`, not `:=`, which would shadow it). Apply this diff:

```diff
 func runMembershipClient(lc fx.Lifecycle, debug bool, membershipClient *membershipClient, logger logging.Logger, config *rest.Config) {
-	lc.Append(fx.Hook{
+	var cancel context.CancelFunc
+	lc.Append(fx.Hook{
 		OnStart: func(ctx context.Context) error {
-			// Create a background context for the client that won't be cancelled when startup completes
-			clientCtx := logging.ContextWithLogger(context.Background(), logger)
+			// Create a background context for the client that can be cancelled on shutdown
+			var clientCtx context.Context
+			clientCtx, cancel = context.WithCancel(logging.ContextWithLogger(context.Background(), logger))
 			go func() {
 				const (
```

and in the hook registration:

```diff
-		OnStop: membershipClient.Stop,
+		OnStop: func(ctx context.Context) error {
+			if cancel != nil {
+				cancel()
+			}
+			return membershipClient.Stop(ctx)
+		},
```

Also applies to: 293-294
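For context, here is a minimal, self-contained sketch of the cancel-on-`OnStop` pattern the comment describes, assuming `go.uber.org/fx`; the ticking loop is a stand-in for the membership client, not the PR's actual code:

```go
package main

import (
	"context"
	"fmt"
	"time"

	"go.uber.org/fx"
)

func main() {
	var cancel context.CancelFunc

	app := fx.New(
		fx.Invoke(func(lc fx.Lifecycle) {
			lc.Append(fx.Hook{
				OnStart: func(ctx context.Context) error {
					// Background context for the loop, cancellable on shutdown.
					var loopCtx context.Context
					loopCtx, cancel = context.WithCancel(context.Background())
					go func() {
						for {
							select {
							case <-loopCtx.Done():
								fmt.Println("background loop stopped")
								return
							case <-time.After(200 * time.Millisecond):
								fmt.Println("tick")
							}
						}
					}()
					return nil
				},
				OnStop: func(ctx context.Context) error {
					if cancel != nil {
						cancel() // stop the loop so it cannot outlive the app
					}
					return nil
				},
			})
		}),
	)

	_ = app.Start(context.Background())
	time.Sleep(500 * time.Millisecond)
	_ = app.Stop(context.Background()) // OnStop cancels; the goroutine exits
}
```

Without the `cancel()` in `OnStop`, the goroutine here would keep ticking after `app.Stop` returns, which is exactly the leak the reviewer is flagging.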
```diff
 			go func() {
-				if err := membershipClient.Start(logging.ContextWithLogger(ctx, logger), clientWithTrace); err != nil {
-					panic(err)
+				const (
+					maxRetries    = 10
+					baseDelay     = 1 * time.Second
+					maxDelay      = 2 * time.Minute
+					backoffFactor = 2.0
+				)
```
**Contributor** commented on lines +215 to +221:

**Backoff helpers: add exponential backoff with jitter and a cancellable sleep**

The current delay is linear and lacks jitter. Add these helpers once and reuse them below. Apply this diff right after the `const` block:

```diff
 				const (
 					maxRetries    = 10
 					baseDelay     = 1 * time.Second
 					maxDelay      = 2 * time.Minute
 					backoffFactor = 2.0
 				)
+				// Helpers: exponential backoff with jitter + cancellable sleep
+				backoffDelay := func(retries int) time.Duration {
+					if retries < 1 {
+						retries = 1
+					}
+					d := time.Duration(float64(baseDelay) * math.Pow(backoffFactor, float64(retries-1)))
+					if d > maxDelay {
+						d = maxDelay
+					}
+					// ~50% jitter in [0.5x, 1.5x)
+					j := 0.5 + rand.Float64()
+					return time.Duration(float64(d) * j)
+				}
+				sleepWithContext := func(ctx context.Context, d time.Duration) bool {
+					t := time.NewTimer(d)
+					defer t.Stop()
+					select {
+					case <-ctx.Done():
+						return false
+					case <-t.C:
+						return true
+					}
+				}
```

Also add the imports:

```diff
 import (
 	"context"
+	"math"
+	"math/rand"
 	"reflect"
 	"sort"
 	"time"
 )
```
```diff
+				retries := 0
+				for {
+					select {
+					case <-clientCtx.Done():
+						return
+					default:
+					}
+
+					client, err := membershipClient.connect(clientCtx)
+					if err != nil {
+						logger.Errorf("Failed to connect to membership server: %s", err)
+
+						retries++
+						if retries >= maxRetries {
+							logger.Errorf("Max retries (%d) reached, giving up", maxRetries)
+							panic(errors.New("failed to connect to membership server after max retries"))
+						}
```
**Contributor** commented on lines +235 to +238:

**Avoid panicking in the goroutine; exit gracefully (or trigger app shutdown)**

A panic here will crash the process from a background goroutine. Prefer a graceful exit; optionally request an fx shutdown. Apply this diff:

```diff
-							logger.Errorf("Max retries (%d) reached, giving up", maxRetries)
-							panic(errors.New("failed to connect to membership server after max retries"))
+							logger.Errorf("Max retries (%d) reached, stopping membership client loop", maxRetries)
+							return
```

```diff
-							logger.Errorf("Max retries (%d) reached after connection loss, giving up", maxRetries)
-							panic(errors.New("failed to reconnect to membership server after max retries"))
+							logger.Errorf("Max retries (%d) reached after connection loss, stopping membership client loop", maxRetries)
+							return
```

Optional (recommended): inject `fx.Shutdowner` and call `shutdowner.Shutdown()` here to terminate the app cleanly instead of returning.

Also applies to: 271-274
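As a sketch of that optional `fx.Shutdowner` suggestion, assuming `go.uber.org/fx`; `fakeConnect` and the retry count here are made up to force the failure path, not taken from the PR:

```go
package main

import (
	"context"
	"errors"
	"fmt"

	"go.uber.org/fx"
)

// fakeConnect always fails, simulating a membership server that never comes up.
func fakeConnect() error { return errors.New("connection refused") }

func main() {
	app := fx.New(
		fx.Invoke(func(lc fx.Lifecycle, shutdowner fx.Shutdowner) {
			lc.Append(fx.Hook{
				OnStart: func(ctx context.Context) error {
					go func() {
						const maxRetries = 3
						for retries := 1; ; retries++ {
							if err := fakeConnect(); err == nil {
								return
							}
							if retries >= maxRetries {
								fmt.Println("max retries reached, requesting app shutdown")
								// Graceful, app-wide stop instead of panicking
								// from a background goroutine.
								_ = shutdowner.Shutdown()
								return
							}
						}
					}()
					return nil
				},
			})
		}),
	)
	app.Run() // returns once Shutdown() is called; OnStop hooks still run
}
```

The difference from `panic`: all registered `OnStop` hooks run and the process exits through the normal lifecycle, rather than crashing mid-flight.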
```diff
+
+						// Calculate exponential backoff delay
+						delay := time.Duration(float64(baseDelay) * float64(retries) * backoffFactor)
+						if delay > maxDelay {
+							delay = maxDelay
+						}
+
+						logger.Infof("Retrying connection in %s (attempt %d/%d)", delay, retries, maxRetries)
+						time.Sleep(delay)
+						continue
```
**Contributor** commented on lines +240 to +248:

🛠️ Refactor suggestion | 🟠 Major

**Use exponential backoff + jitter and make the wait cancelable**

The current delay is linear, and `time.Sleep` ignores context cancellation. Use the helpers added above. Apply this diff:

```diff
-						// Calculate exponential backoff delay
-						delay := time.Duration(float64(baseDelay) * float64(retries) * backoffFactor)
-						if delay > maxDelay {
-							delay = maxDelay
-						}
-
-						logger.Infof("Retrying connection in %s (attempt %d/%d)", delay, retries, maxRetries)
-						time.Sleep(delay)
+						delay := backoffDelay(retries)
+						logger.Infof("Retrying connection in %s (attempt %d/%d)", delay, retries, maxRetries)
+						if !sleepWithContext(clientCtx, delay) {
+							return
+						}
```

```diff
-						// Calculate exponential backoff delay
-						delay := time.Duration(float64(baseDelay) * float64(retries) * backoffFactor)
-						if delay > maxDelay {
-							delay = maxDelay
-						}
-
-						logger.Infof("Reconnecting in %s (attempt %d/%d)", delay, retries, maxRetries)
-						time.Sleep(delay)
+						delay := backoffDelay(retries)
+						logger.Infof("Reconnecting in %s (attempt %d/%d)", delay, retries, maxRetries)
+						if !sleepWithContext(clientCtx, delay) {
+							return
+						}
```

Also applies to: 276-284
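To illustrate why the cancellable wait matters: with `time.Sleep`, a shutdown that arrives during a two-minute backoff blocks until the timer fires. A standalone sketch of the suggested helper in isolation:

```go
package main

import (
	"context"
	"fmt"
	"time"
)

// sleepWithContext waits for d unless ctx is cancelled first; it reports
// whether the full delay elapsed. Same shape as the helper suggested above.
func sleepWithContext(ctx context.Context, d time.Duration) bool {
	t := time.NewTimer(d)
	defer t.Stop()
	select {
	case <-ctx.Done():
		return false
	case <-t.C:
		return true
	}
}

func main() {
	ctx, cancel := context.WithCancel(context.Background())

	// Cancel shortly into a long backoff wait, as a shutdown would.
	go func() {
		time.Sleep(100 * time.Millisecond)
		cancel()
	}()

	if !sleepWithContext(ctx, 2*time.Minute) {
		fmt.Println("wait aborted by cancellation; exiting retry loop")
		return
	}
	fmt.Println("full delay elapsed; retrying")
}
```

This returns in ~100ms instead of two minutes, letting the retry loop observe shutdown promptly.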
```diff
+					}
+
+					// Connection successful, reset retry counter
+					if retries > 0 {
+						logger.Infof("Successfully reconnected after %d attempts", retries)
+						retries = 0
+					}
+
+					clientWithTrace := grpcclient.NewConnectionWithTrace(client, debug)
+
+					if err := membershipClient.Start(clientCtx, clientWithTrace); err != nil {
+						logger.Errorf("Membership client stopped with error: %s", err)
+
+						// Check if it's a clean shutdown
+						if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+							logger.Info("Membership client shutdown requested")
+							return
+						}
+
+						// Connection lost, attempt to reconnect
+						logger.Info("Connection lost, attempting to reconnect...")
+						retries++
+						if retries >= maxRetries {
+							logger.Errorf("Max retries (%d) reached after connection loss, giving up", maxRetries)
+							panic(errors.New("failed to reconnect to membership server after max retries"))
+						}
+
+						// Calculate exponential backoff delay
+						delay := time.Duration(float64(baseDelay) * float64(retries) * backoffFactor)
+						if delay > maxDelay {
+							delay = maxDelay
+						}
+
+						logger.Infof("Reconnecting in %s (attempt %d/%d)", delay, retries, maxRetries)
+						time.Sleep(delay)
+						continue
+					}
+
+					// Clean exit
+					return
+				}
+			}()
+			return nil
```
**Contributor** commented:

Hmm, just ignoring the error is not a good test...
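The test in question isn't shown in this diff, but the general fix for the pattern being criticized is to assert on the error instead of discarding it. A hedged sketch, assuming `testify` is available; the package, function, and test names below are hypothetical:

```go
package agent_test

import (
	"testing"

	"github.com/stretchr/testify/require"
)

// connectForTest is a hypothetical stand-in for the call whose error
// the test under review discards.
func connectForTest() (string, error) { return "client", nil }

func TestConnectReturnsNoError(t *testing.T) {
	client, err := connectForTest()
	require.NoError(t, err) // fail fast instead of silently ignoring err
	require.NotEmpty(t, client)
}
```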