Don't require minimal for failpoint injection period #17825
First changed file:
@@ -29,10 +29,7 @@ import (
 )

 const (
-	triggerTimeout               = time.Minute
-	waitBetweenFailpointTriggers = time.Second
-	failpointInjectionsCount     = 1
-	failpointInjectionsRetries   = 3
+	triggerTimeout = time.Minute
 )

 var (
@@ -78,45 +75,37 @@ func Validate(clus *e2e.EtcdProcessCluster, failpoint Failpoint) error {
 	return nil
 }

-func Inject(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster, failpoint Failpoint) error {
+func Inject(ctx context.Context, t *testing.T, lg *zap.Logger, clus *e2e.EtcdProcessCluster, failpoint Failpoint, baseTime time.Time) (*InjectionReport, error) {
 	ctx, cancel := context.WithTimeout(ctx, triggerTimeout)
 	defer cancel()
 	var err error
-	successes := 0
-	failures := 0
-	for successes < failpointInjectionsCount && failures < failpointInjectionsRetries {
-		time.Sleep(waitBetweenFailpointTriggers)
-
-		lg.Info("Verifying cluster health before failpoint", zap.String("failpoint", failpoint.Name()))
-		if err = verifyClusterHealth(ctx, t, clus); err != nil {
-			return fmt.Errorf("failed to verify cluster health before failpoint injection, err: %v", err)
-		}
-
-		lg.Info("Triggering failpoint", zap.String("failpoint", failpoint.Name()))
-		err = failpoint.Inject(ctx, t, lg, clus)
-		if err != nil {
-			select {
-			case <-ctx.Done():
-				return fmt.Errorf("Triggering failpoints timed out, err: %v", ctx.Err())
-			default:
-			}
-			lg.Info("Failed to trigger failpoint", zap.String("failpoint", failpoint.Name()), zap.Error(err))
-			failures++
-			continue
-		}
-
-		lg.Info("Verifying cluster health after failpoint", zap.String("failpoint", failpoint.Name()))
-		if err = verifyClusterHealth(ctx, t, clus); err != nil {
-			return fmt.Errorf("failed to verify cluster health after failpoint injection, err: %v", err)
-		}
-
-		successes++
-	}
-	if successes < failpointInjectionsCount || failures >= failpointInjectionsRetries {
-		t.Errorf("failed to trigger failpoints enough times, err: %v", err)
-	}
-
-	return nil
+	if err = verifyClusterHealth(ctx, t, clus); err != nil {
+		return nil, fmt.Errorf("failed to verify cluster health before failpoint injection, err: %v", err)
+	}
+	lg.Info("Triggering failpoint", zap.String("failpoint", failpoint.Name()))
+	start := time.Since(baseTime)
+	err = failpoint.Inject(ctx, t, lg, clus)
Inline review thread on the `failpoint.Inject` call:

- "I think we should keep the original retry here, just in case the failpoint HTTP call times out."
- "Prefer to retry HTTP calls and not whole failpoints. The reason is that a failpoint looks simple and retryable because it's just a single Inject function, but underneath it's a multi-stage, stateful process, with some stages not retryable, or at least not from the beginning. For example, a simple KILL failpoint first needs to kill the process, wait for it to exit, and start it back up again. If we fail on the wait, can we really retry and kill it again? What about a failure on start? Failpoint injection has more nuances that make it hard to retry blindly, and trying to make failpoints externally retryable makes the internal code unnecessarily complicated. I would prefer that failpoints be able to decide how to handle their internal failures and whether they can retry on their own."
- "Thanks for the comment. It sounds good to me."
+	if err != nil {
+		lg.Error("Failed to trigger failpoint", zap.String("failpoint", failpoint.Name()), zap.Error(err))
+		return nil, fmt.Errorf("failed triggering failpoint, err: %v", err)
+	}
+	if err = verifyClusterHealth(ctx, t, clus); err != nil {
+		return nil, fmt.Errorf("failed to verify cluster health after failpoint injection, err: %v", err)
+	}
+	lg.Info("Finished triggering failpoint", zap.String("failpoint", failpoint.Name()))
+	end := time.Since(baseTime)
+
+	return &InjectionReport{
+		Start: start,
+		End:   end,
+		Name:  failpoint.Name(),
+	}, nil
 }

+type InjectionReport struct {
+	Start, End time.Duration
+	Name       string
+}
+
 func verifyClusterHealth(ctx context.Context, _ *testing.T, clus *e2e.EtcdProcessCluster) error {
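The review discussion above argues for pushing retries into the failpoint itself, limited to the steps that are actually idempotent (such as the HTTP call that arms a gofail failpoint), rather than retrying the whole Inject sequence. A minimal, standalone sketch of that idea follows; the injectWithHTTPRetry helper, the armOverHTTP function type, and the retry constants are hypothetical illustrations, not code from this PR or from the etcd test framework.

```go
package failpoint

import (
	"context"
	"fmt"
	"time"
)

// armOverHTTP stands in for whatever HTTP request a concrete failpoint uses
// to arm a gofail failpoint on a member; the call is idempotent, so it is
// safe to repeat.
type armOverHTTP func(ctx context.Context, name string) error

// injectWithHTTPRetry retries only the idempotent HTTP step. The stateful
// steps of a failpoint (kill the process, wait for exit, restart it) would
// stay outside this loop and run exactly once, which is the reviewer's
// argument for keeping retry decisions inside each failpoint.
func injectWithHTTPRetry(ctx context.Context, name string, arm armOverHTTP) error {
	const (
		httpRetries      = 3
		waitBetweenRetry = time.Second
	)
	var lastErr error
	for attempt := 0; attempt < httpRetries; attempt++ {
		if lastErr = arm(ctx, name); lastErr == nil {
			return nil // armed; the caller now performs its one-shot stateful steps
		}
		select {
		case <-ctx.Done():
			return fmt.Errorf("arming failpoint %q timed out: %w", name, ctx.Err())
		case <-time.After(waitBetweenRetry):
		}
	}
	return fmt.Errorf("arming failpoint %q failed after %d attempts: %w", name, httpRetries, lastErr)
}
```

With that split, the outer Inject shown in the diff above can stay a single, non-retried call, which is exactly what this change makes it.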
Second changed file:
@@ -110,27 +110,32 @@ func (s testScenario) run(ctx context.Context, t *testing.T, lg *zap.Logger, clu
 	defer cancel()
 	g := errgroup.Group{}
 	var operationReport, watchReport []report.ClientReport
-	finishTraffic := make(chan struct{})
+	failpointInjected := make(chan failpoint.InjectionReport, 1)

 	// using baseTime time-measuring operation to get monotonic clock reading
 	// see https://github.com/golang/go/blob/master/src/time/time.go#L17
 	baseTime := time.Now()
 	ids := identity.NewIDProvider()
 	g.Go(func() error {
Inline review thread on the errgroup usage:

- "Also, not for this PR, but we should probably get rid of the errgroup.Group; it's probably a good idea to just use a wait group and goroutines."
- "Why? This code was changed to use errgroup to make error handling cleaner."
- "Right, but if I understand correctly, errgroups are only useful if you return an error, and we always return nil as far as I can tell."
- "Ok, makes sense."
-		defer close(finishTraffic)
-		err := failpoint.Inject(ctx, t, lg, clus, s.failpoint)
+		defer close(failpointInjected)
+		// Give some time for traffic to reach qps target before injecting failpoint.
+		time.Sleep(time.Second)
+		fr, err := failpoint.Inject(ctx, t, lg, clus, s.failpoint, baseTime)
 		if err != nil {
 			t.Error(err)
 			cancel()
 		}
+		// Give some time for traffic to reach qps target after injecting failpoint.
+		time.Sleep(time.Second)
 		lg.Info("Finished injecting failures")
+		if fr != nil {
+			failpointInjected <- *fr
+		}
 		return nil
 	})
 	maxRevisionChan := make(chan int64, 1)
 	g.Go(func() error {
 		defer close(maxRevisionChan)
-		operationReport = traffic.SimulateTraffic(ctx, t, lg, clus, s.profile, s.traffic, finishTraffic, baseTime, ids)
+		operationReport = traffic.SimulateTraffic(ctx, t, lg, clus, s.profile, s.traffic, failpointInjected, baseTime, ids)
 		maxRevision := operationsMaxRevision(operationReport)
 		maxRevisionChan <- maxRevision
 		lg.Info("Finished simulating traffic", zap.Int64("max-revision", maxRevision))
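On the errgroup question raised in the thread above: errgroup.Group mainly pays off when the goroutines return real errors, since Wait then reports the first failure; when every goroutine returns nil and failures are reported through t.Error, a plain sync.WaitGroup does the same job. A small self-contained comparison, independent of the robustness-test code itself:

```go
package main

import (
	"errors"
	"fmt"
	"sync"

	"golang.org/x/sync/errgroup"
)

func main() {
	// errgroup: Wait returns the first non-nil error from any goroutine.
	var g errgroup.Group
	g.Go(func() error { return errors.New("failpoint injection failed") })
	g.Go(func() error { return nil })
	fmt.Println("errgroup result:", g.Wait())

	// sync.WaitGroup: enough when goroutines have nothing to return and
	// report problems out of band (e.g. via t.Error in a test).
	var wg sync.WaitGroup
	wg.Add(1)
	go func() {
		defer wg.Done()
		// work that reports failures elsewhere
	}()
	wg.Wait()
	fmt.Println("waitgroup done")
}
```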
General review thread on removing the retries:

- "Why were the retries removed? Were they ever useful?"
- "They were introduced as a band-aid solution to flakes. They reduced the problem a little, but there were still some issues with failpoints that retries didn't help with. Over time we improved, especially after we discovered the issue with process execution. Hard to say if removing them will expose some issues; we should see them in CI."
- "Checked logs from one of the robustness test runs, https://github.com/etcd-io/etcd/actions/runs/8781749612. Didn't find any \"Failed to trigger failpoint\" logs."
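Finally, on the shape of the new handoff: the buffered failpointInjected channel both replaces the old finishTraffic signal and carries the InjectionReport with its baseTime-relative Start and End offsets. The diff only shows the producer side; the sketch below is a rough, hypothetical illustration of how a traffic loop could consume such a channel. It is not the actual traffic.SimulateTraffic implementation, and runTraffic and its timings are made up for the example.

```go
package main

import (
	"fmt"
	"time"
)

// InjectionReport mirrors the struct added in the PR: Start and End are
// offsets from a shared baseTime, so traffic and failpoint events can be
// placed on one monotonic timeline.
type InjectionReport struct {
	Start, End time.Duration
	Name       string
}

// runTraffic keeps generating load until the injection report arrives (or
// the channel is closed after an injection error), then continues for a
// short tail so load stays near target around the injection window.
func runTraffic(failpointInjected <-chan InjectionReport) {
	ticker := time.NewTicker(10 * time.Millisecond)
	defer ticker.Stop()
	for {
		select {
		case report, ok := <-failpointInjected:
			if ok {
				fmt.Printf("failpoint %s injected between %v and %v\n", report.Name, report.Start, report.End)
			}
			time.Sleep(500 * time.Millisecond) // tail traffic after injection
			return
		case <-ticker.C:
			// issue one request against the cluster here
		}
	}
}

func main() {
	baseTime := time.Now()
	failpointInjected := make(chan InjectionReport, 1)
	go func() {
		defer close(failpointInjected)
		start := time.Since(baseTime)
		time.Sleep(100 * time.Millisecond) // stands in for the actual injection
		failpointInjected <- InjectionReport{Start: start, End: time.Since(baseTime), Name: "example"}
	}()
	runTraffic(failpointInjected)
}
```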