(feat) internal/civisibility: add Known Tests feature and refactor EFD logic V2 (#3140)
DataDog · Feb 3, 2025 · c9f90c7 · c9f90c7
1 parent 1e52457
commit c9f90c7
Show file tree

Hide file tree

Showing 14 changed files with 408 additions and 280 deletions.
diff --git a/internal/civisibility/constants/test_tags.go b/internal/civisibility/constants/test_tags.go
@@ -78,6 +78,9 @@ const (
 	// This constant is used to tag test events that are part of a retry execution
 	TestIsRetry = "test.is_retry"
 
+	// TestRetryReason indicates the reason for retrying the test
+	TestRetryReason = "test.retry_reason"
+
 	// TestEarlyFlakeDetectionRetryAborted indicates a retry abort reason by the early flake detection feature
 	TestEarlyFlakeDetectionRetryAborted = "test.early_flake.abort_reason"
 

diff --git a/internal/civisibility/integrations/civisibility_features.go b/internal/civisibility/integrations/civisibility_features.go
@@ -51,8 +51,8 @@ var (
 	// ciVisibilitySettings contains the CI Visibility settings for this session
 	ciVisibilitySettings net.SettingsResponseData
 
-	// ciVisibilityEarlyFlakyDetectionSettings contains the CI Visibility Early Flake Detection data for this session
-	ciVisibilityEarlyFlakyDetectionSettings net.EfdResponseData
+	// ciVisibilityKnownTests contains the CI Visibility Known Tests data for this session
+	ciVisibilityKnownTests net.KnownTestsResponseData
 
 	// ciVisibilityFlakyRetriesSettings contains the CI Visibility Flaky Retries settings for this session
 	ciVisibilityFlakyRetriesSettings FlakyRetriesSetting
@@ -121,15 +121,20 @@ func ensureAdditionalFeaturesInitialization(serviceName string) {
 			return
 		}
 
-		// if early flake detection is enabled then we run the early flake detection request
-		if ciVisibilitySettings.EarlyFlakeDetection.Enabled {
-			ciEfdData, err := ciVisibilityClient.GetEarlyFlakeDetectionData()
+		// if the known tests feature is enabled then we run the known tests request
+		if ciVisibilitySettings.KnownTestsEnabled {
+			ciEfdData, err := ciVisibilityClient.GetKnownTests()
 			if err != nil {
-				log.Error("civisibility: error getting CI visibility early flake detection data: %v", err)
+				log.Error("civisibility: error getting CI visibility known tests data: %v", err)
 			} else if ciEfdData != nil {
-				ciVisibilityEarlyFlakyDetectionSettings = *ciEfdData
-				log.Debug("civisibility: early flake detection data loaded.")
+				ciVisibilityKnownTests = *ciEfdData
+				log.Debug("civisibility: known tests data loaded.")
 			}
+		} else {
+			// The "known_tests_enabled" setting works as a kill-switch for EFD: if "known_tests_enabled" is false,
+			// EFD is disabled even if "early_flake_detection.enabled" is set to true (which should not happen normally,
+			// as the backend should disable both of them in that case)
+			ciVisibilitySettings.EarlyFlakeDetection.Enabled = false
 		}
 
 		// if flaky test retries is enabled then let's load the flaky retries settings
@@ -172,11 +177,11 @@ func GetSettings() *net.SettingsResponseData {
 	return &ciVisibilitySettings
 }
 
-// GetEarlyFlakeDetectionSettings gets the early flake detection known tests data
-func GetEarlyFlakeDetectionSettings() *net.EfdResponseData {
+// GetKnownTests gets the known tests data
+func GetKnownTests() *net.KnownTestsResponseData {
 	// call to ensure the additional features initialization is completed (service name can be empty here)
 	ensureAdditionalFeaturesInitialization("")
-	return &ciVisibilityEarlyFlakyDetectionSettings
+	return &ciVisibilityKnownTests
 }
 
 // GetFlakyRetriesSettings gets the flaky retries settings

diff --git a/internal/civisibility/integrations/gotesting/coverage/coverage_writer_test.go b/internal/civisibility/integrations/gotesting/coverage/coverage_writer_test.go
@@ -73,7 +73,7 @@ type MockClient struct {
 	SendCoveragePayloadFunc           func(ciTestCovPayload io.Reader) error
 	SendCoveragePayloadWithFormatFunc func(ciTestCovPayload io.Reader, format string) error
 	GetSettingsFunc                   func() (*net.SettingsResponseData, error)
-	GetEarlyFlakeDetectionDataFunc    func() (*net.EfdResponseData, error)
+	GetKnownTestsFunc                 func() (*net.KnownTestsResponseData, error)
 	GetCommitsFunc                    func(localCommits []string) ([]string, error)
 	SendPackFilesFunc                 func(commitSha string, packFiles []string) (bytes int64, err error)
 	GetSkippableTestsFunc             func() (correlationId string, skippables map[string]map[string][]net.SkippableResponseDataAttributes, err error)
@@ -91,8 +91,8 @@ func (m *MockClient) GetSettings() (*net.SettingsResponseData, error) {
 	return m.GetSettingsFunc()
 }
 
-func (m *MockClient) GetEarlyFlakeDetectionData() (*net.EfdResponseData, error) {
-	return m.GetEarlyFlakeDetectionDataFunc()
+func (m *MockClient) GetKnownTests() (*net.KnownTestsResponseData, error) {
+	return m.GetKnownTestsFunc()
 }
 
 func (m *MockClient) GetCommits(localCommits []string) ([]string, error) {

diff --git a/internal/civisibility/integrations/gotesting/instrumentation.go b/internal/civisibility/integrations/gotesting/instrumentation.go
@@ -9,7 +9,6 @@ import (
 	"fmt"
 	"reflect"
 	"runtime"
-	"slices"
 	"sync"
 	"sync/atomic"
 	"testing"
@@ -36,7 +35,9 @@ type (
 		panicData                   any               // panic data recovered from an internal test execution when using an additional feature wrapper
 		panicStacktrace             string            // stacktrace from the panic recovered from an internal test
 		isARetry                    bool              // flag to tag if a current test execution is a retry
-		isANewTest                  bool              // flag to tag if a current test execution is part of a new test (EFD not known test)
+		isANewTest                  bool              // flag to tag if a current test execution is part of a new test
+		isEFDExecution              bool              // flag to tag if a current test execution is part of an EFD execution
+		isATRExecution              bool              // flag to tag if a current test execution is part of an ATR execution
 		hasAdditionalFeatureWrapper bool              // flag to check if the current execution is part of an additional feature wrapper
 	}
 
@@ -191,20 +192,29 @@ func applyFlakyTestRetriesAdditionalFeature(targetFunc func(*testing.T)) (func(*
 				initialRetryCount: flakyRetrySettings.RetryCount,
 				adjustRetryCount:  nil, // No adjustRetryCount
 				shouldRetry: func(ptrToLocalT *testing.T, executionIndex int, remainingRetries int64) bool {
-					remainingTotalRetries := atomic.AddInt64(&flakyRetrySettings.RemainingTotalRetryCount, -1)
 					// Decide whether to retry
-					return ptrToLocalT.Failed() && remainingRetries >= 0 && remainingTotalRetries >= 0
+					return ptrToLocalT.Failed() && remainingRetries >= 0 && atomic.LoadInt64(&flakyRetrySettings.RemainingTotalRetryCount) >= 0
+				},
+				perExecution: func(ptrToLocalT *testing.T, executionIndex int, duration time.Duration) {
+					if executionIndex > 0 {
+						atomic.AddInt64(&flakyRetrySettings.RemainingTotalRetryCount, -1)
+					}
 				},
-				perExecution: nil, // No perExecution needed
 				onRetryEnd: func(t *testing.T, executionIndex int, lastPtrToLocalT *testing.T) {
 					// Update original `t` with results from last execution
 					tCommonPrivates := getTestPrivateFields(t)
+					if tCommonPrivates == nil {
+						panic("getting test private fields failed")
+					}
 					tCommonPrivates.SetFailed(lastPtrToLocalT.Failed())
 					tCommonPrivates.SetSkipped(lastPtrToLocalT.Skipped())
 
 					// Update parent status if failed
 					if lastPtrToLocalT.Failed() {
 						tParentCommonPrivates := getTestParentPrivateFields(t)
+						if tParentCommonPrivates == nil {
+							panic("getting test parent private fields failed")
+						}
 						tParentCommonPrivates.SetFailed(true)
 					}
 
@@ -218,14 +228,17 @@ func applyFlakyTestRetriesAdditionalFeature(targetFunc func(*testing.T)) (func(*
 						}
 
 						fmt.Printf("    [ %v after %v retries by Datadog's auto test retries ]\n", status, executionIndex)
-					}
 
-					// Check if total retry count was exceeded
-					if flakyRetrySettings.RemainingTotalRetryCount < 1 {
-						fmt.Println("    the maximum number of total retries was exceeded.")
+						// Check if total retry count was exceeded
+						if atomic.LoadInt64(&flakyRetrySettings.RemainingTotalRetryCount) < 1 {
+							fmt.Println("    the maximum number of total retries was exceeded.")
+						}
 					}
 				},
-				execMetaAdjust: nil, // No execMetaAdjust needed
+				execMetaAdjust: func(execMeta *testExecutionMetadata, executionIndex int) {
+					// Set the flag ATR execution to true
+					execMeta.isATRExecution = true
+				},
 			})
 		}, true
 	}
@@ -234,89 +247,82 @@ func applyFlakyTestRetriesAdditionalFeature(targetFunc func(*testing.T)) (func(*
 
 // applyEarlyFlakeDetectionAdditionalFeature applies the early flake detection feature as a wrapper of a func(*testing.T)
 func applyEarlyFlakeDetectionAdditionalFeature(testInfo *commonInfo, targetFunc func(*testing.T), settings *net.SettingsResponseData) (func(*testing.T), bool) {
-	earlyFlakeDetectionData := integrations.GetEarlyFlakeDetectionSettings()
-	if earlyFlakeDetectionData != nil &&
-		len(earlyFlakeDetectionData.Tests) > 0 {
-
-		// Define is a known test flag
-		isAKnownTest := false
-
-		// Check if the test is a known test or a new one
-		if knownSuites, ok := earlyFlakeDetectionData.Tests[testInfo.moduleName]; ok {
-			if knownTests, ok := knownSuites[testInfo.suiteName]; ok {
-				if slices.Contains(knownTests, testInfo.testName) {
-					isAKnownTest = true
-				}
-			}
-		}
+	isKnown, hasKnownData := isKnownTest(testInfo)
+	if !hasKnownData || isKnown {
+		return targetFunc, false
+	}
 
-		// If it's a new test, then we apply the EFD wrapper
-		if !isAKnownTest {
-			return func(t *testing.T) {
-				var testPassCount, testSkipCount, testFailCount int
-
-				runTestWithRetry(&runTestWithRetryOptions{
-					targetFunc:        targetFunc,
-					t:                 t,
-					initialRetryCount: 0,
-					adjustRetryCount: func(duration time.Duration) int64 {
-						slowTestRetriesSettings := settings.EarlyFlakeDetection.SlowTestRetries
-						durationSecs := duration.Seconds()
-						if durationSecs < 5 {
-							return int64(slowTestRetriesSettings.FiveS)
-						} else if durationSecs < 10 {
-							return int64(slowTestRetriesSettings.TenS)
-						} else if durationSecs < 30 {
-							return int64(slowTestRetriesSettings.ThirtyS)
-						} else if duration.Minutes() < 5 {
-							return int64(slowTestRetriesSettings.FiveM)
-						}
-						return 0
-					},
-					shouldRetry: func(ptrToLocalT *testing.T, executionIndex int, remainingRetries int64) bool {
-						return remainingRetries >= 0
-					},
-					perExecution: func(ptrToLocalT *testing.T, executionIndex int, duration time.Duration) {
-						// Collect test results
-						if ptrToLocalT.Failed() {
-							testFailCount++
-						} else if ptrToLocalT.Skipped() {
-							testSkipCount++
-						} else {
-							testPassCount++
-						}
-					},
-					onRetryEnd: func(t *testing.T, executionIndex int, lastPtrToLocalT *testing.T) {
-						// Update test status based on collected counts
-						tCommonPrivates := getTestPrivateFields(t)
+	// If it's a new test, then we apply the EFD wrapper
+	return func(t *testing.T) {
+		var testPassCount, testSkipCount, testFailCount int
+
+		runTestWithRetry(&runTestWithRetryOptions{
+			targetFunc:        targetFunc,
+			t:                 t,
+			initialRetryCount: 0,
+			adjustRetryCount: func(duration time.Duration) int64 {
+				slowTestRetriesSettings := settings.EarlyFlakeDetection.SlowTestRetries
+				durationSecs := duration.Seconds()
+				if durationSecs < 5 {
+					return int64(slowTestRetriesSettings.FiveS)
+				} else if durationSecs < 10 {
+					return int64(slowTestRetriesSettings.TenS)
+				} else if durationSecs < 30 {
+					return int64(slowTestRetriesSettings.ThirtyS)
+				} else if duration.Minutes() < 5 {
+					return int64(slowTestRetriesSettings.FiveM)
+				}
+				return 0
+			},
+			shouldRetry: func(ptrToLocalT *testing.T, executionIndex int, remainingRetries int64) bool {
+				return remainingRetries >= 0
+			},
+			perExecution: func(ptrToLocalT *testing.T, executionIndex int, duration time.Duration) {
+				// Collect test results
+				if ptrToLocalT.Failed() {
+					testFailCount++
+				} else if ptrToLocalT.Skipped() {
+					testSkipCount++
+				} else {
+					testPassCount++
+				}
+			},
+			onRetryEnd: func(t *testing.T, executionIndex int, lastPtrToLocalT *testing.T) {
+				// Update test status based on collected counts
+				tCommonPrivates := getTestPrivateFields(t)
+				if tCommonPrivates == nil {
+					panic("getting test private fields failed")
+				}
+				status := "passed"
+				if testPassCount == 0 {
+					if testSkipCount > 0 {
+						status = "skipped"
+						tCommonPrivates.SetSkipped(true)
+					}
+					if testFailCount > 0 {
+						status = "failed"
+						tCommonPrivates.SetFailed(true)
 						tParentCommonPrivates := getTestParentPrivateFields(t)
-						status := "passed"
-						if testPassCount == 0 {
-							if testSkipCount > 0 {
-								status = "skipped"
-								tCommonPrivates.SetSkipped(true)
-							}
-							if testFailCount > 0 {
-								status = "failed"
-								tCommonPrivates.SetFailed(true)
-								tParentCommonPrivates.SetFailed(true)
-							}
+						if tParentCommonPrivates == nil {
+							panic("getting test parent private fields failed")
 						}
+						tParentCommonPrivates.SetFailed(true)
+					}
+				}
 
-						// Print summary after retries
-						if executionIndex > 0 {
-							fmt.Printf("  [ %v after %v retries by Datadog's early flake detection ]\n", status, executionIndex)
-						}
-					},
-					execMetaAdjust: func(execMeta *testExecutionMetadata, executionIndex int) {
-						// Set the flag new test to true
-						execMeta.isANewTest = true
-					},
-				})
-			}, true
-		}
-	}
-	return targetFunc, false
+				// Print summary after retries
+				if executionIndex > 0 {
+					fmt.Printf("  [ %v after %v retries by Datadog's early flake detection ]\n", status, executionIndex)
+				}
+			},
+			execMetaAdjust: func(execMeta *testExecutionMetadata, executionIndex int) {
+				// Set the flag new test to true
+				execMeta.isANewTest = true
+				// Set the flag EFD execution to true
+				execMeta.isEFDExecution = true
+			},
+		})
+	}, true
 }
 
 // runTestWithRetry encapsulates the common retry logic for test functions.
@@ -336,7 +342,10 @@ func runTestWithRetry(options *runTestWithRetryOptions) {
 
 	for {
 		// Clear the matcher subnames map before each execution to avoid subname tests being called "parent/subname#NN" due to retries
-		getTestContextMatcherPrivateFields(options.t).ClearSubNames()
+		matcher := getTestContextMatcherPrivateFields(options.t)
+		if matcher != nil {
+			matcher.ClearSubNames()
+		}
 
 		// Increment execution index
 		executionIndex++
@@ -348,6 +357,12 @@ func runTestWithRetry(options *runTestWithRetryOptions) {
 		// Create a dummy parent so we can run the test using this local copy
 		// without affecting the test parent
 		localTPrivateFields := getTestPrivateFields(ptrToLocalT)
+		if localTPrivateFields == nil {
+			panic("getting test private fields failed")
+		}
+		if localTPrivateFields.parent == nil {
+			panic("parent of the test is nil")
+		}
 		*localTPrivateFields.parent = unsafe.Pointer(&testing.T{})
 
 		// Create an execution metadata instance
@@ -362,6 +377,12 @@ func runTestWithRetry(options *runTestWithRetryOptions) {
 			if originalExecMeta.isARetry {
 				execMeta.isARetry = true
 			}
+			if originalExecMeta.isEFDExecution {
+				execMeta.isEFDExecution = true
+			}
+			if originalExecMeta.isATRExecution {
+				execMeta.isATRExecution = true
+			}
 		}
 
 		// If we are in a retry execution, set the `isARetry` flag

diff --git a/internal/civisibility/integrations/gotesting/instrumentation_orchestrion.go b/internal/civisibility/integrations/gotesting/instrumentation_orchestrion.go
@@ -159,6 +159,12 @@ func instrumentTestingTFunc(f func(*testing.T)) func(*testing.T) {
 				if parentExecMeta.isARetry {
 					execMeta.isARetry = true
 				}
+				if parentExecMeta.isEFDExecution {
+					execMeta.isEFDExecution = true
+				}
+				if parentExecMeta.isATRExecution {
+					execMeta.isATRExecution = true
+				}
 			}
 		}
 
@@ -175,6 +181,15 @@ func instrumentTestingTFunc(f func(*testing.T)) func(*testing.T) {
 		if execMeta.isARetry {
 			// Set the retry tag
 			test.SetTag(constants.TestIsRetry, "true")
+
+			// If the execution is an EFD execution we tag the test event reason
+			if execMeta.isEFDExecution {
+				// Set the EFD as the retry reason
+				test.SetTag(constants.TestRetryReason, "efd")
+			} else if execMeta.isATRExecution {
+				// Set the ATR as the retry reason
+				test.SetTag(constants.TestRetryReason, "atr")
+			}
 		}
 
 		defer func() {