diff --git a/.github/actions/run-monitored-tmpnet-cmd/action.yml b/.github/actions/run-monitored-tmpnet-cmd/action.yml index 205e1d677116..eaede246ae00 100644 --- a/.github/actions/run-monitored-tmpnet-cmd/action.yml +++ b/.github/actions/run-monitored-tmpnet-cmd/action.yml @@ -80,4 +80,27 @@ runs: if: always() with: name: ${{ inputs.artifact_prefix }}-tmpnet-data - # TODO(marun) Check that collection is working by querying prometheus and loki with the GH_* labels above + - name: Check that logs were collected + shell: bash + run: go run ./tests/fixture/tmpnet/cmd check-logs + env: + LOKI_USERNAME: ${{ inputs.loki_username }} + LOKI_PASSWORD: ${{ inputs.loki_password }} + GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }} + GH_WORKFLOW: ${{ inputs.workflow }} + GH_RUN_ID: ${{ inputs.run_id }} + GH_RUN_NUMBER: ${{ inputs.run_number }} + GH_RUN_ATTEMPT: ${{ inputs.run_attempt }} + GH_JOB_ID: ${{ inputs.job }} + - name: Check that metrics were collected + shell: bash + run: go run ./tests/fixture/tmpnet/cmd check-metrics + env: + PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }} + PROMETHEUS_PASSWORD: ${{ inputs.prometheus_password }} + GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }} + GH_WORKFLOW: ${{ inputs.workflow }} + GH_RUN_ID: ${{ inputs.run_id }} + GH_RUN_NUMBER: ${{ inputs.run_number }} + GH_RUN_ATTEMPT: ${{ inputs.run_attempt }} + GH_JOB_ID: ${{ inputs.job }} diff --git a/go.sum b/go.sum index f9928533c5bb..7ab84f6833b4 100644 --- a/go.sum +++ b/go.sum @@ -388,6 +388,8 @@ github.com/jessevdk/go-flags v0.0.0-20141203071132-1679536dcc89/go.mod h1:4FA24M github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= +github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= +github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/jrick/logrotate v1.0.0/go.mod h1:LNinyqDIJnpAur+b8yyulnQw/wDuN1+BYKlTRt3OuAQ= github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU= github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4= @@ -478,6 +480,8 @@ github.com/mr-tron/base58 v1.2.0 h1:T/HDJBh4ZCPbU39/+c3rRvE0uKBQlU27+QI8LJ4t64o= github.com/mr-tron/base58 v1.2.0/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= +github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus= github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw= github.com/nats-io/jwt v0.3.0/go.mod h1:fRYCDE99xlTsqUzISS1Bi75UBJ6ljOJQOAAu5VglpSg= diff --git a/tests/fixture/tmpnet/README.md b/tests/fixture/tmpnet/README.md index 0ef0d8b63ea5..a2dc7c61ca50 100644 --- a/tests/fixture/tmpnet/README.md +++ b/tests/fixture/tmpnet/README.md @@ -26,6 +26,7 @@ the following non-test files: | Filename | Types | Purpose | |:----------------------------|:------------|:------------------------------------------------------------| +| check_collection.go | | Enables checking if logs and metrics were collected | | defaults.go | | Defines common default configuration | | detached_process_default.go | | Configures detached processes for darwin and linux | | detached_process_windows.go | | No-op detached process configuration for windows | diff --git a/tests/fixture/tmpnet/check_collection.go b/tests/fixture/tmpnet/check_collection.go new file mode 100644 index 000000000000..eb58c4d14fd1 --- /dev/null +++ b/tests/fixture/tmpnet/check_collection.go @@ -0,0 +1,256 @@ +// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved. +// See the file LICENSE for licensing terms. + +package tmpnet + +import ( + "context" + "encoding/base64" + "encoding/json" + "errors" + "fmt" + "io" + "net/http" + "net/url" + "strconv" + "strings" + "time" + + "github.com/prometheus/client_golang/api" + "github.com/prometheus/client_golang/api/prometheus/v1" + "github.com/prometheus/common/model" + "go.uber.org/zap" + + "github.com/ava-labs/avalanchego/utils/logging" +) + +// CheckLogsExist checks if logs exist for the given network. Github labels are also +// included if provided as env vars (GH_*). +func CheckLogsExist(ctx context.Context, log logging.Logger, networkUUID string) error { + username, password, err := getCollectorCredentials(promtailCmd) + if err != nil { + return fmt.Errorf("failed to get collector credentials: %w", err) + } + query, err := getCheckLogsQuery(networkUUID) + if err != nil { + return err + } + url := getLokiURL() + + log.Info("checking if logs exist", + zap.String("url", url), + zap.String("query", query), + ) + + logsCount, err := queryLoki(ctx, url, username, password, query) + if err != nil { + return err + } + + if logsCount > 0 { + log.Info("logs exist", + zap.Int("count", logsCount), + ) + return nil + } + + return errors.New("logs not found") +} + +// getCheckLogsQuery returns the query to check if logs exist. +func getCheckLogsQuery(networkUUID string) (string, error) { + selectors, err := getSelectors(networkUUID) + if err != nil { + return "", err + } + return fmt.Sprintf("sum(count_over_time({%s}[1h]))", selectors), nil +} + +func queryLoki( + ctx context.Context, + lokiURL string, + username string, + password string, + query string, +) (int, error) { + // Compose the URL + params := url.Values{} + params.Add("query", query) + reqURL := fmt.Sprintf("%s/loki/api/v1/query?%s", lokiURL, params.Encode()) + + // Create request + req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, nil) + if err != nil { + return 0, fmt.Errorf("failed to create request: %w", err) + } + + auth := base64.StdEncoding.EncodeToString([]byte(username + ":" + password)) + req.Header.Set("Authorization", "Basic "+auth) + + // Execute request + resp, err := http.DefaultClient.Do(req) + if err != nil { + return 0, fmt.Errorf("failed to execute request: %w", err) + } + defer resp.Body.Close() + + // Read and parse response + body, err := io.ReadAll(resp.Body) + if err != nil { + return 0, fmt.Errorf("failed to read response: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return 0, fmt.Errorf("unexpected status code %d: %s", resp.StatusCode, string(body)) + } + + // Parse JSON response + var result struct { + Status string `json:"status"` + Data struct { + Result []struct { + Value []interface{} `json:"value"` + } `json:"result"` + } `json:"data"` + } + + if err := json.Unmarshal(body, &result); err != nil { + return 0, fmt.Errorf("failed to parse response: %w", err) + } + + // Extract count value + if len(result.Data.Result) == 0 { + return 0, nil + } + if len(result.Data.Result[0].Value) != 2 { + return 0, errors.New("unexpected value format in response") + } + // Convert value to a string + valueStr, ok := result.Data.Result[0].Value[1].(string) + if !ok { + return 0, errors.New("value is not a string") + } + // Convert string to float64 first to handle scientific notation + floatVal, err := strconv.ParseFloat(valueStr, 64) + if err != nil { + return 0, fmt.Errorf("parsing count value: %w", err) + } + // Round to nearest integer + return int(floatVal + 0.5), nil +} + +// CheckMetricsExist checks if metrics exist for the given network. Github labels are also +// included if provided as env vars (GH_*). +func CheckMetricsExist(ctx context.Context, log logging.Logger, networkUUID string) error { + username, password, err := getCollectorCredentials(prometheusCmd) + if err != nil { + return fmt.Errorf("failed to get collector credentials: %w", err) + } + query, err := getCheckMetricsQuery(networkUUID) + if err != nil { + return err + } + url := getPrometheusURL() + + log.Info("checking if metrics exist", + zap.String("url", url), + zap.String("query", query), + ) + + count, err := queryPrometheus(ctx, log, url, username, password, query) + if err != nil { + return err + } + if count == 0 { + return errors.New("metrics not found") + } + + log.Info("metrics exist", + zap.Int("count", count), + ) + return nil +} + +// getCheckMetricsQuery returns the query to check if metrics exist. +func getCheckMetricsQuery(networkUUID string) (string, error) { + selectors, err := getSelectors(networkUUID) + if err != nil { + return "", err + } + return fmt.Sprintf("count({%s})", selectors), nil +} + +func queryPrometheus( + ctx context.Context, + log logging.Logger, + url string, + username string, + password string, + query string, +) (int, error) { + // Create client with basic auth + client, err := api.NewClient(api.Config{ + Address: url, + RoundTripper: &basicAuthRoundTripper{ + username: username, + password: password, + rt: api.DefaultRoundTripper, + }, + }) + if err != nil { + return 0, fmt.Errorf("failed to create client: %w", err) + } + + // Query Prometheus + result, warnings, err := v1.NewAPI(client).QueryRange(ctx, query, v1.Range{ + Start: time.Now().Add(-time.Hour), + End: time.Now(), + Step: time.Minute, + }) + if err != nil { + return 0, fmt.Errorf("query failed: %w", err) + } + if len(warnings) > 0 { + log.Warn("prometheus query warnings", + zap.Strings("warnings", warnings), + ) + } + + if matrix, ok := result.(model.Matrix); !ok { + return 0, fmt.Errorf("unexpected result type: %s", result.Type()) + } else if len(matrix) > 0 { + return int(matrix[0].Values[len(matrix[0].Values)-1].Value), nil + } + + return 0, nil +} + +type basicAuthRoundTripper struct { + username, password string + rt http.RoundTripper +} + +func (b *basicAuthRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) { + req.SetBasicAuth(b.username, b.password) + return b.rt.RoundTrip(req) +} + +// getSelectors returns the comma-separated list of selectors. +func getSelectors(networkUUID string) (string, error) { + selectors := []string{} + if len(networkUUID) > 0 { + selectors = append(selectors, fmt.Sprintf(`network_uuid="%s"`, networkUUID)) + } + githubLabels := githubLabelsFromEnv() + for label := range githubLabels { + value, err := githubLabels.GetStringVal(label) + if err != nil { + return "", err + } + if len(value) == 0 { + continue + } + selectors = append(selectors, fmt.Sprintf(`%s="%s"`, label, value)) + } + return strings.Join(selectors, ","), nil +} diff --git a/tests/fixture/tmpnet/cmd/main.go b/tests/fixture/tmpnet/cmd/main.go index bdd00c397cc0..5c1e2959cf1b 100644 --- a/tests/fixture/tmpnet/cmd/main.go +++ b/tests/fixture/tmpnet/cmd/main.go @@ -10,7 +10,6 @@ import ( "io/fs" "os" "path/filepath" - "time" "github.com/spf13/cobra" "go.uber.org/zap" @@ -81,10 +80,7 @@ func main() { Nodes: tmpnet.NewNodesOrPanic(int(nodeCount)), } - // Extreme upper bound, should never take this long - networkStartTimeout := 2 * time.Minute - - ctx, cancel := context.WithTimeout(context.Background(), networkStartTimeout) + ctx, cancel := context.WithTimeout(context.Background(), tmpnet.DefaultNetworkTimeout) defer cancel() if err := tmpnet.BootstrapNewNetwork( ctx, @@ -196,6 +192,50 @@ func main() { } rootCmd.AddCommand(stopCollectorsCmd) + var networkUUID string + + checkMetricsCmd := &cobra.Command{ + Use: "check-metrics", + Short: "Checks whether the default prometheus server has the expected metrics", + RunE: func(*cobra.Command, []string) error { + ctx, cancel := context.WithTimeout(context.Background(), tmpnet.DefaultNetworkTimeout) + defer cancel() + log, err := tests.LoggerForFormat("", rawLogFormat) + if err != nil { + return err + } + return tmpnet.CheckMetricsExist(ctx, log, networkUUID) + }, + } + checkMetricsCmd.PersistentFlags().StringVar( + &networkUUID, + "network-uuid", + "", + "[optional] The network UUID to check metrics for. Labels read from GH_* env vars will always be used.", + ) + rootCmd.AddCommand(checkMetricsCmd) + + checkLogsCmd := &cobra.Command{ + Use: "check-logs", + Short: "Checks whether the default loki server has the expected logs", + RunE: func(*cobra.Command, []string) error { + ctx, cancel := context.WithTimeout(context.Background(), tmpnet.DefaultNetworkTimeout) + defer cancel() + log, err := tests.LoggerForFormat("", rawLogFormat) + if err != nil { + return err + } + return tmpnet.CheckLogsExist(ctx, log, networkUUID) + }, + } + checkLogsCmd.PersistentFlags().StringVar( + &networkUUID, + "network-uuid", + "", + "[optional] The network UUID to check logs for. Labels read from GH_* env vars will always be used.", + ) + rootCmd.AddCommand(checkLogsCmd) + if err := rootCmd.Execute(); err != nil { fmt.Fprintf(os.Stderr, "tmpnetctl failed: %v\n", err) os.Exit(1) diff --git a/tests/fixture/tmpnet/node_process.go b/tests/fixture/tmpnet/node_process.go index 7013de4561f3..adf8da8214cb 100644 --- a/tests/fixture/tmpnet/node_process.go +++ b/tests/fixture/tmpnet/node_process.go @@ -262,16 +262,8 @@ func (p *NodeProcess) writeMonitoringConfig() error { "node_id": p.node.NodeID, "is_ephemeral_node": strconv.FormatBool(p.node.IsEphemeral), "network_owner": p.node.NetworkOwner, - // prometheus/promtail ignore empty values so including these - // labels with empty values outside of a github worker (where - // the env vars will not be set) should not be a problem. - "gh_repo": os.Getenv("GH_REPO"), - "gh_workflow": os.Getenv("GH_WORKFLOW"), - "gh_run_id": os.Getenv("GH_RUN_ID"), - "gh_run_number": os.Getenv("GH_RUN_NUMBER"), - "gh_run_attempt": os.Getenv("GH_RUN_ATTEMPT"), - "gh_job_id": os.Getenv("GH_JOB_ID"), } + commonLabels.SetDefaults(githubLabelsFromEnv()) prometheusConfig := []FlagsMap{ { @@ -419,3 +411,17 @@ func watchLogFileForFatal(ctx context.Context, cancelWithCause context.CancelCau } } } + +func githubLabelsFromEnv() FlagsMap { + return FlagsMap{ + // prometheus/promtail ignore empty values so including these + // labels with empty values outside of a github worker (where + // the env vars will not be set) should not be a problem. + "gh_repo": os.Getenv("GH_REPO"), + "gh_workflow": os.Getenv("GH_WORKFLOW"), + "gh_run_id": os.Getenv("GH_RUN_ID"), + "gh_run_number": os.Getenv("GH_RUN_NUMBER"), + "gh_run_attempt": os.Getenv("GH_RUN_ATTEMPT"), + "gh_job_id": os.Getenv("GH_JOB_ID"), + } +} diff --git a/tests/upgrade/upgrade_test.go b/tests/upgrade/upgrade_test.go index 162c7887fa6c..8ceed179288b 100644 --- a/tests/upgrade/upgrade_test.go +++ b/tests/upgrade/upgrade_test.go @@ -7,6 +7,7 @@ import ( "flag" "fmt" "testing" + "time" "github.com/onsi/ginkgo/v2" "github.com/stretchr/testify/require" @@ -53,16 +54,18 @@ var _ = ginkgo.Describe("[Upgrade]", func() { require.NoError(err) network.Genesis = genesis + shutdownDelay := 0 * time.Second if startCollectors { require.NoError(tmpnet.StartCollectors(tc.DefaultContext(), tc.Log())) + shutdownDelay = tmpnet.NetworkShutdownDelay // Ensure a final metrics scrape } e2e.StartNetwork( tc, network, avalancheGoExecPath, - "", /* pluginDir */ - 0, /* shutdownDelay */ + "", /* pluginDir */ + shutdownDelay, false, /* skipShutdown */ false, /* reuseNetwork */ )