diff --git a/.github/actions/run-monitored-tmpnet-cmd/action.yml b/.github/actions/run-monitored-tmpnet-cmd/action.yml
index 205e1d677116..878d8748eaa2 100644
--- a/.github/actions/run-monitored-tmpnet-cmd/action.yml
+++ b/.github/actions/run-monitored-tmpnet-cmd/action.yml
@@ -80,4 +80,16 @@ runs:
       if: always()
       with:
         name: ${{ inputs.artifact_prefix }}-tmpnet-data
-    # TODO(marun) Check that collection is working by querying prometheus and loki with the GH_* labels above
+    - name: Check that metrics were collected
+      shell: bash
+      run: go run ./tests/fixture/tmpnet/cmd check-metrics
+      env:
+        PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }}
+        PROMETHEUS_PASSWORD: ${{ inputs.prometheus_password }}
+        GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }}
+        GH_WORKFLOW: ${{ inputs.workflow }}
+        GH_RUN_ID: ${{ inputs.run_id }}
+        GH_RUN_NUMBER: ${{ inputs.run_number }}
+        GH_RUN_ATTEMPT: ${{ inputs.run_attempt }}
+        GH_JOB_ID: ${{ inputs.job }}
+    # TODO(marun) Check that log collection was successful
diff --git a/tests/fixture/tmpnet/check_metrics.go b/tests/fixture/tmpnet/check_metrics.go
new file mode 100644
index 000000000000..f54da992529c
--- /dev/null
+++ b/tests/fixture/tmpnet/check_metrics.go
@@ -0,0 +1,165 @@
+// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
+// See the file LICENSE for licensing terms.
+
+package tmpnet
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"net/http"
+	"strings"
+	"time"
+
+	"github.com/prometheus/client_golang/api"
+	"github.com/prometheus/client_golang/api/prometheus/v1"
+	"github.com/prometheus/common/model"
+	"go.uber.org/zap"
+
+	"github.com/ava-labs/avalanchego/utils/logging"
+)
+
+// prometheusQueryTimeout bounds how long a single prometheus query may
+// take so that an unresponsive server cannot block the caller forever.
+const prometheusQueryTimeout = 30 * time.Second
+
+// CheckMetricsExist checks if metrics exist for the given
+// network. Github labels are also included if provided as env vars
+// (GH_*).
+//
+// Returns nil if the query returns at least one result. Returns an
+// error if the collector credentials cannot be retrieved, the query
+// cannot be built or executed, or the query returns no results.
+func CheckMetricsExist(
+	log logging.Logger,
+	networkUUID string,
+) error {
+	username, password, err := getCollectorCredentials(prometheusCmd)
+	if err != nil {
+		return fmt.Errorf("failed to get collector credentials: %w", err)
+	}
+	query, err := getCheckMetricsQuery(networkUUID)
+	if err != nil {
+		return err
+	}
+	url := getPrometheusURL()
+
+	log.Info("checking if metrics exist",
+		zap.String("url", url),
+		zap.String("query", query),
+	)
+
+	results, err := queryPrometheus(log, url, username, password, query)
+	if err != nil {
+		return err
+	}
+
+	metricsCount := len(results)
+	if metricsCount == 0 {
+		return errors.New("metrics not found")
+	}
+
+	log.Info("metrics exist",
+		zap.String("query", query),
+		zap.Int("count", metricsCount),
+	)
+	return nil
+}
+
+// getCheckMetricsQuery returns the query to check if metrics exist.
+//
+// The query is a selector with one matcher for the network UUID (if
+// non-empty) and one for each non-empty label returned by
+// githubLabelsFromEnv. An error is returned if there is nothing to
+// match on, since prometheus rejects a selector without any non-empty
+// matcher.
+func getCheckMetricsQuery(networkUUID string) (string, error) {
+	selectors := []string{}
+	if len(networkUUID) > 0 {
+		// %q escapes quotes and backslashes so that a value cannot
+		// break out of its quoted matcher.
+		selectors = append(selectors, fmt.Sprintf("network_uuid=%q", networkUUID))
+	}
+	githubLabels := githubLabelsFromEnv()
+	for label := range githubLabels {
+		value, err := githubLabels.GetStringVal(label)
+		if err != nil {
+			return "", err
+		}
+		if len(value) == 0 {
+			continue
+		}
+		selectors = append(selectors, fmt.Sprintf("%s=%q", label, value))
+	}
+	if len(selectors) == 0 {
+		return "", errors.New("no selectors available: provide a network UUID or set GH_* env vars")
+	}
+	return fmt.Sprintf("{%s}", strings.Join(selectors, ",")), nil
+}
+
+// queryPrometheus runs an instant query against the prometheus server
+// at the given URL, authenticating with basic auth, and returns the
+// resulting vector. Query warnings are logged and do not cause an error.
+func queryPrometheus(
+	log logging.Logger,
+	url string,
+	username string,
+	password string,
+	query string,
+) (model.Vector, error) {
+	// Create client with basic auth
+	client, err := api.NewClient(api.Config{
+		Address: url,
+		RoundTripper: &basicAuthRoundTripper{
+			username: username,
+			password: password,
+			rt:       api.DefaultRoundTripper,
+		},
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to create client: %w", err)
+	}
+
+	// Bound the query so that an unresponsive server fails the check
+	// instead of hanging it.
+	ctx, cancel := context.WithTimeout(context.Background(), prometheusQueryTimeout)
+	defer cancel()
+
+	// Query Prometheus
+	result, warnings, err := v1.NewAPI(client).Query(
+		ctx,
+		query,
+		time.Now(),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("query failed: %w", err)
+	}
+	if len(warnings) > 0 {
+		log.Warn("prometheus query warnings",
+			zap.Strings("warnings", warnings),
+		)
+	}
+
+	// Check results
+	vector, ok := result.(model.Vector)
+	if !ok {
+		return nil, fmt.Errorf("unexpected result type: %s", result.Type())
+	}
+	return vector, nil
+}
+
+// basicAuthRoundTripper adds basic auth credentials to each request
+// before handing it to the wrapped round tripper.
+type basicAuthRoundTripper struct {
+	username, password string
+	rt                 http.RoundTripper
+}
+
+// RoundTrip sends a copy of the request with the basic auth header set.
+// The copy is needed because the http.RoundTripper contract does not
+// allow an implementation to modify the caller's request.
+func (b *basicAuthRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
+	authedReq := req.Clone(req.Context())
+	authedReq.SetBasicAuth(b.username, b.password)
+	return b.rt.RoundTrip(authedReq)
+}
diff --git a/tests/fixture/tmpnet/cmd/main.go b/tests/fixture/tmpnet/cmd/main.go
index bdd00c397cc0..600f34aec16e 100644
--- a/tests/fixture/tmpnet/cmd/main.go
+++ b/tests/fixture/tmpnet/cmd/main.go
@@ -196,6 +196,26 @@ func main() {
 	}
 	rootCmd.AddCommand(stopCollectorsCmd)
 
+	var networkUUID string
+	checkMetricsCmd := &cobra.Command{
+		Use:   "check-metrics",
+		Short: "Checks whether the default prometheus server has the expected metrics",
+		RunE: func(*cobra.Command, []string) error {
+			log, err := tests.LoggerForFormat("", rawLogFormat)
+			if err != nil {
+				return err
+			}
+			return tmpnet.CheckMetricsExist(log, networkUUID)
+		},
+	}
+	checkMetricsCmd.PersistentFlags().StringVar(
+		&networkUUID,
+		"network-uuid",
+		"",
+		"[optional] The network UUID to check metrics for. Labels read from GH_* env vars will always be used.",
+	)
+	rootCmd.AddCommand(checkMetricsCmd)
+
 	if err := rootCmd.Execute(); err != nil {
 		fmt.Fprintf(os.Stderr, "tmpnetctl failed: %v\n", err)
 		os.Exit(1)
diff --git a/tests/fixture/tmpnet/node_process.go b/tests/fixture/tmpnet/node_process.go
index 7013de4561f3..adf8da8214cb 100644
--- a/tests/fixture/tmpnet/node_process.go
+++ b/tests/fixture/tmpnet/node_process.go
@@ -262,16 +262,8 @@ func (p *NodeProcess) writeMonitoringConfig() error {
 		"node_id":           p.node.NodeID,
 		"is_ephemeral_node": strconv.FormatBool(p.node.IsEphemeral),
 		"network_owner":     p.node.NetworkOwner,
-		// prometheus/promtail ignore empty values so including these
-		// labels with empty values outside of a github worker (where
-		// the env vars will not be set) should not be a problem.
-		"gh_repo":        os.Getenv("GH_REPO"),
-		"gh_workflow":    os.Getenv("GH_WORKFLOW"),
-		"gh_run_id":      os.Getenv("GH_RUN_ID"),
-		"gh_run_number":  os.Getenv("GH_RUN_NUMBER"),
-		"gh_run_attempt": os.Getenv("GH_RUN_ATTEMPT"),
-		"gh_job_id":      os.Getenv("GH_JOB_ID"),
 	}
+	commonLabels.SetDefaults(githubLabelsFromEnv())
 
 	prometheusConfig := []FlagsMap{
 		{
@@ -419,3 +411,17 @@ func watchLogFileForFatal(ctx context.Context, cancelWithCause context.CancelCau
 		}
 	}
 }
+
+func githubLabelsFromEnv() FlagsMap {
+	return FlagsMap{
+		// prometheus/promtail ignore empty values so including these
+		// labels with empty values outside of a github worker (where
+		// the env vars will not be set) should not be a problem.
+		"gh_repo":        os.Getenv("GH_REPO"),
+		"gh_workflow":    os.Getenv("GH_WORKFLOW"),
+		"gh_run_id":      os.Getenv("GH_RUN_ID"),
+		"gh_run_number":  os.Getenv("GH_RUN_NUMBER"),
+		"gh_run_attempt": os.Getenv("GH_RUN_ATTEMPT"),
+		"gh_job_id":      os.Getenv("GH_JOB_ID"),
+	}
+}