Skip to content

Commit

Permalink
[tmpnet] Add check for log and metric collection to monitoring action
Browse files Browse the repository at this point in the history
  • Loading branch information
maru-ava committed Feb 24, 2025
1 parent 4dce5a1 commit e4243fc
Show file tree
Hide file tree
Showing 7 changed files with 350 additions and 17 deletions.
25 changes: 24 additions & 1 deletion .github/actions/run-monitored-tmpnet-cmd/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -80,4 +80,27 @@ runs:
if: always()
with:
name: ${{ inputs.artifact_prefix }}-tmpnet-data
# TODO(marun) Check that collection is working by querying prometheus and loki with the GH_* labels above
- name: Check that logs were collected
shell: bash
run: go run ./tests/fixture/tmpnet/cmd check-logs
env:
LOKI_USERNAME: ${{ inputs.loki_username }}
LOKI_PASSWORD: ${{ inputs.loki_password }}
GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }}
GH_WORKFLOW: ${{ inputs.workflow }}
GH_RUN_ID: ${{ inputs.run_id }}
GH_RUN_NUMBER: ${{ inputs.run_number }}
GH_RUN_ATTEMPT: ${{ inputs.run_attempt }}
GH_JOB_ID: ${{ inputs.job }}
- name: Check that metrics were collected
shell: bash
run: go run ./tests/fixture/tmpnet/cmd check-metrics
env:
PROMETHEUS_USERNAME: ${{ inputs.prometheus_username }}
PROMETHEUS_PASSWORD: ${{ inputs.prometheus_password }}
GH_REPO: ${{ inputs.repository_owner }}/${{ inputs.repository_name }}
GH_WORKFLOW: ${{ inputs.workflow }}
GH_RUN_ID: ${{ inputs.run_id }}
GH_RUN_NUMBER: ${{ inputs.run_number }}
GH_RUN_ATTEMPT: ${{ inputs.run_attempt }}
GH_JOB_ID: ${{ inputs.job }}
4 changes: 4 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,8 @@ github.com/jessevdk/go-flags v0.0.0-20141203071132-1679536dcc89/go.mod h1:4FA24M
github.com/jessevdk/go-flags v1.4.0/go.mod h1:4FA24M0QyGHXBuZZK/XkWh8h0e1EYbRYJSGM75WSRxI=
github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA=
github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4=
github.com/jrick/logrotate v1.0.0/go.mod h1:LNinyqDIJnpAur+b8yyulnQw/wDuN1+BYKlTRt3OuAQ=
github.com/json-iterator/go v1.1.6/go.mod h1:+SdeFBvtyEkXs7REEP0seUULqWtbJapLOCVDaaPEHmU=
github.com/json-iterator/go v1.1.9/go.mod h1:KdQUCv79m/52Kvf8AW2vK1V8akMuk1QjK/uOdHXbAo4=
Expand Down Expand Up @@ -478,6 +480,8 @@ github.com/mr-tron/base58 v1.2.0 h1:T/HDJBh4ZCPbU39/+c3rRvE0uKBQlU27+QI8LJ4t64o=
github.com/mr-tron/base58 v1.2.0/go.mod h1:BinMc/sQntlIE1frQmRFPUoPA1Zkr8VRgBdjWI2mNwc=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU=
github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U=
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f h1:y5//uYreIhSUg3J1GEMiLbxo1LJaP8RfCpH6pymGZus=
github.com/mxk/go-flowrate v0.0.0-20140419014527-cca7078d478f/go.mod h1:ZdcZmHo+o7JKHSa8/e818NopupXU1YMK5fe1lsApnBw=
github.com/nats-io/jwt v0.3.0/go.mod h1:fRYCDE99xlTsqUzISS1Bi75UBJ6ljOJQOAAu5VglpSg=
Expand Down
1 change: 1 addition & 0 deletions tests/fixture/tmpnet/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ the following non-test files:

| Filename | Types | Purpose |
|:----------------------------|:------------|:------------------------------------------------------------|
| check_collection.go | | Enables checking if logs and metrics were collected |
| defaults.go | | Defines common default configuration |
| detached_process_default.go | | Configures detached processes for darwin and linux |
| detached_process_windows.go | | No-op detached process configuration for windows |
Expand Down
256 changes: 256 additions & 0 deletions tests/fixture/tmpnet/check_collection.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,256 @@
// Copyright (C) 2019-2024, Ava Labs, Inc. All rights reserved.
// See the file LICENSE for licensing terms.

package tmpnet

import (
"context"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/url"
"strconv"
"strings"
"time"

"github.com/prometheus/client_golang/api"
"github.com/prometheus/client_golang/api/prometheus/v1"
"github.com/prometheus/common/model"
"go.uber.org/zap"

"github.com/ava-labs/avalanchego/utils/logging"
)

// CheckLogsExist checks if logs exist for the given network. Github labels are also
// included if provided as env vars (GH_*).
func CheckLogsExist(ctx context.Context, log logging.Logger, networkUUID string) error {
username, password, err := getCollectorCredentials(promtailCmd)
if err != nil {
return fmt.Errorf("failed to get collector credentials: %w", err)
}
query, err := getCheckLogsQuery(networkUUID)
if err != nil {
return err
}
url := getLokiURL()

log.Info("checking if logs exist",
zap.String("url", url),
zap.String("query", query),
)

logsCount, err := queryLoki(ctx, url, username, password, query)
if err != nil {
return err
}

if logsCount > 0 {
log.Info("logs exist",
zap.Int("count", logsCount),
)
return nil
}

return errors.New("logs not found")
}

// getCheckLogsQuery returns the query to check if logs exist.
func getCheckLogsQuery(networkUUID string) (string, error) {
selectors, err := getSelectors(networkUUID)
if err != nil {
return "", err
}
return fmt.Sprintf("sum(count_over_time({%s}[1h]))", selectors), nil
}

func queryLoki(
ctx context.Context,
lokiURL string,
username string,
password string,
query string,
) (int, error) {
// Compose the URL
params := url.Values{}
params.Add("query", query)
reqURL := fmt.Sprintf("%s/loki/api/v1/query?%s", lokiURL, params.Encode())

// Create request
req, err := http.NewRequestWithContext(ctx, http.MethodGet, reqURL, nil)
if err != nil {
return 0, fmt.Errorf("failed to create request: %w", err)
}

auth := base64.StdEncoding.EncodeToString([]byte(username + ":" + password))
req.Header.Set("Authorization", "Basic "+auth)

// Execute request
resp, err := http.DefaultClient.Do(req)
if err != nil {
return 0, fmt.Errorf("failed to execute request: %w", err)
}
defer resp.Body.Close()

// Read and parse response
body, err := io.ReadAll(resp.Body)
if err != nil {
return 0, fmt.Errorf("failed to read response: %w", err)
}

if resp.StatusCode != http.StatusOK {
return 0, fmt.Errorf("unexpected status code %d: %s", resp.StatusCode, string(body))
}

// Parse JSON response
var result struct {
Status string `json:"status"`
Data struct {
Result []struct {
Value []interface{} `json:"value"`
} `json:"result"`
} `json:"data"`
}

if err := json.Unmarshal(body, &result); err != nil {
return 0, fmt.Errorf("failed to parse response: %w", err)
}

// Extract count value
if len(result.Data.Result) == 0 {
return 0, nil
}
if len(result.Data.Result[0].Value) != 2 {
return 0, errors.New("unexpected value format in response")
}
// Convert value to a string
valueStr, ok := result.Data.Result[0].Value[1].(string)
if !ok {
return 0, errors.New("value is not a string")
}
// Convert string to float64 first to handle scientific notation
floatVal, err := strconv.ParseFloat(valueStr, 64)
if err != nil {
return 0, fmt.Errorf("parsing count value: %w", err)
}
// Round to nearest integer
return int(floatVal + 0.5), nil
}

// CheckMetricsExist checks if metrics exist for the given network. Github labels are also
// included if provided as env vars (GH_*).
func CheckMetricsExist(ctx context.Context, log logging.Logger, networkUUID string) error {
username, password, err := getCollectorCredentials(prometheusCmd)
if err != nil {
return fmt.Errorf("failed to get collector credentials: %w", err)
}
query, err := getCheckMetricsQuery(networkUUID)
if err != nil {
return err
}
url := getPrometheusURL()

log.Info("checking if metrics exist",
zap.String("url", url),
zap.String("query", query),
)

count, err := queryPrometheus(ctx, log, url, username, password, query)
if err != nil {
return err
}
if count == 0 {
return errors.New("metrics not found")
}

log.Info("metrics exist",
zap.Int("count", count),
)
return nil
}

// getCheckMetricsQuery returns the query to check if metrics exist.
func getCheckMetricsQuery(networkUUID string) (string, error) {
selectors, err := getSelectors(networkUUID)
if err != nil {
return "", err
}
return fmt.Sprintf("count({%s})", selectors), nil
}

func queryPrometheus(
ctx context.Context,
log logging.Logger,
url string,
username string,
password string,
query string,
) (int, error) {
// Create client with basic auth
client, err := api.NewClient(api.Config{
Address: url,
RoundTripper: &basicAuthRoundTripper{
username: username,
password: password,
rt: api.DefaultRoundTripper,
},
})
if err != nil {
return 0, fmt.Errorf("failed to create client: %w", err)
}

// Query Prometheus
result, warnings, err := v1.NewAPI(client).QueryRange(ctx, query, v1.Range{
Start: time.Now().Add(-time.Hour),
End: time.Now(),
Step: time.Minute,
})
if err != nil {
return 0, fmt.Errorf("query failed: %w", err)
}
if len(warnings) > 0 {
log.Warn("prometheus query warnings",
zap.Strings("warnings", warnings),
)
}

if matrix, ok := result.(model.Matrix); !ok {
return 0, fmt.Errorf("unexpected result type: %s", result.Type())
} else if len(matrix) > 0 {
return int(matrix[0].Values[len(matrix[0].Values)-1].Value), nil
}

return 0, nil
}

type basicAuthRoundTripper struct {
username, password string
rt http.RoundTripper
}

func (b *basicAuthRoundTripper) RoundTrip(req *http.Request) (*http.Response, error) {
req.SetBasicAuth(b.username, b.password)
return b.rt.RoundTrip(req)
}

// getSelectors returns the comma-separated list of selectors.
func getSelectors(networkUUID string) (string, error) {
selectors := []string{}
if len(networkUUID) > 0 {
selectors = append(selectors, fmt.Sprintf(`network_uuid="%s"`, networkUUID))
}
githubLabels := githubLabelsFromEnv()
for label := range githubLabels {
value, err := githubLabels.GetStringVal(label)
if err != nil {
return "", err
}
if len(value) == 0 {
continue
}
selectors = append(selectors, fmt.Sprintf(`%s="%s"`, label, value))
}
return strings.Join(selectors, ","), nil
}
50 changes: 45 additions & 5 deletions tests/fixture/tmpnet/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"io/fs"
"os"
"path/filepath"
"time"

"github.com/spf13/cobra"
"go.uber.org/zap"
Expand Down Expand Up @@ -81,10 +80,7 @@ func main() {
Nodes: tmpnet.NewNodesOrPanic(int(nodeCount)),
}

// Extreme upper bound, should never take this long
networkStartTimeout := 2 * time.Minute

ctx, cancel := context.WithTimeout(context.Background(), networkStartTimeout)
ctx, cancel := context.WithTimeout(context.Background(), tmpnet.DefaultNetworkTimeout)
defer cancel()
if err := tmpnet.BootstrapNewNetwork(
ctx,
Expand Down Expand Up @@ -196,6 +192,50 @@ func main() {
}
rootCmd.AddCommand(stopCollectorsCmd)

var networkUUID string

checkMetricsCmd := &cobra.Command{
Use: "check-metrics",
Short: "Checks whether the default prometheus server has the expected metrics",
RunE: func(*cobra.Command, []string) error {
ctx, cancel := context.WithTimeout(context.Background(), tmpnet.DefaultNetworkTimeout)
defer cancel()
log, err := tests.LoggerForFormat("", rawLogFormat)
if err != nil {
return err
}
return tmpnet.CheckMetricsExist(ctx, log, networkUUID)
},
}
checkMetricsCmd.PersistentFlags().StringVar(
&networkUUID,
"network-uuid",
"",
"[optional] The network UUID to check metrics for. Labels read from GH_* env vars will always be used.",
)
rootCmd.AddCommand(checkMetricsCmd)

checkLogsCmd := &cobra.Command{
Use: "check-logs",
Short: "Checks whether the default loki server has the expected logs",
RunE: func(*cobra.Command, []string) error {
ctx, cancel := context.WithTimeout(context.Background(), tmpnet.DefaultNetworkTimeout)
defer cancel()
log, err := tests.LoggerForFormat("", rawLogFormat)
if err != nil {
return err
}
return tmpnet.CheckLogsExist(ctx, log, networkUUID)
},
}
checkLogsCmd.PersistentFlags().StringVar(
&networkUUID,
"network-uuid",
"",
"[optional] The network UUID to check logs for. Labels read from GH_* env vars will always be used.",
)
rootCmd.AddCommand(checkLogsCmd)

if err := rootCmd.Execute(); err != nil {
fmt.Fprintf(os.Stderr, "tmpnetctl failed: %v\n", err)
os.Exit(1)
Expand Down
Loading

0 comments on commit e4243fc

Please sign in to comment.