Skip to content

Commit

Permalink
Merge pull request #22 from box/metrics-update
Browse files Browse the repository at this point in the history
Refactor metrics, add tests
  • Loading branch information
gregory-lyons authored Aug 1, 2017
2 parents 5128ed0 + 687fa05 commit 6ebfe2d
Show file tree
Hide file tree
Showing 12 changed files with 240 additions and 151 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ In rare cases, you may wish to trigger a kube-applier run without checking in a
![screenshot](https://github.com/box/kube-applier/raw/master/static/img/status_page_screenshot.png "Status Page Screenshot")

kube-applier hosts a status page on a webserver, served at the service endpoint URL. The status page displays information about the most recent apply run, including:
* Run Type
* Start and end times
* Latency
* Most recent commit
Expand All @@ -135,7 +136,7 @@ The HTML template for the status page lives in `templates/status.html`, and `sta

### Metrics
kube-applier uses [Prometheus](https://github.com/prometheus/client_golang) for metrics. Metrics are hosted on the webserver at /metrics (status UI is the index page). In addition to the Prometheus default metrics, the following custom metrics are included:
* **run_latency_seconds** - A [Summary](https://godoc.org/github.com/prometheus/client_golang/prometheus#Summary) that keeps track of the durations of each apply run, tagged with a boolean for whether or not the run was a success (i.e. no failed apply attempts).
* **run_latency_seconds** - A [Summary](https://godoc.org/github.com/prometheus/client_golang/prometheus#Summary) that keeps track of the durations of each apply run, tagged with the run type and a boolean for whether or not the run was a success (i.e. no failed apply attempts).
* **file_apply_count** - A [Counter](https://godoc.org/github.com/prometheus/client_golang/prometheus#Counter) for each file that has had an apply attempt over the lifetime of the container, incremented with each apply attempt and tagged by the filepath and the result of the attempt.

The Prometheus [HTTP API](https://prometheus.io/docs/querying/api/) (also see the [Go library](https://github.com/prometheus/client_golang/tree/master/api/prometheus)) can be used for querying the metrics server.
Expand Down
15 changes: 10 additions & 5 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,6 @@ func main() {
log.Fatalf("Invalid DIFF_URL_FORMAT, must contain %q: %v", "%s", diffURLFormat)
}

metrics := &metrics.Prometheus{}
metrics.Init()

clock := &sysutil.Clock{}

if err := sysutil.WaitForDir(repoPath, clock, waitForRepoInterval); err != nil {
Expand All @@ -55,7 +52,6 @@ func main() {
kubeClient := &kube.Client{Server: server}
kubeClient.Configure()

batchApplier := &run.BatchApplier{kubeClient, metrics}
gitUtil := &git.GitUtil{repoPath}
fileSystem := &sysutil.FileSystem{}
listFactory := &applylist.Factory{repoPath, blacklistPath, whitelistPath, fileSystem}
Expand All @@ -74,6 +70,10 @@ func main() {
// Limit of 5 is arbitrary - there is significant delay between sends, and receives are handled near instantaneously.
runResults := make(chan run.Result, 5)

// Runner sends run results to runMetrics channel, metrics handler receives the results and updates its metrics.
// Limit of 5 is arbitrary - there is significant delay between sends, and receives are handled near instantaneously.
runMetrics := make(chan run.Result, 5)

// Runner, webserver, and scheduler all send fatal errors to errors channel, and main() exits upon receiving an error.
// No limit needed, as a single fatal error will exit the program anyway.
errors := make(chan error)
Expand All @@ -84,6 +84,10 @@ func main() {
// The runner will block on popping the current count until it is updated.
runCount := make(chan int)

metrics := &metrics.Prometheus{RunMetrics: runMetrics}
metrics.Configure()
batchApplier := &run.BatchApplier{kubeClient}

pollTicker := time.Tick(pollInterval)
fullRunTicker := time.Tick(fullRunInterval)

Expand All @@ -92,18 +96,19 @@ func main() {
listFactory,
gitUtil,
clock,
metrics,
diffURLFormat,
"",
quickRunQueue,
fullRunQueue,
runResults,
runMetrics,
errors,
runCount,
}
scheduler := &run.Scheduler{gitUtil, pollTicker, fullRunTicker, quickRunQueue, fullRunQueue, errors, ""}
webserver := &webserver.WebServer{listenPort, clock, metrics.GetHandler(), fullRunQueue, runResults, errors}

go metrics.StartMetricsLoop()
go scheduler.Start()
go runner.StartRunCounter()
go runner.StartQuickLoop()
Expand Down
45 changes: 0 additions & 45 deletions metrics/mock_prometheus.go

This file was deleted.

42 changes: 25 additions & 17 deletions metrics/prometheus.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,17 @@
package metrics

import (
"github.com/box/kube-applier/run"
"github.com/prometheus/client_golang/prometheus"
"net/http"
"strconv"
)

// PrometheusInterface allows for mocking out the functionality of Prometheus when testing the full process of an apply run.
type PrometheusInterface interface {
UpdateFileSuccess(string, bool)
UpdateRunLatency(float64, bool)
}

// Prometheus implements instrumentation of metrics for kube-applier.
// fileApplyCount is a Counter vector to increment the number of successful and failed apply attempts for each file in the repo.
// runLatency is a Summary vector that keeps track of the duration for apply runs.
type Prometheus struct {
RunMetrics <-chan run.Result
fileApplyCount *prometheus.CounterVec
runLatency *prometheus.SummaryVec
}
Expand All @@ -25,8 +21,8 @@ func (p *Prometheus) GetHandler() http.Handler {
return prometheus.UninstrumentedHandler()
}

// Init creates and registers the custom metrics for kube-applier.
func (p *Prometheus) Init() {
// Configure creates and registers the custom metrics for kube-applier. The loop that receives run results is started separately by StartMetricsLoop.
func (p *Prometheus) Configure() {
p.fileApplyCount = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "file_apply_count",
Help: "Success metric for every file applied",
Expand All @@ -45,23 +41,35 @@ func (p *Prometheus) Init() {
[]string{
// Result: true if the run was successful, false otherwise
"success",
// FullRun or QuickRun
"run_type",
},
)

prometheus.MustRegister(p.fileApplyCount)
prometheus.MustRegister(p.runLatency)
}

// UpdateFileSuccess increments the given file's Counter for either successful apply attempts or failed apply attempts.
func (p *Prometheus) UpdateFileSuccess(file string, success bool) {
p.fileApplyCount.With(prometheus.Labels{
"file": file, "success": strconv.FormatBool(success),
}).Inc()
// StartMetricsLoop receives from the RunMetrics channel and calls processResult when a run result comes in.
func (p *Prometheus) StartMetricsLoop() {
for result := range p.RunMetrics {
p.processResult(result)
}
}

// UpdateRunLatency adds a data point (latency of the most recent run) to the run_latency_seconds Summary metric, with a tag indicating whether or not the run was successful.
func (p *Prometheus) UpdateRunLatency(runLatency float64, success bool) {
// processResult parses a run result for info and updates the metrics (file_apply_count and run_latency_seconds).
func (p *Prometheus) processResult(result run.Result) {
runSuccess := len(result.Failures) == 0
runType := result.RunType
latency := result.Finish.Sub(result.Start).Seconds()
for _, successFile := range result.Successes {
p.fileApplyCount.With(prometheus.Labels{"file": successFile.FilePath, "success": "true"}).Inc()
}
for _, failureFile := range result.Failures {
p.fileApplyCount.With(prometheus.Labels{"file": failureFile.FilePath, "success": "false"}).Inc()
}
p.runLatency.With(prometheus.Labels{
"success": strconv.FormatBool(success),
}).Observe(runLatency)
"success": strconv.FormatBool(runSuccess),
"run_type": string(runType),
}).Observe(latency)
}
139 changes: 139 additions & 0 deletions metrics/prometheus_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
package metrics

import (
"fmt"
"github.com/box/kube-applier/run"
"github.com/stretchr/testify/assert"
"net/http"
"net/http/httptest"
"regexp"
"testing"
)

// testCase describes one simulated apply run together with the regexp patterns
// that the metrics output is expected to match once that run has been processed.
type testCase struct {
// successes is copied into run.Result.Successes for the simulated run.
successes []run.ApplyAttempt
// failures is copied into run.Result.Failures for the simulated run.
failures []run.ApplyAttempt
// runType is copied into run.Result.RunType for the simulated run.
runType run.RunType
// expectedPatterns are regexp patterns, each of which must match the raw metrics output.
expectedPatterns []string
}

// TestPrometheusProcessResult tests the processResult() function to ensure that the metrics page is updated properly.
// With each "test case", we construct a fake run.Result and call processResult.
// We then make a request to the metrics page handler and parse its raw output.
// We then use regexp patterns to check that the raw output has the expected state for each metric.
// Note that filenames are reused in order to ensure that the metrics update iteratively.
func TestPrometheusProcessResult(t *testing.T) {
runMetrics := make(chan run.Result, 5)
p := &Prometheus{RunMetrics: runMetrics}
p.Configure()

testCases := []testCase{
// Case 1: No successes, no failures, full run
{
[]run.ApplyAttempt{},
[]run.ApplyAttempt{},
run.FullRun,
[]string{
// Expect count 1 for latency metric with run_type=fullRun, success=true
makeLatencyPattern(run.FullRun, true, 1),
},
},
// Case 2: Successes, no failures, full run
{
[]run.ApplyAttempt{{FilePath: "file1"}, {FilePath: "file2"}},
[]run.ApplyAttempt{},
run.FullRun,
[]string{
// Expect count 2 for latency metric with run_type=fullRun, success=true
makeLatencyPattern(run.FullRun, true, 2),
// Expect count 1 for file1 with success=true
makeFilePattern("file1", true, 1),
// Expect count 1 for file2 with success=true
makeFilePattern("file2", true, 1),
},
},
// Case 3: Successes, failures, full run
{
[]run.ApplyAttempt{{FilePath: "file1"}, {FilePath: "file3"}},
[]run.ApplyAttempt{{FilePath: "file2"}},
run.FullRun,
[]string{
// Expect count 1 for latency metric with run_type=fullRun, success=false
makeLatencyPattern(run.FullRun, false, 1),
// Expect count 2 for file1 with success=true
makeFilePattern("file1", true, 2),
// Expect count 1 for file3 with success=true
makeFilePattern("file3", true, 1),
// Expect count 1 for file2 with success=false
makeFilePattern("file2", false, 1),

// Ensure that previous metrics remain unchanged.
// Expect count 2 for latency metric with run_type=fullRun, success=true
makeLatencyPattern(run.FullRun, true, 2),
// Expect count 1 for file2 with success=true
makeFilePattern("file2", true, 1),
},
},
// Case 4: Successes, failures, quick run
{
[]run.ApplyAttempt{{FilePath: "file1"}, {FilePath: "file3"}},
[]run.ApplyAttempt{{FilePath: "file2"}},
run.QuickRun,
[]string{
// Expect count 1 for latency metric with run_type=quickRun, success=false
makeLatencyPattern(run.QuickRun, false, 1),
// Expect count 3 for file1 with success=true
makeFilePattern("file1", true, 3),
// Expect count 2 for file3 with success=true
makeFilePattern("file3", true, 2),
// Expect count 2 for file2 with success=false
makeFilePattern("file2", false, 2),

// Ensure that previous metrics remain unchanged.
// Expect count 2 for latency metric with run_type=fullRun, success=true
makeLatencyPattern(run.FullRun, true, 2),
// Expect count 1 for latency metric with run_type=fullRun, success=false
makeLatencyPattern(run.FullRun, false, 1),
// Expect count 1 for file2 with success=true
makeFilePattern("file2", true, 1),
},
},
}

for _, tc := range testCases {
processAndCheckOutput(t, p, tc)
}
}

// requestContentBody sends a GET request for "/" directly to handler and
// returns the response body the handler wrote.
//
// The request is built with httptest.NewRequest, which creates an incoming
// server request suitable for passing straight to an http.Handler in tests.
// Unlike http.NewRequest it has no error return to discard: it panics if the
// target is malformed, which cannot happen for the constant target used here.
func requestContentBody(handler http.Handler) string {
	req := httptest.NewRequest(http.MethodGet, "/", nil)
	recorder := httptest.NewRecorder()
	handler.ServeHTTP(recorder, req)
	return recorder.Body.String()
}

// makeFilePattern builds a regexp pattern that matches the file_apply_count
// sample line for the given file and success label with exactly the given count.
//
// The filename is escaped with regexp.QuoteMeta before being embedded, so that
// regexp metacharacters that commonly occur in file paths (such as the "." in
// "deploy.yaml") are matched literally instead of loosening the pattern.
func makeFilePattern(filename string, success bool, count int) string {
	return fmt.Sprintf(
		"\\bfile_apply_count\\{file\\=\"%v\",success\\=\"%v\"\\} %v\\b",
		regexp.QuoteMeta(filename), success, count)
}

// makeLatencyPattern returns a regexp pattern that matches the
// run_latency_seconds_count sample line carrying the given run_type and
// success labels and exactly the given count.
func makeLatencyPattern(runType run.RunType, success bool, count int) string {
	const format = "\\brun_latency_seconds_count\\{run_type\\=\"%v\",success\\=\"%v\"\\} %v\\b"
	return fmt.Sprintf(format, runType, success, count)
}

// processAndCheckOutput builds a run.Result from the test case, passes it to
// p.processResult, and then checks that the raw output of the metrics handler
// matches every pattern in tc.expectedPatterns.
//
// The match result and the compile error are asserted separately, and each
// assertion names the pattern involved, so a failure identifies which
// expectation was not met (or which pattern is not a valid regexp).
func processAndCheckOutput(t *testing.T, p *Prometheus, tc testCase) {
	// Named "check" rather than "assert" to avoid shadowing the assert package.
	check := assert.New(t)
	result := run.Result{Successes: tc.successes, Failures: tc.failures, RunType: tc.runType}
	p.processResult(result)
	metricsRaw := requestContentBody(p.GetHandler())
	for _, pattern := range tc.expectedPatterns {
		matched, err := regexp.MatchString(pattern, metricsRaw)
		check.NoError(err, "invalid pattern %q", pattern)
		check.True(matched, "metrics output does not match pattern %q", pattern)
	}
}
5 changes: 1 addition & 4 deletions run/batch_applier.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package run

import (
"github.com/box/kube-applier/kube"
"github.com/box/kube-applier/metrics"
"log"
)

Expand All @@ -19,10 +18,9 @@ type BatchApplierInterface interface {
Apply(int, []string) (successes []ApplyAttempt, failures []ApplyAttempt)
}

// BatchApplier makes apply calls for a batch of files, and updates metrics based on the results of each call.
// BatchApplier makes apply calls for a batch of files.
type BatchApplier struct {
KubeClient kube.ClientInterface
Metrics metrics.PrometheusInterface
}

// Apply takes a list of files and attempts an apply command on each, labeling logs with the run ID.
Expand All @@ -47,7 +45,6 @@ func (a *BatchApplier) Apply(id int, applyList []string) (successes []ApplyAttem
failures = append(failures, appliedFile)
log.Printf("RUN %v: %v\n%v\n%v", id, cmd, output, appliedFile.ErrorMessage)
}
a.Metrics.UpdateFileSuccess(path, success)
}
return successes, failures
}
Loading

0 comments on commit 6ebfe2d

Please sign in to comment.