Skip to content

Commit 11aba6d

Browse files
authored
Merge pull request #153 from fairfaxmedia/feature/aggregate-datacenter-combined
Implement aggregated datacenter metrics
2 parents f895b40 + 5582807 commit 11aba6d

File tree

8 files changed

+662
-16
lines changed

8 files changed

+662
-16
lines changed

README.md

+13
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,19 @@ Different flags (for the same filter target) combine with AND semantics. For
119119
example, `-metric-allowlist 'bytes_total$' -metric-blocklist imgopto` would only
120120
export metrics whose names ended in bytes_total, but didn't include imgopto.
121121

122+
### Metrics Grouping: by datacenter or aggregate
123+
124+
The Fastly real-time stats API returns measurements grouped by datacenter as
125+
well as aggregated measurements for all datacenters. By default, exported
126+
metrics are grouped by datacenter. The response body size of the metrics
127+
endpoint can potentially be very large. This will be exacerbated when using
128+
the exporter with many services, many origins with Origin Inspector, and many
129+
domains with Domain Inspector. One way to reduce the output size of the
130+
metrics endpoint is to use the `-aggregate-only` flag. When this flag is
131+
used, only the `aggregated` metrics from the real-time stats API will be
132+
exported. Metrics will still include the datacenter label, but it will always
133+
be set to "aggregate".
134+
122135
### Service discovery
123136

124137
Per-service metrics are available via `/metrics?target=<service ID>`. Available

cmd/fastly-exporter/main.go

+3
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ func main() {
4444
serviceRefresh time.Duration
4545
apiTimeout time.Duration
4646
rtTimeout time.Duration
47+
aggregateOnly bool
4748
debug bool
4849
versionFlag bool
4950
configFileExample bool
@@ -67,6 +68,7 @@ func main() {
6768
fs.DurationVar(&serviceRefresh, "api-refresh", 1*time.Minute, "DEPRECATED -- use service-refresh instead")
6869
fs.DurationVar(&apiTimeout, "api-timeout", 15*time.Second, "HTTP client timeout for api.fastly.com requests (5–60s)")
6970
fs.DurationVar(&rtTimeout, "rt-timeout", 45*time.Second, "HTTP client timeout for rt.fastly.com requests (45–120s)")
71+
fs.BoolVar(&aggregateOnly, "aggregate-only", false, "Use aggregated data rather than per-datacenter")
7072
fs.BoolVar(&debug, "debug", false, "log debug information")
7173
fs.BoolVar(&versionFlag, "version", false, "print version information and exit")
7274
fs.String("config-file", "", "config file (optional)")
@@ -341,6 +343,7 @@ func main() {
341343
subscriberOptions = []rt.SubscriberOption{
342344
rt.WithLogger(rtLogger),
343345
rt.WithMetadataProvider(serviceCache),
346+
rt.WithAggregateOnly(aggregateOnly),
344347
}
345348
)
346349
manager = rt.NewManager(serviceCache, rtClient, token, registry, subscriberOptions, productCache, rtLogger)

pkg/domain/process.go

+11-1
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,18 @@
11
package domain
22

33
// Process updates the metrics with data from the API response.
4-
func Process(response *Response, serviceID, serviceName, serviceVersion string, m *Metrics) {
4+
func Process(response *Response, serviceID, serviceName, _ string, m *Metrics, aggregateOnly bool) {
5+
const aggregateDC = "aggregate"
6+
57
for _, d := range response.Data {
8+
if aggregateOnly {
9+
for domain, stats := range d.Aggregated {
10+
process(serviceID, serviceName, aggregateDC, domain, stats, m)
11+
}
12+
13+
continue
14+
}
15+
616
for datacenter, byDomain := range d.Datacenter {
717
for domain, stats := range byDomain {
818
process(serviceID, serviceName, datacenter, domain, stats, m)

pkg/origin/process.go

+11-1
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,18 @@ const (
77
)
88

99
// Process updates the metrics with data from the API response.
10-
func Process(response *Response, serviceID, serviceName, serviceVersion string, m *Metrics) {
10+
func Process(response *Response, serviceID, serviceName, _ string, m *Metrics, aggregateOnly bool) {
11+
const aggregateDC = "aggregate"
12+
1113
for _, d := range response.Data {
14+
if aggregateOnly {
15+
for origin, stats := range d.Aggregated {
16+
process(serviceID, serviceName, aggregateDC, origin, stats, m)
17+
}
18+
19+
continue
20+
}
21+
1222
for datacenter, byOrigin := range d.Datacenter {
1323
for origin, stats := range byOrigin {
1424
process(serviceID, serviceName, datacenter, origin, stats, m)

pkg/realtime/process.go

+9-1
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,16 @@ import (
77
)
88

99
// Process updates the metrics with data from the API response.
10-
func Process(response *Response, serviceID, serviceName, serviceVersion string, m *Metrics) {
10+
func Process(response *Response, serviceID, serviceName, _ string, m *Metrics, aggregateOnly bool) {
11+
const aggregateDC = "aggregate"
12+
1113
for _, d := range response.Data {
14+
if aggregateOnly {
15+
process(serviceID, serviceName, aggregateDC, d.Aggregated, m)
16+
17+
continue
18+
}
19+
1220
for datacenter, stats := range d.Datacenter {
1321
process(serviceID, serviceName, datacenter, stats, m)
1422
}

pkg/rt/common_test.go

+450
Large diffs are not rendered by default.

pkg/rt/subscriber.go

+21-13
Original file line numberDiff line numberDiff line change
@@ -35,16 +35,17 @@ type MetadataProvider interface {
3535
// Subscriber polls rt.fastly.com endpoints for a single service ID. It emits
3636
// the received stats data to Prometheus metrics.
3737
type Subscriber struct {
38-
client HTTPClient
39-
token string
40-
serviceID string
41-
provider MetadataProvider
42-
metrics *prom.Metrics
43-
postprocess func()
44-
logger log.Logger
45-
rtDelayCount int
46-
oiDelayCount int
47-
diDelayCount int
38+
client HTTPClient
39+
token string
40+
serviceID string
41+
provider MetadataProvider
42+
metrics *prom.Metrics
43+
postprocess func()
44+
logger log.Logger
45+
rtDelayCount int
46+
oiDelayCount int
47+
diDelayCount int
48+
aggregateOnly bool
4849
}
4950

5051
// SubscriberOption provides some additional behavior to a subscriber.
@@ -71,6 +72,13 @@ func WithPostprocess(f func()) SubscriberOption {
7172
return func(s *Subscriber) { s.postprocess = f }
7273
}
7374

75+
// WithAggregateOnly sets whether aggregate metrics are output instead of
76+
// per-datacenter metrics. By default, per-datacenter metrics are provided. Enabling
77+
// this feature will significantly reduce the payload size of the metrics endpoint.
78+
func WithAggregateOnly(aggregateOnly bool) SubscriberOption {
79+
return func(s *Subscriber) { s.aggregateOnly = aggregateOnly }
80+
}
81+
7482
// NewSubscriber returns a ready-to-use subscriber. Callers must be sure to
7583
// invoke the Run method of the returned subscriber in order to actually update
7684
// any metrics.
@@ -223,7 +231,7 @@ func (s *Subscriber) queryRealtime(ctx context.Context, ts uint64) (currentName
223231
s.rtDelayCount = 0
224232
result = apiResultSuccess
225233
}
226-
realtime.Process(&response, s.serviceID, name, version, s.metrics.Realtime)
234+
realtime.Process(&response, s.serviceID, name, version, s.metrics.Realtime, s.aggregateOnly)
227235
s.postprocess()
228236

229237
case http.StatusUnauthorized, http.StatusForbidden:
@@ -287,7 +295,7 @@ func (s *Subscriber) queryOrigins(ctx context.Context, ts uint64) (currentName s
287295
s.oiDelayCount = 0
288296
result = apiResultSuccess
289297
}
290-
origin.Process(&response, s.serviceID, name, version, s.metrics.Origin)
298+
origin.Process(&response, s.serviceID, name, version, s.metrics.Origin, s.aggregateOnly)
291299
s.postprocess()
292300

293301
case http.StatusUnauthorized, http.StatusForbidden:
@@ -351,7 +359,7 @@ func (s *Subscriber) queryDomains(ctx context.Context, ts uint64) (currentName s
351359
s.diDelayCount = 0
352360
result = apiResultSuccess
353361
}
354-
domain.Process(&response, s.serviceID, name, version, s.metrics.Domain)
362+
domain.Process(&response, s.serviceID, name, version, s.metrics.Domain, s.aggregateOnly)
355363
s.postprocess()
356364

357365
case http.StatusUnauthorized, http.StatusForbidden:

pkg/rt/subscriber_test.go

+144
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,54 @@ func TestRTSubscriberFixture(t *testing.T) {
6262
}
6363
}
6464

65+
// TestRTSubscriberAggOnlyFixture runs a subscriber configured with
// rt.WithAggregateOnly(true) against rtResponseFixture via RunRealtime and
// compares the registry output with expectedRTMetricsAggOutputMap.
func TestRTSubscriberAggOnlyFixture(t *testing.T) {
66+
var (
67+
namespace = "testspace"
68+
subsystem = "testsystem"
69+
registry = prometheus.NewRegistry()
70+
nameFilter = filter.Filter{}
71+
metrics = prom.NewMetrics(namespace, subsystem, nameFilter, registry)
72+
)
73+
74+
// Set up a subscriber with aggregate-only enabled and a postprocess hook that closes the processed channel.
75+
var (
76+
client = newMockRealtimeClient(rtResponseFixture, `{}`)
77+
serviceID = "my-service-id"
78+
serviceName = "my-service-name"
79+
serviceVersion = 123
80+
cache = &mockCache{}
81+
processed = make(chan struct{})
82+
postprocess = func() { close(processed) }
83+
options = []rt.SubscriberOption{rt.WithMetadataProvider(cache), rt.WithPostprocess(postprocess), rt.WithAggregateOnly(true)}
84+
subscriber = rt.NewSubscriber(client, "irrelevant token", serviceID, metrics, options...)
85+
)
86+
87+
// Prep the mock cache with the service's ID, name, and version.
88+
cache.update([]api.Service{{ID: serviceID, Name: serviceName, Version: serviceVersion}})
89+
90+
// Run the subscriber's RunRealtime method in a goroutine; its result is sent on errc.
91+
ctx, cancel := context.WithCancel(context.Background())
92+
errc := make(chan error, 1)
93+
go func() { errc <- subscriber.RunRealtime(ctx) }()
94+
95+
// Block until the postprocess hook has closed the processed channel.
96+
<-processed
97+
98+
// Assert the Prometheus metrics.
99+
output := prometheusOutput(t, registry, namespace+"_"+subsystem+"_")
100+
assertMetricOutput(t, expectedRTMetricsAggOutputMap, output)
101+
102+
// Cancel the context and wait for the goroutine's result; nil and context.Canceled are accepted, any other error fails the test.
103+
cancel()
104+
err := <-errc
105+
switch {
106+
case err == nil:
107+
case errors.Is(err, context.Canceled):
108+
case err != nil:
109+
t.Fatal(err)
110+
}
111+
}
112+
65113
func TestOriginSubscriberFixture(t *testing.T) {
66114
var (
67115
namespace = "testspace"
@@ -110,6 +158,54 @@ func TestOriginSubscriberFixture(t *testing.T) {
110158
}
111159
}
112160

161+
// TestOriginSubscriberAggOnlyFixture runs a subscriber configured with
// rt.WithAggregateOnly(true) against originsResponseFixture via RunOrigins and
// compares the registry output with expectedOriginsMetricsAggOutputMap.
func TestOriginSubscriberAggOnlyFixture(t *testing.T) {
162+
var (
163+
namespace = "testspace"
164+
subsystem = "testsytem"
165+
registry = prometheus.NewRegistry()
166+
nameFilter = filter.Filter{}
167+
metrics = prom.NewMetrics(namespace, subsystem, nameFilter, registry)
168+
)
169+
170+
// Set up a subscriber with aggregate-only enabled and a postprocess hook that closes the processed channel.
171+
var (
172+
client = newMockRealtimeClient(originsResponseFixture, `{}`)
173+
serviceID = "my-service-id"
174+
serviceName = "my-service-name"
175+
serviceVersion = 123
176+
cache = &mockCache{}
177+
processed = make(chan struct{})
178+
postprocess = func() { close(processed) }
179+
options = []rt.SubscriberOption{rt.WithMetadataProvider(cache), rt.WithPostprocess(postprocess), rt.WithAggregateOnly(true)}
180+
subscriber = rt.NewSubscriber(client, "irrelevant token", serviceID, metrics, options...)
181+
)
182+
183+
// Prep the mock cache with the service's ID, name, and version.
184+
cache.update([]api.Service{{ID: serviceID, Name: serviceName, Version: serviceVersion}})
185+
186+
// Run the subscriber's RunOrigins method in a goroutine; its result is sent on errc.
187+
ctx, cancel := context.WithCancel(context.Background())
188+
errc := make(chan error, 1)
189+
go func() { errc <- subscriber.RunOrigins(ctx) }()
190+
191+
// Block until the postprocess hook has closed the processed channel.
192+
<-processed
193+
194+
// Assert the Prometheus metrics whose names start with the namespace followed by "_origin_".
195+
output := prometheusOutput(t, registry, namespace+"_origin_")
196+
assertMetricOutput(t, expectedOriginsMetricsAggOutputMap, output)
197+
198+
// Cancel the context and wait for the goroutine's result; nil and context.Canceled are accepted, any other error fails the test.
199+
cancel()
200+
err := <-errc
201+
switch {
202+
case err == nil:
203+
case errors.Is(err, context.Canceled):
204+
case err != nil:
205+
t.Fatal(err)
206+
}
207+
}
208+
113209
func TestDomainSubscriberFixture(t *testing.T) {
114210
var (
115211
namespace = "testspace"
@@ -158,6 +254,54 @@ func TestDomainSubscriberFixture(t *testing.T) {
158254
}
159255
}
160256

257+
// TestDomainSubscriberAggOnlyFixture runs a subscriber configured with
// rt.WithAggregateOnly(true) against domainsResponseFixture via RunDomains and
// compares the registry output with expectedDomainsMetricsAggOutputMap.
func TestDomainSubscriberAggOnlyFixture(t *testing.T) {
258+
var (
259+
namespace = "testspace"
260+
subsystem = "testsytem"
261+
registry = prometheus.NewRegistry()
262+
nameFilter = filter.Filter{}
263+
metrics = prom.NewMetrics(namespace, subsystem, nameFilter, registry)
264+
)
265+
266+
// Set up a subscriber with aggregate-only enabled and a postprocess hook that closes the processed channel.
267+
var (
268+
client = newMockRealtimeClient(domainsResponseFixture, `{}`)
269+
serviceID = "my-service-id"
270+
serviceName = "my-service-name"
271+
serviceVersion = 123
272+
cache = &mockCache{}
273+
processed = make(chan struct{})
274+
postprocess = func() { close(processed) }
275+
options = []rt.SubscriberOption{rt.WithMetadataProvider(cache), rt.WithPostprocess(postprocess), rt.WithAggregateOnly(true)}
276+
subscriber = rt.NewSubscriber(client, "irrelevant token", serviceID, metrics, options...)
277+
)
278+
279+
// Prep the mock cache with the service's ID, name, and version.
280+
cache.update([]api.Service{{ID: serviceID, Name: serviceName, Version: serviceVersion}})
281+
282+
// Run the subscriber's RunDomains method in a goroutine; its result is sent on errc.
283+
ctx, cancel := context.WithCancel(context.Background())
284+
errc := make(chan error, 1)
285+
go func() { errc <- subscriber.RunDomains(ctx) }()
286+
287+
// Block until the postprocess hook has closed the processed channel.
288+
<-processed
289+
290+
// Assert the Prometheus metrics whose names start with the namespace followed by "_domain_".
291+
output := prometheusOutput(t, registry, namespace+"_domain_")
292+
assertMetricOutput(t, expectedDomainsMetricsAggOutputMap, output)
293+
294+
// Cancel the context and wait for the goroutine's result; nil and context.Canceled are accepted, any other error fails the test.
295+
cancel()
296+
err := <-errc
297+
switch {
298+
case err == nil:
299+
case errors.Is(err, context.Canceled):
300+
case err != nil:
301+
t.Fatal(err)
302+
}
303+
}
304+
161305
func TestSubscriberNoData(t *testing.T) {
162306
var (
163307
client = newMockRealtimeClient(`{"Error": "No data available, please retry"}`, `{}`)

0 commit comments

Comments
 (0)