Skip to content

Commit eb5c7d0

Browse files
committed
Record Metrics for Reminder
Signed-off-by: Vyom Yadav <[email protected]>
1 parent 6d4a5df commit eb5c7d0

File tree

5 files changed

+235
-23
lines changed

5 files changed

+235
-23
lines changed

internal/reminder/metrics/metrics.go

+79
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,79 @@
1+
// SPDX-FileCopyrightText: Copyright 2024 The Minder Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// Package metrics provides metrics for the reminder service
5+
package metrics
6+
7+
import (
	"context"
	"fmt"

	"go.opentelemetry.io/otel/metric"
)
12+
13+
// delayBuckets lists the explicit bucket boundaries, in seconds, shared by the
// delay histograms. The boundaries range from one minute up to ten hours.
var delayBuckets = []float64{
	1 * 60,       // 1 minute
	5 * 60,       // 5 minutes
	10 * 60,      // 10 minutes
	30 * 60,      // 30 minutes
	1 * 60 * 60,  // 1 hour
	2 * 60 * 60,  // 2 hours
	3 * 60 * 60,  // 3 hours
	5 * 60 * 60,  // 5 hours
	7 * 60 * 60,  // 7 hours
	10 * 60 * 60, // 10 hours
}
26+
27+
// Metrics contains all the metrics for the reminder service
28+
type Metrics struct {
29+
// Time between when a reminder became eligible and when it was sent
30+
SendDelay metric.Float64Histogram
31+
32+
// Time between when a reminder became eligible and when it was sent for the first time
33+
NewSendDelay metric.Float64Histogram
34+
35+
// Current number of reminders in the batch
36+
BatchSize metric.Int64Histogram
37+
}
38+
39+
// NewMetrics creates a new metrics instance
40+
func NewMetrics(meter metric.Meter) (*Metrics, error) {
41+
sendDelay, err := meter.Float64Histogram(
42+
"send_delay",
43+
metric.WithDescription("Time between reminder becoming eligible and actual send (seconds)"),
44+
metric.WithUnit("s"),
45+
metric.WithExplicitBucketBoundaries(delayBuckets...),
46+
)
47+
if err != nil {
48+
return nil, err
49+
}
50+
51+
newSendDelay, err := meter.Float64Histogram(
52+
"new_send_delay",
53+
metric.WithDescription("Time between reminder becoming eligible and actual send (seconds) for first time reminders"),
54+
metric.WithUnit("s"),
55+
metric.WithExplicitBucketBoundaries(delayBuckets...),
56+
)
57+
if err != nil {
58+
return nil, err
59+
}
60+
61+
batchSize, err := meter.Int64Histogram(
62+
"batch_size",
63+
metric.WithDescription("Current number of reminders in the batch"),
64+
)
65+
if err != nil {
66+
return nil, err
67+
}
68+
69+
return &Metrics{
70+
SendDelay: sendDelay,
71+
NewSendDelay: newSendDelay,
72+
BatchSize: batchSize,
73+
}, nil
74+
}
75+
76+
// RecordBatch records the metrics for a batch of reminders
77+
func (m *Metrics) RecordBatch(ctx context.Context, size int64) {
78+
m.BatchSize.Record(ctx, size)
79+
}

internal/reminder/metrics_server.go

+87
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,87 @@
1+
// SPDX-FileCopyrightText: Copyright 2024 The Minder Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package reminder
5+
6+
import (
7+
"context"
8+
"fmt"
9+
"net/http"
10+
"time"
11+
12+
"github.com/prometheus/client_golang/prometheus/promhttp"
13+
"github.com/rs/zerolog"
14+
"go.opentelemetry.io/otel"
15+
"go.opentelemetry.io/otel/exporters/prometheus"
16+
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
17+
"go.opentelemetry.io/otel/sdk/resource"
18+
semconv "go.opentelemetry.io/otel/semconv/v1.17.0"
19+
)
20+
21+
// metricsPath is the URL path on which the metrics handler is mounted.
const metricsPath = "/metrics"

// readHeaderTimeout is the ReadHeaderTimeout applied to the metrics HTTP server.
const readHeaderTimeout = 2 * time.Second
25+
26+
func (r *reminder) startMetricServer(ctx context.Context, mpReady chan<- struct{}) error {
27+
logger := zerolog.Ctx(ctx)
28+
29+
prometheusExporter, err := prometheus.New(
30+
prometheus.WithNamespace("reminder"),
31+
)
32+
if err != nil {
33+
return fmt.Errorf("failed to create Prometheus exporter: %w", err)
34+
}
35+
36+
res := resource.NewWithAttributes(
37+
semconv.SchemaURL,
38+
semconv.ServiceName("reminder"),
39+
// TODO: Make this auto-generated
40+
semconv.ServiceVersion("v0.1.0"),
41+
)
42+
43+
mp := sdkmetric.NewMeterProvider(
44+
sdkmetric.WithReader(prometheusExporter),
45+
sdkmetric.WithResource(res),
46+
)
47+
48+
otel.SetMeterProvider(mp)
49+
50+
// Indicates that a global MeterProvider is available
51+
close(mpReady)
52+
53+
mux := http.NewServeMux()
54+
mux.Handle(metricsPath, promhttp.Handler())
55+
56+
server := &http.Server{
57+
Addr: r.cfg.MetricServer.GetAddress(),
58+
Handler: mux,
59+
ReadHeaderTimeout: readHeaderTimeout,
60+
}
61+
62+
logger.Info().Msgf("starting metrics server on %s", server.Addr)
63+
64+
errCh := make(chan error)
65+
go func() {
66+
errCh <- server.ListenAndServe()
67+
}()
68+
69+
select {
70+
case err := <-errCh:
71+
return err
72+
case <-ctx.Done():
73+
case <-r.stop:
74+
}
75+
76+
// shutdown the metrics server when either the context is done or when reminder is stopped
77+
shutdownCtx, shutdownRelease := context.WithTimeout(context.Background(), 5*time.Second)
78+
defer shutdownRelease()
79+
80+
logger.Info().Msg("shutting down metrics server")
81+
82+
if err := mp.Shutdown(shutdownCtx); err != nil {
83+
logger.Err(err).Msg("error shutting down metrics provider")
84+
}
85+
86+
return server.Shutdown(shutdownCtx)
87+
}

internal/reminder/reminder.go

+62-18
Original file line number | Diff line number | Diff line change
@@ -14,9 +14,11 @@ import (
1414
"github.com/ThreeDotsLabs/watermill/message"
1515
"github.com/google/uuid"
1616
"github.com/rs/zerolog"
17+
"go.opentelemetry.io/otel"
1718

1819
"github.com/mindersec/minder/internal/db"
1920
remindermessages "github.com/mindersec/minder/internal/reminder/messages"
21+
"github.com/mindersec/minder/internal/reminder/metrics"
2022
reminderconfig "github.com/mindersec/minder/pkg/config/reminder"
2123
"github.com/mindersec/minder/pkg/eventer/constants"
2224
)
@@ -42,6 +44,8 @@ type reminder struct {
4244
ticker *time.Ticker
4345

4446
eventPublisher message.Publisher
47+
48+
metrics *metrics.Metrics
4549
}
4650

4751
// NewReminder creates a new reminder instance
@@ -74,21 +78,52 @@ func (r *reminder) Start(ctx context.Context) error {
7478
return errors.New("reminder stopped, cannot start again")
7579
default:
7680
}
81+
defer r.Stop()
7782

7883
interval := r.cfg.RecurrenceConfig.Interval
7984
if interval <= 0 {
8085
return fmt.Errorf("invalid interval: %s", r.cfg.RecurrenceConfig.Interval)
8186
}
8287

88+
metricsServerDone := make(chan struct{})
89+
90+
if r.cfg.MetricsConfig.Enabled {
91+
metricsProviderReady := make(chan struct{})
92+
93+
go func() {
94+
if err := r.startMetricServer(ctx, metricsProviderReady); err != nil {
95+
logger.Fatal().Err(err).Msg("failed to start metrics server")
96+
}
97+
close(metricsServerDone)
98+
}()
99+
100+
select {
101+
case <-metricsProviderReady:
102+
var err error
103+
r.metrics, err = metrics.NewMetrics(otel.Meter("reminder"))
104+
if err != nil {
105+
return err
106+
}
107+
case <-ctx.Done():
108+
logger.Info().Msg("reminder stopped")
109+
return nil
110+
}
111+
}
112+
83113
r.ticker = time.NewTicker(interval)
84-
defer r.Stop()
85114

86115
for {
87116
select {
88117
case <-ctx.Done():
118+
if r.cfg.MetricsConfig.Enabled {
119+
<-metricsServerDone
120+
}
89121
logger.Info().Msg("reminder stopped")
90122
return nil
91123
case <-r.stop:
124+
if r.cfg.MetricsConfig.Enabled {
125+
<-metricsServerDone
126+
}
92127
logger.Info().Msg("reminder stopped")
93128
return nil
94129
case <-r.ticker.C:
@@ -126,7 +161,7 @@ func (r *reminder) sendReminders(ctx context.Context) error {
126161
logger := zerolog.Ctx(ctx)
127162

128163
// Fetch a batch of repositories
129-
repos, err := r.getRepositoryBatch(ctx)
164+
repos, repoToLastUpdated, err := r.getRepositoryBatch(ctx)
130165
if err != nil {
131166
return fmt.Errorf("error fetching repository batch: %w", err)
132167
}
@@ -143,6 +178,10 @@ func (r *reminder) sendReminders(ctx context.Context) error {
143178
return fmt.Errorf("error creating reminder messages: %w", err)
144179
}
145180

181+
if r.metrics != nil {
182+
r.metrics.RecordBatch(ctx, int64(len(repos)))
183+
}
184+
146185
err = r.eventPublisher.Publish(constants.TopicQueueRepoReminder, messages...)
147186
if err != nil {
148187
return fmt.Errorf("error publishing messages: %w", err)
@@ -151,13 +190,16 @@ func (r *reminder) sendReminders(ctx context.Context) error {
151190
repoIds := make([]uuid.UUID, len(repos))
152191
for _, repo := range repos {
153192
repoIds = append(repoIds, repo.ID)
154-
}
193+
if r.metrics != nil {
194+
sendDelay := time.Since(repoToLastUpdated[repo.ID]) - r.cfg.RecurrenceConfig.MinElapsed
155195

156-
// TODO: Collect Metrics
157-
// Potential metrics:
158-
// - Gauge: Number of reminders in the current batch
159-
// - UpDownCounter: Average reminders sent per batch
160-
// - Histogram: reminder_last_sent time distribution
196+
recorder := r.metrics.SendDelay
197+
if !repo.ReminderLastSent.Valid {
198+
recorder = r.metrics.NewSendDelay
199+
}
200+
recorder.Record(ctx, sendDelay.Seconds())
201+
}
202+
}
161203

162204
err = r.store.UpdateReminderLastSentForRepositories(ctx, repoIds)
163205
if err != nil {
@@ -167,7 +209,7 @@ func (r *reminder) sendReminders(ctx context.Context) error {
167209
return nil
168210
}
169211

170-
func (r *reminder) getRepositoryBatch(ctx context.Context) ([]db.Repository, error) {
212+
func (r *reminder) getRepositoryBatch(ctx context.Context) ([]db.Repository, map[uuid.UUID]time.Time, error) {
171213
logger := zerolog.Ctx(ctx)
172214

173215
logger.Debug().Msgf("fetching repositories after cursor: %s", r.repositoryCursor)
@@ -176,21 +218,23 @@ func (r *reminder) getRepositoryBatch(ctx context.Context) ([]db.Repository, err
176218
Limit: int64(r.cfg.RecurrenceConfig.BatchSize),
177219
})
178220
if err != nil {
179-
return nil, err
221+
return nil, nil, err
180222
}
181223

182-
eligibleRepos, err := r.getEligibleRepositories(ctx, repos)
224+
eligibleRepos, eligibleReposLastUpdated, err := r.getEligibleRepositories(ctx, repos)
183225
if err != nil {
184-
return nil, err
226+
return nil, nil, err
185227
}
186228
logger.Debug().Msgf("%d/%d repositories are eligible for reminders", len(eligibleRepos), len(repos))
187229

188230
r.updateRepositoryCursor(ctx, repos)
189231

190-
return eligibleRepos, nil
232+
return eligibleRepos, eligibleReposLastUpdated, nil
191233
}
192234

193-
func (r *reminder) getEligibleRepositories(ctx context.Context, repos []db.Repository) ([]db.Repository, error) {
235+
func (r *reminder) getEligibleRepositories(ctx context.Context, repos []db.Repository) (
236+
[]db.Repository, map[uuid.UUID]time.Time, error,
237+
) {
194238
eligibleRepos := make([]db.Repository, 0, len(repos))
195239

196240
// We have a slice of repositories, but the sqlc-generated code wants a slice of UUIDs,
@@ -202,11 +246,11 @@ func (r *reminder) getEligibleRepositories(ctx context.Context, repos []db.Repos
202246
}
203247
oldestRuleEvals, err := r.store.ListOldestRuleEvaluationsByRepositoryId(ctx, repoIds)
204248
if err != nil {
205-
return nil, err
249+
return nil, nil, err
206250
}
207251
idToLastUpdate := make(map[uuid.UUID]time.Time, len(oldestRuleEvals))
208-
for _, times := range oldestRuleEvals {
209-
idToLastUpdate[times.RepositoryID] = times.OldestLastUpdated
252+
for _, ruleEval := range oldestRuleEvals {
253+
idToLastUpdate[ruleEval.RepositoryID] = ruleEval.OldestLastUpdated
210254
}
211255

212256
cutoff := time.Now().Add(-1 * r.cfg.RecurrenceConfig.MinElapsed)
@@ -216,7 +260,7 @@ func (r *reminder) getEligibleRepositories(ctx context.Context, repos []db.Repos
216260
}
217261
}
218262

219-
return eligibleRepos, nil
263+
return eligibleRepos, idToLastUpdate, nil
220264
}
221265

222266
func (r *reminder) updateRepositoryCursor(ctx context.Context, repos []db.Repository) {

internal/reminder/reminder_test.go

+1-1
Original file line number | Diff line number | Diff line change
@@ -159,7 +159,7 @@ func Test_getRepositoryBatch(t *testing.T) {
159159

160160
r := createTestReminder(t, store, cfg)
161161

162-
got, err := r.getRepositoryBatch(context.Background())
162+
got, _, err := r.getRepositoryBatch(context.Background())
163163
if test.err != "" {
164164
require.ErrorContains(t, err, test.err)
165165
return

pkg/config/reminder/config.go

+6-4
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,12 @@ import (
1818

1919
// Config contains the configuration for the reminder service
2020
type Config struct {
21-
Database config.DatabaseConfig `mapstructure:"database"`
22-
RecurrenceConfig RecurrenceConfig `mapstructure:"recurrence"`
23-
EventConfig serverconfig.EventConfig `mapstructure:"events"`
24-
LoggingConfig LoggingConfig `mapstructure:"logging"`
21+
Database config.DatabaseConfig `mapstructure:"database"`
22+
RecurrenceConfig RecurrenceConfig `mapstructure:"recurrence"`
23+
EventConfig serverconfig.EventConfig `mapstructure:"events"`
24+
LoggingConfig LoggingConfig `mapstructure:"logging"`
25+
MetricsConfig serverconfig.MetricsConfig `mapstructure:"metrics"`
26+
MetricServer serverconfig.MetricServerConfig `mapstructure:"metric_server" default:"{\"port\":\"9091\"}"`
2527
}
2628

2729
// Validate validates the configuration

0 commit comments

Comments
 (0)