Skip to content

Commit 6f7f8b0

Browse files
committed
Record Metrics for Reminder
Signed-off-by: Vyom Yadav <[email protected]>
1 parent f9c90d2 commit 6f7f8b0

File tree

5 files changed

+230
-26
lines changed

5 files changed

+230
-26
lines changed

internal/reminder/metrics/metrics.go

+72
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
// SPDX-FileCopyrightText: Copyright 2025 The Minder Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
// Package metrics provides metrics for the reminder service
5+
package metrics
6+
7+
import (
8+
"go.opentelemetry.io/otel/metric"
9+
)
10+
11+
// delayBuckets lists the upper bounds, in seconds, of the buckets used by the
// delay histograms. The boundaries run from one minute up to ten hours.
var delayBuckets = []float64{
	// Sub-hour boundaries, expressed as minutes * 60.
	1 * 60,
	5 * 60,
	10 * 60,
	30 * 60,
	// Hour-scale boundaries, expressed as hours * 3600.
	1 * 3600,
	2 * 3600,
	3 * 3600,
	5 * 3600,
	7 * 3600,
	10 * 3600,
}
24+
25+
// Metrics contains all the metrics for the reminder service
26+
type Metrics struct {
27+
// Time between when a reminder became eligible and when it was sent
28+
SendDelay metric.Float64Histogram
29+
30+
// Time between when a reminder became eligible and when it was sent for the first time
31+
NewSendDelay metric.Float64Histogram
32+
33+
// Current number of reminders in the batch
34+
BatchSize metric.Int64Histogram
35+
}
36+
37+
// NewMetrics creates a new metrics instance
38+
func NewMetrics(meter metric.Meter) (*Metrics, error) {
39+
sendDelay, err := meter.Float64Histogram(
40+
"send_delay",
41+
metric.WithDescription("Time between reminder becoming eligible and actual send (seconds)"),
42+
metric.WithUnit("s"),
43+
metric.WithExplicitBucketBoundaries(delayBuckets...),
44+
)
45+
if err != nil {
46+
return nil, err
47+
}
48+
49+
newSendDelay, err := meter.Float64Histogram(
50+
"new_send_delay",
51+
metric.WithDescription("Time between reminder becoming eligible and actual send (seconds) for first time reminders"),
52+
metric.WithUnit("s"),
53+
metric.WithExplicitBucketBoundaries(delayBuckets...),
54+
)
55+
if err != nil {
56+
return nil, err
57+
}
58+
59+
batchSize, err := meter.Int64Histogram(
60+
"batch_size",
61+
metric.WithDescription("Current number of reminders in the batch"),
62+
)
63+
if err != nil {
64+
return nil, err
65+
}
66+
67+
return &Metrics{
68+
SendDelay: sendDelay,
69+
NewSendDelay: newSendDelay,
70+
BatchSize: batchSize,
71+
}, nil
72+
}

internal/reminder/metrics_server.go

+94
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
// SPDX-FileCopyrightText: Copyright 2025 The Minder Authors
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package reminder
5+
6+
import (
7+
"context"
8+
"errors"
9+
"fmt"
10+
"net/http"
11+
"time"
12+
13+
"github.com/prometheus/client_golang/prometheus/promhttp"
14+
"github.com/rs/zerolog"
15+
"go.opentelemetry.io/otel"
16+
"go.opentelemetry.io/otel/exporters/prometheus"
17+
sdkmetric "go.opentelemetry.io/otel/sdk/metric"
18+
"go.opentelemetry.io/otel/sdk/resource"
19+
semconv "go.opentelemetry.io/otel/semconv/v1.17.0"
20+
)
21+
22+
// metricsPath is the HTTP route on which the metrics handler is mounted.
const metricsPath = "/metrics"

// readHeaderTimeout is the ReadHeaderTimeout applied to the metrics HTTP server.
const readHeaderTimeout = 2 * time.Second
26+
27+
func (r *reminder) startMetricServer(ctx context.Context) error {
28+
logger := zerolog.Ctx(ctx)
29+
30+
prometheusExporter, err := prometheus.New(
31+
prometheus.WithNamespace("reminder"),
32+
)
33+
if err != nil {
34+
return fmt.Errorf("failed to create Prometheus exporter: %w", err)
35+
}
36+
37+
res := resource.NewWithAttributes(
38+
semconv.SchemaURL,
39+
semconv.ServiceName("reminder"),
40+
// TODO: Make this auto-generated
41+
semconv.ServiceVersion("v0.1.0"),
42+
)
43+
44+
mp := sdkmetric.NewMeterProvider(
45+
sdkmetric.WithReader(prometheusExporter),
46+
sdkmetric.WithResource(res),
47+
)
48+
49+
otel.SetMeterProvider(mp)
50+
51+
mux := http.NewServeMux()
52+
mux.Handle(metricsPath, promhttp.Handler())
53+
54+
server := &http.Server{
55+
Addr: r.cfg.MetricServer.GetAddress(),
56+
Handler: mux,
57+
ReadHeaderTimeout: readHeaderTimeout,
58+
}
59+
logger.Info().Msgf("starting metrics server on %s", server.Addr)
60+
61+
// Start the metrics server
62+
go func() {
63+
err = server.ListenAndServe()
64+
if err != nil && !errors.Is(err, http.ErrServerClosed) {
65+
logger.Err(err).Msg("error starting metrics server")
66+
}
67+
}()
68+
69+
// Watch for context cancellation or stop signal to shutdown the metrics server
70+
go func() {
71+
select {
72+
case <-ctx.Done():
73+
case <-r.stop:
74+
}
75+
76+
// shutdown the metrics server when either the context is done or when reminder is stopped
77+
shutdownCtx, shutdownRelease := context.WithTimeout(context.Background(), 5*time.Second)
78+
defer shutdownRelease()
79+
80+
logger.Info().Msg("shutting down metrics server")
81+
82+
if err = server.Shutdown(shutdownCtx); err != nil {
83+
logger.Err(err).Msg("error shutting down metrics server")
84+
}
85+
86+
if err = mp.Shutdown(shutdownCtx); err != nil {
87+
logger.Err(err).Msg("error shutting down metrics provider")
88+
}
89+
90+
close(r.metricsServerDone)
91+
}()
92+
93+
return nil
94+
}

internal/reminder/reminder.go

+57-21
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,11 @@ import (
1414
"github.com/ThreeDotsLabs/watermill/message"
1515
"github.com/google/uuid"
1616
"github.com/rs/zerolog"
17+
"go.opentelemetry.io/otel"
1718

1819
"github.com/mindersec/minder/internal/db"
1920
remindermessages "github.com/mindersec/minder/internal/reminder/messages"
21+
"github.com/mindersec/minder/internal/reminder/metrics"
2022
reminderconfig "github.com/mindersec/minder/pkg/config/reminder"
2123
"github.com/mindersec/minder/pkg/eventer/constants"
2224
)
@@ -42,14 +44,18 @@ type reminder struct {
4244
ticker *time.Ticker
4345

4446
eventPublisher message.Publisher
47+
48+
metrics *metrics.Metrics
49+
metricsServerDone chan struct{}
4550
}
4651

4752
// NewReminder creates a new reminder instance
4853
func NewReminder(ctx context.Context, store db.Store, config *reminderconfig.Config) (Interface, error) {
4954
r := &reminder{
50-
store: store,
51-
cfg: config,
52-
stop: make(chan struct{}),
55+
store: store,
56+
cfg: config,
57+
stop: make(chan struct{}),
58+
metricsServerDone: make(chan struct{}),
5359
}
5460

5561
// Set to a random UUID to start
@@ -74,21 +80,40 @@ func (r *reminder) Start(ctx context.Context) error {
7480
return errors.New("reminder stopped, cannot start again")
7581
default:
7682
}
83+
// Reminder only stops in case of error or context cancellation
84+
// An errored out reminder cannot be started again, so it is stopped here
85+
// This also prevents resource leaks if user doesn't explicitly stop the reminder
86+
defer r.Stop()
7787

7888
interval := r.cfg.RecurrenceConfig.Interval
7989
if interval <= 0 {
8090
return fmt.Errorf("invalid interval: %s", r.cfg.RecurrenceConfig.Interval)
8191
}
8292

93+
if r.cfg.MetricsConfig.Enabled {
94+
if err := r.startMetricServer(ctx); err != nil {
95+
logger.Err(err).Msg("failed to start metrics server")
96+
}
97+
98+
var err error
99+
r.metrics, err = metrics.NewMetrics(otel.Meter("reminder"))
100+
if err != nil {
101+
return err
102+
}
103+
} else {
104+
close(r.metricsServerDone)
105+
}
106+
83107
r.ticker = time.NewTicker(interval)
84-
defer r.Stop()
85108

86109
for {
87110
select {
88111
case <-ctx.Done():
112+
<-r.metricsServerDone
89113
logger.Info().Msg("reminder stopped")
90114
return nil
91115
case <-r.stop:
116+
<-r.metricsServerDone
92117
logger.Info().Msg("reminder stopped")
93118
return nil
94119
case <-r.ticker.C:
@@ -120,13 +145,15 @@ func (r *reminder) Stop() {
120145
zerolog.Ctx(context.Background()).Error().Err(err).Msg("error closing event publisher")
121146
}
122147
})
148+
// Wait for the metrics server to stop
149+
<-r.metricsServerDone
123150
}
124151

125152
func (r *reminder) sendReminders(ctx context.Context) error {
126153
logger := zerolog.Ctx(ctx)
127154

128155
// Fetch a batch of repositories
129-
repos, err := r.getRepositoryBatch(ctx)
156+
repos, repoToLastUpdated, err := r.getRepositoryBatch(ctx)
130157
if err != nil {
131158
return fmt.Errorf("error fetching repository batch: %w", err)
132159
}
@@ -143,6 +170,10 @@ func (r *reminder) sendReminders(ctx context.Context) error {
143170
return fmt.Errorf("error creating reminder messages: %w", err)
144171
}
145172

173+
if r.metrics != nil {
174+
r.metrics.BatchSize.Record(ctx, int64(len(repos)))
175+
}
176+
146177
err = r.eventPublisher.Publish(constants.TopicQueueRepoReminder, messages...)
147178
if err != nil {
148179
return fmt.Errorf("error publishing messages: %w", err)
@@ -151,13 +182,16 @@ func (r *reminder) sendReminders(ctx context.Context) error {
151182
repoIds := make([]uuid.UUID, len(repos))
152183
for _, repo := range repos {
153184
repoIds = append(repoIds, repo.ID)
154-
}
185+
if r.metrics != nil {
186+
sendDelay := time.Since(repoToLastUpdated[repo.ID]) - r.cfg.RecurrenceConfig.MinElapsed
155187

156-
// TODO: Collect Metrics
157-
// Potential metrics:
158-
// - Gauge: Number of reminders in the current batch
159-
// - UpDownCounter: Average reminders sent per batch
160-
// - Histogram: reminder_last_sent time distribution
188+
recorder := r.metrics.SendDelay
189+
if !repo.ReminderLastSent.Valid {
190+
recorder = r.metrics.NewSendDelay
191+
}
192+
recorder.Record(ctx, sendDelay.Seconds())
193+
}
194+
}
161195

162196
err = r.store.UpdateReminderLastSentForRepositories(ctx, repoIds)
163197
if err != nil {
@@ -167,7 +201,7 @@ func (r *reminder) sendReminders(ctx context.Context) error {
167201
return nil
168202
}
169203

170-
func (r *reminder) getRepositoryBatch(ctx context.Context) ([]db.Repository, error) {
204+
func (r *reminder) getRepositoryBatch(ctx context.Context) ([]db.Repository, map[uuid.UUID]time.Time, error) {
171205
logger := zerolog.Ctx(ctx)
172206

173207
logger.Debug().Msgf("fetching repositories after cursor: %s", r.repositoryCursor)
@@ -176,21 +210,23 @@ func (r *reminder) getRepositoryBatch(ctx context.Context) ([]db.Repository, err
176210
Limit: int64(r.cfg.RecurrenceConfig.BatchSize),
177211
})
178212
if err != nil {
179-
return nil, err
213+
return nil, nil, err
180214
}
181215

182-
eligibleRepos, err := r.getEligibleRepositories(ctx, repos)
216+
eligibleRepos, eligibleReposLastUpdated, err := r.getEligibleRepositories(ctx, repos)
183217
if err != nil {
184-
return nil, err
218+
return nil, nil, err
185219
}
186220
logger.Debug().Msgf("%d/%d repositories are eligible for reminders", len(eligibleRepos), len(repos))
187221

188222
r.updateRepositoryCursor(ctx, repos)
189223

190-
return eligibleRepos, nil
224+
return eligibleRepos, eligibleReposLastUpdated, nil
191225
}
192226

193-
func (r *reminder) getEligibleRepositories(ctx context.Context, repos []db.Repository) ([]db.Repository, error) {
227+
func (r *reminder) getEligibleRepositories(ctx context.Context, repos []db.Repository) (
228+
[]db.Repository, map[uuid.UUID]time.Time, error,
229+
) {
194230
eligibleRepos := make([]db.Repository, 0, len(repos))
195231

196232
// We have a slice of repositories, but the sqlc-generated code wants a slice of UUIDs,
@@ -202,11 +238,11 @@ func (r *reminder) getEligibleRepositories(ctx context.Context, repos []db.Repos
202238
}
203239
oldestRuleEvals, err := r.store.ListOldestRuleEvaluationsByRepositoryId(ctx, repoIds)
204240
if err != nil {
205-
return nil, err
241+
return nil, nil, err
206242
}
207243
idToLastUpdate := make(map[uuid.UUID]time.Time, len(oldestRuleEvals))
208-
for _, times := range oldestRuleEvals {
209-
idToLastUpdate[times.RepositoryID] = times.OldestLastUpdated
244+
for _, ruleEval := range oldestRuleEvals {
245+
idToLastUpdate[ruleEval.RepositoryID] = ruleEval.OldestLastUpdated
210246
}
211247

212248
cutoff := time.Now().Add(-1 * r.cfg.RecurrenceConfig.MinElapsed)
@@ -216,7 +252,7 @@ func (r *reminder) getEligibleRepositories(ctx context.Context, repos []db.Repos
216252
}
217253
}
218254

219-
return eligibleRepos, nil
255+
return eligibleRepos, idToLastUpdate, nil
220256
}
221257

222258
func (r *reminder) updateRepositoryCursor(ctx context.Context, repos []db.Repository) {

internal/reminder/reminder_test.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,7 @@ func Test_getRepositoryBatch(t *testing.T) {
159159

160160
r := createTestReminder(t, store, cfg)
161161

162-
got, err := r.getRepositoryBatch(context.Background())
162+
got, _, err := r.getRepositoryBatch(context.Background())
163163
if test.err != "" {
164164
require.ErrorContains(t, err, test.err)
165165
return

pkg/config/reminder/config.go

+6-4
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,12 @@ import (
1818

1919
// Config contains the configuration for the reminder service
2020
type Config struct {
21-
Database config.DatabaseConfig `mapstructure:"database"`
22-
RecurrenceConfig RecurrenceConfig `mapstructure:"recurrence"`
23-
EventConfig serverconfig.EventConfig `mapstructure:"events"`
24-
LoggingConfig LoggingConfig `mapstructure:"logging"`
21+
Database config.DatabaseConfig `mapstructure:"database"`
22+
RecurrenceConfig RecurrenceConfig `mapstructure:"recurrence"`
23+
EventConfig serverconfig.EventConfig `mapstructure:"events"`
24+
LoggingConfig LoggingConfig `mapstructure:"logging"`
25+
MetricsConfig serverconfig.MetricsConfig `mapstructure:"metrics"`
26+
MetricServer serverconfig.MetricServerConfig `mapstructure:"metric_server" default:"{\"port\":\"9091\"}"`
2527
}
2628

2729
// Validate validates the configuration

0 commit comments

Comments
 (0)