Skip to content

aggmetric: acquire lock in label config evaluation in SQLMetric #147486

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 30, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 27 additions & 22 deletions pkg/util/metric/aggmetric/agg_metric.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ package aggmetric
import (
"hash/fnv"
"strings"
"sync/atomic"

"github.com/cockroachdb/cockroach/pkg/util/cache"
"github.com/cockroachdb/cockroach/pkg/util/metric"
Expand Down Expand Up @@ -175,16 +174,16 @@ func (cs *childSet) clear() {
}

type SQLMetric struct {
labelConfig atomic.Uint64
mu struct {
mu struct {
labelConfig metric.LabelConfig
syncutil.Mutex
children ChildrenStorage
}
}

func NewSQLMetric(labelConfig metric.LabelConfig) *SQLMetric {
sm := &SQLMetric{}
sm.labelConfig.Store(uint64(labelConfig))
sm.mu.labelConfig = labelConfig
sm.mu.children = &UnorderedCacheWrapper{
cache: getCacheStorage(),
}
Expand All @@ -205,18 +204,18 @@ func (sm *SQLMetric) Each(
lvs := cm.labelValues()
dbLabel := dbLabel
appLabel := appLabel
switch sm.labelConfig.Load() {
case uint64(metric.LabelConfigDB):
switch sm.mu.labelConfig {
case metric.LabelConfigDB:
childLabels = append(childLabels, &io_prometheus_client.LabelPair{
Name: &dbLabel,
Value: &lvs[0],
})
case uint64(metric.LabelConfigApp):
case metric.LabelConfigApp:
childLabels = append(childLabels, &io_prometheus_client.LabelPair{
Name: &appLabel,
Value: &lvs[0],
})
case uint64(metric.LabelConfigAppAndDB):
case metric.LabelConfigAppAndDB:
childLabels = append(childLabels, &io_prometheus_client.LabelPair{
Name: &dbLabel,
Value: &lvs[0],
Expand All @@ -242,12 +241,12 @@ func (sm *SQLMetric) add(metric ChildMetric) {

type createChildMetricFunc func(labelValues labelValuesSlice) ChildMetric

// getOrAddChild returns the child metric for the given label values. If the child
// getOrAddChildLocked returns the child metric for the given label values. If the child
// doesn't exist, it creates a new one and adds it to the collection.
func (sm *SQLMetric) getOrAddChild(f createChildMetricFunc, labelValues ...string) ChildMetric {
sm.mu.Lock()
defer sm.mu.Unlock()

// REQUIRES: sm.mu is locked.
func (sm *SQLMetric) getOrAddChildLocked(
f createChildMetricFunc, labelValues ...string,
) ChildMetric {
// If the child already exists, return it.
if child, ok := sm.get(labelValues...); ok {
return child
Expand All @@ -266,18 +265,24 @@ func (sm *SQLMetric) getOrAddChild(f createChildMetricFunc, labelValues ...strin
func (sm *SQLMetric) getChildByLabelConfig(
f createChildMetricFunc, db string, app string,
) (ChildMetric, bool) {
// We should acquire the lock before evaluating the label configuration
// and accessing the children storage in a thread-safe manner. We have moved
// the lock acquisition from getOrAddChildLocked to here to fix bug #147475.
sm.mu.Lock()
defer sm.mu.Unlock()

var childMetric ChildMetric
switch sm.labelConfig.Load() {
case uint64(metric.LabelConfigDisabled):
switch sm.mu.labelConfig {
case metric.LabelConfigDisabled:
return nil, false
case uint64(metric.LabelConfigDB):
childMetric = sm.getOrAddChild(f, db)
case metric.LabelConfigDB:
childMetric = sm.getOrAddChildLocked(f, db)
return childMetric, true
case uint64(metric.LabelConfigApp):
childMetric = sm.getOrAddChild(f, app)
case metric.LabelConfigApp:
childMetric = sm.getOrAddChildLocked(f, app)
return childMetric, true
case uint64(metric.LabelConfigAppAndDB):
childMetric = sm.getOrAddChild(f, db, app)
case metric.LabelConfigAppAndDB:
childMetric = sm.getOrAddChildLocked(f, db, app)
return childMetric, true
default:
return nil, false
Expand All @@ -290,7 +295,7 @@ func (sm *SQLMetric) ReinitialiseChildMetrics(labelConfig metric.LabelConfig) {
sm.mu.Lock()
defer sm.mu.Unlock()
sm.mu.children.Clear()
sm.labelConfig.Store(uint64(labelConfig))
sm.mu.labelConfig = labelConfig
}

type MetricItem interface {
Expand Down
30 changes: 29 additions & 1 deletion pkg/util/metric/aggmetric/agg_metric_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@ import (
"bufio"
"bytes"
"sort"
"strconv"
"strings"
"sync"
"testing"
"time"

Expand Down Expand Up @@ -299,7 +301,7 @@ func TestAggMetricClear(t *testing.T) {
Name: "bar_counter",
})
r.AddMetric(d)
d.labelConfig.Store(uint64(metric.LabelConfigAppAndDB))
d.mu.labelConfig = metric.LabelConfigAppAndDB
tenant2 := roachpb.MustMakeTenantID(2)
c1 := c.AddChild(tenant2.String())

Expand Down Expand Up @@ -420,3 +422,29 @@ func TestSQLMetricsReinitialise(t *testing.T) {
})

}

// TestConcurrentUpdatesAndReinitialiseMetric tests that concurrent updates to a metric
// do not cause a panic when the metric is reinitialised and scraped. The test case
// validates the fix for the bug #147475.
func TestConcurrentUpdatesAndReinitialiseMetric(t *testing.T) {
defer leaktest.AfterTest(t)()
r := metric.NewRegistry()

c := NewSQLCounter(metric.Metadata{Name: "test.counter"})
c.ReinitialiseChildMetrics(metric.LabelConfigApp)
r.AddMetric(c)
var wg sync.WaitGroup
for i := 0; i < 100; i++ {
wg.Add(1)
go func() {
c.Inc(1, "test_db"+"_"+strconv.Itoa(i), "test_app"+"_"+strconv.Itoa(i))
wg.Done()
}()
}
c.ReinitialiseChildMetrics(metric.LabelConfigAppAndDB)
wg.Wait()
pe := metric.MakePrometheusExporter()
require.NotPanics(t, func() {
pe.ScrapeRegistry(r, metric.WithIncludeChildMetrics(true), metric.WithIncludeAggregateMetrics(true))
})
}
2 changes: 1 addition & 1 deletion pkg/util/metric/aggmetric/counter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ func TestAggCounter(t *testing.T) {
c := NewSQLCounter(metric.Metadata{
Name: "foo_counter",
})
c.labelConfig.Store(uint64(metric.LabelConfigAppAndDB))
c.mu.labelConfig = metric.LabelConfigAppAndDB
r.AddMetric(c)
cacheStorage := cache.NewUnorderedCache(cache.Config{
Policy: cache.CacheLRU,
Expand Down
4 changes: 2 additions & 2 deletions pkg/util/metric/aggmetric/gauge_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ func TestSQLGaugeEviction(t *testing.T) {
g.mu.children = &UnorderedCacheWrapper{
cache: cacheStorage,
}
g.labelConfig.Store(uint64(metric.LabelConfigAppAndDB))
g.mu.labelConfig = metric.LabelConfigAppAndDB

for i := 0; i < cacheSize; i++ {
g.Update(1, "1", strconv.Itoa(i))
Expand Down Expand Up @@ -69,7 +69,7 @@ func TestSQLGaugeMethods(t *testing.T) {
cache: cacheStorage,
}
r.AddMetric(g)
g.labelConfig.Store(uint64(metric.LabelConfigAppAndDB))
g.mu.labelConfig = metric.LabelConfigAppAndDB

g.Update(10, "1", "1")
g.Update(10, "2", "2")
Expand Down
2 changes: 1 addition & 1 deletion pkg/util/metric/aggmetric/histogram_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ func TestSQLHistogram(t *testing.T) {
h.mu.children = &UnorderedCacheWrapper{
cache: cacheStorage,
}
h.labelConfig.Store(uint64(metric.LabelConfigAppAndDB))
h.mu.labelConfig = metric.LabelConfigAppAndDB

for i := 0; i < cacheSize; i++ {
h.RecordValue(1, "1", strconv.Itoa(i))
Expand Down