Skip to content

Commit 3b66a69

Browse files
committed
[Prometheus] Add ray_cluster_provisioned_duration_seconds metric
Signed-off-by: win5923 <[email protected]>
1 parent 847585d commit 3b66a69

File tree

4 files changed

+54
-48
lines changed

4 files changed

+54
-48
lines changed
+14-45
Original file line numberDiff line numberDiff line change
@@ -1,64 +1,33 @@
11
package common
22

33
import (
4+
"time"
5+
46
"github.com/prometheus/client_golang/prometheus"
57
"github.com/prometheus/client_golang/prometheus/promauto"
68
"sigs.k8s.io/controller-runtime/pkg/metrics"
79
)
810

911
// Define all the Prometheus metrics for RayClusters
1012
var (
11-
clustersCreatedCount = promauto.NewCounterVec(
12-
prometheus.CounterOpts{
13-
Name: "ray_operator_clusters_created_total",
14-
Help: "Counts number of clusters created",
15-
},
16-
[]string{"namespace"},
17-
)
18-
clustersDeletedCount = promauto.NewCounterVec(
19-
prometheus.CounterOpts{
20-
Name: "ray_operator_clusters_deleted_total",
21-
Help: "Counts number of clusters deleted",
22-
},
23-
[]string{"namespace"},
24-
)
25-
clustersSuccessfulCount = promauto.NewCounterVec(
26-
prometheus.CounterOpts{
27-
Name: "ray_operator_clusters_successful_total",
28-
Help: "Counts number of clusters successful",
29-
},
30-
[]string{"namespace"},
31-
)
32-
clustersFailedCount = promauto.NewCounterVec(
33-
prometheus.CounterOpts{
34-
Name: "ray_operator_clusters_failed_total",
35-
Help: "Counts number of clusters failed",
13+
RayClusterProvisionedDurationSeconds = promauto.NewHistogramVec(
14+
prometheus.HistogramOpts{
15+
Name: "ray_cluster_provisioned_duration_seconds",
16+
Help: "The time from RayClusters created to all ray pods are ready for the first time (RayClusterProvisioned) in seconds",
17+
// These buckets may not suit every user, but the default buckets top out at 10 seconds, which is far too short for cluster provisioning.
18+
// For reference, see: https://github.com/prometheus/client_golang/blob/331dfab0cc853dca0242a0d96a80184087a80c1d/prometheus/histogram.go#L271
19+
Buckets: []float64{30, 60, 120, 180, 240, 300, 600, 900, 1800, 3600},
3620
},
37-
[]string{"namespace"},
21+
[]string{"namespace", "name"},
3822
)
3923
)
4024

4125
func init() {
4226
// Register custom metrics with the global prometheus registry
43-
metrics.Registry.MustRegister(clustersCreatedCount,
44-
clustersDeletedCount,
45-
clustersSuccessfulCount,
46-
clustersFailedCount)
47-
}
48-
49-
func CreatedClustersCounterInc(namespace string) {
50-
clustersCreatedCount.WithLabelValues(namespace).Inc()
51-
}
52-
53-
// TODO: We don't handle the delete events in new reconciler mode, how to emit deletion metrics?
54-
func DeletedClustersCounterInc(namespace string) {
55-
clustersDeletedCount.WithLabelValues(namespace).Inc()
56-
}
57-
58-
func SuccessfulClustersCounterInc(namespace string) {
59-
clustersSuccessfulCount.WithLabelValues(namespace).Inc()
27+
metrics.Registry.MustRegister(RayClusterProvisionedDurationSeconds)
6028
}
6129

62-
func FailedClustersCounterInc(namespace string) {
63-
clustersFailedCount.WithLabelValues(namespace).Inc()
30+
// ObserveRayClusterProvisionedDuration records how long a RayCluster took to go from creation to being provisioned.
31+
func ObserveRayClusterProvisionedDuration(namespace, name string, duration time.Duration) {
32+
RayClusterProvisionedDurationSeconds.WithLabelValues(namespace, name).Observe(duration.Seconds())
6433
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
package common
2+
3+
import (
4+
"strings"
5+
"testing"
6+
"time"
7+
8+
"github.com/prometheus/client_golang/prometheus/testutil"
9+
)
10+
11+
func TestObserveRayClusterProvisionedDuration(t *testing.T) {
12+
ObserveRayClusterProvisionedDuration("default", "raycluster-sample", 2*time.Minute)
13+
14+
metric := `
15+
# HELP ray_cluster_provisioned_duration_seconds The time from RayClusters created to all ray pods are ready for the first time (RayClusterProvisioned) in seconds
16+
# TYPE ray_cluster_provisioned_duration_seconds histogram
17+
ray_cluster_provisioned_duration_seconds_bucket{name="raycluster-sample",namespace="default",le="30"} 0
18+
ray_cluster_provisioned_duration_seconds_bucket{name="raycluster-sample",namespace="default",le="60"} 0
19+
ray_cluster_provisioned_duration_seconds_bucket{name="raycluster-sample",namespace="default",le="120"} 1
20+
ray_cluster_provisioned_duration_seconds_bucket{name="raycluster-sample",namespace="default",le="180"} 1
21+
ray_cluster_provisioned_duration_seconds_bucket{name="raycluster-sample",namespace="default",le="240"} 1
22+
ray_cluster_provisioned_duration_seconds_bucket{name="raycluster-sample",namespace="default",le="300"} 1
23+
ray_cluster_provisioned_duration_seconds_bucket{name="raycluster-sample",namespace="default",le="600"} 1
24+
ray_cluster_provisioned_duration_seconds_bucket{name="raycluster-sample",namespace="default",le="900"} 1
25+
ray_cluster_provisioned_duration_seconds_bucket{name="raycluster-sample",namespace="default",le="1800"} 1
26+
ray_cluster_provisioned_duration_seconds_bucket{name="raycluster-sample",namespace="default",le="3600"} 1
27+
ray_cluster_provisioned_duration_seconds_sum{name="raycluster-sample",namespace="default"} 120
28+
ray_cluster_provisioned_duration_seconds_count{name="raycluster-sample",namespace="default"} 1
29+
`
30+
31+
if err := testutil.CollectAndCompare(RayClusterProvisionedDurationSeconds, strings.NewReader(metric), "ray_cluster_provisioned_duration_seconds"); err != nil {
32+
t.Errorf("unexpected collecting result:\n%s", err)
33+
}
34+
}

ray-operator/controllers/ray/raycluster_controller.go

+5-3
Original file line numberDiff line numberDiff line change
@@ -734,12 +734,9 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
734734
} else if len(headPods.Items) == 0 {
735735
// Create head Pod if it does not exist.
736736
logger.Info("reconcilePods: Found 0 head Pods; creating a head Pod for the RayCluster.")
737-
common.CreatedClustersCounterInc(instance.Namespace)
738737
if err := r.createHeadPod(ctx, *instance); err != nil {
739-
common.FailedClustersCounterInc(instance.Namespace)
740738
return errstd.Join(utils.ErrFailedCreateHeadPod, err)
741739
}
742-
common.SuccessfulClustersCounterInc(instance.Namespace)
743740
} else if len(headPods.Items) > 1 { // This should never happen. This protects against the case that users manually create headpod.
744741
correctHeadPodName := instance.Name + "-head"
745742
headPodNames := make([]string, len(headPods.Items))
@@ -1336,6 +1333,11 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra
13361333
Reason: rayv1.AllPodRunningAndReadyFirstTime,
13371334
Message: "All Ray Pods are ready for the first time",
13381335
})
1336+
1337+
// Record the ray_cluster_provisioned_duration_seconds metric
1338+
// Calculate the time between cluster creation and being fully provisioned
1339+
provisionDuration := time.Since(instance.CreationTimestamp.Time)
1340+
common.ObserveRayClusterProvisionedDuration(instance.Namespace, instance.Name, provisionDuration)
13391341
} else {
13401342
meta.SetStatusCondition(&newInstance.Status.Conditions, metav1.Condition{
13411343
Type: string(rayv1.RayClusterProvisioned),

ray-operator/go.mod

+1
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ require (
6363
github.com/josharian/intern v1.0.0 // indirect
6464
github.com/json-iterator/go v1.1.12 // indirect
6565
github.com/klauspost/compress v1.17.11 // indirect
66+
github.com/kylelemons/godebug v1.1.0 // indirect
6667
github.com/mailru/easyjson v0.9.0 // indirect
6768
github.com/moby/spdystream v0.5.0 // indirect
6869
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect

0 commit comments

Comments
 (0)