Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Prometheus] Add ray_cluster_provisioned_duration_seconds metric #3212

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 13 additions & 44 deletions ray-operator/controllers/ray/common/metrics.go
Original file line number Diff line number Diff line change
@@ -1,64 +1,33 @@
package common

import (
"time"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

// Define all the prometheus counters for all clusters
var (
clustersCreatedCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ray_operator_clusters_created_total",
Help: "Counts number of clusters created",
},
[]string{"namespace"},
)
clustersDeletedCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ray_operator_clusters_deleted_total",
Help: "Counts number of clusters deleted",
},
[]string{"namespace"},
)
clustersSuccessfulCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ray_operator_clusters_successful_total",
Help: "Counts number of clusters successful",
},
[]string{"namespace"},
)
clustersFailedCount = promauto.NewCounterVec(
prometheus.CounterOpts{
Name: "ray_operator_clusters_failed_total",
Help: "Counts number of clusters failed",
rayClusterProvisionedHistogram = promauto.NewHistogramVec(
prometheus.HistogramOpts{
Name: "ray_cluster_provisioned_duration_seconds",
Help: "The time from RayClusters created to all ray pods are ready for the first time (RayClusterProvisioned) in seconds",
// These buckets may not suit every workload, but the client_golang default buckets top out at 10 seconds, which is far too small for cluster provisioning times.
// For reference, see: https://github.com/prometheus/client_golang/blob/331dfab0cc853dca0242a0d96a80184087a80c1d/prometheus/histogram.go#L271
Buckets: []float64{30, 60, 120, 180, 240, 300, 600, 900, 1800, 3600},
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure what bucket ranges would be suitable for most users.

},
[]string{"namespace"},
)
)

func init() {
// Register custom metrics with the global prometheus registry
metrics.Registry.MustRegister(clustersCreatedCount,
clustersDeletedCount,
clustersSuccessfulCount,
clustersFailedCount)
}

func CreatedClustersCounterInc(namespace string) {
clustersCreatedCount.WithLabelValues(namespace).Inc()
}

// TODO: We don't handle the delete events in new reconciler mode, how to emit deletion metrics?
func DeletedClustersCounterInc(namespace string) {
clustersDeletedCount.WithLabelValues(namespace).Inc()
}

func SuccessfulClustersCounterInc(namespace string) {
clustersSuccessfulCount.WithLabelValues(namespace).Inc()
metrics.Registry.MustRegister(rayClusterProvisionedHistogram)
}

func FailedClustersCounterInc(namespace string) {
clustersFailedCount.WithLabelValues(namespace).Inc()
// ObserveRayClusterProvisionedDuration records how long a RayCluster took to go
// from creation to provisioned. The duration is converted to seconds and added
// to the histogram series labelled with the given namespace.
func ObserveRayClusterProvisionedDuration(namespace string, duration time.Duration) {
	seconds := duration.Seconds()
	rayClusterProvisionedHistogram.WithLabelValues(namespace).Observe(seconds)
}
34 changes: 34 additions & 0 deletions ray-operator/controllers/ray/common/metrics_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package common

import (
"strings"
"testing"
"time"

"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestObserveRayClusterProvisionedDuration(t *testing.T) {
ObserveRayClusterProvisionedDuration("default", 2*time.Minute)

metric := `
# HELP ray_cluster_provisioned_duration_seconds The time from RayClusters created to all ray pods are ready for the first time (RayClusterProvisioned) in seconds
# TYPE ray_cluster_provisioned_duration_seconds histogram
ray_cluster_provisioned_duration_seconds_bucket{namespace="default",le="30"} 0
ray_cluster_provisioned_duration_seconds_bucket{namespace="default",le="60"} 0
ray_cluster_provisioned_duration_seconds_bucket{namespace="default",le="120"} 1
ray_cluster_provisioned_duration_seconds_bucket{namespace="default",le="180"} 1
ray_cluster_provisioned_duration_seconds_bucket{namespace="default",le="240"} 1
ray_cluster_provisioned_duration_seconds_bucket{namespace="default",le="300"} 1
ray_cluster_provisioned_duration_seconds_bucket{namespace="default",le="600"} 1
ray_cluster_provisioned_duration_seconds_bucket{namespace="default",le="900"} 1
ray_cluster_provisioned_duration_seconds_bucket{namespace="default",le="1800"} 1
ray_cluster_provisioned_duration_seconds_bucket{namespace="default",le="3600"} 1
ray_cluster_provisioned_duration_seconds_sum{namespace="default"} 120
ray_cluster_provisioned_duration_seconds_count{namespace="default"} 1
`

if err := testutil.CollectAndCompare(rayClusterProvisionedHistogram, strings.NewReader(metric), "ray_cluster_provisioned_duration_seconds"); err != nil {
t.Errorf("unexpected collecting result:\n%s", err)
}
}
8 changes: 5 additions & 3 deletions ray-operator/controllers/ray/raycluster_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -734,12 +734,9 @@ func (r *RayClusterReconciler) reconcilePods(ctx context.Context, instance *rayv
} else if len(headPods.Items) == 0 {
// Create head Pod if it does not exist.
logger.Info("reconcilePods: Found 0 head Pods; creating a head Pod for the RayCluster.")
common.CreatedClustersCounterInc(instance.Namespace)
if err := r.createHeadPod(ctx, *instance); err != nil {
common.FailedClustersCounterInc(instance.Namespace)
return errstd.Join(utils.ErrFailedCreateHeadPod, err)
}
common.SuccessfulClustersCounterInc(instance.Namespace)
} else if len(headPods.Items) > 1 { // This should never happen. This protects against the case that users manually create headpod.
correctHeadPodName := instance.Name + "-head"
headPodNames := make([]string, len(headPods.Items))
Expand Down Expand Up @@ -1336,6 +1333,11 @@ func (r *RayClusterReconciler) calculateStatus(ctx context.Context, instance *ra
Reason: rayv1.AllPodRunningAndReadyFirstTime,
Message: "All Ray Pods are ready for the first time",
})

// Record ray_cluster_provisioned_duration_seconds duration metric
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The metric should not be recorded in calculateStatus. It should only be recorded when the status update succeeds, to avoid counting it more than once.

// Calculate the time between cluster creation and being fully provisioned
provisionDuration := time.Since(instance.CreationTimestamp.Time)
common.ObserveRayClusterProvisionedDuration(instance.Namespace, provisionDuration)
} else {
meta.SetStatusCondition(&newInstance.Status.Conditions, metav1.Condition{
Type: string(rayv1.RayClusterProvisioned),
Expand Down
1 change: 1 addition & 0 deletions ray-operator/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ require (
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.11 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/mailru/easyjson v0.9.0 // indirect
github.com/moby/spdystream v0.5.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
Expand Down
Loading