|
1 | 1 | package common
|
2 | 2 |
|
3 | 3 | import (
|
| 4 | + "time" |
| 5 | + |
4 | 6 | "github.com/prometheus/client_golang/prometheus"
|
5 | 7 | "github.com/prometheus/client_golang/prometheus/promauto"
|
6 | 8 | "sigs.k8s.io/controller-runtime/pkg/metrics"
|
7 | 9 | )
|
8 | 10 |
|
9 | 11 | // Define all the prometheus counters for all clusters
|
10 | 12 | var (
|
11 |
| - clustersCreatedCount = promauto.NewCounterVec( |
12 |
| - prometheus.CounterOpts{ |
13 |
| - Name: "ray_operator_clusters_created_total", |
14 |
| - Help: "Counts number of clusters created", |
15 |
| - }, |
16 |
| - []string{"namespace"}, |
17 |
| - ) |
18 |
| - clustersDeletedCount = promauto.NewCounterVec( |
19 |
| - prometheus.CounterOpts{ |
20 |
| - Name: "ray_operator_clusters_deleted_total", |
21 |
| - Help: "Counts number of clusters deleted", |
22 |
| - }, |
23 |
| - []string{"namespace"}, |
24 |
| - ) |
25 |
| - clustersSuccessfulCount = promauto.NewCounterVec( |
26 |
| - prometheus.CounterOpts{ |
27 |
| - Name: "ray_operator_clusters_successful_total", |
28 |
| - Help: "Counts number of clusters successful", |
29 |
| - }, |
30 |
| - []string{"namespace"}, |
31 |
| - ) |
32 |
| - clustersFailedCount = promauto.NewCounterVec( |
33 |
| - prometheus.CounterOpts{ |
34 |
| - Name: "ray_operator_clusters_failed_total", |
35 |
| - Help: "Counts number of clusters failed", |
| 13 | + RayClusterProvisionedDurationSeconds = promauto.NewHistogramVec( |
| 14 | + prometheus.HistogramOpts{ |
| 15 | + Name: "ray_cluster_provisioned_duration_seconds", |
| 16 | + Help: "The time from RayClusters created to all ray pods are ready for the first time (RayClusterProvisioned) in seconds", |
| 17 | + // These buckets may not suit every user, but the default buckets cannot be used either: they top out at 10 seconds, far below typical provisioning times.
| 18 | + // For reference, see: https://github.com/prometheus/client_golang/blob/331dfab0cc853dca0242a0d96a80184087a80c1d/prometheus/histogram.go#L271 |
| 19 | + Buckets: []float64{30, 60, 120, 180, 240, 300, 600, 900, 1800, 3600}, |
36 | 20 | },
|
37 |
| - []string{"namespace"}, |
| 21 | + []string{"namespace", "name"}, |
38 | 22 | )
|
39 | 23 | )
|
40 | 24 |
|
41 | 25 | func init() {
|
42 | 26 | // Register custom metrics with the global prometheus registry
|
43 |
| - metrics.Registry.MustRegister(clustersCreatedCount, |
44 |
| - clustersDeletedCount, |
45 |
| - clustersSuccessfulCount, |
46 |
| - clustersFailedCount) |
47 |
| -} |
48 |
| - |
49 |
| -func CreatedClustersCounterInc(namespace string) { |
50 |
| - clustersCreatedCount.WithLabelValues(namespace).Inc() |
51 |
| -} |
52 |
| - |
53 |
| -// TODO: We don't handle the delete events in new reconciler mode, how to emit deletion metrics? |
54 |
| -func DeletedClustersCounterInc(namespace string) { |
55 |
| - clustersDeletedCount.WithLabelValues(namespace).Inc() |
56 |
| -} |
57 |
| - |
58 |
| -func SuccessfulClustersCounterInc(namespace string) { |
59 |
| - clustersSuccessfulCount.WithLabelValues(namespace).Inc() |
| 27 | + metrics.Registry.MustRegister(RayClusterProvisionedDurationSeconds) |
60 | 28 | }
|
61 | 29 |
|
62 |
| -func FailedClustersCounterInc(namespace string) { |
63 |
| - clustersFailedCount.WithLabelValues(namespace).Inc() |
| 30 | +// ObserveRayClusterProvisionedDuration records how long a RayCluster took to go from creation to being provisioned.
| 31 | +func ObserveRayClusterProvisionedDuration(namespace, name string, duration time.Duration) { |
| 32 | + RayClusterProvisionedDurationSeconds.WithLabelValues(namespace, name).Observe(duration.Seconds()) |
64 | 33 | }
|
0 commit comments