@@ -24,6 +24,7 @@ import (
24
24
25
25
nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
26
26
"github.com/criteo/node-disruption-controller/pkg/resolver"
27
+ "github.com/prometheus/client_golang/prometheus"
27
28
"k8s.io/apimachinery/pkg/api/errors"
28
29
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29
30
"k8s.io/apimachinery/pkg/runtime"
@@ -46,6 +47,37 @@ type NodeDisruptionReconcilerConfig struct {
46
47
RejectOverlappingDisruption bool
47
48
}
48
49
50
+ var (
51
+ NodeDisruptionState = prometheus .NewGaugeVec (
52
+ prometheus.GaugeOpts {
53
+ Name : "node_disruption_state" ,
54
+ Help : "State of node disruption: pending=0, rejected=1, accepted=2" ,
55
+ },
56
+ []string {"node_disruption_name" },
57
+ )
58
+ NodeDisruptionCreated = prometheus .NewGaugeVec (
59
+ prometheus.GaugeOpts {
60
+ Name : "node_disruption_created" ,
61
+ Help : "Date of create of the node disruption" ,
62
+ },
63
+ []string {"node_disruption_name" },
64
+ )
65
+ NodeDisruptionDeadline = prometheus .NewGaugeVec (
66
+ prometheus.GaugeOpts {
67
+ Name : "node_disruption_deadline" ,
68
+ Help : "Date of the deadline of the node disruption (0 if unset)" ,
69
+ },
70
+ []string {"node_disruption_name" },
71
+ )
72
+ NodeDisruptionImpactedNodes = prometheus .NewGaugeVec (
73
+ prometheus.GaugeOpts {
74
+ Name : "node_disruption_impacted_node" ,
75
+ Help : "high cardinality: create a metric for each node impacted by a given node disruption" ,
76
+ },
77
+ []string {"node_disruption_name" , "node_name" },
78
+ )
79
+ )
80
+
49
81
// NodeDisruptionReconciler reconciles NodeDisruptions
50
82
type NodeDisruptionReconciler struct {
51
83
client.Client
@@ -72,11 +104,14 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
72
104
73
105
if err != nil {
74
106
if errors .IsNotFound (err ) {
75
- // If the resource was not found, nothing has to be done
107
+ PruneNodeDisruptionMetric (req .NamespacedName .Name )
108
+ // If the resource was not found, nothing has to be done
76
109
return clusterResult , nil
77
110
}
78
111
return clusterResult , err
79
112
}
113
+ logger .Info ("Updating metrics" )
114
+ UpdateNodeDisruptionMetric (nd )
80
115
81
116
logger .Info ("Start reconcile of NodeDisruption" , "state" , nd .Status .State , "retryDate" , nd .Status .NextRetryDate .Time )
82
117
if time .Now ().Before (nd .Status .NextRetryDate .Time ) {
@@ -105,6 +140,38 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
105
140
return clusterResult , nil
106
141
}
107
142
143
+ // PruneNodeDisruptionMetric remove metrics for a Node Disruption that don't exist anymore
144
+ func PruneNodeDisruptionMetric (nd_name string ) {
145
+ NodeDisruptionState .DeletePartialMatch (prometheus.Labels {"node_disruption_name" : nd_name })
146
+ NodeDisruptionCreated .DeletePartialMatch (prometheus.Labels {"node_disruption_name" : nd_name })
147
+ NodeDisruptionDeadline .DeletePartialMatch (prometheus.Labels {"node_disruption_name" : nd_name })
148
+ NodeDisruptionImpactedNodes .DeletePartialMatch (prometheus.Labels {"node_disruption_name" : nd_name })
149
+ }
150
+
151
+ func UpdateNodeDisruptionMetric (nd * nodedisruptionv1alpha1.NodeDisruption ) {
152
+ nd_state := 0
153
+ if nd .Status .State == nodedisruptionv1alpha1 .Pending {
154
+ nd_state = 0
155
+ } else if nd .Status .State == nodedisruptionv1alpha1 .Rejected {
156
+ nd_state = 1
157
+ } else if nd .Status .State == nodedisruptionv1alpha1 .Granted {
158
+ nd_state = 2
159
+ }
160
+ NodeDisruptionState .WithLabelValues (nd .Name ).Set (float64 (nd_state ))
161
+ NodeDisruptionCreated .WithLabelValues (nd .Name ).Set (float64 (nd .CreationTimestamp .Unix ()))
162
+ // Deadline might not be set so it will be 0 but timestamp in Go are not Unix epoch
163
+ // so converting a 0 timestamp will not result in epoch 0. We override this to have nice values
164
+ deadline := nd .Spec .Retry .Deadline .Unix ()
165
+ if nd .Spec .Retry .Deadline .IsZero () {
166
+ deadline = 0
167
+ }
168
+ NodeDisruptionDeadline .WithLabelValues (nd .Name ).Set (float64 (deadline ))
169
+
170
+ for _ , node_name := range nd .Status .DisruptedNodes {
171
+ NodeDisruptionImpactedNodes .WithLabelValues (nd .Name , node_name ).Set (1 )
172
+ }
173
+ }
174
+
108
175
// SetupWithManager sets up the controller with the Manager.
109
176
func (r * NodeDisruptionReconciler ) SetupWithManager (mgr ctrl.Manager ) error {
110
177
r .Recorder = mgr .GetEventRecorderFor ("node-disruption-controller" )
0 commit comments