Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit e12ca3f

Browse files
authoredFeb 21, 2024··
Add metrics to node disruption (#52)
Add the following metrics: - state: reflects the state of the node disruption - created and deadline: report the creation timestamp and the deadline as values - impacted nodes: report one metric per impacted node, helpful for intersection. Unit testing of metrics in Go is not easy; it will be part of a future change
1 parent 79b3985 commit e12ca3f

File tree

3 files changed

+77
-2
lines changed

3 files changed

+77
-2
lines changed
 

‎Makefile

+1-1
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,7 @@ KINDCONFIG ?= $(shell pwd)/.kubecfg
157157

158158
## Tool Versions
159159
KUSTOMIZE_VERSION ?= v5.0.1
160-
CONTROLLER_TOOLS_VERSION ?= v0.12.0
160+
CONTROLLER_TOOLS_VERSION ?= v0.14.0
161161
KIND_VERSION ?= v0.20.0
162162

163163
.PHONY: kustomize

‎cmd/main.go

+8
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ import (
3434

3535
nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
3636
"github.com/criteo/node-disruption-controller/internal/controller"
37+
"sigs.k8s.io/controller-runtime/pkg/metrics"
3738
//+kubebuilder:scaffold:imports
3839
)
3940

@@ -47,6 +48,13 @@ func init() {
4748

4849
utilruntime.Must(nodedisruptionv1alpha1.AddToScheme(scheme))
4950
//+kubebuilder:scaffold:scheme
51+
52+
metrics.Registry.MustRegister(
53+
controller.NodeDisruptionState,
54+
controller.NodeDisruptionCreated,
55+
controller.NodeDisruptionDeadline,
56+
controller.NodeDisruptionImpactedNodes,
57+
)
5058
}
5159

5260
func main() {

‎internal/controller/nodedisruption_controller.go

+68-1
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ import (
2424

2525
nodedisruptionv1alpha1 "github.com/criteo/node-disruption-controller/api/v1alpha1"
2626
"github.com/criteo/node-disruption-controller/pkg/resolver"
27+
"github.com/prometheus/client_golang/prometheus"
2728
"k8s.io/apimachinery/pkg/api/errors"
2829
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2930
"k8s.io/apimachinery/pkg/runtime"
@@ -46,6 +47,37 @@ type NodeDisruptionReconcilerConfig struct {
4647
RejectOverlappingDisruption bool
4748
}
4849

50+
var (
51+
NodeDisruptionState = prometheus.NewGaugeVec(
52+
prometheus.GaugeOpts{
53+
Name: "node_disruption_state",
54+
Help: "State of node disruption: pending=0, rejected=1, accepted=2",
55+
},
56+
[]string{"node_disruption_name"},
57+
)
58+
NodeDisruptionCreated = prometheus.NewGaugeVec(
59+
prometheus.GaugeOpts{
60+
Name: "node_disruption_created",
61+
Help: "Date of create of the node disruption",
62+
},
63+
[]string{"node_disruption_name"},
64+
)
65+
NodeDisruptionDeadline = prometheus.NewGaugeVec(
66+
prometheus.GaugeOpts{
67+
Name: "node_disruption_deadline",
68+
Help: "Date of the deadline of the node disruption (0 if unset)",
69+
},
70+
[]string{"node_disruption_name"},
71+
)
72+
NodeDisruptionImpactedNodes = prometheus.NewGaugeVec(
73+
prometheus.GaugeOpts{
74+
Name: "node_disruption_impacted_node",
75+
Help: "high cardinality: create a metric for each node impacted by a given node disruption",
76+
},
77+
[]string{"node_disruption_name", "node_name"},
78+
)
79+
)
80+
4981
// NodeDisruptionReconciler reconciles NodeDisruptions
5082
type NodeDisruptionReconciler struct {
5183
client.Client
@@ -72,11 +104,14 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
72104

73105
if err != nil {
74106
if errors.IsNotFound(err) {
75-
// If the resource was not found, nothing has to be done
107+
PruneNodeDisruptionMetric(req.NamespacedName.Name)
108+
// If the resource was not found, it no longer exists: its metrics were pruned above and nothing else has to be done
76109
return clusterResult, nil
77110
}
78111
return clusterResult, err
79112
}
113+
logger.Info("Updating metrics")
114+
UpdateNodeDisruptionMetric(nd)
80115

81116
logger.Info("Start reconcile of NodeDisruption", "state", nd.Status.State, "retryDate", nd.Status.NextRetryDate.Time)
82117
if time.Now().Before(nd.Status.NextRetryDate.Time) {
@@ -105,6 +140,38 @@ func (r *NodeDisruptionReconciler) Reconcile(ctx context.Context, req ctrl.Reque
105140
return clusterResult, nil
106141
}
107142

143+
// PruneNodeDisruptionMetric remove metrics for a Node Disruption that don't exist anymore
144+
func PruneNodeDisruptionMetric(nd_name string) {
145+
NodeDisruptionState.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
146+
NodeDisruptionCreated.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
147+
NodeDisruptionDeadline.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
148+
NodeDisruptionImpactedNodes.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
149+
}
150+
151+
func UpdateNodeDisruptionMetric(nd *nodedisruptionv1alpha1.NodeDisruption) {
152+
nd_state := 0
153+
if nd.Status.State == nodedisruptionv1alpha1.Pending {
154+
nd_state = 0
155+
} else if nd.Status.State == nodedisruptionv1alpha1.Rejected {
156+
nd_state = 1
157+
} else if nd.Status.State == nodedisruptionv1alpha1.Granted {
158+
nd_state = 2
159+
}
160+
NodeDisruptionState.WithLabelValues(nd.Name).Set(float64(nd_state))
161+
NodeDisruptionCreated.WithLabelValues(nd.Name).Set(float64(nd.CreationTimestamp.Unix()))
162+
// Deadline might not be set so it will be 0 but timestamp in Go are not Unix epoch
163+
// so converting a 0 timestamp will not result in epoch 0. We override this to have nice values
164+
deadline := nd.Spec.Retry.Deadline.Unix()
165+
if nd.Spec.Retry.Deadline.IsZero() {
166+
deadline = 0
167+
}
168+
NodeDisruptionDeadline.WithLabelValues(nd.Name).Set(float64(deadline))
169+
170+
for _, node_name := range nd.Status.DisruptedNodes {
171+
NodeDisruptionImpactedNodes.WithLabelValues(nd.Name, node_name).Set(1)
172+
}
173+
}
174+
108175
// SetupWithManager sets up the controller with the Manager.
109176
func (r *NodeDisruptionReconciler) SetupWithManager(mgr ctrl.Manager) error {
110177
r.Recorder = mgr.GetEventRecorderFor("node-disruption-controller")

0 commit comments

Comments
 (0)
Please sign in to comment.