Skip to content

Commit 1376a1d

Browse files
FatmaBouzghaia f.bouzghaia
and
f.bouzghaia
authored
Implement spec.type to Node Disruption API (#67)
Implement #64 Signed-off-by: f.bouzghaia <[email protected]> Co-authored-by: f.bouzghaia <[email protected]>
1 parent 7d7fd52 commit 1376a1d

8 files changed

+173
-26
lines changed

DOC.md

+7
Original file line numberDiff line numberDiff line change
@@ -756,6 +756,13 @@ NodeDisruptionSpec defines the desired state of NodeDisruption
756756
Configure the retrying behavior of a NodeDisruption<br/>
757757
</td>
758758
<td>false</td>
759+
</tr><tr>
760+
<td><b>type</b></td>
761+
<td>string</td>
762+
<td>
763+
Type of the node disruption<br/>
764+
</td>
765+
<td>false</td>
759766
</tr></tbody>
760767
</table>
761768

api/v1alpha1/nodedisruption_types.go

+2
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@ type NodeDisruptionSpec struct {
4949
// Label query over nodes that will be impacted by the disruption
5050
NodeSelector metav1.LabelSelector `json:"nodeSelector,omitempty"`
5151
Retry RetrySpec `json:"retry,omitempty"`
52+
// Type of the node disruption
53+
Type string `json:"type,omitempty"`
5254
}
5355

5456
// Configure the retrying behavior of a NodeDisruption

chart/templates/nodedisruption-crd.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ spec:
101101
description: Enable retrying
102102
type: boolean
103103
type: object
104+
type:
105+
description: Type of the node disruption
106+
type: string
104107
type: object
105108
status:
106109
description: NodeDisruptionStatus defines the observed state of NodeDisruption

cmd/main.go

+4
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package main
1919
import (
2020
"flag"
2121
"os"
22+
"strings"
2223
"time"
2324

2425
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
@@ -56,6 +57,7 @@ func main() {
5657
var rejectEmptyNodeDisruption bool
5758
var retryInterval time.Duration
5859
var rejectOverlappingDisruption bool
60+
var nodeDisruptionTypes string
5961
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
6062
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
6163
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
@@ -64,6 +66,7 @@ func main() {
6466
flag.BoolVar(&rejectEmptyNodeDisruption, "reject-empty-node-disruption", false, "Reject NodeDisruption matching no actual node.")
6567
flag.DurationVar(&retryInterval, "retry-interval", controller.DefaultRetryInterval, "How long to wait between each retry (Default 60s)")
6668
flag.BoolVar(&rejectOverlappingDisruption, "reject-overlapping-disruption", false, "Automatically reject any overlapping NodeDisruption (based on node selector), preserving the oldest one")
69+
flag.StringVar(&nodeDisruptionTypes, "node-disruption-types", "", "The list of types allowed for a node disruption separated by a comma.")
6770

6871
opts := zap.Options{
6972
Development: true,
@@ -104,6 +107,7 @@ func main() {
104107
RejectEmptyNodeDisruption: rejectEmptyNodeDisruption,
105108
RetryInterval: retryInterval,
106109
RejectOverlappingDisruption: rejectOverlappingDisruption,
110+
NodeDisruptionTypes: strings.Split(nodeDisruptionTypes, ","),
107111
},
108112
}).SetupWithManager(mgr); err != nil {
109113
setupLog.Error(err, "unable to create controller", "controller", "NodeDisruption")

config/crd/bases/nodedisruption.criteo.com_nodedisruptions.yaml

+3
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,9 @@ spec:
100100
description: Enable retrying
101101
type: boolean
102102
type: object
103+
type:
104+
description: Type of the node disruption
105+
type: string
103106
type: object
104107
status:
105108
description: NodeDisruptionStatus defines the observed state of NodeDisruption

internal/controller/metrics.go

+14-7
Original file line numberDiff line numberDiff line change
@@ -17,49 +17,56 @@ var (
1717
Name: METIC_PREFIX + "node_disruption_granted_total",
1818
Help: "Total number of granted node disruptions",
1919
},
20-
[]string{},
20+
[]string{"type"},
2121
)
2222
NodeDisruptionRejectedTotal = promauto.With(metrics.Registry).NewCounterVec(
2323
prometheus.CounterOpts{
2424
Name: METIC_PREFIX + "node_disruption_rejected_total",
2525
Help: "Total number of rejected node disruptions",
2626
},
27-
[]string{},
27+
[]string{"type"},
2828
)
2929
NodeDisruptionStateAsValue = promauto.With(metrics.Registry).NewGaugeVec(
3030
prometheus.GaugeOpts{
3131
Name: METIC_PREFIX + "node_disruption_state_value",
3232
Help: "State of node disruption: pending=0, rejected=-1, accepted=1",
3333
},
34-
[]string{"node_disruption_name"},
34+
[]string{"node_disruption_name", "type"},
3535
)
3636
NodeDisruptionStateAsLabel = promauto.With(metrics.Registry).NewGaugeVec(
3737
prometheus.GaugeOpts{
3838
Name: METIC_PREFIX + "node_disruption_state_label",
3939
Help: "State of node disruption: 0 not in this state; 1 is in state",
4040
},
41-
[]string{"node_disruption_name", "state"},
41+
[]string{"node_disruption_name", "state", "type"},
4242
)
4343
NodeDisruptionCreated = promauto.With(metrics.Registry).NewGaugeVec(
4444
prometheus.GaugeOpts{
4545
Name: METIC_PREFIX + "node_disruption_created",
4646
Help: "Date of create of the node disruption",
4747
},
48-
[]string{"node_disruption_name"},
48+
[]string{"node_disruption_name", "type"},
4949
)
5050
NodeDisruptionDeadline = promauto.With(metrics.Registry).NewGaugeVec(
5151
prometheus.GaugeOpts{
5252
Name: METIC_PREFIX + "node_disruption_deadline",
5353
Help: "Date of the deadline of the node disruption (0 if unset)",
5454
},
55-
[]string{"node_disruption_name"},
55+
[]string{"node_disruption_name", "type"},
5656
)
5757
NodeDisruptionImpactedNodes = promauto.With(metrics.Registry).NewGaugeVec(
5858
prometheus.GaugeOpts{
5959
Name: METIC_PREFIX + "node_disruption_impacted_node",
6060
Help: "high cardinality: create a metric for each node impacted by a given node disruption",
6161
},
62-
[]string{"node_disruption_name", "node_name"},
62+
[]string{"node_disruption_name", "node_name", "type"},
63+
)
64+
NodeDisruptionType = promauto.With(metrics.Registry).NewGaugeVec(
65+
prometheus.GaugeOpts{
66+
Name: METIC_PREFIX + "node_disruption_type",
67+
Help: "Type of the node disruption",
68+
},
69+
[]string{"node_disruption_name", "type"},
6370
)
6471
// DISRUPTION BUDGET METRICS
6572
DisruptionBudgetCheckHealthHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec(

internal/controller/nodedisruption_controller.go

+28-15
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import (
2929
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3030
"k8s.io/apimachinery/pkg/runtime"
3131
"k8s.io/client-go/tools/record"
32+
"k8s.io/utils/strings/slices"
3233
ctrl "sigs.k8s.io/controller-runtime"
3334
"sigs.k8s.io/controller-runtime/pkg/client"
3435
"sigs.k8s.io/controller-runtime/pkg/log"
@@ -45,6 +46,8 @@ type NodeDisruptionReconcilerConfig struct {
4546
RetryInterval time.Duration
4647
// Reject NodeDisruption if its node selector overlaps an older NodeDisruption's selector
4748
RejectOverlappingDisruption bool
49+
// Specify which node disruption types are allowed to be granted
50+
NodeDisruptionTypes []string
4851
}
4952

5053
// NodeDisruptionReconciler reconciles NodeDisruptions
@@ -116,39 +119,40 @@ func PruneNodeDisruptionMetrics(nd_name string) {
116119
NodeDisruptionCreated.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
117120
NodeDisruptionDeadline.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
118121
NodeDisruptionImpactedNodes.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
122+
NodeDisruptionType.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
119123
}
120124

121125
// UpdateNodeDisruptionMetrics update metrics for a Node Disruption
122126
func UpdateNodeDisruptionMetrics(nd *nodedisruptionv1alpha1.NodeDisruption) {
123127
nd_state := 0
124128
if nd.Status.State == nodedisruptionv1alpha1.Pending {
125129
nd_state = 0
126-
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(1)
127-
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(0)
128-
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(0)
130+
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending), nd.Spec.Type).Set(1)
131+
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted), nd.Spec.Type).Set(0)
132+
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected), nd.Spec.Type).Set(0)
129133
} else if nd.Status.State == nodedisruptionv1alpha1.Rejected {
130134
nd_state = -1
131-
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(0)
132-
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(1)
133-
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(0)
135+
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending), nd.Spec.Type).Set(0)
136+
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected), nd.Spec.Type).Set(1)
137+
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted), nd.Spec.Type).Set(0)
134138
} else if nd.Status.State == nodedisruptionv1alpha1.Granted {
135139
nd_state = 1
136-
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(0)
137-
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(0)
138-
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(1)
140+
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending), nd.Spec.Type).Set(0)
141+
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected), nd.Spec.Type).Set(0)
142+
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted), nd.Spec.Type).Set(1)
139143
}
140-
NodeDisruptionStateAsValue.WithLabelValues(nd.Name).Set(float64(nd_state))
141-
NodeDisruptionCreated.WithLabelValues(nd.Name).Set(float64(nd.CreationTimestamp.Unix()))
144+
NodeDisruptionStateAsValue.WithLabelValues(nd.Name, nd.Spec.Type).Set(float64(nd_state))
145+
NodeDisruptionCreated.WithLabelValues(nd.Name, nd.Spec.Type).Set(float64(nd.CreationTimestamp.Unix()))
142146
// The deadline may be unset, in which case it holds the zero timestamp. In Go the zero timestamp is not the Unix epoch,
143147
// so calling Unix() on it would not yield 0. We override the value to 0 explicitly so the metric reads cleanly.
144148
deadline := nd.Spec.Retry.Deadline.Unix()
145149
if nd.Spec.Retry.Deadline.IsZero() {
146150
deadline = 0
147151
}
148-
NodeDisruptionDeadline.WithLabelValues(nd.Name).Set(float64(deadline))
152+
NodeDisruptionDeadline.WithLabelValues(nd.Name, nd.Spec.Type).Set(float64(deadline))
149153

150154
for _, node_name := range nd.Status.DisruptedNodes {
151-
NodeDisruptionImpactedNodes.WithLabelValues(nd.Name, node_name).Set(1)
155+
NodeDisruptionImpactedNodes.WithLabelValues(nd.Name, node_name, nd.Spec.Type).Set(1)
152156
}
153157
}
154158

@@ -195,9 +199,9 @@ func (ndr *SingleNodeDisruptionReconciler) TryTransitionState(ctx context.Contex
195199
return err
196200
}
197201
if ndr.NodeDisruption.Status.State == nodedisruptionv1alpha1.Granted {
198-
NodeDisruptionGrantedTotal.WithLabelValues().Inc()
202+
NodeDisruptionGrantedTotal.WithLabelValues(ndr.NodeDisruption.Spec.Type).Inc()
199203
} else if ndr.NodeDisruption.Status.State == nodedisruptionv1alpha1.Rejected {
200-
NodeDisruptionRejectedTotal.WithLabelValues().Inc()
204+
NodeDisruptionRejectedTotal.WithLabelValues(ndr.NodeDisruption.Spec.Type).Inc()
201205
}
202206
}
203207
// If the disruption is not Pending nor unknown, the state is final
@@ -331,6 +335,15 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithInternalConstraints(ctx c
331335
return anyFailed, []nodedisruptionv1alpha1.DisruptedBudgetStatus{status}, err
332336
}
333337

338+
if len(ndr.Config.NodeDisruptionTypes) != 0 && !slices.Contains(ndr.Config.NodeDisruptionTypes, ndr.NodeDisruption.Spec.Type) {
339+
status := nodedisruptionv1alpha1.DisruptedBudgetStatus{
340+
Reference: ndr.getNodeDisruptionReference(),
341+
Reason: "Type provided of node disruption is not managed",
342+
Ok: false,
343+
}
344+
return true, []nodedisruptionv1alpha1.DisruptedBudgetStatus{status}, nil
345+
}
346+
334347
return false, statuses, nil
335348
}
336349

internal/controller/nodedisruption_controller_test.go

+112-4
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ func startDummyHTTPServer(handle http.HandlerFunc, listenAddr string) (cancelFn
112112
return func() { _ = srv.Shutdown(context.Background()) }
113113
}
114114

115-
func createNodeDisruption(name string, namespace string, nodeSelectorLabel map[string]string, ctx context.Context) {
115+
func createNodeDisruption(name string, namespace string, nodeSelectorLabel map[string]string, disruptionType string, ctx context.Context) {
116116
overlappingDisruption := &nodedisruptionv1alpha1.NodeDisruption{
117117
TypeMeta: metav1.TypeMeta{
118118
APIVersion: "nodedisruption.criteo.com/v1alpha1",
@@ -124,6 +124,7 @@ func createNodeDisruption(name string, namespace string, nodeSelectorLabel map[s
124124
},
125125
Spec: nodedisruptionv1alpha1.NodeDisruptionSpec{
126126
NodeSelector: metav1.LabelSelector{MatchLabels: nodeSelectorLabel},
127+
Type: disruptionType,
127128
},
128129
}
129130
Expect(k8sClient.Create(ctx, overlappingDisruption.DeepCopy())).Should(Succeed())
@@ -644,7 +645,7 @@ var _ = Describe("NodeDisruption controller", func() {
644645

645646
BeforeEach(func() {
646647
By("configuring a first disruption")
647-
createNodeDisruption(firstDisruptionName, NDNamespace, nodeLabels1, ctx)
648+
createNodeDisruption(firstDisruptionName, NDNamespace, nodeLabels1, "", ctx)
648649
})
649650
AfterEach(func() {
650651
clearAllNodeDisruptionResources()
@@ -664,7 +665,7 @@ var _ = Describe("NodeDisruption controller", func() {
664665
})
665666

666667
By("creating an overlapping disruption")
667-
createNodeDisruption(overlappingDisruptionName, NDNamespace, node2Label, ctx)
668+
createNodeDisruption(overlappingDisruptionName, NDNamespace, node2Label, "", ctx)
668669
})
669670
It("rejects the NodeDisruption", func() {
670671
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
@@ -691,7 +692,7 @@ var _ = Describe("NodeDisruption controller", func() {
691692
})
692693

693694
By("creating an overlapping disruption")
694-
createNodeDisruption(overlappingDisruptionName, NDNamespace, node2Label, ctx)
695+
createNodeDisruption(overlappingDisruptionName, NDNamespace, node2Label, "", ctx)
695696
})
696697
It("accepts the NodeDisruption", func() {
697698
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
@@ -705,5 +706,112 @@ var _ = Describe("NodeDisruption controller", func() {
705706
})
706707
})
707708
})
709+
710+
Describe("Reject typed disruptions feature", Label("Node disruption type"), Ordered, func() {
711+
var (
712+
createdDisruption = &nodedisruptionv1alpha1.NodeDisruption{}
713+
disruptionName = "disruption-test"
714+
allowedDisruptionTypes = []string{"maintenance", "decommission", "tor-maintenance"}
715+
)
716+
717+
AfterEach(func() {
718+
clearAllNodeDisruptionResources()
719+
cancelFn()
720+
})
721+
722+
Context("NodeDisruptionTypes is enabled", func() {
723+
When("the created disruption has an allowed type", func() {
724+
BeforeEach(func() {
725+
By("Configuring a disruption")
726+
createNodeDisruption(disruptionName, NDNamespace, nodeLabels1, "maintenance", ctx)
727+
728+
By("starting a reconciler with NodeDisruptionTypes enabled")
729+
cancelFn = startReconcilerWithConfig(NodeDisruptionReconcilerConfig{
730+
RejectOverlappingDisruption: false,
731+
RetryInterval: time.Second * 1,
732+
NodeDisruptionTypes: allowedDisruptionTypes,
733+
})
734+
})
735+
It("grants the NodeDisruption", func() {
736+
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
737+
err := k8sClient.Get(ctx, types.NamespacedName{Name: disruptionName, Namespace: NDNamespace}, createdDisruption)
738+
if err != nil {
739+
panic("should be able to get")
740+
}
741+
return createdDisruption.Status.State
742+
}, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted))
743+
})
744+
})
745+
When("the created disruption has not an allowed type", func() {
746+
BeforeEach(func() {
747+
By("Configuring a disruption")
748+
createNodeDisruption(disruptionName, NDNamespace, nodeLabels1, "toto", ctx)
749+
750+
By("starting a reconciler with NodeDisruptionTypes enabled")
751+
cancelFn = startReconcilerWithConfig(NodeDisruptionReconcilerConfig{
752+
RejectOverlappingDisruption: false,
753+
RetryInterval: time.Second * 1,
754+
NodeDisruptionTypes: allowedDisruptionTypes,
755+
})
756+
})
757+
It("rejects the NodeDisruption", func() {
758+
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
759+
err := k8sClient.Get(ctx, types.NamespacedName{Name: disruptionName, Namespace: NDNamespace}, createdDisruption)
760+
if err != nil {
761+
panic("should be able to get")
762+
}
763+
return createdDisruption.Status.State
764+
}, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Rejected))
765+
})
766+
})
767+
})
768+
769+
Context("NodeDisruptionTypes is disabled", func() {
770+
When("the created disruption has a type", func() {
771+
BeforeEach(func() {
772+
By("Configuring a disruption")
773+
createNodeDisruption(disruptionName, NDNamespace, nodeLabels1, "maintenance", ctx)
774+
775+
By("starting a reconciler with NodeDisruptionTypes enabled")
776+
cancelFn = startReconcilerWithConfig(NodeDisruptionReconcilerConfig{
777+
RejectOverlappingDisruption: false,
778+
RetryInterval: time.Second * 1,
779+
NodeDisruptionTypes: []string{},
780+
})
781+
})
782+
It("grants the NodeDisruption", func() {
783+
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
784+
err := k8sClient.Get(ctx, types.NamespacedName{Name: disruptionName, Namespace: NDNamespace}, createdDisruption)
785+
if err != nil {
786+
panic("should be able to get")
787+
}
788+
return createdDisruption.Status.State
789+
}, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted))
790+
})
791+
})
792+
When("the created disruption has not a type", func() {
793+
BeforeEach(func() {
794+
By("Configuring a disruption")
795+
createNodeDisruption(disruptionName, NDNamespace, nodeLabels1, "", ctx)
796+
797+
By("starting a reconciler with NodeDisruptionTypes enabled")
798+
cancelFn = startReconcilerWithConfig(NodeDisruptionReconcilerConfig{
799+
RejectOverlappingDisruption: false,
800+
RetryInterval: time.Second * 1,
801+
NodeDisruptionTypes: []string{},
802+
})
803+
})
804+
It("grants the NodeDisruption", func() {
805+
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
806+
err := k8sClient.Get(ctx, types.NamespacedName{Name: disruptionName, Namespace: NDNamespace}, createdDisruption)
807+
if err != nil {
808+
panic("should be able to get")
809+
}
810+
return createdDisruption.Status.State
811+
}, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted))
812+
})
813+
})
814+
})
815+
})
708816
})
709817
})

0 commit comments

Comments
 (0)