Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement spec.type to Node Disruption API [#64] #67

Merged
merged 3 commits into from
Jun 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions DOC.md
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,13 @@ NodeDisruptionSpec defines the desired state of NodeDisruption
Configure the retrying behavior of a NodeDisruption<br/>
</td>
<td>false</td>
</tr><tr>
<td><b>type</b></td>
<td>string</td>
<td>
Type of the node disruption<br/>
</td>
<td>false</td>
</tr></tbody>
</table>

Expand Down
2 changes: 2 additions & 0 deletions api/v1alpha1/nodedisruption_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ type NodeDisruptionSpec struct {
// Label query over nodes that will be impacted by the disruption
NodeSelector metav1.LabelSelector `json:"nodeSelector,omitempty"`
Retry RetrySpec `json:"retry,omitempty"`
// Type of the node disruption
Type string `json:"type,omitempty"`
}

// Configure the retrying behavior of a NodeDisruption
Expand Down
3 changes: 3 additions & 0 deletions chart/templates/nodedisruption-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,9 @@ spec:
description: Enable retrying
type: boolean
type: object
type:
description: Type of the node disruption
type: string
type: object
status:
description: NodeDisruptionStatus defines the observed state of NodeDisruption
Expand Down
4 changes: 4 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package main
import (
"flag"
"os"
"strings"
"time"

// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
Expand Down Expand Up @@ -56,6 +57,7 @@ func main() {
var rejectEmptyNodeDisruption bool
var retryInterval time.Duration
var rejectOverlappingDisruption bool
var nodeDisruptionTypes string
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.")
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
flag.BoolVar(&enableLeaderElection, "leader-elect", false,
Expand All @@ -64,6 +66,7 @@ func main() {
flag.BoolVar(&rejectEmptyNodeDisruption, "reject-empty-node-disruption", false, "Reject NodeDisruption matching no actual node.")
flag.DurationVar(&retryInterval, "retry-interval", controller.DefaultRetryInterval, "How long to wait between each retry (Default 60s)")
flag.BoolVar(&rejectOverlappingDisruption, "reject-overlapping-disruption", false, "Automatically reject any overlapping NodeDisruption (based on node selector), preserving the oldest one")
flag.StringVar(&nodeDisruptionTypes, "node-disruption-types", "", "The list of types allowed for a node disruption separated by a comma.")

opts := zap.Options{
Development: true,
Expand Down Expand Up @@ -104,6 +107,7 @@ func main() {
RejectEmptyNodeDisruption: rejectEmptyNodeDisruption,
RetryInterval: retryInterval,
RejectOverlappingDisruption: rejectOverlappingDisruption,
NodeDisruptionTypes: strings.Split(nodeDisruptionTypes, ","),
},
}).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "NodeDisruption")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ spec:
description: Enable retrying
type: boolean
type: object
type:
description: Type of the node disruption
type: string
type: object
status:
description: NodeDisruptionStatus defines the observed state of NodeDisruption
Expand Down
21 changes: 14 additions & 7 deletions internal/controller/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,49 +17,56 @@ var (
Name: METIC_PREFIX + "node_disruption_granted_total",
Help: "Total number of granted node disruptions",
},
[]string{},
[]string{"type"},
)
NodeDisruptionRejectedTotal = promauto.With(metrics.Registry).NewCounterVec(
prometheus.CounterOpts{
Name: METIC_PREFIX + "node_disruption_rejected_total",
Help: "Total number of rejected node disruptions",
},
[]string{},
[]string{"type"},
)
NodeDisruptionStateAsValue = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_state_value",
Help: "State of node disruption: pending=0, rejected=-1, accepted=1",
},
[]string{"node_disruption_name"},
[]string{"node_disruption_name", "type"},
)
NodeDisruptionStateAsLabel = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_state_label",
Help: "State of node disruption: 0 not in this state; 1 is in state",
},
[]string{"node_disruption_name", "state"},
[]string{"node_disruption_name", "state", "type"},
)
NodeDisruptionCreated = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_created",
Help: "Date of create of the node disruption",
},
[]string{"node_disruption_name"},
[]string{"node_disruption_name", "type"},
)
NodeDisruptionDeadline = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_deadline",
Help: "Date of the deadline of the node disruption (0 if unset)",
},
[]string{"node_disruption_name"},
[]string{"node_disruption_name", "type"},
)
NodeDisruptionImpactedNodes = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_impacted_node",
Help: "high cardinality: create a metric for each node impacted by a given node disruption",
},
[]string{"node_disruption_name", "node_name"},
[]string{"node_disruption_name", "node_name", "type"},
)
NodeDisruptionType = promauto.With(metrics.Registry).NewGaugeVec(
prometheus.GaugeOpts{
Name: METIC_PREFIX + "node_disruption_type",
Help: "Type of the node disruption",
},
[]string{"node_disruption_name", "type"},
)
// DISRUPTION BUDGET METRICS
DisruptionBudgetCheckHealthHookStatusCodeTotal = promauto.With(metrics.Registry).NewCounterVec(
Expand Down
43 changes: 28 additions & 15 deletions internal/controller/nodedisruption_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/client-go/tools/record"
"k8s.io/utils/strings/slices"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/log"
Expand All @@ -45,6 +46,8 @@ type NodeDisruptionReconcilerConfig struct {
RetryInterval time.Duration
// Reject NodeDisruption if its node selector overlaps an older NodeDisruption's selector
RejectOverlappingDisruption bool
// Specify which node disruption types are allowed to be granted
NodeDisruptionTypes []string
}

// NodeDisruptionReconciler reconciles NodeDisruptions
Expand Down Expand Up @@ -116,39 +119,40 @@ func PruneNodeDisruptionMetrics(nd_name string) {
NodeDisruptionCreated.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionDeadline.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionImpactedNodes.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
NodeDisruptionType.DeletePartialMatch(prometheus.Labels{"node_disruption_name": nd_name})
}

// UpdateNodeDisruptionMetrics update metrics for a Node Disruption
func UpdateNodeDisruptionMetrics(nd *nodedisruptionv1alpha1.NodeDisruption) {
nd_state := 0
if nd.Status.State == nodedisruptionv1alpha1.Pending {
nd_state = 0
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(1)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending), nd.Spec.Type).Set(1)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted), nd.Spec.Type).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected), nd.Spec.Type).Set(0)
} else if nd.Status.State == nodedisruptionv1alpha1.Rejected {
nd_state = -1
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(1)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending), nd.Spec.Type).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected), nd.Spec.Type).Set(1)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted), nd.Spec.Type).Set(0)
} else if nd.Status.State == nodedisruptionv1alpha1.Granted {
nd_state = 1
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected)).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted)).Set(1)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Pending), nd.Spec.Type).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Rejected), nd.Spec.Type).Set(0)
NodeDisruptionStateAsLabel.WithLabelValues(nd.Name, string(nodedisruptionv1alpha1.Granted), nd.Spec.Type).Set(1)
}
NodeDisruptionStateAsValue.WithLabelValues(nd.Name).Set(float64(nd_state))
NodeDisruptionCreated.WithLabelValues(nd.Name).Set(float64(nd.CreationTimestamp.Unix()))
NodeDisruptionStateAsValue.WithLabelValues(nd.Name, nd.Spec.Type).Set(float64(nd_state))
NodeDisruptionCreated.WithLabelValues(nd.Name, nd.Spec.Type).Set(float64(nd.CreationTimestamp.Unix()))
// Deadline might not be set so it will be 0 but timestamp in Go are not Unix epoch
// so converting a 0 timestamp will not result in epoch 0. We override this to have nice values
deadline := nd.Spec.Retry.Deadline.Unix()
if nd.Spec.Retry.Deadline.IsZero() {
deadline = 0
}
NodeDisruptionDeadline.WithLabelValues(nd.Name).Set(float64(deadline))
NodeDisruptionDeadline.WithLabelValues(nd.Name, nd.Spec.Type).Set(float64(deadline))

for _, node_name := range nd.Status.DisruptedNodes {
NodeDisruptionImpactedNodes.WithLabelValues(nd.Name, node_name).Set(1)
NodeDisruptionImpactedNodes.WithLabelValues(nd.Name, node_name, nd.Spec.Type).Set(1)
}
}

Expand Down Expand Up @@ -195,9 +199,9 @@ func (ndr *SingleNodeDisruptionReconciler) TryTransitionState(ctx context.Contex
return err
}
if ndr.NodeDisruption.Status.State == nodedisruptionv1alpha1.Granted {
NodeDisruptionGrantedTotal.WithLabelValues().Inc()
NodeDisruptionGrantedTotal.WithLabelValues(ndr.NodeDisruption.Spec.Type).Inc()
} else if ndr.NodeDisruption.Status.State == nodedisruptionv1alpha1.Rejected {
NodeDisruptionRejectedTotal.WithLabelValues().Inc()
NodeDisruptionRejectedTotal.WithLabelValues(ndr.NodeDisruption.Spec.Type).Inc()
}
}
// If the disruption is not Pending nor unknown, the state is final
Expand Down Expand Up @@ -331,6 +335,15 @@ func (ndr *SingleNodeDisruptionReconciler) ValidateWithInternalConstraints(ctx c
return anyFailed, []nodedisruptionv1alpha1.DisruptedBudgetStatus{status}, err
}

if len(ndr.Config.NodeDisruptionTypes) != 0 && !slices.Contains(ndr.Config.NodeDisruptionTypes, ndr.NodeDisruption.Spec.Type) {
status := nodedisruptionv1alpha1.DisruptedBudgetStatus{
Reference: ndr.getNodeDisruptionReference(),
Reason: "Type provided of node disruption is not managed",
Ok: false,
}
return true, []nodedisruptionv1alpha1.DisruptedBudgetStatus{status}, nil
}

return false, statuses, nil
}

Expand Down
116 changes: 112 additions & 4 deletions internal/controller/nodedisruption_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ func startDummyHTTPServer(handle http.HandlerFunc, listenAddr string) (cancelFn
return func() { _ = srv.Shutdown(context.Background()) }
}

func createNodeDisruption(name string, namespace string, nodeSelectorLabel map[string]string, ctx context.Context) {
func createNodeDisruption(name string, namespace string, nodeSelectorLabel map[string]string, disruptionType string, ctx context.Context) {
overlappingDisruption := &nodedisruptionv1alpha1.NodeDisruption{
TypeMeta: metav1.TypeMeta{
APIVersion: "nodedisruption.criteo.com/v1alpha1",
Expand All @@ -124,6 +124,7 @@ func createNodeDisruption(name string, namespace string, nodeSelectorLabel map[s
},
Spec: nodedisruptionv1alpha1.NodeDisruptionSpec{
NodeSelector: metav1.LabelSelector{MatchLabels: nodeSelectorLabel},
Type: disruptionType,
},
}
Expect(k8sClient.Create(ctx, overlappingDisruption.DeepCopy())).Should(Succeed())
Expand Down Expand Up @@ -644,7 +645,7 @@ var _ = Describe("NodeDisruption controller", func() {

BeforeEach(func() {
By("configuring a first disruption")
createNodeDisruption(firstDisruptionName, NDNamespace, nodeLabels1, ctx)
createNodeDisruption(firstDisruptionName, NDNamespace, nodeLabels1, "", ctx)
})
AfterEach(func() {
clearAllNodeDisruptionResources()
Expand All @@ -664,7 +665,7 @@ var _ = Describe("NodeDisruption controller", func() {
})

By("creating an overlapping disruption")
createNodeDisruption(overlappingDisruptionName, NDNamespace, node2Label, ctx)
createNodeDisruption(overlappingDisruptionName, NDNamespace, node2Label, "", ctx)
})
It("rejects the NodeDisruption", func() {
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
Expand All @@ -691,7 +692,7 @@ var _ = Describe("NodeDisruption controller", func() {
})

By("creating an overlapping disruption")
createNodeDisruption(overlappingDisruptionName, NDNamespace, node2Label, ctx)
createNodeDisruption(overlappingDisruptionName, NDNamespace, node2Label, "", ctx)
})
It("accepts the NodeDisruption", func() {
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
Expand All @@ -705,5 +706,112 @@ var _ = Describe("NodeDisruption controller", func() {
})
})
})

Describe("Reject typed disruptions feature", Label("Node disruption type"), Ordered, func() {
var (
createdDisruption = &nodedisruptionv1alpha1.NodeDisruption{}
disruptionName = "disruption-test"
allowedDisruptionTypes = []string{"maintenance", "decommission", "tor-maintenance"}
)

AfterEach(func() {
clearAllNodeDisruptionResources()
cancelFn()
})

Context("NodeDisruptionTypes is enabled", func() {
When("the created disruption has an allowed type", func() {
BeforeEach(func() {
By("Configuring a disruption")
createNodeDisruption(disruptionName, NDNamespace, nodeLabels1, "maintenance", ctx)

By("starting a reconciler with NodeDisruptionTypes enabled")
cancelFn = startReconcilerWithConfig(NodeDisruptionReconcilerConfig{
RejectOverlappingDisruption: false,
RetryInterval: time.Second * 1,
NodeDisruptionTypes: allowedDisruptionTypes,
})
})
It("grants the NodeDisruption", func() {
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
err := k8sClient.Get(ctx, types.NamespacedName{Name: disruptionName, Namespace: NDNamespace}, createdDisruption)
if err != nil {
panic("should be able to get")
}
return createdDisruption.Status.State
}, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted))
})
})
When("the created disruption has not an allowed type", func() {
BeforeEach(func() {
By("Configuring a disruption")
createNodeDisruption(disruptionName, NDNamespace, nodeLabels1, "toto", ctx)

By("starting a reconciler with NodeDisruptionTypes enabled")
cancelFn = startReconcilerWithConfig(NodeDisruptionReconcilerConfig{
RejectOverlappingDisruption: false,
RetryInterval: time.Second * 1,
NodeDisruptionTypes: allowedDisruptionTypes,
})
})
It("rejects the NodeDisruption", func() {
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
err := k8sClient.Get(ctx, types.NamespacedName{Name: disruptionName, Namespace: NDNamespace}, createdDisruption)
if err != nil {
panic("should be able to get")
}
return createdDisruption.Status.State
}, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Rejected))
})
})
})

Context("NodeDisruptionTypes is disabled", func() {
When("the created disruption has a type", func() {
BeforeEach(func() {
By("Configuring a disruption")
createNodeDisruption(disruptionName, NDNamespace, nodeLabels1, "maintenance", ctx)

By("starting a reconciler with NodeDisruptionTypes enabled")
cancelFn = startReconcilerWithConfig(NodeDisruptionReconcilerConfig{
RejectOverlappingDisruption: false,
RetryInterval: time.Second * 1,
NodeDisruptionTypes: []string{},
})
})
It("grants the NodeDisruption", func() {
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
err := k8sClient.Get(ctx, types.NamespacedName{Name: disruptionName, Namespace: NDNamespace}, createdDisruption)
if err != nil {
panic("should be able to get")
}
return createdDisruption.Status.State
}, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted))
})
})
When("the created disruption has not a type", func() {
BeforeEach(func() {
By("Configuring a disruption")
createNodeDisruption(disruptionName, NDNamespace, nodeLabels1, "", ctx)

By("starting a reconciler with NodeDisruptionTypes enabled")
cancelFn = startReconcilerWithConfig(NodeDisruptionReconcilerConfig{
RejectOverlappingDisruption: false,
RetryInterval: time.Second * 1,
NodeDisruptionTypes: []string{},
})
})
It("grants the NodeDisruption", func() {
Eventually(func() nodedisruptionv1alpha1.NodeDisruptionState {
err := k8sClient.Get(ctx, types.NamespacedName{Name: disruptionName, Namespace: NDNamespace}, createdDisruption)
if err != nil {
panic("should be able to get")
}
return createdDisruption.Status.State
}, timeout, interval).Should(Equal(nodedisruptionv1alpha1.Granted))
})
})
})
})
})
})
Loading