Skip to content

Commit

Permalink
Timeout if Pending takes too long (#854)
Browse files Browse the repository at this point in the history
Add LastUnavailableNodeCount timestamp to avoid
getting stuck at Pending forever.

Signed-off-by: Radim Hrazdil <[email protected]>
  • Loading branch information
rhrazdil authored Oct 20, 2021
1 parent d7a8df8 commit b98a1cd
Show file tree
Hide file tree
Showing 6 changed files with 73 additions and 41 deletions.
8 changes: 7 additions & 1 deletion api/shared/nodenetworkconfigurationpolicy_types.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
package shared

import "k8s.io/apimachinery/pkg/util/intstr"
import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

// NodeNetworkConfigurationPolicySpec defines the desired state of NodeNetworkConfigurationPolicy
type NodeNetworkConfigurationPolicySpec struct {
Expand Down Expand Up @@ -28,6 +31,9 @@ type NodeNetworkConfigurationPolicyStatus struct {
// processing a NodeNetworkConfigurationPolicy
// +optional
UnavailableNodeCount int `json:"unavailableNodeCount,omitempty" optional:"true"`
// LastUnavailableNodeCountUpdate is time of the last UnavailableNodeCount update
// +optional
LastUnavailableNodeCountUpdate *metav1.Time `json:"lastUnavailableNodeCountUpdate,omitempty" optional:"true"`
}

const (
Expand Down
11 changes: 8 additions & 3 deletions controllers/nodenetworkconfigurationpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ import (
nmstate "github.com/nmstate/kubernetes-nmstate/pkg/helper"
"github.com/nmstate/kubernetes-nmstate/pkg/node"
"github.com/nmstate/kubernetes-nmstate/pkg/policyconditions"
"github.com/nmstate/kubernetes-nmstate/pkg/probe"
"github.com/nmstate/kubernetes-nmstate/pkg/selectors"
)

Expand Down Expand Up @@ -180,7 +181,7 @@ func (r *NodeNetworkConfigurationPolicyReconciler) Reconcile(ctx context.Context
return ctrl.Result{}, nil
}

if r.shouldIncrementUnavailableNodeCount(previousConditions) {
if r.shouldIncrementUnavailableNodeCount(instance, previousConditions) {
err = r.incrementUnavailableNodeCount(instance)
if err != nil {
if apierrors.IsConflict(err) {
Expand Down Expand Up @@ -335,8 +336,10 @@ func (r *NodeNetworkConfigurationPolicyReconciler) deleteEnactmentForPolicy(poli
return nil
}

func (r *NodeNetworkConfigurationPolicyReconciler) shouldIncrementUnavailableNodeCount(conditions *nmstateapi.ConditionList) bool {
return !enactmentstatus.IsProgressing(conditions)
func (r *NodeNetworkConfigurationPolicyReconciler) shouldIncrementUnavailableNodeCount(policy *nmstatev1beta1.NodeNetworkConfigurationPolicy, conditions *nmstateapi.ConditionList) bool {
return !enactmentstatus.IsProgressing(conditions) &&
(policy.Status.LastUnavailableNodeCountUpdate == nil ||
time.Now().Sub(policy.Status.LastUnavailableNodeCountUpdate.Time) < (nmstate.DesiredStateConfigurationTimeout+probe.ProbesTotalTimeout))
}

func (r *NodeNetworkConfigurationPolicyReconciler) incrementUnavailableNodeCount(policy *nmstatev1beta1.NodeNetworkConfigurationPolicy) error {
Expand All @@ -352,6 +355,7 @@ func (r *NodeNetworkConfigurationPolicyReconciler) incrementUnavailableNodeCount
if policy.Status.UnavailableNodeCount >= maxUnavailable {
return apierrors.NewConflict(schema.GroupResource{Resource: "nodenetworkconfigurationpolicies"}, policy.Name, fmt.Errorf("maximal number of %d nodes are already processing policy configuration", policy.Status.UnavailableNodeCount))
}
policy.Status.LastUnavailableNodeCountUpdate = &metav1.Time{Time: time.Now()}
policy.Status.UnavailableNodeCount += 1
err = r.Client.Status().Update(context.TODO(), policy)
if err != nil {
Expand Down Expand Up @@ -382,6 +386,7 @@ func tryDecrementingUnavailableNodeCount(statusWriterClient client.StatusClient,
if instance.Status.UnavailableNodeCount <= 0 {
return fmt.Errorf("no unavailable nodes")
}
instance.Status.LastUnavailableNodeCountUpdate = &metav1.Time{Time: time.Now()}
instance.Status.UnavailableNodeCount -= 1
return statusWriterClient.Status().Update(context.TODO(), instance)
})
Expand Down
67 changes: 38 additions & 29 deletions controllers/nodenetworkconfigurationpolicy_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@ package controllers

import (
"context"

. "github.com/onsi/ginkgo"
. "github.com/onsi/ginkgo/extensions/table"
. "github.com/onsi/gomega"
Expand Down Expand Up @@ -70,10 +69,11 @@ var _ = Describe("NodeNetworkConfigurationPolicy controller predicates", func()
)

type incrementUnavailableNodeCountCase struct {
currentUnavailableNodeCount int
expectedUnavailableNodeCount int
expectedReconcileResult ctrl.Result
previousEnactmentConditions func(*shared.ConditionList, string)
currentUnavailableNodeCount int
expectedUnavailableNodeCount int
expectedReconcileResult ctrl.Result
previousEnactmentConditions func(*shared.ConditionList, string)
shouldUpdateUnavailableNodeCount bool
}
DescribeTable("when claimNodeRunningUpdate is called and",
func(c incrementUnavailableNodeCountCase) {
Expand Down Expand Up @@ -130,48 +130,57 @@ var _ = Describe("NodeNetworkConfigurationPolicy controller predicates", func()
obtainedNNCP := nmstatev1beta1.NodeNetworkConfigurationPolicy{}
cl.Get(context.TODO(), types.NamespacedName{Name: nncp.Name}, &obtainedNNCP)
Expect(obtainedNNCP.Status.UnavailableNodeCount).To(Equal(c.expectedUnavailableNodeCount))
if c.shouldUpdateUnavailableNodeCount {
Expect(obtainedNNCP.Status.LastUnavailableNodeCountUpdate).ToNot(BeNil())
}
},
Entry("No node applying policy with empty enactment, should succeed incrementing UnavailableNodeCount",
incrementUnavailableNodeCountCase{
currentUnavailableNodeCount: 0,
expectedUnavailableNodeCount: 0,
previousEnactmentConditions: func(*shared.ConditionList, string) {},
expectedReconcileResult: ctrl.Result{},
currentUnavailableNodeCount: 0,
expectedUnavailableNodeCount: 0,
previousEnactmentConditions: func(*shared.ConditionList, string) {},
expectedReconcileResult: ctrl.Result{},
shouldUpdateUnavailableNodeCount: true,
}),
Entry("No node applying policy with progressing enactment, should succeed incrementing UnavailableNodeCount",
incrementUnavailableNodeCountCase{
currentUnavailableNodeCount: 0,
expectedUnavailableNodeCount: 0,
previousEnactmentConditions: conditions.SetProgressing,
expectedReconcileResult: ctrl.Result{},
currentUnavailableNodeCount: 0,
expectedUnavailableNodeCount: 0,
previousEnactmentConditions: conditions.SetProgressing,
expectedReconcileResult: ctrl.Result{},
shouldUpdateUnavailableNodeCount: false,
}),
Entry("No node applying policy with Pending enactment, should succeed incrementing UnavailableNodeCount",
incrementUnavailableNodeCountCase{
currentUnavailableNodeCount: 0,
expectedUnavailableNodeCount: 0,
previousEnactmentConditions: conditions.SetPending,
expectedReconcileResult: ctrl.Result{},
currentUnavailableNodeCount: 0,
expectedUnavailableNodeCount: 0,
previousEnactmentConditions: conditions.SetPending,
expectedReconcileResult: ctrl.Result{},
shouldUpdateUnavailableNodeCount: true,
}),
Entry("One node applying policy with empty enactment, should conflict incrementing UnavailableNodeCount",
incrementUnavailableNodeCountCase{
currentUnavailableNodeCount: 1,
expectedUnavailableNodeCount: 1,
previousEnactmentConditions: func(*shared.ConditionList, string) {},
expectedReconcileResult: ctrl.Result{RequeueAfter: nodeRunningUpdateRetryTime},
currentUnavailableNodeCount: 1,
expectedUnavailableNodeCount: 1,
previousEnactmentConditions: func(*shared.ConditionList, string) {},
expectedReconcileResult: ctrl.Result{RequeueAfter: nodeRunningUpdateRetryTime},
shouldUpdateUnavailableNodeCount: false,
}),
Entry("One node applying policy with Progressing enactment, should succeed incrementing UnavailableNodeCount",
incrementUnavailableNodeCountCase{
currentUnavailableNodeCount: 1,
expectedUnavailableNodeCount: 0,
previousEnactmentConditions: conditions.SetProgressing,
expectedReconcileResult: ctrl.Result{},
currentUnavailableNodeCount: 1,
expectedUnavailableNodeCount: 0,
previousEnactmentConditions: conditions.SetProgressing,
expectedReconcileResult: ctrl.Result{},
shouldUpdateUnavailableNodeCount: false,
}),
Entry("One node applying policy with Pending enactment, should conflict incrementing UnavailableNodeCount",
incrementUnavailableNodeCountCase{
currentUnavailableNodeCount: 1,
expectedUnavailableNodeCount: 1,
previousEnactmentConditions: conditions.SetPending,
expectedReconcileResult: ctrl.Result{RequeueAfter: nodeRunningUpdateRetryTime},
currentUnavailableNodeCount: 1,
expectedUnavailableNodeCount: 1,
previousEnactmentConditions: conditions.SetPending,
expectedReconcileResult: ctrl.Result{RequeueAfter: nodeRunningUpdateRetryTime},
shouldUpdateUnavailableNodeCount: false,
}),
)
})
10 changes: 10 additions & 0 deletions deploy/crds/nmstate.io_nodenetworkconfigurationpolicies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,11 @@ spec:
- type
type: object
type: array
lastUnavailableNodeCountUpdate:
description: LastUnavailableNodeCountUpdate is time of the last UnavailableNodeCount
update
format: date-time
type: string
unavailableNodeCount:
description: UnavailableNodeCount represents the total number of potentially
unavailable nodes that are processing a NodeNetworkConfigurationPolicy
Expand Down Expand Up @@ -176,6 +181,11 @@ spec:
- type
type: object
type: array
lastUnavailableNodeCountUpdate:
description: LastUnavailableNodeCountUpdate is time of the last UnavailableNodeCount
update
format: date-time
type: string
unavailableNodeCount:
description: UnavailableNodeCount represents the total number of potentially
unavailable nodes that are processing a NodeNetworkConfigurationPolicy
Expand Down
17 changes: 9 additions & 8 deletions pkg/helper/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,14 @@ var (
log = logf.Log.WithName("client")
)

const defaultGwRetrieveTimeout = 120 * time.Second
const defaultGwProbeTimeout = 120 * time.Second
const apiServerProbeTimeout = 120 * time.Second
const (
defaultGwProbeTimeout = 120 * time.Second
apiServerProbeTimeout = 120 * time.Second
// DesiredStateConfigurationTimeout doubles the default gw ping probe and API server
// connectivity check timeout to ensure the Checkpoint is alive before rolling it back
// https://nmstate.github.io/cli_guide#manual-transaction-control
DesiredStateConfigurationTimeout = (defaultGwProbeTimeout + apiServerProbeTimeout) * 2
)

func InitializeNodeNetworkState(client client.Client, node *corev1.Node) (*nmstatev1beta1.NodeNetworkState, error) {
ownerRefList := []metav1.OwnerReference{{Name: node.ObjectMeta.Name, Kind: "Node", APIVersion: "v1", UID: node.UID}}
Expand Down Expand Up @@ -104,11 +109,7 @@ func ApplyDesiredState(client client.Client, desiredState shared.State) (string,
// working fine after apply
probes := probe.Select(client)

// commit timeout doubles the default gw ping probe and check API server
// connectivity timeout, to
// ensure the Checkpoint is alive before rolling it back
// https://nmstate.github.io/cli_guide#manual-transaction-control
setOutput, err := nmstatectl.Set(desiredState, (defaultGwProbeTimeout+apiServerProbeTimeout)*2)
setOutput, err := nmstatectl.Set(desiredState, DesiredStateConfigurationTimeout)
if err != nil {
return setOutput, err
}
Expand Down
1 change: 1 addition & 0 deletions pkg/probe/probes.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ const (
defaultDnsProbeTimeout = 120 * time.Second
apiServerProbeTimeout = 120 * time.Second
nodeReadinessProbeTimeout = 120 * time.Second
ProbesTotalTimeout = defaultGwRetrieveTimeout + defaultDnsProbeTimeout + defaultDnsProbeTimeout + apiServerProbeTimeout + nodeReadinessProbeTimeout
)

func currentStateAsGJson() (gjson.Result, error) {
Expand Down

0 comments on commit b98a1cd

Please sign in to comment.