diff --git a/pkg/rc/replication_controller.go b/pkg/rc/replication_controller.go
index 9626f2e67..bcc5b4a00 100644
--- a/pkg/rc/replication_controller.go
+++ b/pkg/rc/replication_controller.go
@@ -181,23 +181,28 @@ func (rc *replicationController) meetDesires() error {
 	if err != nil {
 		return err
 	}
+	eligible, err := rc.eligibleNodes()
+	if err != nil {
+		return err
+	}
 
 	rc.logger.NoFields().Infof("Currently on nodes %s", current)
 
 	nodesChanged := false
-	if rc.ReplicasDesired > len(current) {
-		err := rc.addPods(current)
+	switch {
+	case rc.ReplicasDesired > len(current):
+		err := rc.addPods(current, eligible)
 		if err != nil {
 			return err
 		}
 		nodesChanged = true
-	} else if len(current) > rc.ReplicasDesired {
-		err := rc.removePods(current)
+	case len(current) > rc.ReplicasDesired:
+		err := rc.removePods(current, eligible)
 		if err != nil {
 			return err
 		}
 		nodesChanged = true
-	} else {
+	default:
 		rc.logger.NoFields().Debugln("Taking no action")
 	}
 
@@ -208,15 +213,13 @@ func (rc *replicationController) meetDesires() error {
 		}
 	}
 
+	rc.checkForIneligible(current, eligible)
+
 	return rc.ensureConsistency(current)
 }
 
-func (rc *replicationController) addPods(current types.PodLocations) error {
+func (rc *replicationController) addPods(current types.PodLocations, eligible []types.NodeName) error {
 	currentNodes := current.Nodes()
-	eligible, err := rc.eligibleNodes()
-	if err != nil {
-		return err
-	}
 
 	// TODO: With Docker or runc we would not be constrained to running only once per node.
 	// So it may be the case that we need to make the Scheduler interface smarter and use it here.
@@ -320,12 +323,8 @@ func (rc *replicationController) alertInfo(msg string) alerting.AlertInfo {
 	}
 }
 
-func (rc *replicationController) removePods(current types.PodLocations) error {
+func (rc *replicationController) removePods(current types.PodLocations, eligible []types.NodeName) error {
 	currentNodes := current.Nodes()
-	eligible, err := rc.eligibleNodes()
-	if err != nil {
-		return err
-	}
 
 	// If we need to downsize the number of nodes, prefer any in current that are not eligible anymore.
 	// TODO: evaluate changes to 'eligible' more frequently
@@ -458,6 +457,32 @@ func (rc *replicationController) ensureConsistency(current types.PodLocations) e
 	return nil
 }
 
+func (rc *replicationController) checkForIneligible(current types.PodLocations, eligible []types.NodeName) {
+	// Check that the RC doesn't have any current nodes that are ineligible.
+	var ineligibleCurrent []types.NodeName
+	for _, currentPod := range current {
+		found := false
+		for _, eligibleNode := range eligible {
+			if eligibleNode == currentPod.Node {
+				found = true
+				break
+			}
+		}
+
+		if !found {
+			ineligibleCurrent = append(ineligibleCurrent, currentPod.Node)
+		}
+	}
+
+	if len(ineligibleCurrent) > 0 {
+		errMsg := fmt.Sprintf("RC has scheduled %d ineligible nodes: %s", len(ineligibleCurrent), ineligibleCurrent)
+		err := rc.alerter.Alert(rc.alertInfo(errMsg))
+		if err != nil {
+			rc.logger.WithError(err).Errorln("Unable to send alert")
+		}
+	}
+}
+
 func (rc *replicationController) eligibleNodes() ([]types.NodeName, error) {
 	rc.mu.Lock()
 	manifest := rc.Manifest
diff --git a/pkg/rc/replication_controller_test.go b/pkg/rc/replication_controller_test.go
index 041d4dd1d..e366578df 100644
--- a/pkg/rc/replication_controller_test.go
+++ b/pkg/rc/replication_controller_test.go
@@ -679,3 +679,137 @@ func TestUnscheduleMoreThan5(t *testing.T) {
 	close(quit)
 	wg.Wait()
 }
+
+func TestAlertIfNodeBecomesIneligible(t *testing.T) {
+	_, _, applicator, rc, alerter, _, closeFn := setup(t)
+	defer closeFn()
+
+	for i := 0; i < 7; i++ {
+		err := applicator.SetLabel(labels.NODE, fmt.Sprintf("node%d", i), "nodeQuality", "good")
+		if err != nil {
+			t.Fatal(err)
+		}
+	}
+
+	rc.ReplicasDesired = 7
+
+	err := rc.meetDesires()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	current, err := rc.CurrentPods()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(current) != 7 {
+		t.Fatalf("rc should have scheduled 7 pods but found %d", len(current))
+	}
+
+	if len(alerter.Alerts) != 0 {
+		t.Fatalf("there shouldn't have been any alerts yet but there were %d", len(alerter.Alerts))
+	}
+
+	// now make one of the nodes ineligible, creating a situation where the
+	// RC has 7 "current" nodes and 7 desired replicas, but only 6 of
+	// those nodes meet the node selector's criteria
+	err = applicator.SetLabel(labels.NODE, "node3", "nodeQuality", "bad")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	err = rc.meetDesires()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	current, err = rc.CurrentPods()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// There should still be 7 pods by design; we want to be paranoid about
+	// unscheduling. The operator should decrease the replica count if they
+	// want an unschedule to happen
+	if len(current) != 7 {
+		t.Fatalf("rc should still have 7 pods (even though only 6 are eligible) but found %d", len(current))
+	}
+
+	if len(alerter.Alerts) != 1 {
+		t.Fatalf("the RC should have alerted since replicas desired is greater than the number of eligible nodes, but there were %d alerts", len(alerter.Alerts))
+	}
+}
+
+// Tests that an RC will not do any scheduling/unscheduling if the only thing
+// that changes is the set of nodes that match the node selector. This might be
+// counter-intuitive, but we don't want an RC to risk an outage by swapping
+// pods. For example, imagine there is a single node in an RC, and that node
+// becomes ineligible and another node becomes eligible. We require that the
+// operator increase the RC replica count to 2 to deploy the eligible node, and
+// then (likely after some time has passed or some application-specific
+// conditions have been met) decrease the replica count back to 1 to unschedule
+// the ineligible node.
+func TestRCDoesNotFixMembership(t *testing.T) {
+	_, _, applicator, rc, alerter, _, closeFn := setup(t)
+	defer closeFn()
+
+	err := applicator.SetLabel(labels.NODE, "node1", "nodeQuality", "good")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	rc.ReplicasDesired = 1
+
+	err = rc.meetDesires()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	current, err := rc.CurrentPods()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(current) != 1 {
+		t.Fatalf("rc should have scheduled 1 pod but found %d", len(current))
+	}
+
+	if len(alerter.Alerts) != 0 {
+		t.Fatalf("there shouldn't have been any alerts yet but there were %d", len(alerter.Alerts))
+	}
+
+	// now mark node1 as ineligible and node2 as eligible. We want to test
+	// that the RC does not take any action because replicas desired ==
+	// len(current nodes)
+	err = applicator.SetLabel(labels.NODE, "node1", "nodeQuality", "bad")
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = applicator.SetLabel(labels.NODE, "node2", "nodeQuality", "good")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	err = rc.meetDesires()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	current, err = rc.CurrentPods()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(current) != 1 {
+		t.Fatalf("RC should have still only had 1 node but it had %d", len(current))
+	}
+
+	if current[0].Node != "node1" {
+		t.Fatalf("expected the RC to still consider node1 to be current, but the single node was %s", current[0].Node)
+	}
+
+	if len(alerter.Alerts) != 1 {
+		t.Fatalf("the RC should have alerted since it has some current nodes that aren't eligible and is unable to correct this. There were %d alerts", len(alerter.Alerts))
+	}
+}