Skip to content

Commit 0188c4d

Browse files
authored
Merge pull request #1122 from jprzychodzen/release-1.16-nk-backport
Cherry-pick NodeKiller changes
2 parents 4baf007 + c97704f commit 0188c4d

File tree

5 files changed

+127
-7
lines changed

5 files changed

+127
-7
lines changed

clusterloader2/pkg/chaos/monkey.go

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,9 @@ limitations under the License.
1717
package chaos
1818

1919
import (
20+
"fmt"
21+
"strings"
22+
2023
clientset "k8s.io/client-go/kubernetes"
2124
"k8s.io/perf-tests/clusterloader2/api"
2225
)
@@ -47,3 +50,13 @@ func (m *Monkey) Init(config api.ChaosMonkeyConfig, stopCh <-chan struct{}) erro
4750

4851
return nil
4952
}
53+
54+
// Summary logs Monkey execution
55+
func (m *Monkey) Summary() string {
56+
var sb strings.Builder
57+
if m.nodeKiller != nil {
58+
sb.WriteString(fmt.Sprintf("Summary of Chaos Monkey execution\n"))
59+
sb.WriteString(m.nodeKiller.Summary())
60+
}
61+
return sb.String()
62+
}

clusterloader2/pkg/chaos/nodes.go

Lines changed: 94 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -18,35 +18,74 @@ package chaos
1818

1919
import (
2020
"fmt"
21+
"math"
2122
"math/rand"
23+
"strings"
2224
"sync"
2325
"time"
2426

2527
"k8s.io/perf-tests/clusterloader2/api"
28+
"k8s.io/perf-tests/clusterloader2/pkg/framework/client"
2629
"k8s.io/perf-tests/clusterloader2/pkg/util"
2730

2831
v1 "k8s.io/api/core/v1"
32+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2933
"k8s.io/apimachinery/pkg/util/sets"
3034
"k8s.io/apimachinery/pkg/util/wait"
3135
clientset "k8s.io/client-go/kubernetes"
3236
"k8s.io/klog"
3337
)
3438

39+
const (
40+
monitoringNamespace = "monitoring"
41+
prometheusLabel = "prometheus=k8s"
42+
)
43+
3544
// NodeKiller is a utility to simulate node failures.
3645
type NodeKiller struct {
3746
config api.NodeFailureConfig
3847
client clientset.Interface
3948
provider string
4049
// killedNodes stores names of the nodes that have been killed by NodeKiller.
4150
killedNodes sets.String
51+
recorder *eventRecorder
52+
}
53+
54+
type nodeAction string
55+
56+
const (
57+
stopServices nodeAction = "stopService"
58+
rebootNode = "rebootNode"
59+
)
60+
61+
type event struct {
62+
time time.Time
63+
action nodeAction
64+
nodeName string
65+
}
66+
67+
type eventRecorder struct {
68+
events []event
69+
mux sync.Mutex
70+
}
71+
72+
func newEventRecorder() *eventRecorder {
73+
return &eventRecorder{[]event{}, sync.Mutex{}}
74+
}
75+
76+
func (r *eventRecorder) record(a nodeAction, nodeName string) {
77+
e := event{time.Now(), a, nodeName}
78+
r.mux.Lock()
79+
r.events = append(r.events, e)
80+
r.mux.Unlock()
4281
}
4382

4483
// NewNodeKiller creates new NodeKiller.
4584
func NewNodeKiller(config api.NodeFailureConfig, client clientset.Interface, provider string) (*NodeKiller, error) {
4685
if provider != "gce" && provider != "gke" {
4786
return nil, fmt.Errorf("provider %q is not supported by NodeKiller", provider)
4887
}
49-
return &NodeKiller{config, client, provider, sets.NewString()}, nil
88+
return &NodeKiller{config, client, provider, sets.NewString(), newEventRecorder()}, nil
5089
}
5190

5291
// Run starts NodeKiller until stopCh is closed.
@@ -68,18 +107,37 @@ func (k *NodeKiller) pickNodes() ([]v1.Node, error) {
68107
if err != nil {
69108
return nil, err
70109
}
110+
111+
prometheusPods, err := client.ListPodsWithOptions(k.client, monitoringNamespace, metav1.ListOptions{
112+
LabelSelector: prometheusLabel,
113+
})
114+
if err != nil {
115+
return nil, err
116+
}
117+
nodesHasPrometheusPod := sets.NewString()
118+
for i := range prometheusPods {
119+
if prometheusPods[i].Spec.NodeName != "" {
120+
nodesHasPrometheusPod.Insert(prometheusPods[i].Spec.NodeName)
121+
klog.Infof("%s: Node %s removed from killing. Runs pod %s", k, prometheusPods[i].Spec.NodeName, prometheusPods[i].Name)
122+
}
123+
}
124+
71125
nodes := allNodes[:0]
72126
for _, node := range allNodes {
73-
if !k.killedNodes.Has(node.Name) {
127+
if !nodesHasPrometheusPod.Has(node.Name) && !k.killedNodes.Has(node.Name) {
74128
nodes = append(nodes, node)
75129
}
76130
}
77131
rand.Shuffle(len(nodes), func(i, j int) {
78132
nodes[i], nodes[j] = nodes[j], nodes[i]
79133
})
80-
numNodes := int(k.config.FailureRate * float64(len(nodes)))
134+
numNodes := int(math.Ceil(k.config.FailureRate * float64(len(nodes))))
135+
klog.Infof("%s: %d nodes available, wants to fail %d nodes", k, len(nodes), numNodes)
81136
if len(nodes) > numNodes {
82-
return nodes[:numNodes], nil
137+
nodes = nodes[:numNodes]
138+
}
139+
for _, node := range nodes {
140+
klog.Infof("%s: Node %q schedule for failure", k, node.Name)
83141
}
84142
return nodes, nil
85143
}
@@ -94,6 +152,7 @@ func (k *NodeKiller) kill(nodes []v1.Node) {
94152
defer wg.Done()
95153

96154
klog.Infof("%s: Stopping docker and kubelet on %q to simulate failure", k, node.Name)
155+
k.addStopServicesEvent(node.Name)
97156
err := util.SSH("sudo systemctl stop docker kubelet", &node, nil)
98157
if err != nil {
99158
klog.Errorf("%s: ERROR while stopping node %q: %v", k, node.Name, err)
@@ -103,7 +162,19 @@ func (k *NodeKiller) kill(nodes []v1.Node) {
103162
time.Sleep(time.Duration(k.config.SimulatedDowntime))
104163

105164
klog.Infof("%s: Rebooting %q to repair the node", k, node.Name)
106-
err = util.SSH("sudo reboot", &node, nil)
165+
// Scheduling a reboot in one second, then disconnecting.
166+
//
167+
// Bash command explanation:
168+
// 'nohup' - Making sure that end of SSH connection signal will not break sudo
169+
// 'sudo' - Elevated priviliages, required by 'shutdown'
170+
// 'shutdown' - Control machine power
171+
// '-r' - Making 'shutdown' to reboot, instead of power-off
172+
// '+1s' - Parameter to 'reboot', to wait 1 second before rebooting.
173+
// '> /dev/null 2> /dev/null < /dev/null' - File descriptor redirect, all three I/O to avoid ssh hanging,
174+
// see https://web.archive.org/web/20090429074212/http://www.openssh.com/faq.html#3.10
175+
// '&' - Execute command in background, end without waiting for result
176+
k.addRebootEvent(node.Name)
177+
err = util.SSH("nohup sudo shutdown -r +1s > /dev/null 2> /dev/null < /dev/null &", &node, nil)
107178
if err != nil {
108179
klog.Errorf("%s: Error while rebooting node %q: %v", k, node.Name, err)
109180
return
@@ -113,6 +184,24 @@ func (k *NodeKiller) kill(nodes []v1.Node) {
113184
wg.Wait()
114185
}
115186

187+
func (k *NodeKiller) addStopServicesEvent(nodeName string) {
188+
k.recorder.record(stopServices, nodeName)
189+
}
190+
191+
func (k *NodeKiller) addRebootEvent(nodeName string) {
192+
k.recorder.record(rebootNode, nodeName)
193+
}
194+
195+
// Summary logs NodeKiller execution
196+
func (k *NodeKiller) Summary() string {
197+
var sb strings.Builder
198+
sb.WriteString(fmt.Sprintf("%s: Recorded following events\n", k))
199+
for _, e := range k.recorder.events {
200+
sb.WriteString(fmt.Sprintf("%s: At %v %v happend for node %s\n", k, e.time.Format(time.UnixDate), e.action, e.nodeName))
201+
}
202+
return sb.String()
203+
}
204+
116205
func (k *NodeKiller) String() string {
117206
return "NodeKiller"
118207
}

clusterloader2/pkg/framework/client/objects.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,23 @@ func RetryFunction(f func() error, options ...*ApiCallOptions) wait.ConditionFun
131131
}
132132
}
133133

134+
// ListPodsWithOptions lists the pods using the provided options.
135+
func ListPodsWithOptions(c clientset.Interface, namespace string, listOpts metav1.ListOptions) ([]apiv1.Pod, error) {
136+
var pods []apiv1.Pod
137+
listFunc := func() error {
138+
podsList, err := c.CoreV1().Pods(namespace).List(listOpts)
139+
if err != nil {
140+
return err
141+
}
142+
pods = podsList.Items
143+
return nil
144+
}
145+
if err := RetryWithExponentialBackOff(RetryFunction(listFunc)); err != nil {
146+
return pods, err
147+
}
148+
return pods, nil
149+
}
150+
134151
// ListNodes returns list of cluster nodes.
135152
func ListNodes(c clientset.Interface) ([]apiv1.Node, error) {
136153
return ListNodesWithOptions(c, metav1.ListOptions{})

clusterloader2/pkg/test/simple_test_executor.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -102,6 +102,7 @@ func (ste *simpleTestExecutor) ExecuteTest(ctx Context, conf *api.Config) *error
102102
}
103103
}
104104
}
105+
klog.Infof(ctx.GetChaosMonkey().Summary())
105106
return errList
106107
}
107108

clusterloader2/testing/load/config.yaml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,8 @@ tuningSets:
6464
chaosMonkey:
6565
nodeFailure:
6666
failureRate: 0.01
67-
interval: 1m
68-
jitterFactor: 10.0
67+
interval: 5m
68+
jitterFactor: 2.0
6969
simulatedDowntime: 10m
7070
{{end}}
7171
steps:

0 commit comments

Comments
 (0)