@@ -18,35 +18,74 @@ package chaos
18
18
19
19
import (
20
20
"fmt"
21
+ "math"
21
22
"math/rand"
23
+ "strings"
22
24
"sync"
23
25
"time"
24
26
25
27
"k8s.io/perf-tests/clusterloader2/api"
28
+ "k8s.io/perf-tests/clusterloader2/pkg/framework/client"
26
29
"k8s.io/perf-tests/clusterloader2/pkg/util"
27
30
28
31
v1 "k8s.io/api/core/v1"
32
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
29
33
"k8s.io/apimachinery/pkg/util/sets"
30
34
"k8s.io/apimachinery/pkg/util/wait"
31
35
clientset "k8s.io/client-go/kubernetes"
32
36
"k8s.io/klog"
33
37
)
34
38
39
// Selectors used to locate the Prometheus pods whose nodes must not be killed.
const (
	// monitoringNamespace is the namespace that is searched for Prometheus pods.
	monitoringNamespace = "monitoring"
	// prometheusLabel is the label selector matching the Prometheus pods.
	prometheusLabel = "prometheus=k8s"
)
43
+
35
44
// NodeKiller is a utility to simulate node failures.
36
45
type NodeKiller struct {
37
46
config api.NodeFailureConfig
38
47
client clientset.Interface
39
48
provider string
40
49
// killedNodes stores names of the nodes that have been killed by NodeKiller.
41
50
killedNodes sets.String
51
+ recorder * eventRecorder
52
+ }
53
+
54
// nodeAction identifies the kind of operation NodeKiller performs on a node.
type nodeAction string

// Every constant carries the explicit nodeAction type. Without it the second
// entry would be an untyped string constant instead of a nodeAction.
const (
	// stopServices is recorded when services are stopped on a node.
	stopServices nodeAction = "stopService"
	// rebootNode is recorded when a node reboot is issued.
	rebootNode nodeAction = "rebootNode"
)
60
+
61
+ type event struct {
62
+ time time.Time
63
+ action nodeAction
64
+ nodeName string
65
+ }
66
+
67
+ type eventRecorder struct {
68
+ events []event
69
+ mux sync.Mutex
70
+ }
71
+
72
+ func newEventRecorder () * eventRecorder {
73
+ return & eventRecorder {[]event {}, sync.Mutex {}}
74
+ }
75
+
76
+ func (r * eventRecorder ) record (a nodeAction , nodeName string ) {
77
+ e := event {time .Now (), a , nodeName }
78
+ r .mux .Lock ()
79
+ r .events = append (r .events , e )
80
+ r .mux .Unlock ()
42
81
}
43
82
44
83
// NewNodeKiller creates new NodeKiller.
45
84
func NewNodeKiller (config api.NodeFailureConfig , client clientset.Interface , provider string ) (* NodeKiller , error ) {
46
85
if provider != "gce" && provider != "gke" {
47
86
return nil , fmt .Errorf ("provider %q is not supported by NodeKiller" , provider )
48
87
}
49
- return & NodeKiller {config , client , provider , sets .NewString ()}, nil
88
+ return & NodeKiller {config , client , provider , sets .NewString (), newEventRecorder () }, nil
50
89
}
51
90
52
91
// Run starts NodeKiller until stopCh is closed.
@@ -68,18 +107,37 @@ func (k *NodeKiller) pickNodes() ([]v1.Node, error) {
68
107
if err != nil {
69
108
return nil , err
70
109
}
110
+
111
+ prometheusPods , err := client .ListPodsWithOptions (k .client , monitoringNamespace , metav1.ListOptions {
112
+ LabelSelector : prometheusLabel ,
113
+ })
114
+ if err != nil {
115
+ return nil , err
116
+ }
117
+ nodesHasPrometheusPod := sets .NewString ()
118
+ for i := range prometheusPods {
119
+ if prometheusPods [i ].Spec .NodeName != "" {
120
+ nodesHasPrometheusPod .Insert (prometheusPods [i ].Spec .NodeName )
121
+ klog .Infof ("%s: Node %s removed from killing. Runs pod %s" , k , prometheusPods [i ].Spec .NodeName , prometheusPods [i ].Name )
122
+ }
123
+ }
124
+
71
125
nodes := allNodes [:0 ]
72
126
for _ , node := range allNodes {
73
- if ! k .killedNodes .Has (node .Name ) {
127
+ if ! nodesHasPrometheusPod . Has ( node . Name ) && ! k .killedNodes .Has (node .Name ) {
74
128
nodes = append (nodes , node )
75
129
}
76
130
}
77
131
rand .Shuffle (len (nodes ), func (i , j int ) {
78
132
nodes [i ], nodes [j ] = nodes [j ], nodes [i ]
79
133
})
80
- numNodes := int (k .config .FailureRate * float64 (len (nodes )))
134
+ numNodes := int (math .Ceil (k .config .FailureRate * float64 (len (nodes ))))
135
+ klog .Infof ("%s: %d nodes available, wants to fail %d nodes" , k , len (nodes ), numNodes )
81
136
if len (nodes ) > numNodes {
82
- return nodes [:numNodes ], nil
137
+ nodes = nodes [:numNodes ]
138
+ }
139
+ for _ , node := range nodes {
140
+ klog .Infof ("%s: Node %q schedule for failure" , k , node .Name )
83
141
}
84
142
return nodes , nil
85
143
}
@@ -94,6 +152,7 @@ func (k *NodeKiller) kill(nodes []v1.Node) {
94
152
defer wg .Done ()
95
153
96
154
klog .Infof ("%s: Stopping docker and kubelet on %q to simulate failure" , k , node .Name )
155
+ k .addStopServicesEvent (node .Name )
97
156
err := util .SSH ("sudo systemctl stop docker kubelet" , & node , nil )
98
157
if err != nil {
99
158
klog .Errorf ("%s: ERROR while stopping node %q: %v" , k , node .Name , err )
@@ -103,7 +162,19 @@ func (k *NodeKiller) kill(nodes []v1.Node) {
103
162
time .Sleep (time .Duration (k .config .SimulatedDowntime ))
104
163
105
164
klog .Infof ("%s: Rebooting %q to repair the node" , k , node .Name )
106
- err = util .SSH ("sudo reboot" , & node , nil )
165
+ // Scheduling a reboot in one second, then disconnecting.
166
+ //
167
+ // Bash command explanation:
168
+ // 'nohup' - Making sure that end of SSH connection signal will not break sudo
169
+ // 'sudo' - Elevated privileges, required by 'shutdown'
170
+ // 'shutdown' - Control machine power
171
+ // '-r' - Making 'shutdown' to reboot, instead of power-off
172
+ // '+1s' - Parameter to 'shutdown', to wait 1 second before rebooting.
173
+ // '> /dev/null 2> /dev/null < /dev/null' - File descriptor redirect, all three I/O to avoid ssh hanging,
174
+ // see https://web.archive.org/web/20090429074212/http://www.openssh.com/faq.html#3.10
175
+ // '&' - Execute command in background, end without waiting for result
176
+ k .addRebootEvent (node .Name )
177
+ err = util .SSH ("nohup sudo shutdown -r +1s > /dev/null 2> /dev/null < /dev/null &" , & node , nil )
107
178
if err != nil {
108
179
klog .Errorf ("%s: Error while rebooting node %q: %v" , k , node .Name , err )
109
180
return
@@ -113,6 +184,24 @@ func (k *NodeKiller) kill(nodes []v1.Node) {
113
184
wg .Wait ()
114
185
}
115
186
187
+ func (k * NodeKiller ) addStopServicesEvent (nodeName string ) {
188
+ k .recorder .record (stopServices , nodeName )
189
+ }
190
+
191
+ func (k * NodeKiller ) addRebootEvent (nodeName string ) {
192
+ k .recorder .record (rebootNode , nodeName )
193
+ }
194
+
195
+ // Summary logs NodeKiller execution
196
+ func (k * NodeKiller ) Summary () string {
197
+ var sb strings.Builder
198
+ sb .WriteString (fmt .Sprintf ("%s: Recorded following events\n " , k ))
199
+ for _ , e := range k .recorder .events {
200
+ sb .WriteString (fmt .Sprintf ("%s: At %v %v happend for node %s\n " , k , e .time .Format (time .UnixDate ), e .action , e .nodeName ))
201
+ }
202
+ return sb .String ()
203
+ }
204
+
116
205
// String returns the fixed name "NodeKiller", which the log messages in this
// file use as their prefix.
func (k *NodeKiller) String() string {
	const name = "NodeKiller"
	return name
}
0 commit comments