@@ -20,6 +20,7 @@ import (
20
20
"fmt"
21
21
"math"
22
22
"math/rand"
23
+ "strings"
23
24
"sync"
24
25
"time"
25
26
@@ -47,14 +48,44 @@ type NodeKiller struct {
47
48
provider string
48
49
// killedNodes stores names of the nodes that have been killed by NodeKiller.
49
50
killedNodes sets.String
51
+ recorder * eventRecorder
52
+ }
53
+
54
// nodeAction names a kind of disruption that NodeKiller applies to a node.
type nodeAction string

// Actions that NodeKiller records. Every constant carries an explicit
// nodeAction type; without it a constant is an untyped string and silently
// loses the nodeAction type whenever it is stored in an interface value
// (for example when passed to a fmt function or compared via reflection).
const (
	// stopServices is recorded right before docker and kubelet are stopped on a node.
	stopServices nodeAction = "stopService"
	// rebootNode is recorded right before a node reboot is scheduled.
	rebootNode nodeAction = "rebootNode"
)
60
+
61
+ type event struct {
62
+ time time.Time
63
+ action nodeAction
64
+ nodeName string
65
+ }
66
+
67
+ type eventRecorder struct {
68
+ events []event
69
+ mux sync.Mutex
70
+ }
71
+
72
+ func newEventRecorder () * eventRecorder {
73
+ return & eventRecorder {[]event {}, sync.Mutex {}}
74
+ }
75
+
76
+ func (r * eventRecorder ) record (a nodeAction , nodeName string ) {
77
+ e := event {time .Now (), a , nodeName }
78
+ r .mux .Lock ()
79
+ r .events = append (r .events , e )
80
+ r .mux .Unlock ()
50
81
}
51
82
52
83
// NewNodeKiller creates new NodeKiller.
53
84
func NewNodeKiller (config api.NodeFailureConfig , client clientset.Interface , provider string ) (* NodeKiller , error ) {
54
85
if provider != "gce" && provider != "gke" {
55
86
return nil , fmt .Errorf ("provider %q is not supported by NodeKiller" , provider )
56
87
}
57
- return & NodeKiller {config , client , provider , sets .NewString ()}, nil
88
+ return & NodeKiller {config , client , provider , sets .NewString (), newEventRecorder () }, nil
58
89
}
59
90
60
91
// Run starts NodeKiller until stopCh is closed.
@@ -121,6 +152,7 @@ func (k *NodeKiller) kill(nodes []v1.Node) {
121
152
defer wg .Done ()
122
153
123
154
klog .Infof ("%s: Stopping docker and kubelet on %q to simulate failure" , k , node .Name )
155
+ k .addStopServicesEvent (node .Name )
124
156
err := util .SSH ("sudo systemctl stop docker kubelet" , & node , nil )
125
157
if err != nil {
126
158
klog .Errorf ("%s: ERROR while stopping node %q: %v" , k , node .Name , err )
@@ -141,6 +173,7 @@ func (k *NodeKiller) kill(nodes []v1.Node) {
141
173
// '> /dev/null 2> /dev/null < /dev/null' - File descriptor redirect, all three I/O to avoid ssh hanging,
142
174
// see https://web.archive.org/web/20090429074212/http://www.openssh.com/faq.html#3.10
143
175
// '&' - Execute command in background, end without waiting for result
176
+ k .addRebootEvent (node .Name )
144
177
err = util .SSH ("nohup sudo shutdown -r +1s > /dev/null 2> /dev/null < /dev/null &" , & node , nil )
145
178
if err != nil {
146
179
klog .Errorf ("%s: Error while rebooting node %q: %v" , k , node .Name , err )
@@ -151,6 +184,24 @@ func (k *NodeKiller) kill(nodes []v1.Node) {
151
184
wg .Wait ()
152
185
}
153
186
187
+ func (k * NodeKiller ) addStopServicesEvent (nodeName string ) {
188
+ k .recorder .record (stopServices , nodeName )
189
+ }
190
+
191
+ func (k * NodeKiller ) addRebootEvent (nodeName string ) {
192
+ k .recorder .record (rebootNode , nodeName )
193
+ }
194
+
195
+ // Summary logs NodeKiller execution
196
+ func (k * NodeKiller ) Summary () string {
197
+ var sb strings.Builder
198
+ sb .WriteString (fmt .Sprintf ("%s: Recorded following events\n " , k ))
199
+ for _ , e := range k .recorder .events {
200
+ sb .WriteString (fmt .Sprintf ("%s: At %v %v happend for node %s\n " , k , e .time .Format (time .UnixDate ), e .action , e .nodeName ))
201
+ }
202
+ return sb .String ()
203
+ }
204
+
154
205
// String implements fmt.Stringer. It returns a fixed component name, which
// is what the "%s" verb prints when a NodeKiller is passed to a format call.
func (k *NodeKiller) String() string {
	const name = "NodeKiller"
	return name
}
0 commit comments