Skip to content

Commit 0232e3b

Browse files
committed
*: Add a command to force fail a keeper
Add a `failkeeper` command to `stolonctl`. This command will force a keeper as "temporarily" failed. It's just a one shot operation: the sentinel computes a new clusterdata considering the keeper as failed and then restores its state to the real one. This can be useful to force a new master keeper election. For example, if the force failed keeper is a master, the sentinel will try to elect a new master; if no new master can be elected, the force failed keeper, if really healthy, will be re-elected as master.
1 parent 90b1a1e commit 0232e3b

20 files changed

+271
-15
lines changed

cmd/sentinel/cmd/sentinel.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,11 @@ func (s *Sentinel) updateKeepersStatus(cd *cluster.ClusterData, keepersInfo clus
240240
// Update keepers' healthy states
241241
for _, k := range cd.Keepers {
242242
healthy := s.isKeeperHealthy(cd, k)
243+
if k.Status.ForceFail {
244+
healthy = false
245+
// reset ForceFail
246+
k.Status.ForceFail = false
247+
}
243248
// set zero LastHealthyTime to time.Now() to avoid the keeper being
244249
// removed since previous versions don't have it set
245250
if k.Status.LastHealthyTime.IsZero() {
@@ -302,6 +307,11 @@ func (s *Sentinel) updateKeepersStatus(cd *cluster.ClusterData, keepersInfo clus
302307
// Update dbs' healthy state
303308
for _, db := range cd.DBs {
304309
db.Status.Healthy = s.isDBHealthy(cd, db)
310+
// if the keeper is unhealthy then also mark the db as unhealthy
311+
keeper := cd.Keepers[db.Spec.KeeperUID]
312+
if !keeper.Status.Healthy {
313+
db.Status.Healthy = false
314+
}
305315
}
306316

307317
return cd, kihs

cmd/stolonctl/cmd/failkeeper.go

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
// Copyright 2018 Sorint.lab
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
package cmd
16+
17+
import (
18+
"context"
19+
20+
cmdcommon "github.com/sorintlab/stolon/cmd"
21+
"github.com/spf13/cobra"
22+
)
23+
24+
// failKeeperCmd defines the `stolonctl failkeeper [keeper uid]` subcommand.
// It is attached to CmdStolonCtl in init and dispatches to failKeeper.
var failKeeperCmd = &cobra.Command{
25+
Use: "failkeeper [keeper uid]",
26+
Short: `Force keeper as "temporarily" failed. The sentinel will compute a new clusterdata considering it as failed and then restore its state to the real one.`,
27+
Long: `Force keeper as "temporarily" failed. It's just a one shot operation, the sentinel will compute a new clusterdata considering the keeper as failed and then restore its state to the real one. For example, if the force failed keeper is a master, the sentinel will try to elect a new master. If no new master can be elected, the force failed keeper, if really healthy, will be re-elected as master`,
28+
Run: failKeeper,
29+
}
30+
31+
// init registers failKeeperCmd as a subcommand of CmdStolonCtl.
func init() {
32+
CmdStolonCtl.AddCommand(failKeeperCmd)
33+
}
34+
35+
// failKeeper is the Run handler for failKeeperCmd. It expects exactly one
// positional argument (the keeper uid), loads the current cluster data from
// the store, sets Status.ForceFail on that keeper in a copy of the data, and
// writes the copy back. Every failure path is reported through die.
// NOTE(review): die is defined elsewhere; presumably it prints the message
// and exits the process — confirm.
func failKeeper(cmd *cobra.Command, args []string) {
36+
// Validate the argument count: exactly one keeper uid is required.
if len(args) > 1 {
37+
die("too many arguments")
38+
}
39+
40+
if len(args) == 0 {
41+
die("keeper uid required")
42+
}
43+
44+
keeperID := args[0]
45+
46+
// Build the store client from the shared command-line configuration.
store, err := cmdcommon.NewStore(&cfg.CommonConfig)
47+
if err != nil {
48+
die("%v", err)
49+
}
50+
51+
// pair is returned together with the cluster data and handed back to
// AtomicPutClusterData below.
// NOTE(review): presumably it carries the stored version used for a
// compare-and-swap write — confirm against the store implementation.
cd, pair, err := getClusterData(store)
52+
if err != nil {
53+
die("cannot get cluster data: %v", err)
54+
}
55+
// Both a missing cluster and a missing spec are reported with the same
// message.
if cd.Cluster == nil {
56+
die("no cluster spec available")
57+
}
58+
if cd.Cluster.Spec == nil {
59+
die("no cluster spec available")
60+
}
61+
62+
// Modify a deep copy so the data that was read is left untouched.
newCd := cd.DeepCopy()
63+
keeperInfo := newCd.Keepers[keeperID]
64+
if keeperInfo == nil {
65+
die("keeper doesn't exist")
66+
}
67+
68+
// Only the flag is set here; the sentinel's updateKeepersStatus treats a
// keeper with ForceFail set as unhealthy and then resets the flag.
keeperInfo.Status.ForceFail = true
69+
70+
_, err = store.AtomicPutClusterData(context.TODO(), newCd, pair)
71+
if err != nil {
72+
die("cannot update cluster data: %v", err)
73+
}
74+
}

doc/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ We suggest that you first read the [Stolon Architecture and Requirements](archit
1919
* [Enabling pg_rewind](pg_rewind.md)
2020
* [Enabling synchronous replication](syncrepl.md)
2121
* [PostgreSQL SSL/TLS setup](ssl.md)
22+
* [Forcing a failover](forcefailover.md)
2223

2324
### Recipes
2425

doc/commands/stolon-keeper.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,4 @@ stolon-keeper [flags]
4141
--uid string keeper uid (must be unique in the cluster and can contain only lower-case letters, numbers and the underscore character). If not provided a random uid will be generated.
4242
```
4343

44-
###### Auto generated by spf13/cobra on 19-Jul-2018
44+
###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolon-proxy.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,4 @@ stolon-proxy [flags]
3434
--tcp-keepalive-interval int set tcp keepalive interval (seconds)
3535
```
3636

37-
###### Auto generated by spf13/cobra on 19-Jul-2018
37+
###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolon-sentinel.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,4 +29,4 @@ stolon-sentinel [flags]
2929
--store-skip-tls-verify skip store certificate verification (insecure!!!)
3030
```
3131

32-
###### Auto generated by spf13/cobra on 19-Jul-2018
32+
###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ stolonctl [flags]
3333
### SEE ALSO
3434

3535
* [stolonctl clusterdata](stolonctl_clusterdata.md) - Retrieve the current cluster data
36+
* [stolonctl failkeeper](stolonctl_failkeeper.md) - Force keeper as "temporarily" failed. The sentinel will compute a new clusterdata considering it as failed and then restore its state to the real one.
3637
* [stolonctl init](stolonctl_init.md) - Initialize a new cluster
3738
* [stolonctl promote](stolonctl_promote.md) - Promotes a standby cluster to a primary cluster
3839
* [stolonctl removekeeper](stolonctl_removekeeper.md) - Removes keeper from cluster data
@@ -41,4 +42,4 @@ stolonctl [flags]
4142
* [stolonctl update](stolonctl_update.md) - Update a cluster specification
4243
* [stolonctl version](stolonctl_version.md) - Display the version
4344

44-
###### Auto generated by spf13/cobra on 19-Jul-2018
45+
###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_clusterdata.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,4 +40,4 @@ stolonctl clusterdata [flags]
4040

4141
* [stolonctl](stolonctl.md) - stolon command line client
4242

43-
###### Auto generated by spf13/cobra on 19-Jul-2018
43+
###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_failkeeper.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
## stolonctl failkeeper
2+
3+
Force keeper as "temporarily" failed. The sentinel will compute a new clusterdata considering it as failed and then restore its state to the real one.
4+
5+
### Synopsis
6+
7+
Force keeper as "temporarily" failed. It's just a one shot operation, the sentinel will compute a new clusterdata considering the keeper as failed and then restore its state to the real one. For example, if the force failed keeper is a master, the sentinel will try to elect a new master. If no new master can be elected, the force failed keeper, if really healthy, will be re-elected as master
8+
9+
```
10+
stolonctl failkeeper [keeper uid] [flags]
11+
```
12+
13+
### Options
14+
15+
```
16+
-h, --help help for failkeeper
17+
```
18+
19+
### Options inherited from parent commands
20+
21+
```
22+
--cluster-name string cluster name
23+
--kube-context string name of the kubeconfig context to use
24+
--kube-namespace string name of the kubernetes namespace to use
25+
--kube-resource-kind string the k8s resource kind to be used to store stolon clusterdata and do sentinel leader election (only "configmap" is currently supported)
26+
--kubeconfig string path to kubeconfig file. Overrides $KUBECONFIG
27+
--log-level string debug, info (default), warn or error (default "info")
28+
--metrics-listen-address string metrics listen address i.e "0.0.0.0:8080" (disabled by default)
29+
--store-backend string store backend type (etcdv2/etcd, etcdv3, consul or kubernetes)
30+
--store-ca-file string verify certificates of HTTPS-enabled store servers using this CA bundle
31+
--store-cert-file string certificate file for client identification to the store
32+
--store-endpoints string a comma-delimited list of store endpoints (use https scheme for tls communication) (defaults: http://127.0.0.1:2379 for etcd, http://127.0.0.1:8500 for consul)
33+
--store-key string private key file for client identification to the store
34+
--store-prefix string the store base prefix (default "stolon/cluster")
35+
--store-skip-tls-verify skip store certificate verification (insecure!!!)
36+
```
37+
38+
### SEE ALSO
39+
40+
* [stolonctl](stolonctl.md) - stolon command line client
41+
42+
###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_init.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,4 +41,4 @@ stolonctl init [flags]
4141

4242
* [stolonctl](stolonctl.md) - stolon command line client
4343

44-
###### Auto generated by spf13/cobra on 19-Jul-2018
44+
###### Auto generated by spf13/cobra on 21-Aug-2018

0 commit comments

Comments
 (0)