
Commit 0232e3b

*: Add a command to force fail a keeper
Add a `failkeeper` command to `stolonctl`. This command will force a keeper as "temporarily" failed. It's just a one-shot operation: the sentinel computes a new clusterdata considering the keeper as failed and then restores its state to the real one. This can be useful to force a new master keeper election. For example, if the force-failed keeper is a master, the sentinel will try to elect a new master; if no new master can be elected, the force-failed keeper, if really healthy, will be re-elected as master.
1 parent 90b1a1e commit 0232e3b

20 files changed: +271 -15 lines

cmd/sentinel/cmd/sentinel.go

+10

@@ -240,6 +240,11 @@ func (s *Sentinel) updateKeepersStatus(cd *cluster.ClusterData, keepersInfo clus
 	// Update keepers' healthy states
 	for _, k := range cd.Keepers {
 		healthy := s.isKeeperHealthy(cd, k)
+		if k.Status.ForceFail {
+			healthy = false
+			// reset ForceFail
+			k.Status.ForceFail = false
+		}
 		// set zero LastHealthyTime to time.Now() to avoid the keeper being
 		// removed since previous versions don't have it set
 		if k.Status.LastHealthyTime.IsZero() {
@@ -302,6 +307,11 @@ func (s *Sentinel) updateKeepersStatus(cd *cluster.ClusterData, keepersInfo clus
 	// Update dbs' healthy state
 	for _, db := range cd.DBs {
 		db.Status.Healthy = s.isDBHealthy(cd, db)
+		// if the keeper is unhealthy then also mark the db as unhealthy
+		keeper := cd.Keepers[db.Spec.KeeperUID]
+		if !keeper.Status.Healthy {
+			db.Status.Healthy = false
+		}
 	}
 
 	return cd, kihs

cmd/stolonctl/cmd/failkeeper.go

+74

@@ -0,0 +1,74 @@
+// Copyright 2018 Sorint.lab
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+
+	cmdcommon "github.com/sorintlab/stolon/cmd"
+	"github.com/spf13/cobra"
+)
+
+var failKeeperCmd = &cobra.Command{
+	Use:   "failkeeper [keeper uid]",
+	Short: `Force a keeper as "temporarily" failed. The sentinel will compute a new clusterdata considering it as failed and then restore its state to the real one.`,
+	Long:  `Force a keeper as "temporarily" failed. It's just a one-shot operation: the sentinel will compute a new clusterdata considering the keeper as failed and then restore its state to the real one. For example, if the force-failed keeper is a master, the sentinel will try to elect a new master. If no new master can be elected, the force-failed keeper, if really healthy, will be re-elected as master.`,
+	Run:   failKeeper,
+}
+
+func init() {
+	CmdStolonCtl.AddCommand(failKeeperCmd)
+}
+
+func failKeeper(cmd *cobra.Command, args []string) {
+	if len(args) > 1 {
+		die("too many arguments")
+	}
+
+	if len(args) == 0 {
+		die("keeper uid required")
+	}
+
+	keeperID := args[0]
+
+	store, err := cmdcommon.NewStore(&cfg.CommonConfig)
+	if err != nil {
+		die("%v", err)
+	}
+
+	cd, pair, err := getClusterData(store)
+	if err != nil {
+		die("cannot get cluster data: %v", err)
+	}
+	if cd.Cluster == nil {
+		die("no cluster spec available")
+	}
+	if cd.Cluster.Spec == nil {
+		die("no cluster spec available")
+	}
+
+	newCd := cd.DeepCopy()
+	keeperInfo := newCd.Keepers[keeperID]
+	if keeperInfo == nil {
+		die("keeper doesn't exist")
+	}
+
+	keeperInfo.Status.ForceFail = true
+
+	_, err = store.AtomicPutClusterData(context.TODO(), newCd, pair)
+	if err != nil {
+		die("cannot update cluster data: %v", err)
+	}
+}
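
For reference, a minimal invocation of the new command might look like the sketch below; the cluster name, store settings and keeper uid (`mycluster`, `etcdv3`, `http://127.0.0.1:2379`, `keeper0`) are placeholder values, not taken from this commit:

```
# Placeholder cluster/store settings and keeper uid; adjust to your deployment.
stolonctl --cluster-name mycluster \
  --store-backend=etcdv3 --store-endpoints=http://127.0.0.1:2379 \
  failkeeper keeper0
```

The flag set here is consumed (and reset) by the sentinel on its next keeper-status update, which is what makes the operation one-shot.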

doc/README.md

+1

@@ -19,6 +19,7 @@ We suggest that you first read the [Stolon Architecture and Requirements](archit
 * [Enabling pg_rewind](pg_rewind.md)
 * [Enabling synchronous replication](syncrepl.md)
 * [PostgreSQL SSL/TLS setup](ssl.md)
+* [Forcing a failover](forcefailover.md)
 
 ### Recipes

doc/commands/stolon-keeper.md

+1 -1

@@ -41,4 +41,4 @@ stolon-keeper [flags]
       --uid string   keeper uid (must be unique in the cluster and can contain only lower-case letters, numbers and the underscore character). If not provided a random uid will be generated.
 ```
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolon-proxy.md

+1 -1

@@ -34,4 +34,4 @@ stolon-proxy [flags]
       --tcp-keepalive-interval int   set tcp keepalive interval (seconds)
 ```
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolon-sentinel.md

+1 -1

@@ -29,4 +29,4 @@ stolon-sentinel [flags]
       --store-skip-tls-verify   skip store certificate verification (insecure!!!)
 ```
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl.md

+2 -1

@@ -33,6 +33,7 @@ stolonctl [flags]
 ### SEE ALSO
 
 * [stolonctl clusterdata](stolonctl_clusterdata.md) - Retrieve the current cluster data
+* [stolonctl failkeeper](stolonctl_failkeeper.md) - Force a keeper as "temporarily" failed. The sentinel will compute a new clusterdata considering it as failed and then restore its state to the real one.
 * [stolonctl init](stolonctl_init.md) - Initialize a new cluster
 * [stolonctl promote](stolonctl_promote.md) - Promotes a standby cluster to a primary cluster
 * [stolonctl removekeeper](stolonctl_removekeeper.md) - Removes keeper from cluster data
@@ -41,4 +42,4 @@ stolonctl [flags]
 * [stolonctl update](stolonctl_update.md) - Update a cluster specification
 * [stolonctl version](stolonctl_version.md) - Display the version
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_clusterdata.md

+1 -1

@@ -40,4 +40,4 @@ stolonctl clusterdata [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_failkeeper.md

+42

@@ -0,0 +1,42 @@
+## stolonctl failkeeper
+
+Force a keeper as "temporarily" failed. The sentinel will compute a new clusterdata considering it as failed and then restore its state to the real one.
+
+### Synopsis
+
+Force a keeper as "temporarily" failed. It's just a one-shot operation: the sentinel will compute a new clusterdata considering the keeper as failed and then restore its state to the real one. For example, if the force-failed keeper is a master, the sentinel will try to elect a new master. If no new master can be elected, the force-failed keeper, if really healthy, will be re-elected as master.
+
+```
+stolonctl failkeeper [keeper uid] [flags]
+```
+
+### Options
+
+```
+  -h, --help   help for failkeeper
+```
+
+### Options inherited from parent commands
+
+```
+      --cluster-name string             cluster name
+      --kube-context string             name of the kubeconfig context to use
+      --kube-namespace string           name of the kubernetes namespace to use
+      --kube-resource-kind string       the k8s resource kind to be used to store stolon clusterdata and do sentinel leader election (only "configmap" is currently supported)
+      --kubeconfig string               path to kubeconfig file. Overrides $KUBECONFIG
+      --log-level string                debug, info (default), warn or error (default "info")
+      --metrics-listen-address string   metrics listen address i.e "0.0.0.0:8080" (disabled by default)
+      --store-backend string            store backend type (etcdv2/etcd, etcdv3, consul or kubernetes)
+      --store-ca-file string            verify certificates of HTTPS-enabled store servers using this CA bundle
+      --store-cert-file string          certificate file for client identification to the store
+      --store-endpoints string          a comma-delimited list of store endpoints (use https scheme for tls communication) (defaults: http://127.0.0.1:2379 for etcd, http://127.0.0.1:8500 for consul)
+      --store-key string                private key file for client identification to the store
+      --store-prefix string             the store base prefix (default "stolon/cluster")
+      --store-skip-tls-verify           skip store certificate verification (insecure!!!)
+```
+
+### SEE ALSO
+
+* [stolonctl](stolonctl.md) - stolon command line client
+
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_init.md

+1 -1

@@ -41,4 +41,4 @@ stolonctl init [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_promote.md

+1 -1

@@ -40,4 +40,4 @@ stolonctl promote [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_removekeeper.md

+1 -1

@@ -39,4 +39,4 @@ stolonctl removekeeper [keeper uid] [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_spec.md

+1 -1

@@ -40,4 +40,4 @@ stolonctl spec [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_status.md

+1 -1

@@ -39,4 +39,4 @@ stolonctl status [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_update.md

+1 -1

@@ -41,4 +41,4 @@ stolonctl update [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_version.md

+1 -1

@@ -39,4 +39,4 @@ stolonctl version [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/forcefailover.md

+11

@@ -0,0 +1,11 @@
+## Forcing a failover
+
+You can force a "master" keeper failover using the [stolonctl failkeeper](commands/stolonctl_failkeeper.md) command.
+
+This command forces a keeper as "temporarily" failed. It's just a one-shot operation: the sentinel will compute a new clusterdata considering the keeper as failed and then restore its state to the real one.
+
+For example, if the force-failed keeper is a master, the sentinel will try to elect a new master. If no new master can be elected, the force-failed keeper, if really healthy, will be re-elected as master.
+
+To avoid losing any transaction when using asynchronous replication take a look at this recipe:
+
+* [Manual switchover without transactions loss](manual_switchover.md)
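
As a concrete walk-through, a hypothetical session could look like the sketch below; the cluster name, store backend and keeper uid (`mycluster`, `etcdv3`, `keeper0`) are placeholders and not part of this commit:

```
# Find the current master keeper uid, then force-fail it (placeholder names).
stolonctl --cluster-name mycluster --store-backend=etcdv3 status
stolonctl --cluster-name mycluster --store-backend=etcdv3 failkeeper keeper0
# On its next check the sentinel considers keeper0 failed and tries to elect a
# new master; re-run status to verify which keeper was promoted.
stolonctl --cluster-name mycluster --store-backend=etcdv3 status
```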

doc/manual_switchover.md

+3 -2

@@ -2,6 +2,7 @@
 
 If for any reason (eg. maintenance) you want to switch the current master to another one without losing any transaction you can do this in these ways:
 
-* If you've synchronous replication enabled you can just stop the current master keeper, one of the synchronous standbys will be elected as the new master.
+* If you've synchronous replication enabled you can just stop/[forcefailover](forcefailover.md) the current master keeper, one of the synchronous standbys will be elected as the new master.
+
+* If you aren't using synchronous replication you can just temporarily enable it (see [here](syncrepl.md)), wait that the cluster reconfigures some synchronous standbys (you can monitor `pg_stat_replication` for a standby with `sync_state` = `sync`) and then stop/[forcefailover](forcefailover.md) the master keeper, wait for a new synchronous standby to be elected and disable synchronous replication.
 
-* If you aren't using synchronous replication you can just temporarily enable it (see [here](syncrepl.md)), wait that the cluster reconfigures some synchronous standbys (you can monitor `pg_stat_replication` for a standby with `sync_state` = `sync`) and then stop the master keeper, wait for a new synchronous standby to be elected and disable synchronous replication.
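
To check for a synchronous standby as this document suggests, a query along these lines can be run against the current master (the connection parameters here are placeholders):

```
# Shows each standby's replication mode; wait until one reports sync_state = 'sync'.
psql -h 127.0.0.1 -p 5432 -U postgres -c \
  "SELECT application_name, sync_state FROM pg_stat_replication;"
```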

internal/cluster/cluster.go

+2

@@ -544,6 +544,8 @@ type KeeperStatus struct {
 	BootUUID string `json:"bootUUID,omitempty"`
 
 	PostgresBinaryVersion PostgresBinaryVersion `json:"postgresBinaryVersion,omitempty"`
+
+	ForceFail bool `json:"forceFail,omitempty"`
 }
 
 type Keeper struct {
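
Since `ForceFail` is serialized with `omitempty`, it only appears in the stored clusterdata while a force-fail is pending. A quick way to inspect it is sketched below; this assumes the clusterdata JSON mirrors these struct tags (a `keepers` map whose entries carry a `status` object), and the cluster/store names are placeholders:

```
# Dump the cluster data and show each keeper's forceFail flag (null/absent when not set).
stolonctl --cluster-name mycluster --store-backend=etcdv3 clusterdata | \
  jq '.keepers | map_values(.status.forceFail)'
```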

tests/integration/ha_test.go

+115 -1

@@ -1559,7 +1559,6 @@ func testKeeperRemovalStolonCtl(t *testing.T, syncRepl bool) {
 
 	// remove master from the cluster data, must fail
 	err = StolonCtl(clusterName, tstore.storeBackend, storeEndpoints, "removekeeper", master.uid)
-	t.Logf("received err: %v", err)
 	if err == nil {
 		t.Fatalf("expected err")
 	}
@@ -1815,3 +1814,118 @@ func TestDisappearedKeeperData(t *testing.T) {
 		t.Fatalf("unexpected err: %v", err)
 	}
 }
+
+func testForceFail(t *testing.T, syncRepl bool, standbyCluster bool) {
+	dir, err := ioutil.TempDir("", "stolon")
+	if err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	var ptk *TestKeeper
+	var primary *TestKeeper
+	if standbyCluster {
+		primaryClusterName := uuid.NewV4().String()
+		ptks, ptss, ptp, ptstore := setupServers(t, primaryClusterName, dir, 1, 1, false, false, nil)
+		defer shutdown(ptks, ptss, ptp, ptstore)
+		for _, ptk = range ptks {
+			break
+		}
+		primary = ptk
+	}
+
+	clusterName := uuid.NewV4().String()
+
+	tks, tss, tp, tstore := setupServers(t, clusterName, dir, 2, 1, syncRepl, false, ptk)
+	defer shutdown(tks, tss, tp, tstore)
+
+	storeEndpoints := fmt.Sprintf("%s:%s", tstore.listenAddress, tstore.port)
+	storePath := filepath.Join(common.StorePrefix, clusterName)
+	sm := store.NewKVBackedStore(tstore.store, storePath)
+
+	master, standbys := waitMasterStandbysReady(t, sm, tks)
+	standby := standbys[0]
+
+	if !standbyCluster {
+		primary = master
+	}
+
+	// a standby cluster will disable syncRepl since it's not possible to do sync repl on cascading standbys
+	if syncRepl && !standbyCluster {
+		if err := WaitClusterDataSynchronousStandbys([]string{standby.uid}, sm, 30*time.Second); err != nil {
+			t.Fatalf("expected synchronous standby on keeper %q in cluster data", standby.uid)
+		}
+	}
+
+	if err := populate(t, primary); err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	if err := write(t, primary, 1, 1); err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+
+	// get the primary/master XLogPos
+	xLogPos, err := GetXLogPos(primary)
+	if err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	// wait for the keepers to have reported their state
+	if err := WaitClusterSyncedXLogPos([]*TestKeeper{master, standby}, xLogPos, sm, 20*time.Second); err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+
+	// the proxy should connect to the right master
+	if err := tp.WaitRightMaster(master, 3*cluster.DefaultProxyCheckInterval); err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+
+	// mark master as failed
+	err = StolonCtl(clusterName, tstore.storeBackend, storeEndpoints, "failkeeper", master.uid)
+	if err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+
+	// Wait for cluster data containing standby as master
+	if err := WaitClusterDataMaster(standby.uid, sm, 30*time.Second); err != nil {
+		t.Fatalf("expected master %q in cluster view", standby.uid)
+	}
+	if err := standby.WaitDBRole(common.RoleMaster, ptk, 30*time.Second); err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	if !standbyCluster {
+		primary = standby
+	}
+
+	c, err := getLines(t, standby)
+	if err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	if c != 1 {
+		t.Fatalf("wrong number of lines, want: %d, got: %d", 1, c)
+	}
+
+	// the proxy should connect to the right master
+	if err := tp.WaitRightMaster(standby, 3*cluster.DefaultProxyCheckInterval); err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+}
+
+func TestForceFail(t *testing.T) {
+	t.Parallel()
+	testForceFail(t, false, false)
+}
+
+func TestForceFailSyncRepl(t *testing.T) {
+	t.Parallel()
+	testForceFail(t, true, false)
+}
+
+func TestForceFailStandbyCluster(t *testing.T) {
+	t.Parallel()
+	testForceFail(t, false, true)
+}
+
+func TestForceFailSyncReplStandbyCluster(t *testing.T) {
+	t.Parallel()
+	testForceFail(t, false, true)
+}
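
The new tests run like any other stolon integration test. A sketch of the invocation, assuming the integration-test prerequisites (built stolon binaries, a reachable store, and any required environment variables described in the repository) are already in place:

```
# Run only the force-fail integration tests, verbosely; -run is a regex, so this
# also matches TestForceFailSyncRepl and the standby-cluster variants.
go test -v -run 'TestForceFail' ./tests/integration/
```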
