
Commit 0232e3b

*: Add a command to force fail a keeper
Add a `failkeeper` command to `stolonctl`. This command will force a keeper as "temporarily" failed. It's just a one-shot operation: the sentinel computes a new clusterdata considering the keeper as failed and then restores its state to the real one. This can be useful to force a new master keeper election. For example, if the force-failed keeper is a master, the sentinel will try to elect a new master; if no new master can be elected, the force-failed keeper, if really healthy, will be re-elected as master.
1 parent 90b1a1e commit 0232e3b

20 files changed: +271 -15 lines

cmd/sentinel/cmd/sentinel.go

+10

@@ -240,6 +240,11 @@ func (s *Sentinel) updateKeepersStatus(cd *cluster.ClusterData, keepersInfo clus
 	// Update keepers' healthy states
 	for _, k := range cd.Keepers {
 		healthy := s.isKeeperHealthy(cd, k)
+		if k.Status.ForceFail {
+			healthy = false
+			// reset ForceFail
+			k.Status.ForceFail = false
+		}
 		// set zero LastHealthyTime to time.Now() to avoid the keeper being
 		// removed since previous versions don't have it set
 		if k.Status.LastHealthyTime.IsZero() {
@@ -302,6 +307,11 @@ func (s *Sentinel) updateKeepersStatus(cd *cluster.ClusterData, keepersInfo clus
 	// Update dbs' healthy state
 	for _, db := range cd.DBs {
 		db.Status.Healthy = s.isDBHealthy(cd, db)
+		// if the keeper is unhealthy then also mark the db as unhealthy
+		keeper := cd.Keepers[db.Spec.KeeperUID]
+		if !keeper.Status.Healthy {
+			db.Status.Healthy = false
+		}
 	}
 
 	return cd, kihs

cmd/stolonctl/cmd/failkeeper.go

+74

@@ -0,0 +1,74 @@
+// Copyright 2018 Sorint.lab
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cmd
+
+import (
+	"context"
+
+	cmdcommon "github.com/sorintlab/stolon/cmd"
+	"github.com/spf13/cobra"
+)
+
+var failKeeperCmd = &cobra.Command{
+	Use:   "failkeeper [keeper uid]",
+	Short: `Force a keeper as "temporarily" failed. The sentinel will compute a new clusterdata considering it as failed and then restore its state to the real one.`,
+	Long:  `Force a keeper as "temporarily" failed. It's just a one-shot operation: the sentinel will compute a new clusterdata considering the keeper as failed and then restore its state to the real one. For example, if the force-failed keeper is a master, the sentinel will try to elect a new master. If no new master can be elected, the force-failed keeper, if really healthy, will be re-elected as master.`,
+	Run:   failKeeper,
+}
+
+func init() {
+	CmdStolonCtl.AddCommand(failKeeperCmd)
+}
+
+func failKeeper(cmd *cobra.Command, args []string) {
+	if len(args) > 1 {
+		die("too many arguments")
+	}
+
+	if len(args) == 0 {
+		die("keeper uid required")
+	}
+
+	keeperID := args[0]
+
+	store, err := cmdcommon.NewStore(&cfg.CommonConfig)
+	if err != nil {
+		die("%v", err)
+	}
+
+	cd, pair, err := getClusterData(store)
+	if err != nil {
+		die("cannot get cluster data: %v", err)
+	}
+	if cd.Cluster == nil {
+		die("no cluster spec available")
+	}
+	if cd.Cluster.Spec == nil {
+		die("no cluster spec available")
+	}
+
+	newCd := cd.DeepCopy()
+	keeperInfo := newCd.Keepers[keeperID]
+	if keeperInfo == nil {
+		die("keeper doesn't exist")
+	}
+
+	keeperInfo.Status.ForceFail = true
+
+	_, err = store.AtomicPutClusterData(context.TODO(), newCd, pair)
+	if err != nil {
+		die("cannot update cluster data: %v", err)
+	}
+}
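
For reference, a minimal invocation of the new command might look like the sketch below; the cluster name, store settings and keeper uid (`mycluster`, `etcdv3`, `http://127.0.0.1:2379`, `keeper0`) are placeholder values, not taken from this commit:

```
# Placeholder cluster/store settings and keeper uid; adjust to your deployment.
stolonctl --cluster-name mycluster \
  --store-backend=etcdv3 --store-endpoints=http://127.0.0.1:2379 \
  failkeeper keeper0
```

The flag set here is consumed (and reset) by the sentinel on its next keeper-status update, which is what makes the operation one-shot.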

doc/README.md

+1

@@ -19,6 +19,7 @@ We suggest that you first read the [Stolon Architecture and Requirements](archit
 * [Enabling pg_rewind](pg_rewind.md)
 * [Enabling synchronous replication](syncrepl.md)
 * [PostgreSQL SSL/TLS setup](ssl.md)
+* [Forcing a failover](forcefailover.md)
 
 ### Recipes

doc/commands/stolon-keeper.md

+1 -1

@@ -41,4 +41,4 @@ stolon-keeper [flags]
       --uid string   keeper uid (must be unique in the cluster and can contain only lower-case letters, numbers and the underscore character). If not provided a random uid will be generated.
 ```
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolon-proxy.md

+1 -1

@@ -34,4 +34,4 @@ stolon-proxy [flags]
       --tcp-keepalive-interval int   set tcp keepalive interval (seconds)
 ```
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolon-sentinel.md

+1 -1

@@ -29,4 +29,4 @@ stolon-sentinel [flags]
       --store-skip-tls-verify   skip store certificate verification (insecure!!!)
 ```
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl.md

+2 -1

@@ -33,6 +33,7 @@ stolonctl [flags]
 ### SEE ALSO
 
 * [stolonctl clusterdata](stolonctl_clusterdata.md) - Retrieve the current cluster data
+* [stolonctl failkeeper](stolonctl_failkeeper.md) - Force a keeper as "temporarily" failed. The sentinel will compute a new clusterdata considering it as failed and then restore its state to the real one.
 * [stolonctl init](stolonctl_init.md) - Initialize a new cluster
 * [stolonctl promote](stolonctl_promote.md) - Promotes a standby cluster to a primary cluster
 * [stolonctl removekeeper](stolonctl_removekeeper.md) - Removes keeper from cluster data
@@ -41,4 +42,4 @@ stolonctl [flags]
 * [stolonctl update](stolonctl_update.md) - Update a cluster specification
 * [stolonctl version](stolonctl_version.md) - Display the version
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_clusterdata.md

+1 -1

@@ -40,4 +40,4 @@ stolonctl clusterdata [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_failkeeper.md

+42

@@ -0,0 +1,42 @@
+## stolonctl failkeeper
+
+Force a keeper as "temporarily" failed. The sentinel will compute a new clusterdata considering it as failed and then restore its state to the real one.
+
+### Synopsis
+
+Force a keeper as "temporarily" failed. It's just a one-shot operation: the sentinel will compute a new clusterdata considering the keeper as failed and then restore its state to the real one. For example, if the force-failed keeper is a master, the sentinel will try to elect a new master. If no new master can be elected, the force-failed keeper, if really healthy, will be re-elected as master.
+
+```
+stolonctl failkeeper [keeper uid] [flags]
+```
+
+### Options
+
+```
+  -h, --help   help for failkeeper
+```
+
+### Options inherited from parent commands
+
+```
+      --cluster-name string             cluster name
+      --kube-context string             name of the kubeconfig context to use
+      --kube-namespace string           name of the kubernetes namespace to use
+      --kube-resource-kind string       the k8s resource kind to be used to store stolon clusterdata and do sentinel leader election (only "configmap" is currently supported)
+      --kubeconfig string               path to kubeconfig file. Overrides $KUBECONFIG
+      --log-level string                debug, info (default), warn or error (default "info")
+      --metrics-listen-address string   metrics listen address i.e "0.0.0.0:8080" (disabled by default)
+      --store-backend string            store backend type (etcdv2/etcd, etcdv3, consul or kubernetes)
+      --store-ca-file string            verify certificates of HTTPS-enabled store servers using this CA bundle
+      --store-cert-file string          certificate file for client identification to the store
+      --store-endpoints string          a comma-delimited list of store endpoints (use https scheme for tls communication) (defaults: http://127.0.0.1:2379 for etcd, http://127.0.0.1:8500 for consul)
+      --store-key string                private key file for client identification to the store
+      --store-prefix string             the store base prefix (default "stolon/cluster")
+      --store-skip-tls-verify           skip store certificate verification (insecure!!!)
+```
+
+### SEE ALSO
+
+* [stolonctl](stolonctl.md) - stolon command line client
+
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_init.md

+1 -1

@@ -41,4 +41,4 @@ stolonctl init [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_promote.md

+1 -1

@@ -40,4 +40,4 @@ stolonctl promote [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_removekeeper.md

+1 -1

@@ -39,4 +39,4 @@ stolonctl removekeeper [keeper uid] [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_spec.md

+1 -1

@@ -40,4 +40,4 @@ stolonctl spec [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_status.md

+1 -1

@@ -39,4 +39,4 @@ stolonctl status [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_update.md

+1 -1

@@ -41,4 +41,4 @@ stolonctl update [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/commands/stolonctl_version.md

+1 -1

@@ -39,4 +39,4 @@ stolonctl version [flags]
 
 * [stolonctl](stolonctl.md) - stolon command line client
 
-###### Auto generated by spf13/cobra on 19-Jul-2018
+###### Auto generated by spf13/cobra on 21-Aug-2018

doc/forcefailover.md

+11

@@ -0,0 +1,11 @@
+## Forcing a failover
+
+You can force a "master" keeper failover using the [stolonctl failkeeper](commands/stolonctl_failkeeper.md) command.
+
+This command forces a keeper as "temporarily" failed. It's just a one-shot operation: the sentinel will compute a new clusterdata considering the keeper as failed and then restore its state to the real one.
+
+For example, if the force-failed keeper is a master, the sentinel will try to elect a new master. If no new master can be elected, the force-failed keeper, if really healthy, will be re-elected as master.
+
+To avoid losing any transaction when using asynchronous replication take a look at this recipe:
+
+* [Manual switchover without transactions loss](manual_switchover.md)
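
As a concrete walk-through, a hypothetical session could look like the sketch below; the cluster name, store backend and keeper uid (`mycluster`, `etcdv3`, `keeper0`) are placeholders and not part of this commit:

```
# Find the current master keeper uid, then force-fail it (placeholder names).
stolonctl --cluster-name mycluster --store-backend=etcdv3 status
stolonctl --cluster-name mycluster --store-backend=etcdv3 failkeeper keeper0
# On its next check the sentinel considers keeper0 failed and tries to elect a
# new master; re-run status to verify which keeper was promoted.
stolonctl --cluster-name mycluster --store-backend=etcdv3 status
```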

doc/manual_switchover.md

+3 -2

@@ -2,6 +2,7 @@
 
 If for any reason (eg. maintenance) you want to switch the current master to another one without losing any transaction you can do this in these ways:
 
-* If you've synchronous replication enabled you can just stop the current master keeper, one of the synchronous standbys will be elected as the new master.
+* If you've synchronous replication enabled you can just stop/[forcefailover](forcefailover.md) the current master keeper, one of the synchronous standbys will be elected as the new master.
+
+* If you aren't using synchronous replication you can just temporarily enable it (see [here](syncrepl.md)), wait that the cluster reconfigures some synchronous standbys (you can monitor `pg_stat_replication` for a standby with `sync_state` = `sync`) and then stop/[forcefailover](forcefailover.md) the master keeper, wait for a new synchronous standby to be elected and disable synchronous replication.
 
-* If you aren't using synchronous replication you can just temporarily enable it (see [here](syncrepl.md)), wait that the cluster reconfigures some synchronous standbys (you can monitor `pg_stat_replication` for a standby with `sync_state` = `sync`) and then stop the master keeper, wait for a new synchronous standby to be elected and disable synchronous replication.
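
To check for a synchronous standby as this document suggests, a query along these lines can be run against the current master (the connection parameters here are placeholders):

```
# Shows each standby's replication mode; wait until one reports sync_state = 'sync'.
psql -h 127.0.0.1 -p 5432 -U postgres -c \
  "SELECT application_name, sync_state FROM pg_stat_replication;"
```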

internal/cluster/cluster.go

+2

@@ -544,6 +544,8 @@ type KeeperStatus struct {
 	BootUUID string `json:"bootUUID,omitempty"`
 
 	PostgresBinaryVersion PostgresBinaryVersion `json:"postgresBinaryVersion,omitempty"`
+
+	ForceFail bool `json:"forceFail,omitempty"`
 }
 
 type Keeper struct {
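
Since `ForceFail` is serialized with `omitempty`, it only appears in the stored clusterdata while a force-fail is pending. A quick way to inspect it is sketched below; this assumes the clusterdata JSON mirrors these struct tags (a `keepers` map whose entries carry a `status` object), and the cluster/store names are placeholders:

```
# Dump the cluster data and show each keeper's forceFail flag (null/absent when not set).
stolonctl --cluster-name mycluster --store-backend=etcdv3 clusterdata | \
  jq '.keepers | map_values(.status.forceFail)'
```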

tests/integration/ha_test.go

+115 -1

@@ -1559,7 +1559,6 @@ func testKeeperRemovalStolonCtl(t *testing.T, syncRepl bool) {
 
 	// remove master from the cluster data, must fail
 	err = StolonCtl(clusterName, tstore.storeBackend, storeEndpoints, "removekeeper", master.uid)
-	t.Logf("received err: %v", err)
 	if err == nil {
 		t.Fatalf("expected err")
 	}
@@ -1815,3 +1814,118 @@ func TestDisappearedKeeperData(t *testing.T) {
 		t.Fatalf("unexpected err: %v", err)
 	}
 }
+
+func testForceFail(t *testing.T, syncRepl bool, standbyCluster bool) {
+	dir, err := ioutil.TempDir("", "stolon")
+	if err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	defer os.RemoveAll(dir)
+
+	var ptk *TestKeeper
+	var primary *TestKeeper
+	if standbyCluster {
+		primaryClusterName := uuid.NewV4().String()
+		ptks, ptss, ptp, ptstore := setupServers(t, primaryClusterName, dir, 1, 1, false, false, nil)
+		defer shutdown(ptks, ptss, ptp, ptstore)
+		for _, ptk = range ptks {
+			break
+		}
+		primary = ptk
+	}
+
+	clusterName := uuid.NewV4().String()
+
+	tks, tss, tp, tstore := setupServers(t, clusterName, dir, 2, 1, syncRepl, false, ptk)
+	defer shutdown(tks, tss, tp, tstore)
+
+	storeEndpoints := fmt.Sprintf("%s:%s", tstore.listenAddress, tstore.port)
+	storePath := filepath.Join(common.StorePrefix, clusterName)
+	sm := store.NewKVBackedStore(tstore.store, storePath)
+
+	master, standbys := waitMasterStandbysReady(t, sm, tks)
+	standby := standbys[0]
+
+	if !standbyCluster {
+		primary = master
+	}
+
+	// a standby cluster will disable syncRepl since it's not possible to do sync repl on cascading standbys
+	if syncRepl && !standbyCluster {
+		if err := WaitClusterDataSynchronousStandbys([]string{standby.uid}, sm, 30*time.Second); err != nil {
+			t.Fatalf("expected synchronous standby on keeper %q in cluster data", standby.uid)
+		}
+	}
+
+	if err := populate(t, primary); err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	if err := write(t, primary, 1, 1); err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+
+	// get the primary/master XLogPos
+	xLogPos, err := GetXLogPos(primary)
+	if err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	// wait for the keepers to have reported their state
+	if err := WaitClusterSyncedXLogPos([]*TestKeeper{master, standby}, xLogPos, sm, 20*time.Second); err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+
+	// the proxy should connect to the right master
+	if err := tp.WaitRightMaster(master, 3*cluster.DefaultProxyCheckInterval); err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+
+	// mark master as failed
+	err = StolonCtl(clusterName, tstore.storeBackend, storeEndpoints, "failkeeper", master.uid)
+	if err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+
+	// Wait for cluster data containing standby as master
+	if err := WaitClusterDataMaster(standby.uid, sm, 30*time.Second); err != nil {
+		t.Fatalf("expected master %q in cluster view", standby.uid)
+	}
+	if err := standby.WaitDBRole(common.RoleMaster, ptk, 30*time.Second); err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	if !standbyCluster {
+		primary = standby
+	}
+
+	c, err := getLines(t, standby)
+	if err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+	if c != 1 {
+		t.Fatalf("wrong number of lines, want: %d, got: %d", 1, c)
+	}
+
+	// the proxy should connect to the right master
+	if err := tp.WaitRightMaster(standby, 3*cluster.DefaultProxyCheckInterval); err != nil {
+		t.Fatalf("unexpected err: %v", err)
+	}
+}
+
+func TestForceFail(t *testing.T) {
+	t.Parallel()
+	testForceFail(t, false, false)
+}
+
+func TestForceFailSyncRepl(t *testing.T) {
+	t.Parallel()
+	testForceFail(t, true, false)
+}
+
+func TestForceFailStandbyCluster(t *testing.T) {
+	t.Parallel()
+	testForceFail(t, false, true)
+}
+
+func TestForceFailSyncReplStandbyCluster(t *testing.T) {
+	t.Parallel()
+	testForceFail(t, false, true)
+}
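
The new tests run like any other stolon integration test. A sketch of the invocation, assuming the integration-test prerequisites (built stolon binaries, a reachable store, and any required environment variables described in the repository) are already in place:

```
# Run only the force-fail integration tests, verbosely; -run is a regex, so this
# also matches TestForceFailSyncRepl and the standby-cluster variants.
go test -v -run 'TestForceFail' ./tests/integration/
```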
