Skip to content

Commit 2c818a0

Browse files
committed
keeper: report error when local dbuid is not correct
If the local dbuid is different from the one required by the clusterdata but the init mode is none, something bad happened to the local keeper data. This should happen only when the local dbstate file has disappeared (ephemeral data dir, manual removal). In that case, don't just update it with the clusterdata one and continue; report an error and stop.
1 parent beb6ab2 commit 2c818a0

File tree

2 files changed

+107
-44
lines changed

2 files changed

+107
-44
lines changed

cmd/keeper/cmd/keeper.go

Lines changed: 24 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1052,40 +1052,15 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
10521052
}
10531053
}
10541054

1055-
initialized, err := pgm.IsInitialized()
1056-
if err != nil {
1057-
log.Errorw("failed to detect if instance is initialized", zap.Error(err))
1058-
return
1059-
}
1060-
1061-
if initialized {
1062-
var started bool
1063-
started, err = pgm.IsStarted()
1055+
if p.dbLocalState.UID != db.UID {
1056+
var initialized bool
1057+
initialized, err = pgm.IsInitialized()
10641058
if err != nil {
1065-
// log error getting instance state but go ahead.
1066-
log.Errorw("failed to retrieve instance status", zap.Error(err))
1067-
}
1068-
log.Debugw("db status", "initialized", true, "started", started)
1069-
} else {
1070-
log.Debugw("db status", "initialized", false, "started", false)
1071-
}
1072-
1073-
dbls = p.dbLocalStateCopy()
1074-
// if the db is initialized but there isn't a db local state then generate a new one
1075-
if initialized && dbls.UID == "" {
1076-
ndbls := &DBLocalState{
1077-
UID: common.UID(),
1078-
Generation: cluster.NoGeneration,
1079-
Initializing: false,
1080-
}
1081-
if err = p.saveDBLocalState(ndbls); err != nil {
1082-
log.Errorw("failed to save db local state", zap.Error(err))
1059+
log.Errorw("failed to detect if instance is initialized", zap.Error(err))
10831060
return
10841061
}
1085-
}
1062+
log.Infow("current db UID different than cluster data db UID", "db", p.dbLocalState.UID, "cdDB", db.UID)
10861063

1087-
if dbls.UID != db.UID {
1088-
log.Infow("current db UID different than cluster data db UID", "db", dbls.UID, "cdDB", db.UID)
10891064
switch db.Spec.InitMode {
10901065
case cluster.DBInitModeNew:
10911066
log.Infow("initializing the database cluster")
@@ -1125,7 +1100,6 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
11251100
log.Errorw("failed to initialize postgres database cluster", zap.Error(err))
11261101
return
11271102
}
1128-
initialized = true
11291103

11301104
if err = pgm.StartTmpMerged(); err != nil {
11311105
log.Errorw("failed to start instance", zap.Error(err))
@@ -1226,7 +1200,6 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
12261200
return
12271201
}
12281202
}
1229-
initialized = true
12301203

12311204
if err = pgm.StopIfStarted(true); err != nil {
12321205
log.Errorw("failed to stop pg instance", zap.Error(err))
@@ -1335,7 +1308,6 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
13351308
}
13361309
}
13371310
}
1338-
initialized = true
13391311

13401312
case cluster.DBInitModeExisting:
13411313
ndbls := &DBLocalState{
@@ -1384,24 +1356,32 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
13841356
return
13851357
}
13861358
case cluster.DBInitModeNone:
1387-
ndbls := &DBLocalState{
1388-
// replace our current db uid with the required one.
1389-
UID: db.UID,
1390-
// Set a no generation since we aren't already converged.
1391-
Generation: cluster.NoGeneration,
1392-
Initializing: false,
1393-
}
1394-
if err = p.saveDBLocalState(ndbls); err != nil {
1395-
log.Errorw("failed to save db local state", zap.Error(err))
1396-
return
1397-
}
1359+
log.Errorw("different local dbUID but init mode is none, this shouldn't happen. Something bad happened to the keeper data. Check that keeper data is on a persistent volume and that the keeper state files weren't removed")
13981360
return
13991361
default:
14001362
log.Errorw("unknown db init mode", "initMode", string(db.Spec.InitMode))
14011363
return
14021364
}
14031365
}
14041366

1367+
initialized, err := pgm.IsInitialized()
1368+
if err != nil {
1369+
log.Errorw("failed to detect if instance is initialized", zap.Error(err))
1370+
return
1371+
}
1372+
1373+
if initialized {
1374+
var started bool
1375+
started, err = pgm.IsStarted()
1376+
if err != nil {
1377+
// log error getting instance state but go ahead.
1378+
log.Errorw("failed to retrieve instance status", zap.Error(err))
1379+
}
1380+
log.Debugw("db status", "initialized", true, "started", started)
1381+
} else {
1382+
log.Debugw("db status", "initialized", false, "started", false)
1383+
}
1384+
14051385
// create postgres parameters
14061386
pgParameters = p.createPGParameters(db)
14071387
// update pgm postgres parameters

tests/integration/ha_test.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1710,3 +1710,86 @@ func TestStandbyCantSync(t *testing.T) {
17101710
t.Fatalf("expected different dbuid for standbys[0]: got the same: %q", newStandby0DBUID)
17111711
}
17121712
}
1713+
1714+
// TestDisappearedKeeperData tests that, if keeper data disappears (at least
1715+
// dbstate file is missing) and there's no init mode defined in the db spec, it'll
1716+
// return an error
1717+
func TestDisappearedKeeperData(t *testing.T) {
1718+
t.Parallel()
1719+
1720+
dir, err := ioutil.TempDir("", "stolon")
1721+
if err != nil {
1722+
t.Fatalf("unexpected err: %v", err)
1723+
}
1724+
defer os.RemoveAll(dir)
1725+
1726+
clusterName := uuid.NewV4().String()
1727+
1728+
tks, tss, tp, tstore := setupServers(t, clusterName, dir, 2, 1, false, false, nil)
1729+
defer shutdown(tks, tss, tp, tstore)
1730+
1731+
storePath := filepath.Join(common.StorePrefix, clusterName)
1732+
sm := store.NewKVBackedStore(tstore.store, storePath)
1733+
1734+
master, standbys := waitMasterStandbysReady(t, sm, tks)
1735+
standby := standbys[0]
1736+
1737+
if err := populate(t, master); err != nil {
1738+
t.Fatalf("unexpected err: %v", err)
1739+
}
1740+
if err := write(t, master, 1, 1); err != nil {
1741+
t.Fatalf("unexpected err: %v", err)
1742+
}
1743+
1744+
// get the master XLogPos
1745+
xLogPos, err := GetXLogPos(master)
1746+
if err != nil {
1747+
t.Fatalf("unexpected err: %v", err)
1748+
}
1749+
// wait for the keepers to have reported their state
1750+
if err := WaitClusterSyncedXLogPos([]*TestKeeper{master, standby}, xLogPos, sm, 20*time.Second); err != nil {
1751+
t.Fatalf("unexpected err: %v", err)
1752+
}
1753+
1754+
// the proxy should connect to the right master
1755+
if err := tp.WaitRightMaster(master, 3*cluster.DefaultProxyCheckInterval); err != nil {
1756+
t.Fatalf("unexpected err: %v", err)
1757+
}
1758+
1759+
// Stop the master keeper
1760+
t.Logf("Stopping current master keeper: %s", master.uid)
1761+
master.Stop()
1762+
1763+
// Remove master data
1764+
if err := os.RemoveAll(master.dataDir); err != nil {
1765+
t.Fatalf("unexpected err: %v", err)
1766+
}
1767+
1768+
// restart master
1769+
t.Logf("Starting current master keeper: %s", master.uid)
1770+
master.Start()
1771+
waitKeeperReady(t, sm, master)
1772+
1773+
// master shouldn't start its postgres instance and standby should be elected as new master
1774+
1775+
// Wait for cluster data containing standby as master
1776+
if err := WaitClusterDataMaster(standby.uid, sm, 30*time.Second); err != nil {
1777+
t.Fatalf("expected master %q in cluster view", standby.uid)
1778+
}
1779+
if err := standby.WaitDBRole(common.RoleMaster, nil, 30*time.Second); err != nil {
1780+
t.Fatalf("unexpected err: %v", err)
1781+
}
1782+
1783+
c, err := getLines(t, standby)
1784+
if err != nil {
1785+
t.Fatalf("unexpected err: %v", err)
1786+
}
1787+
if c != 1 {
1788+
t.Fatalf("wrong number of lines, want: %d, got: %d", 1, c)
1789+
}
1790+
1791+
// the proxy should connect to the right master
1792+
if err := tp.WaitRightMaster(standby, 3*cluster.DefaultProxyCheckInterval); err != nil {
1793+
t.Fatalf("unexpected err: %v", err)
1794+
}
1795+
}

0 commit comments

Comments
 (0)