Skip to content

Commit 0299758

Browse files
committed
Merge pull request sorintlab#510 from sgotti/keeper_report_error_when_local_dbuid_is_not_correct
keeper: report error when local dbuid is not correct
2 parents 9d34b4e + 2c818a0 commit 0299758

File tree

2 files changed

+107
-44
lines changed

2 files changed

+107
-44
lines changed

cmd/keeper/cmd/keeper.go

Lines changed: 24 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1052,40 +1052,15 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
10521052
}
10531053
}
10541054

1055-
initialized, err := pgm.IsInitialized()
1056-
if err != nil {
1057-
log.Errorw("failed to detect if instance is initialized", zap.Error(err))
1058-
return
1059-
}
1060-
1061-
if initialized {
1062-
var started bool
1063-
started, err = pgm.IsStarted()
1055+
if p.dbLocalState.UID != db.UID {
1056+
var initialized bool
1057+
initialized, err = pgm.IsInitialized()
10641058
if err != nil {
1065-
// log error getting instance state but go ahead.
1066-
log.Errorw("failed to retrieve instance status", zap.Error(err))
1067-
}
1068-
log.Debugw("db status", "initialized", true, "started", started)
1069-
} else {
1070-
log.Debugw("db status", "initialized", false, "started", false)
1071-
}
1072-
1073-
dbls = p.dbLocalStateCopy()
1074-
// if the db is initialized but there isn't a db local state then generate a new one
1075-
if initialized && dbls.UID == "" {
1076-
ndbls := &DBLocalState{
1077-
UID: common.UID(),
1078-
Generation: cluster.NoGeneration,
1079-
Initializing: false,
1080-
}
1081-
if err = p.saveDBLocalState(ndbls); err != nil {
1082-
log.Errorw("failed to save db local state", zap.Error(err))
1059+
log.Errorw("failed to detect if instance is initialized", zap.Error(err))
10831060
return
10841061
}
1085-
}
1062+
log.Infow("current db UID different than cluster data db UID", "db", p.dbLocalState.UID, "cdDB", db.UID)
10861063

1087-
if dbls.UID != db.UID {
1088-
log.Infow("current db UID different than cluster data db UID", "db", dbls.UID, "cdDB", db.UID)
10891064
switch db.Spec.InitMode {
10901065
case cluster.DBInitModeNew:
10911066
log.Infow("initializing the database cluster")
@@ -1125,7 +1100,6 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
11251100
log.Errorw("failed to initialize postgres database cluster", zap.Error(err))
11261101
return
11271102
}
1128-
initialized = true
11291103

11301104
if err = pgm.StartTmpMerged(); err != nil {
11311105
log.Errorw("failed to start instance", zap.Error(err))
@@ -1226,7 +1200,6 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
12261200
return
12271201
}
12281202
}
1229-
initialized = true
12301203

12311204
if err = pgm.StopIfStarted(true); err != nil {
12321205
log.Errorw("failed to stop pg instance", zap.Error(err))
@@ -1335,7 +1308,6 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
13351308
}
13361309
}
13371310
}
1338-
initialized = true
13391311

13401312
case cluster.DBInitModeExisting:
13411313
ndbls := &DBLocalState{
@@ -1384,24 +1356,32 @@ func (p *PostgresKeeper) postgresKeeperSM(pctx context.Context) {
13841356
return
13851357
}
13861358
case cluster.DBInitModeNone:
1387-
ndbls := &DBLocalState{
1388-
// replace our current db uid with the required one.
1389-
UID: db.UID,
1390-
// Set a no generation since we aren't already converged.
1391-
Generation: cluster.NoGeneration,
1392-
Initializing: false,
1393-
}
1394-
if err = p.saveDBLocalState(ndbls); err != nil {
1395-
log.Errorw("failed to save db local state", zap.Error(err))
1396-
return
1397-
}
1359+
log.Errorw("different local dbUID but init mode is none, this shouldn't happen. Something bad happened to the keeper data. Check that keeper data is on a persistent volume and that the keeper state files weren't removed")
13981360
return
13991361
default:
14001362
log.Errorw("unknown db init mode", "initMode", string(db.Spec.InitMode))
14011363
return
14021364
}
14031365
}
14041366

1367+
initialized, err := pgm.IsInitialized()
1368+
if err != nil {
1369+
log.Errorw("failed to detect if instance is initialized", zap.Error(err))
1370+
return
1371+
}
1372+
1373+
if initialized {
1374+
var started bool
1375+
started, err = pgm.IsStarted()
1376+
if err != nil {
1377+
// log error getting instance state but go ahead.
1378+
log.Errorw("failed to retrieve instance status", zap.Error(err))
1379+
}
1380+
log.Debugw("db status", "initialized", true, "started", started)
1381+
} else {
1382+
log.Debugw("db status", "initialized", false, "started", false)
1383+
}
1384+
14051385
// create postgres parameters
14061386
pgParameters = p.createPGParameters(db)
14071387
// update pgm postgres parameters

tests/integration/ha_test.go

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1710,3 +1710,86 @@ func TestStandbyCantSync(t *testing.T) {
17101710
t.Fatalf("expected different dbuid for standbys[0]: got the same: %q", newStandby0DBUID)
17111711
}
17121712
}
1713+
1714+
// TestDisappearedKeeperData tests that, if keeper data disappears (at least
1715+
// dbstate file is missing) and there's no init mode defined in the db spec, it'll
1716+
// return an error
1717+
func TestDisappearedKeeperData(t *testing.T) {
1718+
t.Parallel()
1719+
1720+
dir, err := ioutil.TempDir("", "stolon")
1721+
if err != nil {
1722+
t.Fatalf("unexpected err: %v", err)
1723+
}
1724+
defer os.RemoveAll(dir)
1725+
1726+
clusterName := uuid.NewV4().String()
1727+
1728+
tks, tss, tp, tstore := setupServers(t, clusterName, dir, 2, 1, false, false, nil)
1729+
defer shutdown(tks, tss, tp, tstore)
1730+
1731+
storePath := filepath.Join(common.StorePrefix, clusterName)
1732+
sm := store.NewKVBackedStore(tstore.store, storePath)
1733+
1734+
master, standbys := waitMasterStandbysReady(t, sm, tks)
1735+
standby := standbys[0]
1736+
1737+
if err := populate(t, master); err != nil {
1738+
t.Fatalf("unexpected err: %v", err)
1739+
}
1740+
if err := write(t, master, 1, 1); err != nil {
1741+
t.Fatalf("unexpected err: %v", err)
1742+
}
1743+
1744+
// get the master XLogPos
1745+
xLogPos, err := GetXLogPos(master)
1746+
if err != nil {
1747+
t.Fatalf("unexpected err: %v", err)
1748+
}
1749+
// wait for the keepers to have reported their state
1750+
if err := WaitClusterSyncedXLogPos([]*TestKeeper{master, standby}, xLogPos, sm, 20*time.Second); err != nil {
1751+
t.Fatalf("unexpected err: %v", err)
1752+
}
1753+
1754+
// the proxy should connect to the right master
1755+
if err := tp.WaitRightMaster(master, 3*cluster.DefaultProxyCheckInterval); err != nil {
1756+
t.Fatalf("unexpected err: %v", err)
1757+
}
1758+
1759+
// Stop the master keeper
1760+
t.Logf("Stopping current master keeper: %s", master.uid)
1761+
master.Stop()
1762+
1763+
// Remove master data
1764+
if err := os.RemoveAll(master.dataDir); err != nil {
1765+
t.Fatalf("unexpected err: %v", err)
1766+
}
1767+
1768+
// restart master
1769+
t.Logf("Starting current master keeper: %s", master.uid)
1770+
master.Start()
1771+
waitKeeperReady(t, sm, master)
1772+
1773+
// master shouldn't start its postgres instance and standby should be elected as new master
1774+
1775+
// Wait for cluster data containing standby as master
1776+
if err := WaitClusterDataMaster(standby.uid, sm, 30*time.Second); err != nil {
1777+
t.Fatalf("expected master %q in cluster view", standby.uid)
1778+
}
1779+
if err := standby.WaitDBRole(common.RoleMaster, nil, 30*time.Second); err != nil {
1780+
t.Fatalf("unexpected err: %v", err)
1781+
}
1782+
1783+
c, err := getLines(t, standby)
1784+
if err != nil {
1785+
t.Fatalf("unexpected err: %v", err)
1786+
}
1787+
if c != 1 {
1788+
t.Fatalf("wrong number of lines, want: %d, got: %d", 1, c)
1789+
}
1790+
1791+
// the proxy should connect to the right master
1792+
if err := tp.WaitRightMaster(standby, 3*cluster.DefaultProxyCheckInterval); err != nil {
1793+
t.Fatalf("unexpected err: %v", err)
1794+
}
1795+
}

0 commit comments

Comments
 (0)