Skip to content

Commit 946ffd1

Browse files
committed
sentinel: use only one sync standby on postgres <= 9.5
With the new syncrepl logic during a sync standby transition we need two sync standbys, this won't work correctly with postgres <= 9.5 so fix it providing only the new sync standby with the issue of a small window where we can't elect the new sync standby if it's not in sync after the switch and fallback to the previous one.
1 parent 6c47055 commit 946ffd1

File tree

3 files changed

+77
-42
lines changed

3 files changed

+77
-42
lines changed

cmd/sentinel/cmd/sentinel.go

Lines changed: 53 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -365,15 +365,6 @@ func (s *Sentinel) activeProxiesInfos(proxiesInfo cluster.ProxiesInfo) cluster.P
365365
return activeProxiesInfo
366366
}
367367

368-
func (s *Sentinel) getDBForKeeper(cd *cluster.ClusterData, keeperUID string) *cluster.DB {
369-
for _, db := range cd.DBs {
370-
if db.Spec.KeeperUID == keeperUID {
371-
return db
372-
}
373-
}
374-
return nil
375-
}
376-
377368
func (s *Sentinel) findInitialKeeper(cd *cluster.ClusterData) (*cluster.Keeper, error) {
378369
if len(cd.Keepers) < 1 {
379370
return nil, fmt.Errorf("no keepers registered")
@@ -953,7 +944,7 @@ func (s *Sentinel) updateCluster(cd *cluster.ClusterData, pis cluster.ProxiesInf
953944
dbsToRemove := []*cluster.DB{}
954945
for _, k := range newcd.Keepers {
955946
// get db associated to the keeper
956-
db := s.getDBForKeeper(cd, k.UID)
947+
db := cd.FindDB(k)
957948
if db != nil {
958949
// skip keepers with an assigned db
959950
continue
@@ -1088,6 +1079,7 @@ func (s *Sentinel) updateCluster(cd *cluster.ClusterData, pis cluster.ProxiesInf
10881079

10891080
if curMasterDBUID == wantedMasterDBUID {
10901081
masterDB := newcd.DBs[curMasterDBUID]
1082+
masterDBKeeper := newcd.Keepers[masterDB.Spec.KeeperUID]
10911083

10921084
if newcd.Proxy.Spec.MasterDBUID == "" {
10931085
// if the Proxy.Spec.MasterDBUID is empty we have to wait for all
@@ -1189,6 +1181,27 @@ func (s *Sentinel) updateCluster(cd *cluster.ClusterData, pis cluster.ProxiesInf
11891181

11901182
// Setup synchronous standbys
11911183
if s.syncRepl(clusterSpec) {
1184+
minSynchronousStandbys := int(*clusterSpec.MinSynchronousStandbys)
1185+
maxSynchronousStandbys := int(*clusterSpec.MaxSynchronousStandbys)
1186+
merge := true
1187+
// PostgresSQL <= 9.5 only supports one sync standby at a
1188+
// time (defining multiple sync standbys is like doing "1
1189+
// (standby1, standby2)" on postgres >= 9.6 and so we won't
1190+
// be able to know which is the real in sync standby.
1191+
//
1192+
// So we always have to define 1 standby in
1193+
// masterDB.Spec.SynchronousStandbys with the downside that
1194+
// can there be a time window where we cannot elect the
1195+
// synchronous standby as a new primary if it's not yet in
1196+
// sync
1197+
if masterDBKeeper.Status.PostgresBinaryVersion.Maj != 0 {
1198+
if masterDBKeeper.Status.PostgresBinaryVersion.Maj == 9 && masterDBKeeper.Status.PostgresBinaryVersion.Min <= 5 {
1199+
minSynchronousStandbys = 1
1200+
maxSynchronousStandbys = 1
1201+
merge = false
1202+
}
1203+
}
1204+
11921205
// update synchronousStandbys only if the reported
11931206
// SynchronousStandbys are the same as the required ones. In
11941207
// this way, when we have to choose a new master we are sure
@@ -1234,8 +1247,8 @@ func (s *Sentinel) updateCluster(cd *cluster.ClusterData, pis cluster.ProxiesInf
12341247
}
12351248

12361249
// Remove synchronous standbys in excess
1237-
if uint16(len(synchronousStandbys)) > *clusterSpec.MaxSynchronousStandbys {
1238-
rc := len(synchronousStandbys) - int(*clusterSpec.MaxSynchronousStandbys)
1250+
if len(synchronousStandbys) > maxSynchronousStandbys {
1251+
rc := len(synchronousStandbys) - maxSynchronousStandbys
12391252
removedCount := 0
12401253
toRemove = map[string]struct{}{}
12411254
for dbUID, _ := range synchronousStandbys {
@@ -1254,7 +1267,7 @@ func (s *Sentinel) updateCluster(cd *cluster.ClusterData, pis cluster.ProxiesInf
12541267
// try to add missing standbys up to MaxSynchronousStandbys
12551268
bestStandbys := s.findBestStandbys(newcd, curMasterDB)
12561269

1257-
ac := int(*clusterSpec.MaxSynchronousStandbys) - len(synchronousStandbys)
1270+
ac := maxSynchronousStandbys - len(synchronousStandbys)
12581271
addedCount := 0
12591272
for _, bestStandby := range bestStandbys {
12601273
if addedCount >= ac {
@@ -1273,7 +1286,7 @@ func (s *Sentinel) updateCluster(cd *cluster.ClusterData, pis cluster.ProxiesInf
12731286
// also if not in a good state. In this way we have more
12741287
// possibilities to choose a sync standby to replace a
12751288
// failed master if they becoe healthy again
1276-
ac = int(*clusterSpec.MinSynchronousStandbys) - len(synchronousStandbys)
1289+
ac = minSynchronousStandbys - len(synchronousStandbys)
12771290
addedCount = 0
12781291
for _, db := range newcd.DBs {
12791292
if addedCount >= ac {
@@ -1289,29 +1302,31 @@ func (s *Sentinel) updateCluster(cd *cluster.ClusterData, pis cluster.ProxiesInf
12891302
}
12901303
}
12911304

1292-
// if some of the new synchronousStandbys are not inside
1293-
// the prevSynchronousStandbys then also add all
1294-
// the prevSynchronousStandbys. In this way when there's
1295-
// a synchronousStandbys change we'll have, in a first
1296-
// step, both the old and the new standbys, then in the
1297-
// second step the old will be removed (since the new
1298-
// standbys are all inside prevSynchronousStandbys), so
1299-
// we'll always be able to choose a sync standby that we
1300-
// know was defined in the primary and in sync if the
1301-
// primary fails.
1302-
allInPrev := true
1303-
for k, _ := range synchronousStandbys {
1304-
if _, ok := prevSynchronousStandbys[k]; !ok {
1305-
allInPrev = false
1305+
if merge {
1306+
// if some of the new synchronousStandbys are not inside
1307+
// the prevSynchronousStandbys then also add all
1308+
// the prevSynchronousStandbys. In this way when there's
1309+
// a synchronousStandbys change we'll have, in a first
1310+
// step, both the old and the new standbys, then in the
1311+
// second step the old will be removed (since the new
1312+
// standbys are all inside prevSynchronousStandbys), so
1313+
// we'll always be able to choose a sync standby that we
1314+
// know was defined in the primary and in sync if the
1315+
// primary fails.
1316+
allInPrev := true
1317+
for k, _ := range synchronousStandbys {
1318+
if _, ok := prevSynchronousStandbys[k]; !ok {
1319+
allInPrev = false
1320+
}
13061321
}
1307-
}
1308-
if !allInPrev {
1309-
log.Infow("merging current and previous synchronous standbys", "masterDB", masterDB.UID, "prevSynchronousStandbys", prevSynchronousStandbys, "synchronousStandbys", synchronousStandbys)
1310-
// use only existing dbs
1311-
for _, db := range newcd.DBs {
1312-
if _, ok := prevSynchronousStandbys[db.UID]; ok {
1313-
log.Infow("adding previous synchronous standby", "masterDB", masterDB.UID, "synchronousStandbyDB", db.UID, "keeper", db.Spec.KeeperUID)
1314-
synchronousStandbys[db.UID] = struct{}{}
1322+
if !allInPrev {
1323+
log.Infow("merging current and previous synchronous standbys", "masterDB", masterDB.UID, "prevSynchronousStandbys", prevSynchronousStandbys, "synchronousStandbys", synchronousStandbys)
1324+
// use only existing dbs
1325+
for _, db := range newcd.DBs {
1326+
if _, ok := prevSynchronousStandbys[db.UID]; ok {
1327+
log.Infow("adding previous synchronous standby", "masterDB", masterDB.UID, "synchronousStandbyDB", db.UID, "keeper", db.Spec.KeeperUID)
1328+
synchronousStandbys[db.UID] = struct{}{}
1329+
}
13151330
}
13161331
}
13171332
}
@@ -1323,8 +1338,8 @@ func (s *Sentinel) updateCluster(cd *cluster.ClusterData, pis cluster.ProxiesInf
13231338
}
13241339

13251340
// If there're not enough real synchronous standbys add a fake synchronous standby because we have to be strict and make the master block transactions until MinSynchronousStandbys real standbys are available
1326-
if len(synchronousStandbys)+len(externalSynchronousStandbys) < int(*clusterSpec.MinSynchronousStandbys) {
1327-
log.Infow("using a fake synchronous standby since there are not enough real standbys available", "masterDB", masterDB.UID, "required", int(*clusterSpec.MinSynchronousStandbys))
1341+
if len(synchronousStandbys)+len(externalSynchronousStandbys) < minSynchronousStandbys {
1342+
log.Infow("using a fake synchronous standby since there are not enough real standbys available", "masterDB", masterDB.UID, "required", minSynchronousStandbys)
13281343
addFakeStandby = true
13291344
}
13301345

doc/syncrepl.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ You can enable/disable synchronous replication at any time and the keepers will
77

88
### Min and Max number of synchronous replication standbys
99

10-
In the cluster spec you can set the `MinSynchronousStandbys` and `MaxSynchronousStandbys` values (they both defaults to 1). Having multiple synchronous standbys is a feature provided starting from [PostgreSQL 9.6](https://www.postgresql.org/docs/9.6/static/warm-standby.html#SYNCHRONOUS-REPLICATION). Increasing these value above 1 for postgres versions below 9.6 will lead to errors starting the instance.
10+
In the cluster spec you can set the `MinSynchronousStandbys` and `MaxSynchronousStandbys` values (they both defaults to 1). Having multiple synchronous standbys is a feature provided starting from [PostgreSQL 9.6](https://www.postgresql.org/docs/9.6/static/warm-standby.html#SYNCHRONOUS-REPLICATION). Values different than 1 for postgres versions below 9.6 will be ignored.
1111

1212
## Enable synchronous replication.
1313

@@ -24,7 +24,7 @@ stolonctl --cluster-name=mycluster --store-backend=etcd update --patch '{ "synch
2424

2525
## Set min and max number of synchronous replication standbys
2626

27-
Set MinSynchronousStandbys/MaxSynchronousStandbys to a value different than 1 only when using PostgreSQL >= 9.6
27+
Set MinSynchronousStandbys/MaxSynchronousStandbys to a value greater than 1 (only when using PostgreSQL >= 9.6)
2828

2929
```
3030
stolonctl --cluster-name=mycluster --store-backend=etcd update --patch '{ "synchronousReplication" : true, "minSynchronousStandbys": 2, "maxSynchronousStandbys": 3 }'

tests/integration/ha_test.go

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1512,9 +1512,29 @@ func testKeeperRemovalStolonCtl(t *testing.T, syncRepl bool) {
15121512

15131513
master, standbys := waitMasterStandbysReady(t, sm, tks)
15141514

1515+
maj, min, err := master.PGDataVersion()
1516+
if err != nil {
1517+
t.Fatalf("unexpected err: %v", err)
1518+
}
1519+
// on postgresql <= 9.5 we can have only 1 synchronous standby
15151520
if syncRepl {
1516-
if err := WaitClusterDataSynchronousStandbys([]string{standbys[0].uid, standbys[1].uid}, sm, 30*time.Second); err != nil {
1517-
t.Fatalf("expected synchronous standbys")
1521+
if maj == 9 && min <= 5 {
1522+
ok := false
1523+
if err := WaitClusterDataSynchronousStandbys([]string{standbys[0].uid}, sm, 30*time.Second); err == nil {
1524+
ok = true
1525+
}
1526+
if !ok {
1527+
if err := WaitClusterDataSynchronousStandbys([]string{standbys[1].uid}, sm, 30*time.Second); err == nil {
1528+
ok = true
1529+
}
1530+
}
1531+
if !ok {
1532+
t.Fatalf("expected synchronous standbys")
1533+
}
1534+
} else {
1535+
if err := WaitClusterDataSynchronousStandbys([]string{standbys[0].uid, standbys[1].uid}, sm, 30*time.Second); err != nil {
1536+
t.Fatalf("expected synchronous standbys")
1537+
}
15181538
}
15191539
}
15201540

0 commit comments

Comments
 (0)