test: fix flaky test #453

elhimov · 2025-07-30T11:52:21Z

The helper method that performs the initial assignment of master/replica roles, and which is widely used in ConnectionPool tests, was adjusted to wait until the roles have been applied successfully.
Prior to this patch it did not wait, so subsequent test code could sometimes behave unexpectedly (the problem was caught with
TestConnectionHandlerOpenUpdateClose).

Related issues:

Closes #452

test_helpers/pool_helper.go

bigbes · 2025-08-05T12:51:31Z

test_helpers/pool_helper.go

+	// Wait for the role to be applied.
+	for {
+		time.Sleep(10 * time.Millisecond)
+
+		var reason string
+
+		data, err := conn.Do(tarantool.NewCallRequest("box.info")).Get()
+		switch {
+		case err != nil:
+			reason = fmt.Sprintf("failed to get box.info: %s", err)
+		case len(data) < 1:
+			reason = "box.info is empty"
+		default:
+			status, statusFound := data[0].(map[interface{}]interface{})["status"]
+			readonly, readonlyFound := data[0].(map[interface{}]interface{})["ro"]
+			switch {
+			case !statusFound:
+				reason = "box.info.status is missing"
+			case status != "running":
+				reason = fmt.Sprintf("box.info.status='%s' (waiting for 'running')", status)
+			case !readonlyFound:
+				reason = "box.info.ro is missing"
+			case readonly != isReplica:
+				reason = fmt.Sprintf("box.info.ro='%v' (waiting for '%v')", readonly, isReplica)
+			}
+		}
+
+		if len(reason) == 0 {
+			break
+		}
+
+		select {
+		case <-ctx.Done():
+			return fmt.Errorf("%w: failed to apply role, the last reason: %s", ctx.Err(), reason)
+		default:
+			continue
+		}
+	}
+


This is a more readable version of this code, IMO:

Suggested change

// Wait for the role to be applied.

for {

time.Sleep(10 * time.Millisecond)

var reason string

data, err := conn.Do(tarantool.NewCallRequest("box.info")).Get()

switch {

case err != nil:

reason = fmt.Sprintf("failed to get box.info: %s", err)

case len(data) < 1:

reason = "box.info is empty"

default:

status, statusFound := data[0].(map[interface{}]interface{})["status"]

readonly, readonlyFound := data[0].(map[interface{}]interface{})["ro"]

switch {

case !statusFound:

reason = "box.info.status is missing"

case status != "running":

reason = fmt.Sprintf("box.info.status='%s' (waiting for 'running')", status)

case !readonlyFound:

reason = "box.info.ro is missing"

case readonly != isReplica:

reason = fmt.Sprintf("box.info.ro='%v' (waiting for '%v')", readonly, isReplica)

}

}

if len(reason) == 0 {

break

}

select {

case <-ctx.Done():

return fmt.Errorf("%w: failed to apply role, the last reason: %s", ctx.Err(), reason)

default:

continue

}

}

var reason string

// Wait for the role to be applied.

for {

select {

case <-time.After(10 * time.Millisecond):

case <-ctx.Done():

return fmt.Errorf("%w: failed to apply role, the last reason: %s", ctx.Err(), reason)

}

data, err := conn.Do(tarantool.NewCallRequest("box.info")).Get()

switch {

case err != nil:

reason = fmt.Sprintf("failed to get box.info: %s", err)

continue

case len(data) < 1:

reason = "box.info is empty"

continue

}

status, statusFound := data[0].(map[interface{}]interface{})["status"]

readonly, readonlyFound := data[0].(map[interface{}]interface{})["ro"]

switch {

case !statusFound:

reason = "box.info.status is missing"

case status != "running":

reason = fmt.Sprintf("box.info.status='%s' (waiting for 'running')", status)

case !readonlyFound:

reason = "box.info.ro is missing"

case readonly != isReplica:

reason = fmt.Sprintf("box.info.ro='%v' (waiting for '%v')", readonly, isReplica)

default:

return nil

}

}

It can be further optimised for readability with something like this:

func checkInfoStatus(data []interface{}) string { status, statusFound := data[0].(map[interface{}]interface{})["status"] switch { case !statusFound: return "box.info.status is missing" case status != "running": return fmt.Sprintf("box.info.status='%s' (waiting for 'running')", status) default: return "" } } func checkInfoRO(data []interface{}, isReplica bool) string { readonly, readonlyFound := data[0].(map[interface{}]interface{})["ro"] switch { case !readonlyFound: reason = "box.info.ro is missing" case readonly != isReplica: reason = fmt.Sprintf("box.info.ro='%v' (waiting for '%v')", readonly, isReplica) default: return nil } } func SetInstanceRO(ctx context.Context, dialer tarantool.Dialer, connOpts tarantool.Opts, isReplica bool) error { conn, err := tarantool.Connect(ctx, dialer, connOpts) if err != nil { return err } defer conn.Close() req := tarantool.NewCallRequest("box.cfg"). Args([]interface{}{map[string]bool{"read_only": isReplica}}) if _, err := conn.Do(req).Get(); err != nil { return err } var reason string // Wait for the role to be applied. for { select { case <-time.After(10 * time.Millisecond): case <-ctx.Done(): return fmt.Errorf("%w: failed to apply role, the last reason: %s", ctx.Err(), reason) } data, err := conn.Do(tarantool.NewCallRequest("box.info")).Get() switch { case err != nil: reason = fmt.Sprintf("failed to get box.info: %s", err) case len(data) < 1: reason = "box.info is empty" case checkInfoStatus(data) != "": reason = checkInfoStatus(data) case checkInfoRO(data) != "": reason = checkInfoRO(data) default: return nil } } }

Rearranged in a slightly different manner that seems more readable.

bigbes · 2025-08-05T13:13:10Z

test_helpers/pool_helper.go

+	errs := make([]error, len(dialers))
+	var wg sync.WaitGroup
 	for i, dialer := range dialers {
-		ctx, cancel := GetConnectContext()
-		err := SetInstanceRO(ctx, dialer, connOpts, roles[i])
-		cancel()
-		if err != nil {
-			return err
-		}
+		wg.Add(1)
+		// Pass loop variables to avoid its closure.
+		go func(i int, dialer tarantool.Dialer) {
+			defer wg.Done()
+			errs[i] = SetInstanceRO(ctx, dialer, connOpts, roles[i])
+		}(i, dialer)
 	}
+	wg.Wait()


Usually it's better not to call wg.Add() inside a loop, to avoid possible race conditions when the goroutine that starts the workers and the one that waits for them are different.
There's no problem in this code, but the rule of thumb is to rule out that kind of problem without having to double-check: if we know the number of workers before running them, use wg.Add(count).

errs := make([]error, len(dialers)) var wg sync.WaitGroup wg.Add(len(dialers)) for i, dialer := range dialers { // Pass loop variables to avoid its closure. go func(i int, dialer tarantool.Dialer) { defer wg.Done() errs[i] = SetInstanceRO(ctx, dialer, connOpts, roles[i]) }(i, dialer) } wg.Wait()

It's just a nitpick, but it can save some time in the future.

bigbes

Theoretically, I understand these changes and believe they should work, but I can't guarantee it.
It's up to you to agree or disagree with style comments, since it's test code.

The helper method that performs the initial assignment of master/replica roles, and which is widely used in ConnectionPool tests, was adjusted to wait until the roles have been applied successfully. Prior to this patch it did not wait, so subsequent test code could sometimes behave unexpectedly (the problem was caught with TestConnectionHandlerOpenUpdateClose). Closes #452

dmyger · 2025-08-06T09:44:16Z

test_helpers/pool_helper.go

@@ -206,6 +208,45 @@ func SetInstanceRO(ctx context.Context, dialer tarantool.Dialer, connOpts tarant
 		return err
 	}

+	checkRole := func(conn *tarantool.Connection, isReplica bool) string {


The nested function doesn't capture any local variables, so maybe it makes sense to move it into a separate unit so as not to clutter the code here.

Yes, technically this nested function is able to "live" outside, but it is tightly coupled semantically to the outer function and, as I see it, doesn't make sense outside of it as a standalone function. Please consider it just a way to rearrange the code within SetInstanceRO to make it more readable.

dmyger · 2025-08-06T09:53:33Z

test_helpers/pool_helper.go

+		go func(i int, dialer tarantool.Dialer) {
+			defer wg.Done()
+			errs[i] = SetInstanceRO(ctx, dialer, connOpts, roles[i])
+		}(i, dialer)


Suggested change

go func(i int, dialer tarantool.Dialer) {

defer wg.Done()

errs[i] = SetInstanceRO(ctx, dialer, connOpts, roles[i])

}(i, dialer)

go func() {

defer wg.Done()

errs[i] = SetInstanceRO(ctx, dialer, connOpts, roles[i])

}()

It's just cosmetic, so it's up to you.
Since Go 1.22 each loop iteration has its own copy of the loop variables, so it's safe to capture them in closures that run concurrently inside the loop body.

I know, but go.mod contains the line

go 1.20

elhimov force-pushed the elhimov/gh-452-flaky-on-mac-TestConnectionHandlerOpenUpdateClose branch 2 times, most recently from 0b1890e to e7c2e9c Compare July 30, 2025 14:53

elhimov marked this pull request as draft July 30, 2025 15:12

elhimov force-pushed the elhimov/gh-452-flaky-on-mac-TestConnectionHandlerOpenUpdateClose branch 2 times, most recently from e002994 to 76e4a2f Compare August 5, 2025 10:03

elhimov marked this pull request as ready for review August 5, 2025 10:04

elhimov requested review from bigbes and dmyger August 5, 2025 10:04

bigbes reviewed Aug 5, 2025

View reviewed changes

test_helpers/pool_helper.go Outdated Show resolved Hide resolved

bigbes reviewed Aug 5, 2025

View reviewed changes

test_helpers/pool_helper.go Outdated Show resolved Hide resolved

bigbes reviewed Aug 5, 2025

View reviewed changes

bigbes approved these changes Aug 5, 2025

View reviewed changes

elhimov force-pushed the elhimov/gh-452-flaky-on-mac-TestConnectionHandlerOpenUpdateClose branch from 76e4a2f to 537f38f Compare August 5, 2025 21:25

elhimov requested a review from bigbes August 5, 2025 23:20

bigbes approved these changes Aug 6, 2025

View reviewed changes

dmyger reviewed Aug 6, 2025

View reviewed changes

elhimov requested a review from dmyger August 6, 2025 10:13

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

test: fix flaky test #453

test: fix flaky test #453

Uh oh!

elhimov commented Jul 30, 2025

Uh oh!

Uh oh!

Uh oh!

bigbes Aug 5, 2025 •

edited

Loading

Uh oh!

bigbes Aug 5, 2025

Uh oh!

elhimov Aug 5, 2025

Uh oh!

bigbes Aug 5, 2025 •

edited

Loading

Uh oh!

elhimov Aug 5, 2025

Uh oh!

bigbes left a comment

Uh oh!

dmyger Aug 6, 2025

Uh oh!

elhimov Aug 6, 2025 •

edited

Loading

Uh oh!

dmyger Aug 6, 2025

Uh oh!

elhimov Aug 6, 2025

Uh oh!

Uh oh!

test: fix flaky test #453

Are you sure you want to change the base?

test: fix flaky test #453

Uh oh!

Conversation

elhimov commented Jul 30, 2025

Uh oh!

Uh oh!

Uh oh!

bigbes Aug 5, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

bigbes Aug 5, 2025

Choose a reason for hiding this comment

Uh oh!

elhimov Aug 5, 2025

Choose a reason for hiding this comment

Uh oh!

bigbes Aug 5, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

elhimov Aug 5, 2025

Choose a reason for hiding this comment

Uh oh!

bigbes left a comment

Choose a reason for hiding this comment

Uh oh!

dmyger Aug 6, 2025

Choose a reason for hiding this comment

Uh oh!

elhimov Aug 6, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

dmyger Aug 6, 2025

Choose a reason for hiding this comment

Uh oh!

elhimov Aug 6, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

bigbes Aug 5, 2025 •

edited

Loading

bigbes Aug 5, 2025 •

edited

Loading

elhimov Aug 6, 2025 •

edited

Loading