replicaset: make locating master faster

Serpentian · Serpentian · commit 0253ae27048c · 2025-05-20T17:35:22.000+03:00
When the old master hangs, we first make a call to it to check whether it is still alive. After that, we contact every replica in the replicaset, including the old master, and wait for their responses. As a result, we end up calling the dead master twice, which makes master discovery lag by 3 * MASTER_SEARCH_TIMEOUT seconds (two timeouts inside `locate_master()` while waiting for responses from the dead master, and one more before the master search fiber wakes up). We can reduce the time required to discover the new master by skipping the dead old master while iterating over the replicas. This limits the delay on the router to at most 2 * MASTER_SEARCH_TIMEOUT per master search iteration, provided only one node is down. Closes #549 NO_DOC=bugfix
diff --git a/test/replicaset-luatest/replicaset_3_test.lua b/test/replicaset-luatest/replicaset_3_test.lua
@@ -559,7 +559,9 @@ test_group.test_locate_master_when_old_master_hangs = function(g)
     g.replica_1_a:freeze()
 
     -- Replicaset is able to locate a new one.
+    local start_ts = fiber.clock()
     local is_done, is_nop = rs:locate_master()
+    t.assert_lt(fiber.clock() - start_ts, 2 * vconst.MASTER_SEARCH_TIMEOUT)
     t.assert(is_done)
     t.assert_not(is_nop)
     t.assert_equals(rs.master.uuid, g.replica_1_b:instance_uuid())
diff --git a/vshard/replicaset.lua b/vshard/replicaset.lua
@@ -1163,7 +1163,13 @@ local function replicaset_locate_master(replicaset)
     local deadline = fiber_clock() + timeout
     local async_opts = {is_async = true, timeout = timeout}
     local replicaset_id = replicaset.id
+    local old_master_id = replicaset.master and replicaset.master.id
     for replica_id, replica in pairs(replicaset.replicas) do
+        if replica_id == old_master_id then
+            -- No need to wait for master one more time, we have just tried to
+            -- check it and it didn't respond.
+            goto next_replica
+        end
         replicaset_connect_to_replica(replicaset, replica)
         ok, err = replica:check_is_connected()
         if ok then
@@ -1176,6 +1182,7 @@ local function replicaset_locate_master(replicaset)
         elseif err ~= nil then
             last_err = err
         end
+        ::next_replica::
     end
     local master_id
     for replica_id, f in pairs(futures) do