Skip to content

Commit fc7a877

Browse files
committed
rebalancer: reduce spam in logs
Before this patch "Some buckets are not active ..." logs were printed every `REBALANCER_WORK_INTERVAL` seconds, flooding the log file with redundant entries. Also these records were lacking of information about the current replicaset. This patch fixes the issue by limiting this log frequency and adding information about replicaset's id to log messages. Part of #212 NO_DOC=bugfix
1 parent 97965fb commit fc7a877

File tree

2 files changed

+33
-1
lines changed

2 files changed

+33
-1
lines changed

test/storage-luatest/storage_1_1_1_test.lua

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,3 +247,26 @@ rebalancer_recovery_group.test_rebalancer_routes_logging = function(g)
247247
t.assert(g.replica_1_a:grep_log(route_1_to_2))
248248
t.assert(g.replica_1_a:grep_log(route_1_to_3))
249249
end
250+
251+
rebalancer_recovery_group.test_no_log_spam_when_buckets_no_active = function(g)
252+
g.replica_2_a:stop()
253+
g.replica_1_a:exec(function()
254+
rawset(_G, 'old_rebalancer_interval', ivconst.REBALANCER_WORK_INTERVAL)
255+
rawset(_G, 'old_rebalancer_timeout',
256+
ivconst.REBALANCER_GET_STATE_TIMEOUT)
257+
ivconst.REBALANCER_WORK_INTERVAL = 0.01
258+
ivconst.REBALANCER_GET_STATE_TIMEOUT = 0.01
259+
end)
260+
261+
t.helpers.retrying({timeout = 10}, function()
262+
t.assert(g.replica_1_a:grep_log(g.buckets_not_active_log))
263+
end)
264+
test_only_one_record_appears_in_logs(g.replica_1_a,
265+
g.buckets_not_active_pattern,
266+
g.rebalancer_wait_interval * 2)
267+
g.replica_1_a:exec(function()
268+
ivconst.REBALANCER_WORK_INTERVAL = _G.old_rebalancer_interval
269+
ivconst.REBALANCER_GET_STATE_TIMEOUT = _G.old_rebalancer_timeout
270+
end)
271+
g.replica_2_a:start()
272+
end

vshard/storage/init.lua

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2823,6 +2823,7 @@ end
28232823
--
28242824
local function rebalancer_service_f(service)
28252825
local module_version = M.module_version
2826+
local some_buckets_are_not_active = false
28262827
while module_version == M.module_version do
28272828
service:next_iter()
28282829
while not M.is_rebalancer_active do
@@ -2844,7 +2845,13 @@ local function rebalancer_service_f(service)
28442845
'Error during downloading rebalancer states: %s',
28452846
replicasets))
28462847
end
2847-
log.info('Some buckets are not active, retry rebalancing later')
2848+
if not some_buckets_are_not_active then
2849+
log.info('Some buckets are not active, replicaset ' ..
2850+
string.format('%s will retry', M.this_replicaset.id) ..
2851+
string.format(' rebalancing every %s s.',
2852+
consts.REBALANCER_WORK_INTERVAL))
2853+
some_buckets_are_not_active = true
2854+
end
28482855
service:set_activity('idling')
28492856
lfiber.testcancel()
28502857
lfiber.sleep(consts.REBALANCER_WORK_INTERVAL)
@@ -2874,6 +2881,7 @@ local function rebalancer_service_f(service)
28742881
service:set_activity('idling')
28752882
lfiber.testcancel()
28762883
lfiber.sleep(consts.REBALANCER_IDLE_INTERVAL)
2884+
some_buckets_are_not_active = false
28772885
goto continue
28782886
end
28792887
local routes = rebalancer_build_routes(replicasets)
@@ -2901,6 +2909,7 @@ local function rebalancer_service_f(service)
29012909
goto continue
29022910
end
29032911
end
2912+
some_buckets_are_not_active = false
29042913
log.info('Rebalance routes are sent. Schedule next wakeup after '..
29052914
'%f seconds', consts.REBALANCER_WORK_INTERVAL)
29062915
service:set_activity('idling')

0 commit comments

Comments
 (0)