Skip to content

Commit f5c25f7

Browse files
committed
rebalancer: reduce spam of "Some buckets are not active" logs
Before this patch "Some buckets are not active ..." logs were printed every `REBALANCER_WORK_INTERVAL` seconds, flooding the log file with redundant messages. Also these records were lacking of information about replicaset that has non-active buckets during rebalancing. This patch fixes the issue by limiting this log frequency and adding information about replicaset's id and rebalancer's retry time to log messages. Closes #212 NO_DOC=bugfix
1 parent 1da893d commit f5c25f7

File tree

3 files changed

+55
-1
lines changed

3 files changed

+55
-1
lines changed

test/storage-luatest/storage_1_1_1_test.lua

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,21 @@ local function move_bucket(src_storage, dest_storage, bucket_id)
116116
end, {bucket_id})
117117
end
118118

119+
local function test_only_one_record_appears_in_logs(server, record, wait_time)
120+
local first_log_record = nil
121+
t.helpers.retrying({timeout = 10}, function()
122+
first_log_record = server:grep_log(record)
123+
t.assert(first_log_record)
124+
end)
125+
-- We need to wait a bit in order to catch how much as possible
126+
-- spam in server's logs.
127+
require('fiber').sleep(wait_time)
128+
local last_log_record = server:grep_log(record)
129+
t.assert(last_log_record)
130+
t.assert_equals(first_log_record, last_log_record,
131+
'There are two identical records in logs')
132+
end
133+
119134
rebalancer_recovery_group.before_all(function(g)
120135
global_cfg = vtest.config_new(cfg_template)
121136
vtest.cluster_new(g, global_cfg)
@@ -133,6 +148,10 @@ rebalancer_recovery_group.before_all(function(g)
133148
box.space.test_space:create_index(
134149
'bucket_id', {parts = {'bucket_id'}, unique = false})
135150
end)
151+
g.replicaset_not_connected_pattern = '%d+-%d+-%d+ %d+:%d+:%d+.%d+ .* '
152+
g.replcaset_not_connected_msg = 'Some buckets in replicaset %s ' ..
153+
'are not active! '
154+
g.rebalancer_wait_interval = 0.01
136155
end)
137156

138157
rebalancer_recovery_group.after_all(function(g)
@@ -199,3 +218,25 @@ rebalancer_recovery_group.test_rebalancer_routes_logging = function(g)
199218
move_bucket(g.replica_1_a, g.replica_2_a, moved_bucket_from_2)
200219
move_bucket(g.replica_1_a, g.replica_3_a, moved_bucket_from_3)
201220
end
221+
222+
rebalancer_recovery_group.test_no_log_spam_when_buckets_no_active = function(g)
223+
local replicaset_2_uuid = g.replica_2_a:replicaset_uuid()
224+
g.replica_2_a:stop()
225+
g.replica_1_a:exec(function()
226+
rawset(_G, 'old_rebalancer_interval', ivconst.REBALANCER_WORK_INTERVAL)
227+
rawset(_G, 'old_rebalancer_timeout',
228+
ivconst.REBALANCER_GET_STATE_TIMEOUT)
229+
ivconst.REBALANCER_WORK_INTERVAL = 0.01
230+
ivconst.REBALANCER_GET_STATE_TIMEOUT = 0.01
231+
end)
232+
local rs_not_connected_log = g.replicaset_not_connected_pattern ..
233+
string.format(g.replcaset_not_connected_msg,
234+
replicaset_2_uuid)
235+
test_only_one_record_appears_in_logs(g.replica_1_a, rs_not_connected_log,
236+
g.rebalancer_wait_interval * 2)
237+
g.replica_1_a:exec(function()
238+
ivconst.REBALANCER_WORK_INTERVAL = _G.old_rebalancer_interval
239+
ivconst.REBALANCER_GET_STATE_TIMEOUT = _G.old_rebalancer_timeout
240+
end)
241+
g.replica_2_a:start()
242+
end

vshard/storage/init.lua

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,10 @@ if not M then
232232
-- Condition variable fired each time a bucket locked for
233233
-- RW refs reaches 0 of the latter.
234234
bucket_rw_lock_is_ready_cond = lfiber.cond(),
235-
235+
-- This table contains true/false values for each replicaset depending
236+
-- on whether all of its buckets have BACTIVE status during rebalancing
237+
-- stage.
238+
replicasets_active_completeness = {},
236239
------------------------- Reload -------------------------
237240
-- Version of the loaded module. This number is used on
238241
-- reload to determine which upgrade scripts to run.
@@ -2795,6 +2798,12 @@ local function rebalancer_download_states()
27952798
replicaset, 'vshard.storage.rebalancer_request_state', {},
27962799
{timeout = consts.REBALANCER_GET_STATE_TIMEOUT})
27972800
if state == nil then
2801+
if not M.replicasets_active_completeness[replicaset.id] then
2802+
M.replicasets_active_completeness[replicaset.id] = true
2803+
log.info('Some buckets in replicaset %s are not active! ' ..
2804+
'Will retry rebalancing every %s s.', replicaset.id,
2805+
consts.REBALANCER_WORK_INTERVAL)
2806+
end
27982807
return
27992808
end
28002809
local bucket_count = state.bucket_active_count +
@@ -2806,6 +2815,7 @@ local function rebalancer_download_states()
28062815
replicasets[id] = {bucket_count = bucket_count,
28072816
weight = replicaset.weight,
28082817
pinned_count = state.bucket_pinned_count}
2818+
M.replicasets_active_completeness[replicaset.id] = false
28092819
end
28102820
end
28112821
local sum = total_bucket_active_count + total_bucket_locked_count

vshard/storage/reload_evolution.lua

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,9 @@ migrations[#migrations + 1] = function(M)
3131
M.bucket_generation_cond = fiber.cond()
3232
M.route_map = {}
3333
end
34+
if not M.replicasets_active_completeness then
35+
M.replicasets_active_completeness = {}
36+
end
3437
end
3538

3639
--

0 commit comments

Comments
 (0)