rebalancer: reduce spam of "Some buckets are not active" logs

mrForza · mrForza · commit f5c25f7bbf16 · 2025-08-22T18:20:22.000+03:00
Before this patch, "Some buckets are not active ..." logs were printed every `REBALANCER_WORK_INTERVAL` seconds, flooding the log file with redundant messages. These records also lacked information about which replicaset had non-active buckets during rebalancing. This patch fixes the issue by limiting the frequency of this log message and by adding the replicaset's id and the rebalancer's retry interval to it. Closes #212 NO_DOC=bugfix
diff --git a/test/storage-luatest/storage_1_1_1_test.lua b/test/storage-luatest/storage_1_1_1_test.lua
@@ -116,6 +116,21 @@ local function move_bucket(src_storage, dest_storage, bucket_id)
     end, {bucket_id})
 end
 
+-- Assert grep_log(record) gives the same match before and after a wait.
+local function test_only_one_record_appears_in_logs(server, record, wait_time)
+    local match_before
+    t.helpers.retrying({timeout = 10}, function()
+        match_before = server:grep_log(record)
+        t.assert(match_before)
+    end)
+    -- Sleep so that any repeated records have time to reach the log.
+    require('fiber').sleep(wait_time)
+    local match_after = server:grep_log(record)
+    t.assert(match_after)
+    t.assert_equals(match_before, match_after,
+                    'There are two identical records in logs')
+end
+
 rebalancer_recovery_group.before_all(function(g)
     global_cfg = vtest.config_new(cfg_template)
     vtest.cluster_new(g, global_cfg)
@@ -133,6 +148,10 @@ rebalancer_recovery_group.before_all(function(g)
         box.space.test_space:create_index(
             'bucket_id', {parts = {'bucket_id'}, unique = false})
     end)
+    g.replicaset_not_connected_pattern = '%d+-%d+-%d+ %d+:%d+:%d+.%d+ .* '
+    g.replcaset_not_connected_msg = 'Some buckets in replicaset %s ' ..
+                                    'are not active! '
+    g.rebalancer_wait_interval = 0.01
 end)
 
 rebalancer_recovery_group.after_all(function(g)
@@ -199,3 +218,25 @@ rebalancer_recovery_group.test_rebalancer_routes_logging = function(g)
     move_bucket(g.replica_1_a, g.replica_2_a, moved_bucket_from_2)
     move_bucket(g.replica_1_a, g.replica_3_a, moved_bucket_from_3)
 end
+
+rebalancer_recovery_group.test_no_log_spam_when_buckets_no_active = function(g)
+    -- The record is matched as a Lua pattern, so '-' in the UUID is escaped.
+    local rs_2_uuid = g.replica_2_a:replicaset_uuid():gsub('%-', '%%-')
+    g.replica_2_a:stop()
+    g.replica_1_a:exec(function()
+        rawset(_G, 'old_rebalancer_interval', ivconst.REBALANCER_WORK_INTERVAL)
+        rawset(_G, 'old_rebalancer_timeout',
+               ivconst.REBALANCER_GET_STATE_TIMEOUT)
+        ivconst.REBALANCER_WORK_INTERVAL = 0.01
+        ivconst.REBALANCER_GET_STATE_TIMEOUT = 0.01
+    end)
+    local record = g.replicaset_not_connected_pattern ..
+                   string.format(g.replcaset_not_connected_msg, rs_2_uuid)
+    test_only_one_record_appears_in_logs(g.replica_1_a, record,
+                                         g.rebalancer_wait_interval * 2)
+    g.replica_1_a:exec(function()
+        ivconst.REBALANCER_WORK_INTERVAL = _G.old_rebalancer_interval
+        ivconst.REBALANCER_GET_STATE_TIMEOUT = _G.old_rebalancer_timeout
+    end)
+    g.replica_2_a:start()
+end
diff --git a/vshard/storage/init.lua b/vshard/storage/init.lua
@@ -232,7 +232,10 @@ if not M then
         -- Condition variable fired each time a bucket locked for
         -- RW refs reaches 0 of the latter.
         bucket_rw_lock_is_ready_cond = lfiber.cond(),
-
+        -- Per-replicaset flag: true once the "buckets are not active" message
+        -- was logged for it (to avoid repeating it), reset to false when the
+        -- rebalancer obtains a usable state from that replicaset again.
+        replicasets_active_completeness = {},
         ------------------------- Reload -------------------------
         -- Version of the loaded module. This number is used on
         -- reload to determine which upgrade scripts to run.
@@ -2795,6 +2798,12 @@ local function rebalancer_download_states()
             replicaset, 'vshard.storage.rebalancer_request_state', {},
             {timeout = consts.REBALANCER_GET_STATE_TIMEOUT})
         if state == nil then
+            if not M.replicasets_active_completeness[replicaset.id] then
+                M.replicasets_active_completeness[replicaset.id] = true
+                log.info('Some buckets in replicaset %s are not active! ' ..
+                         'Will retry rebalancing every %s s.', replicaset.id,
+                         consts.REBALANCER_WORK_INTERVAL)
+            end
             return
         end
         local bucket_count = state.bucket_active_count +
@@ -2806,6 +2815,7 @@ local function rebalancer_download_states()
             replicasets[id] = {bucket_count = bucket_count,
                                weight = replicaset.weight,
                                pinned_count = state.bucket_pinned_count}
+            M.replicasets_active_completeness[replicaset.id] = false
         end
     end
     local sum = total_bucket_active_count + total_bucket_locked_count
diff --git a/vshard/storage/reload_evolution.lua b/vshard/storage/reload_evolution.lua
@@ -31,6 +31,9 @@ migrations[#migrations + 1] = function(M)
         M.bucket_generation_cond = fiber.cond()
         M.route_map = {}
     end
+    if not M.replicasets_active_completeness then
+        M.replicasets_active_completeness = {}
+    end
 end
 
 --