
Commit bb8b109

rebalancer: add replicaset.id in "Some buckets are not active" log
Before this patch the function `rebalancer_download_states` didn't return information about the replicaset from which the states could not be downloaded. As a result, the "Some buckets are not active ..." log lacked valuable information about the unhealthy replicaset.

Now we return `(BUCKETS_NOT_IN_PROPER_STATE error with replicaset.id, nil)` instead of `nil` when the rebalancer can't download the state from this replicaset, and we add replicaset.id to the "Some buckets are not active ..." log.

We also change the `rebalancer/rebalancer.test.lua` test, which expected the old "Some buckets are not active" log without replicaset.id.

Closes #212

NO_DOC=bugfix
1 parent 06c15b0 commit bb8b109
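In short, the failure path now carries the replicaset id end to end. A condensed sketch of the two sides of the change (based on the diffs below; the pcall call site in the rebalancer service fiber is assumed and is not part of this commit):

-- Producer side, in rebalancer_download_states(): instead of `return nil, err`,
-- return an error object that records which replicaset could not be polled.
--     return lerror.vshard(lerror.code.BUCKETS_NOT_IN_PROPER_STATE,
--                          replicaset.id), nil
--
-- Consumer side, in rebalancer_service_f(): the error lands in `replicasets`,
-- so the retry log can name the unhealthy replicaset.
local status, replicasets, total_bucket_active_count =
    pcall(rebalancer_download_states)  -- assumed call site, not shown in this diff
if not status or total_bucket_active_count == nil then
    local err = status and total_bucket_active_count or replicasets
    log.info('Some buckets in replicaset %s are not active, retry ' ..
             'rebalancing later', err and err.replicaset_id)
end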

File tree

5 files changed: +40 -7 lines changed

test/rebalancer/rebalancer.result

Lines changed: 7 additions & 1 deletion
@@ -318,7 +318,13 @@ _bucket:update({150}, {{'=', 2, vshard.consts.BUCKET.RECEIVING}})
 ---
 - [150, 'receiving']
 ...
-wait_rebalancer_state("Some buckets are not active", test_run)
+formatted_replicaset_uuid = string.gsub(util.replicasets[1], '%-', '%%-')
+---
+...
+log_msg = string.format('Some buckets in replicaset %s are not active', formatted_replicaset_uuid)
+---
+...
+wait_rebalancer_state(log_msg, test_run)
 ---
 ...
 _bucket:update({150}, {{'=', 2, vshard.consts.BUCKET.ACTIVE}})

test/rebalancer/rebalancer.test.lua

Lines changed: 3 additions & 1 deletion
@@ -156,7 +156,9 @@ util.map_bucket_protection(test_run, {REPLICASET_1}, false)
 test_run:switch('box_1_a')
 vshard.storage.rebalancer_enable()
 _bucket:update({150}, {{'=', 2, vshard.consts.BUCKET.RECEIVING}})
-wait_rebalancer_state("Some buckets are not active", test_run)
+formatted_replicaset_uuid = string.gsub(util.replicasets[1], '%-', '%%-')
+log_msg = string.format('Some buckets in replicaset %s are not active', formatted_replicaset_uuid)
+wait_rebalancer_state(log_msg, test_run)
 _bucket:update({150}, {{'=', 2, vshard.consts.BUCKET.ACTIVE}})
 vshard.storage.sync()
 
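A note on the gsub above: the expected message is matched against the log as a Lua pattern, where `-` is a magic (quantifier) character, so the hyphens in the replicaset UUID have to be escaped. A standalone illustration, with a made-up UUID:

-- '%-' in the pattern matches a literal hyphen; the '%%-' replacement emits
-- a literal '%-', so every '-' in the UUID becomes '%-' and matches itself
-- when the result is later used as a Lua pattern.
local uuid = 'cbf06940-0790-498b-948d-042b62cf3d29'   -- example value only
local escaped = string.gsub(uuid, '%-', '%%-')
print(escaped)  --> cbf06940%-0790%-498b%-948d%-042b62cf3d29
assert(string.match(uuid, escaped) == uuid)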

test/storage-luatest/storage_1_1_1_test.lua

Lines changed: 17 additions & 0 deletions
@@ -220,3 +220,20 @@ rebalancer_recovery_group.test_rebalancer_routes_logging = function(g)
         g.replica_1_a:grep_log('The cluster is balanced ok.')
     end)
 end
+
+rebalancer_recovery_group.test_no_log_spam_when_buckets_no_active = function(g)
+    local moved_bucket = vtest.storage_first_bucket(g.replica_2_a)
+    start_bucket_move(g.replica_1_a, g.replica_2_a, moved_bucket)
+    wait_for_bucket_is_transferred(g.replica_1_a, g.replica_2_a, moved_bucket)
+    vtest.storage_stop(g.replica_2_a)
+    local buckets_not_active = string.format('Some buckets in replicaset ' ..
+                                             '%s are not active',
+                                             g.replica_2_a:replicaset_uuid())
+    t.helpers.retrying({timeout = 60}, function()
+        g.replica_1_a:exec(function() ivshard.storage.rebalancer_wakeup() end)
+        t.assert(g.replica_1_a:grep_log(buckets_not_active))
+    end)
+    vtest.storage_start(g.replica_2_a, global_cfg)
+    start_bucket_move(g.replica_2_a, g.replica_1_a, moved_bucket)
+    wait_for_bucket_is_transferred(g.replica_2_a, g.replica_1_a, moved_bucket)
+end

vshard/error.lua

Lines changed: 5 additions & 0 deletions
@@ -207,6 +207,11 @@ local error_message_template = {
         msg = 'Mismatch server name: expected "%s", but got "%s"',
         args = {'expected_name', 'actual_name'},
     },
+    [42] = {
+        name = 'BUCKETS_NOT_IN_PROPER_STATE',
+        msg = 'Replicaset %s doesn\'t have all buckets in proper state',
+        args = {'replicaset_id'}
+    }
 }
 
 --
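For reference, a rough sketch of how the new template entry is meant to be used (the exact error-object layout produced by `lerror.vshard` is assumed here; the field name comes from `args` above):

-- Sketch only, not vshard code.
local err = lerror.vshard(lerror.code.BUCKETS_NOT_IN_PROPER_STATE,
                          replicaset.id)
-- The single 'replicaset_id' arg is substituted into `msg` and, by the usual
-- template convention, also exposed as a field on the error object, which is
-- what vshard/storage/init.lua reads below as `err.replicaset_id`.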

vshard/storage/init.lua

Lines changed: 8 additions & 5 deletions
@@ -2802,11 +2802,12 @@ local function rebalancer_download_states()
     local total_bucket_locked_count = 0
     local total_bucket_active_count = 0
     for id, replicaset in pairs(M.replicasets) do
-        local state, err = master_call(
+        local state = master_call(
             replicaset, 'vshard.storage.rebalancer_request_state', {},
             {timeout = consts.REBALANCER_GET_STATE_TIMEOUT})
         if state == nil then
-            return nil, err
+            return lerror.vshard(lerror.code.BUCKETS_NOT_IN_PROPER_STATE,
+                                 replicaset.id), nil
         end
         local bucket_count = state.bucket_active_count +
             state.bucket_pinned_count
@@ -2850,13 +2851,15 @@ local function rebalancer_service_f(service, limiter)
         if M.module_version ~= module_version then
             return
         end
-        if not status or replicasets == nil then
+        if not status or total_bucket_active_count == nil then
            local err = status and total_bucket_active_count or replicasets
            if err then
                limiter:log_error(err, service:set_status_error(
-                    'Error during downloading rebalancer states: %s', err))
+                    'Error during downloading rebalancer states: %s',
+                    err.replicaset_id))
            end
-            log.info('Some buckets are not active, retry rebalancing later')
+            log.info('Some buckets in replicaset %s are not active, retry ' ..
+                     'rebalancing later', err and err.replicaset_id)
            service:set_activity('idling')
            lfiber.testcancel()
            lfiber.sleep(consts.REBALANCER_WORK_INTERVAL)
