Skip to content

Commit 5f66134

Browse files
committed
recovery: simplify the procedure of recovery
Before this patch a special map of buckets to recover was maintained during a master's lifecycle. But 1) the same buckets can be obtained from the _bucket.index.status iterator, and 2) the map does not work when a master switch occurs during a transfer, so a bucket arrives at the new master via replication after it is configured — such buckets are never put into the map of buckets to recover. This patch gets rid of that map and uses _bucket.index.status directly.
1 parent 5292cfe commit 5f66134

File tree

10 files changed

+144
-104
lines changed

10 files changed

+144
-104
lines changed

test/rebalancer/bucket_ref.result

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -268,13 +268,8 @@ finish_refs = true
268268
while vshard.storage.buckets_info(1)[1].rw_lock do fiber.sleep(0.01) end
269269
---
270270
...
271-
vshard.storage.buckets_info(1)
271+
while box.space._bucket:get{1} do fiber.sleep(0.01) end
272272
---
273-
- 1:
274-
status: sent
275-
ro_lock: true
276-
destination: <replicaset_1>
277-
id: 1
278273
...
279274
_ = test_run:switch('box_1_a')
280275
---

test/rebalancer/bucket_ref.test.lua

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ fiber.sleep(0.2)
7575
vshard.storage.buckets_info(1)
7676
finish_refs = true
7777
while vshard.storage.buckets_info(1)[1].rw_lock do fiber.sleep(0.01) end
78-
vshard.storage.buckets_info(1)
78+
while box.space._bucket:get{1} do fiber.sleep(0.01) end
7979
_ = test_run:switch('box_1_a')
8080
vshard.storage.buckets_info(1)
8181

test/rebalancer/receiving_bucket.result

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,7 +233,7 @@ while box.space._bucket:get{101}.status ~= vshard.consts.BUCKET.ACTIVE do vshard
233233
...
234234
box.space._bucket:get{101}
235235
---
236-
- [101, 'active', '<replicaset_1>']
236+
- [101, 'active']
237237
...
238238
_ = test_run:switch('box_1_a')
239239
---

test/rebalancer/restart_during_rebalancing.result

Lines changed: 0 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -295,10 +295,6 @@ vshard.storage.info().bucket
295295
pinned: 0
296296
sending: 0
297297
...
298-
vshard.storage.internal.buckets_to_recovery
299-
---
300-
- []
301-
...
302298
check_consistency()
303299
---
304300
- true
@@ -316,10 +312,6 @@ vshard.storage.info().bucket
316312
pinned: 0
317313
sending: 0
318314
...
319-
vshard.storage.internal.buckets_to_recovery
320-
---
321-
- []
322-
...
323315
check_consistency()
324316
---
325317
- true
@@ -337,10 +329,6 @@ vshard.storage.info().bucket
337329
pinned: 0
338330
sending: 0
339331
...
340-
vshard.storage.internal.buckets_to_recovery
341-
---
342-
- []
343-
...
344332
check_consistency()
345333
---
346334
- true
@@ -358,10 +346,6 @@ vshard.storage.info().bucket
358346
pinned: 0
359347
sending: 0
360348
...
361-
vshard.storage.internal.buckets_to_recovery
362-
---
363-
- []
364-
...
365349
check_consistency()
366350
---
367351
- true

test/rebalancer/restart_during_rebalancing.test.lua

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -120,19 +120,15 @@ util.check_loading_result()
120120

121121
test_run:switch('fullbox_1_a')
122122
vshard.storage.info().bucket
123-
vshard.storage.internal.buckets_to_recovery
124123
check_consistency()
125124
test_run:switch('fullbox_2_a')
126125
vshard.storage.info().bucket
127-
vshard.storage.internal.buckets_to_recovery
128126
check_consistency()
129127
test_run:switch('fullbox_3_a')
130128
vshard.storage.info().bucket
131-
vshard.storage.internal.buckets_to_recovery
132129
check_consistency()
133130
test_run:switch('fullbox_4_a')
134131
vshard.storage.info().bucket
135-
vshard.storage.internal.buckets_to_recovery
136132
check_consistency()
137133

138134
test_run:switch('default')

test/storage/recovery.result

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,9 @@ _ = test_run:switch('storage_2_a')
5454
_bucket = box.space._bucket
5555
---
5656
...
57-
_bucket:replace{2, vshard.consts.BUCKET.ACTIVE, util.replicasets[1]}
57+
_bucket:replace{2, vshard.consts.BUCKET.ACTIVE}
5858
---
59-
- [2, 'active', '<replicaset_1>']
59+
- [2, 'active']
6060
...
6161
_bucket:replace{3, vshard.consts.BUCKET.SENDING, util.replicasets[1]}
6262
---
@@ -87,7 +87,7 @@ _ = test_run:switch('storage_2_a')
8787
...
8888
_bucket:select{}
8989
---
90-
- - [2, 'active', '<replicaset_1>']
90+
- - [2, 'active']
9191
- [3, 'sending', '<replicaset_1>']
9292
...
9393
_ = test_run:switch('storage_1_a')
@@ -129,9 +129,9 @@ _bucket:replace{1, vshard.consts.BUCKET.SENDING, util.replicasets[2]}
129129
_ = test_run:switch('storage_2_a')
130130
---
131131
...
132-
_bucket:replace{1, vshard.consts.BUCKET.ACTIVE, util.replicasets[1]}
132+
_bucket:replace{1, vshard.consts.BUCKET.ACTIVE}
133133
---
134-
- [1, 'active', '<replicaset_1>']
134+
- [1, 'active']
135135
...
136136
_ = test_run:switch('default')
137137
---
@@ -180,9 +180,9 @@ _bucket = box.space._bucket
180180
...
181181
_bucket:select{}
182182
---
183-
- - [1, 'active', '<replicaset_1>']
184-
- [2, 'active', '<replicaset_1>']
185-
- [3, 'active', '<replicaset_1>']
183+
- - [1, 'active']
184+
- [2, 'active']
185+
- [3, 'active']
186186
...
187187
--
188188
-- Test a case when a bucket is sending in one place and garbage

test/storage/recovery.test.lua

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ _bucket:replace{3, vshard.consts.BUCKET.RECEIVING, util.replicasets[2]}
2323

2424
_ = test_run:switch('storage_2_a')
2525
_bucket = box.space._bucket
26-
_bucket:replace{2, vshard.consts.BUCKET.ACTIVE, util.replicasets[1]}
26+
_bucket:replace{2, vshard.consts.BUCKET.ACTIVE}
2727
_bucket:replace{3, vshard.consts.BUCKET.SENDING, util.replicasets[1]}
2828

2929
_ = test_run:cmd('stop server storage_1_a')
@@ -54,7 +54,7 @@ while _bucket:count() ~= 2 do vshard.storage.recovery_wakeup() fiber.sleep(0.1)
5454
_ = test_run:switch('storage_1_a')
5555
_bucket:replace{1, vshard.consts.BUCKET.SENDING, util.replicasets[2]}
5656
_ = test_run:switch('storage_2_a')
57-
_bucket:replace{1, vshard.consts.BUCKET.ACTIVE, util.replicasets[1]}
57+
_bucket:replace{1, vshard.consts.BUCKET.ACTIVE}
5858
_ = test_run:switch('default')
5959
_ = test_run:cmd('stop server storage_2_a')
6060
_ = test_run:cmd('stop server storage_1_a')

test/storage/recovery_errinj.result

Lines changed: 84 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,10 +79,92 @@ _bucket:get{1}
7979
_ = test_run:switch('storage_1_a')
8080
---
8181
...
82-
fiber = require('fiber')
82+
while _bucket:count() ~= 0 do vshard.storage.recovery_wakeup() fiber.sleep(0.1) end
8383
---
8484
...
85-
while _bucket:count() ~= 0 do vshard.storage.recovery_wakeup() fiber.sleep(0.1) end
85+
--
86+
-- Test a case when a bucket was sending on a source. Then the
87+
-- master was switched. In such a case the sending will fail
88+
-- and the new master should recover the bucket.
89+
--
90+
_ = test_run:switch('storage_2_a')
91+
---
92+
...
93+
box.error.injection.set("ERRINJ_WAL_DELAY", true)
94+
---
95+
- ok
96+
...
97+
err = nil
98+
---
99+
...
100+
ok = nil
101+
---
102+
...
103+
f = fiber.create(function() ok, err = vshard.storage.bucket_send(1, util.replicasets[1]) end)
104+
---
105+
...
106+
while not vshard.storage.buckets_info(1)[1].rw_lock do fiber.sleep(0.01) end
107+
---
108+
...
109+
cfg.sharding[util.replicasets[2]].replicas[util.name_to_uuid.storage_2_a].master = false
110+
---
111+
...
112+
cfg.sharding[util.replicasets[2]].replicas[util.name_to_uuid.storage_2_b].master = true
113+
---
114+
...
115+
vshard.storage.cfg(cfg, util.name_to_uuid.storage_2_a)
116+
---
117+
...
118+
box.space._bucket:get{1}
119+
---
120+
- [1, 'sending', '<replicaset_1>']
121+
...
122+
_ = test_run:switch('storage_2_b')
123+
---
124+
...
125+
cfg.sharding[util.replicasets[2]].replicas[util.name_to_uuid.storage_2_a].master = false
126+
---
127+
...
128+
cfg.sharding[util.replicasets[2]].replicas[util.name_to_uuid.storage_2_b].master = true
129+
---
130+
...
131+
vshard.storage.cfg(cfg, util.name_to_uuid.storage_2_b)
132+
---
133+
...
134+
box.space._bucket:get{1}
135+
---
136+
- [1, 'active']
137+
...
138+
_ = test_run:switch('storage_2_a')
139+
---
140+
...
141+
box.error.injection.set("ERRINJ_WAL_DELAY", false)
142+
---
143+
- ok
144+
...
145+
while not err do fiber.sleep(0.01) end
146+
---
147+
...
148+
ok, err
149+
---
150+
- false
151+
- Can't modify data because this instance is in read-only mode.
152+
...
153+
box.space._bucket:get{1}
154+
---
155+
- [1, 'sending', '<replicaset_1>']
156+
...
157+
_ = test_run:switch('storage_2_b')
158+
---
159+
...
160+
while box.space._bucket:get{1}.status ~= vshard.consts.BUCKET.SENDING do fiber.sleep(0.01) end
161+
---
162+
...
163+
box.space._bucket:get{1}
164+
---
165+
- [1, 'sending', '<replicaset_1>']
166+
...
167+
while box.space._bucket:get{1}.status ~= vshard.consts.BUCKET.ACTIVE do vshard.storage.recovery_wakeup() fiber.sleep(0.01) end
86168
---
87169
...
88170
_ = test_run:switch("default")

test/storage/recovery_errinj.test.lua

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,40 @@ while _bucket:get{1}.status ~= vshard.consts.BUCKET.ACTIVE do fiber.sleep(0.01)
2929
_bucket:get{1}
3030

3131
_ = test_run:switch('storage_1_a')
32-
fiber = require('fiber')
3332
while _bucket:count() ~= 0 do vshard.storage.recovery_wakeup() fiber.sleep(0.1) end
3433

34+
--
35+
-- Test a case when a bucket was sending on a source. Then the
36+
-- master was switched. In such a case the sending will fail
37+
-- and the new master should recover the bucket.
38+
--
39+
_ = test_run:switch('storage_2_a')
40+
box.error.injection.set("ERRINJ_WAL_DELAY", true)
41+
err = nil
42+
ok = nil
43+
f = fiber.create(function() ok, err = vshard.storage.bucket_send(1, util.replicasets[1]) end)
44+
while not vshard.storage.buckets_info(1)[1].rw_lock do fiber.sleep(0.01) end
45+
cfg.sharding[util.replicasets[2]].replicas[util.name_to_uuid.storage_2_a].master = false
46+
cfg.sharding[util.replicasets[2]].replicas[util.name_to_uuid.storage_2_b].master = true
47+
vshard.storage.cfg(cfg, util.name_to_uuid.storage_2_a)
48+
box.space._bucket:get{1}
49+
50+
_ = test_run:switch('storage_2_b')
51+
cfg.sharding[util.replicasets[2]].replicas[util.name_to_uuid.storage_2_a].master = false
52+
cfg.sharding[util.replicasets[2]].replicas[util.name_to_uuid.storage_2_b].master = true
53+
vshard.storage.cfg(cfg, util.name_to_uuid.storage_2_b)
54+
box.space._bucket:get{1}
55+
56+
_ = test_run:switch('storage_2_a')
57+
box.error.injection.set("ERRINJ_WAL_DELAY", false)
58+
while not err do fiber.sleep(0.01) end
59+
ok, err
60+
box.space._bucket:get{1}
61+
_ = test_run:switch('storage_2_b')
62+
while box.space._bucket:get{1}.status ~= vshard.consts.BUCKET.SENDING do fiber.sleep(0.01) end
63+
box.space._bucket:get{1}
64+
while box.space._bucket:get{1}.status ~= vshard.consts.BUCKET.ACTIVE do vshard.storage.recovery_wakeup() fiber.sleep(0.01) end
65+
3566
_ = test_run:switch("default")
3667
test_run:drop_cluster(REPLICASET_2)
3768
test_run:drop_cluster(REPLICASET_1)

0 commit comments

Comments
 (0)