@@ -112,8 +112,13 @@ namespace NKikimr::NStorage {
112
112
Self->Send (new IEventHandle (TEvents::TSystem::Poison, 0 , Self->StateStorageSelfHealActor .value (), Self->SelfId (), nullptr , 0 ));
113
113
Self->StateStorageSelfHealActor .reset ();
114
114
}
115
-
116
- auto needReconfig = [&](auto name, auto clearFunc, auto ssMutableFunc, auto buildFunc) {
115
// Outcome of comparing the current ring-group layout of one state-storage
// kind with the recommended (target) layout.
// Scoped so that the generic enumerator names do not leak into the enclosing
// scope and cannot silently convert to integers.
enum class ReconfigType {
    NONE,     // layouts match, or no replacement is required
    ONE_NODE, // layouts differ by the substitution of exactly one node
    FULL      // layouts differ structurally or in more than one node
};
120
+ std::unordered_map<ui32, ui32> nodesToReplace;
121
+ auto needReconfig = [&](auto clearFunc, auto ssMutableFunc, auto buildFunc) {
117
122
auto copyCurrentConfig = currentConfig;
118
123
auto ss = *(copyCurrentConfig.*ssMutableFunc)();
119
124
if (ss.RingGroupsSize () == 0 ) {
@@ -127,26 +132,127 @@ namespace NKikimr::NStorage {
127
132
TIntrusivePtr<TStateStorageInfo> oldSSInfo;
128
133
oldSSInfo = (*buildFunc)(ss);
129
134
newSSInfo = (*buildFunc)(*(targetConfig.*ssMutableFunc)());
130
- STLOG (PRI_DEBUG, BS_NODE, NW52, " Need to reconfig " << name << " " << (oldSSInfo->RingGroups != newSSInfo->RingGroups ));
131
135
if (oldSSInfo->RingGroups == newSSInfo->RingGroups ) {
132
136
(targetConfig.*clearFunc)();
133
- return false ;
137
+ return ReconfigType::NONE ;
134
138
}
135
139
136
- return true ;
140
+ if (oldSSInfo->RingGroups .size () != newSSInfo->RingGroups .size ()) {
141
+ return ReconfigType::FULL;
142
+ }
143
+
144
+ bool hasBadNodes = false ;
145
+ for (ui32 ringGroupIdx : xrange (oldSSInfo->RingGroups .size ())) {
146
+ auto & oldRg = oldSSInfo->RingGroups [ringGroupIdx];
147
+ auto & newRg = newSSInfo->RingGroups [ringGroupIdx];
148
+ if (oldRg.NToSelect != newRg.NToSelect || oldRg.Rings .size () != newRg.Rings .size ()) {
149
+ return ReconfigType::FULL;
150
+ }
151
+ for (ui32 j : xrange (oldRg.Rings .size ())) {
152
+ auto & oldRing = oldRg.Rings [j];
153
+ auto & newRing = newRg.Rings [j];
154
+ if (oldRing.IsDisabled != newRing.IsDisabled
155
+ || oldRing.UseRingSpecificNodeSelection != newRing.UseRingSpecificNodeSelection
156
+ || oldRing.Replicas .size () != newRing.Replicas .size ()) {
157
+ return ReconfigType::FULL;
158
+ }
159
+ for (auto & actorId : oldRing.Replicas ) {
160
+ if (!Self->SelfHealNodesState .contains (actorId.NodeId ()) || Self->SelfHealNodesState .at (actorId.NodeId ()) > 0 ) {
161
+ hasBadNodes = true ;
162
+ }
163
+ }
164
+ }
165
+ }
166
+ if (!hasBadNodes) {
167
+ return ReconfigType::NONE; // Current config is optimal and all nodes are good
168
+ }
169
+
170
+ // Check whether node replacement can be applied
171
+ for (ui32 ringGroupIdx : xrange (oldSSInfo->RingGroups .size ())) {
172
+ auto & oldRg = oldSSInfo->RingGroups [ringGroupIdx];
173
+ auto & newRg = newSSInfo->RingGroups [ringGroupIdx];
174
+
175
+ // Find unchanged rings and move them back to their previous positions
176
// Two rings match "by nodes" when they hold the same number of replicas and
// the replicas at every position report the same NodeId().
auto equalRingsByNodes = [](auto& lhsRing, auto& rhsRing) {
    auto& lhsReplicas = lhsRing.Replicas;
    auto& rhsReplicas = rhsRing.Replicas;
    if (lhsReplicas.size() != rhsReplicas.size()) {
        return false;
    }
    // Sizes are equal here, so walking both sequences in lockstep is safe.
    auto rhsIt = rhsReplicas.begin();
    for (auto& lhsReplica : lhsReplicas) {
        if (lhsReplica.NodeId() != rhsIt->NodeId()) {
            return false;
        }
        ++rhsIt;
    }
    return true;
};
187
+ for (ui32 oldRingIdx : xrange (oldRg.Rings .size ())) {
188
+ for (ui32 newRingIdx : xrange (newRg.Rings .size ())) {
189
+ if (newRingIdx != oldRingIdx && equalRingsByNodes (oldRg.Rings [oldRingIdx], newRg.Rings [newRingIdx])) {
190
+ std::swap (newRg.Rings [newRingIdx], newRg.Rings [oldRingIdx]);
191
+ break ;
192
+ }
193
+ }
194
+ }
195
+ for (ui32 j : xrange (oldRg.Rings .size ())) {
196
+ auto & oldRing = oldRg.Rings [j];
197
+ auto & newRing = newRg.Rings [j];
198
+ if (oldRing == newRing) {
199
+ continue ;
200
+ }
201
+ // Move replicas within the ring back to their previous positions
202
+ for (ui32 oldReplicaPos : xrange (oldRing.Replicas .size ())) {
203
+ for (ui32 newReplicaPos : xrange (newRing.Replicas .size ())) {
204
+ if (newReplicaPos != oldReplicaPos && oldRing.Replicas [oldReplicaPos].NodeId () == newRing.Replicas [newReplicaPos].NodeId ()) {
205
+ std::swap (newRing.Replicas [newReplicaPos], newRing.Replicas [oldReplicaPos]);
206
+ break ;
207
+ }
208
+ }
209
+ }
210
+
211
+ for (ui32 k : xrange (oldRing.Replicas .size ())) {
212
+ auto oldRep = oldRing.Replicas [k].NodeId ();
213
+ auto newRep = newRing.Replicas [k].NodeId ();
214
+ if (oldRep == newRep) {
215
+ continue ;
216
+ }
217
+ if (auto it = nodesToReplace.find (oldRep); it != nodesToReplace.end () && it->second != newRep) {
218
+ return ReconfigType::FULL;
219
+ }
220
+ nodesToReplace[oldRep] = newRep;
221
+ }
222
+ }
223
+ }
224
+ if (nodesToReplace.size () == 1 ) {
225
+ return ReconfigType::ONE_NODE;
226
+ }
227
+ return nodesToReplace.empty () ? ReconfigType::NONE : ReconfigType::FULL;
137
228
};
138
- #define NEED_RECONFIG (NAME ) needReconfig(#NAME, &NKikimrBlobStorage::TStateStorageConfig::Clear##NAME##Config, &NKikimrBlobStorage::TStateStorageConfig::Mutable##NAME##Config, &NKikimr::Build##NAME##Info)
229
+ #define NEED_RECONFIG (NAME ) needReconfig(&NKikimrBlobStorage::TStateStorageConfig::Clear##NAME##Config, &NKikimrBlobStorage::TStateStorageConfig::Mutable##NAME##Config, &NKikimr::Build##NAME##Info)
139
230
auto needReconfigSS = NEED_RECONFIG (StateStorage);
140
231
auto needReconfigSSB = NEED_RECONFIG (StateStorageBoard);
141
232
auto needReconfigSB = NEED_RECONFIG (SchemeBoard);
233
+ #undef NEED_RECONFIG
142
234
143
- if (! needReconfigSS && ! needReconfigSSB && ! needReconfigSB) {
235
+ if (needReconfigSS == ReconfigType::NONE && needReconfigSSB == ReconfigType::NONE && needReconfigSB == ReconfigType::NONE ) {
144
236
throw TExError () << " Current configuration is recommended. Nothing to self-heal." ;
145
237
}
146
- #undef NEED_RECONFIG
238
+ if (nodesToReplace.size () == 1 && needReconfigSS != ReconfigType::FULL && needReconfigSSB != ReconfigType::FULL && needReconfigSB != ReconfigType::FULL) {
239
+ STLOG (PRI_DEBUG, BS_NODE, NW52, " Need to reconfig one node " << nodesToReplace.begin ()->first << " to " << nodesToReplace.begin ()->second
240
+ , (CurrentConfig, currentConfig), (TargetConfig, targetConfig));
241
+
242
+ TQuery::TReassignStateStorageNode cmd;
243
+ cmd.SetFrom (nodesToReplace.begin ()->first );
244
+ cmd.SetTo (nodesToReplace.begin ()->second );
245
+ cmd.SetStateStorage (needReconfigSS == ReconfigType::ONE_NODE);
246
+ cmd.SetStateStorageBoard (needReconfigSSB == ReconfigType::ONE_NODE);
247
+ cmd.SetSchemeBoard (needReconfigSB == ReconfigType::ONE_NODE);
248
+ ReassignStateStorageNode (cmd);
249
+ return ;
250
+ }
147
251
148
252
AdjustRingGroupActorIdOffsetInRecommendedStateStorageConfig (&targetConfig);
149
253
254
+ STLOG (PRI_DEBUG, BS_NODE, NW52, " Need to reconfig, starting StateStorageSelfHealActor" , (CurrentConfig, currentConfig), (TargetConfig, targetConfig));
255
+
150
256
Self->StateStorageSelfHealActor = Register (new TStateStorageSelfhealActor (Sender, Cookie,
151
257
TDuration::Seconds (waitForConfigStep), std::move (currentConfig), std::move (targetConfig)));
152
258
auto ev = PrepareResult (TResult::OK, std::nullopt);
0 commit comments