-
Notifications
You must be signed in to change notification settings - Fork 718
Add manual request approval to CMS #21593
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
4795def
7ed7fb5
2a60e4b
54a520f
c1f333f
635e389
8d1d7f2
ad95dd4
7f094a7
ceaa245
0c14dcc
f10660f
5ba78e4
a796f45
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -1460,6 +1460,107 @@ void TCms::RemoveRequest(TEvCms::TEvManageRequestRequest::TPtr &ev, const TActor | |||||
} | ||||||
} | ||||||
|
||||||
// Manually approves a scheduled request (the APPROVE command of TEvManageRequestRequest). | ||||||
// Steps visible in this function: | ||||||
//  1. look up the scheduled request by the RequestId taken from the record and reply | ||||||
//     WRONG_REQUEST if it is unknown; | ||||||
//  2. build a TEvPermissionResponse with status ALLOW and one permission per action, | ||||||
//     replying WRONG_REQUEST if any item found for an action is reported as locked; | ||||||
//  3. call AcceptPermissions, register a helper actor and execute the store-permissions | ||||||
//     transaction, passing it an event handle addressed to that helper actor. | ||||||
void TCms::ManuallyApproveRequest(TEvCms::TEvManageRequestRequest::TPtr &ev, const TActorContext &ctx) | ||||||
{ | ||||||
// This actor waits for permission response and then sends manage request response | ||||||
// with approved permissions to the sender of the request while also removing scheduled request. | ||||||
// NOTE(review): nothing in this actor removes the scheduled request; presumably the | ||||||
// store-permissions transaction does that - confirm. | ||||||
class TRequestApproveActor : public TActor<TRequestApproveActor> { | ||||||
public: | ||||||
using TBase = TActor<TRequestApproveActor>; | ||||||
// NOTE(review): RequestId and State are stored but not read by Handle/StateWork below. | ||||||
const TString RequestId; | ||||||
const TCmsStatePtr State; | ||||||
// Recipient of the final TEvManageRequestResponse. | ||||||
const TActorId SendTo; | ||||||
|
||||||
TRequestApproveActor(TString requestId, TCmsStatePtr state, TActorId sendTo) | ||||||
: TBase(&TRequestApproveActor::StateWork) | ||||||
, RequestId(std::move(requestId)) | ||||||
, State(std::move(state)) | ||||||
, SendTo(sendTo) | ||||||
{} | ||||||
|
||||||
// Converts the permission response into a manage-request response, sends it to SendTo | ||||||
// with the incoming cookie, and terminates the actor. | ||||||
void Handle(TEvCms::TEvPermissionResponse::TPtr &ev) { | ||||||
auto resp = ev->Get(); | ||||||
|
||||||
const NKikimrCms::TStatus status = resp->Record.GetStatus(); | ||||||
|
||||||
THolder<TEvCms::TEvManageRequestResponse> manageResponse = MakeHolder<TEvCms::TEvManageRequestResponse>(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Не нужно повторять, а читаемость только улучшается. |
||||||
|
||||||
// Any status other than ALLOW is forwarded as is (code and reason), without permissions. | ||||||
if (status.GetCode() != TStatus::ALLOW) { | ||||||
manageResponse->Record.MutableStatus()->SetCode(status.GetCode()); | ||||||
manageResponse->Record.MutableStatus()->SetReason(status.GetReason()); | ||||||
Send(SendTo, std::move(manageResponse), 0, ev->Cookie); | ||||||
PassAway(); | ||||||
return; | ||||||
} | ||||||
|
||||||
// ALLOW is reported to the requester as OK, with every permission copied over. | ||||||
manageResponse->Record.MutableStatus()->SetCode(TStatus::OK); | ||||||
for (auto& permission : resp->Record.permissions()) { | ||||||
manageResponse->Record.AddManuallyApprovedPermissions()->CopyFrom(permission); | ||||||
} | ||||||
|
||||||
Send(SendTo, std::move(manageResponse), 0, ev->Cookie); | ||||||
|
||||||
PassAway(); | ||||||
} | ||||||
|
||||||
// Only TEvPermissionResponse is handled; any other event type is logged as an error. | ||||||
STFUNC(StateWork) { | ||||||
SammyVimes marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
switch (ev->GetTypeRewrite()) { | ||||||
hFunc(TEvCms::TEvPermissionResponse, Handle); | ||||||
default: | ||||||
LOG_ERROR_S(*TlsActivationContext, NKikimrServices::CMS, | ||||||
"Unexpected event type: " << ev->GetTypeName()); | ||||||
break; | ||||||
} | ||||||
} | ||||||
}; | ||||||
|
||||||
auto &rec = ev->Get()->Record; | ||||||
|
||||||
TString requestId = rec.GetRequestId(); | ||||||
|
||||||
// Find the scheduled request by RequestId | ||||||
auto it = State->ScheduledRequests.find(requestId); | ||||||
if (it == State->ScheduledRequests.end()) { | ||||||
return ReplyWithError<TEvCms::TEvManageRequestResponse>( | ||||||
ev, TStatus::WRONG_REQUEST, "Unknown request for manual approval", ctx); | ||||||
} | ||||||
|
||||||
// Work on a copy of the scheduled request; the copy is later moved into the transaction. | ||||||
THolder<TRequestInfo> copy = MakeHolder<TRequestInfo>(it->second); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
То же самое. |
||||||
|
||||||
// Create a permission for each action in the scheduled request | ||||||
THolder<TEvCms::TEvPermissionResponse> resp = MakeHolder<TEvCms::TEvPermissionResponse>(); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
resp->Record.MutableStatus()->SetCode(TStatus::ALLOW); | ||||||
for (const auto& action : copy->Request.GetActions()) { | ||||||
auto items = ClusterInfo->FindLockedItems(action, &ctx); | ||||||
for (const auto& item : items) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Тут двойной indirection, т.к. |
||||||
TErrorInfo error; | ||||||
// The lock check covers the action duration plus the request duration. | ||||||
TDuration duration = TDuration::MicroSeconds(action.GetDuration()); | ||||||
duration += TDuration::MicroSeconds(copy->Request.GetDuration()); | ||||||
// To get permissions ASAP and not in the priority order. | ||||||
// Scheduled locks are deactivated only around the IsLocked call and reactivated right after it. | ||||||
item->DeactivateScheduledLocks(Min<i32>()); | ||||||
bool isLocked = item->IsLocked(error, State->Config.DefaultRetryTime, TActivationContext::Now(), duration); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
item->ReactivateScheduledLocks(); | ||||||
// A single locked item aborts the whole approval; nothing has been stored at this point. | ||||||
if (isLocked) { | ||||||
return ReplyWithError<TEvCms::TEvManageRequestResponse>( | ||||||
ev, TStatus::WRONG_REQUEST, "Request has already locked items: " + error.Reason.GetMessage(), ctx); | ||||||
} | ||||||
} | ||||||
|
||||||
auto* perm = resp->Record.AddPermissions(); | ||||||
perm->MutableAction()->CopyFrom(action); | ||||||
// NOTE(review): the deadline uses only the request duration, while the lock check above | ||||||
// also adds the action duration - confirm this difference is intended. | ||||||
TInstant deadline = TActivationContext::Now() + TDuration::MicroSeconds(copy->Request.GetDuration()); | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Там, откуда ты скопировал |
||||||
perm->SetDeadline(deadline.GetValue()); | ||||||
} | ||||||
|
||||||
// NOTE(review): the meaning of the trailing `true` argument is not visible here - confirm | ||||||
// against the AcceptPermissions declaration. | ||||||
AcceptPermissions(resp->Record, rec.GetRequestId(), rec.GetUser(), ctx, true); | ||||||
SammyVimes marked this conversation as resolved.
Show resolved
Hide resolved
SammyVimes marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
|
||||||
auto actor = new TRequestApproveActor(requestId, State, ev->Sender); | ||||||
TActorId approveActorId = ctx.RegisterWithSameMailbox(actor); | ||||||
|
||||||
// The permission response is addressed to the helper actor (sender is this actor, cookie is | ||||||
// preserved); presumably the transaction sends this handle once it completes - confirm. | ||||||
auto handle = new IEventHandle(approveActorId, SelfId(), resp.Release(), 0, ev->Cookie); | ||||||
// NOTE(review): `rec` refers to the record inside `ev`, which is released in this same call | ||||||
// expression; rec.GetUser() relies on the released event object staying alive - confirm. | ||||||
Execute(CreateTxStorePermissions(std::move(ev->Release()), handle, rec.GetUser(), std::move(copy)), ctx); | ||||||
SammyVimes marked this conversation as resolved.
Show resolved
Hide resolved
SammyVimes marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||
} | ||||||
|
||||||
void TCms::GetNotifications(TEvCms::TEvManageNotificationRequest::TPtr &ev, bool all, | ||||||
const TActorContext &ctx) | ||||||
{ | ||||||
|
@@ -1856,6 +1957,10 @@ void TCms::Handle(TEvCms::TEvManageRequestRequest::TPtr &ev, const TActorContext | |||||
case TManageRequestRequest::REJECT: | ||||||
RemoveRequest(ev, ctx); | ||||||
return; | ||||||
case TManageRequestRequest::APPROVE: { | ||||||
ManuallyApproveRequest(ev, ctx); | ||||||
return; | ||||||
} | ||||||
default: | ||||||
return ReplyWithError<TEvCms::TEvManageRequestResponse>( | ||||||
ev, TStatus::WRONG_REQUEST, "Unknown command", ctx); | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -521,6 +521,157 @@ Y_UNIT_TEST_SUITE(TCmsTest) { | |
); | ||
} | ||
|
||
// Exercises the manual approval flow end to end:
//  1. a SHUTDOWN_HOST request that CMS answers with DISALLOW_TEMP is approved by hand;
//  2. the manually approved permission can be fetched by its id and a repeated
//     request for the same node is still answered with DISALLOW_TEMP;
//  3. after the clock is advanced, one more request triggers cleanup, and neither
//     the approved request nor its permission is reported any more.
Y_UNIT_TEST(ManualRequestApproval)
{
    auto opts = TTestEnvOpts(8, 8).WithSentinel().WithDynamicGroups();
    TCmsTestEnv env(opts);

    // Disconnect 3 nodes, in this case locking is not allowed
    for (ui32 i = 0; i < 3; ++i) {
        auto& node = TFakeNodeWhiteboardService::Info[env.GetNodeId(i)];
        node.Connected = false;
    }
    env.RegenerateBSConfig(TFakeNodeWhiteboardService::Config.MutableResponse()->MutableStatus(0)->MutableBaseConfig(), opts);

    auto req = MakePermissionRequest(
        TRequestOptions("user", false, false, true),
        MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)
    );
    req->Record.SetAvailabilityMode(NKikimrCms::EAvailabilityMode::MODE_MAX_AVAILABILITY);

    // Request that cannot be fulfilled
    auto rec1 = env.CheckPermissionRequest(req, TStatus::DISALLOW_TEMP);

    // Keep the id of the request being approved; the cleanup check below compares against it.
    const auto rid1 = rec1.GetRequestId();

    // Manual approval
    auto approveResp = env.CheckApproveRequest("user", rid1, false, TStatus::OK);
    UNIT_ASSERT_VALUES_EQUAL(approveResp.ManuallyApprovedPermissionsSize(), 1);
    TString permissionId = approveResp.GetManuallyApprovedPermissions(0).GetId();
    auto rec2 = env.CheckGetPermission("user", permissionId);
    UNIT_ASSERT_VALUES_EQUAL(rec2.PermissionsSize(), 1);
    UNIT_ASSERT_VALUES_EQUAL(rec2.GetPermissions(0).GetId(), permissionId);

    {
        // Request cannot be fulfilled, since node 0 is locked by previously approved request
        auto repeatedReq = MakePermissionRequest(
            TRequestOptions("user", false, false, true),
            MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)
        );
        repeatedReq->Record.SetAvailabilityMode(NKikimrCms::EAvailabilityMode::MODE_MAX_AVAILABILITY);

        env.CheckPermissionRequest(repeatedReq, TStatus::DISALLOW_TEMP);
    }

    env.AdvanceCurrentTime(TDuration::Minutes(30));

    {
        // This will trigger CMS cleanup.
        env.CheckPermissionRequest(MakePermissionRequest(
            TRequestOptions("user", false, false, true),
            MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)
        ), TStatus::DISALLOW_TEMP);

        // The manually approved request must no longer be listed. Compare against the id
        // actually returned above rather than a hard-coded literal, and iterate the
        // repeated field in place instead of copying it.
        auto list = env.CheckListRequests("user", 2);
        for (const auto& scheduled : list.GetRequests()) {
            UNIT_ASSERT_VALUES_UNEQUAL(scheduled.GetRequestId(), rid1);
        }

        // Check that manually approved permission was cleaned up
        env.CheckGetPermission("user", permissionId, false, TStatus::WRONG_REQUEST);
    }
}
|
||
// Manually approves a SHUTDOWN_HOST request for each of the node indices 0..7 in
// turn, after node 0 has been marked as disconnected. Every request is first
// answered with DISALLOW_TEMP and must then yield exactly one manually approved
// permission that can be fetched back by its id.
Y_UNIT_TEST(ManualRequestApprovalLockingAllNodes)
{
    auto opts = TTestEnvOpts(8, 8).WithSentinel().WithDynamicGroups();
    TCmsTestEnv env(opts);

    TFakeNodeWhiteboardService::Info[env.GetNodeId(0)].Connected = false;
    auto* baseConfig = TFakeNodeWhiteboardService::Config.MutableResponse()->MutableStatus(0)->MutableBaseConfig();
    env.RegenerateBSConfig(baseConfig, opts);

    for (ui32 nodeIdx = 0; nodeIdx != 8; ++nodeIdx) {
        auto shutdownReq = MakePermissionRequest(
            TRequestOptions("user", false, false, true),
            MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(nodeIdx), 60000000)
        );
        shutdownReq->Record.SetAvailabilityMode(NKikimrCms::EAvailabilityMode::MODE_MAX_AVAILABILITY);

        // Request that cannot be fulfilled
        auto denied = env.CheckPermissionRequest(shutdownReq, TStatus::DISALLOW_TEMP);

        // Manual approval of the request that has just been turned down
        auto approveResp = env.CheckApproveRequest("user", denied.GetRequestId(), false, TStatus::OK);
        UNIT_ASSERT_VALUES_EQUAL(approveResp.ManuallyApprovedPermissionsSize(), 1);

        // The approved permission must be retrievable under the id that was handed out
        const TString permissionId = approveResp.GetManuallyApprovedPermissions(0).GetId();
        auto fetched = env.CheckGetPermission("user", permissionId);
        UNIT_ASSERT_VALUES_EQUAL(fetched.PermissionsSize(), 1);
        UNIT_ASSERT_VALUES_EQUAL(fetched.GetPermissions(0).GetId(), permissionId);
    }
}
|
||
// A three-action SHUTDOWN_HOST request gets ALLOW_PARTIAL with a single
// permission; manually approving the same request id must then return two
// manually approved permissions, each retrievable by its id.
Y_UNIT_TEST(ManualRequestApprovalWithPartialAlreadyApproved)
{
    auto opts = TTestEnvOpts(8, 8).WithSentinel().WithDynamicGroups();
    TCmsTestEnv env(opts);

    auto multiNodeReq = MakePermissionRequest(
        TRequestOptions("user", true, false, true),
        MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000),
        MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000),
        MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(2), 60000000)
    );
    multiNodeReq->Record.SetAvailabilityMode(NKikimrCms::EAvailabilityMode::MODE_MAX_AVAILABILITY);

    // Only one of the three actions is granted by CMS itself
    auto partial = env.CheckPermissionRequest(multiNodeReq, TStatus::ALLOW_PARTIAL);
    UNIT_ASSERT_VALUES_EQUAL(partial.PermissionsSize(), 1);

    const auto& grantedId = partial.GetPermissions(0).GetId();
    auto granted = env.CheckGetPermission("user", grantedId);
    UNIT_ASSERT_VALUES_EQUAL(granted.PermissionsSize(), 1);
    UNIT_ASSERT_VALUES_EQUAL(granted.GetPermissions(0).GetId(), grantedId);

    // Manual approval
    auto approveResp = env.CheckApproveRequest("user", partial.GetRequestId(), false, TStatus::OK);
    UNIT_ASSERT_VALUES_EQUAL(approveResp.ManuallyApprovedPermissionsSize(), 2);

    // Each manually approved permission must be retrievable on its own
    for (const auto& approved : approveResp.GetManuallyApprovedPermissions()) {
        auto permissionId = approved.GetId();
        auto fetched = env.CheckGetPermission("user", permissionId);
        UNIT_ASSERT_VALUES_EQUAL(fetched.PermissionsSize(), 1);
        UNIT_ASSERT_VALUES_EQUAL(fetched.GetPermissions(0).GetId(), permissionId);
    }
}
|
||
// Checks that manual approval is refused with WRONG_REQUEST when the request | ||
// targets a node for which a permission has already been granted. | ||
Y_UNIT_TEST(ManualRequestApprovalAlreadyLockedNode) | ||
{ | ||
auto opts = TTestEnvOpts(8, 8).WithSentinel().WithDynamicGroups(); | ||
TCmsTestEnv env(opts); | ||
|
||
auto req = MakePermissionRequest( | ||
TRequestOptions("user", true, false, true), | ||
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000) | ||
); | ||
req->Record.SetAvailabilityMode(NKikimrCms::EAvailabilityMode::MODE_MAX_AVAILABILITY); | ||
|
||
// The first request for node 0 is expected to be allowed. | ||
env.CheckPermissionRequest(req, TStatus::ALLOW); | ||
|
||
auto req2 = MakePermissionRequest( | ||
TRequestOptions("user", true, false, true), | ||
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000) | ||
); | ||
req2->Record.SetAvailabilityMode(NKikimrCms::EAvailabilityMode::MODE_MAX_AVAILABILITY); | ||
// An identical second request for node 0 is expected to be turned down; its id is used for the approval attempt. | ||
auto rec2 = env.CheckPermissionRequest(req2, TStatus::DISALLOW_TEMP); | ||
auto rid2 = rec2.GetRequestId(); | ||
|
||
// Manual approval should fail since node 0 is already locked | ||
env.CheckApproveRequest("user", rid2, false, TStatus::WRONG_REQUEST); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. А что если допустить, что в таком случае мы выдаем лок? Кажется, что это было бы полезно. Представь, что в кластере 2 из 100 хостов забраны в обслуживание и на них висят локи. При запуске роллинг-рестарта, в конце концов он упрется в эти 2 хоста, и можно было бы так "подопнуть" роллинг, выдав ему локи, он быстро порестартит и удалит свои пермишены. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Согласен, но я бы это тоже отдельным тикетом делал, т.к. сейчас CMS совсем не рассчитывает на несколько локов на одну сущность |
||
} | ||
|
||
Y_UNIT_TEST(RequestReplacePDiskDoesntBreakGroup) | ||
{ | ||
auto opts = TTestEnvOpts(8, 2).WithSentinel().WithDynamicGroups(); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -213,6 +213,7 @@ message TManageRequestRequest { | |
LIST = 1; | ||
GET = 2; | ||
REJECT = 3; | ||
APPROVE = 4; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Хотелось бы еще поддержать такую возможность в публичном Maintenance API, которое использует ydbops. Уверен, что скоро вы будете им пользоваться, потому что ваша команда его и разрабатывает :) Но можно это сделать отдельной задачей. |
||
} | ||
|
||
optional string User = 1; | ||
|
@@ -234,6 +235,7 @@ message TManageRequestResponse { | |
|
||
optional TStatus Status = 1; | ||
repeated TScheduledRequest Requests = 2; | ||
repeated TPermission ManuallyApprovedPermissions = 3; | ||
} | ||
|
||
message TCheckRequest { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.