Skip to content
4 changes: 2 additions & 2 deletions ydb/core/cms/cluster_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -903,6 +903,8 @@ class TClusterInfo : public TThrRefBase {
}

ui64 AddExternalLocks(const TNotificationInfo &notification, const TActorContext *ctx);

TSet<TLockableItem *> FindLockedItems(const NKikimrCms::TAction &action, const TActorContext *ctx);

void SetHostMarkers(const TString &hostName, const THashSet<NKikimrCms::EMarker> &markers);
void ResetHostMarkers(const TString &hostName);
Expand Down Expand Up @@ -1018,8 +1020,6 @@ class TClusterInfo : public TThrRefBase {
return TPDiskID();
}

TSet<TLockableItem *> FindLockedItems(const NKikimrCms::TAction &action, const TActorContext *ctx);

TNodes Nodes;
TTablets Tablets;
TPDisks PDisks;
Expand Down
105 changes: 105 additions & 0 deletions ydb/core/cms/cms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1460,6 +1460,107 @@ void TCms::RemoveRequest(TEvCms::TEvManageRequestRequest::TPtr &ev, const TActor
}
}

void TCms::ManuallyApproveRequest(TEvCms::TEvManageRequestRequest::TPtr &ev, const TActorContext &ctx)
{
// This actor waits for permission response and then sends manage request response
// with approved permissions to the sender of the request while also removing scheduled request.
class TRequestApproveActor : public TActor<TRequestApproveActor> {
public:
using TBase = TActor<TRequestApproveActor>;
const TString RequestId;
const TCmsStatePtr State;
const TActorId SendTo;

TRequestApproveActor(TString requestId, TCmsStatePtr state, TActorId sendTo)
: TBase(&TRequestApproveActor::StateWork)
, RequestId(std::move(requestId))
, State(std::move(state))
, SendTo(sendTo)
{}

void Handle(TEvCms::TEvPermissionResponse::TPtr &ev) {
auto resp = ev->Get();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
auto resp = ev->Get();
const auto& resp = ev->Get();


const NKikimrCms::TStatus status = resp->Record.GetStatus();

THolder<TEvCms::TEvManageRequestResponse> manageResponse = MakeHolder<TEvCms::TEvManageRequestResponse>();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
THolder<TEvCms::TEvManageRequestResponse> manageResponse = MakeHolder<TEvCms::TEvManageRequestResponse>();
auto manageResponse = MakeHolder<TEvCms::TEvManageRequestResponse>();

Не нужно повторять, а читаемость только улучшается.


if (status.GetCode() != TStatus::ALLOW) {
manageResponse->Record.MutableStatus()->SetCode(status.GetCode());
manageResponse->Record.MutableStatus()->SetReason(status.GetReason());
Send(SendTo, std::move(manageResponse), 0, ev->Cookie);
PassAway();
return;
}

manageResponse->Record.MutableStatus()->SetCode(TStatus::OK);
for (auto& permission : resp->Record.permissions()) {
manageResponse->Record.AddManuallyApprovedPermissions()->CopyFrom(permission);
}

Send(SendTo, std::move(manageResponse), 0, ev->Cookie);

PassAway();
}

STFUNC(StateWork) {
switch (ev->GetTypeRewrite()) {
hFunc(TEvCms::TEvPermissionResponse, Handle);
default:
LOG_ERROR_S(*TlsActivationContext, NKikimrServices::CMS,
"Unexpected event type: " << ev->GetTypeName());
break;
}
}
};

auto &rec = ev->Get()->Record;

TString requestId = rec.GetRequestId();

// Find the scheduled request by RequestId
auto it = State->ScheduledRequests.find(requestId);
if (it == State->ScheduledRequests.end()) {
return ReplyWithError<TEvCms::TEvManageRequestResponse>(
ev, TStatus::WRONG_REQUEST, "Unknown request for manual approval", ctx);
}

THolder<TRequestInfo> copy = MakeHolder<TRequestInfo>(it->second);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
THolder<TRequestInfo> copy = MakeHolder<TRequestInfo>(it->second);
auto copy = MakeHolder<TRequestInfo>(it->second);

То же самое.


// Create a permission for each action in the scheduled request
THolder<TEvCms::TEvPermissionResponse> resp = MakeHolder<TEvCms::TEvPermissionResponse>();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
THolder<TEvCms::TEvPermissionResponse> resp = MakeHolder<TEvCms::TEvPermissionResponse>();
auto resp = MakeHolder<TEvCms::TEvPermissionResponse>();

resp->Record.MutableStatus()->SetCode(TStatus::ALLOW);
for (const auto& action : copy->Request.GetActions()) {
auto items = ClusterInfo->FindLockedItems(action, &ctx);
for (const auto& item : items) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Тут двойной indirection, т.к. item — это указатель.

TErrorInfo error;
TDuration duration = TDuration::MicroSeconds(action.GetDuration());
duration += TDuration::MicroSeconds(copy->Request.GetDuration());
// To get permissions ASAP and not in the priority order.
item->DeactivateScheduledLocks(Min<i32>());
bool isLocked = item->IsLocked(error, State->Config.DefaultRetryTime, TActivationContext::Now(), duration);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
bool isLocked = item->IsLocked(error, State->Config.DefaultRetryTime, TActivationContext::Now(), duration);
const bool isLocked = item->IsLocked(error, State->Config.DefaultRetryTime, ctx.Now(), duration);

item->ReactivateScheduledLocks();
if (isLocked) {
return ReplyWithError<TEvCms::TEvManageRequestResponse>(
ev, TStatus::WRONG_REQUEST, "Request has already locked items: " + error.Reason.GetMessage(), ctx);
}
}

auto* perm = resp->Record.AddPermissions();
perm->MutableAction()->CopyFrom(action);
TInstant deadline = TActivationContext::Now() + TDuration::MicroSeconds(copy->Request.GetDuration());
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
TInstant deadline = TActivationContext::Now() + TDuration::MicroSeconds(copy->Request.GetDuration());
const TInstant deadline = ctx.Now() + TDuration::MicroSeconds(copy->Request.GetDuration());

Там, откуда ты скопировал TActivationContext::Now() нет TActorContext, а тут есть.

perm->SetDeadline(deadline.GetValue());
}

AcceptPermissions(resp->Record, rec.GetRequestId(), rec.GetUser(), ctx, true);

auto actor = new TRequestApproveActor(requestId, State, ev->Sender);
TActorId approveActorId = ctx.RegisterWithSameMailbox(actor);

auto handle = new IEventHandle(approveActorId, SelfId(), resp.Release(), 0, ev->Cookie);
Execute(CreateTxStorePermissions(std::move(ev->Release()), handle, rec.GetUser(), std::move(copy)), ctx);
}

void TCms::GetNotifications(TEvCms::TEvManageNotificationRequest::TPtr &ev, bool all,
const TActorContext &ctx)
{
Expand Down Expand Up @@ -1856,6 +1957,10 @@ void TCms::Handle(TEvCms::TEvManageRequestRequest::TPtr &ev, const TActorContext
case TManageRequestRequest::REJECT:
RemoveRequest(ev, ctx);
return;
case TManageRequestRequest::APPROVE: {
ManuallyApproveRequest(ev, ctx);
return;
}
default:
return ReplyWithError<TEvCms::TEvManageRequestResponse>(
ev, TStatus::WRONG_REQUEST, "Unknown command", ctx);
Expand Down
1 change: 1 addition & 0 deletions ydb/core/cms/cms_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ class TCms : public TActor<TCms>, public TTabletExecutedFlat {
void RemovePermission(TEvCms::TEvManagePermissionRequest::TPtr &ev, bool done, const TActorContext &ctx);
void GetRequest(TEvCms::TEvManageRequestRequest::TPtr &ev, bool all, const TActorContext &ctx);
void RemoveRequest(TEvCms::TEvManageRequestRequest::TPtr &ev, const TActorContext &ctx);
void ManuallyApproveRequest(TEvCms::TEvManageRequestRequest::TPtr &ev, const TActorContext &ctx);
void GetNotifications(TEvCms::TEvManageNotificationRequest::TPtr &ev, bool all, const TActorContext &ctx);
bool RemoveNotification(const TString &id, const TString &user, bool remove, TErrorInfo &error);

Expand Down
151 changes: 151 additions & 0 deletions ydb/core/cms/cms_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,157 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
);
}

Y_UNIT_TEST(ManualRequestApproval)
{
auto opts = TTestEnvOpts(8, 8).WithSentinel().WithDynamicGroups();
TCmsTestEnv env(opts);

// Disconnect 3 nodes, in this case locking is not allowed
for (ui32 i = 0; i < 3; i++) {
auto& node = TFakeNodeWhiteboardService::Info[env.GetNodeId(i)];
node.Connected = false;
}
env.RegenerateBSConfig(TFakeNodeWhiteboardService::Config.MutableResponse()->MutableStatus(0)->MutableBaseConfig(), opts);

auto req = MakePermissionRequest(
TRequestOptions("user", false, false, true),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)
);
req->Record.SetAvailabilityMode(NKikimrCms::EAvailabilityMode::MODE_MAX_AVAILABILITY);

// Request that cannot be fulfilled
auto rec1 = env.CheckPermissionRequest(req, TStatus::DISALLOW_TEMP);

auto rid1 = rec1.GetRequestId();

// Manual approval
auto approveResp = env.CheckApproveRequest("user", rid1, false, TStatus::OK);
UNIT_ASSERT_VALUES_EQUAL(approveResp.ManuallyApprovedPermissionsSize(), 1);
TString permissionId = approveResp.GetManuallyApprovedPermissions(0).GetId();
auto rec2 = env.CheckGetPermission("user", permissionId);
UNIT_ASSERT_VALUES_EQUAL(rec2.PermissionsSize(), 1);
UNIT_ASSERT_VALUES_EQUAL(rec2.GetPermissions(0).GetId(), permissionId);

{
// Request cannot be fulfilled, since node 0 is locked by previously approved request
auto req = MakePermissionRequest(
TRequestOptions("user", false, false, true),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)
);
req->Record.SetAvailabilityMode(NKikimrCms::EAvailabilityMode::MODE_MAX_AVAILABILITY);

env.CheckPermissionRequest(req, TStatus::DISALLOW_TEMP);
}

env.AdvanceCurrentTime(TDuration::Minutes(30));

{
// This will trigger CMS cleanup.
env.CheckPermissionRequest(MakePermissionRequest(
TRequestOptions("user", false, false, true),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)
), TStatus::DISALLOW_TEMP);

auto list = env.CheckListRequests("user", 2);
auto reqs = list.GetRequests();
for (const auto& req : reqs) {
UNIT_ASSERT_VALUES_UNEQUAL(req.GetRequestId(), "user-r-1");
}

// Check that manually approved permission was cleaned up
env.CheckGetPermission("user", permissionId, false, TStatus::WRONG_REQUEST);
}
}

Y_UNIT_TEST(ManualRequestApprovalLockingAllNodes)
{
auto opts = TTestEnvOpts(8, 8).WithSentinel().WithDynamicGroups();
TCmsTestEnv env(opts);

auto& node = TFakeNodeWhiteboardService::Info[env.GetNodeId(0)];
node.Connected = false;
env.RegenerateBSConfig(TFakeNodeWhiteboardService::Config.MutableResponse()->MutableStatus(0)->MutableBaseConfig(), opts);

for (ui32 i = 0; i < 8; i++) {
auto req = MakePermissionRequest(
TRequestOptions("user", false, false, true),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(i), 60000000)
);
req->Record.SetAvailabilityMode(NKikimrCms::EAvailabilityMode::MODE_MAX_AVAILABILITY);

// Request that cannot be fulfilled
auto rec1 = env.CheckPermissionRequest(req, TStatus::DISALLOW_TEMP);

auto rid1 = rec1.GetRequestId();

// Manual approval
auto approveResp = env.CheckApproveRequest("user", rid1, false, TStatus::OK);
UNIT_ASSERT_VALUES_EQUAL(approveResp.ManuallyApprovedPermissionsSize(), 1);
TString permissionId = approveResp.GetManuallyApprovedPermissions(0).GetId();
auto rec2 = env.CheckGetPermission("user", permissionId);
UNIT_ASSERT_VALUES_EQUAL(rec2.PermissionsSize(), 1);
UNIT_ASSERT_VALUES_EQUAL(rec2.GetPermissions(0).GetId(), permissionId);
}
}

Y_UNIT_TEST(ManualRequestApprovalWithPartialAlreadyApproved)
{
auto opts = TTestEnvOpts(8, 8).WithSentinel().WithDynamicGroups();
TCmsTestEnv env(opts);

auto req = MakePermissionRequest(
TRequestOptions("user", true, false, true),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(1), 60000000),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(2), 60000000)
);

req->Record.SetAvailabilityMode(NKikimrCms::EAvailabilityMode::MODE_MAX_AVAILABILITY);

auto rec1 = env.CheckPermissionRequest(req, TStatus::ALLOW_PARTIAL);
UNIT_ASSERT_VALUES_EQUAL(rec1.PermissionsSize(), 1);
auto rec2 = env.CheckGetPermission("user", rec1.GetPermissions(0).GetId());
UNIT_ASSERT_VALUES_EQUAL(rec2.PermissionsSize(), 1);
UNIT_ASSERT_VALUES_EQUAL(rec2.GetPermissions(0).GetId(), rec1.GetPermissions(0).GetId());

auto rid1 = rec1.GetRequestId();

// // Manual approval
auto approveResp = env.CheckApproveRequest("user", rid1, false, TStatus::OK);
UNIT_ASSERT_VALUES_EQUAL(approveResp.ManuallyApprovedPermissionsSize(), 2);
for (const auto& permission : approveResp.GetManuallyApprovedPermissions()) {
auto permissionId = permission.GetId();
auto rec3 = env.CheckGetPermission("user", permissionId);
UNIT_ASSERT_VALUES_EQUAL(rec3.PermissionsSize(), 1);
UNIT_ASSERT_VALUES_EQUAL(rec3.GetPermissions(0).GetId(), permissionId);
}
}

Y_UNIT_TEST(ManualRequestApprovalAlreadyLockedNode)
{
auto opts = TTestEnvOpts(8, 8).WithSentinel().WithDynamicGroups();
TCmsTestEnv env(opts);

auto req = MakePermissionRequest(
TRequestOptions("user", true, false, true),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)
);
req->Record.SetAvailabilityMode(NKikimrCms::EAvailabilityMode::MODE_MAX_AVAILABILITY);

env.CheckPermissionRequest(req, TStatus::ALLOW);

auto req2 = MakePermissionRequest(
TRequestOptions("user", true, false, true),
MakeAction(TAction::SHUTDOWN_HOST, env.GetNodeId(0), 60000000)
);
req2->Record.SetAvailabilityMode(NKikimrCms::EAvailabilityMode::MODE_MAX_AVAILABILITY);
auto rec2 = env.CheckPermissionRequest(req2, TStatus::DISALLOW_TEMP);
auto rid2 = rec2.GetRequestId();

// Manual approval should fail since node 0 is already locked
env.CheckApproveRequest("user", rid2, false, TStatus::WRONG_REQUEST);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

А что если допустить, что в таком случае мы выдаем лок? Кажется, что это было бы полезно.

Представь, что в кластере 2 из 100 хостов забраны в обслуживание и на них висят локи. При запуске роллинг-рестарта, в конце концов он упрется в эти 2 хоста, и можно было бы так "подопнуть" роллинг, выдав ему локи, он быстро порестартит и удалит свои пермишены.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Согласен, но я бы это тоже отдельным тикетом делал, т.к. сейчас CMS совсем не рассчитывает на несколько локов на одну сущность

}

Y_UNIT_TEST(RequestReplacePDiskDoesntBreakGroup)
{
auto opts = TTestEnvOpts(8, 2).WithSentinel().WithDynamicGroups();
Expand Down
10 changes: 10 additions & 0 deletions ydb/core/cms/cms_ut_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -963,6 +963,16 @@ TCmsTestEnv::CheckListRequests(const TString &user,
return rec;
}

NKikimrCms::TManageRequestResponse
TCmsTestEnv::CheckApproveRequest(const TString &user,
const TString &id,
bool dry,
NKikimrCms::TStatus::ECode code)
{
auto req = MakeManageRequestRequest(user, TManageRequestRequest::APPROVE, id, dry);
return CheckManageRequestRequest(req, code);
}

NKikimrCms::TPermissionResponse
TCmsTestEnv::CheckRequest(const TString &user,
TString id,
Expand Down
6 changes: 6 additions & 0 deletions ydb/core/cms/cms_ut_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -336,6 +336,12 @@ class TCmsTestEnv : public TTestBasicRuntime {
bool dry = false,
NKikimrCms::TStatus::ECode code = NKikimrCms::TStatus::OK);
NKikimrCms::TManageRequestResponse CheckListRequests(const TString &user, ui64 count);
NKikimrCms::TManageRequestResponse CheckApproveRequest(
const TString &user,
const TString &id,
bool dry = false,
NKikimrCms::TStatus::ECode code = NKikimrCms::TStatus::OK
);

NKikimrCms::TPermissionResponse CheckRequest(
const TString &user,
Expand Down
11 changes: 11 additions & 0 deletions ydb/core/driver_lib/cli_utils/cli_cmds_cms.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,16 @@ class TClientCommandRejectRequest : public TClientCommandManageRequest
}
};

class TClientCommandApproveRequest : public TClientCommandManageRequest
{
public:
TClientCommandApproveRequest()
: TClientCommandManageRequest("approve", {}, "Approve scheduled request",
NKikimrCms::TManageRequestRequest::APPROVE, true)
{
}
};

class TClientCommandCheckRequest : public TCmsClientCommand
{
public:
Expand Down Expand Up @@ -589,6 +599,7 @@ class TClientCommandRequest : public TClientCommandTree
AddCommand(std::make_unique<TClientCommandGetRequest>());
AddCommand(std::make_unique<TClientCommandListRequest>());
AddCommand(std::make_unique<TClientCommandRejectRequest>());
AddCommand(std::make_unique<TClientCommandApproveRequest>());
AddCommand(std::make_unique<TClientCommandCheckRequest>());
}
};
Expand Down
2 changes: 2 additions & 0 deletions ydb/core/protos/cms.proto
Original file line number Diff line number Diff line change
Expand Up @@ -213,6 +213,7 @@ message TManageRequestRequest {
LIST = 1;
GET = 2;
REJECT = 3;
APPROVE = 4;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Хотелось бы еще поддержать такую возможность в публичном Maintenance API, которое использует ydbops. Уверен, что скоро вы будете им пользоваться, потому что ваша команда его и разрабатывает) Но можно это сделать отдельной задачей

}

optional string User = 1;
Expand All @@ -234,6 +235,7 @@ message TManageRequestResponse {

optional TStatus Status = 1;
repeated TScheduledRequest Requests = 2;
repeated TPermission ManuallyApprovedPermissions = 3;
}

message TCheckRequest {
Expand Down
Loading