Skip to content

Commit 74f3b10

Browse files
committed
Robust script lease expiration check
1 parent faaf6d3 commit 74f3b10

File tree

2 files changed

+50
-19
lines changed

2 files changed

+50
-19
lines changed

ydb/core/kqp/proxy_service/kqp_script_executions.cpp

Lines changed: 47 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1054,6 +1054,7 @@ class TCheckLeaseStatusActorBase : public TActorBootstrapped<TCheckLeaseStatusAc
10541054
using TBase = TActorBootstrapped<TCheckLeaseStatusActorBase>;
10551055

10561056
inline static const TDuration CHECK_ALIVE_REQUEST_TIMEOUT = TDuration::Seconds(60);
1057+
inline static const ui64 MAX_CHECK_ALIVE_RETRIES = 100;
10571058

10581059
public:
10591060
TCheckLeaseStatusActorBase(const TString& operationName, const TString& database, const TString& executionId,
@@ -1101,12 +1102,12 @@ class TCheckLeaseStatusActorBase : public TActorBootstrapped<TCheckLeaseStatusAc
11011102
SetupFinalizeRequest(EFinalizationStatus::FS_ROLLBACK, Ydb::StatusIds::UNAVAILABLE, Ydb::Query::EXEC_STATUS_ABORTED, NYql::TIssues{ NYql::TIssue("Lease expired") }, leaseGeneration);
11021103
Schedule(CHECK_ALIVE_REQUEST_TIMEOUT, new TEvents::TEvWakeup());
11031104

1104-
ui64 flags = IEventHandle::FlagTrackDelivery;
1105+
CheckAliveFlags = IEventHandle::FlagTrackDelivery;
11051106
if (runScriptActorId.NodeId() != SelfId().NodeId()) {
1106-
flags |= IEventHandle::FlagSubscribeOnSession;
1107+
CheckAliveFlags |= IEventHandle::FlagSubscribeOnSession;
11071108
SubscribedOnSession = runScriptActorId.NodeId();
11081109
}
1109-
Send(runScriptActorId, new TEvCheckAliveRequest(), flags);
1110+
Send(runScriptActorId, new TEvCheckAliveRequest(), CheckAliveFlags);
11101111

11111112
Become(&TCheckLeaseStatusActorBase::StateFunc);
11121113
}
@@ -1162,7 +1163,7 @@ class TCheckLeaseStatusActorBase : public TActorBootstrapped<TCheckLeaseStatusAc
11621163
}
11631164

11641165
void RunScriptFinalizeRequest() {
1165-
if (WaitFinishQuery) {
1166+
if (WaitFinishQuery || LeaseVerified) {
11661167
return;
11671168
}
11681169

@@ -1171,26 +1172,56 @@ class TCheckLeaseStatusActorBase : public TActorBootstrapped<TCheckLeaseStatusAc
11711172
FinalExecStatus = ScriptFinalizeRequest->Description.ExecStatus;
11721173
FinalIssues = ScriptFinalizeRequest->Description.Issues;
11731174
Send(MakeKqpFinalizeScriptServiceId(SelfId().NodeId()), ScriptFinalizeRequest.release());
1174-
Send(RunScriptActorId, new TEvents::TEvPoison()); // Try to shutdown TRunScriptActor if it still running
1175+
}
1176+
1177+
bool RetryCheckAlive() {
1178+
CheckAliveRetries++;
1179+
1180+
if (WaitFinishQuery || LeaseVerified) {
1181+
// Already finished checks
1182+
return false;
1183+
}
1184+
1185+
if (CheckAliveRetries >= MAX_CHECK_ALIVE_RETRIES) {
1186+
KQP_PROXY_LOG_E("Retry limit exceeded for TRunScriptActor check alive, start finalization");
1187+
RunScriptFinalizeRequest();
1188+
return false;
1189+
}
1190+
1191+
Send(RunScriptActorId, new TEvCheckAliveRequest(), CheckAliveFlags);
1192+
return true;
11751193
}
11761194

11771195
void Handle(TEvCheckAliveResponse::TPtr&) {
1178-
OnLeaseVerified();
1196+
if (WaitFinishQuery) {
1197+
KQP_PROXY_LOG_E("Script execution lease was verified after started finalization");
1198+
} else if (!LeaseVerified) {
1199+
LeaseVerified = true;
1200+
OnLeaseVerified();
1201+
}
11791202
}
11801203

11811204
void Handle(TEvents::TEvWakeup::TPtr&) {
1182-
KQP_PROXY_LOG_W("TRunScriptActor is unavailable, start finalization");
1183-
RunScriptFinalizeRequest();
1205+
KQP_PROXY_LOG_W("Deliver TRunScriptActor check alive request timeout, retry check alive");
1206+
if (RetryCheckAlive()) {
1207+
Schedule(CHECK_ALIVE_REQUEST_TIMEOUT, new TEvents::TEvWakeup());
1208+
}
11841209
}
11851210

1186-
void Handle(TEvents::TEvUndelivered::TPtr&) {
1187-
KQP_PROXY_LOG_W("Got delivery problem to node with TRunScriptActor, start finalization");
1188-
RunScriptFinalizeRequest();
1211+
void Handle(TEvents::TEvUndelivered::TPtr& ev) {
1212+
const auto reason = ev->Get()->Reason;
1213+
if (reason == TEvents::TEvUndelivered::ReasonActorUnknown) {
1214+
KQP_PROXY_LOG_W("TRunScriptActor not found, start finalization");
1215+
RunScriptFinalizeRequest();
1216+
} else {
1217+
KQP_PROXY_LOG_W("Got delivery problem to node with TRunScriptActor, reason: " << reason);
1218+
RetryCheckAlive();
1219+
}
11891220
}
11901221

11911222
void Handle(TEvInterconnect::TEvNodeDisconnected::TPtr&) {
1192-
KQP_PROXY_LOG_W("Node with TRunScriptActor was disconnected, start finalization");
1193-
RunScriptFinalizeRequest();
1223+
KQP_PROXY_LOG_W("Node with TRunScriptActor was disconnected, retry check alive");
1224+
RetryCheckAlive();
11941225
}
11951226

11961227
void Handle(TEvScriptExecutionFinished::TPtr& ev) {
@@ -1222,7 +1253,10 @@ class TCheckLeaseStatusActorBase : public TActorBootstrapped<TCheckLeaseStatusAc
12221253
NYql::TIssues FinalIssues;
12231254

12241255
bool WaitFinishQuery = false;
1256+
bool LeaseVerified = false;
12251257
std::optional<ui32> SubscribedOnSession;
1258+
ui64 CheckAliveFlags = 0;
1259+
ui64 CheckAliveRetries = 0;
12261260
TActorId RunScriptActorId;
12271261

12281262
protected:

ydb/core/kqp/run_script_actor/kqp_run_script_actor.cpp

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -310,13 +310,10 @@ class TRunScriptActor : public NActors::TActorBootstrapped<TRunScriptActor> {
310310
}
311311

312312
// Event in case of error in registering script in database
313-
// or expired / failed script finalization
313+
// Just pass away, because we have not started execution.
314314
void Handle(NActors::TEvents::TEvPoison::TPtr&) {
315-
if (RunState == ERunState::Created) {
316-
PassAway();
317-
} else {
318-
CancelRunningQuery();
319-
}
315+
Y_ABORT_UNLESS(RunState == ERunState::Created);
316+
PassAway();
320317
}
321318

322319
bool ShouldSaveResult(size_t resultSetId) const {

0 commit comments

Comments
 (0)