@@ -1054,6 +1054,7 @@ class TCheckLeaseStatusActorBase : public TActorBootstrapped<TCheckLeaseStatusAc
1054
1054
using TBase = TActorBootstrapped<TCheckLeaseStatusActorBase>;
1055
1055
1056
1056
// Delay passed to Schedule() for the TEvWakeup event that acts as the
// timeout of a single check-alive request.
inline static const TDuration CHECK_ALIVE_REQUEST_TIMEOUT = TDuration::Seconds(60 );
1057
// Upper bound compared against CheckAliveRetries in RetryCheckAlive():
// once the counter reaches this value, finalization is started instead of
// sending another TEvCheckAliveRequest.
+ inline static const ui64 MAX_CHECK_ALIVE_RETRIES = 100 ;
1057
1058
1058
1059
public:
1059
1060
TCheckLeaseStatusActorBase (const TString& operationName, const TString& database, const TString& executionId,
@@ -1101,12 +1102,12 @@ class TCheckLeaseStatusActorBase : public TActorBootstrapped<TCheckLeaseStatusAc
1101
1102
SetupFinalizeRequest (EFinalizationStatus::FS_ROLLBACK, Ydb::StatusIds::UNAVAILABLE, Ydb::Query::EXEC_STATUS_ABORTED, NYql::TIssues{ NYql::TIssue (" Lease expired" ) }, leaseGeneration);
1102
1103
Schedule (CHECK_ALIVE_REQUEST_TIMEOUT, new TEvents::TEvWakeup ());
1103
1104
1104
- ui64 flags = IEventHandle::FlagTrackDelivery;
1105
+ CheckAliveFlags = IEventHandle::FlagTrackDelivery;
1105
1106
if (runScriptActorId.NodeId () != SelfId ().NodeId ()) {
1106
- flags |= IEventHandle::FlagSubscribeOnSession;
1107
+ CheckAliveFlags |= IEventHandle::FlagSubscribeOnSession;
1107
1108
SubscribedOnSession = runScriptActorId.NodeId ();
1108
1109
}
1109
- Send (runScriptActorId, new TEvCheckAliveRequest (), flags );
1110
+ Send (runScriptActorId, new TEvCheckAliveRequest (), CheckAliveFlags );
1110
1111
1111
1112
Become (&TCheckLeaseStatusActorBase::StateFunc);
1112
1113
}
@@ -1162,7 +1163,7 @@ class TCheckLeaseStatusActorBase : public TActorBootstrapped<TCheckLeaseStatusAc
1162
1163
}
1163
1164
1164
1165
void RunScriptFinalizeRequest () {
1165
- if (WaitFinishQuery) {
1166
+ if (WaitFinishQuery || LeaseVerified ) {
1166
1167
return ;
1167
1168
}
1168
1169
@@ -1171,26 +1172,56 @@ class TCheckLeaseStatusActorBase : public TActorBootstrapped<TCheckLeaseStatusAc
1171
1172
FinalExecStatus = ScriptFinalizeRequest->Description .ExecStatus ;
1172
1173
FinalIssues = ScriptFinalizeRequest->Description .Issues ;
1173
1174
Send (MakeKqpFinalizeScriptServiceId (SelfId ().NodeId ()), ScriptFinalizeRequest.release ());
1174
- Send (RunScriptActorId, new TEvents::TEvPoison ()); // Try to shutdown TRunScriptActor if it still running
1175
+ }
1176
+
1177
// Re-sends TEvCheckAliveRequest to RunScriptActorId with the saved
// CheckAliveFlags.
//
// Returns true when a new request was sent. Returns false when the check is
// already over (WaitFinishQuery or LeaseVerified is set) or when the retry
// limit was reached, in which case RunScriptFinalizeRequest() is called first.
+ bool RetryCheckAlive () {
1178
// The counter is bumped on every call, including calls that return early below.
+ CheckAliveRetries++;
1179
+
1180
+ if (WaitFinishQuery || LeaseVerified) {
1181
+ // Already finished checks
1182
+ return false ;
1183
+ }
1184
+
1185
// The comparison is ">=" after the increment, so the call number
// MAX_CHECK_ALIVE_RETRIES finalizes instead of re-sending.
+ if (CheckAliveRetries >= MAX_CHECK_ALIVE_RETRIES) {
1186
+ KQP_PROXY_LOG_E (" Retry limit exceeded for TRunScriptActor check alive, start finalization" );
1187
+ RunScriptFinalizeRequest ();
1188
+ return false ;
1189
+ }
1190
+
1191
// NOTE(review): no delay is applied here; the TEvUndelivered and
// TEvNodeDisconnected handlers call this directly, so retries from those
// paths are immediate. Confirm that is intended.
+ Send (RunScriptActorId, new TEvCheckAliveRequest (), CheckAliveFlags);
1192
+ return true ;
1175
1193
}
1176
1194
1177
1195
// Handles the reply to TEvCheckAliveRequest.
// The diff replaces the unconditional OnLeaseVerified() call with a guarded
// one: OnLeaseVerified() now runs at most once, and not at all when
// WaitFinishQuery is set.
void Handle (TEvCheckAliveResponse::TPtr&) {
1178
- OnLeaseVerified ();
1196
// NOTE(review): WaitFinishQuery is assigned outside the visible hunks;
// judging by the log text it marks that finalization has already started.
+ if (WaitFinishQuery) {
1197
+ KQP_PROXY_LOG_E (" Script execution lease was verified after started finalization" );
1198
// Only the first response flips the flag; later duplicate responses
// (possible now that requests are retried) fall through silently.
+ } else if (!LeaseVerified) {
1199
+ LeaseVerified = true ;
1200
+ OnLeaseVerified ();
1201
+ }
1179
1202
}
1180
1203
1181
1204
// Handles the TEvWakeup scheduled with CHECK_ALIVE_REQUEST_TIMEOUT.
// The diff changes the reaction from immediate finalization to a retry:
// RetryCheckAlive() is called and, only if it actually re-sent the request,
// the next wakeup is scheduled.
void Handle (TEvents::TEvWakeup::TPtr&) {
1182
- KQP_PROXY_LOG_W (" TRunScriptActor is unavailable, start finalization" );
1183
- RunScriptFinalizeRequest ();
1205
// NOTE(review): this warning is emitted before RetryCheckAlive() decides,
// so it is logged even when no retry follows.
+ KQP_PROXY_LOG_W (" Deliver TRunScriptActor check alive request timeout, retry check alive" );
1206
+ if (RetryCheckAlive ()) {
1207
+ Schedule (CHECK_ALIVE_REQUEST_TIMEOUT, new TEvents::TEvWakeup ());
1208
+ }
1184
1209
}
1185
1210
1186
// Handles TEvUndelivered.
// The diff makes the reaction depend on the undelivered reason:
// ReasonActorUnknown still starts finalization, any other reason is logged
// and answered with RetryCheckAlive().
- void Handle (TEvents::TEvUndelivered::TPtr&) {
1187
- KQP_PROXY_LOG_W (" Got delivery problem to node with TRunScriptActor, start finalization" );
1188
- RunScriptFinalizeRequest ();
1211
+ void Handle (TEvents::TEvUndelivered::TPtr& ev) {
1212
+ const auto reason = ev->Get ()->Reason ;
1213
+ if (reason == TEvents::TEvUndelivered::ReasonActorUnknown) {
1214
+ KQP_PROXY_LOG_W (" TRunScriptActor not found, start finalization" );
1215
+ RunScriptFinalizeRequest ();
1216
+ } else {
1217
+ KQP_PROXY_LOG_W (" Got delivery problem to node with TRunScriptActor, reason: " << reason);
1218
// The bool result is ignored: no extra wakeup is scheduled from this path.
// NOTE(review): the retry is sent immediately; if a node loss also raises
// TEvNodeDisconnected, both handlers retry for one failed send. Confirm.
+ RetryCheckAlive ();
1219
+ }
1189
1220
}
1190
1221
1191
1222
// Handles TEvInterconnect::TEvNodeDisconnected.
// The diff replaces immediate finalization with RetryCheckAlive(), which
// re-sends the check-alive request using the saved CheckAliveFlags.
void Handle (TEvInterconnect::TEvNodeDisconnected::TPtr&) {
1192
- KQP_PROXY_LOG_W (" Node with TRunScriptActor was disconnected, start finalization " );
1193
- RunScriptFinalizeRequest ();
1223
+ KQP_PROXY_LOG_W (" Node with TRunScriptActor was disconnected, retry check alive " );
1224
// The bool result is ignored here; the retry is sent without any delay.
+ RetryCheckAlive ();
1194
1225
}
1195
1226
1196
1227
void Handle (TEvScriptExecutionFinished::TPtr& ev) {
@@ -1222,7 +1253,10 @@ class TCheckLeaseStatusActorBase : public TActorBootstrapped<TCheckLeaseStatusAc
1222
1253
// Issues copied from ScriptFinalizeRequest->Description.Issues in
// RunScriptFinalizeRequest().
NYql::TIssues FinalIssues;
1223
1254
1224
1255
// While set, RunScriptFinalizeRequest() and RetryCheckAlive() return early.
// NOTE(review): the assignment is outside the visible hunks; presumably set
// once finalization has been started.
bool WaitFinishQuery = false ;
1256
// Set on the first TEvCheckAliveResponse; afterwards finalization and further
// check-alive retries are skipped.
+ bool LeaseVerified = false ;
1225
1257
// Node id of the run-script actor, stored when that actor lives on a
// different node than this actor and FlagSubscribeOnSession is requested.
std::optional<ui32> SubscribedOnSession;
1258
// Send flags of the first TEvCheckAliveRequest (FlagTrackDelivery, plus
// FlagSubscribeOnSession for a remote node), kept so retries reuse them.
+ ui64 CheckAliveFlags = 0 ;
1259
// Number of RetryCheckAlive() calls so far; compared against
// MAX_CHECK_ALIVE_RETRIES.
+ ui64 CheckAliveRetries = 0 ;
1226
1260
// Destination of the TEvCheckAliveRequest retries.
TActorId RunScriptActorId;
1227
1261
1228
1262
protected:
0 commit comments