@@ -48,6 +48,7 @@ import java.net.URI
4848import java.time.Duration
4949import java.time.LocalDateTime
5050import java.time.format.DateTimeFormatter
51+ import java.util.concurrent.TimeUnit
5152import java.util.concurrent.TimeoutException
5253import kotlin.coroutines.resume
5354import kotlin.coroutines.resumeWithException
@@ -227,16 +228,9 @@ class CoderRemoteConnectionHandle {
227228
228229 // Wait for the IDE to come up.
229230 indicator.text = " Waiting for ${workspace.ideName} backend..."
230- var status: UnattendedHostStatus ? = null
231231 val remoteProjectPath = accessor.makeRemotePath(ShellArgument .PlainText (workspace.projectPath))
232232 val logsDir = accessor.getLogsDir(workspace.ideProduct.productCode, remoteProjectPath)
233- while (lifetime.status == LifetimeStatus .Alive ) {
234- status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null )
235- if (! status?.joinLink.isNullOrBlank()) {
236- break
237- }
238- delay(5000 )
239- }
233+ var status = ensureIDEBackend(accessor, workspace, ideDir, remoteProjectPath, logsDir, lifetime, null )
240234
241235 // We wait for non-null, so this only happens on cancellation.
242236 val joinLink = status?.joinLink
@@ -302,6 +296,7 @@ class CoderRemoteConnectionHandle {
302296 }
303297 // Continue once the client is present.
304298 handle.onClientPresenceChanged.advise(lifetime) {
299+ logger.info(" ${workspace.ideName} client to ${workspace.hostname} presence: ${handle.clientPresent} " )
305300 if (handle.clientPresent && continuation.isActive) {
306301 continuation.resume(true )
307302 }
@@ -437,8 +432,8 @@ class CoderRemoteConnectionHandle {
437432 }
438433
439434 /* *
440- * Ensure the backend is started. Status and/or links may be null if the
441- * backend has not started .
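435+ * Ensure the backend is started. If a current status is given and its PID is still alive, it is returned as-is.
436+ * Otherwise this will not return until a join link is received or the lifetime expires (in which case null is returned).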
442437 */
443438 private suspend fun ensureIDEBackend (
444439 accessor : HighLevelHostAccessor ,
@@ -449,41 +444,67 @@ class CoderRemoteConnectionHandle {
449444 lifetime : LifetimeDefinition ,
450445 currentStatus : UnattendedHostStatus ? ,
451446 ): UnattendedHostStatus ? {
452- val details = " ${workspace.hostname} :${ideDir.toRawString()} , project=${remoteProjectPath.toRawString()} "
453- return try {
454- if (currentStatus?.appPid != null &&
455- ! currentStatus.joinLink.isNullOrBlank() &&
456- accessor.isPidAlive(currentStatus.appPid.toInt())
457- ) {
458- // If the PID is alive, assume the join link we have is still
459- // valid. The join link seems to change even if it is the same
460- // backend running, so if we always fetched the link the client
461- // would relaunch over and over.
447+ val details = " ${workspace.hostname} :${ideDir.toRawString()} , project=${remoteProjectPath.toRawString()} " // Target description used in the log messages below.
448+
449+ // Check if the current IDE is alive.
450+ if (currentStatus != null ) {
451+ val isAlive = try {
452+ val isAlive = accessor.isPidAlive(currentStatus.appPid.toInt())
453+ logger.info(" ${workspace.ideName} status: pid=${currentStatus.appPid} , alive=$isAlive " )
454+ isAlive
455+ } catch (ex: Exception ) {
456+ logger.info(" Failed to check if ${workspace.ideName} is alive on $details : pid=${currentStatus.appPid} " , ex)
457+ false
458+ }
459+ if (isAlive) {
460+ // Use the current status and join link.
462461 return currentStatus
462+ } else {
463+ logger.info(" Relaunching ${workspace.ideName} since it is not alive..." )
463464 }
465+ } else {
466+ logger.info(" Launching ${workspace.ideName} for the first time on ${workspace.hostname} ..." )
467+ }
464468
465- // See if there is already a backend running. Weirdly, there is
466- // always a PID, even if there is no backend running, and
467- // backendUnresponsive is always false, but the links are null so
468- // hopefully that is an accurate indicator that the IDE is up.
469- val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
470- if (! status.joinLink.isNullOrBlank()) {
471- logger.info(" Found existing ${workspace.ideName} backend on $details " )
472- return status
473- }
469+ // If the PID is not alive, spawn a new backend. This may not be
470+ // idempotent, so only call if we are really sure we need to.
471+ accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
474472
475- // Otherwise, spawn a new backend. This does not seem to spawn a
476- // second backend if one is already running, yet it does somehow
477- // cause a second client to launch. So only run this if we are
478- // really sure we have to launch a new backend.
479- logger.info(" Starting ${workspace.ideName} backend on $details " )
480- accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
481- // Get the newly spawned PID and join link.
482- return accessor.getHostIdeStatus(ideDir, remoteProjectPath)
483- } catch (ex: Exception ) {
484- logger.info(" Failed to get ${workspace.ideName} status from $details " , ex)
485- currentStatus
473+ // Get the newly spawned PID and join link.
474+ var attempts = 0
475+ val maxAttempts = 6
476+ val wait = TimeUnit .SECONDS .toMillis(5 )
477+ while (lifetime.status == LifetimeStatus .Alive ) {
478+ try {
479+ attempts++
480+ val status = accessor.getHostIdeStatus(ideDir, remoteProjectPath)
481+ if (! status.joinLink.isNullOrBlank()) {
482+ logger.info(" Found join link for ${workspace.ideName} ; proceeding to connect: pid=${status.appPid} " )
483+ return status
484+ }
485+ // If we did not get a join link, see if the IDE is alive in
486+ // case it died and we need to respawn.
487+ val isAlive = status.appPid > 0 && accessor.isPidAlive(status.appPid.toInt())
488+ logger.info(" ${workspace.ideName} status: pid=${status.appPid} , alive=$isAlive , unresponsive=${status.backendUnresponsive} , attempt=$attempts " )
489+ // It is not clear whether the PID can be trusted because we get
490+ // one even when there is no backend at all. For now give it
491+ // some time and if it is still dead, only then try to respawn.
492+ if (! isAlive && attempts >= maxAttempts) {
493+ logger.info(" ${workspace.ideName} is still not alive after $attempts checks, respawning backend and waiting $wait ms to try again" )
494+ accessor.startHostIdeInBackgroundAndDetach(lifetime, ideDir, remoteProjectPath, logsDir)
495+ attempts = 0
496+ } else {
497+ logger.info(" No join link found in status; waiting $wait ms to try again" )
498+ }
499+ } catch (ex: Exception ) {
500+ logger.info(" Failed to get ${workspace.ideName} status from $details ; waiting $wait ms to try again" , ex)
501+ }
502+ delay(wait)
486503 }
504+
505+ // This means the lifetime is no longer alive.
506+ logger.info(" Connection to ${workspace.ideName} on $details aborted by user" )
507+ return null
487508 }
488509
489510 companion object {
0 commit comments