Skip to content

Commit 9f5969a

Browse files
committed
Add option to retry only known errors.
1 parent e6c1d3c commit 9f5969a

File tree

11 files changed

+136
-5
lines changed

11 files changed

+136
-5
lines changed

backend/src/main/scala/cromwell/backend/standard/StandardAsyncExecutionActor.scala

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import cromwell.backend.validation._
2828
import cromwell.core._
2929
import cromwell.core.io.{AsyncIoActorClient, DefaultIoCommandBuilder, IoCommandBuilder}
3030
import cromwell.core.path.Path
31+
import cromwell.core.retry._
3132
import cromwell.services.keyvalue.KeyValueServiceActor._
3233
import cromwell.services.keyvalue.KvClient
3334
import cromwell.services.metadata.CallMetadataKeys
@@ -273,6 +274,25 @@ trait StandardAsyncExecutionActor
273274
}
274275
}
275276

277+
lazy val maxRetriesMode: MaxRetriesMode = {
278+
val maxRetriesModeOption: Option[MaxRetriesMode] =
279+
jobDescriptor.workflowDescriptor.getWorkflowOption(WorkflowOptions.MaxRetriesMode) flatMap { value: String =>
280+
MaxRetriesMode.tryParse(value) match {
281+
case Success(v) => Option(v)
282+
case Failure(e) =>
283+
// should not happen, this case should have been screened for and fast-failed during workflow materialization.
284+
log.error(
285+
e,
286+
s"Programmer error: unexpected failure attempting to convert value for workflow option " +
287+
s"'${WorkflowOptions.MaxRetriesMode.name}' to MaxRetriesMode."
288+
)
289+
Option(MaxRetriesMode.DefaultMode)
290+
}
291+
}
292+
293+
maxRetriesModeOption.getOrElse(MaxRetriesMode.DefaultMode)
294+
}
295+
276296
lazy val memoryRetryRequested: Boolean = memoryRetryFactor.nonEmpty
277297

278298
/**
@@ -1107,8 +1127,17 @@ trait StandardAsyncExecutionActor
11071127

11081128
failedRetryableOrNonRetryable match {
11091129
case failedNonRetryable: FailedNonRetryableExecutionHandle if previousFailedRetries < maxRetries =>
1110-
// The user asked us to retry finitely for them, possibly with a memory modification
1111-
evaluateFailureRetry(failedNonRetryable, kvsFromPreviousAttempt, kvsForNextAttempt, memoryRetry)
1130+
maxRetriesMode match {
1131+
case AllErrors =>
1132+
// The user asked us to retry finitely for them, possibly with a memory modification
1133+
evaluateFailureRetry(failedNonRetryable, kvsFromPreviousAttempt, kvsForNextAttempt, memoryRetry)
1134+
case KnownErrors if memoryRetry.oomDetected =>
1135+
// The user asked us to retry finitely for them, with a memory modification
1136+
evaluateFailureRetry(failedNonRetryable, kvsFromPreviousAttempt, kvsForNextAttempt, memoryRetry)
1137+
case _ =>
1138+
// No reason to retry
1139+
Future.successful(failedNonRetryable)
1140+
}
11121141
case failedNonRetryable: FailedNonRetryableExecutionHandle =>
11131142
// No reason to retry
11141143
Future.successful(failedNonRetryable)
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"max_retries_mode" : "AllErrors"
3+
}
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"max_retries_mode" : "KnownErrors"
3+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
name: max_retries_mode_allerrors
2+
testFormat: workflowfailure
3+
4+
files {
5+
workflow: max_retries/max_retries.wdl
6+
options: max_retries/max_retries_mode_allerrors.options
7+
}
8+
9+
metadata {
10+
"failures.0.causedBy.0.message": "Job retry_for_me.broken_task:NA:2 exited with return code 1 which has not been declared as a valid return code. See 'continueOnReturnCode' runtime attribute for more details."
11+
}
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
name: max_retries_mode_knownerrors
2+
testFormat: workflowfailure
3+
4+
files {
5+
workflow: max_retries/max_retries.wdl
6+
options: max_retries/max_retries_mode_knownerrors.options
7+
}
8+
9+
metadata {
10+
"failures.0.causedBy.0.message": "Job retry_for_me.broken_task:NA:1 exited with return code 1 which has not been declared as a valid return code. See 'continueOnReturnCode' runtime attribute for more details."
11+
}

core/src/main/scala/cromwell/core/WorkflowOptions.scala

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ object WorkflowOptions {
6262
case object WorkflowFailureMode extends WorkflowOption("workflow_failure_mode")
6363
case object UseReferenceDisks extends WorkflowOption("use_reference_disks")
6464
case object MemoryRetryMultiplier extends WorkflowOption("memory_retry_multiplier")
65+
case object MaxRetriesMode extends WorkflowOption("max_retries_mode")
6566
case object WorkflowCallbackUri extends WorkflowOption("workflow_callback_uri")
6667

6768
private lazy val WorkflowOptionsConf = ConfigFactory.load.getConfig("workflow-options")
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
package cromwell.core.retry
2+
3+
import scala.util.{Failure, Success, Try}
4+
5+
sealed trait MaxRetriesMode
6+
case object AllErrors extends MaxRetriesMode
7+
case object KnownErrors extends MaxRetriesMode
8+
9+
object MaxRetriesMode {
10+
val DefaultMode = AllErrors
11+
private val AllModes = Seq(AllErrors, KnownErrors)
12+
13+
def tryParse(mode: String): Try[MaxRetriesMode] =
14+
AllModes find { _.toString.equalsIgnoreCase(mode) } map { Success(_) } getOrElse Failure(
15+
new Exception(s"Invalid max retries mode: '$mode', supported modes are: ${AllModes.mkString("'", "', '", "'")}")
16+
)
17+
}

docs/RuntimeAttributes.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,7 +247,7 @@ runtime {
247247

248248
*Default: _0_*
249249

250-
This retry option is introduced to provide a method for tackling transient job failures. For example, if a task fails due to a timeout from accessing an external service, then this option helps re-run the failed the task without having to re-run the entire workflow. It takes an Int as a value that indicates the maximum number of times Cromwell should retry a failed task. This retry is applied towards jobs that fail while executing the task command. This method only applies to transient job failures and is a feeble attempt to retry a job, that is it cannot be used to increase memory in out-of-memory situations.
250+
This retry option is introduced to provide a method for tackling transient job failures. For example, if a task fails due to a timeout from accessing an external service, then this option helps re-run the failed the task without having to re-run the entire workflow. It takes an Int as a value that indicates the maximum number of times Cromwell should retry a failed task. This retry is applied towards jobs that fail while executing the task command. This method only applies to transient job failures and is a feeble attempt to retry a job.
251251

252252
If using the Google backend, it's important to note that The `maxRetries` count is independent from the [preemptible](#preemptible) count. For example, the task below can be retried up to 6 times if it's preempted 3 times AND the command execution fails 3 times.
253253

docs/wf_options/Overview.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -144,3 +144,12 @@ Example `options.json`:
144144
"memory_retry_multiplier" : 1.1
145145
}
146146
```
147+
148+
## Max Retries Mode
149+
150+
The `max_retries_mode` workflow options sets the behavior of retrying failed jobs when the [`maxRetries` runtime
151+
attribute](../RuntimeAttributes.md#maxretries) is specified.
152+
153+
The possible values are `AllErrors` or `KnownErrors`. If set to `AllErrors`, the job will be retried for any error. If
154+
set to `KnownErrors`, the job will only be retried for errors that are known to be retryable, such as increasing memory
155+
in out-of-memory situations. The default value is `AllErrors`.

engine/src/main/scala/cromwell/engine/workflow/lifecycle/materialization/MaterializeWorkflowDescriptorActor.scala

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ import cromwell.core.io.AsyncIo
2828
import cromwell.core.labels.{Label, Labels}
2929
import cromwell.core.logging.WorkflowLogging
3030
import cromwell.core.path.{PathBuilder, PathBuilderFactory}
31+
import cromwell.core.retry._
3132
import cromwell.engine._
3233
import cromwell.engine.backend.CromwellBackends
3334
import cromwell.engine.workflow.WorkflowProcessingEventPublishing._
@@ -184,6 +185,19 @@ object MaterializeWorkflowDescriptorActor {
184185
s"'$optionName' is specified in workflow options but value is not of expected Double type: ${e.getMessage}".invalidNel
185186
}
186187
}
188+
189+
def validateMaxRetriesMode(workflowOptions: WorkflowOptions): ErrorOr[MaxRetriesMode] = {
190+
val modeString: Try[String] = workflowOptions.get(WorkflowOptions.MaxRetriesMode) match {
191+
case Success(value) => Success(value)
192+
case Failure(_: OptionNotFoundException) => Success(MaxRetriesMode.DefaultMode.toString)
193+
case Failure(e) => Failure(e)
194+
}
195+
196+
modeString flatMap MaxRetriesMode.tryParse match {
197+
case Success(mode) => mode.validNel
198+
case Failure(t) => t.getMessage.invalidNel
199+
}
200+
}
187201
}
188202

189203
// TODO WOM: need to decide where to draw the line between language specific initialization and WOM
@@ -499,12 +513,15 @@ class MaterializeWorkflowDescriptorActor(override val serviceRegistryActor: Acto
499513

500514
val memoryRetryMultiplierValidation: ErrorOr[Unit] = validateMemoryRetryMultiplier(workflowOptions)
501515

516+
val maxRetriesModeValidation: ErrorOr[MaxRetriesMode] = validateMaxRetriesMode(workflowOptions)
517+
502518
(failureModeValidation,
503519
backendAssignmentsValidation,
504520
callCachingModeValidation,
505521
useReferenceDisksValidation,
506-
memoryRetryMultiplierValidation
507-
) mapN { case (failureMode, backendAssignments, callCachingMode, _, _) =>
522+
memoryRetryMultiplierValidation,
523+
maxRetriesModeValidation
524+
) mapN { case (failureMode, backendAssignments, callCachingMode, _, _, _) =>
508525
val callable = womNamespace.executable.entryPoint
509526
val backendDescriptor = BackendWorkflowDescriptor(id,
510527
callable,

0 commit comments

Comments
 (0)