Skip to content

Commit 4b768d8

Browse files
committed
add retry logic
1 parent dda82db commit 4b768d8

File tree

25 files changed

+3818
-1274
lines changed

25 files changed

+3818
-1274
lines changed

apps/workspace-engine/oapi/openapi.json

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -837,6 +837,9 @@
837837
},
838838
"policyId": {
839839
"type": "string"
840+
},
841+
"retry": {
842+
"$ref": "#/components/schemas/RetryRule"
840843
}
841844
},
842845
"required": [
@@ -1301,6 +1304,48 @@
13011304
],
13021305
"type": "object"
13031306
},
1307+
"RetryRule": {
1308+
"properties": {
1309+
"backoffSeconds": {
1310+
"description": "Minimum seconds to wait between retry attempts. If null, retries are allowed immediately after job completion.",
1311+
"format": "int32",
1312+
"minimum": 0,
1313+
"type": "integer"
1314+
},
1315+
"backoffStrategy": {
1316+
"default": "linear",
1317+
"description": "Backoff strategy: \"linear\" uses constant backoffSeconds delay, \"exponential\" doubles the delay with each retry (backoffSeconds * 2^(attempt-1)).",
1318+
"enum": [
1319+
"linear",
1320+
"exponential"
1321+
],
1322+
"type": "string"
1323+
},
1324+
"maxBackoffSeconds": {
1325+
"description": "Maximum backoff time in seconds (cap for exponential backoff). If null, no maximum is enforced.",
1326+
"format": "int32",
1327+
"minimum": 0,
1328+
"type": "integer"
1329+
},
1330+
"maxRetries": {
1331+
"description": "Maximum number of retries allowed. 0 means no retries (1 attempt total), 3 means up to 4 attempts (1 initial + 3 retries).",
1332+
"format": "int32",
1333+
"minimum": 0,
1334+
"type": "integer"
1335+
},
1336+
"retryOnStatuses": {
1337+
"description": "Job statuses that count toward the retry limit. If null or empty, defaults to [\"failure\", \"invalidIntegration\", \"invalidJobAgent\"] for maxRetries > 0, or [\"failure\", \"invalidIntegration\", \"invalidJobAgent\", \"successful\"] for maxRetries = 0. Cancelled and skipped jobs never count by default (allows redeployment after cancellation). Example: [\"failure\", \"cancelled\"] will only count failed/cancelled jobs.",
1338+
"items": {
1339+
"$ref": "#/components/schemas/JobStatus"
1340+
},
1341+
"type": "array"
1342+
}
1343+
},
1344+
"required": [
1345+
"maxRetries"
1346+
],
1347+
"type": "object"
1348+
},
13041349
"RuleEvaluation": {
13051350
"properties": {
13061351
"actionRequired": {

apps/workspace-engine/oapi/spec/schemas/policy.jsonnet

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,43 @@ local openapi = import '../lib/openapi.libsonnet';
5959
anyApproval: openapi.schemaRef('AnyApprovalRule'),
6060
environmentProgression: openapi.schemaRef('EnvironmentProgressionRule'),
6161
gradualRollout: openapi.schemaRef('GradualRolloutRule'),
62+
retry: openapi.schemaRef('RetryRule'),
63+
},
64+
},
65+
66+
RetryRule: {
67+
type: 'object',
68+
required: ['maxRetries'],
69+
properties: {
70+
maxRetries: {
71+
type: 'integer',
72+
format: 'int32',
73+
minimum: 0,
74+
description: 'Maximum number of retries allowed. 0 means no retries (1 attempt total), 3 means up to 4 attempts (1 initial + 3 retries).',
75+
},
76+
retryOnStatuses: {
77+
type: 'array',
78+
items: openapi.schemaRef('JobStatus'),
79+
description: 'Job statuses that count toward the retry limit. If null or empty, defaults to ["failure", "invalidIntegration", "invalidJobAgent"] for maxRetries > 0, or ["failure", "invalidIntegration", "invalidJobAgent", "successful"] for maxRetries = 0. Cancelled and skipped jobs never count by default (allows redeployment after cancellation). Example: ["failure", "cancelled"] will only count failed/cancelled jobs.',
80+
},
81+
backoffSeconds: {
82+
type: 'integer',
83+
format: 'int32',
84+
minimum: 0,
85+
description: 'Minimum seconds to wait between retry attempts. If null, retries are allowed immediately after job completion.',
86+
},
87+
backoffStrategy: {
88+
type: 'string',
89+
enum: ['linear', 'exponential'],
90+
default: 'linear',
91+
description: 'Backoff strategy: "linear" uses constant backoffSeconds delay, "exponential" doubles the delay with each retry (backoffSeconds * 2^(attempt-1)).',
92+
},
93+
maxBackoffSeconds: {
94+
type: 'integer',
95+
format: 'int32',
96+
minimum: 0,
97+
description: 'Maximum backoff time in seconds (cap for exponential backoff). If null, no maximum is enforced.',
98+
},
6299
},
63100
},
64101

apps/workspace-engine/pkg/events/handler/deployment/deployment.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,7 @@ func getJobsToRetrigger(ws *workspace.Workspace, deployment *oapi.Deployment) []
266266

267267
// retriggerInvalidJobAgentJobs creates new Pending jobs for all releases that currently have InvalidJobAgent jobs
268268
// Note: This is an explicit retrigger operation for configuration fixes, so we bypass normal
269-
// eligibility checks (like skipdeployed). The old InvalidJobAgent job remains for history.
269+
// eligibility checks (like retry limits). The old InvalidJobAgent job remains for history.
270270
func retriggerInvalidJobAgentJobs(ctx context.Context, ws *workspace.Workspace, jobsToRetrigger []*oapi.Job) {
271271
// Create job factory and dispatcher
272272
jobFactory := jobs.NewFactory(ws.Store())

apps/workspace-engine/pkg/oapi/oapi.gen.go

Lines changed: 30 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

apps/workspace-engine/pkg/workspace/releasemanager/ARCHITECTURE.md

Lines changed: 0 additions & 145 deletions
This file was deleted.
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
package deployment
2+
3+
import (
4+
"time"
5+
)
6+
7+
// EligibilityDecision is the outcome of a job eligibility check: allowed, denied, or pending.
8+
type EligibilityDecision string
9+
10+
const (
11+
// EligibilityAllowed indicates the job should be created immediately
12+
EligibilityAllowed EligibilityDecision = "allowed"
13+
14+
// EligibilityDenied indicates the job should not be created (retry limit exceeded, etc.)
15+
EligibilityDenied EligibilityDecision = "denied"
16+
17+
// EligibilityPending indicates that job creation is delayed (e.g., waiting for backoff).
18+
// The system should schedule re-evaluation at NextEvaluationTime.
19+
EligibilityPending EligibilityDecision = "pending"
20+
)
21+
22+
// EligibilityResult contains the result of a job eligibility check.
23+
type EligibilityResult struct {
24+
// Decision indicates whether the job should be created
25+
Decision EligibilityDecision
26+
27+
// Reason provides a human-readable explanation for the decision
28+
Reason string
29+
30+
// NextEvaluationTime is when the eligibility should be re-evaluated (for pending decisions)
31+
// This is used to schedule future reconciliation
32+
NextEvaluationTime *time.Time
33+
34+
// Details contains additional structured information about the decision
35+
Details map[string]interface{}
36+
}
37+
38+
// IsAllowed returns true if the job can be created immediately.
39+
func (r *EligibilityResult) IsAllowed() bool {
40+
return r.Decision == EligibilityAllowed
41+
}
42+
43+
// IsDenied returns true if the job should not be created.
44+
func (r *EligibilityResult) IsDenied() bool {
45+
return r.Decision == EligibilityDenied
46+
}
47+
48+
// IsPending returns true if the job creation is delayed pending some condition.
49+
func (r *EligibilityResult) IsPending() bool {
50+
return r.Decision == EligibilityPending
51+
}
52+
53+
// ShouldScheduleRetry returns true if the decision is pending and NextEvaluationTime is set, i.e. there is a known time at which eligibility should be re-evaluated.
54+
func (r *EligibilityResult) ShouldScheduleRetry() bool {
55+
return r.IsPending() && r.NextEvaluationTime != nil
56+
}

0 commit comments

Comments
 (0)