Skip to content

Commit 92569ed

Browse files
committed
TEP-0121: Reconciler Implementation
Prior to this commit, `retries` logic for TaskRun is handled by Tekton `PipelineRun` reconciler. This commit delegates the retries implementation for `TaskRun` to the `TaskRun` reconciler. The major change is we stopped relying on `len(retriesStatus)` to decide if a target `TaskRun` failed or not. Instead, we use status `ConditionSuceeded` to gate the completion of a `TaskRun`. Even a `TaskRun` failed on one execution, as long as it has remaining retries, the TaskRun won't be stored in etcd with its status set as `Failed`. Instead, the status will be: ```yaml Type: Succeeded Status: Unknown Reason: ToBeRetried ```
1 parent 17b705c commit 92569ed

11 files changed

+647
-383
lines changed

docs/taskruns.md

+18-3
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ weight: 300
2626
- [Specifying `Sidecars`](#specifying-sidecars)
2727
- [Overriding `Task` `Steps` and `Sidecars`](#overriding-task-steps-and-sidecars)
2828
- [Specifying `LimitRange` values](#specifying-limitrange-values)
29+
- [Specifying `Retries`](#specifying-retries)
2930
- [Configuring the failure timeout](#configuring-the-failure-timeout)
3031
- [Specifying `ServiceAccount` credentials](#specifying-serviceaccount-credentials)
3132
- [Monitoring execution status](#monitoring-execution-status)
@@ -698,11 +699,25 @@ object(s), if present. Any `Request` or `Limit` specified by the user (on `Task`
698699

699700
For more information, see the [`LimitRange` support in Pipeline](./compute-resources.md#limitrange-support).
700701

702+
### Specifying `Retries`
703+
You can use the `retries` field to set how many times you want to retry on a failed TaskRun.
704+
All TaskRun failures are retriable except for `Cancellation`.
705+
706+
For a retriable `TaskRun`, when an error occurs:
707+
- The error status is archived in `status.RetriesStatus`
708+
- The `Succeeded` condition in `status` is updated:
709+
```
710+
Type: Succeeded
711+
Status: Unknown
712+
Reason: ToBeRetried
713+
```
714+
- `status.StartTime` and `status.PodName` are unset to trigger another retry attempt.
715+
701716
### Configuring the failure timeout
702717

703-
You can use the `timeout` field to set the `TaskRun's` desired timeout value. If you do not specify this
704-
value for the `TaskRun`, the global default timeout value applies. If you set the timeout to 0, the `TaskRun` will
705-
have no timeout and will run until it completes successfully or fails from an error.
718+
You can use the `timeout` field to set the `TaskRun's` desired timeout value for **each retry attempt**. If you do
719+
not specify this value, the global default timeout value applies (the same, to `each retry attempt`). If you set the timeout to 0,
720+
the `TaskRun` will have no timeout and will run until it completes successfully or fails from an error.
706721

707722
The global default timeout is set to 60 minutes when you first install Tekton. You can set
708723
a different global default timeout value using the `default-timeout-minutes` field in

pkg/pod/status.go

+10
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,16 @@ func DidTaskRunFail(pod *corev1.Pod) bool {
368368
return f
369369
}
370370

371+
// IsPodArchived indicates if a pod is archived in the retriesStatus.
372+
func IsPodArchived(pod *corev1.Pod, trs *v1beta1.TaskRunStatus) bool {
373+
for _, retryStatus := range trs.RetriesStatus {
374+
if retryStatus.PodName == pod.GetName() {
375+
return true
376+
}
377+
}
378+
return false
379+
}
380+
371381
func areStepsComplete(pod *corev1.Pod) bool {
372382
stepsComplete := len(pod.Status.ContainerStatuses) > 0 && pod.Status.Phase == corev1.PodRunning
373383
for _, s := range pod.Status.ContainerStatuses {

pkg/pod/status_test.go

+45
Original file line numberDiff line numberDiff line change
@@ -1634,6 +1634,51 @@ func TestMarkStatusSuccess(t *testing.T) {
16341634
}
16351635
}
16361636

1637+
func TestIsPodArchived(t *testing.T) {
1638+
for _, tc := range []struct {
1639+
name string
1640+
podName string
1641+
retriesStatus []v1beta1.TaskRunStatus
1642+
want bool
1643+
}{{
1644+
name: "Pod is not in the empty retriesStatus",
1645+
podName: "pod",
1646+
retriesStatus: []v1beta1.TaskRunStatus{},
1647+
want: false,
1648+
}, {
1649+
name: "Pod is not in the non-empty retriesStatus",
1650+
podName: "pod-retry1",
1651+
retriesStatus: []v1beta1.TaskRunStatus{{
1652+
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
1653+
PodName: "pod",
1654+
},
1655+
}},
1656+
want: false,
1657+
}, {
1658+
name: "Pod is in the retriesStatus",
1659+
podName: "pod",
1660+
retriesStatus: []v1beta1.TaskRunStatus{{
1661+
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
1662+
PodName: "pod",
1663+
}},
1664+
},
1665+
want: true,
1666+
}} {
1667+
t.Run(tc.name, func(t *testing.T) {
1668+
trs := v1beta1.TaskRunStatus{
1669+
TaskRunStatusFields: v1beta1.TaskRunStatusFields{
1670+
PodName: "pod",
1671+
RetriesStatus: tc.retriesStatus,
1672+
},
1673+
}
1674+
got := IsPodArchived(&corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: tc.podName}}, &trs)
1675+
if tc.want != got {
1676+
t.Errorf("got: %v, want: %v", got, tc.want)
1677+
}
1678+
})
1679+
}
1680+
}
1681+
16371682
func statusRunning() duckv1.Status {
16381683
var trs v1beta1.TaskRunStatus
16391684
markStatusRunning(&trs, v1beta1.TaskRunReasonRunning.String(), "Not all Steps in the Task have finished executing")

pkg/reconciler/pipelinerun/pipelinerun.go

+2-29
Original file line numberDiff line numberDiff line change
@@ -888,26 +888,10 @@ func (c *Reconciler) createTaskRuns(ctx context.Context, rpt *resources.Resolved
888888
func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, params []v1beta1.Param, rpt *resources.ResolvedPipelineTask, pr *v1beta1.PipelineRun, storageBasePath string) (*v1beta1.TaskRun, error) {
889889
logger := logging.FromContext(ctx)
890890

891-
tr, _ := c.taskRunLister.TaskRuns(pr.Namespace).Get(taskRunName)
892-
if tr != nil {
893-
// retry should happen only when the taskrun has failed
894-
if !tr.Status.GetCondition(apis.ConditionSucceeded).IsFalse() {
895-
return tr, nil
896-
}
897-
// Don't modify the lister cache's copy.
898-
tr = tr.DeepCopy()
899-
// is a retry
900-
addRetryHistory(tr)
901-
clearStatus(tr)
902-
tr.Status.MarkResourceOngoing("", "")
903-
logger.Infof("Updating taskrun %s with cleared status and retry history (length: %d).", tr.GetName(), len(tr.Status.RetriesStatus))
904-
return c.PipelineClientSet.TektonV1beta1().TaskRuns(pr.Namespace).UpdateStatus(ctx, tr, metav1.UpdateOptions{})
905-
}
906-
907891
rpt.PipelineTask = resources.ApplyPipelineTaskContexts(rpt.PipelineTask)
908892
taskRunSpec := pr.GetTaskRunSpec(rpt.PipelineTask.Name)
909893
params = append(params, rpt.PipelineTask.Params...)
910-
tr = &v1beta1.TaskRun{
894+
tr := &v1beta1.TaskRun{
911895
ObjectMeta: metav1.ObjectMeta{
912896
Name: taskRunName,
913897
Namespace: pr.Namespace,
@@ -916,6 +900,7 @@ func (c *Reconciler) createTaskRun(ctx context.Context, taskRunName string, para
916900
Annotations: combineTaskRunAndTaskSpecAnnotations(pr, rpt.PipelineTask),
917901
},
918902
Spec: v1beta1.TaskRunSpec{
903+
Retries: rpt.PipelineTask.Retries,
919904
Params: params,
920905
ServiceAccountName: taskRunSpec.TaskServiceAccountName,
921906
PodTemplate: taskRunSpec.TaskPodTemplate,
@@ -1169,18 +1154,6 @@ func combinedSubPath(workspaceSubPath string, pipelineTaskSubPath string) string
11691154
return filepath.Join(workspaceSubPath, pipelineTaskSubPath)
11701155
}
11711156

1172-
func addRetryHistory(tr *v1beta1.TaskRun) {
1173-
newStatus := *tr.Status.DeepCopy()
1174-
newStatus.RetriesStatus = nil
1175-
tr.Status.RetriesStatus = append(tr.Status.RetriesStatus, newStatus)
1176-
}
1177-
1178-
func clearStatus(tr *v1beta1.TaskRun) {
1179-
tr.Status.StartTime = nil
1180-
tr.Status.CompletionTime = nil
1181-
tr.Status.PodName = ""
1182-
}
1183-
11841157
func getTaskrunAnnotations(pr *v1beta1.PipelineRun) map[string]string {
11851158
// Propagate annotations from PipelineRun to TaskRun.
11861159
annotations := make(map[string]string, len(pr.ObjectMeta.Annotations)+1)

pkg/reconciler/pipelinerun/pipelinerun_test.go

+24-100
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,7 @@ spec:
494494
value: test-pipeline
495495
- name: contextRetriesParam
496496
value: "5"
497+
retries: 5
497498
resources:
498499
inputs:
499500
- name: workspace
@@ -3527,103 +3528,6 @@ spec:
35273528
}
35283529
}
35293530

3530-
// TestReconcileWithTimeoutAndRetry runs "Reconcile" against pipelines with
3531-
// retries and timeout settings, and status that represents different number of
3532-
// retries already performed. It verifies the reconciled status and events
3533-
// generated
3534-
func TestReconcileWithTimeoutAndRetry(t *testing.T) {
3535-
for _, tc := range []struct {
3536-
name string
3537-
retries int
3538-
conditionSucceeded corev1.ConditionStatus
3539-
wantEvents []string
3540-
}{{
3541-
name: "One try has to be done",
3542-
retries: 1,
3543-
conditionSucceeded: corev1.ConditionFalse,
3544-
wantEvents: []string{
3545-
"Warning Failed PipelineRun \"test-pipeline-retry-run-with-timeout\" failed to finish within",
3546-
},
3547-
}, {
3548-
name: "No more retries are needed",
3549-
retries: 2,
3550-
conditionSucceeded: corev1.ConditionUnknown,
3551-
wantEvents: []string{
3552-
"Warning Failed PipelineRun \"test-pipeline-retry-run-with-timeout\" failed to finish within",
3553-
},
3554-
}} {
3555-
t.Run(tc.name, func(t *testing.T) {
3556-
ps := []*v1beta1.Pipeline{parse.MustParseV1beta1Pipeline(t, fmt.Sprintf(`
3557-
metadata:
3558-
name: test-pipeline-retry
3559-
namespace: foo
3560-
spec:
3561-
tasks:
3562-
- name: hello-world-1
3563-
retries: %d
3564-
taskRef:
3565-
name: hello-world
3566-
`, tc.retries))}
3567-
prs := []*v1beta1.PipelineRun{parse.MustParseV1beta1PipelineRun(t, `
3568-
metadata:
3569-
name: test-pipeline-retry-run-with-timeout
3570-
namespace: foo
3571-
spec:
3572-
pipelineRef:
3573-
name: test-pipeline-retry
3574-
serviceAccountName: test-sa
3575-
timeout: 12h0m0s
3576-
status:
3577-
startTime: "2021-12-31T00:00:00Z"
3578-
`)}
3579-
3580-
ts := []*v1beta1.Task{
3581-
simpleHelloWorldTask,
3582-
}
3583-
trs := []*v1beta1.TaskRun{parse.MustParseV1beta1TaskRun(t, `
3584-
metadata:
3585-
name: hello-world-1
3586-
namespace: foo
3587-
status:
3588-
conditions:
3589-
- status: "False"
3590-
type: Succeeded
3591-
podName: my-pod-name
3592-
retriesStatus:
3593-
- conditions:
3594-
- status: "False"
3595-
type: Succeeded
3596-
`)}
3597-
3598-
prtrs := &v1beta1.PipelineRunTaskRunStatus{
3599-
PipelineTaskName: "hello-world-1",
3600-
Status: &trs[0].Status,
3601-
}
3602-
prs[0].Status.TaskRuns = make(map[string]*v1beta1.PipelineRunTaskRunStatus)
3603-
prs[0].Status.TaskRuns["hello-world-1"] = prtrs
3604-
3605-
d := test.Data{
3606-
PipelineRuns: prs,
3607-
Pipelines: ps,
3608-
Tasks: ts,
3609-
TaskRuns: trs,
3610-
}
3611-
prt := newPipelineRunTest(d, t)
3612-
defer prt.Cancel()
3613-
3614-
reconciledRun, _ := prt.reconcileRun("foo", "test-pipeline-retry-run-with-timeout", []string{}, false)
3615-
3616-
if len(reconciledRun.Status.TaskRuns["hello-world-1"].Status.RetriesStatus) != tc.retries {
3617-
t.Fatalf(" %d retries expected but got %d ", tc.retries, len(reconciledRun.Status.TaskRuns["hello-world-1"].Status.RetriesStatus))
3618-
}
3619-
3620-
if status := reconciledRun.Status.TaskRuns["hello-world-1"].Status.GetCondition(apis.ConditionSucceeded).Status; status != tc.conditionSucceeded {
3621-
t.Fatalf("Succeeded expected to be %s but is %s", tc.conditionSucceeded, status)
3622-
}
3623-
})
3624-
}
3625-
}
3626-
36273531
// TestReconcileAndPropagateCustomPipelineTaskRunSpec tests that custom PipelineTaskRunSpec declared
36283532
// in PipelineRun is propagated to created TaskRuns
36293533
func TestReconcileAndPropagateCustomPipelineTaskRunSpec(t *testing.T) {
@@ -9673,6 +9577,7 @@ spec:
96739577
"pr", "p", "platforms-and-browsers", false),
96749578
`
96759579
spec:
9580+
retries: 1
96769581
params:
96779582
- name: platform
96789583
value: linux
@@ -9687,12 +9592,17 @@ status:
96879592
conditions:
96889593
- type: Succeeded
96899594
status: "False"
9595+
retriesStatus:
9596+
- conditions:
9597+
- status: "False"
9598+
type: Succeeded
96909599
`),
96919600
mustParseTaskRunWithObjectMeta(t,
96929601
taskRunObjectMeta("pr-platforms-and-browsers-1", "foo",
96939602
"pr", "p", "platforms-and-browsers", false),
96949603
`
96959604
spec:
9605+
retries: 1
96969606
params:
96979607
- name: platform
96989608
value: mac
@@ -9807,6 +9717,7 @@ status:
98079717
"pr", "p", "platforms-and-browsers", false),
98089718
`
98099719
spec:
9720+
retries: 1
98109721
params:
98119722
- name: platform
98129723
value: linux
@@ -9820,7 +9731,7 @@ spec:
98209731
status:
98219732
conditions:
98229733
- type: Succeeded
9823-
status: "Unknown"
9734+
status: "False"
98249735
retriesStatus:
98259736
- conditions:
98269737
- status: "False"
@@ -9831,6 +9742,7 @@ status:
98319742
"pr", "p", "platforms-and-browsers", false),
98329743
`
98339744
spec:
9745+
retries: 1
98349746
params:
98359747
- name: platform
98369748
value: mac
@@ -9855,6 +9767,7 @@ status:
98559767
"pr", "p", "platforms-and-browsers", false),
98569768
`
98579769
spec:
9770+
retries: 1
98589771
params:
98599772
- name: platform
98609773
value: linux
@@ -9868,13 +9781,18 @@ spec:
98689781
status:
98699782
conditions:
98709783
- type: Succeeded
9871-
status: "False"
9784+
status: "Unknown"
9785+
retriesStatus:
9786+
- conditions:
9787+
- status: "False"
9788+
type: Succeeded
98729789
`),
98739790
mustParseTaskRunWithObjectMeta(t,
98749791
taskRunObjectMeta("pr-platforms-and-browsers-1", "foo",
98759792
"pr", "p", "platforms-and-browsers", false),
98769793
`
98779794
spec:
9795+
retries: 1
98789796
params:
98799797
- name: platform
98809798
value: mac
@@ -9888,7 +9806,11 @@ spec:
98889806
status:
98899807
conditions:
98909808
- type: Succeeded
9891-
status: "False"
9809+
status: "Unknown"
9810+
retriesStatus:
9811+
- conditions:
9812+
- status: "False"
9813+
type: Succeeded
98929814
`),
98939815
},
98949816
prs: []*v1beta1.PipelineRun{
@@ -9989,6 +9911,7 @@ status:
99899911
"pr", "p", "platforms-and-browsers", false),
99909912
`
99919913
spec:
9914+
retries: 1
99929915
params:
99939916
- name: platform
99949917
value: linux
@@ -10013,6 +9936,7 @@ status:
100139936
"pr", "p", "platforms-and-browsers", false),
100149937
`
100159938
spec:
9939+
retries: 1
100169940
params:
100179941
- name: platform
100189942
value: mac

0 commit comments

Comments
 (0)