Skip to content

Commit 746b1c4

Browse files
authored
chore: manually backport #5297: allow argocd-update step to return recommended requeue interval (#5301)
Signed-off-by: Kent Rancourt <[email protected]>
1 parent b194048 commit 746b1c4

File tree

7 files changed

+232
-36
lines changed

7 files changed

+232
-36
lines changed

pkg/cli/cmd/update/freight.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ func (o *updateFreightAliasOptions) addFlags(cmd *cobra.Command) {
7373
cmd.Flags(), &o.Project, o.Config.Project,
7474
"The project the freight belongs to. If not set, the default project will be used.",
7575
)
76-
option.Name(cmd.Flags(), &o.Name, "The name of the freight to to be updated.")
76+
option.Name(cmd.Flags(), &o.Name, "The name of the freight to be updated.")
7777
option.OldAlias(cmd.Flags(), &o.OldAlias, "The existing alias of the freight to be updated.")
7878
option.NewAlias(cmd.Flags(), &o.NewAlias, "The new alias to be assigned to the freight.")
7979

pkg/controller/promotions/promotions.go

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ type reconciler struct {
8080
kargoapi.Promotion,
8181
*kargoapi.Stage,
8282
*kargoapi.Freight,
83-
) (*kargoapi.PromotionStatus, error)
83+
) (*kargoapi.PromotionStatus, *time.Duration, error)
8484

8585
terminatePromotionFn func(
8686
context.Context,
@@ -328,6 +328,8 @@ func (r *reconciler) Reconcile(
328328

329329
newStatus := promo.Status.DeepCopy()
330330

331+
var suggestedRequeueInterval *time.Duration
332+
331333
// Wrap the promoteFn() call in an anonymous function to recover() any panics, so
332334
// we can update the promo's phase with Error if it does. This breaks an infinite
333335
// cycle of a bad promo continuously failing to reconcile, and surfaces the error.
@@ -343,7 +345,9 @@ func (r *reconciler) Reconcile(
343345
newStatus.Message = fmt.Sprintf("%v", err)
344346
}
345347
}()
346-
otherStatus, promoteErr := r.promoteFn(
348+
var otherStatus *kargoapi.PromotionStatus
349+
var promoteErr error
350+
otherStatus, suggestedRequeueInterval, promoteErr = r.promoteFn(
347351
promoCtx,
348352
*promo,
349353
stage,
@@ -457,7 +461,9 @@ func (r *reconciler) Reconcile(
457461
// If the promotion is still running, we'll need to periodically check on
458462
// it.
459463
if newStatus.Phase == kargoapi.PromotionPhaseRunning {
460-
return ctrl.Result{RequeueAfter: calculateRequeueInterval(promo)}, nil
464+
return ctrl.Result{
465+
RequeueAfter: calculateRequeueInterval(promo, suggestedRequeueInterval),
466+
}, nil
461467
}
462468
return ctrl.Result{}, nil
463469
}
@@ -467,19 +473,22 @@ func (r *reconciler) promote(
467473
promo kargoapi.Promotion,
468474
stage *kargoapi.Stage,
469475
targetFreight *kargoapi.Freight,
470-
) (*kargoapi.PromotionStatus, error) {
476+
) (*kargoapi.PromotionStatus, *time.Duration, error) {
471477
logger := logging.LoggerFromContext(ctx)
472478
stageName := stage.Name
473479
stageNamespace := promo.Namespace
474480

475481
if targetFreight == nil {
476482
// nolint:staticcheck
477-
return nil, fmt.Errorf("Freight %q not found in namespace %q", promo.Spec.Freight, promo.Namespace)
483+
return nil, nil, fmt.Errorf(
484+
"Freight %q not found in namespace %q",
485+
promo.Spec.Freight, promo.Namespace,
486+
)
478487
}
479488

480489
if !stage.IsFreightAvailable(targetFreight) {
481490
// nolint:staticcheck
482-
return nil, fmt.Errorf(
491+
return nil, nil, fmt.Errorf(
483492
"Freight %q is not available to Stage %q in namespace %q",
484493
promo.Spec.Freight,
485494
stageName,
@@ -549,7 +558,7 @@ func (r *reconciler) promote(
549558
promoCtx.StepExecutionMetadata = nil
550559
workingPromo.Status.HealthChecks = nil
551560
} else if !os.IsExist(err) {
552-
return nil, fmt.Errorf("error creating working directory: %w", err)
561+
return nil, nil, fmt.Errorf("error creating working directory: %w", err)
553562
}
554563
defer func() {
555564
if workingPromo.Status.Phase.IsTerminal() {
@@ -576,7 +585,7 @@ func (r *reconciler) promote(
576585
}
577586
if err != nil {
578587
workingPromo.Status.Phase = kargoapi.PromotionPhaseErrored
579-
return &workingPromo.Status, err
588+
return &workingPromo.Status, nil, err
580589
}
581590

582591
logger.Debug("promotion", "phase", workingPromo.Status.Phase)
@@ -606,7 +615,11 @@ func (r *reconciler) promote(
606615
}
607616
}
608617

609-
return &workingPromo.Status, nil
618+
if workingPromo.Status.Phase == kargoapi.PromotionPhaseRunning {
619+
return &workingPromo.Status, res.RetryAfter, nil
620+
}
621+
622+
return &workingPromo.Status, nil, nil
610623
}
611624

612625
// buildTargetFreightCollection constructs a FreightCollection that contains all
@@ -721,31 +734,38 @@ func parseCreateActorAnnotation(promo *kargoapi.Promotion) string {
721734

722735
var defaultRequeueInterval = 5 * time.Minute
723736

724-
func calculateRequeueInterval(p *kargoapi.Promotion) time.Duration {
737+
func calculateRequeueInterval(
738+
p *kargoapi.Promotion,
739+
suggestedRequeueInterval *time.Duration,
740+
) time.Duration {
741+
requeueInterval := defaultRequeueInterval
742+
if suggestedRequeueInterval != nil {
743+
requeueInterval = *suggestedRequeueInterval
744+
}
745+
725746
// Ensure we have a step for the current step index.
726747
if int(p.Status.CurrentStep) >= len(p.Spec.Steps) {
727-
return defaultRequeueInterval
748+
return requeueInterval
728749
}
729750

730751
step := p.Spec.Steps[p.Status.CurrentStep]
731752
registration := promotion.GetStepRunnerRegistration(step.Uses)
732753
timeout := step.Retry.GetTimeout(registration.Metadata.DefaultTimeout)
733754

734-
// If the timeout is 0 (no timeout), we should requeue at the default
735-
// interval.
755+
// If the timeout is 0 (no timeout), we should requeue at the full interval.
736756
if timeout == 0 {
737-
return defaultRequeueInterval
757+
return requeueInterval
738758
}
739759

740760
// Ensure we have an execution metadata entry for the current step.
741761
if int(p.Status.CurrentStep) >= len(p.Status.StepExecutionMetadata) {
742-
return defaultRequeueInterval
762+
return requeueInterval
743763
}
744764

745765
md := p.Status.StepExecutionMetadata[p.Status.CurrentStep]
746766
targetTimeout := md.StartedAt.Add(timeout)
747-
if targetTimeout.Before(time.Now().Add(defaultRequeueInterval)) {
767+
if targetTimeout.Before(time.Now().Add(requeueInterval)) {
748768
return time.Until(targetTimeout)
749769
}
750-
return defaultRequeueInterval
770+
return requeueInterval
751771
}

pkg/controller/promotions/promotions_test.go

Lines changed: 89 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ import (
1111
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1212
k8sruntime "k8s.io/apimachinery/pkg/runtime"
1313
"k8s.io/apimachinery/pkg/types"
14+
"k8s.io/utils/ptr"
1415
ctrl "sigs.k8s.io/controller-runtime"
1516
"sigs.k8s.io/controller-runtime/pkg/client"
1617
"sigs.k8s.io/controller-runtime/pkg/client/fake"
@@ -63,8 +64,11 @@ func TestReconcile(t *testing.T) {
6364
testCases := []struct {
6465
name string
6566
promos []client.Object
66-
promoteFn func(context.Context, kargoapi.Promotion,
67-
*kargoapi.Freight) (*kargoapi.PromotionStatus, error)
67+
promoteFn func(
68+
context.Context,
69+
kargoapi.Promotion,
70+
*kargoapi.Freight,
71+
) (*kargoapi.PromotionStatus, *time.Duration, error)
6872
terminateFn func(context.Context, *kargoapi.Promotion) error
6973
promoToReconcile *types.NamespacedName // if nil, uses the first of the promos
7074
expectPromoteFnCalled bool
@@ -238,7 +242,11 @@ func TestReconcile(t *testing.T) {
238242
newPromo("fake-namespace", "fake-promo", "fake-stage", kargoapi.PromotionPhasePending, before),
239243
},
240244
promoToReconcile: &types.NamespacedName{Namespace: "fake-namespace", Name: "fake-promo"},
241-
promoteFn: func(_ context.Context, _ kargoapi.Promotion, _ *kargoapi.Freight) (*kargoapi.PromotionStatus, error) {
245+
promoteFn: func(
246+
context.Context,
247+
kargoapi.Promotion,
248+
*kargoapi.Freight,
249+
) (*kargoapi.PromotionStatus, *time.Duration, error) {
242250
panic("expected panic")
243251
},
244252
},
@@ -263,8 +271,12 @@ func TestReconcile(t *testing.T) {
263271
newPromo("fake-namespace", "fake-promo", "fake-stage", kargoapi.PromotionPhasePending, before),
264272
},
265273
promoToReconcile: &types.NamespacedName{Namespace: "fake-namespace", Name: "fake-promo"},
266-
promoteFn: func(_ context.Context, _ kargoapi.Promotion, _ *kargoapi.Freight) (*kargoapi.PromotionStatus, error) {
267-
return nil, errors.New("expected error")
274+
promoteFn: func(
275+
context.Context,
276+
kargoapi.Promotion,
277+
*kargoapi.Freight,
278+
) (*kargoapi.PromotionStatus, *time.Duration, error) {
279+
return nil, nil, errors.New("expected error")
268280
},
269281
},
270282
{
@@ -300,12 +312,14 @@ func TestReconcile(t *testing.T) {
300312
p kargoapi.Promotion,
301313
_ *kargoapi.Stage,
302314
f *kargoapi.Freight,
303-
) (*kargoapi.PromotionStatus, error) {
315+
) (*kargoapi.PromotionStatus, *time.Duration, error) {
304316
promoteWasCalled = true
305317
if tc.promoteFn != nil {
306318
return tc.promoteFn(ctx, p, f)
307319
}
308-
return &kargoapi.PromotionStatus{Phase: kargoapi.PromotionPhaseSucceeded}, nil
320+
return &kargoapi.PromotionStatus{
321+
Phase: kargoapi.PromotionPhaseSucceeded,
322+
}, nil, nil
309323
}
310324

311325
terminateWasCalled := false
@@ -620,9 +634,10 @@ func Test_calculateRequeueInterval(t *testing.T) {
620634
require.Greater(t, testTimeout, defaultRequeueInterval)
621635

622636
testCases := []struct {
623-
name string
624-
promo *kargoapi.Promotion
625-
assertions func(*testing.T, time.Duration)
637+
name string
638+
promo *kargoapi.Promotion
639+
suggestedRequeueInterval *time.Duration
640+
assertions func(*testing.T, time.Duration)
626641
}{
627642
{
628643
name: "current step out of bounds",
@@ -698,7 +713,62 @@ func Test_calculateRequeueInterval(t *testing.T) {
698713
},
699714
},
700715
{
701-
name: "timeout occurs after next interval",
716+
name: "timeout would occur after suggested interval elapses",
717+
suggestedRequeueInterval: ptr.To(time.Minute),
718+
promo: &kargoapi.Promotion{
719+
Spec: kargoapi.PromotionSpec{
720+
Steps: []kargoapi.PromotionStep{{
721+
Uses: testStepKindWithTimeout,
722+
}},
723+
},
724+
Status: kargoapi.PromotionStatus{
725+
CurrentStep: 0,
726+
StepExecutionMetadata: []kargoapi.StepExecutionMetadata{{
727+
// If the step started now and times out after an interval greater
728+
// than the default requeue interval, then the wall clock time of
729+
// the timeout will be AFTER the wall clock time of the next
730+
// reconciliation.
731+
StartedAt: &metav1.Time{Time: time.Now()},
732+
}},
733+
},
734+
},
735+
assertions: func(t *testing.T, requeueInterval time.Duration) {
736+
// The request should be requeued according to the suggestion.
737+
require.Equal(t, time.Minute, requeueInterval)
738+
// Sanity check that the requeue interval is always greater than 0.
739+
require.Greater(t, requeueInterval, time.Duration(0))
740+
},
741+
},
742+
{
743+
name: "timeout would occur before suggested interval elapses",
744+
suggestedRequeueInterval: ptr.To(time.Minute),
745+
promo: &kargoapi.Promotion{
746+
Spec: kargoapi.PromotionSpec{
747+
Steps: []kargoapi.PromotionStep{{
748+
Uses: testStepKindWithTimeout,
749+
}},
750+
},
751+
Status: kargoapi.PromotionStatus{
752+
CurrentStep: 0,
753+
StepExecutionMetadata: []kargoapi.StepExecutionMetadata{{
754+
// If the step has only a minute to go before timeout, then the wall
755+
// clock time of the timeout will be BEFORE the wall clock time of
756+
// the next reconciliation.
757+
StartedAt: &metav1.Time{
758+
Time: metav1.Now().Add(-testTimeout).Add(time.Minute),
759+
},
760+
}},
761+
},
762+
},
763+
assertions: func(t *testing.T, requeueInterval time.Duration) {
764+
// The interval to the next reconciliation should be shortened.
765+
require.Less(t, requeueInterval, time.Minute)
766+
// Sanity check that the requeue interval is always greater than 0.
767+
require.Greater(t, requeueInterval, time.Duration(0))
768+
},
769+
},
770+
{
771+
name: "timeout would occur after default interval elapses",
702772
promo: &kargoapi.Promotion{
703773
Spec: kargoapi.PromotionSpec{
704774
Steps: []kargoapi.PromotionStep{{
@@ -724,7 +794,7 @@ func Test_calculateRequeueInterval(t *testing.T) {
724794
},
725795
},
726796
{
727-
name: "timeout occurs before next interval",
797+
name: "timeout would occur before next default interval elapses",
728798
promo: &kargoapi.Promotion{
729799
Spec: kargoapi.PromotionSpec{
730800
Steps: []kargoapi.PromotionStep{{
@@ -753,7 +823,13 @@ func Test_calculateRequeueInterval(t *testing.T) {
753823
}
754824
for _, testCase := range testCases {
755825
t.Run(testCase.name, func(t *testing.T) {
756-
testCase.assertions(t, calculateRequeueInterval(testCase.promo))
826+
testCase.assertions(
827+
t,
828+
calculateRequeueInterval(
829+
testCase.promo,
830+
testCase.suggestedRequeueInterval,
831+
),
832+
)
757833
})
758834
}
759835
}

pkg/promotion/promotion.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"slices"
77
"strings"
8+
"time"
89

910
gocache "github.com/patrickmn/go-cache"
1011
"github.com/stretchr/testify/assert/yaml"
@@ -401,6 +402,10 @@ type Result struct {
401402
StepExecutionMetadata kargoapi.StepExecutionMetadataList
402403
// State is the current state of the promotion process.
403404
State State
405+
// RetryAfter is an optional, SUGGESTED duration after which a Promotion
406+
// reporting itself to be in a Running status should be retried. Note: This is
407+
// unrelated to retrying upon non-terminal failures.
408+
RetryAfter *time.Duration
404409
}
405410

406411
// getTaskOutputs returns the outputs of a task that are relevant to the current
@@ -505,4 +510,8 @@ type StepResult struct {
505510
// returned by some StepRunner upon successful execution of a Step. These
506511
// criteria can be used later as input to a health.Checker.
507512
HealthCheck *health.Criteria
513+
// RetryAfter is an optional, SUGGESTED duration after which a step reporting
514+
// itself to be in a Running status should be retried. Note: This is unrelated
515+
// to retrying upon non-terminal failures.
516+
RetryAfter *time.Duration
508517
}

0 commit comments

Comments
 (0)