Skip to content

Commit bff7e2e

Browse files
🐛 Fix rollout with unavailable machines (#13020)
* Fix rollout with unavailable machines * Address comments
1 parent 9e0cf62 commit bff7e2e

File tree

2 files changed

+35
-1
lines changed

2 files changed

+35
-1
lines changed

internal/controllers/machinedeployment/machinedeployment_rollout_rollingupdate.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,12 +247,18 @@ func (p *rolloutPlanner) reconcileOldMachineSetsRollingUpdate(ctx context.Contex
247247
}
248248
}
249249

250+
// On top of considering maxUnavailable, before scaling down it is also important to consider how unavailability is distributed
251+
// across MachineSets, because if there are unavailable machines on the new MachineSet it is necessary to slow down or stop rollout
252+
// to prevent to transition all the Machines to a broken state.
253+
// In order to do so, compute the number of unavailable machines on the new MachineSet and use it to reduce totalScaleDownCount.
254+
newMSUnavailableMachineCount := max(ptr.Deref(p.newMS.Spec.Replicas, 0)-ptr.Deref(p.newMS.Status.AvailableReplicas, 0), 0)
255+
250256
// Compute the total number of replicas that can be scaled down.
251257
// Exit immediately if there is no room for scaling down.
252258
// NOTE: this is a quick preliminary check to verify if there is room for scaling down any of the oldMSs; further down the code
253259
// will make additional checks to ensure scale down actually happens without breaching MaxUnavailable, and
254260
// if necessary, it will reduce the extent of the scale down accordingly.
255-
totalScaleDownCount := max(totalSpecReplicas-totalPendingScaleDown-minAvailable, 0)
261+
totalScaleDownCount := max(totalSpecReplicas-totalPendingScaleDown-minAvailable-newMSUnavailableMachineCount, 0)
256262
if totalScaleDownCount <= 0 {
257263
return nil
258264
}

internal/controllers/machinedeployment/machinedeployment_rollout_rollingupdate_test.go

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,34 @@ func Test_reconcileOldMachineSetsRollingUpdate(t *testing.T) {
463463
},
464464
expectedNotes: map[string][]string{},
465465
},
466+
{
467+
name: "do not scale down if there are unavailable replicas on the new MachineSet (maxSurge 1, maxUnavailable 0), all machines unavailable",
468+
scaleIntent: map[string]int32{},
469+
md: createMD("v2", 3, withRollingUpdateStrategy(1, 0)),
470+
newMS: createMS("ms2", "v2", 1, withStatusReplicas(1), withStatusAvailableReplicas(0)), // one machine created on the new NewMS due to maxSurge, but it is not reaching the available state,
471+
oldMSs: []*clusterv1.MachineSet{
472+
createMS("ms1", "v1", 3, withStatusReplicas(3), withStatusAvailableReplicas(0)),
473+
},
474+
expectScaleIntent: map[string]int32{
475+
// no new scale down intent for oldMSs (ms1): unavailability on new MS
476+
},
477+
expectedNotes: map[string][]string{},
478+
skipMaxUnavailabilityCheck: true, // The test case is simulating all machines not unavailable, so this check will fail
479+
},
480+
{
481+
name: "do not scale down if there are unavailable replicas on the new MachineSet (maxSurge 0, maxUnavailable 1), all machines unavailable",
482+
scaleIntent: map[string]int32{},
483+
md: createMD("v2", 3, withRollingUpdateStrategy(0, 1)),
484+
newMS: createMS("ms2", "v2", 1, withStatusReplicas(1), withStatusAvailableReplicas(0)), // one machine created on the new NewMS after scale down on the old MS, but it is not reaching the available state,
485+
oldMSs: []*clusterv1.MachineSet{
486+
createMS("ms1", "v1", 2, withStatusReplicas(2), withStatusAvailableReplicas(0)), // OldMS scaled down due to maxUnavailable 1
487+
},
488+
expectScaleIntent: map[string]int32{
489+
// no new scale down intent for oldMSs (ms1): unavailability on new MS
490+
},
491+
expectedNotes: map[string][]string{},
492+
skipMaxUnavailabilityCheck: true, // The test case is simulating all machines not unavailable, so this check will fail
493+
},
466494
{
467495
name: "do not scale down if there are more replicas than minAvailable replicas, but scale down from current reconcile already takes the availability buffer (newMS is scaling down)",
468496
scaleIntent: map[string]int32{

0 commit comments

Comments
 (0)