Skip to content

Commit 35d0209

Browse files
Fix rollout with unavailable machines
1 parent b2f4500 commit 35d0209

File tree

2 files changed

+37
-1
lines changed

2 files changed

+37
-1
lines changed

internal/controllers/machinedeployment/machinedeployment_rollout_rollingupdate.go

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -247,12 +247,18 @@ func (p *rolloutPlanner) reconcileOldMachineSetsRollingUpdate(ctx context.Contex
247247
}
248248
}
249249

250+
// On top of considering maxUnavailable, before scaling down it is also important to consider how unavailability is distributed
251+
// across MachineSets, because if there are unavailable machines on the new MachineSet it is necessary to slow down or stop rollout
252+
// to prevent to transition all the Machines to a broken state.
253+
// In order to do so, compute the number of unavailable machines on the new MachineSet and use it to reduce totalScaleDownCount.
254+
newMSUnavailableMachineCount := max(ptr.Deref(p.newMS.Spec.Replicas, 0)-ptr.Deref(p.newMS.Status.AvailableReplicas, 0), 0)
255+
250256
// Compute the total number of replicas that can be scaled down.
251257
// Exit immediately if there is no room for scaling down.
252258
// NOTE: this is a quick preliminary check to verify if there is room for scaling down any of the oldMSs; further down the code
253259
// will make additional checks to ensure scale down actually happens without breaching MaxUnavailable, and
254260
// if necessary, it will reduce the extent of the scale down accordingly.
255-
totalScaleDownCount := max(totalSpecReplicas-totalPendingScaleDown-minAvailable, 0)
261+
totalScaleDownCount := max(totalSpecReplicas-totalPendingScaleDown-minAvailable-newMSUnavailableMachineCount, 0)
256262
if totalScaleDownCount <= 0 {
257263
return nil
258264
}

internal/controllers/machinedeployment/machinedeployment_rollout_rollingupdate_test.go

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,36 @@ func Test_reconcileOldMachineSetsRollingUpdate(t *testing.T) {
463463
},
464464
expectedNotes: map[string][]string{},
465465
},
466+
{
467+
name: "do not scale down if there are unavailable replicas on the new MachineSet (maxSurge 1, maxUnavailable 9), all machines unavailable",
468+
scaleIntent: map[string]int32{},
469+
md: createMD("v2", 3, withRollingUpdateStrategy(1, 0)),
470+
newMS: createMS("ms2", "v2", 1, withStatusReplicas(1), withStatusAvailableReplicas(0)), // one machine created on the new NewMS due to maxSurge, but it is not reaching the available state,
471+
oldMSs: []*clusterv1.MachineSet{
472+
createMS("ms1", "v1", 2, withStatusReplicas(3), withStatusAvailableReplicas(0)), // OldMS has unavailable replicas
473+
},
474+
expectScaleIntent: map[string]int32{
475+
// no new scale down intent for oldMSs (ms1):
476+
// 3 available replicas from ms1 - 1 replica already scaling down from ms1 + 3 available replica from ms2 = 5 available replicas == minAvailability, we cannot further scale down
477+
},
478+
expectedNotes: map[string][]string{},
479+
skipMaxUnavailabilityCheck: true, // The test case is simulating all machines not unavailable, so this check will fail
480+
},
481+
{
482+
name: "do not scale down if there are unavailable replicas on the new MachineSet (maxSurge 0, maxUnavailable 1), all machines unavailable",
483+
scaleIntent: map[string]int32{},
484+
md: createMD("v2", 3, withRollingUpdateStrategy(0, 1)),
485+
newMS: createMS("ms2", "v2", 1, withStatusReplicas(1), withStatusAvailableReplicas(0)), // one machine created on the new NewMS after scale down on the old MS, but it is not reaching the available state,
486+
oldMSs: []*clusterv1.MachineSet{
487+
createMS("ms1", "v1", 2, withStatusReplicas(2), withStatusAvailableReplicas(0)), // OldMS scaled down due to maxUnavailable 1
488+
},
489+
expectScaleIntent: map[string]int32{
490+
// no new scale down intent for oldMSs (ms1):
491+
// 3 available replicas from ms1 - 1 replica already scaling down from ms1 + 3 available replica from ms2 = 5 available replicas == minAvailability, we cannot further scale down
492+
},
493+
expectedNotes: map[string][]string{},
494+
skipMaxUnavailabilityCheck: true, // The test case is simulating all machines not unavailable, so this check will fail
495+
},
466496
{
467497
name: "do not scale down if there are more replicas than minAvailable replicas, but scale down from current reconcile already takes the availability buffer (newMS is scaling down)",
468498
scaleIntent: map[string]int32{

0 commit comments

Comments
 (0)