
Commit efed170

mxm-tr (maxime.hubert) authored and committed
fix(autoscaler): Fix internal cache issue when downscale is stressed
1 parent: bf23616 · commit: efed170

6 files changed: +141 -10 lines changed

cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_manager.go

Lines changed: 37 additions & 2 deletions
@@ -23,6 +23,7 @@ import (
 	"fmt"
 	"io"
 	"io/ioutil"
+	"slices"
 	"sync"
 	"time"

@@ -62,7 +63,7 @@ type OvhCloudManager struct {
 	ClusterID string
 	ProjectID string

-	NodePools                  []sdk.NodePool
+	NodePoolsPerID             map[string]*sdk.NodePool
 	NodeGroupPerProviderID     map[string]*NodeGroup
 	NodeGroupPerProviderIDLock sync.RWMutex

@@ -148,7 +149,7 @@ func NewManager(configFile io.Reader) (*OvhCloudManager, error) {
 		ProjectID: cfg.ProjectID,
 		ClusterID: cfg.ClusterID,

-		NodePools:                  make([]sdk.NodePool, 0),
+		NodePoolsPerID:             make(map[string]*sdk.NodePool),
 		NodeGroupPerProviderID:     make(map[string]*NodeGroup),
 		NodeGroupPerProviderIDLock: sync.RWMutex{},

@@ -232,6 +233,40 @@ func (m *OvhCloudManager) ReAuthenticate() error {
 	return nil
 }

+// setNodePoolsState updates the local node pool state from the given list.
+// It modifies the NodePoolsPerID entries in place, so the references held in NodeGroupPerProviderID see the refreshed data:
+//
+//   - updates fields on node pools that are already referenced
+//   - adds node pools that are not referenced yet
+//   - removes node pools that are absent from the given list (they no longer exist)
+func (m *OvhCloudManager) setNodePoolsState(pools []sdk.NodePool) {
+	m.NodeGroupPerProviderIDLock.Lock()
+	defer m.NodeGroupPerProviderIDLock.Unlock()
+
+	poolIDsToKeep := []string{}
+	for _, pool := range pools {
+		poolIDsToKeep = append(poolIDsToKeep, pool.ID)
+	}
+
+	// Update node pools state
+	for _, pool := range pools {
+		poolRef, ok := m.NodePoolsPerID[pool.ID]
+		if ok {
+			*poolRef = pool // Update existing value
+		} else {
+			poolCopy := pool
+			m.NodePoolsPerID[pool.ID] = &poolCopy
+		}
+	}
+
+	// Remove node pools that no longer exist
+	for poolID := range m.NodePoolsPerID {
+		if !slices.Contains(poolIDsToKeep, poolID) {
+			delete(m.NodePoolsPerID, poolID)
+		}
+	}
+}
+
 // readConfig read cloud provider configuration file into a struct
 func readConfig(configFile io.Reader) (*Config, error) {
 	cfg := &Config{}
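
The heart of the fix is in this hunk: node pools are now cached as a map of pointers, and setNodePoolsState writes through those pointers, so any NodeGroup that embeds the same *sdk.NodePool sees refreshed data without its reference being invalidated. The following is a minimal, self-contained sketch of that aliasing behaviour; it uses stand-in types (not the real sdk package or OvhCloudManager), so all names and fields here are illustrative only.

package main

import "fmt"

// Stand-in types mirroring the shapes in this commit (illustrative, not the real sdk package).
type NodePool struct {
	ID           string
	DesiredNodes uint32
}

type NodeGroup struct {
	*NodePool // embedded pointer, as in the patched NodeGroup type
}

func main() {
	// A cache keyed by pool ID, playing the role of NodePoolsPerID.
	poolsPerID := map[string]*NodePool{
		"np1": {ID: "np1", DesiredNodes: 1},
	}

	// A cached node group holds a pointer to the same NodePool value.
	group := NodeGroup{NodePool: poolsPerID["np1"]}

	// Refresh: overwrite the value in place, as setNodePoolsState does with *poolRef = pool.
	*poolsPerID["np1"] = NodePool{ID: "np1", DesiredNodes: 20}

	// The cached node group observes the refreshed value without being rebuilt.
	fmt.Println(group.DesiredNodes) // prints 20
}

With the previous layout (a []sdk.NodePool value slice and a NodeGroup embedding sdk.NodePool by value), a refresh replaced the slice while cached node groups kept stale copies, which appears to be the cache inconsistency the commit message targets.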

cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_manager_test.go

Lines changed: 96 additions & 0 deletions
@@ -290,3 +290,99 @@ func TestOvhCloudManager_cacheConcurrency(t *testing.T) {
 		manager.getNodeGroupPerProviderID("")
 	})
 }
+
+func TestOvhCloudManager_setNodePoolsState(t *testing.T) {
+	manager := newTestManager(t)
+	np1 := sdk.NodePool{ID: "np1", DesiredNodes: 1}
+	np2 := sdk.NodePool{ID: "np2", DesiredNodes: 2}
+	np3 := sdk.NodePool{ID: "np3", DesiredNodes: 3}
+
+	type fields struct {
+		NodePoolsPerID         map[string]*sdk.NodePool
+		NodeGroupPerProviderID map[string]*NodeGroup
+	}
+	type args struct {
+		poolsList []sdk.NodePool
+
+		nodePoolsPerID         map[string]*sdk.NodePool
+		nodeGroupPerProviderID map[string]*NodeGroup
+	}
+	tests := []struct {
+		name                       string
+		fields                     fields
+		args                       args
+		wantNodePoolsPerID         map[string]uint32 // ID => desired nodes
+		wantNodeGroupPerProviderID map[string]uint32 // ID => desired nodes
+	}{
+		{
+			name: "NodePoolsPerID and NodeGroupPerProviderID empty",
+			fields: fields{
+				NodePoolsPerID:         map[string]*sdk.NodePool{},
+				NodeGroupPerProviderID: map[string]*NodeGroup{},
+			},
+			args: args{
+				poolsList: []sdk.NodePool{
+					np1,
+				},
+				nodePoolsPerID:         map[string]*sdk.NodePool{},
+				nodeGroupPerProviderID: map[string]*NodeGroup{},
+			},
+			wantNodePoolsPerID:         map[string]uint32{"np1": 1},
+			wantNodeGroupPerProviderID: map[string]uint32{},
+		},
+		{
+			name: "NodePoolsPerID and NodeGroupPerProviderID already populated",
+			fields: fields{
+				NodePoolsPerID: map[string]*sdk.NodePool{
+					"np2": &np2,
+					"np3": &np3,
+				},
+				NodeGroupPerProviderID: map[string]*NodeGroup{
+					"np2-node-id": {NodePool: &np2},
+					"np3-node-id": {NodePool: &np3},
+				},
+			},
+			args: args{
+				poolsList: []sdk.NodePool{
+					{
+						ID:           "np1",
+						DesiredNodes: 1,
+					},
+					{
+						ID:           "np2",
+						DesiredNodes: 20,
+					},
+				},
+				nodePoolsPerID:         map[string]*sdk.NodePool{},
+				nodeGroupPerProviderID: map[string]*NodeGroup{},
+			},
+			wantNodePoolsPerID: map[string]uint32{
+				"np1": 1,  // np1 added
+				"np2": 20, // np2 updated
+				// np3 removed
+			},
+			wantNodeGroupPerProviderID: map[string]uint32{
+				"np2-node-id": 20,
+				"np3-node-id": 3, // a node group still referencing a removed pool must keep working without crashing
+			},
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			manager.NodePoolsPerID = tt.fields.NodePoolsPerID
+			manager.NodeGroupPerProviderID = tt.fields.NodeGroupPerProviderID

+			manager.setNodePoolsState(tt.args.poolsList)
+
+			assert.Len(t, manager.NodePoolsPerID, len(tt.wantNodePoolsPerID))
+			for id, desiredNodes := range tt.wantNodePoolsPerID {
+				assert.Equal(t, desiredNodes, manager.NodePoolsPerID[id].DesiredNodes)
+			}
+
+			assert.Len(t, manager.NodeGroupPerProviderID, len(tt.wantNodeGroupPerProviderID))
+			for nodeID, desiredNodes := range tt.wantNodeGroupPerProviderID {
+				assert.Equal(t, desiredNodes, manager.NodeGroupPerProviderID[nodeID].DesiredNodes)
+			}
+		})
+	}
+}
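
Assuming a standard Go toolchain and the cluster-autoscaler module setup, running go test -run TestOvhCloudManager_setNodePoolsState ./cloudprovider/ovhcloud/ from the cluster-autoscaler directory should exercise just these cases.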

cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_node_group.go

Lines changed: 2 additions & 2 deletions
@@ -41,7 +41,7 @@ const providerIDPrefix = "openstack:///"

 // NodeGroup implements cloudprovider.NodeGroup interface.
 type NodeGroup struct {
-	sdk.NodePool
+	*sdk.NodePool

 	Manager     *OvhCloudManager
 	CurrentSize int
@@ -294,7 +294,7 @@ func (ng *NodeGroup) Create() (cloudprovider.NodeGroup, error) {

 	// Forge a node group interface given the API response
 	return &NodeGroup{
-		NodePool:    *np,
+		NodePool:    np,
 		Manager:     ng.Manager,
 		CurrentSize: int(ng.DesiredNodes),
 	}, nil

cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_node_group_test.go

Lines changed: 1 addition & 1 deletion
@@ -85,7 +85,7 @@ func newTestNodeGroup(t *testing.T, flavor string) *NodeGroup {

 	ng := &NodeGroup{
 		Manager: manager,
-		NodePool: sdk.NodePool{
+		NodePool: &sdk.NodePool{
 			ID:     "id",
 			Name:   fmt.Sprintf("pool-%s", flavor),
 			Flavor: flavor,

cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_provider.go

Lines changed: 3 additions & 3 deletions
@@ -100,7 +100,7 @@ func (provider *OVHCloudProvider) NodeGroups() []cloudprovider.NodeGroup {
 	groups := make([]cloudprovider.NodeGroup, 0)

 	// Cast API node pools into CA node groups
-	for _, pool := range provider.manager.NodePools {
+	for _, pool := range provider.manager.NodePoolsPerID {
 		// Node pools without autoscaling are equivalent to node pools with autoscaling but no scale possible
 		if !pool.Autoscale {
 			pool.MaxNodes = pool.DesiredNodes
@@ -238,7 +238,7 @@ func (provider *OVHCloudProvider) GetAvailableMachineTypes() ([]string, error) {
 // Implementation optional.
 func (provider *OVHCloudProvider) NewNodeGroup(machineType string, labels map[string]string, systemLabels map[string]string, taints []apiv1.Taint, extraResources map[string]resource.Quantity) (cloudprovider.NodeGroup, error) {
 	ng := &NodeGroup{
-		NodePool: sdk.NodePool{
+		NodePool: &sdk.NodePool{
 			Name:     fmt.Sprintf("%s-%d", machineType, rand.Int63()),
 			Flavor:   machineType,
 			MinNodes: 0,
@@ -314,7 +314,7 @@ func (provider *OVHCloudProvider) Refresh() error {
 	}

 	// Update the node pools cache
-	provider.manager.NodePools = pools
+	provider.manager.setNodePoolsState(pools)

 	return nil
 }
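
With Refresh delegating to setNodePoolsState, the cache is now updated in place under the manager's lock rather than by swapping out a slice. The sketch below, again using stand-in types rather than the real manager, illustrates the general pattern of a lock-protected in-place refresh racing against concurrent readers (the kind of pressure a stressed downscale puts on the cache); the read path shown here is an assumption for illustration, not the provider's actual code.

package main

import (
	"fmt"
	"sync"
)

// Stand-in pool type; the real code uses sdk.NodePool.
type NodePool struct {
	ID           string
	DesiredNodes uint32
}

// poolCache mirrors the locking pattern of setNodePoolsState: one RWMutex guards a map of pointers.
type poolCache struct {
	mu    sync.RWMutex
	perID map[string]*NodePool
}

// refresh updates existing entries in place under the write lock, adds new pools, and drops missing ones.
func (c *poolCache) refresh(pools []NodePool) {
	c.mu.Lock()
	defer c.mu.Unlock()
	seen := map[string]bool{}
	for _, p := range pools {
		seen[p.ID] = true
		if ref, ok := c.perID[p.ID]; ok {
			*ref = p
		} else {
			cp := p
			c.perID[p.ID] = &cp
		}
	}
	for id := range c.perID {
		if !seen[id] {
			delete(c.perID, id)
		}
	}
}

// desired is an illustrative read path taking the read lock, as a scale-down decision might.
func (c *poolCache) desired(id string) uint32 {
	c.mu.RLock()
	defer c.mu.RUnlock()
	if p, ok := c.perID[id]; ok {
		return p.DesiredNodes
	}
	return 0
}

func main() {
	c := &poolCache{perID: map[string]*NodePool{}}
	var wg sync.WaitGroup
	wg.Add(2)
	go func() { // repeated refreshes, standing in for Refresh()
		defer wg.Done()
		for i := 0; i < 1000; i++ {
			c.refresh([]NodePool{{ID: "np1", DesiredNodes: uint32(i)}})
		}
	}()
	go func() { // concurrent reads, standing in for a stressed downscale
		defer wg.Done()
		for i := 0; i < 1000; i++ {
			_ = c.desired("np1")
		}
	}()
	wg.Wait()
	fmt.Println("final desired nodes:", c.desired("np1")) // prints 999
}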

cluster-autoscaler/cloudprovider/ovhcloud/ovh_cloud_provider_test.go

Lines changed: 2 additions & 2 deletions
@@ -141,7 +141,7 @@ func TestOVHCloudProvider_NodeGroups(t *testing.T) {
 	})

 	t.Run("check empty node groups length after reset", func(t *testing.T) {
-		provider.manager.NodePools = []sdk.NodePool{}
+		provider.manager.NodePoolsPerID = map[string]*sdk.NodePool{}
 		groups := provider.NodeGroups()

 		assert.Equal(t, 0, len(groups))
@@ -403,7 +403,7 @@ func TestOVHCloudProvider_Refresh(t *testing.T) {
 	provider := newTestProvider(t)

 	t.Run("check refresh reset node groups correctly", func(t *testing.T) {
-		provider.manager.NodePools = []sdk.NodePool{}
+		provider.manager.NodePoolsPerID = map[string]*sdk.NodePool{}
 		groups := provider.NodeGroups()

 		assert.Equal(t, 0, len(groups))
