Skip to content

Commit 5c795cf

Browse files
Merge pull request #271 from atlassian/vportella/add-nodegroup-health-checking
Freeze scaling when unhealthy nodes found and remove them
2 parents f6afbfe + 3042c3a commit 5c795cf

File tree

14 files changed

+1005
-93
lines changed

14 files changed

+1005
-93
lines changed

docs/configuration/nodegroup.md

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,12 +29,12 @@ node_groups:
2929
taint_effect: NoExecute
3030
max_node_age: 24h
3131
aws:
32-
fleet_instance_ready_timeout: 1m
33-
launch_template_id: lt-1a2b3c4d
34-
launch_template_version: "1"
35-
lifecycle: on-demand
36-
instance_type_overrides: ["t2.large", "t3.large"]
37-
resource_tagging: false
32+
fleet_instance_ready_timeout: 1m
33+
launch_template_id: lt-1a2b3c4d
34+
launch_template_version: "1"
35+
lifecycle: on-demand
36+
instance_type_overrides: ["t2.large", "t3.large"]
37+
resource_tagging: false
3838
```
3939
4040
## Options
@@ -273,3 +273,31 @@ When not at the minimum, the natural scaling up and down of the node group will
273273
node group.
274274

275275
This is an optional feature and by default is disabled.
276+
277+
### `unhealthy_node_grace_period`
278+
279+
Defines the minimum age of a node before it can be tested to check if it is unhealthy.
280+
281+
When enabled, instances are tested periodically to determine whether they are healthy. Escalator will pause all scaling activity and flush out unhealthy instances when the number of unhealthy instances exceeds the configured threshold for the nodegroup. It continues doing this until enough instances in the nodegroup are healthy for normal scaling activity to resume.
282+
283+
Cordoned nodes are skipped and can never be considered unhealthy.
284+
285+
This is an optional field. The default value is empty, which disables the feature.
286+
287+
### `health_check_newest_nodes_percent`
288+
289+
**[Only used if `unhealthy_node_grace_period` is set.]**
290+
291+
The percentage of nodes (ordered by age from newer to older) in the nodegroup that are considered when checking for the maximum allowed unhealthy nodes in the nodegroup. The nodes captured by this percentage form the "test set" to be checked. Only nodes which are older than `unhealthy_node_grace_period` will be included in the test set.
292+
293+
This field is required.
294+
295+
### `max_unhealthy_nodes_percent`
296+
297+
**[Only used if `unhealthy_node_grace_period` is set.]**
298+
299+
The maximum percentage of unhealthy nodes in the test set from `health_check_newest_nodes_percent`. Beyond this threshold all scaling activity is paused and unhealthy nodes are flushed out.
300+
301+
> **Note:** The valid range for `max_unhealthy_nodes_percent` is `0%` to `99%`.
302+
303+
This is an optional field. If not set, it will default to `0%`.

pkg/controller/controller.go

Lines changed: 149 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ package controller
22

33
import (
44
"math"
5+
"sort"
56
"time"
67

78
"github.com/atlassian/escalator/pkg/cloudprovider"
@@ -227,6 +228,12 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
227228
nodeGroup.memCapacity = *allNodes[0].Status.Allocatable.Memory()
228229
}
229230

231+
// Taint all instances considered to be unhealthy before filtering the nodes
232+
// into groups.
233+
if nodeGroup.Opts.UnhealthyNodeGracePeriodDuration() > 0 {
234+
c.taintUnhealthyInstances(allNodes, nodeGroup)
235+
}
236+
230237
// Filter into untainted and tainted nodes
231238
untaintedNodes, taintedNodes, forceTaintedNodes, cordonedNodes := c.filterNodes(nodeGroup, allNodes)
232239

@@ -420,6 +427,22 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
420427
log.WithField("nodegroup", nodegroup).Error(forceActionErr)
421428
}
422429

430+
// If the nodegroup is considered to be unhealthy, then prevent any scaling
431+
// for the time being and instead try removing tainted nodes to get the
432+
// nodegroup into a healthy state again. No healthy nodes should be removed
433+
// and no new cloud provider nodes should be added.
434+
nodeGroupIsHealthy := true
435+
436+
if nodeGroup.Opts.UnhealthyNodeGracePeriodDuration() > 0 {
437+
if !c.isNodegroupHealthy(nodeGroup, allNodes) {
438+
nodeGroupIsHealthy = false
439+
nodesDelta = 0
440+
log.WithField("nodegroup", nodegroup).Infof("NodegroupUnhealthy: nodesDelta overridden to 0 from %d because the nodegroup is unhealthy", nodesDelta)
441+
}
442+
}
443+
444+
c.reportNodeGroupHealthMetric(nodegroup, nodeGroupIsHealthy)
445+
423446
// Perform a scale up, do nothing or scale down based on the nodes delta
424447
var nodesDeltaResult int
425448
// actionErr keeps the error of any action below and checked after action
@@ -439,7 +462,7 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
439462
log.WithField("nodegroup", nodegroup).Info("No need to scale")
440463
// reap any expired nodes
441464
var removed int
442-
removed, actionErr = c.TryRemoveTaintedNodes(scaleOptions)
465+
removed, actionErr = c.TryRemoveTaintedNodes(scaleOptions, nodeGroupIsHealthy)
443466
log.WithField("nodegroup", nodegroup).Infof("Reaper: There were %v empty nodes deleted this round", removed)
444467
}
445468

@@ -457,6 +480,131 @@ func (c *Controller) scaleNodeGroup(nodegroup string, nodeGroup *NodeGroupState)
457480
return nodesDelta, err
458481
}
459482

483+
// Finds all unhealthy instances in the nodegroup and adds the taint to mark
484+
// them for deletion.
485+
func (c *Controller) taintUnhealthyInstances(nodes []*v1.Node, state *NodeGroupState) []int {
486+
bundles := make(nodesByOldestCreationTime, 0, len(nodes))
487+
488+
for i, node := range nodes {
489+
// If the node is deemed healthy then there is nothing to do
490+
if !k8s.IsNodeUnhealthy(node, state.Opts.unhealthyNodeGracePeriodDuration) {
491+
continue
492+
}
493+
494+
bundles = append(bundles, nodeIndexBundle{node, i})
495+
}
496+
497+
return c.taintInstances(bundles, state, len(bundles))
498+
}
499+
500+
func (c *Controller) reportNodeGroupHealthMetric(nodegroup string, nodeGroupHealthy bool) {
501+
healthy := 1
502+
503+
if !nodeGroupHealthy {
504+
healthy = 0
505+
}
506+
507+
metrics.NodeGroupUnhealthy.WithLabelValues(nodegroup).Set(float64(healthy))
508+
}
509+
510+
// isNodegroupHealthy checks if the nodegroup is healthy.
511+
// It does this by checking the health of the newest X% of nodes in the nodegroup that are older than the grace period.
512+
// If the percentage of unhealthy nodes in this newest set of nodes is greater than the configured threshold, the nodegroup is considered unhealthy.
513+
func (c *Controller) isNodegroupHealthy(state *NodeGroupState, nodes []*v1.Node) bool {
514+
// Sort the nodes is reverse order based on age
515+
reversedOrderedNodes := c.getNodesOrderedNewestFirst(nodes)
516+
517+
// Filter out any nodes which are not old enough for the test group
518+
oldEnoughNodes := c.filterOutNodesTooNew(state, reversedOrderedNodes)
519+
520+
// Out of the nodes that are left, find the most recent configured
521+
// percentage of nodes to do the test.
522+
nodesForTest := c.getMostRecentNodes(state, oldEnoughNodes)
523+
524+
// If there are no nodes to test, then the nodegroup is considered healthy.
525+
if len(nodesForTest) == 0 {
526+
return true
527+
}
528+
529+
// Get the total number of unhealthy nodes in the test set.
530+
unhealthyNodesCount := c.countUnhealthyNodes(state, nodesForTest)
531+
532+
// If the number of unhealthy nodes in the test group exceeds the percentage
533+
// allowed then the test has failed.
534+
return (unhealthyNodesCount*100)/len(nodesForTest) <= state.Opts.MaxUnhealthyNodesPercent
535+
}
536+
537+
func (c *Controller) getNodesOrderedNewestFirst(nodes []*v1.Node) []*v1.Node {
538+
sortedNodes := make(nodesByOldestCreationTime, 0, len(nodes))
539+
540+
for i, node := range nodes {
541+
sortedNodes = append(sortedNodes, nodeIndexBundle{node, i})
542+
}
543+
544+
// Sort in reverse to get the newest instances at the front to make it
545+
// easier to loop through.
546+
sort.Sort(sort.Reverse(sortedNodes))
547+
548+
reverseOrderedNodes := make([]*v1.Node, 0, len(nodes))
549+
550+
for _, sortedNode := range sortedNodes {
551+
reverseOrderedNodes = append(reverseOrderedNodes, sortedNode.node)
552+
}
553+
554+
return reverseOrderedNodes
555+
}
556+
557+
// Returns the list of nodes which are at least as old at the health check grace
558+
// period duration configured for the nodegroup. These nodes are considered to
559+
// be too new and still have a chance to be not Ready for legitimate reasons so
560+
// they should not be considered.
561+
func (c *Controller) filterOutNodesTooNew(state *NodeGroupState, nodes []*v1.Node) []*v1.Node {
562+
now := time.Now()
563+
newNodes := make([]*v1.Node, 0)
564+
565+
for _, node := range nodes {
566+
// Check if the node is old enough to be included in the new list
567+
if node.CreationTimestamp.Add(state.Opts.unhealthyNodeGracePeriodDuration).Before(now) {
568+
newNodes = append(newNodes, node)
569+
}
570+
}
571+
572+
return newNodes
573+
}
574+
575+
// Returns the most recent X% of instances from the given list of nodes.
576+
func (c *Controller) getMostRecentNodes(state *NodeGroupState, nodes []*v1.Node) []*v1.Node {
577+
// Round up rather than down from HealthCheckNewestNodesPercent so that if
578+
// there is a single instance then a non-100% percentage will still result
579+
// in testing the instance. We want to test more rather than less.
580+
numberOfNodes := int(math.Ceil((float64(state.Opts.HealthCheckNewestNodesPercent) / 100) * float64(len(nodes))))
581+
recentNodes := make([]*v1.Node, 0)
582+
583+
for i, node := range nodes {
584+
if i == numberOfNodes {
585+
break
586+
}
587+
588+
recentNodes = append(recentNodes, node)
589+
}
590+
591+
return recentNodes
592+
}
593+
594+
func (c *Controller) countUnhealthyNodes(state *NodeGroupState, nodes []*v1.Node) int {
595+
unhealthyNodesCount := 0
596+
597+
for _, node := range nodes {
598+
// Include the unhealthyNodeDuration in the call to be 100% sure that we
599+
// are not counting nodes which are too young are unhealthy.
600+
if k8s.IsNodeUnhealthy(node, state.Opts.unhealthyNodeGracePeriodDuration) {
601+
unhealthyNodesCount++
602+
}
603+
}
604+
605+
return unhealthyNodesCount
606+
}
607+
460608
func (c *Controller) isScaleOnStarve(
461609
nodeGroup *NodeGroupState,
462610
podRequests k8s.PodRequestedUsage,

0 commit comments

Comments
 (0)