From 47fcaabf50efd283c18e2c5b37ea13bf452af484 Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Fri, 14 Nov 2025 16:42:10 +0200 Subject: [PATCH 1/7] Add two restarts alert --- helm-charts/support/values.jsonnet | 40 ++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) diff --git a/helm-charts/support/values.jsonnet b/helm-charts/support/values.jsonnet index db65b53dc0..de9aa23875 100644 --- a/helm-charts/support/values.jsonnet +++ b/helm-charts/support/values.jsonnet @@ -64,6 +64,33 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) }, }; + local makeTwoServersStartupFailureAlert = function( + summary, + severity, + labels={}, + ) { + alert: 'At least two servers have failed to start', + expr: ||| + round( + increase( + ( + max by (namespace) ( + jupyterhub_server_spawn_duration_seconds_count{status="failure"} + ) + )[1h:1m] + ) + ) >= 2 + |||, + 'for': '0m', + labels: { + cluster: cluster_name, + severity: severity, + } + labels, + annotations: { + summary: summary, + }, + }; + local diskIOApproachingSaturation = function( name, severity, @@ -173,7 +200,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) receiver: 'server-startup-pager', matchers: [ 'cluster =~ .*', - 'alertname = "Server Startup Failed"', + 'alertname =~ ".*failed to start.*"', ], }, ], @@ -217,7 +244,16 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) name: 'Server Startup Failure', rules: [ makeServerStartupFailureAlert( - 'Outage alert: Server Startup failed: cluster %s hub:{{ $labels.namespace }}' % [cluster_name], + 'A server failed to start: cluster %s hub:{{ $labels.namespace }}' % [cluster_name], + 'same day action needed' + ), + ], + }, + { + name: 'Two servers failed to start', + rules: [ + makeTwoServersStartupFailureAlert( + 'At least two servers have failed to start in the last hour: cluster %s hub:{{ $labels.namespace }}' % [cluster_name], 'same day action needed' ), ], From 4186ab1789d5425f21bbd48f25eb775b8e1e1d6c Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: 
Mon, 17 Nov 2025 12:03:24 +0200 Subject: [PATCH 2/7] Update naming to match receiver --- helm-charts/support/values.jsonnet | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/helm-charts/support/values.jsonnet b/helm-charts/support/values.jsonnet index de9aa23875..e50998637c 100644 --- a/helm-charts/support/values.jsonnet +++ b/helm-charts/support/values.jsonnet @@ -44,7 +44,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) labels={}, ) { // Structure is documented in https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ - alert: 'Server Startup Failed', + alert: 'One Server Startup Failure', expr: ||| # We trigger any time there is a server startup failure, for any reason. # The 'max' is to reduce the labels being passed to only the necessary ones @@ -69,7 +69,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) severity, labels={}, ) { - alert: 'At least two servers have failed to start', + alert: 'Two Servers Startup Failure', expr: ||| round( increase( @@ -200,7 +200,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) receiver: 'server-startup-pager', matchers: [ 'cluster =~ .*', - 'alertname =~ ".*failed to start.*"', + 'alertname =~ ".*failed to start"', ], }, ], @@ -241,7 +241,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) ], }, { - name: 'Server Startup Failure', + name: 'A server failed to start', rules: [ makeServerStartupFailureAlert( 'A server failed to start: cluster %s hub:{{ $labels.namespace }}' % [cluster_name], From c62a455a09aece9a5905c501d7d6110e638f9acb Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Mon, 17 Nov 2025 15:37:44 +0200 Subject: [PATCH 3/7] Update naming again --- helm-charts/support/values.jsonnet | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/helm-charts/support/values.jsonnet b/helm-charts/support/values.jsonnet index e50998637c..693160f1c0 100644 --- a/helm-charts/support/values.jsonnet +++ b/helm-charts/support/values.jsonnet @@ -44,7 +44,7 @@ 
function(VARS_2I2C_AWS_ACCOUNT_ID=null) labels={}, ) { // Structure is documented in https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/ - alert: 'One Server Startup Failure', + alert: 'One server failed to start', expr: ||| # We trigger any time there is a server startup failure, for any reason. # The 'max' is to reduce the labels being passed to only the necessary ones @@ -69,7 +69,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) severity, labels={}, ) { - alert: 'Two Servers Startup Failure', + alert: 'Two servers failed to start in the last hour', expr: ||| round( increase( @@ -200,7 +200,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) receiver: 'server-startup-pager', matchers: [ 'cluster =~ .*', - 'alertname =~ ".*failed to start"', + 'alertname =~ ".*failed to start.*"', ], }, ], @@ -241,20 +241,15 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) ], }, { - name: 'A server failed to start', + name: 'Server Startup Failure', rules: [ makeServerStartupFailureAlert( 'A server failed to start: cluster %s hub:{{ $labels.namespace }}' % [cluster_name], 'same day action needed' ), - ], - }, - { - name: 'Two servers failed to start', - rules: [ makeTwoServersStartupFailureAlert( 'At least two servers have failed to start in the last hour: cluster %s hub:{{ $labels.namespace }}' % [cluster_name], - 'same day action needed' + 'immediate action needed' ), ], }, From e02246c3f26e533fae8e951ab587c1cd8b1ed030 Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Mon, 17 Nov 2025 18:36:34 +0200 Subject: [PATCH 4/7] Use delta with an abs to account for hub restarts --- helm-charts/support/values.jsonnet | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/helm-charts/support/values.jsonnet b/helm-charts/support/values.jsonnet index 693160f1c0..b428eaada8 100644 --- a/helm-charts/support/values.jsonnet +++ b/helm-charts/support/values.jsonnet @@ -72,12 +72,14 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) alert: 'Two servers failed to start 
in the last hour', expr: ||| round( - increase( - ( - max by (namespace) ( - jupyterhub_server_spawn_duration_seconds_count{status="failure"} - ) - )[1h:1m] + abs( + delta( + ( + max by (namespace) ( + jupyterhub_server_spawn_duration_seconds_count{status="failure"} + ) + )[1h:1m] + ) ) ) >= 2 |||, From 7ee6d2923b228e9cce2445a2d771d78b6c3d339e Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Mon, 17 Nov 2025 18:38:24 +0200 Subject: [PATCH 5/7] Decrease interval to 30min --- helm-charts/support/values.jsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/helm-charts/support/values.jsonnet b/helm-charts/support/values.jsonnet index b428eaada8..615ee16066 100644 --- a/helm-charts/support/values.jsonnet +++ b/helm-charts/support/values.jsonnet @@ -69,7 +69,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) severity, labels={}, ) { - alert: 'Two servers failed to start in the last hour', + alert: 'Two servers failed to start in the last 30m', expr: ||| round( abs( @@ -78,7 +78,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) max by (namespace) ( jupyterhub_server_spawn_duration_seconds_count{status="failure"} ) - )[1h:1m] + )[30m:1m] ) ) ) >= 2 From 47c0dc2b98ae4ec57db27d333045317ab00c2437 Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Tue, 18 Nov 2025 11:44:45 +0200 Subject: [PATCH 6/7] Go back to increase --- helm-charts/support/values.jsonnet | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/helm-charts/support/values.jsonnet b/helm-charts/support/values.jsonnet index 615ee16066..dedd80a4a6 100644 --- a/helm-charts/support/values.jsonnet +++ b/helm-charts/support/values.jsonnet @@ -72,14 +72,12 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) alert: 'Two servers failed to start in the last 30m', expr: ||| round( - abs( - delta( - ( - max by (namespace) ( - jupyterhub_server_spawn_duration_seconds_count{status="failure"} - ) - )[30m:1m] - ) + increase( + ( + max by (namespace) ( + 
jupyterhub_server_spawn_duration_seconds_count{status="failure"} + ) + )[30m:1m] ) ) >= 2 |||, From 0715308ebe0720ac05d6889f5834bd5f661f1522 Mon Sep 17 00:00:00 2001 From: Georgiana Dolocan Date: Tue, 18 Nov 2025 15:04:12 +0200 Subject: [PATCH 7/7] use changes metric --- helm-charts/support/values.jsonnet | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/helm-charts/support/values.jsonnet b/helm-charts/support/values.jsonnet index dedd80a4a6..5afdda988a 100644 --- a/helm-charts/support/values.jsonnet +++ b/helm-charts/support/values.jsonnet @@ -71,14 +71,12 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null) ) { alert: 'Two servers failed to start in the last 30m', expr: ||| - round( - increase( - ( - max by (namespace) ( - jupyterhub_server_spawn_duration_seconds_count{status="failure"} - ) - )[30m:1m] - ) + changes( + ( + max by (namespace) ( + jupyterhub_server_spawn_duration_seconds_count{status="failure"} + ) + )[30m:1m] ) >= 2 |||, 'for': '0m',