@@ -44,7 +44,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
4444 labels={},
4545 ) {
4646 // Structure is documented in https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
47- alert: 'Server Startup Failed ' ,
47+ alert: 'One server failed to start ' ,
4848 expr: |||
4949 # We trigger any time there is a server startup failure, for any reason.
5050 # The 'max' is to reduce the labels being passed to only the necessary ones
@@ -64,6 +64,31 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
6464 },
6565 };
6666
67+ local makeTwoServersStartupFailureAlert = function ( // Builds one Prometheus alerting-rule object that fires when a hub namespace shows repeated server start failures within 30 minutes.
68+ summary, // Text stored verbatim in the rule's `summary` annotation; the window below is fixed at 30m, so the text should describe that same window.
69+ severity, // Value stored in the rule's `severity` label.
70+ labels={}, // Optional extra labels; merged with `+` after the defaults, so keys given here override `cluster` / `severity`.
71+ ) {
72+ alert: 'Two servers failed to start in the last 30m' , // Alert name; NOTE(review): presumably Alertmanager routes match on this string - confirm before renaming.
73+ expr: |||
74+ changes(
75+ (
76+ max by (namespace) (
77+ jupyterhub_server_spawn_duration_seconds_count{status="failure"}
78+ )
79+ )[30m:1m]
80+ ) >= 2
81+ ||| , // Expression above: per namespace, take the max of the status="failure" spawn count series, evaluate it as a subquery over 30m at 1m resolution, and match when changes() reports at least 2 value changes.
82+ 'for' : '0m' , // Zero pending duration: the rule fires on the first evaluation where the expression matches.
83+ labels: {
84+ cluster: cluster_name, // `cluster_name` is not a parameter of this function; it is resolved from the enclosing scope (definition not visible in this hunk).
85+ severity: severity,
86+ } + labels,
87+ annotations: {
88+ summary: summary,
89+ },
90+ };
91+
6792 local diskIOApproachingSaturation = function (
6893 name,
6994 severity,
@@ -173,7 +198,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
173198 receiver: 'server-startup-pager' ,
174199 matchers: [
175200 'cluster =~ .*' ,
176- 'alertname = "Server Startup Failed "' ,
201+ 'alertname =~ ".*failed to start.* "' ,
177202 ],
178203 },
179204 ],
@@ -217,9 +242,13 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
217242 name: 'Server Startup Failure' ,
218243 rules: [
219244 makeServerStartupFailureAlert(
220- 'Outage alert: Server Startup failed : cluster %s hub:{{ $labels.namespace }}' % [cluster_name],
245+ 'A server failed to start : cluster %s hub:{{ $labels.namespace }}' % [cluster_name],
221246 'same day action needed'
222247 ),
248+ makeTwoServersStartupFailureAlert( // Summary wording must match the 30m window hard-coded in the rule's alert name and expression.
249+ 'At least two servers have failed to start in the last 30m: cluster %s hub:{{ $labels.namespace }}' % [cluster_name],
250+ 'immediate action needed'
251+ ),
223252 ],
224253 },
225254 {
0 commit comments