Skip to content

Commit 64f2ce9

Browse files
Merge pull request #7120 from GeorgianaElena/fix-alert
Add two server failure alert
2 parents 62acf94 + 0715308 commit 64f2ce9

File tree

1 file changed

+32
-3
lines changed

1 file changed

+32
-3
lines changed

helm-charts/support/values.jsonnet

Lines changed: 32 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
4444
labels={},
4545
) {
4646
// Structure is documented in https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
47-
alert: 'Server Startup Failed',
47+
alert: 'One server failed to start',
4848
expr: |||
4949
# We trigger any time there is a server startup failure, for any reason.
5050
# The 'max' is to reduce the labels being passed to only the necessary ones
@@ -64,6 +64,31 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
6464
},
6565
};
6666

67+
// Builds a Prometheus alerting rule that fires when server startup failures
// were recorded at least twice within a 30 minute window.
//
// Parameters:
//   summary:  text placed in the rule's `annotations.summary`.
//   severity: value placed in the rule's `severity` label.
//   labels:   optional extra labels merged into the rule's labels (default {}).
//
// `cluster_name` is not a parameter; it is read from the enclosing scope.
//
// The expression takes the per-namespace max of the failed-spawn counter,
// evaluates it as a subquery over the last 30m at 1m resolution, and uses
// changes() to count how many times that value changed. The rule matches
// when there were 2 or more changes.
local makeTwoServersStartupFailureAlert = function(
68+
summary,
69+
severity,
70+
labels={},
71+
) {
72+
alert: 'Two servers failed to start in the last 30m',
73+
expr: |||
74+
changes(
75+
(
76+
max by (namespace) (
77+
jupyterhub_server_spawn_duration_seconds_count{status="failure"}
78+
)
79+
)[30m:1m]
80+
) >= 2
81+
|||,
82+
'for': '0m',  // no pending period: presumably fires on the first evaluation where the expression matches
83+
labels: {
84+
cluster: cluster_name,
85+
severity: severity,
86+
} + labels,  // caller-supplied labels are merged last, so they win on key conflicts
87+
annotations: {
88+
summary: summary,
89+
},
90+
};
91+
6792
local diskIOApproachingSaturation = function(
6893
name,
6994
severity,
@@ -173,7 +198,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
173198
receiver: 'server-startup-pager',
174199
matchers: [
175200
'cluster =~ .*',
176-
'alertname = "Server Startup Failed"',
201+
'alertname =~ ".*failed to start.*"',
177202
],
178203
},
179204
],
@@ -217,9 +242,13 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
217242
name: 'Server Startup Failure',
218243
rules: [
219244
makeServerStartupFailureAlert(
220-
'Outage alert: Server Startup failed: cluster %s hub:{{ $labels.namespace }}' % [cluster_name],
245+
'A server failed to start: cluster %s hub:{{ $labels.namespace }}' % [cluster_name],
221246
'same day action needed'
222247
),
248+
// The summary states the same 30 minute window that the rule's alert name and `[30m:1m]` expression use.
makeTwoServersStartupFailureAlert(
249+
'At least two servers have failed to start in the last 30 minutes: cluster %s hub:{{ $labels.namespace }}' % [cluster_name],
250+
'immediate action needed'
251+
),
223252
],
224253
},
225254
{

0 commit comments

Comments
 (0)