Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 32 additions & 3 deletions helm-charts/support/values.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
labels={},
) {
// Structure is documented in https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
alert: 'Server Startup Failed',
alert: 'One server failed to start',
expr: |||
# We trigger any time there is a server startup failure, for any reason.
# The 'max' is to reduce the labels being passed to only the necessary ones
Expand All @@ -64,6 +64,31 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
},
};

// Builds a Prometheus alerting rule object that fires when server startup
// failures are seen repeatedly within a 30 minute window.
//
// Parameters:
//   summary:  text placed in the rule's `annotations.summary`.
//   severity: value placed in the rule's `severity` label.
//   labels:   optional extra labels merged on top of the default ones
//             (defaults to an empty object).
//
// `cluster_name` is not defined in this block; it is read from the enclosing scope.
local makeTwoServersStartupFailureAlert = function(
summary,
severity,
labels={},
) {
alert: 'Two servers failed to start in the last 30m',
// Per namespace, take the max of the spawn-duration counter restricted to
// status="failure", sample it every 1m over the last 30m (subquery), and
// fire when that sampled value changed at least twice.
// NOTE(review): `changes` counts value changes between 1m samples, so several
// failures landing inside one sampling step would register as a single
// change - confirm this is acceptable for "two servers".
expr: |||
changes(
(
max by (namespace) (
jupyterhub_server_spawn_duration_seconds_count{status="failure"}
)
)[30m:1m]
) >= 2
|||,
// `for` is a Jsonnet keyword, hence the quoted field name. '0m' means no
// pending period: the alert fires as soon as the expression is true.
'for': '0m',
// Caller-supplied labels are added last, so they win on key conflicts.
labels: {
cluster: cluster_name,
severity: severity,
} + labels,
annotations: {
summary: summary,
},
};

local diskIOApproachingSaturation = function(
name,
severity,
Expand Down Expand Up @@ -173,7 +198,7 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
receiver: 'server-startup-pager',
matchers: [
'cluster =~ .*',
'alertname = "Server Startup Failed"',
'alertname =~ ".*failed to start.*"',
],
},
],
Expand Down Expand Up @@ -217,9 +242,13 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
name: 'Server Startup Failure',
rules: [
makeServerStartupFailureAlert(
'Outage alert: Server Startup failed: cluster %s hub:{{ $labels.namespace }}' % [cluster_name],
'A server failed to start: cluster %s hub:{{ $labels.namespace }}' % [cluster_name],
'same day action needed'
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's reduce this to 'same week'?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense, ty! I'll tackle this as part of #7022 as I feel like the whole priorities structure needs a bit of a cleanup.

Right now this is a P2, and we should document the expectation for this alert: if I can start a server, mark it as resolved.

),
// The summary window must match the 30m lookback used by this alert's name
// and expression, otherwise responders are told the wrong time span.
makeTwoServersStartupFailureAlert(
  'At least two servers have failed to start in the last 30 minutes: cluster %s hub:{{ $labels.namespace }}' % [cluster_name],
  'immediate action needed'
),
],
},
{
Expand Down
Loading