File tree Expand file tree Collapse file tree 1 file changed +0
-30
lines changed
Expand file tree Collapse file tree 1 file changed +0
-30
lines changed Original file line number Diff line number Diff line change @@ -38,32 +38,6 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
3838 },
3939 };
4040
41- local makeServerStartupFailureAlert = function (
42- summary,
43- severity,
44- labels={},
45- ) {
46- // Structure is documented in https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
47- alert: 'One server failed to start' ,
48- expr: |||
49- # We trigger any time there is a server startup failure, for any reason.
50- # The 'max' is to reduce the labels being passed to only the necessary ones
51- max by (namespace) (
52- (jupyterhub_server_spawn_duration_seconds_count{status="failure"} > 0)
53- -
54- ((jupyterhub_server_spawn_duration_seconds_count{status="failure"} offset 2m) > 0)
55- ) > 0
56- ||| ,
57- 'for' : '1m' ,
58- labels: {
59- cluster: cluster_name,
60- severity: severity,
61- } + labels,
62- annotations: {
63- summary: summary,
64- },
65- };
66-
6741 local makeTwoServersStartupFailureAlert = function (
6842 summary,
6943 severity,
@@ -241,10 +215,6 @@ function(VARS_2I2C_AWS_ACCOUNT_ID=null)
241215 {
242216 name: 'Server Startup Failure' ,
243217 rules: [
244- makeServerStartupFailureAlert(
245- 'A server failed to start: cluster %s hub:{{ $labels.namespace }}' % [cluster_name],
246- 'same day action needed'
247- ),
248218 makeTwoServersStartupFailureAlert(
249219 'At least two servers have failed to start in the last hour: cluster %s hub:{{ $labels.namespace }}' % [cluster_name],
250220 'immediate action needed'
You can’t perform that action at this time.
0 commit comments