From c440bd2bdaec8fed13f47fdfe76dda63e7d0008a Mon Sep 17 00:00:00 2001 From: Steve Simpson Date: Thu, 5 Dec 2024 21:48:24 +0100 Subject: [PATCH] Alerting: Change default for max_attempts to 3. (#97461) Currently the default is 1, which means that by default users will see transient query errors reflected as alert evaluation failures, when often an immediate retry is sufficient to evaluate the rule successfully. Enabling retries by default leads to a better experience out of the box. --- conf/defaults.ini | 4 ++-- conf/sample.ini | 4 ++-- pkg/setting/setting_unified_alerting.go | 2 +- pkg/setting/setting_unified_alerting_test.go | 8 ++++---- 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/conf/defaults.ini b/conf/defaults.ini index 091140c7cc1..212c57353b2 100644 --- a/conf/defaults.ini +++ b/conf/defaults.ini @@ -1338,8 +1338,8 @@ execute_alerts = true # The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. evaluation_timeout = 30s -# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1. -max_attempts = 1 +# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 3. +max_attempts = 3 # Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. diff --git a/conf/sample.ini b/conf/sample.ini index 687f4052053..99dfb68fdb8 100644 --- a/conf/sample.ini +++ b/conf/sample.ini @@ -1322,8 +1322,8 @@ # The timeout string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. 
;evaluation_timeout = 30s -# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 1. -;max_attempts = 1 +# Number of times we'll attempt to evaluate an alert rule before giving up on that evaluation. The default value is 3. +;max_attempts = 3 # Minimum interval to enforce between rule evaluations. Rules will be adjusted if they are less than this value or if they are not multiple of the scheduler interval (10s). Higher values can help with resource management as we'll schedule fewer evaluations over time. # The interval string is a possibly signed sequence of decimal numbers, followed by a unit suffix (ms, s, m, h, d), e.g. 30s or 1m. diff --git a/pkg/setting/setting_unified_alerting.go b/pkg/setting/setting_unified_alerting.go index a39636533da..1d602622c65 100644 --- a/pkg/setting/setting_unified_alerting.go +++ b/pkg/setting/setting_unified_alerting.go @@ -49,7 +49,7 @@ const ( evaluatorDefaultEvaluationTimeout = 30 * time.Second schedulerDefaultAdminConfigPollInterval = time.Minute schedulerDefaultExecuteAlerts = true - schedulerDefaultMaxAttempts = 1 + schedulerDefaultMaxAttempts = 3 schedulerDefaultLegacyMinInterval = 1 screenshotsDefaultCapture = false screenshotsDefaultCaptureTimeout = 10 * time.Second diff --git a/pkg/setting/setting_unified_alerting_test.go b/pkg/setting/setting_unified_alerting_test.go index 0b453b46e12..bd972e7ef54 100644 --- a/pkg/setting/setting_unified_alerting_test.go +++ b/pkg/setting/setting_unified_alerting_test.go @@ -120,14 +120,14 @@ func TestUnifiedAlertingSettings(t *testing.T) { "evaluation_timeout": evaluatorDefaultEvaluationTimeout.String(), }, alertingOptions: map[string]string{ - "max_attempts": "1", + "max_attempts": "1", // Note: Ignored, setting does not exist. 
"min_interval_seconds": "120", "execute_alerts": "true", "evaluation_timeout_seconds": "160", }, verifyCfg: func(t *testing.T, cfg Cfg) { require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.AdminConfigPollInterval) - require.Equal(t, int64(1), cfg.UnifiedAlerting.MaxAttempts) + require.Equal(t, int64(3), cfg.UnifiedAlerting.MaxAttempts) require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval) require.Equal(t, true, cfg.UnifiedAlerting.ExecuteAlerts) require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout) @@ -168,14 +168,14 @@ func TestUnifiedAlertingSettings(t *testing.T) { "evaluation_timeout": "invalid", }, alertingOptions: map[string]string{ - "max_attempts": "1", + "max_attempts": "1", // Note: Ignored, setting does not exist. "min_interval_seconds": "120", "execute_alerts": "false", "evaluation_timeout_seconds": "160", }, verifyCfg: func(t *testing.T, cfg Cfg) { require.Equal(t, alertmanagerDefaultConfigPollInterval, cfg.UnifiedAlerting.AdminConfigPollInterval) - require.Equal(t, int64(1), cfg.UnifiedAlerting.MaxAttempts) + require.Equal(t, int64(3), cfg.UnifiedAlerting.MaxAttempts) require.Equal(t, 120*time.Second, cfg.UnifiedAlerting.MinInterval) require.Equal(t, false, cfg.UnifiedAlerting.ExecuteAlerts) require.Equal(t, 160*time.Second, cfg.UnifiedAlerting.EvaluationTimeout)