mirror of
https://github.com/grafana/grafana.git
synced 2025-09-28 01:54:01 +08:00
Alerting: Fix alert flapping in the internal alertmanager (#38648)
* Alerting: Fix alert flapping in the alertmanager. This fixes a bug that caused alerts evaluated at low intervals (sub 1 minute) to flap in the Alertmanager, mostly due to a combination of `EndsAt` and the resend delay. The Alertmanager uses `EndsAt` as a heuristic to know when it should resolve a firing alert, in case it hasn't heard back from the alert generation system. Because Grafana sent the alert with an `EndsAt` equal to the `For` of the alert itself, and we had a hard-coded 1 minute resend delay (only applicable to firing alerts), a firing alert would resolve in the Alertmanager before we re-notified it that the alert was still firing. This commit increases `EndsAt` to the evaluation time plus 3x the resend delay or the alert interval (whichever is higher). The resend delay has been decreased to 30 seconds.
This commit is contained in:
@ -17,6 +17,8 @@ import (
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/store"
|
||||
)
|
||||
|
||||
// ResendDelay is the default value assigned to Manager.ResendDelay by NewManager.
// setEndsAt also uses it as the lower bound of the base duration that is
// multiplied by three to compute an alert's EndsAt.
var ResendDelay = 30 * time.Second
|
||||
|
||||
type Manager struct {
|
||||
log log.Logger
|
||||
metrics *metrics.Metrics
|
||||
@ -33,7 +35,7 @@ func NewManager(logger log.Logger, metrics *metrics.Metrics, ruleStore store.Rul
|
||||
manager := &Manager{
|
||||
cache: newCache(logger, metrics),
|
||||
quit: make(chan struct{}),
|
||||
ResendDelay: 1 * time.Minute, // TODO: make this configurable
|
||||
ResendDelay: ResendDelay, // TODO: make this configurable
|
||||
log: logger,
|
||||
metrics: metrics,
|
||||
ruleStore: ruleStore,
|
||||
|
@ -152,7 +152,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime,
|
||||
EndsAt: evaluationTime.Add(20 * time.Second),
|
||||
EndsAt: evaluationTime.Add(state.ResendDelay * 3),
|
||||
LastEvaluationTime: evaluationTime,
|
||||
EvaluationDuration: evaluationDuration,
|
||||
Annotations: map[string]string{"annotation": "test"},
|
||||
@ -274,7 +274,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(1 * time.Minute),
|
||||
EndsAt: evaluationTime.Add(1 * time.Minute).Add(time.Duration(20) * time.Second),
|
||||
EndsAt: evaluationTime.Add(1 * time.Minute).Add(state.ResendDelay * 3),
|
||||
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
|
||||
EvaluationDuration: evaluationDuration,
|
||||
Annotations: map[string]string{"annotation": "test"},
|
||||
@ -350,7 +350,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(80 * time.Second),
|
||||
EndsAt: evaluationTime.Add(80 * time.Second).Add(1 * time.Minute),
|
||||
EndsAt: evaluationTime.Add(80 * time.Second).Add(state.ResendDelay * 3),
|
||||
LastEvaluationTime: evaluationTime.Add(80 * time.Second),
|
||||
EvaluationDuration: evaluationDuration,
|
||||
Annotations: map[string]string{"annotation": "test"},
|
||||
@ -413,7 +413,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(1 * time.Minute),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationDuration: evaluationDuration,
|
||||
Annotations: map[string]string{"annotation": "test"},
|
||||
@ -476,7 +476,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime,
|
||||
EndsAt: evaluationTime.Add(1 * time.Minute),
|
||||
EndsAt: evaluationTime.Add(state.ResendDelay * 3),
|
||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationDuration: evaluationDuration,
|
||||
Annotations: map[string]string{"annotation": "test"},
|
||||
@ -539,7 +539,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(20 * time.Second),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationDuration: evaluationDuration,
|
||||
Annotations: map[string]string{"annotation": "test"},
|
||||
@ -602,7 +602,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(20 * time.Second),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationDuration: evaluationDuration,
|
||||
Annotations: map[string]string{"annotation": "test"},
|
||||
@ -665,7 +665,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(20 * time.Second),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationDuration: evaluationDuration,
|
||||
Annotations: map[string]string{"annotation": "test"},
|
||||
@ -729,7 +729,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(1 * time.Minute),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationDuration: evaluationDuration,
|
||||
Annotations: map[string]string{"annotation": "test"},
|
||||
@ -793,7 +793,7 @@ func TestProcessEvalResults(t *testing.T) {
|
||||
},
|
||||
},
|
||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(1 * time.Minute),
|
||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||
EvaluationDuration: evaluationDuration,
|
||||
Annotations: map[string]string{"annotation": "test"},
|
||||
|
@ -151,12 +151,15 @@ func (a *State) TrimResults(alertRule *ngModels.AlertRule) {
|
||||
a.Results = newResults
|
||||
}
|
||||
|
||||
// setEndsAt sets the ending timestamp of the alert.
|
||||
// The internal Alertmanager will use this time to know when it should automatically resolve the alert
|
||||
// in case it hasn't received additional alerts. Under regular operations the scheduler will continue to send the
|
||||
// alert with an updated EndsAt, if the alert is resolved then a last alert is sent with EndsAt = last evaluation time.
|
||||
func (a *State) setEndsAt(alertRule *ngModels.AlertRule, result eval.Result) {
|
||||
if int64(alertRule.For.Seconds()) > alertRule.IntervalSeconds {
|
||||
// For is set and longer than IntervalSeconds
|
||||
a.EndsAt = result.EvaluatedAt.Add(alertRule.For)
|
||||
} else {
|
||||
// For is not set or is less than or equal to IntervalSeconds
|
||||
a.EndsAt = result.EvaluatedAt.Add(time.Duration(alertRule.IntervalSeconds*2) * time.Second)
|
||||
ends := ResendDelay
|
||||
if alertRule.IntervalSeconds > int64(ResendDelay.Seconds()) {
|
||||
ends = time.Duration(alertRule.IntervalSeconds)
|
||||
}
|
||||
|
||||
a.EndsAt = result.EvaluatedAt.Add(ends * 3)
|
||||
}
|
||||
|
@ -4,9 +4,8 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
|
||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
)
|
||||
@ -114,87 +113,79 @@ func TestSetEndsAt(t *testing.T) {
|
||||
testCases := []struct {
|
||||
name string
|
||||
expected time.Time
|
||||
testState *State
|
||||
testRule *ngmodels.AlertRule
|
||||
testResult eval.Result
|
||||
}{
|
||||
{
|
||||
name: "For: unset Interval: 10s EndsAt should be evaluation time + 2X IntervalSeconds",
|
||||
expected: evaluationTime.Add(20 * time.Second),
|
||||
testState: &State{},
|
||||
name: "less than resend delay: for=unset,interval=10s - endsAt = resendDelay * 3",
|
||||
expected: evaluationTime.Add(ResendDelay * 3),
|
||||
testRule: &ngmodels.AlertRule{
|
||||
IntervalSeconds: 10,
|
||||
},
|
||||
testResult: eval.Result{
|
||||
EvaluatedAt: evaluationTime,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "For: 0s Interval: 10s EndsAt should be evaluation time + 2X IntervalSeconds",
|
||||
expected: evaluationTime.Add(20 * time.Second),
|
||||
testState: &State{},
|
||||
name: "less than resend delay: for=0s,interval=10s - endsAt = resendDelay * 3",
|
||||
expected: evaluationTime.Add(ResendDelay * 3),
|
||||
testRule: &ngmodels.AlertRule{
|
||||
For: 0 * time.Second,
|
||||
IntervalSeconds: 10,
|
||||
},
|
||||
testResult: eval.Result{
|
||||
EvaluatedAt: evaluationTime,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "For: 1s Interval: 10s EndsAt should be evaluation time + 2X IntervalSeconds",
|
||||
expected: evaluationTime.Add(20 * time.Second),
|
||||
testState: &State{},
|
||||
testRule: &ngmodels.AlertRule{
|
||||
For: 0 * time.Second,
|
||||
IntervalSeconds: 10,
|
||||
},
|
||||
testResult: eval.Result{
|
||||
EvaluatedAt: evaluationTime,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "For: 10s Interval: 10s EndsAt should be evaluation time + 2X IntervalSeconds",
|
||||
expected: evaluationTime.Add(20 * time.Second),
|
||||
testState: &State{},
|
||||
name: "less than resend delay: for=10s,interval=10s - endsAt = resendDelay * 3",
|
||||
expected: evaluationTime.Add(ResendDelay * 3),
|
||||
testRule: &ngmodels.AlertRule{
|
||||
For: 10 * time.Second,
|
||||
IntervalSeconds: 10,
|
||||
},
|
||||
testResult: eval.Result{
|
||||
EvaluatedAt: evaluationTime,
|
||||
},
|
||||
{
|
||||
name: "less than resend delay: for=10s,interval=20s - endsAt = resendDelay * 3",
|
||||
expected: evaluationTime.Add(ResendDelay * 3),
|
||||
testRule: &ngmodels.AlertRule{
|
||||
For: 10 * time.Second,
|
||||
IntervalSeconds: 20,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "For: 11s Interval: 10s EndsAt should be evaluation time + For duration",
|
||||
expected: evaluationTime.Add(11 * time.Second),
|
||||
testState: &State{},
|
||||
name: "more than resend delay: for=unset,interval=1m - endsAt = interval * 3",
|
||||
expected: evaluationTime.Add(60 * 3),
|
||||
testRule: &ngmodels.AlertRule{
|
||||
For: 11 * time.Second,
|
||||
IntervalSeconds: 10,
|
||||
},
|
||||
testResult: eval.Result{
|
||||
EvaluatedAt: evaluationTime,
|
||||
IntervalSeconds: 60,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "For: 20s Interval: 10s EndsAt should be evaluation time + For duration",
|
||||
expected: evaluationTime.Add(20 * time.Second),
|
||||
testState: &State{},
|
||||
name: "more than resend delay: for=0s,interval=1m - endsAt = resendDelay * 3",
|
||||
expected: evaluationTime.Add(60 * 3),
|
||||
testRule: &ngmodels.AlertRule{
|
||||
For: 20 * time.Second,
|
||||
IntervalSeconds: 10,
|
||||
For: 0 * time.Second,
|
||||
IntervalSeconds: 60,
|
||||
},
|
||||
testResult: eval.Result{
|
||||
EvaluatedAt: evaluationTime,
|
||||
},
|
||||
{
|
||||
name: "more than resend delay: for=1m,interval=5m - endsAt = interval * 3",
|
||||
expected: evaluationTime.Add(300 * 3),
|
||||
testRule: &ngmodels.AlertRule{
|
||||
For: 60 * time.Second,
|
||||
IntervalSeconds: 300,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "more than resend delay: for=5m,interval=1m - endsAt = interval * 3",
|
||||
expected: evaluationTime.Add(60 * 3),
|
||||
testRule: &ngmodels.AlertRule{
|
||||
For: 300 * time.Second,
|
||||
IntervalSeconds: 60,
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
tc.testState.setEndsAt(tc.testRule, tc.testResult)
|
||||
assert.Equal(t, tc.expected, tc.testState.EndsAt)
|
||||
s := &State{}
|
||||
r := eval.Result{EvaluatedAt: evaluationTime}
|
||||
s.setEndsAt(tc.testRule, r)
|
||||
assert.Equal(t, tc.expected, s.EndsAt)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
Reference in New Issue
Block a user