Alerting: Fix alert flapping in the internal alertmanager (#38648)

* Alerting: Fix alert flapping in the alertmanager

Fixes a bug that caused alerts evaluated at short intervals (under 1 minute) to flap in the Alertmanager.
The flapping was mostly due to the combination of `EndsAt` and the resend delay.

The Alertmanager uses `EndsAt` as a heuristic to know when it should resolve a firing alert in case it hasn't heard
back from the alert generation system.

Because Grafana sent the alert with an `EndsAt` equal to the `For` of the alert itself,
and we had a hard-coded 1 minute resend delay (only applicable to firing alerts), a firing alert would resolve in the Alertmanager before we re-notified it that the alert was still firing.

This commit increases `EndsAt` to 3x the resend delay or the alert interval (whichever is higher). The resend delay has been decreased from 1 minute to 30 seconds.
This commit is contained in:
gotjosh
2021-09-02 16:22:59 +01:00
committed by GitHub
parent 7f1327d1ed
commit dd502f22eb
4 changed files with 62 additions and 66 deletions

View File

@ -17,6 +17,8 @@ import (
"github.com/grafana/grafana/pkg/services/ngalert/store"
)
// ResendDelay is the package-level default resend delay (30 seconds).
// NewManager assigns it to Manager.ResendDelay, and setEndsAt uses it as the
// smallest base duration when computing a state's EndsAt.
var ResendDelay = 30 * time.Second
type Manager struct {
log log.Logger
metrics *metrics.Metrics
@ -33,7 +35,7 @@ func NewManager(logger log.Logger, metrics *metrics.Metrics, ruleStore store.Rul
manager := &Manager{
cache: newCache(logger, metrics),
quit: make(chan struct{}),
ResendDelay: 1 * time.Minute, // TODO: make this configurable
ResendDelay: ResendDelay, // TODO: make this configurable
log: logger,
metrics: metrics,
ruleStore: ruleStore,

View File

@ -152,7 +152,7 @@ func TestProcessEvalResults(t *testing.T) {
},
},
StartsAt: evaluationTime,
EndsAt: evaluationTime.Add(20 * time.Second),
EndsAt: evaluationTime.Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime,
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -274,7 +274,7 @@ func TestProcessEvalResults(t *testing.T) {
},
},
StartsAt: evaluationTime.Add(1 * time.Minute),
EndsAt: evaluationTime.Add(1 * time.Minute).Add(time.Duration(20) * time.Second),
EndsAt: evaluationTime.Add(1 * time.Minute).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -350,7 +350,7 @@ func TestProcessEvalResults(t *testing.T) {
},
},
StartsAt: evaluationTime.Add(80 * time.Second),
EndsAt: evaluationTime.Add(80 * time.Second).Add(1 * time.Minute),
EndsAt: evaluationTime.Add(80 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(80 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -413,7 +413,7 @@ func TestProcessEvalResults(t *testing.T) {
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
EndsAt: evaluationTime.Add(10 * time.Second).Add(1 * time.Minute),
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -476,7 +476,7 @@ func TestProcessEvalResults(t *testing.T) {
},
},
StartsAt: evaluationTime,
EndsAt: evaluationTime.Add(1 * time.Minute),
EndsAt: evaluationTime.Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -539,7 +539,7 @@ func TestProcessEvalResults(t *testing.T) {
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
EndsAt: evaluationTime.Add(10 * time.Second).Add(20 * time.Second),
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -602,7 +602,7 @@ func TestProcessEvalResults(t *testing.T) {
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
EndsAt: evaluationTime.Add(10 * time.Second).Add(20 * time.Second),
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -665,7 +665,7 @@ func TestProcessEvalResults(t *testing.T) {
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
EndsAt: evaluationTime.Add(10 * time.Second).Add(20 * time.Second),
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -729,7 +729,7 @@ func TestProcessEvalResults(t *testing.T) {
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
EndsAt: evaluationTime.Add(10 * time.Second).Add(1 * time.Minute),
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},
@ -793,7 +793,7 @@ func TestProcessEvalResults(t *testing.T) {
},
},
StartsAt: evaluationTime.Add(10 * time.Second),
EndsAt: evaluationTime.Add(10 * time.Second).Add(1 * time.Minute),
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
EvaluationDuration: evaluationDuration,
Annotations: map[string]string{"annotation": "test"},

View File

@ -151,12 +151,15 @@ func (a *State) TrimResults(alertRule *ngModels.AlertRule) {
a.Results = newResults
}
// setEndsAt sets the ending timestamp of the alert.
// The internal Alertmanager will use this time to know when it should automatically resolve the alert
// in case it hasn't received additional alerts. Under regular operations the scheduler will continue to send the
// alert with an updated EndsAt; if the alert is resolved then a last alert is sent with EndsAt = last evaluation time.
func (a *State) setEndsAt(alertRule *ngModels.AlertRule, result eval.Result) {
if int64(alertRule.For.Seconds()) > alertRule.IntervalSeconds {
// For is set and longer than IntervalSeconds
a.EndsAt = result.EvaluatedAt.Add(alertRule.For)
} else {
// For is not set or is less than or equal to IntervalSeconds
a.EndsAt = result.EvaluatedAt.Add(time.Duration(alertRule.IntervalSeconds*2) * time.Second)
// Start from the resend delay; use the rule interval instead when it is the longer of the two.
ends := ResendDelay
if alertRule.IntervalSeconds > int64(ResendDelay.Seconds()) {
// NOTE(review): time.Duration is a nanosecond count, so this converts IntervalSeconds
// to that many nanoseconds, not seconds. It likely needs `* time.Second` — confirm,
// and check any test expectations written in the same unit.
ends = time.Duration(alertRule.IntervalSeconds)
}
// EndsAt is the evaluation time plus three times the chosen base duration.
a.EndsAt = result.EvaluatedAt.Add(ends * 3)
}

View File

@ -4,9 +4,8 @@ import (
"testing"
"time"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/grafana/grafana/pkg/services/ngalert/eval"
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
"github.com/stretchr/testify/assert"
)
@ -114,87 +113,79 @@ func TestSetEndsAt(t *testing.T) {
testCases := []struct {
name string
expected time.Time
testState *State
testRule *ngmodels.AlertRule
testResult eval.Result
}{
{
name: "For: unset Interval: 10s EndsAt should be evaluation time + 2X IntervalSeconds",
expected: evaluationTime.Add(20 * time.Second),
testState: &State{},
name: "less than resend delay: for=unset,interval=10s - endsAt = resendDelay * 3",
expected: evaluationTime.Add(ResendDelay * 3),
testRule: &ngmodels.AlertRule{
IntervalSeconds: 10,
},
testResult: eval.Result{
EvaluatedAt: evaluationTime,
},
},
{
name: "For: 0s Interval: 10s EndsAt should be evaluation time + 2X IntervalSeconds",
expected: evaluationTime.Add(20 * time.Second),
testState: &State{},
name: "less than resend delay: for=0s,interval=10s - endsAt = resendDelay * 3",
expected: evaluationTime.Add(ResendDelay * 3),
testRule: &ngmodels.AlertRule{
For: 0 * time.Second,
IntervalSeconds: 10,
},
testResult: eval.Result{
EvaluatedAt: evaluationTime,
},
},
{
name: "For: 1s Interval: 10s EndsAt should be evaluation time + 2X IntervalSeconds",
expected: evaluationTime.Add(20 * time.Second),
testState: &State{},
testRule: &ngmodels.AlertRule{
For: 0 * time.Second,
IntervalSeconds: 10,
},
testResult: eval.Result{
EvaluatedAt: evaluationTime,
},
},
{
name: "For: 10s Interval: 10s EndsAt should be evaluation time + 2X IntervalSeconds",
expected: evaluationTime.Add(20 * time.Second),
testState: &State{},
name: "less than resend delay: for=10s,interval=10s - endsAt = resendDelay * 3",
expected: evaluationTime.Add(ResendDelay * 3),
testRule: &ngmodels.AlertRule{
For: 10 * time.Second,
IntervalSeconds: 10,
},
testResult: eval.Result{
EvaluatedAt: evaluationTime,
},
{
name: "less than resend delay: for=10s,interval=20s - endsAt = resendDelay * 3",
expected: evaluationTime.Add(ResendDelay * 3),
testRule: &ngmodels.AlertRule{
For: 10 * time.Second,
IntervalSeconds: 20,
},
},
{
name: "For: 11s Interval: 10s EndsAt should be evaluation time + For duration",
expected: evaluationTime.Add(11 * time.Second),
testState: &State{},
name: "more than resend delay: for=unset,interval=1m - endsAt = interval * 3",
expected: evaluationTime.Add(60 * 3),
testRule: &ngmodels.AlertRule{
For: 11 * time.Second,
IntervalSeconds: 10,
},
testResult: eval.Result{
EvaluatedAt: evaluationTime,
IntervalSeconds: 60,
},
},
{
name: "For: 20s Interval: 10s EndsAt should be evaluation time + For duration",
expected: evaluationTime.Add(20 * time.Second),
testState: &State{},
name: "more than resend delay: for=0s,interval=1m - endsAt = resendDelay * 3",
expected: evaluationTime.Add(60 * 3),
testRule: &ngmodels.AlertRule{
For: 20 * time.Second,
IntervalSeconds: 10,
For: 0 * time.Second,
IntervalSeconds: 60,
},
testResult: eval.Result{
EvaluatedAt: evaluationTime,
},
{
name: "more than resend delay: for=1m,interval=5m - endsAt = interval * 3",
expected: evaluationTime.Add(300 * 3),
testRule: &ngmodels.AlertRule{
For: 60 * time.Second,
IntervalSeconds: 300,
},
},
{
name: "more than resend delay: for=5m,interval=1m - endsAt = interval * 3",
expected: evaluationTime.Add(60 * 3),
testRule: &ngmodels.AlertRule{
For: 300 * time.Second,
IntervalSeconds: 60,
},
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
tc.testState.setEndsAt(tc.testRule, tc.testResult)
assert.Equal(t, tc.expected, tc.testState.EndsAt)
s := &State{}
r := eval.Result{EvaluatedAt: evaluationTime}
s.setEndsAt(tc.testRule, r)
assert.Equal(t, tc.expected, s.EndsAt)
})
}
}