mirror of
https://github.com/grafana/grafana.git
synced 2025-09-29 01:54:18 +08:00
Alerting: Fix alert flapping in the internal alertmanager (#38648)
* Alerting: Fix alert flapping in the alertmanager. Fixes a bug that caused alerts evaluated at low intervals (sub 1 minute) to flap in the Alertmanager, mostly due to a combination of `EndsAt` and the resend delay. The Alertmanager uses `EndsAt` as a heuristic to know when it should resolve a firing alert in case it hasn't heard back from the alert generation system. Because Grafana sent the alert with an `EndsAt` equal to the `For` of the alert itself, and we had a hard-coded 1 minute resend delay (only applicable to firing alerts), a firing alert would resolve in the Alertmanager before we re-notified that it was still firing. This commit increases `EndsAt` to 3x the resend delay or the alert interval (whichever is higher). The resend delay (`ResendDelay`) has been decreased to 30 seconds.
This commit is contained in:
@ -17,6 +17,8 @@ import (
|
|||||||
"github.com/grafana/grafana/pkg/services/ngalert/store"
|
"github.com/grafana/grafana/pkg/services/ngalert/store"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
var ResendDelay = 30 * time.Second
|
||||||
|
|
||||||
type Manager struct {
|
type Manager struct {
|
||||||
log log.Logger
|
log log.Logger
|
||||||
metrics *metrics.Metrics
|
metrics *metrics.Metrics
|
||||||
@ -33,7 +35,7 @@ func NewManager(logger log.Logger, metrics *metrics.Metrics, ruleStore store.Rul
|
|||||||
manager := &Manager{
|
manager := &Manager{
|
||||||
cache: newCache(logger, metrics),
|
cache: newCache(logger, metrics),
|
||||||
quit: make(chan struct{}),
|
quit: make(chan struct{}),
|
||||||
ResendDelay: 1 * time.Minute, // TODO: make this configurable
|
ResendDelay: ResendDelay, // TODO: make this configurable
|
||||||
log: logger,
|
log: logger,
|
||||||
metrics: metrics,
|
metrics: metrics,
|
||||||
ruleStore: ruleStore,
|
ruleStore: ruleStore,
|
||||||
|
@ -152,7 +152,7 @@ func TestProcessEvalResults(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
StartsAt: evaluationTime,
|
StartsAt: evaluationTime,
|
||||||
EndsAt: evaluationTime.Add(20 * time.Second),
|
EndsAt: evaluationTime.Add(state.ResendDelay * 3),
|
||||||
LastEvaluationTime: evaluationTime,
|
LastEvaluationTime: evaluationTime,
|
||||||
EvaluationDuration: evaluationDuration,
|
EvaluationDuration: evaluationDuration,
|
||||||
Annotations: map[string]string{"annotation": "test"},
|
Annotations: map[string]string{"annotation": "test"},
|
||||||
@ -274,7 +274,7 @@ func TestProcessEvalResults(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
StartsAt: evaluationTime.Add(1 * time.Minute),
|
StartsAt: evaluationTime.Add(1 * time.Minute),
|
||||||
EndsAt: evaluationTime.Add(1 * time.Minute).Add(time.Duration(20) * time.Second),
|
EndsAt: evaluationTime.Add(1 * time.Minute).Add(state.ResendDelay * 3),
|
||||||
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
|
LastEvaluationTime: evaluationTime.Add(1 * time.Minute),
|
||||||
EvaluationDuration: evaluationDuration,
|
EvaluationDuration: evaluationDuration,
|
||||||
Annotations: map[string]string{"annotation": "test"},
|
Annotations: map[string]string{"annotation": "test"},
|
||||||
@ -350,7 +350,7 @@ func TestProcessEvalResults(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
StartsAt: evaluationTime.Add(80 * time.Second),
|
StartsAt: evaluationTime.Add(80 * time.Second),
|
||||||
EndsAt: evaluationTime.Add(80 * time.Second).Add(1 * time.Minute),
|
EndsAt: evaluationTime.Add(80 * time.Second).Add(state.ResendDelay * 3),
|
||||||
LastEvaluationTime: evaluationTime.Add(80 * time.Second),
|
LastEvaluationTime: evaluationTime.Add(80 * time.Second),
|
||||||
EvaluationDuration: evaluationDuration,
|
EvaluationDuration: evaluationDuration,
|
||||||
Annotations: map[string]string{"annotation": "test"},
|
Annotations: map[string]string{"annotation": "test"},
|
||||||
@ -413,7 +413,7 @@ func TestProcessEvalResults(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(1 * time.Minute),
|
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||||
EvaluationDuration: evaluationDuration,
|
EvaluationDuration: evaluationDuration,
|
||||||
Annotations: map[string]string{"annotation": "test"},
|
Annotations: map[string]string{"annotation": "test"},
|
||||||
@ -476,7 +476,7 @@ func TestProcessEvalResults(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
StartsAt: evaluationTime,
|
StartsAt: evaluationTime,
|
||||||
EndsAt: evaluationTime.Add(1 * time.Minute),
|
EndsAt: evaluationTime.Add(state.ResendDelay * 3),
|
||||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||||
EvaluationDuration: evaluationDuration,
|
EvaluationDuration: evaluationDuration,
|
||||||
Annotations: map[string]string{"annotation": "test"},
|
Annotations: map[string]string{"annotation": "test"},
|
||||||
@ -539,7 +539,7 @@ func TestProcessEvalResults(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(20 * time.Second),
|
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||||
EvaluationDuration: evaluationDuration,
|
EvaluationDuration: evaluationDuration,
|
||||||
Annotations: map[string]string{"annotation": "test"},
|
Annotations: map[string]string{"annotation": "test"},
|
||||||
@ -602,7 +602,7 @@ func TestProcessEvalResults(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(20 * time.Second),
|
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||||
EvaluationDuration: evaluationDuration,
|
EvaluationDuration: evaluationDuration,
|
||||||
Annotations: map[string]string{"annotation": "test"},
|
Annotations: map[string]string{"annotation": "test"},
|
||||||
@ -665,7 +665,7 @@ func TestProcessEvalResults(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(20 * time.Second),
|
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||||
EvaluationDuration: evaluationDuration,
|
EvaluationDuration: evaluationDuration,
|
||||||
Annotations: map[string]string{"annotation": "test"},
|
Annotations: map[string]string{"annotation": "test"},
|
||||||
@ -729,7 +729,7 @@ func TestProcessEvalResults(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(1 * time.Minute),
|
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||||
EvaluationDuration: evaluationDuration,
|
EvaluationDuration: evaluationDuration,
|
||||||
Annotations: map[string]string{"annotation": "test"},
|
Annotations: map[string]string{"annotation": "test"},
|
||||||
@ -793,7 +793,7 @@ func TestProcessEvalResults(t *testing.T) {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
StartsAt: evaluationTime.Add(10 * time.Second),
|
StartsAt: evaluationTime.Add(10 * time.Second),
|
||||||
EndsAt: evaluationTime.Add(10 * time.Second).Add(1 * time.Minute),
|
EndsAt: evaluationTime.Add(10 * time.Second).Add(state.ResendDelay * 3),
|
||||||
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
LastEvaluationTime: evaluationTime.Add(10 * time.Second),
|
||||||
EvaluationDuration: evaluationDuration,
|
EvaluationDuration: evaluationDuration,
|
||||||
Annotations: map[string]string{"annotation": "test"},
|
Annotations: map[string]string{"annotation": "test"},
|
||||||
|
@ -151,12 +151,15 @@ func (a *State) TrimResults(alertRule *ngModels.AlertRule) {
|
|||||||
a.Results = newResults
|
a.Results = newResults
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// setEndsAt sets the ending timestamp of the alert.
|
||||||
|
// The internal Alertmanager will use this time to know when it should automatically resolve the alert
|
||||||
|
// in case it hasn't received additional alerts. Under regular operations the scheduler will continue to send the
|
||||||
|
// alert with an updated EndsAt, if the alert is resolved then a last alert is sent with EndsAt = last evaluation time.
|
||||||
func (a *State) setEndsAt(alertRule *ngModels.AlertRule, result eval.Result) {
|
func (a *State) setEndsAt(alertRule *ngModels.AlertRule, result eval.Result) {
|
||||||
if int64(alertRule.For.Seconds()) > alertRule.IntervalSeconds {
|
ends := ResendDelay
|
||||||
// For is set and longer than IntervalSeconds
|
if alertRule.IntervalSeconds > int64(ResendDelay.Seconds()) {
|
||||||
a.EndsAt = result.EvaluatedAt.Add(alertRule.For)
|
ends = time.Duration(alertRule.IntervalSeconds)
|
||||||
} else {
|
|
||||||
// For is not set or is less than or equal to IntervalSeconds
|
|
||||||
a.EndsAt = result.EvaluatedAt.Add(time.Duration(alertRule.IntervalSeconds*2) * time.Second)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
a.EndsAt = result.EvaluatedAt.Add(ends * 3)
|
||||||
}
|
}
|
||||||
|
@ -4,9 +4,8 @@ import (
|
|||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
|
|
||||||
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
|
||||||
|
|
||||||
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
"github.com/grafana/grafana/pkg/services/ngalert/eval"
|
||||||
|
ngmodels "github.com/grafana/grafana/pkg/services/ngalert/models"
|
||||||
|
|
||||||
"github.com/stretchr/testify/assert"
|
"github.com/stretchr/testify/assert"
|
||||||
)
|
)
|
||||||
@ -114,87 +113,79 @@ func TestSetEndsAt(t *testing.T) {
|
|||||||
testCases := []struct {
|
testCases := []struct {
|
||||||
name string
|
name string
|
||||||
expected time.Time
|
expected time.Time
|
||||||
testState *State
|
|
||||||
testRule *ngmodels.AlertRule
|
testRule *ngmodels.AlertRule
|
||||||
testResult eval.Result
|
testResult eval.Result
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
name: "For: unset Interval: 10s EndsAt should be evaluation time + 2X IntervalSeconds",
|
name: "less than resend delay: for=unset,interval=10s - endsAt = resendDelay * 3",
|
||||||
expected: evaluationTime.Add(20 * time.Second),
|
expected: evaluationTime.Add(ResendDelay * 3),
|
||||||
testState: &State{},
|
|
||||||
testRule: &ngmodels.AlertRule{
|
testRule: &ngmodels.AlertRule{
|
||||||
IntervalSeconds: 10,
|
IntervalSeconds: 10,
|
||||||
},
|
},
|
||||||
testResult: eval.Result{
|
|
||||||
EvaluatedAt: evaluationTime,
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "For: 0s Interval: 10s EndsAt should be evaluation time + 2X IntervalSeconds",
|
name: "less than resend delay: for=0s,interval=10s - endsAt = resendDelay * 3",
|
||||||
expected: evaluationTime.Add(20 * time.Second),
|
expected: evaluationTime.Add(ResendDelay * 3),
|
||||||
testState: &State{},
|
|
||||||
testRule: &ngmodels.AlertRule{
|
testRule: &ngmodels.AlertRule{
|
||||||
For: 0 * time.Second,
|
For: 0 * time.Second,
|
||||||
IntervalSeconds: 10,
|
IntervalSeconds: 10,
|
||||||
},
|
},
|
||||||
testResult: eval.Result{
|
|
||||||
EvaluatedAt: evaluationTime,
|
|
||||||
},
|
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "For: 1s Interval: 10s EndsAt should be evaluation time + 2X IntervalSeconds",
|
name: "less than resend delay: for=10s,interval=10s - endsAt = resendDelay * 3",
|
||||||
expected: evaluationTime.Add(20 * time.Second),
|
expected: evaluationTime.Add(ResendDelay * 3),
|
||||||
testState: &State{},
|
|
||||||
testRule: &ngmodels.AlertRule{
|
|
||||||
For: 0 * time.Second,
|
|
||||||
IntervalSeconds: 10,
|
|
||||||
},
|
|
||||||
testResult: eval.Result{
|
|
||||||
EvaluatedAt: evaluationTime,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
{
|
|
||||||
name: "For: 10s Interval: 10s EndsAt should be evaluation time + 2X IntervalSeconds",
|
|
||||||
expected: evaluationTime.Add(20 * time.Second),
|
|
||||||
testState: &State{},
|
|
||||||
testRule: &ngmodels.AlertRule{
|
testRule: &ngmodels.AlertRule{
|
||||||
For: 10 * time.Second,
|
For: 10 * time.Second,
|
||||||
IntervalSeconds: 10,
|
IntervalSeconds: 10,
|
||||||
},
|
},
|
||||||
testResult: eval.Result{
|
},
|
||||||
EvaluatedAt: evaluationTime,
|
{
|
||||||
|
name: "less than resend delay: for=10s,interval=20s - endsAt = resendDelay * 3",
|
||||||
|
expected: evaluationTime.Add(ResendDelay * 3),
|
||||||
|
testRule: &ngmodels.AlertRule{
|
||||||
|
For: 10 * time.Second,
|
||||||
|
IntervalSeconds: 20,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "For: 11s Interval: 10s EndsAt should be evaluation time + For duration",
|
name: "more than resend delay: for=unset,interval=1m - endsAt = interval * 3",
|
||||||
expected: evaluationTime.Add(11 * time.Second),
|
expected: evaluationTime.Add(60 * 3),
|
||||||
testState: &State{},
|
|
||||||
testRule: &ngmodels.AlertRule{
|
testRule: &ngmodels.AlertRule{
|
||||||
For: 11 * time.Second,
|
IntervalSeconds: 60,
|
||||||
IntervalSeconds: 10,
|
|
||||||
},
|
|
||||||
testResult: eval.Result{
|
|
||||||
EvaluatedAt: evaluationTime,
|
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
name: "For: 20s Interval: 10s EndsAt should be evaluation time + For duration",
|
name: "more than resend delay: for=0s,interval=1m - endsAt = resendDelay * 3",
|
||||||
expected: evaluationTime.Add(20 * time.Second),
|
expected: evaluationTime.Add(60 * 3),
|
||||||
testState: &State{},
|
|
||||||
testRule: &ngmodels.AlertRule{
|
testRule: &ngmodels.AlertRule{
|
||||||
For: 20 * time.Second,
|
For: 0 * time.Second,
|
||||||
IntervalSeconds: 10,
|
IntervalSeconds: 60,
|
||||||
},
|
},
|
||||||
testResult: eval.Result{
|
},
|
||||||
EvaluatedAt: evaluationTime,
|
{
|
||||||
|
name: "more than resend delay: for=1m,interval=5m - endsAt = interval * 3",
|
||||||
|
expected: evaluationTime.Add(300 * 3),
|
||||||
|
testRule: &ngmodels.AlertRule{
|
||||||
|
For: 60 * time.Second,
|
||||||
|
IntervalSeconds: 300,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "more than resend delay: for=5m,interval=1m - endsAt = interval * 3",
|
||||||
|
expected: evaluationTime.Add(60 * 3),
|
||||||
|
testRule: &ngmodels.AlertRule{
|
||||||
|
For: 300 * time.Second,
|
||||||
|
IntervalSeconds: 60,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, tc := range testCases {
|
for _, tc := range testCases {
|
||||||
t.Run(tc.name, func(t *testing.T) {
|
t.Run(tc.name, func(t *testing.T) {
|
||||||
tc.testState.setEndsAt(tc.testRule, tc.testResult)
|
s := &State{}
|
||||||
assert.Equal(t, tc.expected, tc.testState.EndsAt)
|
r := eval.Result{EvaluatedAt: evaluationTime}
|
||||||
|
s.setEndsAt(tc.testRule, r)
|
||||||
|
assert.Equal(t, tc.expected, s.EndsAt)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Reference in New Issue
Block a user