mirror of
https://github.com/grafana/grafana.git
synced 2025-07-31 09:32:40 +08:00
Alerting: Add scheduled clean-up of deleted rules (#101963)
* add scheduled clean up of deleted rules --------- Signed-off-by: Yuri Tseretyan <yuriy.tseretyan@grafana.com>
This commit is contained in:
@ -1406,6 +1406,14 @@ resolved_alert_retention = 15m
|
||||
# 0 value means no limit
|
||||
rule_version_record_limit = 0
|
||||
|
||||
# The retention period for deleted alerting rules.
|
||||
# Determines how long deleted rules are retained before being permanently removed.
|
||||
# The retention duration must be specified using a time format with unit suffixes
|
||||
# such as ms, s, m, h, d (e.g., 30d for 30 days).
|
||||
# Default: 30d
|
||||
# A value of 0 disables retention: deleted rules are removed permanently right away.
|
||||
deleted_rule_retention = 30d
|
||||
|
||||
[unified_alerting.screenshots]
|
||||
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
|
||||
# plugin, or set up Grafana to use a remote rendering service.
|
||||
|
@ -1389,6 +1389,14 @@
|
||||
# 0 value means no limit
|
||||
;rule_version_record_limit= 0
|
||||
|
||||
# The retention period for deleted alerting rules.
|
||||
# Determines how long deleted rules are retained before being permanently removed.
|
||||
# The retention duration must be specified using a time format with unit suffixes
|
||||
# such as ms, s, m, h, d (e.g., 30d for 30 days).
|
||||
# Default: 30d
|
||||
# A value of 0 disables retention: deleted rules are removed permanently right away.
|
||||
;deleted_rule_retention = 30d
|
||||
|
||||
[unified_alerting.screenshots]
|
||||
# Enable screenshots in notifications. You must have either installed the Grafana image rendering
|
||||
# plugin, or set up Grafana to use a remote rendering service.
|
||||
|
@ -10,6 +10,7 @@ import (
|
||||
"github.com/google/wire"
|
||||
|
||||
sdkhttpclient "github.com/grafana/grafana-plugin-sdk-go/backend/httpclient"
|
||||
|
||||
"github.com/grafana/grafana/pkg/api"
|
||||
"github.com/grafana/grafana/pkg/api/avatar"
|
||||
"github.com/grafana/grafana/pkg/api/routing"
|
||||
@ -421,6 +422,7 @@ var wireSet = wire.NewSet(
|
||||
prefimpl.ProvideService,
|
||||
oauthtoken.ProvideService,
|
||||
wire.Bind(new(oauthtoken.OAuthTokenService), new(*oauthtoken.Service)),
|
||||
wire.Bind(new(cleanup.AlertRuleService), new(*ngstore.DBstore)),
|
||||
)
|
||||
|
||||
var wireCLISet = wire.NewSet(
|
||||
@ -453,6 +455,7 @@ var wireTestSet = wire.NewSet(
|
||||
oauthtoken.ProvideService,
|
||||
oauthtokentest.ProvideService,
|
||||
wire.Bind(new(oauthtoken.OAuthTokenService), new(*oauthtokentest.Service)),
|
||||
wire.Bind(new(cleanup.AlertRuleService), new(*ngstore.DBstore)),
|
||||
)
|
||||
|
||||
func Initialize(cfg *setting.Cfg, opts Options, apiOpts api.ServerOptions) (*Server, error) {
|
||||
|
@ -27,6 +27,10 @@ import (
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
)
|
||||
|
||||
// AlertRuleService is the part of the alert rule store that CleanUpService depends on
// to purge deleted alert rules.
type AlertRuleService interface {
|
||||
// CleanUpDeletedAlertRules removes expired deleted alert rules for good. The cleanup job
// logs the returned number as the count of affected rows, or the error if the call fails.
CleanUpDeletedAlertRules(ctx context.Context) (int64, error)
|
||||
}
|
||||
|
||||
type CleanUpService struct {
|
||||
log log.Logger
|
||||
tracer tracing.Tracer
|
||||
@ -41,12 +45,13 @@ type CleanUpService struct {
|
||||
tempUserService tempuser.Service
|
||||
annotationCleaner annotations.Cleaner
|
||||
dashboardService dashboards.DashboardService
|
||||
alertRuleService AlertRuleService
|
||||
}
|
||||
|
||||
func ProvideService(cfg *setting.Cfg, serverLockService *serverlock.ServerLockService,
|
||||
shortURLService shorturls.Service, sqlstore db.DB, queryHistoryService queryhistory.Service,
|
||||
dashboardVersionService dashver.Service, dashSnapSvc dashboardsnapshots.Service, deleteExpiredImageService *image.DeleteExpiredService,
|
||||
tempUserService tempuser.Service, tracer tracing.Tracer, annotationCleaner annotations.Cleaner, dashboardService dashboards.DashboardService) *CleanUpService {
|
||||
tempUserService tempuser.Service, tracer tracing.Tracer, annotationCleaner annotations.Cleaner, dashboardService dashboards.DashboardService, service AlertRuleService) *CleanUpService {
|
||||
s := &CleanUpService{
|
||||
Cfg: cfg,
|
||||
ServerLockService: serverLockService,
|
||||
@ -61,6 +66,7 @@ func ProvideService(cfg *setting.Cfg, serverLockService *serverlock.ServerLockSe
|
||||
tracer: tracer,
|
||||
annotationCleaner: annotationCleaner,
|
||||
dashboardService: dashboardService,
|
||||
alertRuleService: service,
|
||||
}
|
||||
return s
|
||||
}
|
||||
@ -112,6 +118,10 @@ func (srv *CleanUpService) clean(ctx context.Context) {
|
||||
cleanupJobs = append(cleanupJobs, cleanUpJob{"delete stale short URLs", srv.deleteStaleShortURLs})
|
||||
}
|
||||
|
||||
if srv.Cfg.UnifiedAlerting.DeletedRuleRetention > 0 {
|
||||
cleanupJobs = append(cleanupJobs, cleanUpJob{"cleanup trash alert rules", srv.cleanUpTrashAlertRules})
|
||||
}
|
||||
|
||||
logger := srv.log.FromContext(ctx)
|
||||
logger.Debug("Starting cleanup jobs", "jobs", fmt.Sprintf("%v", cleanupJobs))
|
||||
|
||||
@ -313,3 +323,13 @@ func (srv *CleanUpService) cleanUpTrashDashboards(ctx context.Context) {
|
||||
logger.Debug("Cleaned up deleted dashboards", "dashboards affected", affected)
|
||||
}
|
||||
}
|
||||
|
||||
func (srv *CleanUpService) cleanUpTrashAlertRules(ctx context.Context) {
|
||||
logger := srv.log.FromContext(ctx)
|
||||
affected, err := srv.alertRuleService.CleanUpDeletedAlertRules(ctx)
|
||||
if err != nil {
|
||||
logger.Error("Problem cleaning up deleted alert rules", "error", err)
|
||||
} else {
|
||||
logger.Debug("Cleaned up deleted alert rules", "rows affected", affected)
|
||||
}
|
||||
}
|
||||
|
@ -73,7 +73,7 @@ func (st DBstore) DeleteAlertRulesByUID(ctx context.Context, orgID int64, user *
|
||||
logger.Debug("Deleted alert rule state", "count", rows)
|
||||
|
||||
var versions []alertRuleVersion
|
||||
if st.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertRuleRestore) {
|
||||
if st.FeatureToggles.IsEnabledGlobally(featuremgmt.FlagAlertRuleRestore) && st.Cfg.DeletedRuleRetention > 0 { // save deleted version only if retention is greater than 0
|
||||
versions, err = st.getLatestVersionOfRulesByUID(ctx, orgID, ruleUID)
|
||||
if err != nil {
|
||||
logger.Error("Failed to get latest version of deleted alert rules. The recovery will not be possible", "error", err)
|
||||
@ -1243,6 +1243,24 @@ func (st DBstore) GetNamespacesByRuleUID(ctx context.Context, orgID int64, uids
|
||||
return result, err
|
||||
}
|
||||
|
||||
func (st DBstore) CleanUpDeletedAlertRules(ctx context.Context) (int64, error) {
|
||||
affectedRows := int64(-1)
|
||||
err := st.SQLStore.WithTransactionalDbSession(ctx, func(sess *sqlstore.DBSession) error {
|
||||
expire := TimeNow().Add(-st.Cfg.DeletedRuleRetention)
|
||||
st.Logger.Debug("Permanently remove expired deleted rules", "deletedBefore", expire)
|
||||
result, err := sess.Exec("DELETE FROM alert_rule_version WHERE rule_uid='' AND created <= ?", expire)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
affectedRows, err = result.RowsAffected()
|
||||
if err != nil {
|
||||
st.Logger.Warn("Failed to get rows affected by the delete operation", "error", err)
|
||||
}
|
||||
return nil
|
||||
})
|
||||
return affectedRows, err
|
||||
}
|
||||
|
||||
func getINSubQueryArgs[T any](inputSlice []T) ([]any, []string) {
|
||||
args := make([]any, 0, len(inputSlice))
|
||||
in := make([]string, 0, len(inputSlice))
|
||||
|
@ -784,13 +784,15 @@ func TestIntegration_DeleteAlertRulesByUID(t *testing.T) {
|
||||
require.Empty(t, savedInstances)
|
||||
})
|
||||
|
||||
t.Run("should remove all version and insert one with empty rule_uid", func(t *testing.T) {
|
||||
t.Run("should remove all version and insert one with empty rule_uid when DeletedRuleRetention is set", func(t *testing.T) {
|
||||
orgID := int64(rand.Intn(1000))
|
||||
gen = gen.With(gen.WithOrgID(orgID))
|
||||
// Create a new store to pass the custom bus to check the signal
|
||||
b := &fakeBus{}
|
||||
logger := log.New("test-dbstore")
|
||||
|
||||
cfg.UnifiedAlerting.DeletedRuleRetention = 1000 * time.Hour
|
||||
|
||||
store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, b)
|
||||
store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore)
|
||||
|
||||
@ -848,6 +850,59 @@ func TestIntegration_DeleteAlertRulesByUID(t *testing.T) {
|
||||
return nil
|
||||
})
|
||||
})
|
||||
|
||||
t.Run("should remove all versions and not keep history if DeletedRuleRetention = 0", func(t *testing.T) {
|
||||
orgID := int64(rand.Intn(1000))
|
||||
gen = gen.With(gen.WithOrgID(orgID))
|
||||
// Create a new store to pass the custom bus to check the signal
|
||||
b := &fakeBus{}
|
||||
logger := log.New("test-dbstore")
|
||||
|
||||
cfg.UnifiedAlerting.DeletedRuleRetention = 0
|
||||
|
||||
store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, b)
|
||||
store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore)
|
||||
|
||||
result, err := store.InsertAlertRules(context.Background(), &models.AlertingUserUID, gen.GenerateMany(3))
|
||||
uids := make([]string, 0, len(result))
|
||||
for _, rule := range result {
|
||||
uids = append(uids, rule.UID)
|
||||
}
|
||||
require.NoError(t, err)
|
||||
rules, err := store.ListAlertRules(context.Background(), &models.ListAlertRulesQuery{OrgID: orgID, RuleUIDs: uids})
|
||||
require.NoError(t, err)
|
||||
|
||||
updates := make([]models.UpdateRule, 0, len(rules))
|
||||
for _, rule := range rules {
|
||||
rule2 := models.CopyRule(rule, gen.WithTitle(util.GenerateShortUID()))
|
||||
updates = append(updates, models.UpdateRule{
|
||||
Existing: rule,
|
||||
New: *rule2,
|
||||
})
|
||||
}
|
||||
err = store.UpdateAlertRules(context.Background(), &models.AlertingUserUID, updates)
|
||||
require.NoError(t, err)
|
||||
|
||||
versions, err := store.GetAlertRuleVersions(context.Background(), orgID, rules[0].GUID)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, versions, 2)
|
||||
|
||||
err = store.DeleteAlertRulesByUID(context.Background(), orgID, util.Pointer(models.UserUID("test")), uids...)
|
||||
require.NoError(t, err)
|
||||
|
||||
guids := make([]string, 0, len(rules))
|
||||
for _, rule := range rules {
|
||||
guids = append(guids, rule.GUID)
|
||||
}
|
||||
|
||||
_ = sqlStore.WithDbSession(context.Background(), func(sess *sqlstore.DBSession) error {
|
||||
var versions []alertRuleVersion
|
||||
err = sess.Table(alertRuleVersion{}).Where(`rule_uid = ''`).In("rule_guid", guids).Find(&versions)
|
||||
require.NoError(t, err)
|
||||
require.Emptyf(t, versions, "some rules were not permanently deleted") // should be one version per GUID
|
||||
return nil
|
||||
})
|
||||
})
|
||||
}
|
||||
|
||||
func TestIntegrationInsertAlertRules(t *testing.T) {
|
||||
@ -1962,6 +2017,7 @@ func TestIntegration_ListDeletedRules(t *testing.T) {
|
||||
cfg.UnifiedAlerting = setting.UnifiedAlertingSettings{
|
||||
BaseInterval: 1 * time.Second,
|
||||
RuleVersionRecordLimit: -1,
|
||||
DeletedRuleRetention: 10 * time.Hour,
|
||||
}
|
||||
sqlStore := db.InitTestDB(t)
|
||||
folderService := setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures())
|
||||
@ -2011,6 +2067,72 @@ func TestIntegration_ListDeletedRules(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func TestIntegration_CleanUpDeletedAlertRules(t *testing.T) {
|
||||
if testing.Short() {
|
||||
t.Skip("skipping integration test")
|
||||
}
|
||||
|
||||
oldClk := TimeNow
|
||||
t.Cleanup(func() {
|
||||
TimeNow = oldClk
|
||||
})
|
||||
|
||||
t0 := time.Now().UTC().Truncate(time.Second)
|
||||
TimeNow = func() time.Time {
|
||||
return t0
|
||||
}
|
||||
|
||||
sqlStore := db.InitTestDB(t, sqlstore.InitTestDBOpt{
|
||||
Cfg: nil,
|
||||
})
|
||||
cfg := setting.NewCfg()
|
||||
cfg.UnifiedAlerting.BaseInterval = 1 * time.Second
|
||||
cfg.UnifiedAlerting.RuleVersionRecordLimit = -1
|
||||
cfg.UnifiedAlerting.DeletedRuleRetention = 10 * time.Second
|
||||
|
||||
folderService := setupFolderService(t, sqlStore, cfg, featuremgmt.WithFeatures())
|
||||
logger := log.New("test-dbstore")
|
||||
store := createTestStore(sqlStore, folderService, logger, cfg.UnifiedAlerting, &fakeBus{})
|
||||
store.FeatureToggles = featuremgmt.WithFeatures(featuremgmt.FlagAlertRuleRestore)
|
||||
|
||||
gen := models.RuleGen
|
||||
orgID := int64(rand.Intn(1000))
|
||||
|
||||
gen = gen.With(gen.WithOrgID(orgID))
|
||||
|
||||
result, err := store.InsertAlertRules(context.Background(), &models.AlertingUserUID, gen.GenerateMany(3))
|
||||
uids := make([]string, 0, len(result))
|
||||
for _, rule := range result {
|
||||
uids = append(uids, rule.UID)
|
||||
}
|
||||
require.NoError(t, err)
|
||||
|
||||
// simulate rule deletion at different time.
|
||||
// t0, t0+10s, t0+20s
|
||||
for idx, uid := range uids {
|
||||
TimeNow = func() time.Time {
|
||||
return t0.Add(time.Duration(idx) * 10 * time.Second)
|
||||
}
|
||||
err = store.DeleteAlertRulesByUID(context.Background(), orgID, util.Pointer(models.UserUID("test")), uid)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
|
||||
before, err := store.ListDeletedRules(context.Background(), orgID)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, before, 3)
|
||||
|
||||
// retention is 10s, now=t+20s, therefore, only one row should be deleted
|
||||
_, err = store.CleanUpDeletedAlertRules(context.Background())
|
||||
require.NoError(t, err)
|
||||
|
||||
after, err := store.ListDeletedRules(context.Background(), orgID)
|
||||
require.NoError(t, err)
|
||||
assert.Len(t, after, 1)
|
||||
for _, rule := range after {
|
||||
assert.GreaterOrEqual(t, rule.Updated, TimeNow().Add(-cfg.UnifiedAlerting.DeletedRuleRetention))
|
||||
}
|
||||
}
|
||||
|
||||
func createTestStore(
|
||||
sqlStore db.DB,
|
||||
folderService folder.Service,
|
||||
|
@ -129,6 +129,9 @@ type UnifiedAlertingSettings struct {
|
||||
// should be stored in the database for each alert_rule in an organization including the current one.
|
||||
// 0 value means no limit
|
||||
RuleVersionRecordLimit int
|
||||
|
||||
// DeletedRuleRetention defines the maximum duration to retain deleted alerting rules before permanent removal.
|
||||
DeletedRuleRetention time.Duration
|
||||
}
|
||||
|
||||
type RecordingRuleSettings struct {
|
||||
@ -477,6 +480,11 @@ func (cfg *Cfg) ReadUnifiedAlertingSettings(iniFile *ini.File) error {
|
||||
return fmt.Errorf("setting 'rule_version_record_limit' is invalid, only 0 or a positive integer are allowed")
|
||||
}
|
||||
|
||||
uaCfg.DeletedRuleRetention = ua.Key("deleted_rule_retention").MustDuration(30 * 24 * time.Hour)
|
||||
if uaCfg.DeletedRuleRetention < 0 {
|
||||
return fmt.Errorf("setting 'deleted_rule_retention' is invalid, only 0 or a positive duration are allowed")
|
||||
}
|
||||
|
||||
cfg.UnifiedAlerting = uaCfg
|
||||
return nil
|
||||
}
|
||||
|
Reference in New Issue
Block a user