package controller

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"time"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/apiserver/pkg/endpoints/request"
	"k8s.io/client-go/tools/cache"
	"k8s.io/client-go/util/workqueue"

	"github.com/grafana/grafana-app-sdk/logging"
	"github.com/grafana/grafana/pkg/apimachinery/identity"
	provisioning "github.com/grafana/grafana/pkg/apis/provisioning/v0alpha1"
	client "github.com/grafana/grafana/pkg/generated/clientset/versioned/typed/provisioning/v0alpha1"
	informer "github.com/grafana/grafana/pkg/generated/informers/externalversions/provisioning/v0alpha1"
	listers "github.com/grafana/grafana/pkg/generated/listers/provisioning/v0alpha1"
	"github.com/grafana/grafana/pkg/registry/apis/provisioning/jobs"
	"github.com/grafana/grafana/pkg/registry/apis/provisioning/repository"
	"github.com/grafana/grafana/pkg/registry/apis/provisioning/resources"
	"github.com/grafana/grafana/pkg/registry/apis/provisioning/secrets"
	"github.com/grafana/grafana/pkg/storage/legacysql/dualwrite"
)

type RepoGetter interface {
	// Given a repository configuration, return it as a repository instance.
	// This will only error for unrecoverable system errors;
	// the repository instance may or may not be valid/healthy.
	AsRepository(ctx context.Context, cfg *provisioning.Repository) (repository.Repository, error)
}

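// RepositoryTester runs a validation test against a repository instance and reports the results.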
type RepositoryTester interface {
	TestRepository(ctx context.Context, repo repository.Repository) (*provisioning.TestResults, error)
}

const loggerName = "provisioning-repository-controller"

const (
	maxAttempts = 3
)

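// queueItem is a single unit of work on the controller queue: the object key,
// the original object, and how many processing attempts have been made so far.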
type queueItem struct {
	key      string
	obj      interface{}
	attempts int
}

// RepositoryController watches provisioning Repository resources and keeps their
// status (health, hooks, sync state) up to date, queueing sync jobs when needed.
type RepositoryController struct {
	client         client.ProvisioningV0alpha1Interface
	resourceLister resources.ResourceLister
	repoLister     listers.RepositoryLister
	repoSynced     cache.InformerSynced
	parsers        resources.ParserFactory
	logger         logging.Logger
	secrets        secrets.Service
	dualwrite      dualwrite.Service

	jobs      jobs.Queue
	finalizer *finalizer

	// Converts config to instance
	repoGetter RepoGetter
	tester     RepositoryTester
	// To allow injection for testing.
	processFn         func(item *queueItem) error
	enqueueRepository func(obj any)
	keyFunc           func(obj any) (string, error)

	queue workqueue.TypedRateLimitingInterface[*queueItem]
}

// NewRepositoryController creates a new RepositoryController.
func NewRepositoryController(
	provisioningClient client.ProvisioningV0alpha1Interface,
	repoInformer informer.RepositoryInformer,
	repoGetter RepoGetter,
	resourceLister resources.ResourceLister,
	parsers resources.ParserFactory,
	clients resources.ClientFactory,
	tester RepositoryTester,
	jobs jobs.Queue,
	secrets secrets.Service,
	dualwrite dualwrite.Service,
) (*RepositoryController, error) {
	rc := &RepositoryController{
		client:         provisioningClient,
		resourceLister: resourceLister,
		repoLister:     repoInformer.Lister(),
		repoSynced:     repoInformer.Informer().HasSynced,
		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
			workqueue.DefaultTypedControllerRateLimiter[*queueItem](),
			workqueue.TypedRateLimitingQueueConfig[*queueItem]{
				Name: "provisioningRepositoryController",
			},
		),
		repoGetter: repoGetter,
		parsers:    parsers,
		finalizer: &finalizer{
			lister:        resourceLister,
			clientFactory: clients,
		},
		tester:    tester,
		jobs:      jobs,
		logger:    logging.DefaultLogger.With("logger", loggerName),
		secrets:   secrets,
		dualwrite: dualwrite,
	}

	_, err := repoInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
		AddFunc: rc.enqueue,
		UpdateFunc: func(oldObj, newObj interface{}) {
			rc.enqueue(newObj)
		},
	})
	if err != nil {
		return nil, err
	}

	rc.processFn = rc.process
	rc.enqueueRepository = rc.enqueue
	rc.keyFunc = repoKeyFunc

	return rc, nil
}

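// repoKeyFunc builds the namespace/name work-queue key for a Repository object.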
func repoKeyFunc(obj any) (string, error) {
	repo, ok := obj.(*provisioning.Repository)
	if !ok {
		return "", fmt.Errorf("expected a Repository but got %T", obj)
	}
	return cache.DeletionHandlingMetaNamespaceKeyFunc(repo)
}

// Run starts the RepositoryController.
func (rc *RepositoryController) Run(ctx context.Context, workerCount int) {
	defer utilruntime.HandleCrash()
	defer rc.queue.ShutDown()

	logger := rc.logger
	ctx = logging.Context(ctx, logger)
	logger.Info("Starting RepositoryController")
	defer logger.Info("Shutting down RepositoryController")

	if !cache.WaitForCacheSync(ctx.Done(), rc.repoSynced) {
		return
	}

	logger.Info("Starting workers", "count", workerCount)
	for i := 0; i < workerCount; i++ {
		go wait.UntilWithContext(ctx, rc.runWorker, time.Second)
	}

	logger.Info("Started workers")
	<-ctx.Done()
	logger.Info("Shutting down workers")
}

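// runWorker keeps processing items from the queue until it is shut down.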
func (rc *RepositoryController) runWorker(ctx context.Context) {
	for rc.processNextWorkItem(ctx) {
	}
}

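// enqueue adds a Repository to the work queue, keyed by namespace/name.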
func (rc *RepositoryController) enqueue(obj interface{}) {
	key, err := rc.keyFunc(obj)
	if err != nil {
		utilruntime.HandleError(fmt.Errorf("couldn't get key for object: %v", err))
		return
	}

	item := queueItem{key: key, obj: obj}
	rc.queue.Add(&item)
}

// processNextWorkItem deals with one key off the queue.
// It returns false when it's time to quit.
func (rc *RepositoryController) processNextWorkItem(ctx context.Context) bool {
	item, quit := rc.queue.Get()
	if quit {
		return false
	}
	defer rc.queue.Done(item)

	// TODO: should we move tracking work to trace ids instead?
	logger := logging.FromContext(ctx).With("work_key", item.key)
	logger.Info("RepositoryController processing key")

	err := rc.processFn(item)
	if err == nil {
		rc.queue.Forget(item)
		return true
	}

	item.attempts++
	logger = logger.With("error", err, "attempts", item.attempts)
	logger.Error("RepositoryController failed to process key")

	if item.attempts >= maxAttempts {
		logger.Error("RepositoryController failed too many times")
		rc.queue.Forget(item)
		return true
	}

	if !apierrors.IsServiceUnavailable(err) {
		logger.Info("RepositoryController will not retry")
		rc.queue.Forget(item)
		return true
	}
	logger.Info("RepositoryController will retry as service is unavailable")

	utilruntime.HandleError(fmt.Errorf("%v failed with: %v", item, err))
	rc.queue.AddRateLimited(item)

	return true
}

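// handleDelete processes any finalizers on a repository that is being deleted
// and then removes them, allowing the deletion to complete.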
func (rc *RepositoryController) handleDelete(ctx context.Context, obj *provisioning.Repository) error {
	logger := logging.FromContext(ctx)
	logger.Info("handle repository delete")

	// Process any finalizers
	if len(obj.Finalizers) > 0 {
		repo, err := rc.repoGetter.AsRepository(ctx, obj)
		if err != nil {
			logger.Warn("unable to get repository for cleanup")
		} else {
			err := rc.finalizer.process(ctx, repo, obj.Finalizers)
			if err != nil {
				logger.Warn("error running finalizer", "err", err)
			}
		}

		// remove the finalizers
		_, err = rc.client.Repositories(obj.GetNamespace()).
			Patch(ctx, obj.Name, types.JSONPatchType, []byte(`[
				{ "op": "remove", "path": "/metadata/finalizers" }
			]`), v1.PatchOptions{
				FieldManager: "repository-controller",
			})
		return err // delete will be called again
	}

	return nil
}

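// shouldCheckHealth reports whether the health status is stale: always for
// never-checked or spec-changed repositories, after 5 minutes when healthy,
// and after 1 minute when unhealthy.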
func (rc *RepositoryController) shouldCheckHealth(obj *provisioning.Repository) bool {
	if obj.Status.Health.Checked == 0 || obj.Generation != obj.Status.ObservedGeneration {
		return true
	}

	healthAge := time.Since(time.UnixMilli(obj.Status.Health.Checked))
	if obj.Status.Health.Healthy {
		return healthAge > time.Minute*5 // when healthy, check every 5 mins
	}

	return healthAge > time.Minute // otherwise within a minute
}

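// runHealthCheck tests the repository and converts the result (including any
// test error) into a HealthStatus.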
func (rc *RepositoryController) runHealthCheck(ctx context.Context, repo repository.Repository) provisioning.HealthStatus {
	logger := logging.FromContext(ctx)
	logger.Info("running health check")
	res, err := rc.tester.TestRepository(ctx, repo)
	if err != nil {
		res = &provisioning.TestResults{
			Success: false,
			Errors: []provisioning.ErrorDetails{{
				Detail: fmt.Sprintf("error running test repository: %s", err.Error()),
			}},
		}
	}

	healthStatus := provisioning.HealthStatus{
		Healthy: res.Success,
		Checked: time.Now().UnixMilli(),
	}
	for _, err := range res.Errors {
		if err.Detail != "" {
			healthStatus.Message = append(healthStatus.Message, err.Detail)
		}
	}

	logger.Info("health check completed", "status", healthStatus)

	return healthStatus
}

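// shouldResync reports whether the configured sync interval has elapsed since
// the last finished sync, skipping repositories that never synced, are
// currently working, or have been pending for too long.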
func (rc *RepositoryController) shouldResync(obj *provisioning.Repository) bool {
	// don't trigger resync if a sync was never started
	if obj.Status.Sync.Finished == 0 && obj.Status.Sync.State == "" {
		return false
	}

	syncAge := time.Since(time.UnixMilli(obj.Status.Sync.Finished))
	syncInterval := time.Duration(obj.Spec.Sync.IntervalSeconds) * time.Second
	tolerance := time.Second

	// HACK: how would this work in a multi-tenant world or under heavy load?
	// It will start queueing up jobs and we will have to deal with that
	pendingForTooLong := syncAge >= syncInterval/2 && obj.Status.Sync.State == provisioning.JobStatePending
	isRunning := obj.Status.Sync.State == provisioning.JobStateWorking

	return obj.Spec.Sync.Enabled && syncAge >= (syncInterval-tolerance) && !pendingForTooLong && !isRunning
}

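// runHooks calls OnCreate (for new repositories) or OnUpdate (for spec changes)
// when the repository implements repository.Hooks, and returns the status patch
// operations produced by the hook.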
func (rc *RepositoryController) runHooks(ctx context.Context, repo repository.Repository, obj *provisioning.Repository) ([]map[string]interface{}, error) {
	logger := logging.FromContext(ctx)
	hooks, _ := repo.(repository.Hooks)
	if hooks == nil || obj.Generation == obj.Status.ObservedGeneration {
		return nil, nil
	}

	if obj.Status.ObservedGeneration < 1 {
		logger.Info("handle repository create")
		patchOperations, err := hooks.OnCreate(ctx)
		if err != nil {
			return nil, fmt.Errorf("error running OnCreate: %w", err)
		}
		return patchOperations, nil
	}

	logger.Info("handle repository spec update", "Generation", obj.Generation, "ObservedGeneration", obj.Status.ObservedGeneration)
	patchOperations, err := hooks.OnUpdate(ctx)
	if err != nil {
		return nil, fmt.Errorf("error running OnUpdate: %w", err)
	}

	return patchOperations, nil
}

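// determineSyncStrategy decides whether to queue a sync job and with which
// options: nil when sync is disabled, the repository is unhealthy, or legacy
// storage is still authoritative; a full sync when the repository is new,
// changed, or just became healthy; an incremental sync when the interval elapsed.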
func (rc *RepositoryController) determineSyncStrategy(ctx context.Context, obj *provisioning.Repository, shouldResync bool, healthStatus provisioning.HealthStatus) *provisioning.SyncJobOptions {
	logger := logging.FromContext(ctx)

	switch {
	case !obj.Spec.Sync.Enabled:
		logger.Info("skip sync as it's disabled")
		return nil
	case !healthStatus.Healthy:
		logger.Info("skip sync for unhealthy repository")
		return nil
	case dualwrite.IsReadingLegacyDashboardsAndFolders(ctx, rc.dualwrite):
		logger.Info("skip sync as we are reading from legacy storage")
		return nil
	case healthStatus.Healthy != obj.Status.Health.Healthy:
		logger.Info("repository became healthy, full resync")
		return &provisioning.SyncJobOptions{}
	case obj.Status.ObservedGeneration < 1:
		logger.Info("full sync for new repository")
		return &provisioning.SyncJobOptions{}
	case obj.Generation != obj.Status.ObservedGeneration:
		logger.Info("full sync for spec change")
		return &provisioning.SyncJobOptions{}
	case shouldResync:
		logger.Info("incremental sync for sync interval")
		return &provisioning.SyncJobOptions{Incremental: true}
	default:
		return nil
	}
}

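// addSyncJob queues a pull job for the repository; an already-existing job is
// treated as success.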
func (rc *RepositoryController) addSyncJob(ctx context.Context, obj *provisioning.Repository, syncOptions *provisioning.SyncJobOptions) error {
	job, err := rc.jobs.Insert(ctx, obj.Namespace, provisioning.JobSpec{
		Repository: obj.GetName(),
		Action:     provisioning.JobActionPull,
		Pull:       syncOptions,
	})
	if apierrors.IsAlreadyExists(err) {
		logging.FromContext(ctx).Info("sync job already exists, nothing triggered")
		return nil
	}
	if err != nil {
		// FIXME: should we update the status of the repository if we fail to add the job?
		return fmt.Errorf("error adding sync job: %w", err)
	}

	logging.FromContext(ctx).Info("sync job triggered", "job", job.Name)
	return nil
}

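// patchStatus applies the accumulated JSON patch operations to the repository
// status subresource.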
func (rc *RepositoryController) patchStatus(ctx context.Context, obj *provisioning.Repository, patchOperations []map[string]interface{}) error {
	if len(patchOperations) == 0 {
		return nil
	}

	patch, err := json.Marshal(patchOperations)
	if err != nil {
		return fmt.Errorf("error encoding status patch: %w", err)
	}

	_, err = rc.client.Repositories(obj.GetNamespace()).
		Patch(ctx, obj.Name, types.JSONPatchType, patch, v1.PatchOptions{}, "status")
	if err != nil {
		return fmt.Errorf("error applying status patch: %w", err)
	}

	return nil
}

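// determineSyncStatus computes the sync status patch to apply: pending when a
// sync job will be queued, cleared when the unhealthy message no longer applies,
// an error when the repository is unhealthy, or nil when no change is needed.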
func (rc *RepositoryController) determineSyncStatus(obj *provisioning.Repository, syncOptions *provisioning.SyncJobOptions) *provisioning.SyncStatus {
	const unhealthyMessage = "Repository is unhealthy"

	hasUnhealthyMessage := len(obj.Status.Sync.Message) > 0 && obj.Status.Sync.Message[0] == unhealthyMessage
	switch {
	case syncOptions != nil:
		return &provisioning.SyncStatus{
			State:   provisioning.JobStatePending,
			LastRef: obj.Status.Sync.LastRef,
			Started: time.Now().UnixMilli(),
		}
	case obj.Status.Health.Healthy && hasUnhealthyMessage: // if the repository is healthy and the message is set, clear it
		// FIXME: is this the clearest way to do this? Should we introduce another
		// status or way of handling more specific errors?
		return &provisioning.SyncStatus{
			LastRef: obj.Status.Sync.LastRef,
		}
	case !obj.Status.Health.Healthy && !hasUnhealthyMessage: // if the repository is unhealthy and the message is not already set, set it
		return &provisioning.SyncStatus{
			State:   provisioning.JobStateError,
			Message: []string{unhealthyMessage},
			LastRef: obj.Status.Sync.LastRef,
		}
	default:
		return nil
	}
}

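// process reconciles a single repository: it handles deletion, refreshes health,
// runs hooks, patches status, and queues a sync job when needed.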
//nolint:gocyclo
func (rc *RepositoryController) process(item *queueItem) error {
	logger := rc.logger.With("key", item.key)

	namespace, name, err := cache.SplitMetaNamespaceKey(item.key)
	if err != nil {
		return err
	}

	obj, err := rc.repoLister.Repositories(namespace).Get(name)
	switch {
	case apierrors.IsNotFound(err):
		return errors.New("repository not found in cache")
	case err != nil:
		return err
	}

	ctx, _, err := identity.WithProvisioningIdentity(context.Background(), namespace)
	if err != nil {
		return err
	}
	ctx = request.WithNamespace(ctx, namespace)
	logger = logger.WithContext(ctx)

	if obj.DeletionTimestamp != nil {
		return rc.handleDelete(ctx, obj)
	}

	shouldResync := rc.shouldResync(obj)
	shouldCheckHealth := rc.shouldCheckHealth(obj)
	hasSpecChanged := obj.Generation != obj.Status.ObservedGeneration
	patchOperations := []map[string]interface{}{}

	// Determine the main triggering condition
	switch {
	case hasSpecChanged:
		logger.Info("spec changed", "Generation", obj.Generation, "ObservedGeneration", obj.Status.ObservedGeneration)
		patchOperations = append(patchOperations, map[string]interface{}{
			"op":    "replace",
			"path":  "/status/observedGeneration",
			"value": obj.Generation,
		})
	case shouldResync:
		logger.Info("sync interval triggered", "sync_interval", time.Duration(obj.Spec.Sync.IntervalSeconds)*time.Second, "sync_status", obj.Status.Sync)
	case shouldCheckHealth:
		logger.Info("health is stale", "health_status", obj.Status.Health.Healthy)
	default:
		logger.Info("skipping as conditions are not met", "status", obj.Status, "generation", obj.Generation, "sync_spec", obj.Spec.Sync)
		return nil
	}

	repo, err := rc.repoGetter.AsRepository(ctx, obj)
	if err != nil {
		return fmt.Errorf("unable to create repository from configuration: %w", err)
	}

	healthStatus := obj.Status.Health
	if shouldCheckHealth {
		healthStatus = rc.runHealthCheck(ctx, repo)
		patchOperations = append(patchOperations, map[string]interface{}{
			"op":    "replace",
			"path":  "/status/health",
			"value": healthStatus,
		})
	}

	// Run hooks
	hookOps, err := rc.runHooks(ctx, repo, obj)
	switch {
	case err != nil:
		return err
	case len(hookOps) > 0:
		patchOperations = append(patchOperations, hookOps...)
	}

	// determine the sync strategy and sync status to apply
	syncOptions := rc.determineSyncStrategy(ctx, obj, shouldResync, healthStatus)
	if syncStatus := rc.determineSyncStatus(obj, syncOptions); syncStatus != nil {
		patchOperations = append(patchOperations, map[string]interface{}{
			"op":    "replace",
			"path":  "/status/sync",
			"value": syncStatus,
		})
	}

	// Apply all patch operations
	if err := rc.patchStatus(ctx, obj, patchOperations); err != nil {
		return err
	}

	// Trigger sync job after we have applied all patch operations
	if syncOptions != nil {
		if err := rc.addSyncJob(ctx, obj, syncOptions); err != nil {
			return err
		}
	}

	return nil
}