Add podman system check for checking storage consistency

Add a `podman system check` that performs consistency checks on local
storage, optionally removing damaged items so that they can be
recreated.

Signed-off-by: Nalin Dahyabhai <nalin@redhat.com>
Nalin Dahyabhai
2024-04-02 16:18:19 -04:00
parent c510959826
commit fec58a4571
15 changed files with 565 additions and 0 deletions

cmd/podman/system/check.go

@@ -0,0 +1,138 @@
package system

import (
    "context"
    "errors"
    "fmt"
    "time"

    "github.com/containers/common/pkg/completion"
    "github.com/containers/podman/v5/cmd/podman/registry"
    "github.com/containers/podman/v5/cmd/podman/validate"
    "github.com/containers/podman/v5/pkg/domain/entities/types"
    multierror "github.com/hashicorp/go-multierror"
    "github.com/spf13/cobra"
)
var (
    checkOptions     = types.SystemCheckOptions{}
    checkDescription = `
podman system check
Check storage for consistency and remove anything that looks damaged
`

    checkCommand = &cobra.Command{
        Use:               "check [options]",
        Short:             "Check storage consistency",
        Args:              validate.NoArgs,
        Long:              checkDescription,
        RunE:              check,
        ValidArgsFunction: completion.AutocompleteNone,
        Example:           `podman system check`,
    }
)
func init() {
    registry.Commands = append(registry.Commands, registry.CliCommand{
        Command: checkCommand,
        Parent:  systemCmd,
    })
    flags := checkCommand.Flags()
    flags.BoolVarP(&checkOptions.Quick, "quick", "q", false, "Skip time-consuming checks. The default is to include time-consuming checks")
    flags.BoolVarP(&checkOptions.Repair, "repair", "r", false, "Remove inconsistent images")
    flags.BoolVarP(&checkOptions.RepairLossy, "force", "f", false, "Remove inconsistent images and containers")
    flags.DurationP("max", "m", 24*time.Hour, "Maximum allowed age of unreferenced layers")
    _ = checkCommand.RegisterFlagCompletionFunc("max", completion.AutocompleteNone)
}
func check(cmd *cobra.Command, args []string) error {
    flags := cmd.Flags()
    if flags.Changed("max") {
        maxAge, err := flags.GetDuration("max")
        if err != nil {
            return err
        }
        checkOptions.UnreferencedLayerMaximumAge = &maxAge
    }
    response, err := registry.ContainerEngine().SystemCheck(context.Background(), checkOptions)
    if err != nil {
        return err
    }

    if err = printSystemCheckResults(response); err != nil {
        return err
    }

    // if we weren't asked to repair anything, detected damage is a hard error
    if !checkOptions.Repair && !checkOptions.RepairLossy && response.Errors {
        return errors.New("damage detected in local storage")
    }

    // run the check a second time, with repairs disabled, to verify that
    // whatever was removed actually resolved the reported problems
    recheckOptions := checkOptions
    recheckOptions.Repair = false
    recheckOptions.RepairLossy = false
    if response, err = registry.ContainerEngine().SystemCheck(context.Background(), recheckOptions); err != nil {
        return err
    }
    if response.Errors {
        return errors.New("damage in local storage still present after repair attempt")
    }

    return nil
}
func printSystemCheckResults(report *types.SystemCheckReport) error {
    if !report.Errors {
        return nil
    }
    errorSlice := func(strs []string) []error {
        if strs == nil {
            return nil
        }
        errs := make([]error, len(strs))
        for i, s := range strs {
            errs[i] = errors.New(s)
        }
        return errs
    }
    for damagedLayer, errorsSlice := range report.Layers {
        merr := multierror.Append(nil, errorSlice(errorsSlice)...)
        if err := merr.ErrorOrNil(); err != nil {
            fmt.Printf("Damaged layer %s:\n%s", damagedLayer, err)
        }
    }
    for _, removedLayer := range report.RemovedLayers {
        fmt.Printf("Deleted damaged layer: %s\n", removedLayer)
    }
    for damagedROLayer, errorsSlice := range report.ROLayers {
        merr := multierror.Append(nil, errorSlice(errorsSlice)...)
        if err := merr.ErrorOrNil(); err != nil {
            fmt.Printf("Damaged read-only layer %s:\n%s", damagedROLayer, err)
        }
    }
    for damagedImage, errorsSlice := range report.Images {
        merr := multierror.Append(nil, errorSlice(errorsSlice)...)
        if err := merr.ErrorOrNil(); err != nil {
            fmt.Printf("Damaged image %s:\n%s", damagedImage, err)
        }
    }
    for removedImage := range report.RemovedImages {
        fmt.Printf("Deleted damaged image: %s\n", removedImage)
    }
    for damagedROImage, errorsSlice := range report.ROImages {
        merr := multierror.Append(nil, errorSlice(errorsSlice)...)
        if err := merr.ErrorOrNil(); err != nil {
            fmt.Printf("Damaged read-only image %s:\n%s", damagedROImage, err)
        }
    }
    for damagedContainer, errorsSlice := range report.Containers {
        merr := multierror.Append(nil, errorSlice(errorsSlice)...)
        if err := merr.ErrorOrNil(); err != nil {
            fmt.Printf("Damaged container %s:\n%s", damagedContainer, err)
        }
    }
    for removedContainer := range report.RemovedContainers {
        fmt.Printf("Deleted damaged container: %s\n", removedContainer)
    }
    return nil
}
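Taken together, the flags registered above give the command its two modes: a read-only pass that exits non-zero when damage is found, and a repair pass that removes what it cannot salvage and then re-checks. A few illustrative invocations (output omitted):

```
# report problems only; the command exits with an error if damage is detected
podman system check

# remove damaged images, treating unreferenced layers older than 48 hours as damaged
podman system check --repair --max 48h

# additionally remove containers that depend on damaged images
podman system check --repair --force
```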


@@ -0,0 +1,59 @@
% podman-system-check 1

## NAME
podman\-system\-check - Perform consistency checks on image and container storage

## SYNOPSIS
**podman system check** [*options*]

## DESCRIPTION
Perform consistency checks on image and container storage, reporting images and
containers which have identified issues.

## OPTIONS

#### **--force**, **-f**

When attempting to remove damaged images, also remove containers which depend
on those images. By default, damaged images which are being used by containers
are left alone.

Containers which depend on damaged images do so regardless of which engine
created them, but because podman only "knows" how to shut down containers that
it started, the effect on still-running containers which were started by other
engines is difficult to predict.

#### **--max**, **-m**=*duration*

When considering layers which are not used by any images or containers, assume
that any layers which are more than *duration* old are the results of canceled
attempts to pull images, and should be treated as though they are damaged. If
not specified, the default is 24 hours.

#### **--quick**, **-q**

Skip checks which are known to be time-consuming. This will prevent some types
of errors from being detected.

#### **--repair**, **-r**

Remove any images which are determined to have been damaged in some way, unless
they are in use by containers. Use **--force** to remove containers which
depend on damaged images, and those damaged images, as well.

## EXAMPLE

A reasonably quick check:
```
podman system check --quick --repair --force
```

A more thorough check:
```
podman system check --repair --max=1h --force
```

## SEE ALSO
**[podman(1)](podman.1.md)**, **[podman-system(1)](podman-system.1.md)**

## HISTORY
April 2024


@@ -13,6 +13,7 @@ The system command allows management of the podman systems
| Command | Man Page | Description |
| ------- | ------------------------------------------------------------ | ------------------------------------------------------------------------ |
| check | [podman-system-check(1)](podman-system-check.1.md) | Perform consistency checks on image and container storage. |
| connection | [podman-system-connection(1)](podman-system-connection.1.md) | Manage the destination(s) for Podman service(s) |
| df | [podman-system-df(1)](podman-system-df.1.md) | Show podman disk usage. |
| events | [podman-events(1)](podman-events.1.md) | Monitor Podman events |


@@ -31,6 +31,7 @@ import (
    "github.com/containers/podman/v5/libpod/lock"
    "github.com/containers/podman/v5/libpod/plugin"
    "github.com/containers/podman/v5/libpod/shutdown"
    "github.com/containers/podman/v5/pkg/domain/entities"
    "github.com/containers/podman/v5/pkg/rootless"
    "github.com/containers/podman/v5/pkg/systemd"
    "github.com/containers/podman/v5/pkg/util"
@@ -39,9 +40,11 @@ import (
    "github.com/containers/storage/pkg/lockfile"
    "github.com/containers/storage/pkg/unshare"
    "github.com/docker/docker/pkg/namesgenerator"
    "github.com/hashicorp/go-multierror"
    jsoniter "github.com/json-iterator/go"
    spec "github.com/opencontainers/runtime-spec/specs-go"
    "github.com/sirupsen/logrus"
    "golang.org/x/exp/slices"
)

// Set up the JSON library for all of Libpod
@@ -1249,3 +1252,133 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, []uint32, error) {
    return toReturn, locksHeld, nil
}
// SystemCheck checks our storage for consistency, and depending on the options
// specified, will attempt to remove anything which fails consistency checks.
func (r *Runtime) SystemCheck(ctx context.Context, options entities.SystemCheckOptions) (entities.SystemCheckReport, error) {
    what := storage.CheckEverything()
    if options.Quick {
        what = storage.CheckMost()
    }
    if options.UnreferencedLayerMaximumAge != nil {
        tmp := *options.UnreferencedLayerMaximumAge
        what.LayerUnreferencedMaximumAge = &tmp
    }
    storageReport, err := r.store.Check(what)
    if err != nil {
        return entities.SystemCheckReport{}, err
    }
    if len(storageReport.Containers) == 0 &&
        len(storageReport.Layers) == 0 &&
        len(storageReport.ROLayers) == 0 &&
        len(storageReport.Images) == 0 &&
        len(storageReport.ROImages) == 0 {
        // no errors detected
        return entities.SystemCheckReport{}, nil
    }
    mapErrorSlicesToStringSlices := func(m map[string][]error) map[string][]string {
        if len(m) == 0 {
            return nil
        }
        mapped := make(map[string][]string, len(m))
        for k, errs := range m {
            strs := make([]string, len(errs))
            for i, e := range errs {
                strs[i] = e.Error()
            }
            mapped[k] = strs
        }
        return mapped
    }

    report := entities.SystemCheckReport{
        Errors:     true,
        Layers:     mapErrorSlicesToStringSlices(storageReport.Layers),
        ROLayers:   mapErrorSlicesToStringSlices(storageReport.ROLayers),
        Images:     mapErrorSlicesToStringSlices(storageReport.Images),
        ROImages:   mapErrorSlicesToStringSlices(storageReport.ROImages),
        Containers: mapErrorSlicesToStringSlices(storageReport.Containers),
    }
    if !options.Repair && report.Errors {
        // errors detected, no corrective measures to be taken
        return report, err
    }
    // get a list of images that we knew of before we tried to clean up any
    // that were damaged
    imagesBefore, err := r.store.Images()
    if err != nil {
        return report, fmt.Errorf("getting a list of images before attempting repairs: %w", err)
    }

    repairOptions := storage.RepairOptions{
        RemoveContainers: options.RepairLossy,
    }
    var containers []*Container
    if repairOptions.RemoveContainers {
        // build a list of the containers that we claim as ours that we
        // expect to be removing in a bit
        for containerID := range storageReport.Containers {
            ctr, lookupErr := r.state.LookupContainer(containerID)
            if lookupErr != nil {
                // we're about to remove it, so it's okay that
                // it isn't even one of ours
                continue
            }
            containers = append(containers, ctr)
        }
    }

    // run the cleanup
    merr := multierror.Append(nil, r.store.Repair(storageReport, &repairOptions)...)

    if repairOptions.RemoveContainers {
        // get the list of containers that storage will still admit to knowing about
        containersAfter, err := r.store.Containers()
        if err != nil {
            merr = multierror.Append(merr, fmt.Errorf("getting a list of containers after attempting repairs: %w", err))
        }
        for _, ctr := range containers {
            // if one of our containers that we tried to remove is
            // still on disk, report an error
            if slices.IndexFunc(containersAfter, func(containerAfter storage.Container) bool {
                return containerAfter.ID == ctr.ID()
            }) != -1 {
                merr = multierror.Append(merr, fmt.Errorf("storage for container %s was not cleared by the repair attempt", ctr.ID()))
                continue
            }
            // remove the container from our database
            if removeErr := r.state.RemoveContainer(ctr); removeErr != nil {
                merr = multierror.Append(merr, fmt.Errorf("updating state database to reflect removal of container %s: %w", ctr.ID(), removeErr))
                continue
            }
            if report.RemovedContainers == nil {
                report.RemovedContainers = make(map[string]string)
            }
            report.RemovedContainers[ctr.ID()] = ctr.config.Name
        }
    }
    // get a list of images that are still around after we clean up any
    // that were damaged
    imagesAfter, err := r.store.Images()
    if err != nil {
        merr = multierror.Append(merr, fmt.Errorf("getting a list of images after attempting repairs: %w", err))
    }
    for _, imageBefore := range imagesBefore {
        if slices.IndexFunc(imagesAfter, func(imageAfter storage.Image) bool {
            return imageAfter.ID == imageBefore.ID
        }) == -1 {
            if report.RemovedImages == nil {
                report.RemovedImages = make(map[string][]string)
            }
            report.RemovedImages[imageBefore.ID] = slices.Clone(imageBefore.Names)
        }
    }

    if merr != nil {
        err = merr.ErrorOrNil()
    }

    return report, err
}


@@ -3,6 +3,7 @@ package libpod
import (
    "fmt"
    "net/http"
    "time"

    "github.com/containers/podman/v5/libpod"
    "github.com/containers/podman/v5/pkg/api/handlers/utils"
@@ -65,3 +66,46 @@ func DiskUsage(w http.ResponseWriter, r *http.Request) {
    }
    utils.WriteResponse(w, http.StatusOK, response)
}
func SystemCheck(w http.ResponseWriter, r *http.Request) {
    decoder := r.Context().Value(api.DecoderKey).(*schema.Decoder)
    runtime := r.Context().Value(api.RuntimeKey).(*libpod.Runtime)
    query := struct {
        Quick                       bool   `schema:"quick"`
        Repair                      bool   `schema:"repair"`
        RepairLossy                 bool   `schema:"repair_lossy"`
        UnreferencedLayerMaximumAge string `schema:"unreferenced_layer_max_age"`
    }{}
    if err := decoder.Decode(&query, r.URL.Query()); err != nil {
        utils.Error(w, http.StatusBadRequest,
            fmt.Errorf("failed to parse parameters for %s: %w", r.URL.String(), err))
        return
    }

    containerEngine := abi.ContainerEngine{Libpod: runtime}

    var unreferencedLayerMaximumAge *time.Duration
    if query.UnreferencedLayerMaximumAge != "" {
        duration, err := time.ParseDuration(query.UnreferencedLayerMaximumAge)
        if err != nil {
            utils.Error(w, http.StatusBadRequest,
                fmt.Errorf("failed to parse unreferenced_layer_max_age parameter %q for %s: %w", query.UnreferencedLayerMaximumAge, r.URL.String(), err))
            return
        }
        unreferencedLayerMaximumAge = &duration
    }
    checkOptions := entities.SystemCheckOptions{
        Quick:                       query.Quick,
        Repair:                      query.Repair,
        RepairLossy:                 query.RepairLossy,
        UnreferencedLayerMaximumAge: unreferencedLayerMaximumAge,
    }
    report, err := containerEngine.SystemCheck(r.Context(), checkOptions)
    if err != nil {
        utils.InternalServerError(w, err)
        return
    }

    utils.WriteResponse(w, http.StatusOK, report)
}
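The `unreferenced_layer_max_age` query parameter travels as a plain duration string: the remote client (see the tunnel implementation near the end of this change) serializes a `time.Duration` with `String()`, and the handler above parses it back with `time.ParseDuration`. A minimal round-trip sketch of that convention:

```go
package main

import (
    "fmt"
    "time"
)

func main() {
    // what a client would send for the default --max of 24 hours
    wire := (24 * time.Hour).String() // "24h0m0s", matching the swagger default below

    // what the handler recovers on the server side
    parsed, err := time.ParseDuration(wire)
    if err != nil {
        panic(err)
    }
    fmt.Println(parsed == 24*time.Hour) // true
}
```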


@@ -188,6 +188,13 @@ type versionResponse struct {
    Body entities.ComponentVersion
}

// Check
// swagger:response
type systemCheckResponse struct {
    // in:body
    Body entities.SystemCheckReport
}

// Disk usage
// swagger:response
type systemDiskUsage struct {


@@ -25,6 +25,39 @@ func (s *APIServer) registerSystemHandlers(r *mux.Router) error {
    r.Handle(VersionedPath("/system/df"), s.APIHandler(compat.GetDiskUsage)).Methods(http.MethodGet)
    // Added non version path to URI to support docker non versioned paths
    r.Handle("/system/df", s.APIHandler(compat.GetDiskUsage)).Methods(http.MethodGet)

    // swagger:operation POST /libpod/system/check libpod SystemCheckLibpod
    // ---
    // tags:
    //  - system
    // summary: Performs consistency checks on storage, optionally removing items which fail checks
    // parameters:
    //  - in: query
    //    name: quick
    //    type: boolean
    //    description: Skip time-consuming checks
    //  - in: query
    //    name: repair
    //    type: boolean
    //    description: Remove inconsistent images
    //  - in: query
    //    name: repair_lossy
    //    type: boolean
    //    description: Remove inconsistent containers and images
    //  - in: query
    //    name: unreferenced_layer_max_age
    //    type: string
    //    description: Maximum allowed age of unreferenced layers
    //    default: 24h0m0s
    // produces:
    // - application/json
    // responses:
    //   200:
    //     $ref: '#/responses/systemCheckResponse'
    //   400:
    //     $ref: "#/responses/badParamError"
    //   500:
    //     $ref: "#/responses/internalError"
    r.Handle(VersionedPath("/libpod/system/check"), s.APIHandler(libpod.SystemCheck)).Methods(http.MethodPost)

    // swagger:operation POST /libpod/system/prune libpod SystemPruneLibpod
    // ---
    // tags:
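For manual testing, the new endpoint can be exercised directly over the API socket. A sketch using curl, assuming a root service listening on the default socket path and an API version prefix that matches the installed Podman (both are assumptions, not part of this change):

```
curl --unix-socket /run/podman/podman.sock \
     -X POST \
     "http://d/v5.0.0/libpod/system/check?repair=true&unreferenced_layer_max_age=1h"
```

The response body is the JSON-encoded SystemCheckReport defined further down.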


@@ -87,6 +87,26 @@ func Prune(ctx context.Context, options *PruneOptions) (*types.SystemPruneReport
    return &report, response.Process(&report)
}

func Check(ctx context.Context, options *CheckOptions) (*types.SystemCheckReport, error) {
    var report types.SystemCheckReport
    conn, err := bindings.GetClient(ctx)
    if err != nil {
        return nil, err
    }
    params, err := options.ToParams()
    if err != nil {
        return nil, err
    }
    response, err := conn.DoRequest(ctx, nil, http.MethodPost, "/system/check", params, nil)
    if err != nil {
        return nil, err
    }
    defer response.Body.Close()

    return &report, response.Process(&report)
}

func Version(ctx context.Context, options *VersionOptions) (*types.SystemVersionReport, error) {
    var (
        component types.SystemComponentVersion


@@ -38,3 +38,13 @@ type DiskOptions struct {
//go:generate go run ../generator/generator.go InfoOptions
type InfoOptions struct {
}

// CheckOptions are optional options for storage consistency check/repair
//
//go:generate go run ../generator/generator.go CheckOptions
type CheckOptions struct {
    Quick                       *bool   `schema:"quick"`
    Repair                      *bool   `schema:"repair"`
    RepairLossy                 *bool   `schema:"repair_lossy"`
    UnreferencedLayerMaximumAge *string `schema:"unreferenced_layer_max_age"`
}


@@ -0,0 +1,78 @@
// Code generated by go generate; DO NOT EDIT.
package system

import (
    "net/url"

    "github.com/containers/podman/v5/pkg/bindings/internal/util"
)

// Changed returns true if named field has been set
func (o *CheckOptions) Changed(fieldName string) bool {
    return util.Changed(o, fieldName)
}

// ToParams formats struct fields to be passed to API service
func (o *CheckOptions) ToParams() (url.Values, error) {
    return util.ToParams(o)
}

// WithQuick set field Quick to given value
func (o *CheckOptions) WithQuick(value bool) *CheckOptions {
    o.Quick = &value
    return o
}

// GetQuick returns value of field Quick
func (o *CheckOptions) GetQuick() bool {
    if o.Quick == nil {
        var z bool
        return z
    }
    return *o.Quick
}

// WithRepair set field Repair to given value
func (o *CheckOptions) WithRepair(value bool) *CheckOptions {
    o.Repair = &value
    return o
}

// GetRepair returns value of field Repair
func (o *CheckOptions) GetRepair() bool {
    if o.Repair == nil {
        var z bool
        return z
    }
    return *o.Repair
}

// WithRepairLossy set field RepairLossy to given value
func (o *CheckOptions) WithRepairLossy(value bool) *CheckOptions {
    o.RepairLossy = &value
    return o
}

// GetRepairLossy returns value of field RepairLossy
func (o *CheckOptions) GetRepairLossy() bool {
    if o.RepairLossy == nil {
        var z bool
        return z
    }
    return *o.RepairLossy
}

// WithUnreferencedLayerMaximumAge set field UnreferencedLayerMaximumAge to given value
func (o *CheckOptions) WithUnreferencedLayerMaximumAge(value string) *CheckOptions {
    o.UnreferencedLayerMaximumAge = &value
    return o
}

// GetUnreferencedLayerMaximumAge returns value of field UnreferencedLayerMaximumAge
func (o *CheckOptions) GetUnreferencedLayerMaximumAge() string {
    if o.UnreferencedLayerMaximumAge == nil {
        var z string
        return z
    }
    return *o.UnreferencedLayerMaximumAge
}
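A short sketch of how a Go client might drive the check through these bindings, assuming a reachable Podman service (the socket URI is illustrative):

```go
package main

import (
    "context"
    "fmt"

    "github.com/containers/podman/v5/pkg/bindings"
    "github.com/containers/podman/v5/pkg/bindings/system"
)

func main() {
    // connect to a running podman service; the socket URI is an assumption
    conn, err := bindings.NewConnection(context.Background(), "unix:///run/podman/podman.sock")
    if err != nil {
        panic(err)
    }

    // repair damaged images, treating unreferenced layers older than an hour as damaged
    opts := new(system.CheckOptions).
        WithRepair(true).
        WithUnreferencedLayerMaximumAge("1h")

    report, err := system.Check(conn, opts)
    if err != nil {
        panic(err)
    }
    fmt.Printf("errors detected: %v, images removed: %d\n", report.Errors, len(report.RemovedImages))
}
```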


@@ -103,6 +103,7 @@ type ContainerEngine interface { //nolint:interfacebloat
    SecretExists(ctx context.Context, nameOrID string) (*BoolReport, error)
    Shutdown(ctx context.Context)
    SystemDf(ctx context.Context, options SystemDfOptions) (*SystemDfReport, error)
    SystemCheck(ctx context.Context, options SystemCheckOptions) (*SystemCheckReport, error)
    Unshare(ctx context.Context, args []string, options SystemUnshareOptions) error
    Version(ctx context.Context) (*SystemVersionReport, error)
    VolumeCreate(ctx context.Context, opts VolumeCreateOptions) (*IDOrNameResponse, error)


@@ -9,6 +9,8 @@ type ServiceOptions = types.ServiceOptions
type SystemPruneOptions = types.SystemPruneOptions
type SystemPruneReport = types.SystemPruneReport
type SystemMigrateOptions = types.SystemMigrateOptions
type SystemCheckOptions = types.SystemCheckOptions
type SystemCheckReport = types.SystemCheckReport
type SystemDfOptions = types.SystemDfOptions
type SystemDfReport = types.SystemDfReport
type SystemDfImageReport = types.SystemDfImageReport


@@ -15,6 +15,28 @@ type ServiceOptions struct {
    URI string // Path to unix domain socket service should listen on
}

// SystemCheckOptions provides options for checking storage consistency.
type SystemCheckOptions struct {
    Quick                       bool           // skip the most time-intensive checks
    Repair                      bool           // remove damaged images
    RepairLossy                 bool           // remove damaged containers
    UnreferencedLayerMaximumAge *time.Duration // maximum allowed age for unreferenced layers
}

// SystemCheckReport provides a report of what a storage consistency check
// found, and if we removed anything that was damaged, what we removed.
type SystemCheckReport struct {
    Errors            bool                // any errors were detected
    Layers            map[string][]string // layer ID → what was detected
    ROLayers          map[string][]string // layer ID → what was detected
    RemovedLayers     []string            // layer ID
    Images            map[string][]string // image ID → what was detected
    ROImages          map[string][]string // image ID → what was detected
    RemovedImages     map[string][]string // image ID → names
    Containers        map[string][]string // container ID → what was detected
    RemovedContainers map[string]string   // container ID → name
}

// SystemPruneOptions provides options to prune system.
type SystemPruneOptions struct {
    All bool


@@ -337,3 +337,11 @@ func (ic ContainerEngine) Locks(ctx context.Context) (*entities.LocksReport, err
    report.LocksHeld = held
    return &report, nil
}

func (ic ContainerEngine) SystemCheck(ctx context.Context, options entities.SystemCheckOptions) (*entities.SystemCheckReport, error) {
    report, err := ic.Libpod.SystemCheck(ctx, options)
    if err != nil {
        return nil, err
    }

    return &report, nil
}


@@ -23,6 +23,15 @@ func (ic *ContainerEngine) SystemPrune(ctx context.Context, opts entities.System
    return system.Prune(ic.ClientCtx, options)
}

func (ic *ContainerEngine) SystemCheck(ctx context.Context, opts entities.SystemCheckOptions) (*entities.SystemCheckReport, error) {
    options := new(system.CheckOptions).WithQuick(opts.Quick).WithRepair(opts.Repair).WithRepairLossy(opts.RepairLossy)
    if opts.UnreferencedLayerMaximumAge != nil {
        duration := *opts.UnreferencedLayerMaximumAge
        options = options.WithUnreferencedLayerMaximumAge(duration.String())
    }
    return system.Check(ic.ClientCtx, options)
}

func (ic *ContainerEngine) Migrate(ctx context.Context, options entities.SystemMigrateOptions) error {
    return errors.New("runtime migration is not supported on remote clients")
}