From 5dfc44cad2feacb661be63eba326a6bc66a2e591 Mon Sep 17 00:00:00 2001
From: Nalin Dahyabhai <nalin@redhat.com>
Date: Wed, 14 Apr 2021 13:35:43 -0400
Subject: [PATCH 1/2] rmi: don't break when the image is missing a manifest

In libpod/image.Image.Remove(), if the attempt to find the image's
parent fails for any reason, log a warning and proceed as though it
didn't have one instead of failing, which would leave us unable to
remove the image without resetting everything.

In libpod/Runtime.RemoveImage(), if we can't determine if an image has
children, log a warning, and assume that it doesn't have any instead of
failing, which would leave us unable to remove the image without
resetting everything.

In pkg/domain/infra/abi.ImageEngine.Remove(), when attempting to remove
all images, if we encounter an error checking if a given image has
children, log a warning, and assume that it doesn't have any instead of
failing, which would leave us unable to remove the image without
resetting everything.

Signed-off-by: Nalin Dahyabhai <nalin@redhat.com>
---
 libpod/image/image.go               |   3 +-
 libpod/runtime_img.go               |   3 +-
 pkg/domain/infra/abi/images.go      |   4 +-
 test/system/330-corrupt-images.bats | 134 ++++++++++++++++++++++++++++
 4 files changed, 140 insertions(+), 4 deletions(-)
 create mode 100644 test/system/330-corrupt-images.bats

diff --git a/libpod/image/image.go b/libpod/image/image.go
index cecd64eb7f..2ead63c6f8 100644
--- a/libpod/image/image.go
+++ b/libpod/image/image.go
@@ -623,7 +623,8 @@ func (i *Image) TopLayer() string {
 func (i *Image) Remove(ctx context.Context, force bool) error {
 	parent, err := i.GetParent(ctx)
 	if err != nil {
-		return err
+		logrus.Warnf("error determining parent of image: %v, ignoring the error", err)
+		parent = nil
 	}
 	if _, err := i.imageruntime.store.DeleteImage(i.ID(), true); err != nil {
 		return err
diff --git a/libpod/runtime_img.go b/libpod/runtime_img.go
index e57890fa29..cb08a8c300 100644
--- a/libpod/runtime_img.go
+++ b/libpod/runtime_img.go
@@ -66,7 +66,8 @@ func (r *Runtime) RemoveImage(ctx context.Context, img *image.Image, force bool)
 
 	hasChildren, err := img.IsParent(ctx)
 	if err != nil {
-		return nil, err
+		logrus.Warnf("error determining if an image is a parent: %v, ignoring the error", err)
+		hasChildren = false
 	}
 
 	if (len(img.Names()) > 1 && !img.InputIsID()) || hasChildren {
diff --git a/pkg/domain/infra/abi/images.go b/pkg/domain/infra/abi/images.go
index 75ce518b6f..c5831ff989 100644
--- a/pkg/domain/infra/abi/images.go
+++ b/pkg/domain/infra/abi/images.go
@@ -657,8 +657,8 @@ func (ir *ImageEngine) Remove(ctx context.Context, images []string, opts entitie
 			for _, img := range storageImages {
 				isParent, err := img.IsParent(ctx)
 				if err != nil {
-					rmErrors = append(rmErrors, err)
-					continue
+					logrus.Warnf("%v, ignoring the error", err)
+					isParent = false
 				}
 				// Skip parent images.
 				if isParent {
diff --git a/test/system/330-corrupt-images.bats b/test/system/330-corrupt-images.bats
new file mode 100644
index 0000000000..9836de3630
--- /dev/null
+++ b/test/system/330-corrupt-images.bats
@@ -0,0 +1,134 @@
+#!/usr/bin/env bats   -*- bats -*-
+#
+# All tests in here perform nasty manipulations on image storage.
+#
+
+load helpers
+
+###############################################################################
+# BEGIN setup/teardown
+
+# Create a scratch directory; this is what we'll use for image store and cache
+if [ -z "${PODMAN_CORRUPT_TEST_WORKDIR}" ]; then
+    export PODMAN_CORRUPT_TEST_WORKDIR=$(mktemp -d --tmpdir=${BATS_TMPDIR:-${TMPDIR:-/tmp}} podman_corrupt_test.XXXXXX)
+fi
+
+PODMAN_CORRUPT_TEST_IMAGE_FQIN=quay.io/libpod/alpine@sha256:634a8f35b5f16dcf4aaa0822adc0b1964bb786fca12f6831de8ddc45e5986a00
+PODMAN_CORRUPT_TEST_IMAGE_ID=961769676411f082461f9ef46626dd7a2d1e2b2a38e6a44364bcbecf51e66dd4
+
+# All tests in this file (and ONLY in this file) run with a custom rootdir
+function setup() {
+    skip_if_remote "none of these tests run under podman-remote"
+    _PODMAN_TEST_OPTS="--root ${PODMAN_CORRUPT_TEST_WORKDIR}/root"
+}
+
+function teardown() {
+    # No other tests should ever run with this custom rootdir
+    unset _PODMAN_TEST_OPTS
+
+    is_remote && return
+
+    # Clean up
+    umount ${PODMAN_CORRUPT_TEST_WORKDIR}/root/overlay || true
+    if is_rootless; then
+        run_podman unshare rm -rf ${PODMAN_CORRUPT_TEST_WORKDIR}/root
+    else
+        rm -rf ${PODMAN_CORRUPT_TEST_WORKDIR}/root
+    fi
+}
+
+# END   setup/teardown
+###############################################################################
+# BEGIN primary test helper
+
+# This is our main action, invoked by every actual test. It:
+#    - creates a new empty rootdir
+#    - populates it with our crafted test image
+#    - removes [ manifest, blob ]
+#    - confirms that "podman images" throws an error
+#    - runs the specified command (rmi -a -f, prune, reset, etc)
+#    - confirms that it succeeds, and also emits expected warnings
+function _corrupt_image_test() {
+    # Run this test twice: once removing manifest, once removing blob
+    for what_to_rm in manifest blob; do
+        # I have no idea, but this sometimes remains mounted
+        umount ${PODMAN_CORRUPT_TEST_WORKDIR}/root/overlay || true
+        # Start with a fresh storage root, load prefetched image into it.
+        /bin/rm -rf ${PODMAN_CORRUPT_TEST_WORKDIR}/root
+        mkdir -p ${PODMAN_CORRUPT_TEST_WORKDIR}/root
+        run_podman load -i ${PODMAN_CORRUPT_TEST_WORKDIR}/img.tar
+        # "podman load" restores it without a tag, which (a) causes rmi-by-name
+        # to fail, and (b) causes "podman images" to exit 0 instead of 125
+        run_podman tag ${PODMAN_CORRUPT_TEST_IMAGE_ID} ${PODMAN_CORRUPT_TEST_IMAGE_FQIN}
+
+        # shortcut variable name
+        local id=${PODMAN_CORRUPT_TEST_IMAGE_ID}
+
+        case "$what_to_rm" in
+            manifest)  rm_path=manifest ;;
+            blob)      rm_path="=$(echo -n "sha256:$id" | base64 -w0)" ;;
+            *)         die "Internal error: unknown action '$what_to_rm'" ;;
+        esac
+
+        # Corruptify, and confirm that 'podman images' throws an error
+        rm -v ${PODMAN_CORRUPT_TEST_WORKDIR}/root/*-images/$id/${rm_path}
+        run_podman 125 images
+        is "$output" "Error: error retrieving label for image \"$id\": you may need to remove the image to resolve the error"
+
+        # Run the requested command. Confirm it succeeds, with suitable warnings
+        run_podman $*
+        is "$output" ".*error determining parent of image" \
+           "$* with missing $what_to_rm"
+
+        run_podman images -a --noheading
+        is "$output" "" "podman images -a, after $*, is empty"
+    done
+}
+
+# END   primary test helper
+###############################################################################
+# BEGIN first "test" does a one-time pull of our desired image
+
+@test "podman corrupt images - initialize" {
+    # Pull once, save cached copy.
+    run_podman pull $PODMAN_CORRUPT_TEST_IMAGE_FQIN
+    run_podman save -o ${PODMAN_CORRUPT_TEST_WORKDIR}/img.tar \
+               $PODMAN_CORRUPT_TEST_IMAGE_FQIN
+}
+
+# END   first "test" does a one-time pull of our desired image
+###############################################################################
+# BEGIN actual tests
+
+@test "podman corrupt images - rmi -f <image-id>" {
+    _corrupt_image_test "rmi -f ${PODMAN_CORRUPT_TEST_IMAGE_ID}"
+}
+
+@test "podman corrupt images - rmi -f <image-name>" {
+    _corrupt_image_test "rmi -f ${PODMAN_CORRUPT_TEST_IMAGE_FQIN}"
+}
+
+@test "podman corrupt images - rmi -f -a" {
+    _corrupt_image_test "rmi -f -a"
+}
+
+@test "podman corrupt images - image prune" {
+    _corrupt_image_test "image prune -a -f"
+}
+
+@test "podman corrupt images - system reset" {
+    _corrupt_image_test "image prune -a -f"
+}
+
+# END   actual tests
+###############################################################################
+# BEGIN final cleanup
+
+@test "podman corrupt images - cleanup" {
+    rm -rf ${PODMAN_CORRUPT_TEST_WORKDIR}
+}
+
+# END   final cleanup
+###############################################################################
+
+# vim: filetype=sh

From 6925683082d00f2b9679fc98adcdbf9ca65e4ec9 Mon Sep 17 00:00:00 2001
From: Nalin Dahyabhai <nalin@redhat.com>
Date: Thu, 29 Apr 2021 14:30:21 -0400
Subject: [PATCH 2/2] Post-pick adjustment

In libpod/image/layerTree.toOCI(), if we can't read the image's
configuration in OCI format, log the error and return a `nil` pointer
instead of returning the error.

2.2's default logging level is "error" instead of "warn" used in the
version the previous commit is backported from.  The error that causes
us to fail in 2.2 is also in a different location, so we need to add a
similar change for it, and update tests to expect the accompanying error
message.

Signed-off-by: Nalin Dahyabhai <nalin@redhat.com>
---
 libpod/image/layer_tree.go          | 7 +++++--
 test/system/330-corrupt-images.bats | 2 +-
 test/system/helpers.bash            | 2 +-
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/libpod/image/layer_tree.go b/libpod/image/layer_tree.go
index 18101575e6..cef8dadd67 100644
--- a/libpod/image/layer_tree.go
+++ b/libpod/image/layer_tree.go
@@ -5,6 +5,7 @@ import (
 
 	ociv1 "github.com/opencontainers/image-spec/specs-go/v1"
 	"github.com/pkg/errors"
+	"github.com/sirupsen/logrus"
 )
 
 // layerTree is an internal representation of local layers.
@@ -32,9 +33,11 @@ func (t *layerTree) toOCI(ctx context.Context, i *Image) (*ociv1.Image, error) {
 	oci, exists := t.ociCache[i.ID()]
 	if !exists {
 		oci, err = i.ociv1Image(ctx)
-		if err == nil {
-			t.ociCache[i.ID()] = oci
+		if err != nil {
+			logrus.Errorf("%v, ignoring the error", err)
+			return nil, nil
 		}
+		t.ociCache[i.ID()] = oci
 	}
 	return oci, err
 }
diff --git a/test/system/330-corrupt-images.bats b/test/system/330-corrupt-images.bats
index 9836de3630..049f84bb73 100644
--- a/test/system/330-corrupt-images.bats
+++ b/test/system/330-corrupt-images.bats
@@ -77,7 +77,7 @@ function _corrupt_image_test() {
 
         # Run the requested command. Confirm it succeeds, with suitable warnings
         run_podman $*
-        is "$output" ".*error determining parent of image" \
+        is "$output" ".*file does not exist, ignoring the error" \
            "$* with missing $what_to_rm"
 
         run_podman images -a --noheading
diff --git a/test/system/helpers.bash b/test/system/helpers.bash
index f5df85338e..ac9fa85090 100644
--- a/test/system/helpers.bash
+++ b/test/system/helpers.bash
@@ -149,7 +149,7 @@ function run_podman() {
     echo "$_LOG_PROMPT $PODMAN $*"
     # BATS hangs if a subprocess remains and keeps FD 3 open; this happens
     # if podman crashes unexpectedly without cleaning up subprocesses.
-    run timeout --foreground -v --kill=10 $PODMAN_TIMEOUT $PODMAN "$@" 3>/dev/null
+    run timeout --foreground -v --kill=10 $PODMAN_TIMEOUT $PODMAN $_PODMAN_TEST_OPTS "$@" 3>/dev/null
     # without "quotes", multiple lines are glommed together into one
     if [ -n "$output" ]; then
         echo "$output"