From c726cf81063eb26c6ffbbcfcc661fad3779bfc3a Mon Sep 17 00:00:00 2001 From: Paul Holzinger Date: Thu, 17 Aug 2023 10:40:02 +0200 Subject: [PATCH] libpod: improve conmon error handling When conmon is started it blocks and waits for us to signal it to start via pipe. This works, but when conmon exits before it waits for the start message it causes podman to fail with `write child: broken pipe`. This error is meaningless to podman users. The real error is that conmon failed, so we should not return early if we fail to send the start message to conmon. Instead, ignore the EPIPE error case, as it is safe to assume that conmon died, and for other errors we make sure to kill conmon so that the following wait() call does not hang forever. This also fixes problems with having conmon zombie processes leaked as wait() was never called. Signed-off-by: Paul Holzinger --- libpod/oci_conmon_common.go | 10 +++++++++- libpod/oci_conmon_linux.go | 5 +---- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/libpod/oci_conmon_common.go b/libpod/oci_conmon_common.go index 6b0d6bc679..f315d94397 100644 --- a/libpod/oci_conmon_common.go +++ b/libpod/oci_conmon_common.go @@ -1261,8 +1261,16 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co return 0, err } if err := r.moveConmonToCgroupAndSignal(ctr, cmd, parentStartPipe); err != nil { - return 0, err + // The child likely already exited in which case the cmd.Wait() below should return the proper error. + // EPIPE is expected if the child already exited so not worth to log and kill the process.
+ if !errors.Is(err, syscall.EPIPE) { + logrus.Errorf("Failed to signal conmon to start: %v", err) + if err := cmd.Process.Kill(); err != nil && !errors.Is(err, syscall.ESRCH) { + logrus.Errorf("Failed to kill conmon after error: %v", err) + } + } } + /* Wait for initial setup and fork, and reap child */ err = cmd.Wait() if err != nil { diff --git a/libpod/oci_conmon_linux.go b/libpod/oci_conmon_linux.go index 9819c83bae..a3a552bc64 100644 --- a/libpod/oci_conmon_linux.go +++ b/libpod/oci_conmon_linux.go @@ -162,10 +162,7 @@ func (r *ConmonOCIRuntime) moveConmonToCgroupAndSignal(ctr *Container, cmd *exec } /* We set the cgroup, now the child can start creating children */ - if err := writeConmonPipeData(startFd); err != nil { - return err - } - return nil + return writeConmonPipeData(startFd) } // GetLimits converts spec resource limits to cgroup consumable limits