mirror of
https://github.com/containers/podman.git
synced 2025-05-19 16:18:51 +08:00
libpod: fix wait and exit-code logic
This commit addresses three intertwined bugs to fix an issue when using Gitlab runner on Podman. The three bug fixes are not split into separate commits as tests won't pass otherwise, which would add avoidable noise when bisecting future issues. 1) Podman conflated states: even when asking to wait for the `exited` state, Podman returned as soon as a container transitioned to `stopped`. The issue surfaced as failing Gitlab tests [1] because `conmon`'s buffers had not (yet) been emptied when attaching to a container right after a wait. The race window was extremely narrow, and I only managed to reproduce it with the Gitlab runner [1] unit tests. 2) The clearer separation between `exited` and `stopped` revealed a race condition predating the changes. If a container is configured for autoremoval (e.g., via `run --rm`), the "run" process competes with the "cleanup" process running in the background. The window of the race condition was sufficiently large that the "cleanup" process had already removed the container and storage before the "run" process could read the exit code, and the "run" process hence waited indefinitely. Address the exit-code race condition by recording exit codes in the main libpod database. Exit codes can now be read from a database. When waiting for a container to exit, Podman first waits for the container to transition to `exited` and will then query the database for its exit code. Outdated exit codes are pruned during cleanup (i.e., non-performance critical) and when refreshing the database after a reboot. An exit code is considered outdated when it is older than 5 minutes. While the race condition predates this change, the waiting process has apparently always been fast enough in catching the exit code due to issue 1): `exited` and `stopped` were conflated. The waiting process hence caught the exit code after the container transitioned to `stopped` but before it `exited` and got removed. 
3) With 1) and 2), Podman is now waiting for a container to properly transition to the `exited` state. Some tests did not pass after 1) and 2) which revealed the third bug: `conmon` was executed with its working directory pointing to the OCI runtime bundle of the container. The changed working directory broke resolving relative paths in the "cleanup" process. The "cleanup" process errored before actually cleaning up the container, and the waiting "main" process ran indefinitely - or until hitting a timeout. Fix the issue by executing `conmon` with the same working directory as Podman. Note that fixing 3) *may* address a number of issues we have seen in the past where for *some* reason cleanup processes did not fire. [1] https://gitlab.com/gitlab-org/gitlab-runner/-/issues/27119#note_970712864 Signed-off-by: Valentin Rothberg <vrothberg@redhat.com> [MH: Minor reword of commit message] Signed-off-by: Matthew Heon <mheon@redhat.com>
This commit is contained in:

committed by
Matthew Heon

parent
15188dce05
commit
30e7cbccc1
@ -5,8 +5,10 @@ import (
|
|||||||
"fmt"
|
"fmt"
|
||||||
"net"
|
"net"
|
||||||
"os"
|
"os"
|
||||||
|
"strconv"
|
||||||
"strings"
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
|
"time"
|
||||||
|
|
||||||
"github.com/containers/common/libnetwork/types"
|
"github.com/containers/common/libnetwork/types"
|
||||||
"github.com/containers/podman/v4/libpod/define"
|
"github.com/containers/podman/v4/libpod/define"
|
||||||
@ -63,6 +65,13 @@ type BoltState struct {
|
|||||||
// initially created the database. This must match for any further instances
|
// initially created the database. This must match for any further instances
|
||||||
// that access the database, to ensure that state mismatches with
|
// that access the database, to ensure that state mismatches with
|
||||||
// containers/storage do not occur.
|
// containers/storage do not occur.
|
||||||
|
// - exitCodeBucket/exitCodeTimeStampBucket: (#14559) exit codes must be part
|
||||||
|
// of the database to resolve a previous race condition when one process waits
|
||||||
|
// for the exit file to be written and another process removes it along with
|
||||||
|
// the container during auto-removal. The same race would happen trying to
|
||||||
|
// read the exit code from the containers bucket. Hence, exit codes go into
|
||||||
|
// their own bucket. To avoid the rather expensive JSON (un)marshaling, we
|
||||||
|
// have two buckets: one for the exit codes, the other for the timestamps.
|
||||||
|
|
||||||
// NewBoltState creates a new bolt-backed state database
|
// NewBoltState creates a new bolt-backed state database
|
||||||
func NewBoltState(path string, runtime *Runtime) (State, error) {
|
func NewBoltState(path string, runtime *Runtime) (State, error) {
|
||||||
@ -98,6 +107,8 @@ func NewBoltState(path string, runtime *Runtime) (State, error) {
|
|||||||
allVolsBkt,
|
allVolsBkt,
|
||||||
execBkt,
|
execBkt,
|
||||||
runtimeConfigBkt,
|
runtimeConfigBkt,
|
||||||
|
exitCodeBkt,
|
||||||
|
exitCodeTimeStampBkt,
|
||||||
}
|
}
|
||||||
|
|
||||||
// Does the DB need an update?
|
// Does the DB need an update?
|
||||||
@ -150,6 +161,10 @@ func (s *BoltState) Refresh() error {
|
|||||||
return define.ErrDBClosed
|
return define.ErrDBClosed
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if err := s.PruneContainerExitCodes(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
db, err := s.getDBCon()
|
db, err := s.getDBCon()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@ -1341,6 +1356,166 @@ func (s *BoltState) GetContainerConfig(id string) (*ContainerConfig, error) {
|
|||||||
return config, nil
|
return config, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// AddContainerExitCode adds the exit code for the specified container to the database.
|
||||||
|
func (s *BoltState) AddContainerExitCode(id string, exitCode int32) error {
|
||||||
|
db, err := s.getDBCon()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer s.deferredCloseDBCon(db)
|
||||||
|
|
||||||
|
rawID := []byte(id)
|
||||||
|
rawExitCode := []byte(strconv.Itoa(int(exitCode)))
|
||||||
|
rawTimeStamp, err := time.Now().MarshalText()
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("marshaling exit-code time stamp: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return db.Update(func(tx *bolt.Tx) error {
|
||||||
|
exitCodeBucket, err := getExitCodeBucket(tx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
timeStampBucket, err := getExitCodeTimeStampBucket(tx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := exitCodeBucket.Put(rawID, rawExitCode); err != nil {
|
||||||
|
return fmt.Errorf("adding exit code of container %s to DB: %w", id, err)
|
||||||
|
}
|
||||||
|
if err := timeStampBucket.Put(rawID, rawTimeStamp); err != nil {
|
||||||
|
if rmErr := exitCodeBucket.Delete(rawID); rmErr != nil {
|
||||||
|
logrus.Errorf("Removing exit code of container %s from DB: %v", id, rmErr)
|
||||||
|
}
|
||||||
|
return fmt.Errorf("adding exit-code time stamp of container %s to DB: %w", id, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetContainerExitCode returns the exit code for the specified container.
|
||||||
|
func (s *BoltState) GetContainerExitCode(id string) (int32, error) {
|
||||||
|
db, err := s.getDBCon()
|
||||||
|
if err != nil {
|
||||||
|
return -1, err
|
||||||
|
}
|
||||||
|
defer s.deferredCloseDBCon(db)
|
||||||
|
|
||||||
|
rawID := []byte(id)
|
||||||
|
result := int32(-1)
|
||||||
|
return result, db.View(func(tx *bolt.Tx) error {
|
||||||
|
exitCodeBucket, err := getExitCodeBucket(tx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
rawExitCode := exitCodeBucket.Get(rawID)
|
||||||
|
if rawExitCode == nil {
|
||||||
|
return fmt.Errorf("getting exit code of container %s from DB: %w", id, define.ErrNoSuchExitCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
exitCode, err := strconv.Atoi(string(rawExitCode))
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("converting raw exit code %v of container %s: %w", rawExitCode, id, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
result = int32(exitCode)
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// GetContainerExitCodeTimeStamp returns the time stamp when the exit code of
|
||||||
|
// the specified container was added to the database.
|
||||||
|
func (s *BoltState) GetContainerExitCodeTimeStamp(id string) (*time.Time, error) {
|
||||||
|
db, err := s.getDBCon()
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer s.deferredCloseDBCon(db)
|
||||||
|
|
||||||
|
rawID := []byte(id)
|
||||||
|
var result time.Time
|
||||||
|
return &result, db.View(func(tx *bolt.Tx) error {
|
||||||
|
timeStampBucket, err := getExitCodeTimeStampBucket(tx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
rawTimeStamp := timeStampBucket.Get(rawID)
|
||||||
|
if rawTimeStamp == nil {
|
||||||
|
return fmt.Errorf("getting exit-code time stamp of container %s from DB: %w", id, define.ErrNoSuchExitCode)
|
||||||
|
}
|
||||||
|
|
||||||
|
if err := result.UnmarshalText(rawTimeStamp); err != nil {
|
||||||
|
return fmt.Errorf("converting raw time stamp %v of container %s from DB: %w", rawTimeStamp, id, err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// PrunExitCodes removes exit codes older than 5 minutes.
|
||||||
|
func (s *BoltState) PruneContainerExitCodes() error {
|
||||||
|
db, err := s.getDBCon()
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
defer s.deferredCloseDBCon(db)
|
||||||
|
|
||||||
|
threshold := time.Minute * 5
|
||||||
|
return db.View(func(tx *bolt.Tx) error {
|
||||||
|
timeStampBucket, err := getExitCodeTimeStampBucket(tx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
return timeStampBucket.ForEach(func(rawID, rawTimeStamp []byte) error {
|
||||||
|
var timeStamp time.Time
|
||||||
|
if err := timeStamp.UnmarshalText(rawTimeStamp); err != nil {
|
||||||
|
return fmt.Errorf("converting raw time stamp %v of container %s from DB: %w", rawTimeStamp, string(rawID), err)
|
||||||
|
}
|
||||||
|
if time.Since(timeStamp) > threshold {
|
||||||
|
// Since the DB connection is locked, pass it
|
||||||
|
// to remove the exit codes to avoid race
|
||||||
|
// conditions.
|
||||||
|
return s.removeContainerExitCode(rawID, db)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
// removeContainerExitCode removes the exit code and time stamp of the specified container.
|
||||||
|
func (s *BoltState) removeContainerExitCode(rawID []byte, db *bolt.DB) error {
|
||||||
|
return db.Update(func(tx *bolt.Tx) error {
|
||||||
|
exitCodeBucket, err := getExitCodeBucket(tx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
timeStampBucket, err := getExitCodeTimeStampBucket(tx)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
var finalErr error
|
||||||
|
if err := exitCodeBucket.Delete(rawID); err != nil {
|
||||||
|
finalErr = fmt.Errorf("removing exit code of container %s from DB: %w", string(rawID), err)
|
||||||
|
}
|
||||||
|
if err := timeStampBucket.Delete(rawID); err != nil {
|
||||||
|
err = fmt.Errorf("removing exit-code time stamp of container %s from DB: %w", string(rawID), err)
|
||||||
|
if finalErr != nil {
|
||||||
|
logrus.Error(err)
|
||||||
|
} else {
|
||||||
|
finalErr = err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return finalErr
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
// AddExecSession adds an exec session to the state.
|
// AddExecSession adds an exec session to the state.
|
||||||
func (s *BoltState) AddExecSession(ctr *Container, session *ExecSession) error {
|
func (s *BoltState) AddExecSession(ctr *Container, session *ExecSession) error {
|
||||||
if !s.valid {
|
if !s.valid {
|
||||||
|
@ -29,6 +29,9 @@ const (
|
|||||||
aliasesName = "aliases"
|
aliasesName = "aliases"
|
||||||
runtimeConfigName = "runtime-config"
|
runtimeConfigName = "runtime-config"
|
||||||
|
|
||||||
|
exitCodeName = "exit-code"
|
||||||
|
exitCodeTimeStampName = "exit-code-time-stamp"
|
||||||
|
|
||||||
configName = "config"
|
configName = "config"
|
||||||
stateName = "state"
|
stateName = "state"
|
||||||
dependenciesName = "dependencies"
|
dependenciesName = "dependencies"
|
||||||
@ -65,6 +68,9 @@ var (
|
|||||||
volDependenciesBkt = []byte(volCtrDependencies)
|
volDependenciesBkt = []byte(volCtrDependencies)
|
||||||
networksBkt = []byte(networksName)
|
networksBkt = []byte(networksName)
|
||||||
|
|
||||||
|
exitCodeBkt = []byte(exitCodeName)
|
||||||
|
exitCodeTimeStampBkt = []byte(exitCodeTimeStampName)
|
||||||
|
|
||||||
configKey = []byte(configName)
|
configKey = []byte(configName)
|
||||||
stateKey = []byte(stateName)
|
stateKey = []byte(stateName)
|
||||||
netNSKey = []byte(netNSName)
|
netNSKey = []byte(netNSName)
|
||||||
@ -362,6 +368,22 @@ func getRuntimeConfigBucket(tx *bolt.Tx) (*bolt.Bucket, error) {
|
|||||||
return bkt, nil
|
return bkt, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getExitCodeBucket(tx *bolt.Tx) (*bolt.Bucket, error) {
|
||||||
|
bkt := tx.Bucket(exitCodeBkt)
|
||||||
|
if bkt == nil {
|
||||||
|
return nil, errors.Wrapf(define.ErrDBBadConfig, "exit-code container bucket not found in DB")
|
||||||
|
}
|
||||||
|
return bkt, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func getExitCodeTimeStampBucket(tx *bolt.Tx) (*bolt.Bucket, error) {
|
||||||
|
bkt := tx.Bucket(exitCodeTimeStampBkt)
|
||||||
|
if bkt == nil {
|
||||||
|
return nil, errors.Wrapf(define.ErrDBBadConfig, "exit-code time stamp bucket not found in DB")
|
||||||
|
}
|
||||||
|
return bkt, nil
|
||||||
|
}
|
||||||
|
|
||||||
func (s *BoltState) getContainerConfigFromDB(id []byte, config *ContainerConfig, ctrsBkt *bolt.Bucket) error {
|
func (s *BoltState) getContainerConfigFromDB(id []byte, config *ContainerConfig, ctrsBkt *bolt.Bucket) error {
|
||||||
ctrBkt := ctrsBkt.Bucket(id)
|
ctrBkt := ctrsBkt.Bucket(id)
|
||||||
if ctrBkt == nil {
|
if ctrBkt == nil {
|
||||||
|
@ -2,6 +2,7 @@ package libpod
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
|
"fmt"
|
||||||
"io"
|
"io"
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"net/http"
|
"net/http"
|
||||||
@ -490,41 +491,84 @@ func (c *Container) RemoveArtifact(name string) error {
|
|||||||
|
|
||||||
// Wait blocks until the container exits and returns its exit code.
|
// Wait blocks until the container exits and returns its exit code.
|
||||||
func (c *Container) Wait(ctx context.Context) (int32, error) {
|
func (c *Container) Wait(ctx context.Context) (int32, error) {
|
||||||
return c.WaitWithInterval(ctx, DefaultWaitInterval)
|
return c.WaitForExit(ctx, DefaultWaitInterval)
|
||||||
}
|
}
|
||||||
|
|
||||||
// WaitWithInterval blocks until the container to exit and returns its exit
|
// WaitForExit blocks until the container exits and returns its exit code. The
|
||||||
// code. The argument is the interval at which checks the container's status.
|
// argument is the interval at which the container's status is checked.
|
||||||
func (c *Container) WaitWithInterval(ctx context.Context, waitTimeout time.Duration) (int32, error) {
|
func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration) (int32, error) {
|
||||||
if !c.valid {
|
if !c.valid {
|
||||||
return -1, define.ErrCtrRemoved
|
return -1, define.ErrCtrRemoved
|
||||||
}
|
}
|
||||||
|
|
||||||
exitFile, err := c.exitFilePath()
|
id := c.ID()
|
||||||
if err != nil {
|
var conmonTimer time.Timer
|
||||||
return -1, err
|
conmonTimerSet := false
|
||||||
}
|
|
||||||
chWait := make(chan error, 1)
|
|
||||||
|
|
||||||
go func() {
|
getExitCode := func() (bool, int32, error) {
|
||||||
<-ctx.Done()
|
containerRemoved := false
|
||||||
chWait <- define.ErrCanceled
|
if !c.batched {
|
||||||
}()
|
c.lock.Lock()
|
||||||
|
defer c.lock.Unlock()
|
||||||
for {
|
|
||||||
// ignore errors here (with exception of cancellation), it is only used to avoid waiting
|
|
||||||
// too long.
|
|
||||||
_, e := WaitForFile(exitFile, chWait, waitTimeout)
|
|
||||||
if e == define.ErrCanceled {
|
|
||||||
return -1, define.ErrCanceled
|
|
||||||
}
|
}
|
||||||
|
|
||||||
stopped, code, err := c.isStopped()
|
if err := c.syncContainer(); err != nil {
|
||||||
|
if !errors.Is(err, define.ErrNoSuchCtr) {
|
||||||
|
return false, -1, err
|
||||||
|
}
|
||||||
|
containerRemoved = true
|
||||||
|
}
|
||||||
|
|
||||||
|
// If conmon is not alive anymore set a timer to make sure
|
||||||
|
// we're returning even if conmon has forcefully been killed.
|
||||||
|
if !conmonTimerSet && !containerRemoved {
|
||||||
|
conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
|
||||||
|
switch {
|
||||||
|
case errors.Is(err, define.ErrNoSuchCtr):
|
||||||
|
containerRemoved = true
|
||||||
|
case err != nil:
|
||||||
|
return false, -1, err
|
||||||
|
case !conmonAlive:
|
||||||
|
timerDuration := time.Second * 20
|
||||||
|
conmonTimer = *time.NewTimer(timerDuration)
|
||||||
|
conmonTimerSet = true
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if !containerRemoved {
|
||||||
|
// If conmon is dead for more than $timerDuration or if the
|
||||||
|
// container has exited properly, try to look up the exit code.
|
||||||
|
select {
|
||||||
|
case <-conmonTimer.C:
|
||||||
|
logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id)
|
||||||
|
default:
|
||||||
|
if !c.ensureState(define.ContainerStateExited, define.ContainerStateConfigured) {
|
||||||
|
return false, -1, nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
exitCode, err := c.runtime.state.GetContainerExitCode(id)
|
||||||
|
if err != nil {
|
||||||
|
return true, -1, err
|
||||||
|
}
|
||||||
|
|
||||||
|
return true, exitCode, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
for {
|
||||||
|
hasExited, exitCode, err := getExitCode()
|
||||||
|
if hasExited {
|
||||||
|
return exitCode, err
|
||||||
|
}
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return -1, err
|
return -1, err
|
||||||
}
|
}
|
||||||
if stopped {
|
select {
|
||||||
return code, nil
|
case <-ctx.Done():
|
||||||
|
return -1, fmt.Errorf("waiting for exit code of container %s canceled", id)
|
||||||
|
default:
|
||||||
|
time.Sleep(pollInterval)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -551,11 +595,12 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou
|
|||||||
wantedStates := make(map[define.ContainerStatus]bool, len(conditions))
|
wantedStates := make(map[define.ContainerStatus]bool, len(conditions))
|
||||||
|
|
||||||
for _, condition := range conditions {
|
for _, condition := range conditions {
|
||||||
if condition == define.ContainerStateStopped || condition == define.ContainerStateExited {
|
switch condition {
|
||||||
|
case define.ContainerStateExited, define.ContainerStateStopped:
|
||||||
waitForExit = true
|
waitForExit = true
|
||||||
continue
|
default:
|
||||||
|
wantedStates[condition] = true
|
||||||
}
|
}
|
||||||
wantedStates[condition] = true
|
|
||||||
}
|
}
|
||||||
|
|
||||||
trySend := func(code int32, err error) {
|
trySend := func(code int32, err error) {
|
||||||
@ -572,7 +617,7 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou
|
|||||||
go func() {
|
go func() {
|
||||||
defer wg.Done()
|
defer wg.Done()
|
||||||
|
|
||||||
code, err := c.WaitWithInterval(ctx, waitTimeout)
|
code, err := c.WaitForExit(ctx, waitTimeout)
|
||||||
trySend(code, err)
|
trySend(code, err)
|
||||||
}()
|
}()
|
||||||
}
|
}
|
||||||
|
@ -219,7 +219,7 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error {
|
|||||||
// Write an event for the container's death
|
// Write an event for the container's death
|
||||||
c.newContainerExitedEvent(c.state.ExitCode)
|
c.newContainerExitedEvent(c.state.ExitCode)
|
||||||
|
|
||||||
return nil
|
return c.runtime.state.AddContainerExitCode(c.ID(), c.state.ExitCode)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Container) shouldRestart() bool {
|
func (c *Container) shouldRestart() bool {
|
||||||
@ -784,20 +784,6 @@ func (c *Container) getArtifactPath(name string) string {
|
|||||||
return filepath.Join(c.config.StaticDir, artifactsDir, name)
|
return filepath.Join(c.config.StaticDir, artifactsDir, name)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used with Wait() to determine if a container has exited
|
|
||||||
func (c *Container) isStopped() (bool, int32, error) {
|
|
||||||
if !c.batched {
|
|
||||||
c.lock.Lock()
|
|
||||||
defer c.lock.Unlock()
|
|
||||||
}
|
|
||||||
err := c.syncContainer()
|
|
||||||
if err != nil {
|
|
||||||
return true, -1, err
|
|
||||||
}
|
|
||||||
|
|
||||||
return !c.ensureState(define.ContainerStateRunning, define.ContainerStatePaused, define.ContainerStateStopping), c.state.ExitCode, nil
|
|
||||||
}
|
|
||||||
|
|
||||||
// save container state to the database
|
// save container state to the database
|
||||||
func (c *Container) save() error {
|
func (c *Container) save() error {
|
||||||
if err := c.runtime.state.SaveContainer(c); err != nil {
|
if err := c.runtime.state.SaveContainer(c); err != nil {
|
||||||
@ -1282,13 +1268,6 @@ func (c *Container) stop(timeout uint) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if conmon is still alive.
|
|
||||||
// If it is not, we won't be getting an exit file.
|
|
||||||
conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
// Set the container state to "stopping" and unlock the container
|
// Set the container state to "stopping" and unlock the container
|
||||||
// before handing it over to conmon to unblock other commands. #8501
|
// before handing it over to conmon to unblock other commands. #8501
|
||||||
// demonstrates nicely that a high stop timeout will block even simple
|
// demonstrates nicely that a high stop timeout will block even simple
|
||||||
@ -1341,21 +1320,18 @@ func (c *Container) stop(timeout uint) error {
|
|||||||
}
|
}
|
||||||
|
|
||||||
c.newContainerEvent(events.Stop)
|
c.newContainerEvent(events.Stop)
|
||||||
|
|
||||||
c.state.PID = 0
|
|
||||||
c.state.ConmonPID = 0
|
|
||||||
c.state.StoppedByUser = true
|
c.state.StoppedByUser = true
|
||||||
|
|
||||||
|
conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
if !conmonAlive {
|
if !conmonAlive {
|
||||||
// Conmon is dead, so we can't expect an exit code.
|
if err := c.checkExitFile(); err != nil {
|
||||||
c.state.ExitCode = -1
|
return err
|
||||||
c.state.FinishedTime = time.Now()
|
|
||||||
c.state.State = define.ContainerStateStopped
|
|
||||||
if err := c.save(); err != nil {
|
|
||||||
logrus.Errorf("Saving container %s status: %v", c.ID(), err)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return errors.Wrapf(define.ErrConmonDead, "container %s conmon process missing, cannot retrieve exit code", c.ID())
|
return c.save()
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := c.save(); err != nil {
|
if err := c.save(); err != nil {
|
||||||
@ -1939,6 +1915,18 @@ func (c *Container) cleanup(ctx context.Context) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Prune the exit codes of other containers during clean up.
|
||||||
|
// Since Podman is not a daemon, we have to clean them up somewhere.
|
||||||
|
// Cleanup seems like a good place as it's not performance
|
||||||
|
// critical.
|
||||||
|
if err := c.runtime.state.PruneContainerExitCodes(); err != nil {
|
||||||
|
if lastError == nil {
|
||||||
|
lastError = err
|
||||||
|
} else {
|
||||||
|
logrus.Errorf("Pruning container exit codes: %v", err)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return lastError
|
return lastError
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -24,6 +24,10 @@ var (
|
|||||||
// not exist.
|
// not exist.
|
||||||
ErrNoSuchExecSession = errors.New("no such exec session")
|
ErrNoSuchExecSession = errors.New("no such exec session")
|
||||||
|
|
||||||
|
// ErrNoSuchExitCode indicates that the requested container exit code
|
||||||
|
// does not exist.
|
||||||
|
ErrNoSuchExitCode = errors.New("no such exit code")
|
||||||
|
|
||||||
// ErrDepExists indicates that the current object has dependencies and
|
// ErrDepExists indicates that the current object has dependencies and
|
||||||
// cannot be removed before them.
|
// cannot be removed before them.
|
||||||
ErrDepExists = errors.New("dependency exists")
|
ErrDepExists = errors.New("dependency exists")
|
||||||
|
@ -151,6 +151,9 @@ func (r *Runtime) GetEvents(ctx context.Context, filters []string) ([]*events.Ev
|
|||||||
// GetLastContainerEvent takes a container name or ID and an event status and returns
|
// GetLastContainerEvent takes a container name or ID and an event status and returns
|
||||||
// the last occurrence of the container event
|
// the last occurrence of the container event
|
||||||
func (r *Runtime) GetLastContainerEvent(ctx context.Context, nameOrID string, containerEvent events.Status) (*events.Event, error) {
|
func (r *Runtime) GetLastContainerEvent(ctx context.Context, nameOrID string, containerEvent events.Status) (*events.Event, error) {
|
||||||
|
// FIXME: events should be read in reverse order!
|
||||||
|
// https://github.com/containers/podman/issues/14579
|
||||||
|
|
||||||
// check to make sure the event.Status is valid
|
// check to make sure the event.Status is valid
|
||||||
if _, err := events.StringToStatus(containerEvent.String()); err != nil {
|
if _, err := events.StringToStatus(containerEvent.String()); err != nil {
|
||||||
return nil, err
|
return nil, err
|
||||||
|
@ -264,11 +264,6 @@ func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *Conta
|
|||||||
// status, but will instead only check for the existence of the conmon exit file
|
// status, but will instead only check for the existence of the conmon exit file
|
||||||
// and update state to stopped if it exists.
|
// and update state to stopped if it exists.
|
||||||
func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error {
|
func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error {
|
||||||
exitFile, err := r.ExitFilePath(ctr)
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
runtimeDir, err := util.GetRuntimeDir()
|
runtimeDir, err := util.GetRuntimeDir()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
@ -340,22 +335,10 @@ func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error {
|
|||||||
// Only grab exit status if we were not already stopped
|
// Only grab exit status if we were not already stopped
|
||||||
// If we were, it should already be in the database
|
// If we were, it should already be in the database
|
||||||
if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped {
|
if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped {
|
||||||
var fi os.FileInfo
|
if _, err := ctr.Wait(context.Background()); err != nil {
|
||||||
chWait := make(chan error)
|
logrus.Errorf("Waiting for container %s to exit: %v", ctr.ID(), err)
|
||||||
defer close(chWait)
|
|
||||||
|
|
||||||
_, err := WaitForFile(exitFile, chWait, time.Second*5)
|
|
||||||
if err == nil {
|
|
||||||
fi, err = os.Stat(exitFile)
|
|
||||||
}
|
}
|
||||||
if err != nil {
|
return nil
|
||||||
ctr.state.ExitCode = -1
|
|
||||||
ctr.state.FinishedTime = time.Now()
|
|
||||||
logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err)
|
|
||||||
return nil
|
|
||||||
}
|
|
||||||
|
|
||||||
return ctr.handleExitFile(exitFile, fi)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle ContainerStateStopping - keep it unless the container
|
// Handle ContainerStateStopping - keep it unless the container
|
||||||
@ -1166,7 +1149,6 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
|
|||||||
}).Debugf("running conmon: %s", r.conmonPath)
|
}).Debugf("running conmon: %s", r.conmonPath)
|
||||||
|
|
||||||
cmd := exec.Command(r.conmonPath, args...)
|
cmd := exec.Command(r.conmonPath, args...)
|
||||||
cmd.Dir = ctr.bundlePath()
|
|
||||||
cmd.SysProcAttr = &syscall.SysProcAttr{
|
cmd.SysProcAttr = &syscall.SysProcAttr{
|
||||||
Setpgid: true,
|
Setpgid: true,
|
||||||
}
|
}
|
||||||
|
@ -111,6 +111,15 @@ type State interface {
|
|||||||
// Return a container config from the database by full ID
|
// Return a container config from the database by full ID
|
||||||
GetContainerConfig(id string) (*ContainerConfig, error)
|
GetContainerConfig(id string) (*ContainerConfig, error)
|
||||||
|
|
||||||
|
// Add the exit code for the specified container to the database.
|
||||||
|
AddContainerExitCode(id string, exitCode int32) error
|
||||||
|
|
||||||
|
// Return the exit code for the specified container.
|
||||||
|
GetContainerExitCode(id string) (int32, error)
|
||||||
|
|
||||||
|
// Remove exit codes older than 5 minutes.
|
||||||
|
PruneContainerExitCodes() error
|
||||||
|
|
||||||
// Add creates a reference to an exec session in the database.
|
// Add creates a reference to an exec session in the database.
|
||||||
// The container the exec session is attached to will be recorded.
|
// The container the exec session is attached to will be recorded.
|
||||||
// The container state will not be modified.
|
// The container state will not be modified.
|
||||||
|
@ -191,7 +191,6 @@ func waitDockerCondition(ctx context.Context, containerName string, interval tim
|
|||||||
var notRunningStates = []define.ContainerStatus{
|
var notRunningStates = []define.ContainerStatus{
|
||||||
define.ContainerStateCreated,
|
define.ContainerStateCreated,
|
||||||
define.ContainerStateRemoving,
|
define.ContainerStateRemoving,
|
||||||
define.ContainerStateStopped,
|
|
||||||
define.ContainerStateExited,
|
define.ContainerStateExited,
|
||||||
define.ContainerStateConfigured,
|
define.ContainerStateConfigured,
|
||||||
}
|
}
|
||||||
|
@ -16,7 +16,6 @@ import (
|
|||||||
"github.com/containers/image/v5/manifest"
|
"github.com/containers/image/v5/manifest"
|
||||||
"github.com/containers/podman/v4/libpod"
|
"github.com/containers/podman/v4/libpod"
|
||||||
"github.com/containers/podman/v4/libpod/define"
|
"github.com/containers/podman/v4/libpod/define"
|
||||||
"github.com/containers/podman/v4/libpod/events"
|
|
||||||
"github.com/containers/podman/v4/libpod/logs"
|
"github.com/containers/podman/v4/libpod/logs"
|
||||||
"github.com/containers/podman/v4/pkg/checkpoint"
|
"github.com/containers/podman/v4/pkg/checkpoint"
|
||||||
"github.com/containers/podman/v4/pkg/domain/entities"
|
"github.com/containers/podman/v4/pkg/domain/entities"
|
||||||
@ -939,6 +938,7 @@ func (ic *ContainerEngine) ContainerStart(ctx context.Context, namesOrIds []stri
|
|||||||
}
|
}
|
||||||
return reports, errors.Wrapf(err, "unable to start container %s", ctr.ID())
|
return reports, errors.Wrapf(err, "unable to start container %s", ctr.ID())
|
||||||
}
|
}
|
||||||
|
|
||||||
exitCode = ic.GetContainerExitCode(ctx, ctr)
|
exitCode = ic.GetContainerExitCode(ctx, ctr)
|
||||||
reports = append(reports, &entities.ContainerStartReport{
|
reports = append(reports, &entities.ContainerStartReport{
|
||||||
Id: ctr.ID(),
|
Id: ctr.ID(),
|
||||||
@ -1099,25 +1099,11 @@ func (ic *ContainerEngine) ContainerRun(ctx context.Context, opts entities.Conta
|
|||||||
|
|
||||||
func (ic *ContainerEngine) GetContainerExitCode(ctx context.Context, ctr *libpod.Container) int {
|
func (ic *ContainerEngine) GetContainerExitCode(ctx context.Context, ctr *libpod.Container) int {
|
||||||
exitCode, err := ctr.Wait(ctx)
|
exitCode, err := ctr.Wait(ctx)
|
||||||
if err == nil {
|
if err != nil {
|
||||||
return int(exitCode)
|
logrus.Errorf("Waiting for container %s: %v", ctr.ID(), err)
|
||||||
}
|
|
||||||
if errors.Cause(err) != define.ErrNoSuchCtr {
|
|
||||||
logrus.Errorf("Could not retrieve exit code: %v", err)
|
|
||||||
return define.ExecErrorCodeNotFound
|
return define.ExecErrorCodeNotFound
|
||||||
}
|
}
|
||||||
// Make 4 attempt with 0.25s backoff between each for 1 second total
|
return int(exitCode)
|
||||||
var event *events.Event
|
|
||||||
for i := 0; i < 4; i++ {
|
|
||||||
event, err = ic.Libpod.GetLastContainerEvent(ctx, ctr.ID(), events.Exited)
|
|
||||||
if err != nil {
|
|
||||||
time.Sleep(250 * time.Millisecond)
|
|
||||||
continue
|
|
||||||
}
|
|
||||||
return event.ContainerExitCode
|
|
||||||
}
|
|
||||||
logrus.Errorf("Could not retrieve exit code from event: %v", err)
|
|
||||||
return define.ExecErrorCodeNotFound
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func (ic *ContainerEngine) ContainerLogs(ctx context.Context, containers []string, options entities.ContainerLogsOptions) error {
|
func (ic *ContainerEngine) ContainerLogs(ctx context.Context, containers []string, options entities.ContainerLogsOptions) error {
|
||||||
|
Reference in New Issue
Block a user