Merge pull request #14685 from mheon/gitlab

Follow-on to #14559
openshift-ci[bot]
2022-06-27 10:58:10 +00:00
committed by GitHub
11 changed files with 388 additions and 112 deletions

View File

@@ -5,8 +5,10 @@ import (
"fmt"
"net"
"os"
"strconv"
"strings"
"sync"
"time"
"github.com/containers/common/libnetwork/types"
"github.com/containers/podman/v4/libpod/define"
@@ -63,6 +65,13 @@ type BoltState struct {
// initially created the database. This must match for any further instances
// that access the database, to ensure that state mismatches with
// containers/storage do not occur.
// - exitCodeBucket/exitCodeTimeStampBucket: (#14559) exit codes must be part
// of the database to resolve a previous race condition when one process waits
// for the exit file to be written and another process removes it along with
// the container during auto-removal. The same race would happen trying to
// read the exit code from the containers bucket. Hence, exit codes go into
// their own bucket. To avoid the rather expensive JSON (un)marshaling, we
// have two buckets: one for the exit codes, the other for the timestamps.
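The two-bucket split can be sketched in isolation. A minimal, hedged example assuming go.etcd.io/bbolt (the library behind BoltState); the bucket names match the constants added later in this diff, but the file path, container ID, and exit code are illustrative:

package main

import (
	"fmt"
	"strconv"
	"time"

	bolt "go.etcd.io/bbolt"
)

func main() {
	db, err := bolt.Open("example.db", 0o600, nil)
	if err != nil {
		panic(err)
	}
	defer db.Close()

	id := []byte("ctr1")
	err = db.Update(func(tx *bolt.Tx) error {
		codes, err := tx.CreateBucketIfNotExists([]byte("exit-code"))
		if err != nil {
			return err
		}
		stamps, err := tx.CreateBucketIfNotExists([]byte("exit-code-time-stamp"))
		if err != nil {
			return err
		}
		// Both values use cheap string encodings, so either one can be
		// read back on its own without JSON (un)marshaling.
		ts, err := time.Now().MarshalText()
		if err != nil {
			return err
		}
		if err := codes.Put(id, []byte(strconv.Itoa(137))); err != nil {
			return err
		}
		return stamps.Put(id, ts)
	})
	if err != nil {
		panic(err)
	}

	_ = db.View(func(tx *bolt.Tx) error {
		code, _ := strconv.Atoi(string(tx.Bucket([]byte("exit-code")).Get(id)))
		fmt.Println("exit code:", code)
		return nil
	})
}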
// NewBoltState creates a new bolt-backed state database
func NewBoltState(path string, runtime *Runtime) (State, error) {
@@ -98,6 +107,8 @@ func NewBoltState(path string, runtime *Runtime) (State, error) {
allVolsBkt,
execBkt,
runtimeConfigBkt,
exitCodeBkt,
exitCodeTimeStampBkt,
}
// Does the DB need an update?
@@ -192,6 +203,45 @@ func (s *BoltState) Refresh() error {
return err
}
exitCodeBucket, err := getExitCodeBucket(tx)
if err != nil {
return err
}
timeStampBucket, err := getExitCodeTimeStampBucket(tx)
if err != nil {
return err
}
// Clear all container exit codes
toRemoveExitCodes := []string{}
err = exitCodeBucket.ForEach(func(id, _ []byte) error {
toRemoveExitCodes = append(toRemoveExitCodes, string(id))
return nil
})
if err != nil {
return errors.Wrapf(err, "reading exit codes bucket")
}
for _, id := range toRemoveExitCodes {
if err := exitCodeBucket.Delete([]byte(id)); err != nil {
return errors.Wrapf(err, "removing exit code for ID %s", id)
}
}
toRemoveTimeStamps := []string{}
err = timeStampBucket.ForEach(func(id, _ []byte) error {
toRemoveTimeStamps = append(toRemoveTimeStamps, string(id))
return nil
})
if err != nil {
return errors.Wrapf(err, "reading timestamps bucket")
}
for _, id := range toRemoveTimeStamps {
if err := timeStampBucket.Delete([]byte(id)); err != nil {
return errors.Wrapf(err, "removing timestamp for ID %s", id)
}
}
// Iterate through all IDs. Check if they are containers.
// If they are, unmarshal their state, and then clear
// PID, mountpoint, and state for all of them
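Both clears above collect IDs first and delete in a second pass because bbolt does not allow modifying a bucket from inside its own ForEach callback. The pattern, reduced to a helper (same bbolt assumption as the earlier sketch; clearBucket is illustrative, not part of the diff):

package sketch

import bolt "go.etcd.io/bbolt"

// clearBucket empties bkt inside an already-open read-write transaction.
func clearBucket(bkt *bolt.Bucket) error {
	keys := [][]byte{}
	if err := bkt.ForEach(func(k, _ []byte) error {
		// Copy the key: slices handed to ForEach are only safe to reuse
		// while the transaction's pages are untouched.
		keys = append(keys, append([]byte{}, k...))
		return nil
	}); err != nil {
		return err
	}
	for _, k := range keys {
		if err := bkt.Delete(k); err != nil {
			return err
		}
	}
	return nil
}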
@@ -1341,6 +1391,204 @@ func (s *BoltState) GetContainerConfig(id string) (*ContainerConfig, error) {
return config, nil
}
// AddContainerExitCode adds the exit code for the specified container to the database.
func (s *BoltState) AddContainerExitCode(id string, exitCode int32) error {
if len(id) == 0 {
return define.ErrEmptyID
}
if !s.valid {
return define.ErrDBClosed
}
db, err := s.getDBCon()
if err != nil {
return err
}
defer s.deferredCloseDBCon(db)
rawID := []byte(id)
rawExitCode := []byte(strconv.Itoa(int(exitCode)))
rawTimeStamp, err := time.Now().MarshalText()
if err != nil {
return fmt.Errorf("marshaling exit-code time stamp: %w", err)
}
return db.Update(func(tx *bolt.Tx) error {
exitCodeBucket, err := getExitCodeBucket(tx)
if err != nil {
return err
}
timeStampBucket, err := getExitCodeTimeStampBucket(tx)
if err != nil {
return err
}
if err := exitCodeBucket.Put(rawID, rawExitCode); err != nil {
return fmt.Errorf("adding exit code of container %s to DB: %w", id, err)
}
if err := timeStampBucket.Put(rawID, rawTimeStamp); err != nil {
if rmErr := exitCodeBucket.Delete(rawID); rmErr != nil {
logrus.Errorf("Removing exit code of container %s from DB: %v", id, rmErr)
}
return fmt.Errorf("adding exit-code time stamp of container %s to DB: %w", id, err)
}
return nil
})
}
// GetContainerExitCode returns the exit code for the specified container.
func (s *BoltState) GetContainerExitCode(id string) (int32, error) {
if len(id) == 0 {
return -1, define.ErrEmptyID
}
if !s.valid {
return -1, define.ErrDBClosed
}
db, err := s.getDBCon()
if err != nil {
return -1, err
}
defer s.deferredCloseDBCon(db)
rawID := []byte(id)
result := int32(-1)
return result, db.View(func(tx *bolt.Tx) error {
exitCodeBucket, err := getExitCodeBucket(tx)
if err != nil {
return err
}
rawExitCode := exitCodeBucket.Get(rawID)
if rawExitCode == nil {
return fmt.Errorf("getting exit code of container %s from DB: %w", id, define.ErrNoSuchExitCode)
}
exitCode, err := strconv.Atoi(string(rawExitCode))
if err != nil {
return fmt.Errorf("converting raw exit code %v of container %s: %w", rawExitCode, id, err)
}
result = int32(exitCode)
return nil
})
}
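Because the not-found case is wrapped with %w, callers can distinguish it from genuine DB failures via errors.Is. A sketch of the caller side; reportExitCode and the anonymous interface are stand-ins, only define.ErrNoSuchExitCode comes from this change:

package sketch

import (
	"errors"
	"fmt"

	"github.com/containers/podman/v4/libpod/define"
)

func reportExitCode(s interface {
	GetContainerExitCode(id string) (int32, error)
}, id string) error {
	code, err := s.GetContainerExitCode(id)
	switch {
	case err == nil:
		fmt.Printf("container %s exited with %d\n", id, code)
		return nil
	case errors.Is(err, define.ErrNoSuchExitCode):
		// Not necessarily an error: the container has not exited yet,
		// or its code has already been pruned.
		fmt.Printf("no exit code recorded for %s\n", id)
		return nil
	default:
		return err
	}
}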
// GetContainerExitCodeTimeStamp returns the time stamp when the exit code of
// the specified container was added to the database.
func (s *BoltState) GetContainerExitCodeTimeStamp(id string) (*time.Time, error) {
if len(id) == 0 {
return nil, define.ErrEmptyID
}
if !s.valid {
return nil, define.ErrDBClosed
}
db, err := s.getDBCon()
if err != nil {
return nil, err
}
defer s.deferredCloseDBCon(db)
rawID := []byte(id)
var result time.Time
return &result, db.View(func(tx *bolt.Tx) error {
timeStampBucket, err := getExitCodeTimeStampBucket(tx)
if err != nil {
return err
}
rawTimeStamp := timeStampBucket.Get(rawID)
if rawTimeStamp == nil {
return fmt.Errorf("getting exit-code time stamp of container %s from DB: %w", id, define.ErrNoSuchExitCode)
}
if err := result.UnmarshalText(rawTimeStamp); err != nil {
return fmt.Errorf("converting raw time stamp %v of container %s from DB: %w", rawTimeStamp, id, err)
}
return nil
})
}
// PruneContainerExitCodes removes exit codes older than 5 minutes.
func (s *BoltState) PruneContainerExitCodes() error {
if !s.valid {
return define.ErrDBClosed
}
db, err := s.getDBCon()
if err != nil {
return err
}
defer s.deferredCloseDBCon(db)
toRemoveIDs := []string{}
threshold := time.Minute * 5
err = db.View(func(tx *bolt.Tx) error {
timeStampBucket, err := getExitCodeTimeStampBucket(tx)
if err != nil {
return err
}
return timeStampBucket.ForEach(func(rawID, rawTimeStamp []byte) error {
var timeStamp time.Time
if err := timeStamp.UnmarshalText(rawTimeStamp); err != nil {
return fmt.Errorf("converting raw time stamp %v of container %s from DB: %w", rawTimeStamp, string(rawID), err)
}
if time.Since(timeStamp) > threshold {
toRemoveIDs = append(toRemoveIDs, string(rawID))
}
return nil
})
})
if err != nil {
return errors.Wrapf(err, "reading exit codes to prune")
}
if len(toRemoveIDs) > 0 {
err = db.Update(func(tx *bolt.Tx) error {
exitCodeBucket, err := getExitCodeBucket(tx)
if err != nil {
return err
}
timeStampBucket, err := getExitCodeTimeStampBucket(tx)
if err != nil {
return err
}
var finalErr error
for _, id := range toRemoveIDs {
rawID := []byte(id)
if err := exitCodeBucket.Delete(rawID); err != nil {
if finalErr != nil {
logrus.Error(finalErr)
}
finalErr = fmt.Errorf("removing exit code of container %s from DB: %w", id, err)
}
if err := timeStampBucket.Delete(rawID); err != nil {
if finalErr != nil {
logrus.Error(finalErr)
}
finalErr = fmt.Errorf("removing exit code timestamp of container %s from DB: %w", id, err)
}
}
return finalErr
})
if err != nil {
return errors.Wrapf(err, "pruning exit codes")
}
}
return nil
}
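PruneContainerExitCodes works in two phases: a read-only View to collect stale IDs, then a single Update only if something must be removed, which keeps the time spent in the write transaction short. The shape, reduced to a generic helper (same bbolt assumptions as above; pruneOlderThan is illustrative):

package sketch

import (
	"time"

	bolt "go.etcd.io/bbolt"
)

func pruneOlderThan(db *bolt.DB, threshold time.Duration) error {
	stale := [][]byte{}
	err := db.View(func(tx *bolt.Tx) error {
		bkt := tx.Bucket([]byte("exit-code-time-stamp"))
		if bkt == nil {
			return nil
		}
		return bkt.ForEach(func(k, v []byte) error {
			var ts time.Time
			if err := ts.UnmarshalText(v); err != nil {
				return err
			}
			if time.Since(ts) > threshold {
				stale = append(stale, append([]byte{}, k...)) // copy: k dies with the tx
			}
			return nil
		})
	})
	if err != nil || len(stale) == 0 {
		return err
	}
	// Second phase: drop both the code and its timestamp for each stale ID.
	return db.Update(func(tx *bolt.Tx) error {
		for _, k := range stale {
			for _, name := range [][]byte{[]byte("exit-code"), []byte("exit-code-time-stamp")} {
				if bkt := tx.Bucket(name); bkt != nil {
					if err := bkt.Delete(k); err != nil {
						return err
					}
				}
			}
		}
		return nil
	})
}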
// AddExecSession adds an exec session to the state.
func (s *BoltState) AddExecSession(ctr *Container, session *ExecSession) error {
if !s.valid {

View File

@@ -29,6 +29,9 @@ const (
aliasesName = "aliases"
runtimeConfigName = "runtime-config"
exitCodeName = "exit-code"
exitCodeTimeStampName = "exit-code-time-stamp"
configName = "config"
stateName = "state"
dependenciesName = "dependencies"
@@ -65,6 +68,9 @@ var (
volDependenciesBkt = []byte(volCtrDependencies)
networksBkt = []byte(networksName)
exitCodeBkt = []byte(exitCodeName)
exitCodeTimeStampBkt = []byte(exitCodeTimeStampName)
configKey = []byte(configName)
stateKey = []byte(stateName)
netNSKey = []byte(netNSName)
@@ -362,6 +368,22 @@ func getRuntimeConfigBucket(tx *bolt.Tx) (*bolt.Bucket, error) {
return bkt, nil
}
func getExitCodeBucket(tx *bolt.Tx) (*bolt.Bucket, error) {
bkt := tx.Bucket(exitCodeBkt)
if bkt == nil {
return nil, errors.Wrapf(define.ErrDBBadConfig, "exit-code container bucket not found in DB")
}
return bkt, nil
}
func getExitCodeTimeStampBucket(tx *bolt.Tx) (*bolt.Bucket, error) {
bkt := tx.Bucket(exitCodeTimeStampBkt)
if bkt == nil {
return nil, errors.Wrapf(define.ErrDBBadConfig, "exit-code time stamp bucket not found in DB")
}
return bkt, nil
}
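Both getters assume the buckets were created up front in NewBoltState (see the createBuckets list earlier in this diff), so a nil bucket signals a schema mismatch rather than an empty store. The shared shape, sketched with a stand-in error (mustBucket and errDBBadConfig are illustrative):

package sketch

import (
	"errors"
	"fmt"

	bolt "go.etcd.io/bbolt"
)

var errDBBadConfig = errors.New("database configuration mismatch")

func mustBucket(tx *bolt.Tx, name []byte) (*bolt.Bucket, error) {
	bkt := tx.Bucket(name)
	if bkt == nil {
		// The DB predates this schema (or was created by another tool).
		return nil, fmt.Errorf("%q bucket not found in DB: %w", name, errDBBadConfig)
	}
	return bkt, nil
}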
func (s *BoltState) getContainerConfigFromDB(id []byte, config *ContainerConfig, ctrsBkt *bolt.Bucket) error {
ctrBkt := ctrsBkt.Bucket(id)
if ctrBkt == nil {

View File

@@ -2,6 +2,7 @@ package libpod
import (
"context"
"fmt"
"io"
"io/ioutil"
"net/http"
@@ -490,41 +491,84 @@ func (c *Container) RemoveArtifact(name string) error {
// Wait blocks until the container exits and returns its exit code.
func (c *Container) Wait(ctx context.Context) (int32, error) {
return c.WaitWithInterval(ctx, DefaultWaitInterval)
return c.WaitForExit(ctx, DefaultWaitInterval)
}
// WaitWithInterval blocks until the container exits and returns its exit
// code. The argument is the interval at which it checks the container's status.
func (c *Container) WaitWithInterval(ctx context.Context, waitTimeout time.Duration) (int32, error) {
// WaitForExit blocks until the container exits and returns its exit code. The
// argument is the interval at which it polls the container's status.
func (c *Container) WaitForExit(ctx context.Context, pollInterval time.Duration) (int32, error) {
if !c.valid {
return -1, define.ErrCtrRemoved
}
exitFile, err := c.exitFilePath()
if err != nil {
return -1, err
}
chWait := make(chan error, 1)
id := c.ID()
var conmonTimer time.Timer
conmonTimerSet := false
go func() {
<-ctx.Done()
chWait <- define.ErrCanceled
}()
for {
// Ignore errors here (with the exception of cancellation); they are
// only used to avoid waiting too long.
_, e := WaitForFile(exitFile, chWait, waitTimeout)
if e == define.ErrCanceled {
return -1, define.ErrCanceled
getExitCode := func() (bool, int32, error) {
containerRemoved := false
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()
}
stopped, code, err := c.isStopped()
if err := c.syncContainer(); err != nil {
if !errors.Is(err, define.ErrNoSuchCtr) {
return false, -1, err
}
containerRemoved = true
}
// If conmon is no longer alive, set a timer to make sure
// we still return even if conmon has been forcefully killed.
if !conmonTimerSet && !containerRemoved {
conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
switch {
case errors.Is(err, define.ErrNoSuchCtr):
containerRemoved = true
case err != nil:
return false, -1, err
case !conmonAlive:
timerDuration := time.Second * 20
conmonTimer = *time.NewTimer(timerDuration)
conmonTimerSet = true
}
}
if !containerRemoved {
// If conmon is dead for more than $timerDuration or if the
// container has exited properly, try to look up the exit code.
select {
case <-conmonTimer.C:
logrus.Debugf("Exceeded conmon timeout waiting for container %s to exit", id)
default:
if !c.ensureState(define.ContainerStateExited, define.ContainerStateConfigured) {
return false, -1, nil
}
}
}
exitCode, err := c.runtime.state.GetContainerExitCode(id)
if err != nil {
return true, -1, err
}
return true, exitCode, nil
}
for {
hasExited, exitCode, err := getExitCode()
if hasExited {
return exitCode, err
}
if err != nil {
return -1, err
}
if stopped {
return code, nil
select {
case <-ctx.Done():
return -1, fmt.Errorf("waiting for exit code of container %s canceled", id)
default:
time.Sleep(pollInterval)
}
}
}
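Stripped of the conmon-liveness bookkeeping, WaitForExit is a cancelable poll loop: probe once, and if the container has not exited, sleep for pollInterval unless the context is done; the 20-second conmon timer in the real function merely bounds how long the loop keeps polling once conmon has died without an exit code appearing. A hedged reduction of that skeleton (waitPoll and the probe callback are stand-ins, not Podman code):

package sketch

import (
	"context"
	"errors"
	"time"
)

func waitPoll(ctx context.Context, pollInterval time.Duration,
	getExitCode func() (exited bool, code int32, err error)) (int32, error) {
	for {
		exited, code, err := getExitCode()
		if exited {
			return code, err // err may still carry a lookup failure
		}
		if err != nil {
			return -1, err
		}
		select {
		case <-ctx.Done():
			return -1, errors.New("waiting for exit code: canceled")
		default:
			time.Sleep(pollInterval)
		}
	}
}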
@@ -551,11 +595,12 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou
wantedStates := make(map[define.ContainerStatus]bool, len(conditions))
for _, condition := range conditions {
if condition == define.ContainerStateStopped || condition == define.ContainerStateExited {
switch condition {
case define.ContainerStateExited, define.ContainerStateStopped:
waitForExit = true
continue
default:
wantedStates[condition] = true
}
wantedStates[condition] = true
}
trySend := func(code int32, err error) {
@@ -572,7 +617,7 @@ func (c *Container) WaitForConditionWithInterval(ctx context.Context, waitTimeou
go func() {
defer wg.Done()
code, err := c.WaitWithInterval(ctx, waitTimeout)
code, err := c.WaitForExit(ctx, waitTimeout)
trySend(code, err)
}()
}
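The goroutine above depends on trySend delivering at most one result even when several waiters race. Its body is not shown in this hunk; one common implementation is a non-blocking send into a one-slot channel, sketched here as an assumption rather than Podman's actual code:

package sketch

type waitResult struct {
	code int32
	err  error
}

func makeTrySend() (chan waitResult, func(int32, error)) {
	resultChan := make(chan waitResult, 1)
	trySend := func(code int32, err error) {
		select {
		case resultChan <- waitResult{code, err}:
		default: // another waiter already delivered a result
		}
	}
	return resultChan, trySend
}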

View File

@@ -219,7 +219,7 @@ func (c *Container) handleExitFile(exitFile string, fi os.FileInfo) error {
// Write an event for the container's death
c.newContainerExitedEvent(c.state.ExitCode)
return nil
return c.runtime.state.AddContainerExitCode(c.ID(), c.state.ExitCode)
}
func (c *Container) shouldRestart() bool {
@@ -784,20 +784,6 @@ func (c *Container) getArtifactPath(name string) string {
return filepath.Join(c.config.StaticDir, artifactsDir, name)
}
// Used with Wait() to determine if a container has exited
func (c *Container) isStopped() (bool, int32, error) {
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()
}
err := c.syncContainer()
if err != nil {
return true, -1, err
}
return !c.ensureState(define.ContainerStateRunning, define.ContainerStatePaused, define.ContainerStateStopping), c.state.ExitCode, nil
}
// save container state to the database
func (c *Container) save() error {
if err := c.runtime.state.SaveContainer(c); err != nil {
@@ -1282,13 +1268,6 @@ func (c *Container) stop(timeout uint) error {
}
}
// Check if conmon is still alive.
// If it is not, we won't be getting an exit file.
conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
if err != nil {
return err
}
// Set the container state to "stopping" and unlock the container
// before handing it over to conmon to unblock other commands. #8501
// demonstrates nicely that a high stop timeout will block even simple
@@ -1341,21 +1320,18 @@ func (c *Container) stop(timeout uint) error {
}
c.newContainerEvent(events.Stop)
c.state.PID = 0
c.state.ConmonPID = 0
c.state.StoppedByUser = true
conmonAlive, err := c.ociRuntime.CheckConmonRunning(c)
if err != nil {
return err
}
if !conmonAlive {
// Conmon is dead, so we can't expect an exit code.
c.state.ExitCode = -1
c.state.FinishedTime = time.Now()
c.state.State = define.ContainerStateStopped
if err := c.save(); err != nil {
logrus.Errorf("Saving container %s status: %v", c.ID(), err)
if err := c.checkExitFile(); err != nil {
return err
}
return errors.Wrapf(define.ErrConmonDead, "container %s conmon process missing, cannot retrieve exit code", c.ID())
return c.save()
}
if err := c.save(); err != nil {
@@ -1939,6 +1915,18 @@ func (c *Container) cleanup(ctx context.Context) error {
}
}
// Prune the exit codes of other containers during cleanup.
// Since Podman is not a daemon, we have to clean them up somewhere.
// Cleanup seems like a good place, as it is not performance
// critical.
if err := c.runtime.state.PruneContainerExitCodes(); err != nil {
if lastError == nil {
lastError = err
} else {
logrus.Errorf("Pruning container exit codes: %v", err)
}
}
return lastError
}
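cleanup() deliberately keeps going after a failure: the first error is preserved for the caller, and later ones are logged rather than silently dropped. The accumulation pattern, sketched generically (runCleanupSteps is illustrative; logrus is the logger already used throughout libpod):

package sketch

import "github.com/sirupsen/logrus"

func runCleanupSteps(steps []func() error) error {
	var lastError error
	for _, step := range steps {
		if err := step(); err != nil {
			if lastError == nil {
				lastError = err // first failure wins; the caller sees it
			} else {
				logrus.Errorf("Cleanup: %v", err) // later failures are only logged
			}
		}
	}
	return lastError
}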

View File

@@ -24,6 +24,10 @@ var (
// not exist.
ErrNoSuchExecSession = errors.New("no such exec session")
// ErrNoSuchExitCode indicates that the requested container exit code
// does not exist.
ErrNoSuchExitCode = errors.New("no such exit code")
// ErrDepExists indicates that the current object has dependencies and
// cannot be removed before them.
ErrDepExists = errors.New("dependency exists")

View File

@@ -151,6 +151,9 @@ func (r *Runtime) GetEvents(ctx context.Context, filters []string) ([]*events.Ev
// GetLastContainerEvent takes a container name or ID and an event status and returns
// the last occurrence of the container event
func (r *Runtime) GetLastContainerEvent(ctx context.Context, nameOrID string, containerEvent events.Status) (*events.Event, error) {
// FIXME: events should be read in reverse order!
// https://github.com/containers/podman/issues/14579
// check to make sure the event.Status is valid
if _, err := events.StringToStatus(containerEvent.String()); err != nil {
return nil, err

View File

@@ -264,11 +264,6 @@ func (r *ConmonOCIRuntime) CreateContainer(ctr *Container, restoreOptions *Conta
// status, but will instead only check for the existence of the conmon exit file
// and update state to stopped if it exists.
func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error {
exitFile, err := r.ExitFilePath(ctr)
if err != nil {
return err
}
runtimeDir, err := util.GetRuntimeDir()
if err != nil {
return err
@@ -340,22 +335,10 @@ func (r *ConmonOCIRuntime) UpdateContainerStatus(ctr *Container) error {
// Only grab exit status if we were not already stopped
// If we were, it should already be in the database
if ctr.state.State == define.ContainerStateStopped && oldState != define.ContainerStateStopped {
var fi os.FileInfo
chWait := make(chan error)
defer close(chWait)
_, err := WaitForFile(exitFile, chWait, time.Second*5)
if err == nil {
fi, err = os.Stat(exitFile)
if _, err := ctr.Wait(context.Background()); err != nil {
logrus.Errorf("Waiting for container %s to exit: %v", ctr.ID(), err)
}
if err != nil {
ctr.state.ExitCode = -1
ctr.state.FinishedTime = time.Now()
logrus.Errorf("No exit file for container %s found: %v", ctr.ID(), err)
return nil
}
return ctr.handleExitFile(exitFile, fi)
return nil
}
// Handle ContainerStateStopping - keep it unless the container
@@ -1166,7 +1149,6 @@ func (r *ConmonOCIRuntime) createOCIContainer(ctr *Container, restoreOptions *Co
}).Debugf("running conmon: %s", r.conmonPath)
cmd := exec.Command(r.conmonPath, args...)
cmd.Dir = ctr.bundlePath()
cmd.SysProcAttr = &syscall.SysProcAttr{
Setpgid: true,
}
@@ -1354,8 +1336,6 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
logDriverArg = define.NoLogging
case define.PassthroughLogging:
logDriverArg = define.PassthroughLogging
case define.JSONLogging:
fallthrough
//lint:ignore ST1015 the default case has to be here
default: //nolint:stylecheck,gocritic
// No case here should happen except JSONLogging, but keep this here in case the options are extended
@@ -1365,6 +1345,8 @@ func (r *ConmonOCIRuntime) sharedConmonArgs(ctr *Container, cuuid, bundlePath, p
// to get here, either a user would specify `--log-driver ""`, or this came from another place in libpod
// since the former case is obscure, and the latter case isn't an error, let's silently fallthrough
fallthrough
case define.JSONLogging:
fallthrough
case define.KubernetesLogging:
logDriverArg = fmt.Sprintf("%s:%s", define.KubernetesLogging, logPath)
}
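The reordering works because Go's fallthrough transfers control to the next case in source order, regardless of whether that case's expression matches; with JSONLogging now sitting between default and KubernetesLogging, both routes end at the k8s-file argument. A minimal demonstration of that semantics:

package main

import "fmt"

func main() {
	switch x := 0; x {
	case 0:
		fmt.Println("matched 0")
		fallthrough
	case 99: // runs anyway: fallthrough ignores the case expression
		fmt.Println("reached via fallthrough, even though x != 99")
	}
}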

View File

@@ -111,6 +111,15 @@ type State interface {
// Return a container config from the database by full ID
GetContainerConfig(id string) (*ContainerConfig, error)
// Add the exit code for the specified container to the database.
AddContainerExitCode(id string, exitCode int32) error
// Return the exit code for the specified container.
GetContainerExitCode(id string) (int32, error)
// Remove exit codes older than 5 minutes.
PruneContainerExitCodes() error
// Add creates a reference to an exec session in the database.
// The container the exec session is attached to will be recorded.
// The container state will not be modified.
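For unit tests, the three new State methods are small enough to back with in-memory maps. A hedged stand-in, not part of this diff (memExitCodes and errNoSuchExitCode are illustrative):

package sketch

import (
	"errors"
	"sync"
	"time"
)

var errNoSuchExitCode = errors.New("no such exit code")

type memExitCodes struct {
	mu     sync.Mutex
	codes  map[string]int32
	stamps map[string]time.Time
}

func newMemExitCodes() *memExitCodes {
	return &memExitCodes{codes: map[string]int32{}, stamps: map[string]time.Time{}}
}

func (m *memExitCodes) AddContainerExitCode(id string, code int32) error {
	m.mu.Lock()
	defer m.mu.Unlock()
	m.codes[id] = code
	m.stamps[id] = time.Now()
	return nil
}

func (m *memExitCodes) GetContainerExitCode(id string) (int32, error) {
	m.mu.Lock()
	defer m.mu.Unlock()
	code, ok := m.codes[id]
	if !ok {
		return -1, errNoSuchExitCode
	}
	return code, nil
}

func (m *memExitCodes) PruneContainerExitCodes() error {
	m.mu.Lock()
	defer m.mu.Unlock()
	for id, ts := range m.stamps {
		if time.Since(ts) > 5*time.Minute {
			delete(m.codes, id)
			delete(m.stamps, id)
		}
	}
	return nil
}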

View File

@@ -191,7 +191,6 @@ func waitDockerCondition(ctx context.Context, containerName string, interval tim
var notRunningStates = []define.ContainerStatus{
define.ContainerStateCreated,
define.ContainerStateRemoving,
define.ContainerStateStopped,
define.ContainerStateExited,
define.ContainerStateConfigured,
}

View File

@@ -16,7 +16,6 @@ import (
"github.com/containers/image/v5/manifest"
"github.com/containers/podman/v4/libpod"
"github.com/containers/podman/v4/libpod/define"
"github.com/containers/podman/v4/libpod/events"
"github.com/containers/podman/v4/libpod/logs"
"github.com/containers/podman/v4/pkg/checkpoint"
"github.com/containers/podman/v4/pkg/domain/entities"
@@ -939,6 +938,7 @@ func (ic *ContainerEngine) ContainerStart(ctx context.Context, namesOrIds []stri
}
return reports, errors.Wrapf(err, "unable to start container %s", ctr.ID())
}
exitCode = ic.GetContainerExitCode(ctx, ctr)
reports = append(reports, &entities.ContainerStartReport{
Id: ctr.ID(),
@@ -1099,25 +1099,11 @@ func (ic *ContainerEngine) ContainerRun(ctx context.Context, opts entities.Conta
func (ic *ContainerEngine) GetContainerExitCode(ctx context.Context, ctr *libpod.Container) int {
exitCode, err := ctr.Wait(ctx)
if err == nil {
return int(exitCode)
}
if errors.Cause(err) != define.ErrNoSuchCtr {
logrus.Errorf("Could not retrieve exit code: %v", err)
if err != nil {
logrus.Errorf("Waiting for container %s: %v", ctr.ID(), err)
return define.ExecErrorCodeNotFound
}
// Make 4 attempts with 0.25s backoff between each, for 1 second total
var event *events.Event
for i := 0; i < 4; i++ {
event, err = ic.Libpod.GetLastContainerEvent(ctx, ctr.ID(), events.Exited)
if err != nil {
time.Sleep(250 * time.Millisecond)
continue
}
return event.ContainerExitCode
}
logrus.Errorf("Could not retrieve exit code from event: %v", err)
return define.ExecErrorCodeNotFound
return int(exitCode)
}
func (ic *ContainerEngine) ContainerLogs(ctx context.Context, containers []string, options entities.ContainerLogsOptions) error {

View File

@@ -376,17 +376,7 @@ json-file | f
while read driver do_check; do
msg=$(random_string 15)
run_podman run --name myctr --log-driver $driver $IMAGE echo $msg
# Simple output check
# Special case: 'json-file' emits a warning, the rest do not
# ...but with podman-remote the warning is on the server only
if [[ $do_check == 'f' ]] && ! is_remote; then # 'f' for 'fallback'
is "${lines[0]}" ".* level=error msg=\"json-file logging specified but not supported. Choosing k8s-file logging instead\"" \
"Fallback warning emitted"
is "${lines[1]}" "$msg" "basic output sanity check (driver=$driver)"
else
is "$output" "$msg" "basic output sanity check (driver=$driver)"
fi
is "$output" "$msg" "basic output sanity check (driver=$driver)"
# Simply confirm that podman preserved our argument as-is
run_podman inspect --format '{{.HostConfig.LogConfig.Type}}' myctr