libpod: fix broken saveContainerError()

We cannot unlock then lock again without syncing the state as this will
then save a potentially old state causing very bad things, such as
double netns cleanup issues.

The fix here is simple move the saveContainerError() under the same
lock. The comment about the re-lock is just wrong. Not doing this under
the same lock would cause us to update the error after something else
changed the container alreayd.

Most likely this was caused by a misunderstanding on how go defer's work.
Given they run Last In - First Out (LIFO) it is safe as long as out
defer function is after the defer unlock() call.

I think this issue is very bad and might have caused a variety of other
weird flakes. As fact I am confident that this fixes the double cleanup
errors.

Fixes #21569
Also fixes the netns removal ENOENT issues seen in #19721.

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
Paul Holzinger
2024-08-09 18:59:22 +02:00
parent 277e061878
commit f276d53532

View File

@ -84,27 +84,20 @@ func (c *Container) Init(ctx context.Context, recursive bool) error {
// running before starting the container. The recursive parameter, if set, will start all
// dependencies before starting this container.
func (c *Container) Start(ctx context.Context, recursive bool) (finalErr error) {
defer func() {
if finalErr != nil {
// Have to re-lock.
// As this is the first defer, it's the last thing to
// happen in the function - so `defer c.lock.Unlock()`
// below already fired.
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()
}
if err := saveContainerError(c, finalErr); err != nil {
logrus.Debug(err)
}
}
}()
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()
// defer's are executed LIFO so we are locked here
// as long as we call this after the defer unlock()
defer func() {
if finalErr != nil {
if err := saveContainerError(c, finalErr); err != nil {
logrus.Debug(err)
}
}
}()
if err := c.syncContainer(); err != nil {
return err
}
@ -147,27 +140,20 @@ func (c *Container) Update(resources *spec.LinuxResources, restartPolicy *string
// ordering of the two such that no output from the container is lost (e.g. the
// Attach call occurs before Start).
func (c *Container) Attach(ctx context.Context, streams *define.AttachStreams, keys string, resize <-chan resize.TerminalSize, start bool) (retChan <-chan error, finalErr error) {
defer func() {
if finalErr != nil {
// Have to re-lock.
// As this is the first defer, it's the last thing to
// happen in the function - so `defer c.lock.Unlock()`
// below already fired.
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()
}
if err := saveContainerError(c, finalErr); err != nil {
logrus.Debug(err)
}
}
}()
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()
// defer's are executed LIFO so we are locked here
// as long as we call this after the defer unlock()
defer func() {
if finalErr != nil {
if err := saveContainerError(c, finalErr); err != nil {
logrus.Debug(err)
}
}
}()
if err := c.syncContainer(); err != nil {
return nil, err
}
@ -270,27 +256,20 @@ func (c *Container) Stop() error {
// manually. If timeout is 0, SIGKILL will be used immediately to kill the
// container.
func (c *Container) StopWithTimeout(timeout uint) (finalErr error) {
defer func() {
if finalErr != nil {
// Have to re-lock.
// As this is the first defer, it's the last thing to
// happen in the function - so `defer c.lock.Unlock()`
// below already fired.
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()
}
if err := saveContainerError(c, finalErr); err != nil {
logrus.Debug(err)
}
}
}()
if !c.batched {
c.lock.Lock()
defer c.lock.Unlock()
// defer's are executed LIFO so we are locked here
// as long as we call this after the defer unlock()
defer func() {
if finalErr != nil {
if err := saveContainerError(c, finalErr); err != nil {
logrus.Debug(err)
}
}
}()
if err := c.syncContainer(); err != nil {
return err
}