Lock pod while starting and stopping containers

The intention behind this is to stop races between
`pod stop|start` and `container stop|start` being run at the same
time. Such a race could leave containers with no working network:
they join the still-running infra container's netns, which is
then torn down as the infra container is stopped, leaving the
container in an otherwise unused, nonfunctional, orphan netns.

Locking the pod (if present) in the public container start and
stop APIs should be sufficient to stop this.

Signed-off-by: Matt Heon <mheon@redhat.com>
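
As a rough sketch of the approach (not the actual code in this commit; the
lookup helper and lock field names are assumptions), the public container
start path takes the pod lock, if the container is in a pod, before
delegating to an internal function that only takes the container lock:

// Sketch only: illustrates the pod-before-container lock ordering described
// above, using assumed names (config.Pod, runtime.LookupPod, pod.lock).
func (c *Container) Start(ctx context.Context, recursive bool) error {
	if c.config.Pod != "" {
		pod, err := c.runtime.LookupPod(c.config.Pod)
		if err != nil {
			return err
		}
		// Hold the pod lock so a concurrent `pod stop`/`pod start` cannot
		// tear down the infra container's netns while this container starts.
		pod.lock.Lock()
		defer pod.lock.Unlock()
	}
	// The container lock is taken inside startNoPodLock, preserving the
	// pod-before-container lock ordering.
	return c.startNoPodLock(ctx, recursive)
}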
commit 06fa617f61
parent be5d807b62
Author: Matt Heon
Date:   2025-01-30 10:24:45 -05:00

3 changed files with 64 additions and 26 deletions


@@ -1260,6 +1260,40 @@ func (c *Container) initAndStart(ctx context.Context) (retErr error) {
	return c.waitForHealthy(ctx)
}
// Internal function to start a container without taking the pod lock.
// Please note that this DOES take the container lock.
// Intended to be used in pod-related functions.
func (c *Container) startNoPodLock(ctx context.Context, recursive bool) (finalErr error) {
	if !c.batched {
		c.lock.Lock()
		defer c.lock.Unlock()

		// defers are executed LIFO so we are locked here
		// as long as we call this after the defer unlock()
		defer func() {
			if finalErr != nil {
				if err := saveContainerError(c, finalErr); err != nil {
					logrus.Debug(err)
				}
			}
		}()

		if err := c.syncContainer(); err != nil {
			return err
		}
	}

	if err := c.prepareToStart(ctx, recursive); err != nil {
		return err
	}

	// Start the container
	if err := c.start(); err != nil {
		return err
	}

	return c.waitForHealthy(ctx)
}
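
For context (a hypothetical usage sketch, not part of this diff), a
pod-related caller would already hold the pod lock and then use the
non-pod-locking variant on each container, so it does not re-take the pod
lock it already holds; the helper names below are assumed:

// Hypothetical sketch of a pod-level caller: the pod lock is held for the
// whole operation, while startNoPodLock takes each container's own lock.
func (p *Pod) startAllContainers(ctx context.Context) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	ctrs, err := p.allContainers() // assumed helper returning the pod's containers
	if err != nil {
		return err
	}
	for _, ctr := range ctrs {
		if err := ctr.startNoPodLock(ctx, false); err != nil {
			return err
		}
	}
	return nil
}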
// Internal, non-locking function to start a container
func (c *Container) start() error {
	if c.config.Spec.Process != nil {