Lock pod while starting and stopping containers

The intention behind this is to stop races between
`pod stop|start` and `container stop|start` being run at the same
time. Such a race could leave containers with no working network:
they join the still-running infra container's netns, which is
then torn down as the infra container is stopped, leaving the
container in an otherwise unused, nonfunctional, orphan netns.

Locking the pod (if present) in the public container start and
stop APIs should be sufficient to stop this.

Signed-off-by: Matt Heon <mheon@redhat.com>
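
As a rough sketch of the approach (not the actual code in this commit; the
lookup helper and lock field names are assumptions), the public container
start path takes the pod lock, if the container is in a pod, before
delegating to an internal function that only takes the container lock:

// Sketch only: illustrates the pod-before-container lock ordering described
// above, using assumed names (config.Pod, runtime.LookupPod, pod.lock).
func (c *Container) Start(ctx context.Context, recursive bool) error {
	if c.config.Pod != "" {
		pod, err := c.runtime.LookupPod(c.config.Pod)
		if err != nil {
			return err
		}
		// Hold the pod lock so a concurrent `pod stop`/`pod start` cannot
		// tear down the infra container's netns while this container starts.
		pod.lock.Lock()
		defer pod.lock.Unlock()
	}
	// The container lock is taken inside startNoPodLock, preserving the
	// pod-before-container lock ordering.
	return c.startNoPodLock(ctx, recursive)
}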
commit 06fa617f61
parent be5d807b62
Author: Matt Heon
Date:   2025-01-30 10:24:45 -05:00

3 changed files with 64 additions and 26 deletions


@@ -1260,6 +1260,40 @@ func (c *Container) initAndStart(ctx context.Context) (retErr error) {
	return c.waitForHealthy(ctx)
}
// Internal function to start a container without taking the pod lock.
// Please note that this DOES take the container lock.
// Intended to be used in pod-related functions.
func (c *Container) startNoPodLock(ctx context.Context, recursive bool) (finalErr error) {
	if !c.batched {
		c.lock.Lock()
		defer c.lock.Unlock()

		// defers are executed LIFO so we are locked here
		// as long as we call this after the defer unlock()
		defer func() {
			if finalErr != nil {
				if err := saveContainerError(c, finalErr); err != nil {
					logrus.Debug(err)
				}
			}
		}()

		if err := c.syncContainer(); err != nil {
			return err
		}
	}

	if err := c.prepareToStart(ctx, recursive); err != nil {
		return err
	}

	// Start the container
	if err := c.start(); err != nil {
		return err
	}

	return c.waitForHealthy(ctx)
}
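
For context (a hypothetical usage sketch, not part of this diff), a
pod-related caller would already hold the pod lock and then use the
non-pod-locking variant on each container, so it does not re-take the pod
lock it already holds; the helper names below are assumed:

// Hypothetical sketch of a pod-level caller: the pod lock is held for the
// whole operation, while startNoPodLock takes each container's own lock.
func (p *Pod) startAllContainers(ctx context.Context) error {
	p.lock.Lock()
	defer p.lock.Unlock()

	ctrs, err := p.allContainers() // assumed helper returning the pod's containers
	if err != nil {
		return err
	}
	for _, ctr := range ctrs {
		if err := ctr.startNoPodLock(ctx, false); err != nil {
			return err
		}
	}
	return nil
}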
// Internal, non-locking function to start a container
func (c *Container) start() error {
	if c.config.Spec.Process != nil {