Move pod jobs to parallel execution

Make Podman pod operations that do not involve starting
containers (which needs to be done in a specific order) use the
same parallel operation code we use to make `podman stop` on
large numbers of containers fast. We were previously stopping
containers in a pod serially, which could take up to the timeout
(default 15 seconds) for each container - stopping 100 containers
that do not respond to SIGTERM would take 25 minutes.

To do this, refactor the parallel operation code a bit to remove
its dependency on libpod (damn circular import restrictions...)
and use parallel functions that just re-use the standard
container API operations - this maximizes code reuse (previously,
each pod handler had a separate implementation of the container
function it performed).
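
As a rough sketch of what this buys a pod handler (hypothetical
code, not part of this commit - it assumes the existing libpod
Pod.AllContainers, Container.Stop, and Container.ID APIs plus the
new pkg/parallel/ctr package):

func stopPodInParallel(ctx context.Context, pod *libpod.Pod) error {
	ctrs, err := pod.AllContainers()
	if err != nil {
		return err
	}
	// Fan the stop out over the shared parallel worker pool; ContainerOp
	// returns a per-container error map once every job has finished.
	ctrErrors, err := ctr.ContainerOp(ctx, ctrs, func(c *libpod.Container) error {
		return c.Stop()
	})
	if err != nil {
		return err
	}
	for c, stopErr := range ctrErrors {
		if stopErr != nil {
			logrus.Errorf("Error stopping container %s: %v", c.ID(), stopErr)
		}
	}
	return nil
}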

This is a bit of a palate cleanser after fighting CI for two
days - nice to be able to return to a land of sanity.

Signed-off-by: Matthew Heon <matthew.heon@pm.me>
commit 2bb2425704 (parent a7500e54a4)
Author:    Matthew Heon
Committer: Matthew Heon
Date:      2020-08-19 16:15:35 -04:00

7 changed files with 179 additions and 190 deletions


@@ -1,11 +1,10 @@
-package parallel
+package ctr
 
 import (
 	"context"
-	"sync"
 
 	"github.com/containers/podman/v2/libpod"
-	"github.com/pkg/errors"
+	"github.com/containers/podman/v2/pkg/parallel"
 	"github.com/sirupsen/logrus"
 )
@@ -14,44 +13,28 @@ import (
 // If no error is returned, each container specified in ctrs will have an entry
 // in the resulting map; containers with no error will be set to nil.
 func ContainerOp(ctx context.Context, ctrs []*libpod.Container, applyFunc func(*libpod.Container) error) (map[*libpod.Container]error, error) {
-	jobControlLock.RLock()
-	defer jobControlLock.RUnlock()
-
 	// We could use a sync.Map but given Go's lack of generics I'd rather
 	// just use a lock on a normal map...
 	// The expectation is that most of the time is spent in applyFunc
 	// anyways.
 	var (
-		errMap  = make(map[*libpod.Container]error)
-		errLock sync.Mutex
-		allDone sync.WaitGroup
+		errMap = make(map[*libpod.Container]<-chan error)
 	)
 
 	for _, ctr := range ctrs {
-		// Block until a thread is available
-		if err := jobControl.Acquire(ctx, 1); err != nil {
-			return nil, errors.Wrapf(err, "error acquiring job control semaphore")
-		}
-
-		allDone.Add(1)
-
 		c := ctr
-		go func() {
-			logrus.Debugf("Launching job on container %s", c.ID())
-			err := applyFunc(c)
-			errLock.Lock()
-			errMap[c] = err
-			errLock.Unlock()
-
-			allDone.Done()
-			jobControl.Release(1)
-		}()
+		logrus.Debugf("Starting parallel job on container %s", c.ID())
+		errChan := parallel.Enqueue(ctx, func() error {
+			return applyFunc(c)
+		})
+		errMap[c] = errChan
 	}
 
-	allDone.Wait()
+	finalErr := make(map[*libpod.Container]error)
+	for ctr, errChan := range errMap {
+		err := <-errChan
+		finalErr[ctr] = err
+	}
 
-	return errMap, nil
+	return finalErr, nil
 }
-
-// TODO: Add an Enqueue() function that returns a promise
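
One thing worth calling out about the new flow: the old code blocked
on the semaphore before spawning each goroutine, while ContainerOp now
enqueues every container immediately and the semaphore is only acquired
inside Enqueue's goroutine. There is one lightweight goroutine per
container up front, but the number of applyFunc calls actually running
stays bounded by the pkg/parallel thread limit. A hedged sketch of
setting that limit from a caller (the runtime.NumCPU()-based value is
just an example, not something this commit configures):

// Assumed setup code, not part of this diff: cap the number of
// parallel jobs before any ContainerOp/Enqueue calls are made.
if err := parallel.SetMaxThreads(uint(runtime.NumCPU() * 3)); err != nil {
	logrus.Errorf("Error setting parallel job limit: %v", err)
}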


@@ -1,6 +1,7 @@
 package parallel
 
 import (
+	"context"
 	"sync"
 
 	"github.com/pkg/errors"
@@ -42,3 +43,32 @@ func SetMaxThreads(threads uint) error {
 func GetMaxThreads() uint {
 	return numThreads
 }
+
+// Enqueue adds a single function to the parallel jobs queue. This function will
+// be run when an unused thread is available.
+// Returns a receive-only error channel that will return the error (if any) from
+// the provided function fn when fn has finished executing. The channel will be
+// closed after this.
+func Enqueue(ctx context.Context, fn func() error) <-chan error {
+	retChan := make(chan error)
+
+	go func() {
+		jobControlLock.RLock()
+		defer jobControlLock.RUnlock()
+
+		defer close(retChan)
+
+		if err := jobControl.Acquire(ctx, 1); err != nil {
+			retChan <- errors.Wrapf(err, "error acquiring job control semaphore")
+			return
+		}
+
+		err := fn()
+
+		jobControl.Release(1)
+
+		retChan <- err
+	}()
+
+	return retChan
+}
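
For completeness, a hypothetical caller of the new Enqueue (the jobs
slice and doWork function below are stand-ins, not podman code) fires
everything off first and only then blocks on the returned channels.
Each channel delivers exactly one value - fn's error, possibly nil, or
the semaphore-acquire error - and is closed afterwards, so the drain
loop cannot deadlock:

// Hypothetical usage of parallel.Enqueue; jobs and doWork are stand-ins.
errChans := make([]<-chan error, 0, len(jobs))
for _, job := range jobs {
	j := job
	errChans = append(errChans, parallel.Enqueue(ctx, func() error {
		return doWork(j)
	}))
}
// Drain only after everything is queued, so slow jobs never delay
// the enqueue loop.
for _, ch := range errChans {
	if err := <-ch; err != nil {
		logrus.Errorf("Parallel job failed: %v", err)
	}
}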