Move pod jobs to parallel execution

Make Podman pod operations that do not involve starting
containers (which needs to be done in a specific order) use the
same parallel operation code we use to make `podman stop` on
large numbers of containers fast. We were previously stopping
containers in a pod serially, which could take up to the timeout
(default 15 seconds) for each container - stopping 100 containers
that do not respond to SIGTERM would take 25 minutes.

To do this, refactor the parallel operation code a bit to remove
its dependency on libpod (damn circular import restrictions...)
and use parallel functions that just re-use the standard
container API operations - this maximizes code reuse (previously,
each pod handler had a separate implementation of the container
function it performed).
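
As a rough sketch of what this buys a pod handler (hypothetical
code, not part of this commit - it assumes the existing libpod
Pod.AllContainers, Container.Stop, and Container.ID APIs plus the
new pkg/parallel/ctr package):

func stopPodInParallel(ctx context.Context, pod *libpod.Pod) error {
	ctrs, err := pod.AllContainers()
	if err != nil {
		return err
	}
	// Fan the stop out over the shared parallel worker pool; ContainerOp
	// returns a per-container error map once every job has finished.
	ctrErrors, err := ctr.ContainerOp(ctx, ctrs, func(c *libpod.Container) error {
		return c.Stop()
	})
	if err != nil {
		return err
	}
	for c, stopErr := range ctrErrors {
		if stopErr != nil {
			logrus.Errorf("Error stopping container %s: %v", c.ID(), stopErr)
		}
	}
	return nil
}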

This is a bit of a palate cleanser after fighting CI for two
days - nice to be able to return to a land of sanity.

Signed-off-by: Matthew Heon <matthew.heon@pm.me>
commit 2bb2425704 (parent a7500e54a4)
Author:    Matthew Heon
Committer: Matthew Heon
Date:      2020-08-19 16:15:35 -04:00

7 changed files with 179 additions and 190 deletions


@@ -1,11 +1,10 @@
-package parallel
+package ctr
 
 import (
 	"context"
-	"sync"
 
 	"github.com/containers/podman/v2/libpod"
-	"github.com/pkg/errors"
+	"github.com/containers/podman/v2/pkg/parallel"
 	"github.com/sirupsen/logrus"
 )
@@ -14,44 +13,28 @@ import (
 // If no error is returned, each container specified in ctrs will have an entry
 // in the resulting map; containers with no error will be set to nil.
 func ContainerOp(ctx context.Context, ctrs []*libpod.Container, applyFunc func(*libpod.Container) error) (map[*libpod.Container]error, error) {
-	jobControlLock.RLock()
-	defer jobControlLock.RUnlock()
-
 	// We could use a sync.Map but given Go's lack of generics I'd rather
 	// just use a lock on a normal map...
 	// The expectation is that most of the time is spent in applyFunc
 	// anyways.
 	var (
-		errMap  = make(map[*libpod.Container]error)
-		errLock sync.Mutex
-		allDone sync.WaitGroup
+		errMap = make(map[*libpod.Container]<-chan error)
 	)
 
 	for _, ctr := range ctrs {
-		// Block until a thread is available
-		if err := jobControl.Acquire(ctx, 1); err != nil {
-			return nil, errors.Wrapf(err, "error acquiring job control semaphore")
-		}
-
-		allDone.Add(1)
-
 		c := ctr
-		go func() {
-			logrus.Debugf("Launching job on container %s", c.ID())
-			err := applyFunc(c)
-			errLock.Lock()
-			errMap[c] = err
-			errLock.Unlock()
-
-			allDone.Done()
-			jobControl.Release(1)
-		}()
+		logrus.Debugf("Starting parallel job on container %s", c.ID())
+		errChan := parallel.Enqueue(ctx, func() error {
+			return applyFunc(c)
+		})
+		errMap[c] = errChan
 	}
 
-	allDone.Wait()
+	finalErr := make(map[*libpod.Container]error)
+	for ctr, errChan := range errMap {
+		err := <-errChan
+		finalErr[ctr] = err
+	}
 
-	return errMap, nil
+	return finalErr, nil
 }
-
-// TODO: Add an Enqueue() function that returns a promise
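
One thing worth calling out about the new flow: the old code blocked
on the semaphore before spawning each goroutine, while ContainerOp now
enqueues every container immediately and the semaphore is only acquired
inside Enqueue's goroutine. There is one lightweight goroutine per
container up front, but the number of applyFunc calls actually running
stays bounded by the pkg/parallel thread limit. A hedged sketch of
setting that limit from a caller (the runtime.NumCPU()-based value is
just an example, not something this commit configures):

// Assumed setup code, not part of this diff: cap the number of
// parallel jobs before any ContainerOp/Enqueue calls are made.
if err := parallel.SetMaxThreads(uint(runtime.NumCPU() * 3)); err != nil {
	logrus.Errorf("Error setting parallel job limit: %v", err)
}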


@@ -1,6 +1,7 @@
 package parallel
 
 import (
+	"context"
 	"sync"
 
 	"github.com/pkg/errors"
@@ -42,3 +43,32 @@ func SetMaxThreads(threads uint) error {
 func GetMaxThreads() uint {
 	return numThreads
 }
+
+// Enqueue adds a single function to the parallel jobs queue. This function will
+// be run when an unused thread is available.
+// Returns a receive-only error channel that will return the error (if any) from
+// the provided function fn when fn has finished executing. The channel will be
+// closed after this.
+func Enqueue(ctx context.Context, fn func() error) <-chan error {
+	retChan := make(chan error)
+
+	go func() {
+		jobControlLock.RLock()
+		defer jobControlLock.RUnlock()
+
+		defer close(retChan)
+
+		if err := jobControl.Acquire(ctx, 1); err != nil {
+			retChan <- errors.Wrapf(err, "error acquiring job control semaphore")
+			return
+		}
+
+		err := fn()
+
+		jobControl.Release(1)
+
+		retChan <- err
+	}()
+
+	return retChan
+}
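
For completeness, a hypothetical caller of the new Enqueue (the jobs
slice and doWork function below are stand-ins, not podman code) fires
everything off first and only then blocks on the returned channels.
Each channel delivers exactly one value - fn's error, possibly nil, or
the semaphore-acquire error - and is closed afterwards, so the drain
loop cannot deadlock:

// Hypothetical usage of parallel.Enqueue; jobs and doWork are stand-ins.
errChans := make([]<-chan error, 0, len(jobs))
for _, job := range jobs {
	j := job
	errChans = append(errChans, parallel.Enqueue(ctx, func() error {
		return doWork(j)
	}))
}
// Drain only after everything is queued, so slow jobs never delay
// the enqueue loop.
for _, ch := range errChans {
	if err := <-ch; err != nil {
		logrus.Errorf("Parallel job failed: %v", err)
	}
}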