fix goroutine leaks in events and logs backend

When running a single `podman logs` this is not really important since the process exits when it finishes reading the logs. For the system service, however, this matters a lot: leaked goroutines cause memory and CPU usage to grow over time.

Both the event and the log backend have goroutine leaks, with both the file and journald drivers.

The journald backend has the problem that journal.Wait(IndefiniteWait) blocks until a new journald entry arrives. So when a client closes the connection, the goroutine would keep waiting until the next journal entry shows up. To fix this we wait for a maximum of 5 seconds; after that we can check whether the client connection was closed and exit correctly in that case.
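
The pattern, roughly, looks like the sketch below. `waitForEntry` and `readEntry` are hypothetical stand-ins for the sdjournal calls, not podman helpers; the sketch only illustrates the bounded wait plus context check.

```go
package journalsketch

import (
	"context"
	"time"
)

// followJournal reads entries until the consumer cancels ctx. Instead of an
// indefinite wait for the next entry, it waits at most 5 seconds, so the
// context check runs regularly even when the journal is idle.
func followJournal(ctx context.Context, waitForEntry func(time.Duration), readEntry func() (string, bool), out chan<- string) {
	for {
		// Exit as soon as the client has hung up.
		if ctx.Err() != nil {
			return
		}
		// Drain whatever is already available.
		if entry, ok := readEntry(); ok {
			out <- entry
			continue
		}
		// Nothing new: block for at most 5 seconds, then loop and
		// re-check the context.
		waitForEntry(5 * time.Second)
	}
}
```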

For the file backend we can fix this by waiting for either a new log line or the context cancellation in the same select. Currently the goroutine blocks waiting for new log lines and only checks afterwards whether the client closed the connection, so it hangs forever if no new log lines arrive.
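
A minimal sketch of the resulting loop, using the `github.com/nxadm/tail` channel that the file backend already relies on; the `streamLines` name and the `out` channel are illustrative, not podman's actual API.

```go
package filesketch

import (
	"context"
	"errors"

	"github.com/nxadm/tail"
)

// streamLines waits for either a new log line or the context cancellation in
// the same select, so the goroutine can never block on t.Lines after the
// client has gone away.
func streamLines(ctx context.Context, t *tail.Tail, out chan<- string) {
	for {
		select {
		case <-ctx.Done():
			// The consumer cancelled: stop the tailer and return.
			t.Kill(errors.New("hangup by client"))
			return
		case line, ok := <-t.Lines:
			if !ok {
				// t.Lines was closed, nothing left to read.
				return
			}
			out <- line.Text
		}
	}
}
```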

[NO NEW TESTS NEEDED] I am open to ideas on how we can test memory leaks in CI.
To test manually, run a container like this:
`podman run --log-driver $driver --name test -d alpine sh -c 'i=1; while [ "$i" -ne 1000 ]; do echo "line $i"; i=$((i + 1)); done; sleep inf'`
where `$driver` can be either `journald` or `k8s-file`.
Then start the podman system service and use:
`curl -m 1 --output - --unix-socket $XDG_RUNTIME_DIR/podman/podman.sock -v 'http://d/containers/test/logs?follow=1&since=0&stderr=1&stdout=1' &>/dev/null`
to stream the logs from the API; curl closes the connection after 1 second (`-m 1`).
Run the curl command several times and check the memory usage of the service.

Fixes #14879

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
Author: Paul Holzinger
Date: 2022-07-20 12:36:02 +02:00
Committed by: Matthew Heon
Parent: dc1dc46006
Commit: f6d18ed41c
4 changed files with 38 additions and 24 deletions

@@ -10,6 +10,7 @@ import (
 	"github.com/containers/podman/v4/libpod/define"
 	"github.com/containers/podman/v4/libpod/events"
 	"github.com/containers/podman/v4/libpod/logs"
+	"github.com/nxadm/tail"
 	"github.com/nxadm/tail/watch"
 	"github.com/sirupsen/logrus"
 )
@@ -74,14 +75,19 @@ func (c *Container) readFromLogFile(ctx context.Context, options *logs.LogOption
 	go func() {
 		defer options.WaitGroup.Done()
-		for line := range t.Lines {
+		var line *tail.Line
+		var ok bool
+		for {
 			select {
 			case <-ctx.Done():
 				// the consumer has cancelled
+				t.Kill(errors.New("hangup by client"))
 				return
-			default:
-				// fallthrough
+			case line, ok = <-t.Lines:
+				if !ok {
+					// channel was closed
+					return
+				}
 			}
 			nll, err := logs.NewLogLine(line.Text)
 			if err != nil {

@@ -178,8 +178,13 @@ func (c *Container) readFromJournal(ctx context.Context, options *logs.LogOption
 			if !options.Follow || !containerCouldBeLogging {
 				return
 			}
-			// Sleep until something's happening on the journal.
-			journal.Wait(sdjournal.IndefiniteWait)
+
+			// journal.Wait() is blocking, this would cause the goroutine to hang forever
+			// if no more journal entries are generated and thus if the client
+			// has closed the connection in the meantime to leak memory.
+			// Waiting only 5 seconds makes sure we can check if the client closed in the
+			// meantime at least every 5 seconds.
+			journal.Wait(5 * time.Second)
 			continue
 		}
 		lastReadCursor = cursor

@@ -141,9 +141,18 @@ func (e EventJournalD) Read(ctx context.Context, options ReadOptions) error {
 		if !options.Stream || (len(options.Until) > 0 && time.Now().After(untilTime)) {
 			break
 		}
-		t := sdjournal.IndefiniteWait
+
+		// j.Wait() is blocking, this would cause the goroutine to hang forever
+		// if no more journal entries are generated and thus if the client
+		// has closed the connection in the meantime to leak memory.
+		// Waiting only 5 seconds makes sure we can check if the client closed in the
+		// meantime at least every 5 seconds.
+		t := 5 * time.Second
 		if len(options.Until) > 0 {
-			t = time.Until(untilTime)
+			until := time.Until(untilTime)
+			if until < t {
+				t = until
+			}
 		}
 		_ = j.Wait(t)
 		continue

@@ -108,23 +108,19 @@ func (e EventLogFile) Read(ctx context.Context, options ReadOptions) error {
 			}
 		}()
 	}
-	funcDone := make(chan bool)
-	copy := true
-	go func() {
-		select {
-		case <-funcDone:
-			// Do nothing
-		case <-ctx.Done():
-			copy = false
-			t.Kill(errors.New("hangup by client"))
-		}
-	}()
-	for line := range t.Lines {
+	var line *tail.Line
+	var ok bool
+	for {
 		select {
 		case <-ctx.Done():
 			// the consumer has cancelled
+			t.Kill(errors.New("hangup by client"))
 			return nil
-		default:
+		case line, ok = <-t.Lines:
+			if !ok {
+				// channel was closed
+				return nil
+			}
 			// fallthrough
 		}
@@ -138,12 +134,10 @@ func (e EventLogFile) Read(ctx context.Context, options ReadOptions) error {
 		default:
 			return fmt.Errorf("event type %s is not valid in %s", event.Type.String(), e.options.LogFilePath)
 		}
-		if copy && applyFilters(event, filterMap) {
+		if applyFilters(event, filterMap) {
 			options.EventChannel <- event
 		}
 	}
-	funcDone <- true
-	return nil
 }
 
 // String returns a string representation of the logger