system locks now reports held locks

To debug a deadlock, we really want to know which lock is actually
held, so we can figure out what is using that lock. This PR adds
support for this, using trylock to check whether each lock on the
system is free or in use. The command will need to be run a few times
in quick succession to verify that a lock is not just transiently
held but actually stuck, but that is not a significant burden.

Signed-off-by: Matt Heon <mheon@redhat.com>
This commit is contained in:
Matt Heon
2023-06-05 15:45:59 -04:00
parent 0948c078c2
commit 4fda7936c5
14 changed files with 178 additions and 26 deletions

View File

@ -10,9 +10,9 @@ import (
var (
locksCommand = &cobra.Command{
Use: "locks",
Short: "Debug Libpod's use of locks, identifying any potential conflicts",
Args: validate.NoArgs,
Use: "locks",
Short: "Debug Libpod's use of locks, identifying any potential conflicts",
Args: validate.NoArgs,
Hidden: true,
RunE: func(cmd *cobra.Command, args []string) error {
return runLocks()
@ -24,7 +24,7 @@ var (
func init() {
registry.Commands = append(registry.Commands, registry.CliCommand{
Command: locksCommand,
Parent: systemCmd,
Parent: systemCmd,
})
}
func runLocks() error {
@ -41,7 +41,13 @@ func runLocks() error {
}
if len(report.LockConflicts) > 0 {
fmt.Printf("\nLock conflicts have been detected. Recommend immediate use of `podman system renumber` to resolve.\n")
fmt.Printf("\nLock conflicts have been detected. Recommend immediate use of `podman system renumber` to resolve.\n\n")
} else {
fmt.Printf("\nNo lock conflicts have been detected, system safe from deadlocks.\n\n")
}
for _, lockNum := range report.LocksHeld {
fmt.Printf("Lock %d is presently being held\n", lockNum)
}
return nil

View File

@ -1,6 +1,7 @@
package lock
import (
"github.com/containers/podman/v4/libpod/define"
"github.com/containers/podman/v4/libpod/lock/file"
)
@ -81,10 +82,19 @@ func (m *FileLockManager) FreeAllLocks() error {
// AvailableLocks returns the number of available locks. Since this is not
// limited in the file lock implementation, nil is returned.
func (locks *FileLockManager) AvailableLocks() (*uint32, error) {
func (m *FileLockManager) AvailableLocks() (*uint32, error) {
return nil, nil
}
// LocksHeld returns any locks that are presently locked.
// It is not implemented for the file lock backend: supporting it would
// require adding trylock semantics to c/storage's file locker, which has
// not been done for this uncommonly-used backend.
// Callers should treat define.ErrNotImplemented as "unsupported by this
// backend" rather than as a hard failure.
func (m *FileLockManager) LocksHeld() ([]uint32, error) {
	return nil, define.ErrNotImplemented
}
// FileLock is an individual shared memory lock.
type FileLock struct {
lockID uint32

View File

@ -129,3 +129,20 @@ func (m *InMemoryManager) AvailableLocks() (*uint32, error) {
return &count, nil
}
// LocksHeld returns the IDs of all locks that are presently being held.
// Useful for debugging deadlocks.
func (m *InMemoryManager) LocksHeld() ([]uint32, error) {
	//nolint:prealloc
	var held []uint32
	for _, l := range m.locks {
		if !l.lock.TryLock() {
			// Could not acquire the lock: someone is holding it.
			held = append(held, l.ID())
			continue
		}
		// The lock was free; release it immediately so we don't
		// perturb the system we are inspecting.
		l.lock.Unlock()
	}
	return held, nil
}

View File

@ -51,6 +51,10 @@ type Manager interface {
// allocate an unlimited number. These implementations should return
// a nil uin32.
AvailableLocks() (*uint32, error)
// Get a list of locks that are currently locked.
// This may not be supported by some drivers, depending on the exact
// backend implementation in use.
LocksHeld() ([]uint32, error)
}
// Locker is similar to sync.Locker, but provides a method for freeing the lock

View File

@ -20,12 +20,18 @@ static size_t compute_shm_size(uint32_t num_bitmaps) {
// Handles exceptional conditions, including a mutex locked by a process that
// died holding it.
// Returns 0 on success, or positive errno on failure.
static int take_mutex(pthread_mutex_t *mutex) {
static int take_mutex(pthread_mutex_t *mutex, bool trylock) {
int ret_code;
do {
ret_code = pthread_mutex_lock(mutex);
} while(ret_code == EAGAIN);
if (!trylock) {
do {
ret_code = pthread_mutex_lock(mutex);
} while(ret_code == EAGAIN);
} else {
do {
ret_code = pthread_mutex_trylock(mutex);
} while(ret_code == EAGAIN);
}
if (ret_code == EOWNERDEAD) {
// The previous owner of the mutex died while holding it
@ -309,7 +315,7 @@ int64_t allocate_semaphore(shm_struct_t *shm) {
}
// Lock the semaphore controlling access to our shared memory
ret_code = take_mutex(&(shm->segment_lock));
ret_code = take_mutex(&(shm->segment_lock), false);
if (ret_code != 0) {
return -1 * ret_code;
}
@ -383,7 +389,7 @@ int32_t allocate_given_semaphore(shm_struct_t *shm, uint32_t sem_index) {
test_map = 0x1 << index_in_bitmap;
// Lock the mutex controlling access to our shared memory
ret_code = take_mutex(&(shm->segment_lock));
ret_code = take_mutex(&(shm->segment_lock), false);
if (ret_code != 0) {
return -1 * ret_code;
}
@ -436,7 +442,7 @@ int32_t deallocate_semaphore(shm_struct_t *shm, uint32_t sem_index) {
test_map = 0x1 << index_in_bitmap;
// Lock the mutex controlling access to our shared memory
ret_code = take_mutex(&(shm->segment_lock));
ret_code = take_mutex(&(shm->segment_lock), false);
if (ret_code != 0) {
return -1 * ret_code;
}
@ -475,7 +481,7 @@ int32_t deallocate_all_semaphores(shm_struct_t *shm) {
}
// Lock the mutex controlling access to our shared memory
ret_code = take_mutex(&(shm->segment_lock));
ret_code = take_mutex(&(shm->segment_lock), false);
if (ret_code != 0) {
return -1 * ret_code;
}
@ -513,7 +519,7 @@ int32_t lock_semaphore(shm_struct_t *shm, uint32_t sem_index) {
bitmap_index = sem_index / BITMAP_SIZE;
index_in_bitmap = sem_index % BITMAP_SIZE;
return -1 * take_mutex(&(shm->locks[bitmap_index].locks[index_in_bitmap]));
return -1 * take_mutex(&(shm->locks[bitmap_index].locks[index_in_bitmap]), false);
}
// Unlock a given semaphore
@ -553,7 +559,7 @@ int64_t available_locks(shm_struct_t *shm) {
// Lock the semaphore controlling access to the SHM segment.
// This isn't strictly necessary as we're only reading, but it seems safer.
ret_code = take_mutex(&(shm->segment_lock));
ret_code = take_mutex(&(shm->segment_lock), false);
if (ret_code != 0) {
return -1 * ret_code;
}
@ -587,3 +593,48 @@ int64_t available_locks(shm_struct_t *shm) {
// Return free lock count.
return free_locks;
}
// Probe whether a given semaphore is currently held, by attempting to take
// it; if the take succeeds, the semaphore is released again before returning.
// Used to detect potential deadlocks, where a lock has not been released for
// an extended period of time.
// Note that this is NOT POSIX trylock semantics: a successfully-taken lock is
// immediately released, so the caller never owns it.
// Returns negative errno on failure.
// On success, returns 1 if the lock could be taken (i.e. it was free), and 0
// if it could not (i.e. it is held).
int32_t try_lock(shm_struct_t *shm, uint32_t sem_index) {
  pthread_mutex_t *sem;
  int rc;

  // Validate arguments before touching shared memory.
  if (shm == NULL || sem_index >= shm->num_locks) {
    return -1 * EINVAL;
  }

  sem = &(shm->locks[sem_index / BITMAP_SIZE].locks[sem_index % BITMAP_SIZE]);

  rc = take_mutex(sem, true);
  if (rc == EBUSY) {
    // Lock is held by someone else.
    return 0;
  }
  if (rc != 0) {
    // Some unrelated error from the trylock attempt.
    return -1 * rc;
  }

  // We took the lock, so it was free; give it straight back.
  rc = release_mutex(sem);
  if (rc != 0) {
    return -1 * rc;
  }

  return 1;
}

View File

@ -281,6 +281,31 @@ func (locks *SHMLocks) GetFreeLocks() (uint32, error) {
return uint32(retCode), nil
}
// GetTakenLocks returns the indexes of all locks in the SHM segment that are
// currently held by some process.
func (locks *SHMLocks) GetTakenLocks() ([]uint32, error) {
	if !locks.valid {
		return nil, fmt.Errorf("locks have already been closed: %w", syscall.EINVAL)
	}

	// No need to lock the OS thread here: try_lock releases any lock it
	// manages to take before returning, so nothing is held across the CGo
	// call boundary, and Go shouldn't reschedule the CGo call to another
	// thread before the function finishes executing.
	var taken []uint32
	for i := uint32(0); i < locks.maxLocks; i++ {
		rc := C.try_lock(locks.lockStruct, C.uint32_t(i))
		switch {
		case rc < 0:
			return nil, syscall.Errno(-1 * rc)
		case rc == 0:
			taken = append(taken, i)
		}
	}
	return taken, nil
}
func unlinkSHMLock(path string) error {
cPath := C.CString(path)
defer C.free(unsafe.Pointer(cPath))

View File

@ -42,5 +42,6 @@ int32_t deallocate_all_semaphores(shm_struct_t *shm);
int32_t lock_semaphore(shm_struct_t *shm, uint32_t sem_index);
int32_t unlock_semaphore(shm_struct_t *shm, uint32_t sem_index);
int64_t available_locks(shm_struct_t *shm);
int32_t try_lock(shm_struct_t *shm, uint32_t sem_index);
#endif

View File

@ -101,3 +101,15 @@ func (locks *SHMLocks) UnlockSemaphore(sem uint32) error {
logrus.Error("Locks are not supported without cgo")
return nil
}
// GetFreeLocks gets the number of locks available to be allocated.
// Not supported without cgo: logs an error and always returns 0.
func (locks *SHMLocks) GetFreeLocks() (uint32, error) {
	logrus.Error("Locks are not supported without cgo")
	return 0, nil
}
// GetTakenLocks returns a list of locks that are currently taken.
// Not supported without cgo: logs an error and always returns nil.
func (locks *SHMLocks) GetTakenLocks() ([]uint32, error) {
	logrus.Error("Locks are not supported without cgo")
	return nil, nil
}

View File

@ -108,6 +108,10 @@ func (m *SHMLockManager) AvailableLocks() (*uint32, error) {
return &avail, nil
}
// LocksHeld returns the IDs of all locks that are currently taken, by
// delegating to the underlying SHM lock struct.
func (m *SHMLockManager) LocksHeld() ([]uint32, error) {
	return m.locks.GetTakenLocks()
}
// SHMLock is an individual shared memory lock.
type SHMLock struct {
lockID uint32

View File

@ -33,3 +33,13 @@ func (m *SHMLockManager) RetrieveLock(id string) (Locker, error) {
func (m *SHMLockManager) FreeAllLocks() error {
return fmt.Errorf("not supported")
}
// AvailableLocks is not supported on this platform and always returns a
// generic "not supported" error.
func (m *SHMLockManager) AvailableLocks() (*uint32, error) {
	return nil, fmt.Errorf("not supported")
}
// LocksHeld is not supported on this platform.
// NOTE(review): this returns a generic error rather than
// define.ErrNotImplemented; callers that check
// errors.Is(err, define.ErrNotImplemented) (e.g. Runtime.LockConflicts)
// will treat this platform as a hard failure instead of a soft
// "unsupported" like the file backend — confirm this is intended.
func (m *SHMLockManager) LocksHeld() ([]uint32, error) {
	return nil, fmt.Errorf("not supported")
}

View File

@ -1189,19 +1189,19 @@ func (r *Runtime) SetRemoteURI(uri string) {
r.config.Engine.RemoteURI = uri
}
// Get information on potential lock conflicts.
// Returns a map of lock number to object(s) using the lock, formatted as
// "container <id>" or "volume <id>" or "pod <id>".
// "container <id>" or "volume <id>" or "pod <id>", and an array of locks that
// are currently being held, formatted as []uint32.
// If the map returned is not empty, you should immediately renumber locks on
// the runtime, because you have a deadlock waiting to happen.
func (r *Runtime) LockConflicts() (map[uint32][]string, error) {
func (r *Runtime) LockConflicts() (map[uint32][]string, []uint32, error) {
// Make an internal map to store what lock is associated with what
locksInUse := make(map[uint32][]string)
ctrs, err := r.state.AllContainers(false)
if err != nil {
return nil, err
return nil, nil, err
}
for _, ctr := range ctrs {
lockNum := ctr.lock.ID()
@ -1216,7 +1216,7 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, error) {
pods, err := r.state.AllPods()
if err != nil {
return nil, err
return nil, nil, err
}
for _, pod := range pods {
lockNum := pod.lock.ID()
@ -1231,7 +1231,7 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, error) {
volumes, err := r.state.AllVolumes()
if err != nil {
return nil, err
return nil, nil, err
}
for _, vol := range volumes {
lockNum := vol.lock.ID()
@ -1256,5 +1256,15 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, error) {
}
}
return toReturn, nil
locksHeld, err := r.lockManager.LocksHeld()
if err != nil {
if errors.Is(err, define.ErrNotImplemented) {
logrus.Warnf("Could not retrieve currently taken locks as the lock backend does not support this operation")
return toReturn, []uint32{}, nil
}
return nil, nil, err
}
return toReturn, locksHeld, nil
}

View File

@ -125,4 +125,5 @@ type AuthReport struct {
// lead to deadlocks.
type LocksReport struct {
LockConflicts map[uint32][]string
LocksHeld []uint32
}

View File

@ -432,10 +432,11 @@ func (ic ContainerEngine) Version(ctx context.Context) (*entities.SystemVersionR
func (ic ContainerEngine) Locks(ctx context.Context) (*entities.LocksReport, error) {
var report entities.LocksReport
conflicts, err := ic.Libpod.LockConflicts()
conflicts, held, err := ic.Libpod.LockConflicts()
if err != nil {
return nil, err
}
report.LockConflicts = conflicts
report.LocksHeld = held
return &report, nil
}

View File

@ -185,7 +185,7 @@ var _ = Describe("Podman Info", func() {
info1.WaitWithDefaultTimeout()
Expect(info1).To(Exit(0))
free1, err := strconv.Atoi(info1.OutputToString())
Expect(err).To(BeNil())
Expect(err).To(Not(HaveOccurred()))
ctr := podmanTest.Podman([]string{"create", ALPINE, "top"})
ctr.WaitWithDefaultTimeout()
@ -195,7 +195,7 @@ var _ = Describe("Podman Info", func() {
info2.WaitWithDefaultTimeout()
Expect(info2).To(Exit(0))
free2, err := strconv.Atoi(info2.OutputToString())
Expect(err).To(BeNil())
Expect(err).To(Not(HaveOccurred()))
Expect(free1).To(Equal(free2 + 1))
})