diff --git a/cmd/podman/system/locks.go b/cmd/podman/system/locks.go
index 0b03bc0fbe..e1501c9863 100644
--- a/cmd/podman/system/locks.go
+++ b/cmd/podman/system/locks.go
@@ -10,9 +10,9 @@ import (
 var (
 	locksCommand = &cobra.Command{
-		Use:   "locks",
-		Short: "Debug Libpod's use of locks, identifying any potential conflicts",
-		Args:  validate.NoArgs,
+		Use:    "locks",
+		Short:  "Debug Libpod's use of locks, identifying any potential conflicts",
+		Args:   validate.NoArgs,
 		Hidden: true,
 		RunE: func(cmd *cobra.Command, args []string) error {
 			return runLocks()
 		},
@@ -24,7 +24,7 @@ var (
 func init() {
 	registry.Commands = append(registry.Commands, registry.CliCommand{
 		Command: locksCommand,
-		Parent: systemCmd,
+		Parent:  systemCmd,
 	})
 }
 func runLocks() error {
@@ -41,7 +41,13 @@ func runLocks() error {
 	}
 
 	if len(report.LockConflicts) > 0 {
-		fmt.Printf("\nLock conflicts have been detected. Recommend immediate use of `podman system renumber` to resolve.\n")
+		fmt.Printf("\nLock conflicts have been detected. Recommend immediate use of `podman system renumber` to resolve.\n\n")
+	} else {
+		fmt.Printf("\nNo lock conflicts have been detected, system safe from deadlocks.\n\n")
+	}
+
+	for _, lockNum := range report.LocksHeld {
+		fmt.Printf("Lock %d is presently being held\n", lockNum)
 	}
 
 	return nil
diff --git a/libpod/lock/file_lock_manager.go b/libpod/lock/file_lock_manager.go
index e2fabc66a6..4a82e687cc 100644
--- a/libpod/lock/file_lock_manager.go
+++ b/libpod/lock/file_lock_manager.go
@@ -1,6 +1,7 @@
 package lock
 
 import (
+	"github.com/containers/podman/v4/libpod/define"
 	"github.com/containers/podman/v4/libpod/lock/file"
 )
 
@@ -81,10 +82,19 @@ func (m *FileLockManager) FreeAllLocks() error {
 
 // AvailableLocks returns the number of available locks. Since this is not
 // limited in the file lock implementation, nil is returned.
-func (locks *FileLockManager) AvailableLocks() (*uint32, error) {
+func (m *FileLockManager) AvailableLocks() (*uint32, error) {
 	return nil, nil
 }
 
+// LocksHeld returns any locks that are presently locked.
+// It is not implemented for the file lock backend: supporting it would
+// require adding trylock semantics to the c/storage file locker
+// implementation, which has not been done given how uncommonly this lock
+// backend is used.
+func (m *FileLockManager) LocksHeld() ([]uint32, error) {
+	return nil, define.ErrNotImplemented
+}
+
 // FileLock is an individual shared memory lock.
 type FileLock struct {
 	lockID uint32
diff --git a/libpod/lock/in_memory_locks.go b/libpod/lock/in_memory_locks.go
index a482392eee..e62b25ac15 100644
--- a/libpod/lock/in_memory_locks.go
+++ b/libpod/lock/in_memory_locks.go
@@ -129,3 +129,20 @@ func (m *InMemoryManager) AvailableLocks() (*uint32, error) {
 
 	return &count, nil
 }
+
+// LocksHeld returns any locks that are presently being held.
+// Useful for debugging deadlocks.
+func (m *InMemoryManager) LocksHeld() ([]uint32, error) {
+	//nolint:prealloc
+	var locks []uint32
+
+	for _, lock := range m.locks {
+		if lock.lock.TryLock() {
+			lock.lock.Unlock()
+			continue
+		}
+		locks = append(locks, lock.ID())
+	}
+
+	return locks, nil
+}
diff --git a/libpod/lock/lock.go b/libpod/lock/lock.go
index be4829e0cb..22f3d20317 100644
--- a/libpod/lock/lock.go
+++ b/libpod/lock/lock.go
@@ -51,6 +51,10 @@ type Manager interface {
 	// allocate an unlimited number. These implementations should return
 	// a nil uin32.
 	AvailableLocks() (*uint32, error)
+	// LocksHeld returns a list of lock IDs that are currently locked.
+	// This may not be supported by some drivers, depending on the exact
+	// backend implementation in use.
+	LocksHeld() ([]uint32, error)
 }
 
 // Locker is similar to sync.Locker, but provides a method for freeing the lock
diff --git a/libpod/lock/shm/shm_lock.c b/libpod/lock/shm/shm_lock.c
index 02e5f6700a..dc98ce91fd 100644
--- a/libpod/lock/shm/shm_lock.c
+++ b/libpod/lock/shm/shm_lock.c
@@ -20,12 +20,18 @@ static size_t compute_shm_size(uint32_t num_bitmaps) {
 // Handles exceptional conditions, including a mutex locked by a process that
 // died holding it.
 // Returns 0 on success, or positive errno on failure.
-static int take_mutex(pthread_mutex_t *mutex) {
+static int take_mutex(pthread_mutex_t *mutex, bool trylock) {
   int ret_code;
 
-  do {
-    ret_code = pthread_mutex_lock(mutex);
-  } while(ret_code == EAGAIN);
+  if (!trylock) {
+    do {
+      ret_code = pthread_mutex_lock(mutex);
+    } while(ret_code == EAGAIN);
+  } else {
+    do {
+      ret_code = pthread_mutex_trylock(mutex);
+    } while(ret_code == EAGAIN);
+  }
 
   if (ret_code == EOWNERDEAD) {
     // The previous owner of the mutex died while holding it
@@ -309,7 +315,7 @@ int64_t allocate_semaphore(shm_struct_t *shm) {
   }
 
   // Lock the semaphore controlling access to our shared memory
-  ret_code = take_mutex(&(shm->segment_lock));
+  ret_code = take_mutex(&(shm->segment_lock), false);
   if (ret_code != 0) {
     return -1 * ret_code;
   }
@@ -383,7 +389,7 @@ int32_t allocate_given_semaphore(shm_struct_t *shm, uint32_t sem_index) {
   test_map = 0x1 << index_in_bitmap;
 
   // Lock the mutex controlling access to our shared memory
-  ret_code = take_mutex(&(shm->segment_lock));
+  ret_code = take_mutex(&(shm->segment_lock), false);
   if (ret_code != 0) {
     return -1 * ret_code;
   }
@@ -436,7 +442,7 @@ int32_t deallocate_semaphore(shm_struct_t *shm, uint32_t sem_index) {
   test_map = 0x1 << index_in_bitmap;
 
   // Lock the mutex controlling access to our shared memory
-  ret_code = take_mutex(&(shm->segment_lock));
+  ret_code = take_mutex(&(shm->segment_lock), false);
   if (ret_code != 0) {
     return -1 * ret_code;
   }
@@ -475,7 +481,7 @@ int32_t deallocate_all_semaphores(shm_struct_t *shm) {
   }
 
   // Lock the mutex controlling access to our shared memory
-  ret_code = take_mutex(&(shm->segment_lock));
+  ret_code = take_mutex(&(shm->segment_lock), false);
   if (ret_code != 0) {
     return -1 * ret_code;
   }
@@ -513,7 +519,7 @@ int32_t lock_semaphore(shm_struct_t *shm, uint32_t sem_index) {
   bitmap_index = sem_index / BITMAP_SIZE;
   index_in_bitmap = sem_index % BITMAP_SIZE;
 
-  return -1 * take_mutex(&(shm->locks[bitmap_index].locks[index_in_bitmap]));
+  return -1 * take_mutex(&(shm->locks[bitmap_index].locks[index_in_bitmap]), false);
 }
 
 // Unlock a given semaphore
@@ -553,7 +559,7 @@ int64_t available_locks(shm_struct_t *shm) {
 
   // Lock the semaphore controlling access to the SHM segment.
   // This isn't strictly necessary as we're only reading, but it seems safer.
-  ret_code = take_mutex(&(shm->segment_lock));
+  ret_code = take_mutex(&(shm->segment_lock), false);
   if (ret_code != 0) {
     return -1 * ret_code;
   }
@@ -587,3 +593,48 @@ int64_t available_locks(shm_struct_t *shm) {
   // Return free lock count.
   return free_locks;
 }
+
+// Attempt to take a given semaphore. If successfully taken, it is immediately
+// released before the function returns.
+// Used to check if a semaphore is in use, to detect potential deadlocks where a
+// lock has not been released for an extended period of time.
+// Note that this is NOT POSIX trylock, as the lock is immediately released if
+// taken.
+// Returns negative errno on failure.
+// On success, returns 1 if the lock was successfully taken, and 0 if it was
+// not.
+int32_t try_lock(shm_struct_t *shm, uint32_t sem_index) {
+  int bitmap_index, index_in_bitmap, ret_code;
+  pthread_mutex_t *mutex;
+
+  if (shm == NULL) {
+    return -1 * EINVAL;
+  }
+
+  if (sem_index >= shm->num_locks) {
+    return -1 * EINVAL;
+  }
+
+  bitmap_index = sem_index / BITMAP_SIZE;
+  index_in_bitmap = sem_index % BITMAP_SIZE;
+
+  mutex = &(shm->locks[bitmap_index].locks[index_in_bitmap]);
+
+  ret_code = take_mutex(mutex, true);
+
+  if (ret_code == EBUSY) {
+    // Did not successfully take the lock
+    return 0;
+  } else if (ret_code != 0) {
+    // Another, unrelated error
+    return -1 * ret_code;
+  }
+
+  // Lock taken successfully, unlock and return.
+  ret_code = release_mutex(mutex);
+  if (ret_code != 0) {
+    return -1 * ret_code;
+  }
+
+  return 1;
+}
diff --git a/libpod/lock/shm/shm_lock.go b/libpod/lock/shm/shm_lock.go
index c7e38a3d78..239d0e47a7 100644
--- a/libpod/lock/shm/shm_lock.go
+++ b/libpod/lock/shm/shm_lock.go
@@ -281,6 +281,31 @@ func (locks *SHMLocks) GetFreeLocks() (uint32, error) {
 	return uint32(retCode), nil
 }
 
+// GetTakenLocks returns a list of locks that are currently taken.
+func (locks *SHMLocks) GetTakenLocks() ([]uint32, error) {
+	if !locks.valid {
+		return nil, fmt.Errorf("locks have already been closed: %w", syscall.EINVAL)
+	}
+
+	var usedLocks []uint32
+
+	// We shouldn't need to lock the OS thread here: the lock (if taken) is
+	// immediately released, and Go shouldn't reschedule the cgo call to
+	// another thread before the function finishes executing.
+	var i uint32
+	for i = 0; i < locks.maxLocks; i++ {
+		retCode := C.try_lock(locks.lockStruct, C.uint32_t(i))
+		if retCode < 0 {
+			return nil, syscall.Errno(-1 * retCode)
+		}
+		if retCode == 0 {
+			usedLocks = append(usedLocks, i)
+		}
+	}
+
+	return usedLocks, nil
+}
+
 func unlinkSHMLock(path string) error {
 	cPath := C.CString(path)
 	defer C.free(unsafe.Pointer(cPath))
diff --git a/libpod/lock/shm/shm_lock.h b/libpod/lock/shm/shm_lock.h
index ebecae218b..a09cc81e2c 100644
--- a/libpod/lock/shm/shm_lock.h
+++ b/libpod/lock/shm/shm_lock.h
@@ -42,5 +42,6 @@ int32_t deallocate_all_semaphores(shm_struct_t *shm);
 int32_t lock_semaphore(shm_struct_t *shm, uint32_t sem_index);
 int32_t unlock_semaphore(shm_struct_t *shm, uint32_t sem_index);
 int64_t available_locks(shm_struct_t *shm);
+int32_t try_lock(shm_struct_t *shm, uint32_t sem_index);
 
 #endif
diff --git a/libpod/lock/shm/shm_lock_nocgo.go b/libpod/lock/shm/shm_lock_nocgo.go
index 31fc022235..7e0ccd61f0 100644
--- a/libpod/lock/shm/shm_lock_nocgo.go
+++ b/libpod/lock/shm/shm_lock_nocgo.go
@@ -101,3 +101,15 @@ func (locks *SHMLocks) UnlockSemaphore(sem uint32) error {
 	logrus.Error("Locks are not supported without cgo")
 	return nil
 }
+
+// GetFreeLocks gets the number of locks available to be allocated.
+func (locks *SHMLocks) GetFreeLocks() (uint32, error) {
+	logrus.Error("Locks are not supported without cgo")
+	return 0, nil
+}
+
+// GetTakenLocks returns a list of locks that are currently taken.
+func (locks *SHMLocks) GetTakenLocks() ([]uint32, error) {
+	logrus.Error("Locks are not supported without cgo")
+	return nil, nil
+}
diff --git a/libpod/lock/shm_lock_manager_linux.go b/libpod/lock/shm_lock_manager_linux.go
index 9d618009fa..344183eb59 100644
--- a/libpod/lock/shm_lock_manager_linux.go
+++ b/libpod/lock/shm_lock_manager_linux.go
@@ -108,6 +108,11 @@ func (m *SHMLockManager) AvailableLocks() (*uint32, error) {
 	return &avail, nil
 }
 
+// LocksHeld returns a list of locks that are currently being held.
+func (m *SHMLockManager) LocksHeld() ([]uint32, error) {
+	return m.locks.GetTakenLocks()
+}
+
 // SHMLock is an individual shared memory lock.
 type SHMLock struct {
 	lockID uint32
diff --git a/libpod/lock/shm_lock_manager_unsupported.go b/libpod/lock/shm_lock_manager_unsupported.go
index d578359aba..9ca01d57dd 100644
--- a/libpod/lock/shm_lock_manager_unsupported.go
+++ b/libpod/lock/shm_lock_manager_unsupported.go
@@ -33,3 +33,13 @@ func (m *SHMLockManager) RetrieveLock(id string) (Locker, error) {
 func (m *SHMLockManager) FreeAllLocks() error {
 	return fmt.Errorf("not supported")
 }
+
+// AvailableLocks is not supported on this platform
+func (m *SHMLockManager) AvailableLocks() (*uint32, error) {
+	return nil, fmt.Errorf("not supported")
+}
+
+// LocksHeld is not supported on this platform
+func (m *SHMLockManager) LocksHeld() ([]uint32, error) {
+	return nil, fmt.Errorf("not supported")
+}
diff --git a/libpod/runtime.go b/libpod/runtime.go
index f0c29c79be..6ec0a6c010 100644
--- a/libpod/runtime.go
+++ b/libpod/runtime.go
@@ -1189,19 +1189,19 @@ func (r *Runtime) SetRemoteURI(uri string) {
 	r.config.Engine.RemoteURI = uri
 }
 
-
 // Get information on potential lock conflicts.
 // Returns a map of lock number to object(s) using the lock, formatted as
-// "container <id>" or "volume <id>" or "pod <id>".
+// "container <id>" or "volume <id>" or "pod <id>", and an array of locks that
+// are currently being held, formatted as []uint32.
 // If the map returned is not empty, you should immediately renumber locks on
 // the runtime, because you have a deadlock waiting to happen.
-func (r *Runtime) LockConflicts() (map[uint32][]string, error) {
+func (r *Runtime) LockConflicts() (map[uint32][]string, []uint32, error) {
 	// Make an internal map to store what lock is associated with what
 	locksInUse := make(map[uint32][]string)
 
 	ctrs, err := r.state.AllContainers(false)
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 	for _, ctr := range ctrs {
 		lockNum := ctr.lock.ID()
@@ -1216,7 +1216,7 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, error) {
 
 	pods, err := r.state.AllPods()
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 	for _, pod := range pods {
 		lockNum := pod.lock.ID()
@@ -1231,7 +1231,7 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, error) {
 
 	volumes, err := r.state.AllVolumes()
 	if err != nil {
-		return nil, err
+		return nil, nil, err
 	}
 	for _, vol := range volumes {
 		lockNum := vol.lock.ID()
@@ -1256,5 +1256,15 @@ func (r *Runtime) LockConflicts() (map[uint32][]string, error) {
 		}
 	}
 
-	return toReturn, nil
+	locksHeld, err := r.lockManager.LocksHeld()
+	if err != nil {
+		if errors.Is(err, define.ErrNotImplemented) {
+			logrus.Warnf("Could not retrieve currently taken locks as the lock backend does not support this operation")
+			return toReturn, []uint32{}, nil
+		}
+
+		return nil, nil, err
+	}
+
+	return toReturn, locksHeld, nil
 }
diff --git a/pkg/domain/entities/system.go b/pkg/domain/entities/system.go
index 28517b1aab..473db35306 100644
--- a/pkg/domain/entities/system.go
+++ b/pkg/domain/entities/system.go
@@ -125,4 +125,5 @@ type AuthReport struct {
 // lead to deadlocks.
 type LocksReport struct {
 	LockConflicts map[uint32][]string
+	LocksHeld     []uint32
 }
diff --git a/pkg/domain/infra/abi/system.go b/pkg/domain/infra/abi/system.go
index 730d74623b..bc5a90b912 100644
--- a/pkg/domain/infra/abi/system.go
+++ b/pkg/domain/infra/abi/system.go
@@ -432,10 +432,11 @@ func (ic ContainerEngine) Version(ctx context.Context) (*entities.SystemVersionR
 func (ic ContainerEngine) Locks(ctx context.Context) (*entities.LocksReport, error) {
 	var report entities.LocksReport
-	conflicts, err := ic.Libpod.LockConflicts()
+	conflicts, held, err := ic.Libpod.LockConflicts()
 	if err != nil {
 		return nil, err
 	}
 	report.LockConflicts = conflicts
+	report.LocksHeld = held
 
 	return &report, nil
 }
diff --git a/test/e2e/info_test.go b/test/e2e/info_test.go
index e48ce2f854..b124067bde 100644
--- a/test/e2e/info_test.go
+++ b/test/e2e/info_test.go
@@ -185,7 +185,7 @@ var _ = Describe("Podman Info", func() {
 		info1.WaitWithDefaultTimeout()
 		Expect(info1).To(Exit(0))
 		free1, err := strconv.Atoi(info1.OutputToString())
-		Expect(err).To(BeNil())
+		Expect(err).To(Not(HaveOccurred()))
 
 		ctr := podmanTest.Podman([]string{"create", ALPINE, "top"})
 		ctr.WaitWithDefaultTimeout()
@@ -195,7 +195,7 @@ var _ = Describe("Podman Info", func() {
 		info2.WaitWithDefaultTimeout()
 		Expect(info2).To(Exit(0))
 		free2, err := strconv.Atoi(info2.OutputToString())
-		Expect(err).To(BeNil())
+		Expect(err).To(Not(HaveOccurred()))
 
 		Expect(free1).To(Equal(free2 + 1))
 	})
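For illustration only, not part of the patch above: a minimal sketch of how the new Manager.LocksHeld method might be exercised against the in-memory lock backend. It assumes the existing NewInMemoryManager constructor and Locker interface from libpod/lock; the lock count of 32 and the package layout of the example are arbitrary.

```go
package main

import (
	"fmt"

	"github.com/containers/podman/v4/libpod/lock"
)

func main() {
	// Assumed constructor from libpod/lock: an in-memory manager with 32 locks.
	mgr, err := lock.NewInMemoryManager(32)
	if err != nil {
		panic(err)
	}

	// Allocate a lock and hold it so LocksHeld has something to report.
	l, err := mgr.AllocateLock()
	if err != nil {
		panic(err)
	}
	l.Lock()

	// LocksHeld probes each lock with TryLock (see in_memory_locks.go above),
	// so the lock we are holding should appear in the returned slice.
	held, err := mgr.LocksHeld()
	if err != nil {
		panic(err)
	}
	fmt.Println(held) // expected to contain l.ID()

	l.Unlock()
}
```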