registry: lock start attempts

When running in parallel, multiple tests could be trying to start
the registry at once. Make this parallel-safe.
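
The diff below does this with mkdir(1) as an atomic mutex plus an "OK"
flag file. A minimal standalone sketch of that pattern (paths and the
placeholder echo here are illustrative, not the real helper code):

    #!/usr/bin/env bash
    # mkdir is atomic: whichever process creates the directory first
    # "owns" registry startup; everyone else waits for the winner's flag.
    workdir=${WORKDIR:-/tmp/registry-lock-demo}   # illustrative path
    lockdir=$workdir/auth    # creating this directory is the lock
    flag=$workdir/OK         # winner touches this once the registry is up

    mkdir -p "$workdir"
    if mkdir "$lockdir" 2>/dev/null; then
        echo "lock won: start the registry here"  # placeholder for real startup
        touch "$flag"
    else
        # Lost the race: wait (bounded) for the winner to finish starting it.
        for _ in $(seq 1 30); do
            if [[ -e $flag ]]; then
                echo "registry already started by another process"
                exit 0
            fi
            sleep 1
        done
        echo "timed out waiting for registry startup" >&2
        exit 1
    fi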

Also, use a safer port range for the registry: one outside of
/proc/sys/net/ipv4/ip_local_port_range.
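
That file holds the kernel's ephemeral-port range (the source ports handed
out for outgoing connections, typically 32768-60999), so a fixed listening
port below it will never be handed out as a random source port for some
other process's connection. The old 42000-42999 range sits inside that
span; 27000-27999 does not. A quick check, assuming the 27000-27999 range
used later in this commit:

    #!/usr/bin/env bash
    # Confirm a candidate registry port range sits entirely outside the
    # kernel's ephemeral-port range.
    read -r eph_lo eph_hi < /proc/sys/net/ipv4/ip_local_port_range
    want_lo=27000 want_hi=27999

    if (( want_hi < eph_lo || want_lo > eph_hi )); then
        echo "ok: $want_lo-$want_hi is outside ephemeral range $eph_lo-$eph_hi"
    else
        echo "warning: $want_lo-$want_hi overlaps ephemeral range $eph_lo-$eph_hi" >&2
        exit 1
    fi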

Sorry, I'm including a FIXME section that I haven't investigated
deeply enough.

Signed-off-by: Ed Santiago <santiago@redhat.com>
Ed Santiago, 2024-09-17 10:06:34 -06:00
commit 5fc3de5583 (parent bf6131780a)
2 changed files with 33 additions and 8 deletions

@@ -17,8 +17,26 @@ unset REGISTRY_AUTH_FILE
 # Start a local registry. Only needed on demand (e.g. by 150-login.bats)
 # and then only once: if we start, leave it running until final teardown.
 function start_registry() {
-    if [[ -d "$PODMAN_LOGIN_WORKDIR/auth" ]]; then
-        # Already started
+    AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth
+
+    local startflag=${PODMAN_LOGIN_WORKDIR}/OK
+    if ! mkdir $AUTHDIR; then
+        # *Possibly* already started. Or, possibly (when running
+        # parallel tests) another process is trying to start it.
+        # Give it some time.
+        local timeout=30
+        while [[ $timeout -gt 0 ]]; do
+            if [[ -e $startflag ]]; then
+                echo "Registry has already been started by another process"
+                return
+            fi
+            sleep 1
+            timeout=$((timeout - 1))
+        done
+
+        die "Internal error: timed out waiting for another process to start registry"
 
         # Fixes very obscure corner case in root system tests:
         # 1) we run 150-login tests, starting a registry; then
@@ -26,11 +44,15 @@ function start_registry() {
         # 3) run 700-play, the "private" test, which needs the
         #    already-started registry, but its port is now DROPped,
         #    so the test times out trying to talk to registry
-        run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all
+        ###### FIXME FIXME FIXME TEMPORARY!
+        ###### Trying to understand flake #23725. What happens if we stop
+        ###### doing the network reload?
+        ###### FIXME FIXME FIXME, should we do it in stop_registry??
+        ###### run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all
         return
     fi
-    AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth
-    mkdir -p $AUTHDIR
 
     # Registry image; copy of docker.io, but on our own registry
@@ -79,6 +101,9 @@ function start_registry() {
     wait_for_port 127.0.0.1 ${PODMAN_LOGIN_REGISTRY_PORT}
     # ...so we look in container logs for confirmation that registry is running.
     _PODMAN_TEST_OPTS="${PODMAN_LOGIN_ARGS}" wait_for_output "listening on .::.:5000" $cid
+
+    touch $startflag
+    echo "I have started the registry"
 }
 
 function stop_registry() {
@@ -103,10 +128,10 @@ function stop_registry() {
         mount | grep ${PODMAN_LOGIN_WORKDIR} | awk '{print $3}' | xargs --no-run-if-empty umount
         if [[ $(id -u) -eq 0 ]]; then
-            rm -rf ${PODMAN_LOGIN_WORKDIR}
+            rm -rf ${PODMAN_LOGIN_WORKDIR}/*
         else
             # rootless image data is owned by a subuid
-            run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR}
+            run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR}/*
         fi
     fi
@@ -119,7 +144,7 @@ function stop_registry() {
         echo ""
         echo "lsof -i -P"
         lsof -i -P
-        die "Socket still seems open"
+        die "Socket $PODMAN_LOGIN_REGISTRY_PORT still seems open"
     fi
 }

@@ -25,7 +25,7 @@ function setup_suite() {
     # FIXME: racy! It could be many minutes between now and when we start it.
     # To mitigate, we use a range not used anywhere else in system tests.
-    export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 42000-42999)
+    export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 27000-27999)
     # The above does not handle errors. Do a final confirmation.
     assert "$PODMAN_LOGIN_REGISTRY_PORT" != "" \