registry: lock start attempts

When running in parallel, multiple tests could be trying to start
the registry at once. Make this parallel-safe.
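
The diff below does this with mkdir(1) as an atomic mutex plus an "OK"
flag file. A minimal standalone sketch of that pattern (paths and the
placeholder echo here are illustrative, not the real helper code):

    #!/usr/bin/env bash
    # mkdir is atomic: whichever process creates the directory first
    # "owns" registry startup; everyone else waits for the winner's flag.
    workdir=${WORKDIR:-/tmp/registry-lock-demo}   # illustrative path
    lockdir=$workdir/auth    # creating this directory is the lock
    flag=$workdir/OK         # winner touches this once the registry is up

    mkdir -p "$workdir"
    if mkdir "$lockdir" 2>/dev/null; then
        echo "lock won: start the registry here"  # placeholder for real startup
        touch "$flag"
    else
        # Lost the race: wait (bounded) for the winner to finish starting it.
        for _ in $(seq 1 30); do
            if [[ -e $flag ]]; then
                echo "registry already started by another process"
                exit 0
            fi
            sleep 1
        done
        echo "timed out waiting for registry startup" >&2
        exit 1
    fi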

Also, use a safer port range for the registry: one outside of
/proc/sys/net/ipv4/ip_local_port_range.
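
That file holds the kernel's ephemeral-port range (the source ports handed
out for outgoing connections, typically 32768-60999), so a fixed listening
port below it will never be handed out as a random source port for some
other process's connection. The old 42000-42999 range sits inside that
span; 27000-27999 does not. A quick check, assuming the 27000-27999 range
used later in this commit:

    #!/usr/bin/env bash
    # Confirm a candidate registry port range sits entirely outside the
    # kernel's ephemeral-port range.
    read -r eph_lo eph_hi < /proc/sys/net/ipv4/ip_local_port_range
    want_lo=27000 want_hi=27999

    if (( want_hi < eph_lo || want_lo > eph_hi )); then
        echo "ok: $want_lo-$want_hi is outside ephemeral range $eph_lo-$eph_hi"
    else
        echo "warning: $want_lo-$want_hi overlaps ephemeral range $eph_lo-$eph_hi" >&2
        exit 1
    fi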

Sorry, I'm including a FIXME section that I haven't investigated
deeply enough.

Signed-off-by: Ed Santiago <santiago@redhat.com>
Ed Santiago, 2024-09-17 10:06:34 -06:00
commit 5fc3de5583 (parent bf6131780a)
2 changed files with 33 additions and 8 deletions

@@ -17,8 +17,26 @@ unset REGISTRY_AUTH_FILE
 # Start a local registry. Only needed on demand (e.g. by 150-login.bats)
 # and then only once: if we start, leave it running until final teardown.
 function start_registry() {
-    if [[ -d "$PODMAN_LOGIN_WORKDIR/auth" ]]; then
-        # Already started
+    AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth
+
+    local startflag=${PODMAN_LOGIN_WORKDIR}/OK
+    if ! mkdir $AUTHDIR; then
+        # *Possibly* already started. Or, possibly (when running
+        # parallel tests) another process is trying to start it.
+        # Give it some time.
+        local timeout=30
+        while [[ $timeout -gt 0 ]]; do
+            if [[ -e $startflag ]]; then
+                echo "Registry has already been started by another process"
+                return
+            fi
+            sleep 1
+            timeout=$((timeout - 1))
+        done
+
+        die "Internal error: timed out waiting for another process to start registry"
 
         # Fixes very obscure corner case in root system tests:
         # 1) we run 150-login tests, starting a registry; then
@@ -26,11 +44,15 @@ function start_registry() {
         # 3) run 700-play, the "private" test, which needs the
         #    already-started registry, but its port is now DROPped,
         #    so the test times out trying to talk to registry
-        run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all
+        ###### FIXME FIXME FIXME TEMPORARY!
+        ###### Trying to understand flake #23725. What happens if we stop
+        ###### doing the network reload?
+        ###### FIXME FIXME FIXME, should we do it in stop_registry??
+        ###### run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all
         return
     fi
-    AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth
-    mkdir -p $AUTHDIR
 
     # Registry image; copy of docker.io, but on our own registry
@@ -79,6 +101,9 @@ function start_registry() {
     wait_for_port 127.0.0.1 ${PODMAN_LOGIN_REGISTRY_PORT}
     # ...so we look in container logs for confirmation that registry is running.
     _PODMAN_TEST_OPTS="${PODMAN_LOGIN_ARGS}" wait_for_output "listening on .::.:5000" $cid
+
+    touch $startflag
+    echo "I have started the registry"
 }
 
 function stop_registry() {
@@ -103,10 +128,10 @@ function stop_registry() {
         mount | grep ${PODMAN_LOGIN_WORKDIR} | awk '{print $3}' | xargs --no-run-if-empty umount
         if [[ $(id -u) -eq 0 ]]; then
-            rm -rf ${PODMAN_LOGIN_WORKDIR}
+            rm -rf ${PODMAN_LOGIN_WORKDIR}/*
         else
             # rootless image data is owned by a subuid
-            run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR}
+            run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR}/*
         fi
     fi
@@ -119,7 +144,7 @@ function stop_registry() {
         echo ""
         echo "lsof -i -P"
         lsof -i -P
-        die "Socket still seems open"
+        die "Socket $PODMAN_LOGIN_REGISTRY_PORT still seems open"
     fi
 }

@@ -25,7 +25,7 @@ function setup_suite() {
     # FIXME: racy! It could be many minutes between now and when we start it.
     # To mitigate, we use a range not used anywhere else in system tests.
-    export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 42000-42999)
+    export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 27000-27999)
     # The above does not handle errors. Do a final confirmation.
     assert "$PODMAN_LOGIN_REGISTRY_PORT" != "" \