fix network DB desync after failed connect/disconnect

Networks are stored in two ways in the DB. First, a static network list
which holds all the networks with their options for the container. Second,
the network status, which holds the actual network result from netavark,
but only while the container is running.

If the container is running they must be in sync, and podman inspect has
checks to ensure that as well: it errors out if there is a desync between
the two.

As adding to the db and doing the actual network configuration are
different steps, it is possible that one worked while the other failed,
which triggers the desync. To avoid this, make the network connect/disconnect
code more robust against partial failures. When the network calls fail
we update the db again to remove/add the network back.

Fixes: https://issues.redhat.com/browse/RHEL-78037

Signed-off-by: Paul Holzinger <pholzing@redhat.com>
This commit is contained in:
Paul Holzinger
2025-04-04 14:34:39 +02:00
parent 0a0d05b3e3
commit f87ab2b7a6
2 changed files with 28 additions and 1 deletions

View File

@ -378,7 +378,7 @@ func (c *Container) NetworkDisconnect(nameOrID, netName string, force bool) erro
return err
}
_, nameExists := networks[netName]
netOpts, nameExists := networks[netName]
if !nameExists && len(networks) > 0 {
return fmt.Errorf("container %s is not connected to network %s", nameOrID, netName)
}
@ -393,12 +393,20 @@ func (c *Container) NetworkDisconnect(nameOrID, netName string, force bool) erro
return err
}
// Since we removed the network from the container db we must add it back on partial teardown errors
addContainerNetworkToDB := func() {
if err := c.runtime.state.NetworkConnect(c, netName, netOpts); err != nil {
logrus.Errorf("Failed to add network %s for container %s to DB after failed network disconnect", netName, nameOrID)
}
}
c.newNetworkEvent(events.NetworkDisconnect, netName)
if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) {
return nil
}
if c.state.NetNS == "" {
addContainerNetworkToDB()
return fmt.Errorf("unable to disconnect %s from %s: %w", nameOrID, netName, define.ErrNoNetwork)
}
@ -412,6 +420,7 @@ func (c *Container) NetworkDisconnect(nameOrID, netName string, force bool) erro
}
if err := c.runtime.teardownNetworkBackend(c.state.NetNS, opts); err != nil {
addContainerNetworkToDB()
return err
}
@ -524,11 +533,20 @@ func (c *Container) NetworkConnect(nameOrID, netName string, netOpts types.PerNe
return err
}
// Since we added the new network to the container db we must remove it again on partial setup errors
removeContainerNetworkFromDB := func() {
if err := c.runtime.state.NetworkDisconnect(c, netName); err != nil {
logrus.Errorf("Failed to remove network %s for container %s from DB after failed network connect", netName, nameOrID)
}
}
c.newNetworkEvent(events.NetworkConnect, netName)
if !c.ensureState(define.ContainerStateRunning, define.ContainerStateCreated) {
return nil
}
if c.state.NetNS == "" {
removeContainerNetworkFromDB()
return fmt.Errorf("unable to connect %s to %s: %w", nameOrID, netName, define.ErrNoNetwork)
}
@ -543,6 +561,7 @@ func (c *Container) NetworkConnect(nameOrID, netName string, netOpts types.PerNe
results, err := c.runtime.setUpNetwork(c.state.NetNS, opts)
if err != nil {
removeContainerNetworkFromDB()
return err
}
if len(results) != 1 {

View File

@ -583,6 +583,14 @@ load helpers.network
run_podman network connect $netname $background_cid
is "$output" "" "(re)connect of container with no open ports"
# connect a network with an intentional error (bad mac address)
run_podman 125 network connect --mac-address 00:00:00:00:00:00 $netname2 $cid
assert "$output" =~ "Cannot assign requested address" "mac address error"
# podman inspect must still work correctly and not error due to network desync
run_podman inspect --format '{{ range $index, $value := .NetworkSettings.Networks }}{{$index}}{{end}}' $cid
assert "$output" == "$netname" "only network1 must be connected"
# connect a second network
run_podman network connect $netname2 $cid
is "$output" "" "Output should be empty (no errors)"