Add a net health recovery service to Qemu machines

There is a network stability issue in qemu + virtio, affecting
some users after long periods of usage, which can lead to
suspended queue delivery. Until the issue is resolved, add a
temporary recovery service which restarts networking when host
communication becomes inoperable.

[NO NEW TESTS NEEDED]

Signed-off-by: Jason T. Greene <jason.greene@redhat.com>
This commit is contained in:
Jason T. Greene
2024-01-15 17:53:30 -06:00
parent f1ea4fbb3d
commit 79fad91dbb
6 changed files with 100 additions and 19 deletions

View File

@ -53,15 +53,16 @@ func GetNodeGrp(grpName string) NodeGroup {
}
type DynamicIgnition struct {
Name string
Key string
TimeZone string
UID int
VMName string
VMType define.VMType
WritePath string
Cfg Config
Rootful bool
Name string
Key string
TimeZone string
UID int
VMName string
VMType define.VMType
WritePath string
Cfg Config
Rootful bool
NetRecover bool
}
func (ign *DynamicIgnition) Write() error {
@ -97,7 +98,7 @@ func (ign *DynamicIgnition) GenerateIgnitionConfig() error {
ignStorage := Storage{
Directories: getDirs(ign.Name),
Files: getFiles(ign.Name, ign.UID, ign.Rootful, ign.VMType),
Files: getFiles(ign.Name, ign.UID, ign.Rootful, ign.VMType, ign.NetRecover),
Links: getLinks(ign.Name),
}
@ -231,6 +232,21 @@ func (ign *DynamicIgnition) GenerateIgnitionConfig() error {
}
ignSystemd.Units = append(ignSystemd.Units, qemuUnit)
}
if ign.NetRecover {
contents, err := GetNetRecoveryUnitFile().ToString()
if err != nil {
return err
}
recoveryUnit := Unit{
Enabled: BoolToPtr(true),
Name: "net-health-recovery.service",
Contents: &contents,
}
ignSystemd.Units = append(ignSystemd.Units, recoveryUnit)
}
// Only after all checks are done
// is it ready to create the ignConfig
ign.Cfg = Config{
@ -303,7 +319,7 @@ func getDirs(usrName string) []Directory {
return dirs
}
func getFiles(usrName string, uid int, rootful bool, vmtype define.VMType) []File {
func getFiles(usrName string, uid int, rootful bool, vmtype define.VMType, netRecover bool) []File {
files := make([]File, 0)
lingerExample := parser.NewUnitFile()
@ -574,6 +590,23 @@ Delegate=memory pids cpu io
},
})
// Only necessary for qemu on mac
if netRecover {
files = append(files, File{
Node: Node{
User: GetNodeUsr("root"),
Group: GetNodeGrp("root"),
Path: "/usr/local/bin/net-health-recovery.sh",
},
FileEmbedded1: FileEmbedded1{
Mode: IntToPtr(0755),
Contents: Resource{
Source: EncodeDataURLPtr(GetNetRecoveryFile()),
},
},
})
}
return files
}
@ -743,6 +776,37 @@ func (i *IgnitionBuilder) Build() error {
return i.dynamicIgnition.Write()
}
// GetNetRecoveryFile returns the bash script that backs the network health
// recovery service.
//
// The script waits 120 seconds after start, then loops forever: every 30
// seconds it requests http://192.168.127.1/health with curl (30 second
// timeout) and, when curl exits non-zero, logs a message and takes the
// enp0s1 interface down and back up.
func GetNetRecoveryFile() string {
	// Shebang plus the explanatory header comments of the script.
	const preamble = `#!/bin/bash
# Verify network health, and bounce the network device if host connectivity
# is lost. This is a temporary workaround for a known rare qemu/virtio issue
# that affects some systems
`
	// Initial grace period followed by the endless probe-and-bounce loop.
	const watchLoop = `sleep 120 # allow time for network setup on initial boot
while true; do
sleep 30
curl -s -o /dev/null --max-time 30 http://192.168.127.1/health
if [ "$?" != "0" ]; then
echo "bouncing nic due to loss of connectivity with host"
ifconfig enp0s1 down; ifconfig enp0s1 up
fi
done
`
	return preamble + watchLoop
}
// GetNetRecoveryUnitFile builds the systemd unit definition for the network
// health recovery service.
//
// The unit is ordered after sshd.socket and sshd.service, runs
// /usr/local/bin/net-health-recovery.sh, sends stdout and stderr to the
// journal, reads stdin from null, and is wanted by default.target.
func GetNetRecoveryUnitFile() *parser.UnitFile {
	// Entries are applied in the listed order, one Add call per row.
	entries := [...]struct {
		group, key, value string
	}{
		{"Unit", "Description", "Verifies health of network and recovers if necessary"},
		{"Unit", "After", "sshd.socket sshd.service"},
		{"Service", "ExecStart", "/usr/local/bin/net-health-recovery.sh"},
		{"Service", "StandardOutput", "journal"},
		{"Service", "StandardError", "journal"},
		{"Service", "StandardInput", "null"},
		{"Install", "WantedBy", "default.target"},
	}

	unit := parser.NewUnitFile()
	for _, e := range entries {
		unit.Add(e.group, e.key, e.value)
	}
	return unit
}
func DefaultReadyUnitFile() parser.UnitFile {
u := parser.NewUnitFile()
u.Add("Unit", "After", "remove-moby.service sshd.socket sshd.service")

View File

@ -194,14 +194,15 @@ func (v *MachineVM) Init(opts machine.InitOptions) (bool, error) {
}
builder := ignition.NewIgnitionBuilder(ignition.DynamicIgnition{
Name: opts.Username,
Key: key,
VMName: v.Name,
VMType: define.QemuVirt,
TimeZone: opts.TimeZone,
WritePath: v.getIgnitionFile(),
UID: v.UID,
Rootful: v.Rootful,
Name: opts.Username,
Key: key,
VMName: v.Name,
VMType: define.QemuVirt,
TimeZone: opts.TimeZone,
WritePath: v.getIgnitionFile(),
UID: v.UID,
Rootful: v.Rootful,
NetRecover: useNetworkRecover(),
})
// If the user provides an ignition file, we need to

View File

@ -11,3 +11,7 @@ func getRuntimeDir() (string, error) {
}
return tmpDir, nil
}
// useNetworkRecover reports whether the network health recovery service
// should be requested when building the machine's ignition config. This
// build variant always requests it.
// NOTE(review): presumably this is the macOS variant — confirm against the
// file's build constraints.
func useNetworkRecover() bool {
	const recoveryEnabled = true
	return recoveryEnabled
}

View File

@ -11,3 +11,7 @@ func getRuntimeDir() (string, error) {
}
return tmpDir, nil
}
// useNetworkRecover reports whether the network health recovery service
// should be requested when building the machine's ignition config. This
// build variant never requests it.
func useNetworkRecover() bool {
	const recoveryEnabled = false
	return recoveryEnabled
}

View File

@ -11,3 +11,7 @@ func getRuntimeDir() (string, error) {
}
return util.GetRootlessRuntimeDir()
}
// useNetworkRecover reports whether the network health recovery service
// should be requested when building the machine's ignition config. This
// build variant never requests it.
func useNetworkRecover() bool {
	const recoveryEnabled = false
	return recoveryEnabled
}

View File

@ -11,3 +11,7 @@ func getRuntimeDir() (string, error) {
}
return tmpDir, nil
}
// useNetworkRecover reports whether the network health recovery service
// should be requested when building the machine's ignition config. This
// build variant never requests it.
func useNetworkRecover() bool {
	const recoveryEnabled = false
	return recoveryEnabled
}