machine start: qemu: wait for SSH readiness

While backing off exponentially and waiting for the machine to be fully
up and running, also make sure that SSH is ready.  The systemd
dependencies of ready.service include sshd.service, among others, but
that alone does not guarantee that clients can SSH into the machine
immediately after boot.

Other CoreOS users reported the same issue on IRC, so I feel fairly
confident in taking the pragmatic approach of verifying on the client
side that SSH works.  #17403 is quite old and there are other pressing
machine issues that need attention.

[NO NEW TESTS NEEDED]

Fixes: #17403
Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
This commit is contained in:
Valentin Rothberg
2023-07-12 15:19:05 +02:00
parent 9d9f4aaafe
commit 8c16322a84

View File

@ -599,27 +599,47 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error {
_ = v.writeConfig()
}
}
if len(v.Mounts) > 0 {
connected := false
backoff = 500 * time.Millisecond
for i := 0; i < maxBackoffs; i++ {
if i > 0 {
time.Sleep(backoff)
backoff *= 2
}
state, err := v.State(true)
if err != nil {
return err
}
if state == machine.Running && v.isListening() {
connected = true
break
}
if len(v.Mounts) == 0 {
v.waitAPIAndPrintInfo(forwardState, forwardSock, opts.NoInfo)
return nil
}
connected := false
backoff = defaultBackoff
var sshError error
for i := 0; i < maxBackoffs; i++ {
if i > 0 {
time.Sleep(backoff)
backoff *= 2
}
if !connected {
return fmt.Errorf("machine did not transition into running state")
state, err := v.State(true)
if err != nil {
return err
}
if state == machine.Running && v.isListening() {
// Also make sure that SSH is up and running. The
// ready service's dependencies don't fully make sure
// that clients can SSH into the machine immediately
// after boot.
//
// CoreOS users have reported the same observation but
// the underlying source of the issue remains unknown.
if sshError = v.SSH(name, machine.SSHOptions{Args: []string{"true"}}); sshError != nil {
logrus.Debugf("SSH readiness check for machine failed: %v", sshError)
continue
}
connected = true
break
}
}
if !connected {
msg := "machine did not transition into running state"
if sshError != nil {
return fmt.Errorf("%s: ssh error: %v", msg, sshError)
}
return errors.New(msg)
}
for _, mount := range v.Mounts {
if !opts.Quiet {
fmt.Printf("Mounting volume... %s:%s\n", mount.Source, mount.Target)