From 8c16322a84a553dde3209a0e68da444126d84d6a Mon Sep 17 00:00:00 2001 From: Valentin Rothberg Date: Wed, 12 Jul 2023 15:19:05 +0200 Subject: [PATCH] machine start: qemu: wait for SSH readiness During the exponential backoff waiting for the machine to be fully up and running, also make sure that SSH is ready. The systemd dependencies of the ready.service include the sshd.service among others, but that is not enough. Other CoreOS users reported the same issue on IRC, so I feel fairly confident in using the pragmatic approach of making sure SSH works on the client side. #17403 is quite old and there are other pressing machine issues that need attention. [NO NEW TESTS NEEDED] Fixes: #17403 Signed-off-by: Valentin Rothberg --- pkg/machine/qemu/machine.go | 56 +++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/pkg/machine/qemu/machine.go b/pkg/machine/qemu/machine.go index 783d917b68..d412ca82c4 100644 --- a/pkg/machine/qemu/machine.go +++ b/pkg/machine/qemu/machine.go @@ -599,27 +599,47 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error { _ = v.writeConfig() } } - if len(v.Mounts) > 0 { - connected := false - backoff = 500 * time.Millisecond - for i := 0; i < maxBackoffs; i++ { - if i > 0 { - time.Sleep(backoff) - backoff *= 2 - } - state, err := v.State(true) - if err != nil { - return err - } - if state == machine.Running && v.isListening() { - connected = true - break - } + if len(v.Mounts) == 0 { + v.waitAPIAndPrintInfo(forwardState, forwardSock, opts.NoInfo) + return nil + } + + connected := false + backoff = defaultBackoff + var sshError error + for i := 0; i < maxBackoffs; i++ { + if i > 0 { + time.Sleep(backoff) + backoff *= 2 } - if !connected { - return fmt.Errorf("machine did not transition into running state") + state, err := v.State(true) + if err != nil { + return err + } + if state == machine.Running && v.isListening() { + // Also make sure that SSH is up and running. 
The + // ready service's dependencies don't fully make sure + // that clients can SSH into the machine immediately + // after boot. + // + // CoreOS users have reported the same observation but + // the underlying source of the issue remains unknown. + if sshError = v.SSH(name, machine.SSHOptions{Args: []string{"true"}}); sshError != nil { + logrus.Debugf("SSH readiness check for machine failed: %v", sshError) + continue + } + connected = true + break } } + if !connected { + msg := "machine did not transition into running state" + if sshError != nil { + return fmt.Errorf("%s: ssh error: %v", msg, sshError) + } + return errors.New(msg) + } + for _, mount := range v.Mounts { if !opts.Quiet { fmt.Printf("Mounting volume... %s:%s\n", mount.Source, mount.Target)