From c341a0ffe01fbcdcd272c72d928e5220ad004d73 Mon Sep 17 00:00:00 2001
From: Valentin Rothberg <vrothberg@redhat.com>
Date: Thu, 27 Jul 2023 14:00:07 +0200
Subject: [PATCH] machine: QEMU: lock VM on start

Lock the VM on start.  If the machine is in the "starting" state we know
that a previous start has failed and guide the user into resolving the
issue.

Concurrent starts will busy wait and return the expected "already
running" error.

NOTE: this change is only looking at the start issue (#18662).  Other
commands such as stop and update should also lock and will be updated
in a future change.  I expect the underlying issue to apply to all
machine providers, not only QEMU.  It's desirable to aim for extending
the machine interface to also allow to `Lock()` and `Unlock()`.  After
acquiring the lock, the VM should automatically be reloaded/updated.

[NO NEW TESTS NEEDED]

Fixes: #18662
Signed-off-by: Valentin Rothberg <vrothberg@redhat.com>
---
 pkg/machine/qemu/config.go  |  3 ++-
 pkg/machine/qemu/machine.go | 44 +++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/pkg/machine/qemu/config.go b/pkg/machine/qemu/config.go
index b061d7635a..84c68ef70c 100644
--- a/pkg/machine/qemu/config.go
+++ b/pkg/machine/qemu/config.go
@@ -215,8 +215,9 @@ func (p *QEMUVirtualization) CheckExclusiveActiveVM() (bool, string, error) {
 	if err != nil {
 		return false, "", fmt.Errorf("checking VM active: %w", err)
 	}
+	// NOTE: Start() takes care of dealing with the "starting" state.
 	for _, vm := range vms {
-		if vm.Running || vm.Starting {
+		if vm.Running {
 			return true, vm.Name, nil
 		}
 	}
diff --git a/pkg/machine/qemu/machine.go b/pkg/machine/qemu/machine.go
index b85c19808f..6dbf57ba61 100644
--- a/pkg/machine/qemu/machine.go
+++ b/pkg/machine/qemu/machine.go
@@ -27,6 +27,7 @@ import (
 	"github.com/containers/podman/v4/pkg/rootless"
 	"github.com/containers/podman/v4/pkg/util"
 	"github.com/containers/storage/pkg/ioutils"
+	"github.com/containers/storage/pkg/lockfile"
 	"github.com/digitalocean/go-qemu/qmp"
 	"github.com/sirupsen/logrus"
 )
@@ -75,6 +76,32 @@ type MachineVM struct {
 	Created time.Time
 	// LastUp contains the last recorded uptime
 	LastUp time.Time
+
+	// User at runtime for serializing write operations.
+	lock *lockfile.LockFile
+}
+
+func (v *MachineVM) setLock() error {
+	if v.lock != nil {
+		return nil
+	}
+
+	// FIXME: there's a painful amount of `GetConfDir` calls scattered
+	// across the code base.  This should be done once and stored
+	// somewhere instead.
+	vmConfigDir, err := machine.GetConfDir(vmtype)
+	if err != nil {
+		return err
+	}
+
+	lockPath := filepath.Join(vmConfigDir, v.Name+".lock")
+	lock, err := lockfile.GetLockFile(lockPath)
+	if err != nil {
+		return fmt.Errorf("creating lockfile for VM: %w", err)
+	}
+
+	v.lock = lock
+	return nil
 }
 
 type Monitor struct {
@@ -606,6 +633,23 @@ func (v *MachineVM) Start(name string, opts machine.StartOptions) error {
 	defaultBackoff := 500 * time.Millisecond
 	maxBackoffs := 6
 
+	if err := v.setLock(); err != nil {
+		return err
+	}
+	v.lock.Lock()
+	defer v.lock.Unlock()
+
+	state, err := v.State(false)
+	if err != nil {
+		return err
+	}
+	switch state {
+	case machine.Starting:
+		return fmt.Errorf("cannot start VM %q: starting state indicates that a previous start has failed: please stop and restart the VM", v.Name)
+	case machine.Running:
+		return fmt.Errorf("cannot start VM %q: %w", v.Name, machine.ErrVMAlreadyRunning)
+	}
+
 	v.Starting = true
 	if err := v.writeConfig(); err != nil {
 		return fmt.Errorf("writing JSON file: %w", err)