changeset 25640:3285827f3d5d

[illumos-gate merge]

commit 2606939d92dd3044a9851b2930ebf533c3c03892
	13275 bhyve needs richer INIT/SIPI support
commit 78f846c0ab4f41678386d3e1b49c16cc8db07a8b
	13438 Update prototypes to 2021
commit ab2fdd80a620c2b88e5ac2c4247ab79880761b18
	13409 cxgbe: replace zero sized array by flexible array
commit 6dc7d05754d992040097e8ba8f85e77512125c60
	8040 NFSv4 client: 3-way deadlock between nfs4_bio(), nfs4_do_delegreturn(), and nfs4_flush_pages()

Conflicts:
	usr/src/cmd/bhyve/bhyverun.c
author Dan McDonald <danmcd@joyent.com>
date Mon, 11 Jan 2021 08:50:27 -0500
parents ce2b70e7aab0 (current diff) 3fa93b6d354f (diff)
children b58b62d3de87
files usr/src/cmd/bhyve/bhyverun.c usr/src/cmd/bhyve/bhyverun.h usr/src/cmd/bhyve/spinup_ap.c usr/src/cmd/bhyve/spinup_ap.h usr/src/lib/libvmmapi/common/mapfile-vers usr/src/lib/libvmmapi/common/vmmapi.c usr/src/lib/libvmmapi/common/vmmapi.h usr/src/uts/common/fs/nfs/nfs4_vnops.c usr/src/uts/i86pc/io/vmm/amd/svm.c usr/src/uts/i86pc/io/vmm/intel/vmx.c usr/src/uts/i86pc/io/vmm/io/vlapic.c usr/src/uts/i86pc/io/vmm/io/vlapic.h usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h usr/src/uts/i86pc/io/vmm/vmm.c usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c usr/src/uts/i86pc/io/vmm/vmm_stat.c usr/src/uts/i86pc/io/vmm/vmm_stat.h usr/src/uts/i86pc/sys/vmm.h usr/src/uts/i86pc/sys/vmm_dev.h
diffstat 40 files changed, 787 insertions(+), 485 deletions(-)
--- a/usr/src/cmd/bhyve/bhyverun.c	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/cmd/bhyve/bhyverun.c	Mon Jan 11 08:50:27 2021 -0500
@@ -518,13 +518,14 @@
 fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
 #else
 void
-fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip,
-    bool suspend)
+fbsdrun_addcpu(struct vmctx *ctx, int newcpu, uint64_t rip, bool suspend)
 #endif
 {
 	int error;
 
+#ifdef __FreeBSD__
 	assert(fromcpu == BSP);
+#endif
 
 	/*
 	 * The 'newcpu' must be activated in the context of 'fromcpu'. If
@@ -577,7 +578,7 @@
 
 	assert(entry->cmd == VEC_DEFAULT);
 
-	entry->cmd = VEC_COMPLETE_MMIO;
+	entry->cmd = VEC_FULFILL_MMIO;
 	mmio->bytes = bytes;
 	mmio->read = 1;
 	mmio->gpa = gpa;
@@ -592,7 +593,7 @@
 
 	assert(entry->cmd == VEC_DEFAULT);
 
-	entry->cmd = VEC_COMPLETE_MMIO;
+	entry->cmd = VEC_FULFILL_MMIO;
 	mmio->bytes = bytes;
 	mmio->read = 0;
 	mmio->gpa = gpa;
@@ -607,7 +608,7 @@
 
 	assert(entry->cmd == VEC_DEFAULT);
 
-	entry->cmd = VEC_COMPLETE_INOUT;
+	entry->cmd = VEC_FULFILL_INOUT;
 	inout->bytes = bytes;
 	inout->flags = INOUT_IN;
 	inout->port = port;
@@ -622,7 +623,7 @@
 
 	assert(entry->cmd == VEC_DEFAULT);
 
-	entry->cmd = VEC_COMPLETE_INOUT;
+	entry->cmd = VEC_FULFILL_INOUT;
 	inout->bytes = bytes;
 	inout->flags = 0;
 	inout->port = port;
@@ -731,6 +732,7 @@
 	return (VMEXIT_CONTINUE);
 }
 
+#ifdef __FreeBSD__
 static int
 vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
 {
@@ -740,6 +742,18 @@
 
 	return (VMEXIT_CONTINUE);
 }
+#else
+static int
+vmexit_run_state(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+	/*
+	 * Run-state transitions (INIT, SIPI, etc) are handled in-kernel, so an
+	 * exit to userspace with that code is not expected.
+	 */
+	fprintf(stderr, "unexpected run-state VM exit");
+	return (VMEXIT_ABORT);
+}
+#endif /* __FreeBSD__ */
 
 #ifdef __FreeBSD__
 #define	DEBUG_EPT_MISCONFIG
@@ -1017,7 +1031,11 @@
 	[VM_EXITCODE_WRMSR]  = vmexit_wrmsr,
 	[VM_EXITCODE_MTRAP]  = vmexit_mtrap,
 	[VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
+#ifdef __FreeBSD__
 	[VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
+#else
+	[VM_EXITCODE_RUN_STATE] = vmexit_run_state,
+#endif
 	[VM_EXITCODE_SUSPENDED] = vmexit_suspend,
 	[VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch,
 	[VM_EXITCODE_DEBUG] = vmexit_debug,
@@ -1547,14 +1565,21 @@
 		errx(EX_OSERR, "cap_enter() failed");
 #endif
 
+#ifdef __FreeBSD__
 	/*
 	 * Add CPU 0
 	 */
-#ifdef __FreeBSD__
 	fbsdrun_addcpu(ctx, BSP, BSP, rip);
 #else
-	fbsdrun_addcpu(ctx, BSP, BSP, rip, suspend);
+	/* Set BSP to run (unlike the APs which wait for INIT) */
+	error = vm_set_run_state(ctx, BSP, VRS_RUN, 0);
+	assert(error == 0);
+	fbsdrun_addcpu(ctx, BSP, rip, suspend);
 
+	/* Add subsequent CPUs, which will wait until INIT/SIPI-ed */
+	for (uint_t i = 1; i < guest_ncpus; i++) {
+		spinup_halted_ap(ctx, i);
+	}
 	mark_provisioned();
 #endif
 
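With the in-kernel INIT/SIPI support, the illumos side of main() no longer routes AP startup through a SPINUP_AP exit: the BSP is marked runnable and every AP is parked in the halted run state until the guest INIT/SIPIs it. A condensed sketch of the new sequence, inlining spinup_halted_ap() (defined in spinup_ap.c further down); error handling as in the diff, other setup omitted:

/* Sketch: condensed from the non-FreeBSD path of main() and spinup_halted_ap() */
error = vm_set_run_state(ctx, BSP, VRS_RUN, 0);	/* BSP starts runnable */
assert(error == 0);
fbsdrun_addcpu(ctx, BSP, rip, suspend);

for (uint_t i = 1; i < guest_ncpus; i++) {
	/* Each AP: full reset, capabilities, then parked in VRS_HALT */
	error = vcpu_reset(ctx, i);
	assert(error == 0);
	fbsdrun_set_capabilities(ctx, i);
	error = vm_set_run_state(ctx, i, VRS_HALT, 0);
	assert(error == 0);
	fbsdrun_addcpu(ctx, i, 0, false);
}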
--- a/usr/src/cmd/bhyve/bhyverun.h	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/cmd/bhyve/bhyverun.h	Mon Jan 11 08:50:27 2021 -0500
@@ -58,8 +58,7 @@
 #ifdef __FreeBSD__
 void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip);
 #else
-void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip,
-    bool suspend);
+void fbsdrun_addcpu(struct vmctx *ctx, int newcpu, uint64_t rip, bool suspend);
 #endif
 int  fbsdrun_muxed(void);
 int  fbsdrun_vmexit_on_hlt(void);
--- a/usr/src/cmd/bhyve/spinup_ap.c	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/cmd/bhyve/spinup_ap.c	Mon Jan 11 08:50:27 2021 -0500
@@ -56,6 +56,7 @@
 #include "bhyverun.h"
 #include "spinup_ap.h"
 
+#ifdef __FreeBSD__
 static void
 spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip)
 {
@@ -101,7 +102,6 @@
 
 	fbsdrun_set_capabilities(ctx, newcpu);
 
-#ifdef __FreeBSD__
 	/*
 	 * Enable the 'unrestricted guest' mode for 'newcpu'.
 	 *
@@ -110,17 +110,30 @@
 	 */
 	error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
 	assert(error == 0);
-#else
-	/* Unrestricted Guest is always enabled on illumos */
-#endif
 
 	spinup_ap_realmode(ctx, newcpu, &rip);
 
-#ifdef __FreeBSD__
 	fbsdrun_addcpu(ctx, vcpu, newcpu, rip);
-#else
-	fbsdrun_addcpu(ctx, vcpu, newcpu, rip, false);
-#endif
 
 	return (newcpu);
 }
+#else /* __FreeBSD__ */
+void
+spinup_halted_ap(struct vmctx *ctx, int newcpu)
+{
+	int error;
+
+	assert(newcpu != 0);
+	assert(newcpu < guest_ncpus);
+
+	error = vcpu_reset(ctx, newcpu);
+	assert(error == 0);
+
+	fbsdrun_set_capabilities(ctx, newcpu);
+
+	error = vm_set_run_state(ctx, newcpu, VRS_HALT, 0);
+	assert(error == 0);
+
+	fbsdrun_addcpu(ctx, newcpu, 0, false);
+}
+#endif /* __FreeBSD__ */
--- a/usr/src/cmd/bhyve/spinup_ap.h	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/cmd/bhyve/spinup_ap.h	Mon Jan 11 08:50:27 2021 -0500
@@ -31,6 +31,10 @@
 #ifndef	_SPINUP_AP_H_
 #define	_SPINUP_AP_H_
 
+#ifdef __FreeBSD__
 int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip);
+#else
+void spinup_halted_ap(struct vmctx *ctx, int newcpu);
+#endif /* __FreeBSD__ */
 
 #endif
--- a/usr/src/lib/libvmmapi/common/mapfile-vers	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/lib/libvmmapi/common/mapfile-vers	Mon Jan 11 08:50:27 2021 -0500
@@ -123,6 +123,8 @@
 		vm_unassign_pptdev;
 		vm_pmtmr_set_location;
 		vm_wrlock_cycle;
+		vm_get_run_state;
+		vm_set_run_state;
 
 	local:
 		*;
--- a/usr/src/lib/libvmmapi/common/vmmapi.c	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/lib/libvmmapi/common/vmmapi.c	Mon Jan 11 08:50:27 2021 -0500
@@ -1302,6 +1302,18 @@
 	return (error);
 }
 
+#ifndef __FreeBSD__
+int
+vcpu_reset(struct vmctx *vmctx, int vcpu)
+{
+	struct vm_vcpu_reset vvr;
+
+	vvr.vcpuid = vcpu;
+	vvr.kind = VRK_RESET;
+
+	return (ioctl(vmctx->fd, VM_RESET_CPU, &vvr));
+}
+#else /* __FreeBSD__ */
 /*
  * From Intel Vol 3a:
  * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT
@@ -1458,6 +1470,7 @@
 done:
 	return (error);
 }
+#endif /* __FreeBSD__ */
 
 int
 vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
@@ -1839,6 +1852,39 @@
 	}
 	return (0);
 }
+
+int
+vm_get_run_state(struct vmctx *ctx, int vcpu, enum vcpu_run_state *state,
+    uint8_t *sipi_vector)
+{
+	struct vm_run_state data;
+
+	data.vcpuid = vcpu;
+	if (ioctl(ctx->fd, VM_GET_RUN_STATE, &data) != 0) {
+		return (errno);
+	}
+
+	*state = data.state;
+	*sipi_vector = data.sipi_vector;
+	return (0);
+}
+
+int
+vm_set_run_state(struct vmctx *ctx, int vcpu, enum vcpu_run_state state,
+    uint8_t sipi_vector)
+{
+	struct vm_run_state data;
+
+	data.vcpuid = vcpu;
+	data.state = state;
+	data.sipi_vector = sipi_vector;
+	if (ioctl(ctx->fd, VM_SET_RUN_STATE, &data) != 0) {
+		return (errno);
+	}
+
+	return (0);
+}
+
 #endif /* __FreeBSD__ */
 
 #ifdef __FreeBSD__
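Like the other illumos-specific calls in this file, the new wrappers return 0 on success and the ioctl errno on failure. A hypothetical consumer that checks whether an AP has been released from its wait-for-SIPI state might look roughly like this (sketch only; the header paths and helper name are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <vmmapi.h>	/* illustrative path; the VRS_* bits come from the vmm.h hunk below */

static void
show_run_state(struct vmctx *ctx, int vcpu)
{
	enum vcpu_run_state state;
	uint8_t sipi_vector;

	if (vm_get_run_state(ctx, vcpu, &state, &sipi_vector) != 0) {
		perror("vm_get_run_state");
		return;
	}

	if ((state & VRS_RUN) != 0) {
		printf("vcpu %d: running\n", vcpu);
	} else {
		/* Halted or INIT-ed; a pending SIPI carries its vector along */
		printf("vcpu %d: waiting (run_state=%#x, sipi_vector=%#x)\n",
		    vcpu, (unsigned int)state, (unsigned int)sipi_vector);
	}
}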
--- a/usr/src/lib/libvmmapi/common/vmmapi.h	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/lib/libvmmapi/common/vmmapi.h	Mon Jan 11 08:50:27 2021 -0500
@@ -304,6 +304,10 @@
 /* illumos-specific APIs */
 int	vm_pmtmr_set_location(struct vmctx *ctx, uint16_t ioport);
 int	vm_wrlock_cycle(struct vmctx *ctx);
+int vm_get_run_state(struct vmctx *ctx, int vcpu, enum vcpu_run_state *state,
+    uint8_t *sipi_vector);
+int vm_set_run_state(struct vmctx *ctx, int vcpu, enum vcpu_run_state state,
+    uint8_t sipi_vector);
 #endif	/* __FreeBSD__ */
 
 #ifdef	__FreeBSD__
--- a/usr/src/prototypes/README	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/README	Mon Jan 11 08:50:27 2021 -0500
@@ -17,5 +17,5 @@
  */
 
 /*
- * Copyright 2020 <contributor>
+ * Copyright 2021 <contributor>
  */
--- a/usr/src/prototypes/prototype.Makefile	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.Makefile	Mon Jan 11 08:50:27 2021 -0500
@@ -10,6 +10,6 @@
 #
 
 #
-# Copyright 2020 <contributor>
+# Copyright 2021 <contributor>
 #
 
--- a/usr/src/prototypes/prototype.c	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.c	Mon Jan 11 08:50:27 2021 -0500
@@ -10,7 +10,7 @@
  */
 
 /*
- * Copyright 2020 <contributor>
+ * Copyright 2021 <contributor>
  */
 
 /*
--- a/usr/src/prototypes/prototype.csh	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.csh	Mon Jan 11 08:50:27 2021 -0500
@@ -12,6 +12,6 @@
 #
 
 #
-# Copyright 2020 <contributor>
+# Copyright 2021 <contributor>
 #
 
--- a/usr/src/prototypes/prototype.h	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.h	Mon Jan 11 08:50:27 2021 -0500
@@ -10,7 +10,7 @@
  */
 
 /*
- * Copyright 2020 <contributor>
+ * Copyright 2021 <contributor>
  */
 
 #ifndef _PROTOTYPE_H
--- a/usr/src/prototypes/prototype.java	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.java	Mon Jan 11 08:50:27 2021 -0500
@@ -10,7 +10,7 @@
  */
 
 /*
- * Copyright 2020 <contributor>
+ * Copyright 2021 <contributor>
  */
 
 /*
--- a/usr/src/prototypes/prototype.ksh	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.ksh	Mon Jan 11 08:50:27 2021 -0500
@@ -12,6 +12,6 @@
 #
 
 #
-# Copyright 2020 <contributor>
+# Copyright 2021 <contributor>
 #
 
--- a/usr/src/prototypes/prototype.man	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.man	Mon Jan 11 08:50:27 2021 -0500
@@ -9,5 +9,5 @@
 .\" http://www.illumos.org/license/CDDL.
 .\"
 .\"
-.\" Copyright 2020 <contributor>
+.\" Copyright 2021 <contributor>
 .\"
--- a/usr/src/prototypes/prototype.man1	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.man1	Mon Jan 11 08:50:27 2021 -0500
@@ -9,7 +9,7 @@
 .\" http://www.illumos.org/license/CDDL.
 .\"
 .\"
-.\" Copyright 2020 <contributor>
+.\" Copyright 2021 <contributor>
 .\"
 .Dd Month Day, Year
 .Dt COMMAND 1
--- a/usr/src/prototypes/prototype.man3x	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.man3x	Mon Jan 11 08:50:27 2021 -0500
@@ -9,7 +9,7 @@
 .\" http://www.illumos.org/license/CDDL.
 .\"
 .\"
-.\" Copyright 2020 <contributor>
+.\" Copyright 2021 <contributor>
 .\"
 .Dd Month Day, Year
 .Dt MANUALPAGE 3SECTION
--- a/usr/src/prototypes/prototype.man7d	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.man7d	Mon Jan 11 08:50:27 2021 -0500
@@ -9,7 +9,7 @@
 .\" http://www.illumos.org/license/CDDL.
 .\"
 .\"
-.\" Copyright 2020 <contributor>
+.\" Copyright 2021 <contributor>
 .\"
 .Dd Month Day, Year
 .Dt DRIVERNAME 7D
--- a/usr/src/prototypes/prototype.man9e	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.man9e	Mon Jan 11 08:50:27 2021 -0500
@@ -9,7 +9,7 @@
 .\" http://www.illumos.org/license/CDDL.
 .\"
 .\"
-.\" Copyright 2020 <contributor>
+.\" Copyright 2021 <contributor>
 .\"
 .Dd Month Day, Year
 .Dt ENTRYNAME 9E
--- a/usr/src/prototypes/prototype.man9f	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.man9f	Mon Jan 11 08:50:27 2021 -0500
@@ -9,7 +9,7 @@
 .\" http://www.illumos.org/license/CDDL.
 .\"
 .\"
-.\" Copyright 2020 <contributor>
+.\" Copyright 2021 <contributor>
 .\"
 .Dd Month Day, Year
 .Dt FUNCNAME 9F
--- a/usr/src/prototypes/prototype.mapfile-vers	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.mapfile-vers	Mon Jan 11 08:50:27 2021 -0500
@@ -10,7 +10,7 @@
 #
 
 #
-# Copyright 2020 <contributor>
+# Copyright 2021 <contributor>
 #
 
 #
--- a/usr/src/prototypes/prototype.pl	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.pl	Mon Jan 11 08:50:27 2021 -0500
@@ -11,7 +11,7 @@
 #
 
 #
-# Copyright 2020 <contributor>
+# Copyright 2021 <contributor>
 #
 
 #
--- a/usr/src/prototypes/prototype.py	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.py	Mon Jan 11 08:50:27 2021 -0500
@@ -11,7 +11,7 @@
 #
 
 #
-# Copyright 2020 <contributor>
+# Copyright 2021 <contributor>
 #
 
 #
--- a/usr/src/prototypes/prototype.s	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.s	Mon Jan 11 08:50:27 2021 -0500
@@ -10,7 +10,7 @@
  */
 
 /*
- * Copyright 2020 <contributor>
+ * Copyright 2021 <contributor>
  */
 
 	.file	"prototype.s"
--- a/usr/src/prototypes/prototype.sh	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/prototypes/prototype.sh	Mon Jan 11 08:50:27 2021 -0500
@@ -12,6 +12,6 @@
 #
 
 #
-# Copyright 2020 <contributor>
+# Copyright 2021 <contributor>
 #
 
--- a/usr/src/uts/common/fs/nfs/nfs4_client.c	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/common/fs/nfs/nfs4_client.c	Mon Jan 11 08:50:27 2021 -0500
@@ -24,7 +24,7 @@
  */
 
 /*
- *  	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
+ *	Copyright (c) 1983,1984,1985,1986,1987,1988,1989  AT&T.
  *	All Rights Reserved
  */
 
@@ -464,33 +464,15 @@
 	rp = VTOR4(vp);
 	mutex_enter(&rp->r_statelock);
 	was_serial = (rp->r_serial == curthread);
-	if (rp->r_serial && !was_serial) {
-		klwp_t *lwp = ttolwp(curthread);
-
+	if (rp->r_serial != NULL && !was_serial) {
 		/*
-		 * If we're the recovery thread, then purge current attrs
-		 * and bail out to avoid potential deadlock between another
-		 * thread caching attrs (r_serial thread), recov thread,
-		 * and an async writer thread.
+		 * Purge current attrs and bail out to avoid potential deadlock
+		 * between another thread caching attrs (r_serial thread), this
+		 * thread, and a thread trying to read or write pages.
 		 */
-		if (recov) {
-			PURGE_ATTRCACHE4_LOCKED(rp);
-			mutex_exit(&rp->r_statelock);
-			return;
-		}
-
-		if (lwp != NULL)
-			lwp->lwp_nostop++;
-		while (rp->r_serial != NULL) {
-			if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) {
-				mutex_exit(&rp->r_statelock);
-				if (lwp != NULL)
-					lwp->lwp_nostop--;
-				return;
-			}
-		}
-		if (lwp != NULL)
-			lwp->lwp_nostop--;
+		PURGE_ATTRCACHE4_LOCKED(rp);
+		mutex_exit(&rp->r_statelock);
+		return;
 	}
 
 	/*
@@ -3067,7 +3049,7 @@
 	nfs4_oo_hash_bucket_t   *bucketp;
 	nfs4_debug_msg_t	*msgp;
 	int i;
-	servinfo4_t 		*svp;
+	servinfo4_t		*svp;
 
 	/*
 	 * Code introduced here should be carefully evaluated to make
--- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c	Mon Jan 11 08:50:27 2021 -0500
@@ -2596,12 +2596,6 @@
 	osp->os_ref_count--;
 
 	if (ep->error == 0) {
-		/*
-		 * Avoid a deadlock with the r_serial thread waiting for
-		 * os_sync_lock in nfs4_get_otw_cred_by_osp() which might be
-		 * held by us. We will wait in nfs4_attr_cache() for the
-		 * completion of the r_serial thread.
-		 */
 		mutex_exit(&osp->os_sync_lock);
 		*have_sync_lockp = 0;
 
--- a/usr/src/uts/common/io/cxgbe/common/t4_msg.h	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/common/io/cxgbe/common/t4_msg.h	Mon Jan 11 08:50:27 2021 -0500
@@ -2769,7 +2769,7 @@
 	__be64 addr0;
 
 #if !(defined C99_NOT_SUPPORTED)
-	struct ulptx_sge_pair sge[0];
+	struct ulptx_sge_pair sge[];
 #endif
 
 };
@@ -2785,7 +2785,7 @@
 	__be32 rsvd;
 
 #if !(defined C99_NOT_SUPPORTED)
-	struct ulptx_isge sge[0];
+	struct ulptx_isge sge[];
 #endif
 
 };
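The 13409 change replaces GNU-style zero-length arrays with C99 flexible array members; sizing is unchanged, since sizeof still excludes the trailing array and the element count is added at allocation time. A generic illustration with a hypothetical structure (not cxgbe code):

#include <stdlib.h>

struct sg_list {
	unsigned int	nsge;
	unsigned long	addr0;
	unsigned long	sge[];	/* flexible array member, formerly sge[0] */
};

static struct sg_list *
sg_list_alloc(unsigned int nsge)
{
	struct sg_list *sgl;

	/* sizeof (struct sg_list) does not include the trailing array */
	sgl = calloc(1, sizeof (struct sg_list) +
	    (size_t)nsge * sizeof (sgl->sge[0]));
	if (sgl != NULL)
		sgl->nsge = nsge;
	return (sgl);
}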
--- a/usr/src/uts/i86pc/io/vmm/amd/svm.c	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c	Mon Jan 11 08:50:27 2021 -0500
@@ -1917,8 +1917,7 @@
  * Start vcpu with specified RIP.
  */
 static int
-svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap,
-    struct vm_eventinfo *evinfo)
+svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
 {
 	struct svm_regctx *gctx;
 	struct svm_softc *svm_sc;
@@ -2010,34 +2009,18 @@
 		inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic,
 		    inject_state);
 
-		if (vcpu_suspended(evinfo)) {
+		/*
+		 * Check for vCPU bail-out conditions.  This must be done after
+		 * svm_inject_events() to detect a triple-fault condition.
+		 */
+		if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) {
 			enable_gintr();
-			vm_exit_suspended(vm, vcpu, state->rip);
-			break;
-		}
-
-		if (vcpu_runblocked(evinfo)) {
-			enable_gintr();
-			vm_exit_runblock(vm, vcpu, state->rip);
 			break;
 		}
 
-		if (vcpu_reqidle(evinfo)) {
+		if (vcpu_run_state_pending(vm, vcpu)) {
 			enable_gintr();
-			vm_exit_reqidle(vm, vcpu, state->rip);
-			break;
-		}
-
-		/* We are asked to give the cpu by scheduler. */
-		if (vcpu_should_yield(vm, vcpu)) {
-			enable_gintr();
-			vm_exit_astpending(vm, vcpu, state->rip);
-			break;
-		}
-
-		if (vcpu_debugged(vm, vcpu)) {
-			enable_gintr();
-			vm_exit_debug(vm, vcpu, state->rip);
+			vm_exit_run_state(vm, vcpu, state->rip);
 			break;
 		}
 
@@ -2303,7 +2286,7 @@
 }
 
 static int
-svm_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc)
 {
 	struct vmcb *vmcb;
 	struct svm_softc *sc;
--- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c	Mon Jan 11 08:50:27 2021 -0500
@@ -2738,8 +2738,7 @@
 }
 
 static int
-vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap,
-    struct vm_eventinfo *evinfo)
+vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap)
 {
 	int rc, handled, launched;
 	struct vmx *vmx;
@@ -2834,39 +2833,17 @@
 		}
 
 		/*
-		 * Check for vcpu suspension after injecting events because
-		 * vmx_inject_events() can suspend the vcpu due to a
-		 * triple fault.
+		 * Check for vCPU bail-out conditions.  This must be done after
+		 * vmx_inject_events() to detect a triple-fault condition.
 		 */
-		if (vcpu_suspended(evinfo)) {
+		if (vcpu_entry_bailout_checks(vmx->vm, vcpu, rip)) {
 			enable_intr();
-			vm_exit_suspended(vmx->vm, vcpu, rip);
-			break;
-		}
-
-		if (vcpu_runblocked(evinfo)) {
-			enable_intr();
-			vm_exit_runblock(vmx->vm, vcpu, rip);
 			break;
 		}
 
-		if (vcpu_reqidle(evinfo)) {
-			enable_intr();
-			vm_exit_reqidle(vmx->vm, vcpu, rip);
-			break;
-		}
-
-		if (vcpu_should_yield(vm, vcpu)) {
+		if (vcpu_run_state_pending(vm, vcpu)) {
 			enable_intr();
-			vm_exit_astpending(vmx->vm, vcpu, rip);
-			vmx_astpending_trace(vmx, vcpu, rip);
-			handled = HANDLED;
-			break;
-		}
-
-		if (vcpu_debugged(vm, vcpu)) {
-			enable_intr();
-			vm_exit_debug(vmx->vm, vcpu, rip);
+			vm_exit_run_state(vmx->vm, vcpu, rip);
 			break;
 		}
 
@@ -2985,19 +2962,12 @@
 		rip = vmexit->rip;
 	} while (handled);
 
-	/*
-	 * If a VM exit has been handled then the exitcode must be BOGUS
-	 * If a VM exit is not handled then the exitcode must not be BOGUS
-	 */
-	if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
-	    (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
-		panic("Mismatch between handled (%d) and exitcode (%d)",
-		    handled, vmexit->exitcode);
+	/* If a VM exit has been handled then the exitcode must be BOGUS */
+	if (handled && vmexit->exitcode != VM_EXITCODE_BOGUS) {
+		panic("Non-BOGUS exitcode (%d) unexpected for handled VM exit",
+		    vmexit->exitcode);
 	}
 
-	if (!handled)
-		vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1);
-
 	VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d",
 	    vmexit->exitcode);
 
@@ -3261,7 +3231,7 @@
 }
 
 static int
-vmx_setdesc(void *arg, int vcpu, int seg, struct seg_desc *desc)
+vmx_setdesc(void *arg, int vcpu, int seg, const struct seg_desc *desc)
 {
 	int hostcpu, running;
 	struct vmx *vmx = arg;
--- a/usr/src/uts/i86pc/io/vmm/io/vlapic.c	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c	Mon Jan 11 08:50:27 2021 -0500
@@ -992,13 +992,10 @@
 vlapic_icrlo_write_handler(struct vlapic *vlapic)
 {
 	int i;
-	bool phys;
 	cpuset_t dmask;
 	uint64_t icrval;
-	uint32_t dest, vec, mode;
-	struct vlapic *vlapic2;
+	uint32_t dest, vec, mode, dsh;
 	struct LAPIC *lapic;
-	uint16_t maxcpus;
 
 	lapic = vlapic->apic_page;
 	lapic->icr_lo &= ~APIC_DELSTAT_PEND;
@@ -1010,93 +1007,79 @@
 		dest = icrval >> (32 + 24);
 	vec = icrval & APIC_VECTOR_MASK;
 	mode = icrval & APIC_DELMODE_MASK;
+	dsh = icrval & APIC_DEST_MASK;
 
 	if (mode == APIC_DELMODE_FIXED && vec < 16) {
 		vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false);
-		VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec);
+		return (0);
+	}
+	if (mode == APIC_DELMODE_INIT &&
+	    (icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) {
+		/* No work required to deassert INIT */
 		return (0);
 	}
+	if ((mode == APIC_DELMODE_STARTUP || mode == APIC_DELMODE_INIT) &&
+	    !(dsh == APIC_DEST_DESTFLD || dsh == APIC_DEST_ALLESELF)) {
+		/*
+		 * While Intel makes no mention of restrictions for destination
+		 * shorthand when sending INIT or SIPI, AMD requires either a
+		 * specific destination or all-excluding self.  Common use seems
+		 * to be restricted to those two cases.
+		 */
+		return (-1);
+	}
 
-	VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec);
+	switch (dsh) {
+	case APIC_DEST_DESTFLD:
+		vlapic_calcdest(vlapic->vm, &dmask, dest,
+		    (icrval & APIC_DESTMODE_LOG) == 0, false, x2apic(vlapic));
+		break;
+	case APIC_DEST_SELF:
+		CPU_SETOF(vlapic->vcpuid, &dmask);
+		break;
+	case APIC_DEST_ALLISELF:
+		dmask = vm_active_cpus(vlapic->vm);
+		break;
+	case APIC_DEST_ALLESELF:
+		dmask = vm_active_cpus(vlapic->vm);
+		CPU_CLR(vlapic->vcpuid, &dmask);
+		break;
+	default:
+		/*
+		 * All possible delivery notations are covered above.
+		 * We should never end up here.
+		 */
+		panic("unknown delivery shorthand: %x", dsh);
+	}
 
-	if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
-		switch (icrval & APIC_DEST_MASK) {
-		case APIC_DEST_DESTFLD:
-			phys = ((icrval & APIC_DESTMODE_LOG) == 0);
-			vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false,
-			    x2apic(vlapic));
+	while ((i = CPU_FFS(&dmask)) != 0) {
+		i--;
+		CPU_CLR(i, &dmask);
+		switch (mode) {
+		case APIC_DELMODE_FIXED:
+			lapic_intr_edge(vlapic->vm, i, vec);
+			vmm_stat_incr(vlapic->vm, vlapic->vcpuid,
+			    VLAPIC_IPI_SEND, 1);
+			vmm_stat_incr(vlapic->vm, i,
+			    VLAPIC_IPI_RECV, 1);
 			break;
-		case APIC_DEST_SELF:
-			CPU_SETOF(vlapic->vcpuid, &dmask);
+		case APIC_DELMODE_NMI:
+			vm_inject_nmi(vlapic->vm, i);
 			break;
-		case APIC_DEST_ALLISELF:
-			dmask = vm_active_cpus(vlapic->vm);
+		case APIC_DELMODE_INIT:
+			(void) vm_inject_init(vlapic->vm, i);
 			break;
-		case APIC_DEST_ALLESELF:
-			dmask = vm_active_cpus(vlapic->vm);
-			CPU_CLR(vlapic->vcpuid, &dmask);
+		case APIC_DELMODE_STARTUP:
+			(void) vm_inject_sipi(vlapic->vm, i, vec);
 			break;
+		case APIC_DELMODE_LOWPRIO:
+		case APIC_DELMODE_SMI:
 		default:
-			CPU_ZERO(&dmask);	/* satisfy gcc */
+			/* Unhandled IPI modes (for now) */
 			break;
 		}
-
-		while ((i = CPU_FFS(&dmask)) != 0) {
-			i--;
-			CPU_CLR(i, &dmask);
-			if (mode == APIC_DELMODE_FIXED) {
-				lapic_intr_edge(vlapic->vm, i, vec);
-				vmm_stat_incr(vlapic->vm, vlapic->vcpuid,
-				    VLAPIC_IPI_SEND, 1);
-				vmm_stat_incr(vlapic->vm, i,
-				    VLAPIC_IPI_RECV, 1);
-				VLAPIC_CTR2(vlapic, "vlapic sending ipi %d "
-				    "to vcpuid %d", vec, i);
-			} else {
-				vm_inject_nmi(vlapic->vm, i);
-				VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi "
-				    "to vcpuid %d", i);
-			}
-		}
-
-		return (0);	/* handled completely in the kernel */
 	}
-
-	maxcpus = vm_get_maxcpus(vlapic->vm);
-	if (mode == APIC_DELMODE_INIT) {
-		if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT)
-			return (0);
-
-		if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) {
-			vlapic2 = vm_lapic(vlapic->vm, dest);
-
-			/* move from INIT to waiting-for-SIPI state */
-			if (vlapic2->boot_state == BS_INIT) {
-				vlapic2->boot_state = BS_SIPI;
-			}
-
-			return (0);
-		}
-	}
-
-	if (mode == APIC_DELMODE_STARTUP) {
-		if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) {
-			vlapic2 = vm_lapic(vlapic->vm, dest);
-
-			/*
-			 * Ignore SIPIs in any state other than wait-for-SIPI
-			 */
-			if (vlapic2->boot_state != BS_SIPI)
-				return (0);
-
-			vlapic2->boot_state = BS_RUNNING;
-			vm_req_spinup_ap(vlapic->vm, dest, vec << PAGE_SHIFT);
-			return (0);
-		}
-	}
-
-	/* Return to userland.  */
-	return (-1);
+	return (0);
 }
 
 void
@@ -1450,30 +1433,72 @@
 	return (retval);
 }
 
-static void
+void
 vlapic_reset(struct vlapic *vlapic)
 {
-	struct LAPIC *lapic;
+	struct LAPIC *lapic = vlapic->apic_page;
+	uint32_t *isrptr, *tmrptr, *irrptr;
 
-	lapic = vlapic->apic_page;
-	bzero(lapic, sizeof (struct LAPIC));
+	/* Reset any timer-related state first */
+	VLAPIC_TIMER_LOCK(vlapic);
+	callout_stop(&vlapic->callout);
+	lapic->icr_timer = 0;
+	lapic->ccr_timer = 0;
+	VLAPIC_TIMER_UNLOCK(vlapic);
+	lapic->dcr_timer = 0;
+	vlapic_dcr_write_handler(vlapic);
+
+	/*
+	 * Sync any APIC acceleration (APICv/AVIC) state into the APIC page so
+	 * it is not leftover after the reset.  This is performed after the APIC
+	 * timer has been stopped, in case it happened to fire just prior to
+	 * being deactivated.
+	 */
+	if (vlapic->ops.sync_state) {
+		(*vlapic->ops.sync_state)(vlapic);
+	}
 
 	lapic->id = vlapic_get_id(vlapic);
 	lapic->version = VLAPIC_VERSION;
 	lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT);
+
+	lapic->tpr = 0;
+	lapic->apr = 0;
+	lapic->ppr = 0;
+
+#ifdef __ISRVEC_DEBUG
+	/* With the PPR cleared, the isrvec tracking should be reset too */
+	vlapic->isrvec_stk_top = 0;
+#endif
+
+	lapic->eoi = 0;
+	lapic->ldr = 0;
 	lapic->dfr = 0xffffffff;
 	lapic->svr = APIC_SVR_VECTOR;
-	vlapic_mask_lvts(vlapic);
+	vlapic->svr_last = lapic->svr;
 
-	lapic->dcr_timer = 0;
-	vlapic_dcr_write_handler(vlapic);
+	isrptr = &lapic->isr0;
+	tmrptr = &lapic->tmr0;
+	irrptr = &lapic->irr0;
+	for (uint_t i = 0; i < 8; i++) {
+		atomic_store_rel_int(&isrptr[i * 4], 0);
+		atomic_store_rel_int(&tmrptr[i * 4], 0);
+		atomic_store_rel_int(&irrptr[i * 4], 0);
+	}
 
-	if (vlapic->vcpuid == 0)
-		vlapic->boot_state = BS_RUNNING;	/* BSP */
-	else
-		vlapic->boot_state = BS_INIT;		/* AP */
+	lapic->esr = 0;
+	vlapic->esr_pending = 0;
+	lapic->icr_lo = 0;
+	lapic->icr_hi = 0;
 
-	vlapic->svr_last = lapic->svr;
+	lapic->lvt_cmci = 0;
+	lapic->lvt_timer = 0;
+	lapic->lvt_thermal = 0;
+	lapic->lvt_pcint = 0;
+	lapic->lvt_lint0 = 0;
+	lapic->lvt_lint1 = 0;
+	lapic->lvt_error = 0;
+	vlapic_mask_lvts(vlapic);
 }
 
 void
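The rewritten vlapic_icrlo_write_handler() above decodes the destination shorthand, delivery mode, and vector up front and then injects INIT/SIPI in the kernel via vm_inject_init()/vm_inject_sipi(). For reference, a small standalone decode of the classic INIT plus SIPI ICR values a guest writes during AP startup (the masks below are the architectural bit positions, corresponding to the APIC_* macros used above):

#include <stdint.h>
#include <stdio.h>

/* Architectural ICR_LO bit fields */
#define	ICR_VECTOR(v)	((v) & 0x000000ffu)	/* cf. APIC_VECTOR_MASK */
#define	ICR_DELMODE(v)	((v) & 0x00000700u)	/* cf. APIC_DELMODE_MASK */
#define	ICR_DSH(v)	((v) & 0x000c0000u)	/* cf. APIC_DEST_MASK */

int
main(void)
{
	/* Typical AP startup: INIT (assert) then SIPI with vector 0x08 */
	const uint32_t icr_init = 0x000c4500;	/* all-excluding-self, INIT */
	const uint32_t icr_sipi = 0x000c4608;	/* all-excluding-self, STARTUP */

	printf("INIT: delmode=%#x dsh=%#x\n",
	    ICR_DELMODE(icr_init), ICR_DSH(icr_init));
	printf("SIPI: delmode=%#x dsh=%#x vector=%#x\n",
	    ICR_DELMODE(icr_sipi), ICR_DSH(icr_sipi), ICR_VECTOR(icr_sipi));
	return (0);
}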
--- a/usr/src/uts/i86pc/io/vmm/io/vlapic.h	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h	Mon Jan 11 08:50:27 2021 -0500
@@ -30,6 +30,7 @@
 
 /*
  * Copyright 2018 Joyent, Inc.
+ * Copyright 2020 Oxide Computer Company
  */
 
 #ifndef _VLAPIC_H_
@@ -38,6 +39,8 @@
 struct vm;
 enum x2apic_state;
 
+void vlapic_reset(struct vlapic *vlapic);
+
 int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset,
     uint64_t data);
 int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset,
--- a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h	Mon Jan 11 08:50:27 2021 -0500
@@ -137,12 +137,6 @@
 	VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]);	\
 } while (0)
 
-enum boot_state {
-	BS_INIT,
-	BS_SIPI,
-	BS_RUNNING
-};
-
 /*
  * 16 priority levels with at most one vector injected per level.
  */
@@ -182,7 +176,6 @@
 	struct mtx	timer_mtx;
 
 	uint64_t	msr_apicbase;
-	enum boot_state	boot_state;
 
 	/*
 	 * Copies of some registers in the virtual APIC page. We do this for
--- a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h	Mon Jan 11 08:50:27 2021 -0500
@@ -64,18 +64,12 @@
 struct vm_guest_paging;
 struct pmap;
 
-struct vm_eventinfo {
-	uint_t	*rptr;		/* runblock cookie */
-	int	*sptr;		/* suspend cookie */
-	int	*iptr;		/* reqidle cookie */
-};
-
 typedef int	(*vmm_init_func_t)(int ipinum);
 typedef int	(*vmm_cleanup_func_t)(void);
 typedef void	(*vmm_resume_func_t)(void);
 typedef void *	(*vmi_init_func_t)(struct vm *vm, struct pmap *pmap);
 typedef int	(*vmi_run_func_t)(void *vmi, int vcpu, uint64_t rip,
-    struct pmap *pmap, struct vm_eventinfo *info);
+    struct pmap *pmap);
 typedef void	(*vmi_cleanup_func_t)(void *vmi);
 typedef int	(*vmi_get_register_t)(void *vmi, int vcpu, int num,
     uint64_t *retval);
@@ -84,7 +78,7 @@
 typedef int	(*vmi_get_desc_t)(void *vmi, int vcpu, int num,
     struct seg_desc *desc);
 typedef int	(*vmi_set_desc_t)(void *vmi, int vcpu, int num,
-    struct seg_desc *desc);
+    const struct seg_desc *desc);
 typedef int	(*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
 typedef int	(*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
 typedef struct vmspace *(*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max);
@@ -169,9 +163,13 @@
 int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
 int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
 int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
-		    struct seg_desc *ret_desc);
+    struct seg_desc *ret_desc);
 int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
-		    struct seg_desc *desc);
+    const struct seg_desc *desc);
+int vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state,
+    uint8_t *sipi_vec);
+int vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state,
+    uint8_t sipi_vec);
 int vm_run(struct vm *vm, int vcpuid, const struct vm_entry *);
 int vm_suspend(struct vm *vm, enum vm_suspend_how how);
 int vm_inject_nmi(struct vm *vm, int vcpu);
@@ -180,6 +178,8 @@
 int vm_inject_extint(struct vm *vm, int vcpu);
 int vm_extint_pending(struct vm *vm, int vcpuid);
 void vm_extint_clear(struct vm *vm, int vcpuid);
+int vm_inject_init(struct vm *vm, int vcpuid);
+int vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vec);
 struct vlapic *vm_lapic(struct vm *vm, int cpu);
 struct vioapic *vm_ioapic(struct vm *vm);
 struct vhpet *vm_hpet(struct vm *vm);
@@ -195,14 +195,13 @@
 struct vie *vm_vie_ctx(struct vm *vm, int vcpuid);
 void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip);
-void vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
 void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);
+void vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip);
 int vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval,
     int rsize);
 int vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval,
     int wsize);
-void vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip);
 
 #ifdef _SYS__CPUSET_H_
 cpuset_t vm_active_cpus(struct vm *vm);
@@ -210,28 +209,9 @@
 cpuset_t vm_suspended_cpus(struct vm *vm);
 #endif	/* _SYS__CPUSET_H_ */
 
-static __inline int
-vcpu_runblocked(struct vm_eventinfo *info)
-{
-
-	return (*info->rptr != 0);
-}
-
-static __inline int
-vcpu_suspended(struct vm_eventinfo *info)
-{
-
-	return (*info->sptr);
-}
-
-static __inline int
-vcpu_reqidle(struct vm_eventinfo *info)
-{
-
-	return (*info->iptr);
-}
-
-int vcpu_debugged(struct vm *vm, int vcpuid);
+bool vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip);
+bool vcpu_run_state_pending(struct vm *vm, int vcpuid);
+int vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only);
 
 /*
  * Return true if device indicated by bus/slot/func is supposed to be a
--- a/usr/src/uts/i86pc/io/vmm/vmm.c	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/i86pc/io/vmm/vmm.c	Mon Jan 11 08:50:27 2021 -0500
@@ -109,17 +109,15 @@
  * (x) initialized before use
  */
 struct vcpu {
-	struct mtx	mtx;		/* (o) protects 'state' and 'hostcpu' */
+	/* (o) protects state, run_state, hostcpu, sipi_vector */
+	struct mtx	mtx;
+
 	enum vcpu_state	state;		/* (o) vcpu state */
-#ifndef __FreeBSD__
+	enum vcpu_run_state run_state;	/* (i) vcpu init/sipi/run state */
 	kcondvar_t	vcpu_cv;	/* (o) cpu waiter cv */
 	kcondvar_t	state_cv;	/* (o) IDLE-transition cv */
-#endif /* __FreeBSD__ */
 	int		hostcpu;	/* (o) vcpu's current host cpu */
-#ifndef __FreeBSD__
 	int		lastloccpu;	/* (o) last host cpu localized to */
-#endif
-	uint_t		runblock;	/* (i) block vcpu from run state */
 	int		reqidle;	/* (i) request vcpu to idle */
 	struct vlapic	*vlapic;	/* (i) APIC device model */
 	enum x2apic_state x2apic_state;	/* (i) APIC mode */
@@ -130,6 +128,7 @@
 	int	exc_vector;		/* (x) exception collateral */
 	int	exc_errcode_valid;
 	uint32_t exc_errcode;
+	uint8_t		sipi_vector;	/* (i) SIPI vector */
 	struct savefpu	*guestfpu;	/* (a,i) guest fpu state */
 	uint64_t	guest_xcr0;	/* (i) guest %xcr0 register */
 	void		*stats;		/* (a,i) statistics */
@@ -200,15 +199,6 @@
 	uint16_t	maxcpus;		/* (o) max pluggable cpus */
 
 	struct ioport_config ioports;		/* (o) ioport handling */
-
-	bool		sipi_req;		/* (i) SIPI requested */
-	int		sipi_req_vcpu;		/* (i) SIPI destination */
-	uint64_t	sipi_req_rip;		/* (i) SIPI start %rip */
-
-	/* Miscellaneous VM-wide statistics and counters */
-	struct vm_wide_stats {
-		uint64_t sipi_supersede;
-	} stats;
 };
 
 static int vmm_initialized;
@@ -249,8 +239,8 @@
 #define	VMM_RESUME()			((*ops->resume)())
 
 #define	VMINIT(vm, pmap)		((*ops->vminit)(vm, pmap))
-#define	VMRUN(vmi, vcpu, rip, pmap, evinfo) \
-	((*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo))
+#define	VMRUN(vmi, vcpu, rip, pmap) \
+	((*ops->vmrun)(vmi, vcpu, rip, pmap))
 #define	VMCLEANUP(vmi)			((*ops->vmcleanup)(vmi))
 #define	VMSPACE_ALLOC(min, max)		((*ops->vmspace_alloc)(min, max))
 #define	VMSPACE_FREE(vmspace)		((*ops->vmspace_free)(vmspace))
@@ -292,6 +282,8 @@
 static void vm_free_memmap(struct vm *vm, int ident);
 static bool sysmem_mapping(struct vm *vm, struct mem_map *mm);
 static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t);
+static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid);
+static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector);
 
 #ifndef __FreeBSD__
 static void vm_clear_memseg(struct vm *, int);
@@ -370,9 +362,9 @@
 		bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo));
 	}
 
+	vcpu->run_state = VRS_HALT;
 	vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
 	vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
-	vcpu->runblock = 0;
 	vcpu->reqidle = 0;
 	vcpu->exitintinfo = 0;
 	vcpu->nmi_pending = 0;
@@ -1233,7 +1225,7 @@
 }
 
 int
-vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc)
+vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc)
 {
 	if (vcpu < 0 || vcpu >= vm->maxcpus)
 		return (EINVAL);
@@ -1244,6 +1236,49 @@
 	return (VMSETDESC(vm->cookie, vcpu, reg, desc));
 }
 
+int
+vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec)
+{
+	struct vcpu *vcpu;
+
+	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
+		return (EINVAL);
+	}
+
+	vcpu = &vm->vcpu[vcpuid];
+
+	vcpu_lock(vcpu);
+	*state = vcpu->run_state;
+	*sipi_vec = vcpu->sipi_vector;
+	vcpu_unlock(vcpu);
+
+	return (0);
+}
+
+int
+vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec)
+{
+	struct vcpu *vcpu;
+
+	if (vcpuid < 0 || vcpuid >= vm->maxcpus) {
+		return (EINVAL);
+	}
+	if (!VRS_IS_VALID(state)) {
+		return (EINVAL);
+	}
+
+	vcpu = &vm->vcpu[vcpuid];
+
+	vcpu_lock(vcpu);
+	vcpu->run_state = state;
+	vcpu->sipi_vector = sipi_vec;
+	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
+	vcpu_unlock(vcpu);
+
+	return (0);
+}
+
+
 static void
 restore_guest_fpustate(struct vcpu *vcpu)
 {
@@ -1354,16 +1389,6 @@
 		break;
 	}
 
-	if (newstate == VCPU_RUNNING) {
-		while (vcpu->runblock != 0) {
-#ifdef __FreeBSD__
-			msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
-#else
-			cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
-#endif
-		}
-	}
-
 	if (error)
 		return (EBUSY);
 
@@ -1376,8 +1401,7 @@
 	else
 		vcpu->hostcpu = NOCPU;
 
-	if (newstate == VCPU_IDLE ||
-	    (newstate == VCPU_FROZEN && vcpu->runblock != 0)) {
+	if (newstate == VCPU_IDLE) {
 #ifdef __FreeBSD__
 		wakeup(&vcpu->state);
 #else
@@ -1413,12 +1437,8 @@
 vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled)
 {
 	struct vcpu *vcpu;
-#ifdef __FreeBSD__
-	const char *wmesg;
-#else
-	const char *wmesg __unused;
-#endif
 	int t, vcpu_halted, vm_halted;
+	bool userspace_exit = false;
 
 	KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted"));
 
@@ -1429,17 +1449,12 @@
 	vcpu_lock(vcpu);
 	while (1) {
 		/*
-		 * Do a final check for pending NMI or interrupts before
-		 * really putting this thread to sleep. Also check for
-		 * software events that would cause this vcpu to wakeup.
-		 *
-		 * These interrupts/events could have happened after the
-		 * vcpu returned from VMRUN() and before it acquired the
-		 * vcpu lock above.
+		 * Do a final check for pending interrupts (including NMI and
+		 * INIT) before putting this thread to sleep.
 		 */
-		if (vm->suspend || vcpu->reqidle)
+		if (vm_nmi_pending(vm, vcpuid))
 			break;
-		if (vm_nmi_pending(vm, vcpuid))
+		if (vcpu_run_state_pending(vm, vcpuid))
 			break;
 		if (!intr_disabled) {
 			if (vm_extint_pending(vm, vcpuid) ||
@@ -1448,12 +1463,15 @@
 			}
 		}
 
-		/* Don't go to sleep if the vcpu thread needs to yield */
-		if (vcpu_should_yield(vm, vcpuid))
+		/*
+		 * Also check for software events which would cause a wake-up.
+		 * This will set the appropriate exitcode directly, rather than
+		 * requiring a trip through VM_RUN().
+		 */
+		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
+			userspace_exit = true;
 			break;
-
-		if (vcpu_debugged(vm, vcpuid))
-			break;
+		}
 
 		/*
 		 * Some Linux guests implement "halt" by having all vcpus
@@ -1462,8 +1480,6 @@
 		 * vcpus enter the halted state the virtual machine is halted.
 		 */
 		if (intr_disabled) {
-			wmesg = "vmhalt";
-			VCPU_CTR0(vm, vcpuid, "Halted");
 			if (!vcpu_halted && halt_detection_enabled) {
 				vcpu_halted = 1;
 				CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus);
@@ -1472,25 +1488,11 @@
 				vm_halted = 1;
 				break;
 			}
-		} else {
-			wmesg = "vmidle";
 		}
 
 		t = ticks;
 		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
-#ifdef __FreeBSD__
-		/*
-		 * XXX msleep_spin() cannot be interrupted by signals so
-		 * wake up periodically to check pending signals.
-		 */
-		msleep_spin(vcpu, &vcpu->mtx, wmesg, hz);
-#else
-		/*
-		 * Fortunately, cv_wait_sig can be interrupted by signals, so
-		 * there is no need to periodically wake up.
-		 */
 		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
-#endif
 		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
 		vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
 	}
@@ -1503,7 +1505,7 @@
 	if (vm_halted)
 		vm_suspend(vm, VM_SUSPEND_HALT);
 
-	return (0);
+	return (userspace_exit ? -1 : 0);
 }
 
 static int
@@ -1832,6 +1834,62 @@
 	return (-1);
 }
 
+static int
+vm_handle_run_state(struct vm *vm, int vcpuid)
+{
+	struct vcpu *vcpu = &vm->vcpu[vcpuid];
+	bool handled = false;
+
+	vcpu_lock(vcpu);
+	while (1) {
+		if ((vcpu->run_state & VRS_PEND_INIT) != 0) {
+			vcpu_unlock(vcpu);
+			VERIFY0(vcpu_arch_reset(vm, vcpuid, true));
+			vcpu_lock(vcpu);
+
+			vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT);
+			vcpu->run_state |= VRS_INIT;
+		}
+
+		if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) ==
+		    (VRS_INIT | VRS_PEND_SIPI)) {
+			const uint8_t vector = vcpu->sipi_vector;
+
+			vcpu_unlock(vcpu);
+			VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector));
+			vcpu_lock(vcpu);
+
+			vcpu->run_state &= ~VRS_PEND_SIPI;
+			vcpu->run_state |= VRS_RUN;
+		}
+
+		/*
+		 * If the vCPU is now in the running state, there is no need to
+		 * wait for anything prior to re-entry.
+		 */
+		if ((vcpu->run_state & VRS_RUN) != 0) {
+			handled = true;
+			break;
+		}
+
+		/*
+		 * Also check for software events which would cause a wake-up.
+		 * This will set the appropriate exitcode directly, rather than
+		 * requiring a trip through VM_RUN().
+		 */
+		if (vcpu_sleep_bailout_checks(vm, vcpuid)) {
+			break;
+		}
+
+		vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
+		(void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m);
+		vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
+	}
+	vcpu_unlock(vcpu);
+
+	return (handled ? 0 : -1);
+}
+
 #ifndef __FreeBSD__
 static int
 vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme)
@@ -1850,18 +1908,6 @@
 }
 #endif /* __FreeBSD__ */
 
-void
-vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip)
-{
-	if (vm->sipi_req) {
-		/* This should never occur if userspace is doing its job. */
-		vm->stats.sipi_supersede++;
-	}
-	vm->sipi_req = true;
-	vm->sipi_req_vcpu = req_vcpuid;
-	vm->sipi_req_rip = req_rip;
-}
-
 int
 vm_suspend(struct vm *vm, enum vm_suspend_how how)
 {
@@ -1890,66 +1936,17 @@
 }
 
 void
-vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip)
-{
-	struct vm_exit *vmexit;
-
-	KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST,
-	    ("vm_exit_suspended: invalid suspend type %d", vm->suspend));
-
-	vmexit = vm_exitinfo(vm, vcpuid);
-	vmexit->rip = rip;
-	vmexit->inst_length = 0;
-	vmexit->exitcode = VM_EXITCODE_SUSPENDED;
-	vmexit->u.suspended.how = vm->suspend;
-}
-
-void
-vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
+vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip)
 {
 	struct vm_exit *vmexit;
 
 	vmexit = vm_exitinfo(vm, vcpuid);
 	vmexit->rip = rip;
 	vmexit->inst_length = 0;
-	vmexit->exitcode = VM_EXITCODE_DEBUG;
-}
-
-void
-vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip)
-{
-	struct vm_exit *vmexit;
-
-	vmexit = vm_exitinfo(vm, vcpuid);
-	vmexit->rip = rip;
-	vmexit->inst_length = 0;
-	vmexit->exitcode = VM_EXITCODE_RUNBLOCK;
-	vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1);
+	vmexit->exitcode = VM_EXITCODE_RUN_STATE;
+	vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1);
 }
 
-void
-vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip)
-{
-	struct vm_exit *vmexit;
-
-	vmexit = vm_exitinfo(vm, vcpuid);
-	vmexit->rip = rip;
-	vmexit->inst_length = 0;
-	vmexit->exitcode = VM_EXITCODE_REQIDLE;
-	vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
-}
-
-void
-vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip)
-{
-	struct vm_exit *vmexit;
-
-	vmexit = vm_exitinfo(vm, vcpuid);
-	vmexit->rip = rip;
-	vmexit->inst_length = 0;
-	vmexit->exitcode = VM_EXITCODE_BOGUS;
-	vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
-}
 
 #ifndef __FreeBSD__
 /*
@@ -2072,7 +2069,7 @@
 	case VEC_DISCARD_INSTR:
 		vie_reset(vie);
 		return (0);
-	case VEC_COMPLETE_MMIO:
+	case VEC_FULFILL_MMIO:
 		err = vie_fulfill_mmio(vie, &entry->u.mmio);
 		if (err == 0) {
 			err = vie_emulate_mmio(vie, vm, vcpuid);
@@ -2091,7 +2088,7 @@
 			}
 		}
 		break;
-	case VEC_COMPLETE_INOUT:
+	case VEC_FULFILL_INOUT:
 		err = vie_fulfill_inout(vie, &entry->u.inout);
 		if (err == 0) {
 			err = vie_emulate_inout(vie, vm, vcpuid);
@@ -2132,25 +2129,12 @@
 		return (-1);
 	}
 
-	if (vcpuid == 0 && vm->sipi_req) {
-		/* The boot vCPU has sent a SIPI to one of the other CPUs */
-		vme->exitcode = VM_EXITCODE_SPINUP_AP;
-		vme->u.spinup_ap.vcpu = vm->sipi_req_vcpu;
-		vme->u.spinup_ap.rip = vm->sipi_req_rip;
-
-		vm->sipi_req = false;
-		vm->sipi_req_vcpu = 0;
-		vm->sipi_req_rip = 0;
-		return (-1);
-	}
-
 	return (0);
 }
 
 int
 vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry)
 {
-	struct vm_eventinfo evinfo;
 	int error;
 	struct vcpu *vcpu;
 #ifdef	__FreeBSD__
@@ -2177,9 +2161,6 @@
 	pmap = vmspace_pmap(vm->vmspace);
 	vcpu = &vm->vcpu[vcpuid];
 	vme = &vcpu->exitinfo;
-	evinfo.rptr = &vcpu->runblock;
-	evinfo.sptr = &vm->suspend;
-	evinfo.iptr = &vcpu->reqidle;
 
 #ifndef	__FreeBSD__
 	vtc.vtc_vm = vm;
@@ -2242,7 +2223,7 @@
 #endif
 
 	vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
-	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo);
+	error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap);
 	vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
 
 #ifdef	__FreeBSD__
@@ -2273,6 +2254,9 @@
 	case VM_EXITCODE_REQIDLE:
 		error = vm_handle_reqidle(vm, vcpuid);
 		break;
+	case VM_EXITCODE_RUN_STATE:
+		error = vm_handle_run_state(vm, vcpuid);
+		break;
 	case VM_EXITCODE_SUSPENDED:
 		error = vm_handle_suspend(vm, vcpuid);
 		break;
@@ -2280,8 +2264,6 @@
 		vioapic_process_eoi(vm, vcpuid,
 		    vme->u.ioapic_eoi.vector);
 		break;
-	case VM_EXITCODE_RUNBLOCK:
-		break;
 	case VM_EXITCODE_HLT:
 		intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
 		error = vm_handle_hlt(vm, vcpuid, intr_disabled);
@@ -2792,6 +2774,196 @@
 }
 
 int
+vm_inject_init(struct vm *vm, int vcpuid)
+{
+	struct vcpu *vcpu;
+
+	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+		return (EINVAL);
+
+	vcpu = &vm->vcpu[vcpuid];
+	vcpu_lock(vcpu);
+	vcpu->run_state |= VRS_PEND_INIT;
+	vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
+	vcpu_unlock(vcpu);
+	return (0);
+}
+
+int
+vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector)
+{
+	struct vcpu *vcpu;
+
+	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+		return (EINVAL);
+
+	vcpu = &vm->vcpu[vcpuid];
+	vcpu_lock(vcpu);
+	vcpu->run_state |= VRS_PEND_SIPI;
+	vcpu->sipi_vector = vector;
+	/* SIPI is only actionable if the CPU is waiting in INIT state */
+	if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) {
+		vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
+	}
+	vcpu_unlock(vcpu);
+	return (0);
+}
+
+bool
+vcpu_run_state_pending(struct vm *vm, int vcpuid)
+{
+	struct vcpu *vcpu;
+
+	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
+	vcpu = &vm->vcpu[vcpuid];
+
+	/* Of interest: vCPU not in running state or with pending INIT */
+	return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN);
+}
+
+int
+vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only)
+{
+	struct seg_desc desc;
+	const enum vm_reg_name clear_regs[] = {
+		VM_REG_GUEST_CR2,
+		VM_REG_GUEST_CR3,
+		VM_REG_GUEST_CR4,
+		VM_REG_GUEST_RAX,
+		VM_REG_GUEST_RBX,
+		VM_REG_GUEST_RCX,
+		VM_REG_GUEST_RSI,
+		VM_REG_GUEST_RDI,
+		VM_REG_GUEST_RBP,
+		VM_REG_GUEST_RSP,
+		VM_REG_GUEST_R8,
+		VM_REG_GUEST_R9,
+		VM_REG_GUEST_R10,
+		VM_REG_GUEST_R11,
+		VM_REG_GUEST_R12,
+		VM_REG_GUEST_R13,
+		VM_REG_GUEST_R14,
+		VM_REG_GUEST_R15,
+		VM_REG_GUEST_DR0,
+		VM_REG_GUEST_DR1,
+		VM_REG_GUEST_DR2,
+		VM_REG_GUEST_DR3,
+		VM_REG_GUEST_EFER,
+	};
+	const enum vm_reg_name data_segs[] = {
+		VM_REG_GUEST_SS,
+		VM_REG_GUEST_DS,
+		VM_REG_GUEST_ES,
+		VM_REG_GUEST_FS,
+		VM_REG_GUEST_GS,
+	};
+	struct vcpu *vcpu = &vm->vcpu[vcpuid];
+
+	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+		return (EINVAL);
+
+	for (uint_t i = 0; i < nitems(clear_regs); i++) {
+		VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0));
+	}
+
+	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2));
+	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0));
+	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010));
+
+	/*
+	 * The prescribed contents of %rdx differ slightly between the Intel and
+	 * AMD architectural definitions.  The former expects the Extended Model
+	 * in bits 16-19 where the latter expects all the Family, Model, and
+	 * Stepping be there.  Common boot ROMs appear to disregard this
+	 * anyways, so we stick with a compromise value similar to what is
+	 * spelled out in the Intel SDM.
+	 */
+	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600));
+
+	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0));
+	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400));
+
+	/* CS: Present, R/W, Accessed */
+	desc.access = 0x0093;
+	desc.base = 0xffff0000;
+	desc.limit = 0xffff;
+	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
+	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000));
+
+	/* SS, DS, ES, FS, GS: Present, R/W, Accessed */
+	desc.access = 0x0093;
+	desc.base = 0;
+	desc.limit = 0xffff;
+	for (uint_t i = 0; i < nitems(data_segs); i++) {
+		VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc));
+		VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0));
+	}
+
+	/* GDTR, IDTR */
+	desc.base = 0;
+	desc.limit = 0xffff;
+	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc));
+	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc));
+
+	/* LDTR: Present, LDT */
+	desc.access = 0x0082;
+	desc.base = 0;
+	desc.limit = 0xffff;
+	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc));
+	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0));
+
+	/* TR: Present, 32-bit TSS */
+	desc.access = 0x008b;
+	desc.base = 0;
+	desc.limit = 0xffff;
+	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc));
+	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0));
+
+	vlapic_reset(vm_lapic(vm, vcpuid));
+
+	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0));
+
+	vcpu->exitintinfo = 0;
+	vcpu->exception_pending = 0;
+	vcpu->nmi_pending = 0;
+	vcpu->extint_pending = 0;
+
+	/*
+	 * A CPU reset caused by power-on or system reset clears more state than
+	 * one which is triggered from an INIT IPI.
+	 */
+	if (!init_only) {
+		vcpu->guest_xcr0 = XFEATURE_ENABLED_X87;
+		fpu_save_area_reset(vcpu->guestfpu);
+
+		/* XXX: clear MSRs and other pieces */
+	}
+
+	return (0);
+}
+
+static int
+vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector)
+{
+	struct seg_desc desc;
+
+	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
+		return (EINVAL);
+
+	/* CS: Present, R/W, Accessed */
+	desc.access = 0x0093;
+	desc.base = (uint64_t)vector << 12;
+	desc.limit = 0xffff;
+	VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc));
+	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS,
+	    (uint64_t)vector << 8));
+
+	VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0));
+
+	return (0);
+}
+
+int
 vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
 {
 	if (vcpu < 0 || vcpu >= vm->maxcpus)
@@ -2894,7 +3066,7 @@
 	struct vcpu *vcpu;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
-		panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
+		panic("vcpu_set_state: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
@@ -2912,7 +3084,7 @@
 	enum vcpu_state state;
 
 	if (vcpuid < 0 || vcpuid >= vm->maxcpus)
-		panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
+		panic("vcpu_get_state: invalid vcpuid %d", vcpuid);
 
 	vcpu = &vm->vcpu[vcpuid];
 
@@ -2925,54 +3097,6 @@
 	return (state);
 }
 
-void
-vcpu_block_run(struct vm *vm, int vcpuid)
-{
-	struct vcpu *vcpu;
-
-	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
-		panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
-
-	vcpu = &vm->vcpu[vcpuid];
-
-	vcpu_lock(vcpu);
-	vcpu->runblock++;
-	if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) {
-		vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT);
-	}
-	while (vcpu->state == VCPU_RUNNING) {
-#ifdef __FreeBSD__
-		msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
-#else
-		cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
-#endif
-	}
-	vcpu_unlock(vcpu);
-}
-
-void
-vcpu_unblock_run(struct vm *vm, int vcpuid)
-{
-	struct vcpu *vcpu;
-
-	if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
-		panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
-
-	vcpu = &vm->vcpu[vcpuid];
-
-	vcpu_lock(vcpu);
-	KASSERT(vcpu->runblock != 0, ("expected non-zero runblock"));
-	vcpu->runblock--;
-	if (vcpu->runblock == 0) {
-#ifdef __FreeBSD__
-		wakeup(&vcpu->state);
-#else
-		cv_broadcast(&vcpu->state_cv);
-#endif
-	}
-	vcpu_unlock(vcpu);
-}
-
 #ifndef	__FreeBSD__
 uint64_t
 vcpu_tsc_offset(struct vm *vm, int vcpuid)
@@ -3038,11 +3162,93 @@
 	return (0);
 }
 
-int
-vcpu_debugged(struct vm *vm, int vcpuid)
+static bool
+vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry,
+    uint64_t entry_rip)
 {
-
-	return (CPU_ISSET(vcpuid, &vm->debug_cpus));
+	struct vcpu *vcpu = &vm->vcpu[vcpuid];
+	struct vm_exit *vme = &vcpu->exitinfo;
+	bool bail = false;
+
+	ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus);
+
+	if (vm->suspend) {
+		if (on_entry) {
+			VERIFY(vm->suspend > VM_SUSPEND_NONE &&
+			    vm->suspend < VM_SUSPEND_LAST);
+
+			vme->exitcode = VM_EXITCODE_SUSPENDED;
+			vme->u.suspended.how = vm->suspend;
+		} else {
+			/*
+			 * Handling VM suspend is complicated, so if that
+			 * condition is detected outside of VM-entry itself,
+			 * just emit a BOGUS exitcode so we take a lap to pick
+			 * up the event during an entry and are directed into
+			 * the vm_handle_suspend() logic.
+			 */
+			vme->exitcode = VM_EXITCODE_BOGUS;
+		}
+		bail = true;
+	}
+	if (vcpu->reqidle) {
+		vme->exitcode = VM_EXITCODE_REQIDLE;
+		vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1);
+
+		if (!on_entry) {
+			/*
+			 * A reqidle request detected outside of VM-entry can be
+			 * handled directly by clearing the request (and taking
+			 * a lap to userspace).
+			 */
+			vcpu_assert_locked(vcpu);
+			vcpu->reqidle = 0;
+		}
+		bail = true;
+	}
+	if (vcpu_should_yield(vm, vcpuid)) {
+		vme->exitcode = VM_EXITCODE_BOGUS;
+		vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1);
+		bail = true;
+	}
+	if (CPU_ISSET(vcpuid, &vm->debug_cpus)) {
+		vme->exitcode = VM_EXITCODE_DEBUG;
+		bail = true;
+	}
+
+	if (bail) {
+		if (on_entry) {
+			/*
+			 * If bailing out during VM-entry, the current %rip must
+			 * be recorded in the exitinfo.
+			 */
+			vme->rip = entry_rip;
+		}
+		vme->inst_length = 0;
+	}
+	return (bail);
+}
+
+static bool
+vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid)
+{
+	/*
+	 * Bail-out check done prior to sleeping (in vCPU contexts like HLT or
+	 * wait-for-SIPI) expect that %rip is already populated in the vm_exit
+	 * structure, and we would only modify the exitcode.
+	 */
+	return (vcpu_bailout_checks(vm, vcpuid, false, 0));
+}
+
+bool
+vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip)
+{
+	/*
+	 * Bail-out checks done as part of VM entry require an updated %rip to
+	 * populate the vm_exit struct if any of the conditions of interest are
+	 * matched in the check.
+	 */
+	return (vcpu_bailout_checks(vm, vcpuid, true, rip));
 }
 
 cpuset_t
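vcpu_vector_sipi() above encodes the architectural SIPI landing point: the 8-bit vector becomes the real-mode %cs (base = vector << 12, selector = vector << 8) with %rip cleared, so the AP begins fetching at physical address vector << 12. A tiny sketch of that mapping:

#include <stdint.h>
#include <stdio.h>

/* Physical start address implied by a SIPI vector, per vcpu_vector_sipi() */
static uint32_t
sipi_start_pa(uint8_t vector)
{
	/* CS.base = vector << 12, CS.sel = vector << 8, %rip = 0 */
	return ((uint32_t)vector << 12);
}

int
main(void)
{
	/* e.g. a SIPI with vector 0x08 starts the AP at 0x8000 */
	printf("vector 0x08 -> %#x\n", sipi_start_pa(0x08));
	return (0);
}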
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c	Mon Jan 11 08:50:27 2021 -0500
@@ -443,6 +443,9 @@
 	case VM_RESTART_INSTRUCTION:
 	case VM_SET_KERNEMU_DEV:
 	case VM_GET_KERNEMU_DEV:
+	case VM_RESET_CPU:
+	case VM_GET_RUN_STATE:
+	case VM_SET_RUN_STATE:
 		/*
 		 * Copy in the ID of the vCPU chosen for this operation.
 		 * Since a nefarious caller could update their struct between
@@ -989,6 +992,45 @@
 		}
 		break;
 	}
+	case VM_RESET_CPU: {
+		struct vm_vcpu_reset vvr;
+
+		if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) {
+			error = EFAULT;
+			break;
+		}
+		if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) {
+			error = EINVAL;
+		}
+
+		error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT);
+		break;
+	}
+	case VM_GET_RUN_STATE: {
+		struct vm_run_state vrs;
+
+		bzero(&vrs, sizeof (vrs));
+		error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state,
+		    &vrs.sipi_vector);
+		if (error == 0) {
+			if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) {
+				error = EFAULT;
+				break;
+			}
+		}
+		break;
+	}
+	case VM_SET_RUN_STATE: {
+		struct vm_run_state vrs;
+
+		if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) {
+			error = EFAULT;
+			break;
+		}
+		error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state,
+		    vrs.sipi_vector);
+		break;
+	}
 
 	case VM_SET_KERNEMU_DEV:
 	case VM_GET_KERNEMU_DEV: {
--- a/usr/src/uts/i86pc/io/vmm/vmm_stat.c	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.c	Mon Jan 11 08:50:27 2021 -0500
@@ -167,6 +167,5 @@
 VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason");
 VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit");
 VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit");
-VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace");
-VMM_STAT(VMEXIT_RUNBLOCK, "number of times runblock at exit");
 VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions");
+VMM_STAT(VMEXIT_RUN_STATE, "number of vm exits due to run_state change");
--- a/usr/src/uts/i86pc/io/vmm/vmm_stat.h	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.h	Mon Jan 11 08:50:27 2021 -0500
@@ -165,8 +165,7 @@
 VMM_STAT_DECLARE(VMEXIT_MMIO_EMUL);
 VMM_STAT_DECLARE(VMEXIT_UNKNOWN);
 VMM_STAT_DECLARE(VMEXIT_ASTPENDING);
-VMM_STAT_DECLARE(VMEXIT_USERSPACE);
-VMM_STAT_DECLARE(VMEXIT_RUNBLOCK);
 VMM_STAT_DECLARE(VMEXIT_EXCEPTION);
 VMM_STAT_DECLARE(VMEXIT_REQIDLE);
+VMM_STAT_DECLARE(VMEXIT_RUN_STATE);
 #endif
--- a/usr/src/uts/i86pc/sys/vmm.h	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/i86pc/sys/vmm.h	Mon Jan 11 08:50:27 2021 -0500
@@ -217,9 +217,9 @@
 	VM_EXITCODE_PAUSE,
 	VM_EXITCODE_PAGING,
 	VM_EXITCODE_INST_EMUL,
-	VM_EXITCODE_SPINUP_AP,
+	VM_EXITCODE_RUN_STATE,
 	VM_EXITCODE_MMIO_EMUL,
-	VM_EXITCODE_RUNBLOCK,
+	VM_EXITCODE_DEPRECATED,	/* formerly RUNBLOCK */
 	VM_EXITCODE_IOAPIC_EOI,
 	VM_EXITCODE_SUSPENDED,
 	VM_EXITCODE_MMIO,
@@ -287,6 +287,18 @@
 	struct vm_guest_paging paging;
 };
 
+enum vcpu_run_state {
+	VRS_HALT		= 0,
+	VRS_INIT		= (1 << 0),
+	VRS_RUN			= (1 << 1),
+
+	VRS_PEND_INIT		= (1 << 14),
+	VRS_PEND_SIPI		= (1 << 15),
+};
+#define VRS_MASK_VALID(v)	\
+	((v) & (VRS_INIT | VRS_RUN | VRS_PEND_INIT | VRS_PEND_SIPI))
+#define VRS_IS_VALID(v)		((v) == VRS_MASK_VALID(v))
+
 struct vm_exit {
 	enum vm_exitcode	exitcode;
 	int			inst_length;	/* 0 means unknown */
@@ -348,10 +360,6 @@
 			uint64_t	wval;
 		} msr;
 		struct {
-			int		vcpu;
-			uint64_t	rip;
-		} spinup_ap;
-		struct {
 			uint64_t	rflags;
 		} hlt;
 		struct {
@@ -367,8 +375,8 @@
 enum vm_entry_cmds {
 	VEC_DEFAULT = 0,
 	VEC_DISCARD_INSTR,	/* discard inst emul state */
-	VEC_COMPLETE_MMIO,	/* entry includes result for mmio emul */
-	VEC_COMPLETE_INOUT,	/* entry includes result for inout emul */
+	VEC_FULFILL_MMIO,	/* entry includes result for mmio emul */
+	VEC_FULFILL_INOUT,	/* entry includes result for inout emul */
 };
 
 struct vm_entry {
--- a/usr/src/uts/i86pc/sys/vmm_dev.h	Mon Jan 04 14:49:49 2021 -0500
+++ b/usr/src/uts/i86pc/sys/vmm_dev.h	Mon Jan 11 08:50:27 2021 -0500
@@ -259,6 +259,28 @@
 };
 _Static_assert(sizeof(struct vm_readwrite_kernemu_device) == 24, "ABI");
 
+enum vcpu_reset_kind {
+	VRK_RESET = 0,
+	/*
+	 * The reset performed by an INIT IPI clears much of the CPU state, but
+	 * some portions are left untouched, unlike VRK_RESET, which represents
+	 * a "full" reset as if the system was freshly powered on.
+	 */
+	VRK_INIT = 1,
+};
+
+struct vm_vcpu_reset {
+	int		vcpuid;
+	uint32_t	kind;	/* contains: enum vcpu_reset_kind */
+};
+
+struct vm_run_state {
+	int		vcpuid;
+	uint32_t	state;	/* contains: enum vcpu_run_state */
+	uint8_t		sipi_vector;	/* vector of SIPI, if any */
+	uint8_t		_pad[3];
+};
+
 #define	VMMCTL_IOC_BASE		(('V' << 16) | ('M' << 8))
 #define	VMM_IOC_BASE		(('v' << 16) | ('m' << 8))
 #define	VMM_LOCK_IOC_BASE	(('v' << 16) | ('l' << 8))
@@ -291,6 +313,9 @@
 #define	VM_RESTART_INSTRUCTION		(VMM_CPU_IOC_BASE | 0x13)
 #define	VM_SET_KERNEMU_DEV		(VMM_CPU_IOC_BASE | 0x14)
 #define	VM_GET_KERNEMU_DEV		(VMM_CPU_IOC_BASE | 0x15)
+#define	VM_RESET_CPU			(VMM_CPU_IOC_BASE | 0x16)
+#define	VM_GET_RUN_STATE		(VMM_CPU_IOC_BASE | 0x17)
+#define	VM_SET_RUN_STATE		(VMM_CPU_IOC_BASE | 0x18)
 
 /* Operations requiring write-locking the VM */
 #define	VM_REINIT		(VMM_LOCK_IOC_BASE | 0x01)
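Taken together, the new ioctls give userspace direct control over a vCPU's reset and run state; libvmmapi's vcpu_reset() and vm_set_run_state() are thin wrappers around VM_RESET_CPU and VM_SET_RUN_STATE. A hedged sketch against the raw VM device follows (the fd is assumed to be an open handle on the instance's vmm device; the structs and ioctl numbers are the ones added above):

#include <sys/ioctl.h>
#include <stdint.h>
#include <stdio.h>
/*
 * struct vm_vcpu_reset, struct vm_run_state, VRK_* and VRS_* come from the
 * headers patched in this changeset (vmm_dev.h / vmm.h).
 */

static int
init_then_sipi(int vmfd, int vcpu, uint8_t vector)
{
	struct vm_vcpu_reset vvr = { .vcpuid = vcpu, .kind = VRK_INIT };
	struct vm_run_state vrs = {
		.vcpuid = vcpu,
		.state = VRS_INIT | VRS_PEND_SIPI,
		.sipi_vector = vector,
	};

	/* INIT-style reset first, then leave the SIPI pending for the vCPU */
	if (ioctl(vmfd, VM_RESET_CPU, &vvr) != 0 ||
	    ioctl(vmfd, VM_SET_RUN_STATE, &vrs) != 0) {
		perror("vmm run-state ioctl");
		return (-1);
	}
	return (0);
}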