# HG changeset patch # User Dan McDonald # Date 1610373027 18000 # Node ID 3285827f3d5df2c7d43fc46c659fc31b2591bfe6 # Parent ce2b70e7aab017144f3b78439b09364ed5b3032b# Parent 3fa93b6d354f65472bc015aaa00246b1997c1626 [illumos-gate merge] commit 2606939d92dd3044a9851b2930ebf533c3c03892 13275 bhyve needs richer INIT/SIPI support commit 78f846c0ab4f41678386d3e1b49c16cc8db07a8b 13438 Update prototypes to 2021 commit ab2fdd80a620c2b88e5ac2c4247ab79880761b18 13409 cxgbe: replace zero sized array by flexible array commit 6dc7d05754d992040097e8ba8f85e77512125c60 8040 NFSv4 client: 3-way deadlock between nfs4_bio(), nfs4_do_delegreturn(), and nfs4_flush_pages() Conflicts: usr/src/cmd/bhyve/bhyverun.c diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/cmd/bhyve/bhyverun.c --- a/usr/src/cmd/bhyve/bhyverun.c Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/cmd/bhyve/bhyverun.c Mon Jan 11 08:50:27 2021 -0500 @@ -518,13 +518,14 @@ fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) #else void -fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip, - bool suspend) +fbsdrun_addcpu(struct vmctx *ctx, int newcpu, uint64_t rip, bool suspend) #endif { int error; +#ifdef __FreeBSD__ assert(fromcpu == BSP); +#endif /* * The 'newcpu' must be activated in the context of 'fromcpu'. If @@ -577,7 +578,7 @@ assert(entry->cmd == VEC_DEFAULT); - entry->cmd = VEC_COMPLETE_MMIO; + entry->cmd = VEC_FULFILL_MMIO; mmio->bytes = bytes; mmio->read = 1; mmio->gpa = gpa; @@ -592,7 +593,7 @@ assert(entry->cmd == VEC_DEFAULT); - entry->cmd = VEC_COMPLETE_MMIO; + entry->cmd = VEC_FULFILL_MMIO; mmio->bytes = bytes; mmio->read = 0; mmio->gpa = gpa; @@ -607,7 +608,7 @@ assert(entry->cmd == VEC_DEFAULT); - entry->cmd = VEC_COMPLETE_INOUT; + entry->cmd = VEC_FULFILL_INOUT; inout->bytes = bytes; inout->flags = INOUT_IN; inout->port = port; @@ -622,7 +623,7 @@ assert(entry->cmd == VEC_DEFAULT); - entry->cmd = VEC_COMPLETE_INOUT; + entry->cmd = VEC_FULFILL_INOUT; inout->bytes = bytes; inout->flags = 0; inout->port = port; @@ -731,6 +732,7 @@ return (VMEXIT_CONTINUE); } +#ifdef __FreeBSD__ static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { @@ -740,6 +742,18 @@ return (VMEXIT_CONTINUE); } +#else +static int +vmexit_run_state(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + /* + * Run-state transitions (INIT, SIPI, etc) are handled in-kernel, so an + * exit to userspace with that code is not expected. 
+ */ + fprintf(stderr, "unexpected run-state VM exit"); + return (VMEXIT_ABORT); +} +#endif /* __FreeBSD__ */ #ifdef __FreeBSD__ #define DEBUG_EPT_MISCONFIG @@ -1017,7 +1031,11 @@ [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, +#ifdef __FreeBSD__ [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, +#else + [VM_EXITCODE_RUN_STATE] = vmexit_run_state, +#endif [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, @@ -1547,14 +1565,21 @@ errx(EX_OSERR, "cap_enter() failed"); #endif +#ifdef __FreeBSD__ /* * Add CPU 0 */ -#ifdef __FreeBSD__ fbsdrun_addcpu(ctx, BSP, BSP, rip); #else - fbsdrun_addcpu(ctx, BSP, BSP, rip, suspend); + /* Set BSP to run (unlike the APs which wait for INIT) */ + error = vm_set_run_state(ctx, BSP, VRS_RUN, 0); + assert(error == 0); + fbsdrun_addcpu(ctx, BSP, rip, suspend); + /* Add subsequent CPUs, which will wait until INIT/SIPI-ed */ + for (uint_t i = 1; i < guest_ncpus; i++) { + spinup_halted_ap(ctx, i); + } mark_provisioned(); #endif diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/cmd/bhyve/bhyverun.h --- a/usr/src/cmd/bhyve/bhyverun.h Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/cmd/bhyve/bhyverun.h Mon Jan 11 08:50:27 2021 -0500 @@ -58,8 +58,7 @@ #ifdef __FreeBSD__ void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip); #else -void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip, - bool suspend); +void fbsdrun_addcpu(struct vmctx *ctx, int newcpu, uint64_t rip, bool suspend); #endif int fbsdrun_muxed(void); int fbsdrun_vmexit_on_hlt(void); diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/cmd/bhyve/spinup_ap.c --- a/usr/src/cmd/bhyve/spinup_ap.c Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/cmd/bhyve/spinup_ap.c Mon Jan 11 08:50:27 2021 -0500 @@ -56,6 +56,7 @@ #include "bhyverun.h" #include "spinup_ap.h" +#ifdef __FreeBSD__ static void spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip) { @@ -101,7 +102,6 @@ fbsdrun_set_capabilities(ctx, newcpu); -#ifdef __FreeBSD__ /* * Enable the 'unrestricted guest' mode for 'newcpu'. 
* @@ -110,17 +110,30 @@ */ error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); -#else - /* Unrestricted Guest is always enabled on illumos */ -#endif spinup_ap_realmode(ctx, newcpu, &rip); -#ifdef __FreeBSD__ fbsdrun_addcpu(ctx, vcpu, newcpu, rip); -#else - fbsdrun_addcpu(ctx, vcpu, newcpu, rip, false); -#endif return (newcpu); } +#else /* __FreeBSD__ */ +void +spinup_halted_ap(struct vmctx *ctx, int newcpu) +{ + int error; + + assert(newcpu != 0); + assert(newcpu < guest_ncpus); + + error = vcpu_reset(ctx, newcpu); + assert(error == 0); + + fbsdrun_set_capabilities(ctx, newcpu); + + error = vm_set_run_state(ctx, newcpu, VRS_HALT, 0); + assert(error == 0); + + fbsdrun_addcpu(ctx, newcpu, 0, false); +} +#endif /* __FreeBSD__ */ diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/cmd/bhyve/spinup_ap.h --- a/usr/src/cmd/bhyve/spinup_ap.h Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/cmd/bhyve/spinup_ap.h Mon Jan 11 08:50:27 2021 -0500 @@ -31,6 +31,10 @@ #ifndef _SPINUP_AP_H_ #define _SPINUP_AP_H_ +#ifdef __FreeBSD__ int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip); +#else +void spinup_halted_ap(struct vmctx *ctx, int newcpu); +#endif /* __FreeBSD__ */ #endif diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/lib/libvmmapi/common/mapfile-vers --- a/usr/src/lib/libvmmapi/common/mapfile-vers Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/lib/libvmmapi/common/mapfile-vers Mon Jan 11 08:50:27 2021 -0500 @@ -123,6 +123,8 @@ vm_unassign_pptdev; vm_pmtmr_set_location; vm_wrlock_cycle; + vm_get_run_state; + vm_set_run_state; local: *; diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/lib/libvmmapi/common/vmmapi.c --- a/usr/src/lib/libvmmapi/common/vmmapi.c Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/lib/libvmmapi/common/vmmapi.c Mon Jan 11 08:50:27 2021 -0500 @@ -1302,6 +1302,18 @@ return (error); } +#ifndef __FreeBSD__ +int +vcpu_reset(struct vmctx *vmctx, int vcpu) +{ + struct vm_vcpu_reset vvr; + + vvr.vcpuid = vcpu; + vvr.kind = VRK_RESET; + + return (ioctl(vmctx->fd, VM_RESET_CPU, &vvr)); +} +#else /* __FreeBSD__ */ /* * From Intel Vol 3a: * Table 9-1. 
IA-32 Processor States Following Power-up, Reset or INIT @@ -1458,6 +1470,7 @@ done: return (error); } +#endif /* __FreeBSD__ */ int vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) @@ -1839,6 +1852,39 @@ } return (0); } + +int +vm_get_run_state(struct vmctx *ctx, int vcpu, enum vcpu_run_state *state, + uint8_t *sipi_vector) +{ + struct vm_run_state data; + + data.vcpuid = vcpu; + if (ioctl(ctx->fd, VM_GET_RUN_STATE, &data) != 0) { + return (errno); + } + + *state = data.state; + *sipi_vector = data.sipi_vector; + return (0); +} + +int +vm_set_run_state(struct vmctx *ctx, int vcpu, enum vcpu_run_state state, + uint8_t sipi_vector) +{ + struct vm_run_state data; + + data.vcpuid = vcpu; + data.state = state; + data.sipi_vector = sipi_vector; + if (ioctl(ctx->fd, VM_SET_RUN_STATE, &data) != 0) { + return (errno); + } + + return (0); +} + #endif /* __FreeBSD__ */ #ifdef __FreeBSD__ diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/lib/libvmmapi/common/vmmapi.h --- a/usr/src/lib/libvmmapi/common/vmmapi.h Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/lib/libvmmapi/common/vmmapi.h Mon Jan 11 08:50:27 2021 -0500 @@ -304,6 +304,10 @@ /* illumos-specific APIs */ int vm_pmtmr_set_location(struct vmctx *ctx, uint16_t ioport); int vm_wrlock_cycle(struct vmctx *ctx); +int vm_get_run_state(struct vmctx *ctx, int vcpu, enum vcpu_run_state *state, + uint8_t *sipi_vector); +int vm_set_run_state(struct vmctx *ctx, int vcpu, enum vcpu_run_state state, + uint8_t sipi_vector); #endif /* __FreeBSD__ */ #ifdef __FreeBSD__ diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/README --- a/usr/src/prototypes/README Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/README Mon Jan 11 08:50:27 2021 -0500 @@ -17,5 +17,5 @@ */ /* - * Copyright 2020 + * Copyright 2021 */ diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.Makefile --- a/usr/src/prototypes/prototype.Makefile Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.Makefile Mon Jan 11 08:50:27 2021 -0500 @@ -10,6 +10,6 @@ # # -# Copyright 2020 +# Copyright 2021 # diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.c --- a/usr/src/prototypes/prototype.c Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.c Mon Jan 11 08:50:27 2021 -0500 @@ -10,7 +10,7 @@ */ /* - * Copyright 2020 + * Copyright 2021 */ /* diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.csh --- a/usr/src/prototypes/prototype.csh Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.csh Mon Jan 11 08:50:27 2021 -0500 @@ -12,6 +12,6 @@ # # -# Copyright 2020 +# Copyright 2021 # diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.h --- a/usr/src/prototypes/prototype.h Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.h Mon Jan 11 08:50:27 2021 -0500 @@ -10,7 +10,7 @@ */ /* - * Copyright 2020 + * Copyright 2021 */ #ifndef _PROTOTYPE_H diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.java --- a/usr/src/prototypes/prototype.java Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.java Mon Jan 11 08:50:27 2021 -0500 @@ -10,7 +10,7 @@ */ /* - * Copyright 2020 + * Copyright 2021 */ /* diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.ksh --- a/usr/src/prototypes/prototype.ksh Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.ksh Mon Jan 11 08:50:27 2021 -0500 @@ -12,6 +12,6 @@ # # -# Copyright 2020 +# Copyright 2021 # diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.man --- 
a/usr/src/prototypes/prototype.man Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.man Mon Jan 11 08:50:27 2021 -0500 @@ -9,5 +9,5 @@ .\" http://www.illumos.org/license/CDDL. .\" .\" -.\" Copyright 2020 +.\" Copyright 2021 .\" diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.man1 --- a/usr/src/prototypes/prototype.man1 Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.man1 Mon Jan 11 08:50:27 2021 -0500 @@ -9,7 +9,7 @@ .\" http://www.illumos.org/license/CDDL. .\" .\" -.\" Copyright 2020 +.\" Copyright 2021 .\" .Dd Month Day, Year .Dt COMMAND 1 diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.man3x --- a/usr/src/prototypes/prototype.man3x Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.man3x Mon Jan 11 08:50:27 2021 -0500 @@ -9,7 +9,7 @@ .\" http://www.illumos.org/license/CDDL. .\" .\" -.\" Copyright 2020 +.\" Copyright 2021 .\" .Dd Month Day, Year .Dt MANUALPAGE 3SECTION diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.man7d --- a/usr/src/prototypes/prototype.man7d Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.man7d Mon Jan 11 08:50:27 2021 -0500 @@ -9,7 +9,7 @@ .\" http://www.illumos.org/license/CDDL. .\" .\" -.\" Copyright 2020 +.\" Copyright 2021 .\" .Dd Month Day, Year .Dt DRIVERNAME 7D diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.man9e --- a/usr/src/prototypes/prototype.man9e Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.man9e Mon Jan 11 08:50:27 2021 -0500 @@ -9,7 +9,7 @@ .\" http://www.illumos.org/license/CDDL. .\" .\" -.\" Copyright 2020 +.\" Copyright 2021 .\" .Dd Month Day, Year .Dt ENTRYNAME 9E diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.man9f --- a/usr/src/prototypes/prototype.man9f Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.man9f Mon Jan 11 08:50:27 2021 -0500 @@ -9,7 +9,7 @@ .\" http://www.illumos.org/license/CDDL. 
.\" .\" -.\" Copyright 2020 +.\" Copyright 2021 .\" .Dd Month Day, Year .Dt FUNCNAME 9F diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.mapfile-vers --- a/usr/src/prototypes/prototype.mapfile-vers Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.mapfile-vers Mon Jan 11 08:50:27 2021 -0500 @@ -10,7 +10,7 @@ # # -# Copyright 2020 +# Copyright 2021 # # diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.pl --- a/usr/src/prototypes/prototype.pl Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.pl Mon Jan 11 08:50:27 2021 -0500 @@ -11,7 +11,7 @@ # # -# Copyright 2020 +# Copyright 2021 # # diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.py --- a/usr/src/prototypes/prototype.py Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.py Mon Jan 11 08:50:27 2021 -0500 @@ -11,7 +11,7 @@ # # -# Copyright 2020 +# Copyright 2021 # # diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.s --- a/usr/src/prototypes/prototype.s Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.s Mon Jan 11 08:50:27 2021 -0500 @@ -10,7 +10,7 @@ */ /* - * Copyright 2020 + * Copyright 2021 */ .file "prototype.s" diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/prototypes/prototype.sh --- a/usr/src/prototypes/prototype.sh Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/prototypes/prototype.sh Mon Jan 11 08:50:27 2021 -0500 @@ -12,6 +12,6 @@ # # -# Copyright 2020 +# Copyright 2021 # diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/common/fs/nfs/nfs4_client.c --- a/usr/src/uts/common/fs/nfs/nfs4_client.c Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/common/fs/nfs/nfs4_client.c Mon Jan 11 08:50:27 2021 -0500 @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. + * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All Rights Reserved */ @@ -464,33 +464,15 @@ rp = VTOR4(vp); mutex_enter(&rp->r_statelock); was_serial = (rp->r_serial == curthread); - if (rp->r_serial && !was_serial) { - klwp_t *lwp = ttolwp(curthread); - + if (rp->r_serial != NULL && !was_serial) { /* - * If we're the recovery thread, then purge current attrs - * and bail out to avoid potential deadlock between another - * thread caching attrs (r_serial thread), recov thread, - * and an async writer thread. + * Purge current attrs and bail out to avoid potential deadlock + * between another thread caching attrs (r_serial thread), this + * thread, and a thread trying to read or write pages. 
*/ - if (recov) { - PURGE_ATTRCACHE4_LOCKED(rp); - mutex_exit(&rp->r_statelock); - return; - } - - if (lwp != NULL) - lwp->lwp_nostop++; - while (rp->r_serial != NULL) { - if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { - mutex_exit(&rp->r_statelock); - if (lwp != NULL) - lwp->lwp_nostop--; - return; - } - } - if (lwp != NULL) - lwp->lwp_nostop--; + PURGE_ATTRCACHE4_LOCKED(rp); + mutex_exit(&rp->r_statelock); + return; } /* @@ -3067,7 +3049,7 @@ nfs4_oo_hash_bucket_t *bucketp; nfs4_debug_msg_t *msgp; int i; - servinfo4_t *svp; + servinfo4_t *svp; /* * Code introduced here should be carefully evaluated to make diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/common/fs/nfs/nfs4_vnops.c --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c Mon Jan 11 08:50:27 2021 -0500 @@ -2596,12 +2596,6 @@ osp->os_ref_count--; if (ep->error == 0) { - /* - * Avoid a deadlock with the r_serial thread waiting for - * os_sync_lock in nfs4_get_otw_cred_by_osp() which might be - * held by us. We will wait in nfs4_attr_cache() for the - * completion of the r_serial thread. - */ mutex_exit(&osp->os_sync_lock); *have_sync_lockp = 0; diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/common/io/cxgbe/common/t4_msg.h --- a/usr/src/uts/common/io/cxgbe/common/t4_msg.h Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/common/io/cxgbe/common/t4_msg.h Mon Jan 11 08:50:27 2021 -0500 @@ -2769,7 +2769,7 @@ __be64 addr0; #if !(defined C99_NOT_SUPPORTED) - struct ulptx_sge_pair sge[0]; + struct ulptx_sge_pair sge[]; #endif }; @@ -2785,7 +2785,7 @@ __be32 rsvd; #if !(defined C99_NOT_SUPPORTED) - struct ulptx_isge sge[0]; + struct ulptx_isge sge[]; #endif }; diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/i86pc/io/vmm/amd/svm.c --- a/usr/src/uts/i86pc/io/vmm/amd/svm.c Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c Mon Jan 11 08:50:27 2021 -0500 @@ -1917,8 +1917,7 @@ * Start vcpu with specified RIP. */ static int -svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap, - struct vm_eventinfo *evinfo) +svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap) { struct svm_regctx *gctx; struct svm_softc *svm_sc; @@ -2010,34 +2009,18 @@ inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic, inject_state); - if (vcpu_suspended(evinfo)) { + /* + * Check for vCPU bail-out conditions. This must be done after + * svm_inject_events() to detect a triple-fault condition. + */ + if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) { enable_gintr(); - vm_exit_suspended(vm, vcpu, state->rip); - break; - } - - if (vcpu_runblocked(evinfo)) { - enable_gintr(); - vm_exit_runblock(vm, vcpu, state->rip); break; } - if (vcpu_reqidle(evinfo)) { + if (vcpu_run_state_pending(vm, vcpu)) { enable_gintr(); - vm_exit_reqidle(vm, vcpu, state->rip); - break; - } - - /* We are asked to give the cpu by scheduler. 
*/ - if (vcpu_should_yield(vm, vcpu)) { - enable_gintr(); - vm_exit_astpending(vm, vcpu, state->rip); - break; - } - - if (vcpu_debugged(vm, vcpu)) { - enable_gintr(); - vm_exit_debug(vm, vcpu, state->rip); + vm_exit_run_state(vm, vcpu, state->rip); break; } @@ -2303,7 +2286,7 @@ } static int -svm_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc) { struct vmcb *vmcb; struct svm_softc *sc; diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/i86pc/io/vmm/intel/vmx.c --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c Mon Jan 11 08:50:27 2021 -0500 @@ -2738,8 +2738,7 @@ } static int -vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap, - struct vm_eventinfo *evinfo) +vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap) { int rc, handled, launched; struct vmx *vmx; @@ -2834,39 +2833,17 @@ } /* - * Check for vcpu suspension after injecting events because - * vmx_inject_events() can suspend the vcpu due to a - * triple fault. + * Check for vCPU bail-out conditions. This must be done after + * vmx_inject_events() to detect a triple-fault condition. */ - if (vcpu_suspended(evinfo)) { + if (vcpu_entry_bailout_checks(vmx->vm, vcpu, rip)) { enable_intr(); - vm_exit_suspended(vmx->vm, vcpu, rip); - break; - } - - if (vcpu_runblocked(evinfo)) { - enable_intr(); - vm_exit_runblock(vmx->vm, vcpu, rip); break; } - if (vcpu_reqidle(evinfo)) { - enable_intr(); - vm_exit_reqidle(vmx->vm, vcpu, rip); - break; - } - - if (vcpu_should_yield(vm, vcpu)) { + if (vcpu_run_state_pending(vm, vcpu)) { enable_intr(); - vm_exit_astpending(vmx->vm, vcpu, rip); - vmx_astpending_trace(vmx, vcpu, rip); - handled = HANDLED; - break; - } - - if (vcpu_debugged(vm, vcpu)) { - enable_intr(); - vm_exit_debug(vmx->vm, vcpu, rip); + vm_exit_run_state(vmx->vm, vcpu, rip); break; } @@ -2985,19 +2962,12 @@ rip = vmexit->rip; } while (handled); - /* - * If a VM exit has been handled then the exitcode must be BOGUS - * If a VM exit is not handled then the exitcode must not be BOGUS - */ - if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || - (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { - panic("Mismatch between handled (%d) and exitcode (%d)", - handled, vmexit->exitcode); + /* If a VM exit has been handled then the exitcode must be BOGUS */ + if (handled && vmexit->exitcode != VM_EXITCODE_BOGUS) { + panic("Non-BOGUS exitcode (%d) unexpected for handled VM exit", + vmexit->exitcode); } - if (!handled) - vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); - VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); @@ -3261,7 +3231,7 @@ } static int -vmx_setdesc(void *arg, int vcpu, int seg, struct seg_desc *desc) +vmx_setdesc(void *arg, int vcpu, int seg, const struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/i86pc/io/vmm/io/vlapic.c --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.c Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c Mon Jan 11 08:50:27 2021 -0500 @@ -992,13 +992,10 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic) { int i; - bool phys; cpuset_t dmask; uint64_t icrval; - uint32_t dest, vec, mode; - struct vlapic *vlapic2; + uint32_t dest, vec, mode, dsh; struct LAPIC *lapic; - uint16_t maxcpus; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; @@ -1010,93 +1007,79 @@ dest = icrval >> (32 + 24); vec = icrval & APIC_VECTOR_MASK; mode = icrval & 
APIC_DELMODE_MASK; + dsh = icrval & APIC_DEST_MASK; if (mode == APIC_DELMODE_FIXED && vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); - VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); + return (0); + } + if (mode == APIC_DELMODE_INIT && + (icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) { + /* No work required to deassert INIT */ return (0); } + if ((mode == APIC_DELMODE_STARTUP || mode == APIC_DELMODE_INIT) && + !(dsh == APIC_DEST_DESTFLD || dsh == APIC_DEST_ALLESELF)) { + /* + * While Intel makes no mention of restrictions for destination + * shorthand when sending INIT or SIPI, AMD requires either a + * specific destination or all-excluding self. Common use seems + * to be restricted to those two cases. + */ + return (-1); + } - VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); + switch (dsh) { + case APIC_DEST_DESTFLD: + vlapic_calcdest(vlapic->vm, &dmask, dest, + (icrval & APIC_DESTMODE_LOG) == 0, false, x2apic(vlapic)); + break; + case APIC_DEST_SELF: + CPU_SETOF(vlapic->vcpuid, &dmask); + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm); + CPU_CLR(vlapic->vcpuid, &dmask); + break; + default: + /* + * All possible delivery notations are covered above. + * We should never end up here. + */ + panic("unknown delivery shorthand: %x", dsh); + } - if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { - switch (icrval & APIC_DEST_MASK) { - case APIC_DEST_DESTFLD: - phys = ((icrval & APIC_DESTMODE_LOG) == 0); - vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, - x2apic(vlapic)); + while ((i = CPU_FFS(&dmask)) != 0) { + i--; + CPU_CLR(i, &dmask); + switch (mode) { + case APIC_DELMODE_FIXED: + lapic_intr_edge(vlapic->vm, i, vec); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, + VLAPIC_IPI_SEND, 1); + vmm_stat_incr(vlapic->vm, i, + VLAPIC_IPI_RECV, 1); break; - case APIC_DEST_SELF: - CPU_SETOF(vlapic->vcpuid, &dmask); + case APIC_DELMODE_NMI: + vm_inject_nmi(vlapic->vm, i); break; - case APIC_DEST_ALLISELF: - dmask = vm_active_cpus(vlapic->vm); + case APIC_DELMODE_INIT: + (void) vm_inject_init(vlapic->vm, i); break; - case APIC_DEST_ALLESELF: - dmask = vm_active_cpus(vlapic->vm); - CPU_CLR(vlapic->vcpuid, &dmask); + case APIC_DELMODE_STARTUP: + (void) vm_inject_sipi(vlapic->vm, i, vec); break; + case APIC_DELMODE_LOWPRIO: + case APIC_DELMODE_SMI: default: - CPU_ZERO(&dmask); /* satisfy gcc */ + /* Unhandled IPI modes (for now) */ break; } - - while ((i = CPU_FFS(&dmask)) != 0) { - i--; - CPU_CLR(i, &dmask); - if (mode == APIC_DELMODE_FIXED) { - lapic_intr_edge(vlapic->vm, i, vec); - vmm_stat_incr(vlapic->vm, vlapic->vcpuid, - VLAPIC_IPI_SEND, 1); - vmm_stat_incr(vlapic->vm, i, - VLAPIC_IPI_RECV, 1); - VLAPIC_CTR2(vlapic, "vlapic sending ipi %d " - "to vcpuid %d", vec, i); - } else { - vm_inject_nmi(vlapic->vm, i); - VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi " - "to vcpuid %d", i); - } - } - - return (0); /* handled completely in the kernel */ } - - maxcpus = vm_get_maxcpus(vlapic->vm); - if (mode == APIC_DELMODE_INIT) { - if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) - return (0); - - if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { - vlapic2 = vm_lapic(vlapic->vm, dest); - - /* move from INIT to waiting-for-SIPI state */ - if (vlapic2->boot_state == BS_INIT) { - vlapic2->boot_state = BS_SIPI; - } - - return (0); - } - } - - if (mode == APIC_DELMODE_STARTUP) { - if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { 
- vlapic2 = vm_lapic(vlapic->vm, dest); - - /* - * Ignore SIPIs in any state other than wait-for-SIPI - */ - if (vlapic2->boot_state != BS_SIPI) - return (0); - - vlapic2->boot_state = BS_RUNNING; - vm_req_spinup_ap(vlapic->vm, dest, vec << PAGE_SHIFT); - return (0); - } - } - - /* Return to userland. */ - return (-1); + return (0); } void @@ -1450,30 +1433,72 @@ return (retval); } -static void +void vlapic_reset(struct vlapic *vlapic) { - struct LAPIC *lapic; + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *isrptr, *tmrptr, *irrptr; - lapic = vlapic->apic_page; - bzero(lapic, sizeof (struct LAPIC)); + /* Reset any timer-related state first */ + VLAPIC_TIMER_LOCK(vlapic); + callout_stop(&vlapic->callout); + lapic->icr_timer = 0; + lapic->ccr_timer = 0; + VLAPIC_TIMER_UNLOCK(vlapic); + lapic->dcr_timer = 0; + vlapic_dcr_write_handler(vlapic); + + /* + * Sync any APIC acceleration (APICv/AVIC) state into the APIC page so + * it is not leftover after the reset. This is performed after the APIC + * timer has been stopped, in case it happened to fire just prior to + * being deactivated. + */ + if (vlapic->ops.sync_state) { + (*vlapic->ops.sync_state)(vlapic); + } lapic->id = vlapic_get_id(vlapic); lapic->version = VLAPIC_VERSION; lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); + + lapic->tpr = 0; + lapic->apr = 0; + lapic->ppr = 0; + +#ifdef __ISRVEC_DEBUG + /* With the PPR cleared, the isrvec tracking should be reset too */ + vlapic->isrvec_stk_top = 0; +#endif + + lapic->eoi = 0; + lapic->ldr = 0; lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; - vlapic_mask_lvts(vlapic); + vlapic->svr_last = lapic->svr; - lapic->dcr_timer = 0; - vlapic_dcr_write_handler(vlapic); + isrptr = &lapic->isr0; + tmrptr = &lapic->tmr0; + irrptr = &lapic->irr0; + for (uint_t i = 0; i < 8; i++) { + atomic_store_rel_int(&isrptr[i * 4], 0); + atomic_store_rel_int(&tmrptr[i * 4], 0); + atomic_store_rel_int(&irrptr[i * 4], 0); + } - if (vlapic->vcpuid == 0) - vlapic->boot_state = BS_RUNNING; /* BSP */ - else - vlapic->boot_state = BS_INIT; /* AP */ + lapic->esr = 0; + vlapic->esr_pending = 0; + lapic->icr_lo = 0; + lapic->icr_hi = 0; - vlapic->svr_last = lapic->svr; + lapic->lvt_cmci = 0; + lapic->lvt_timer = 0; + lapic->lvt_thermal = 0; + lapic->lvt_pcint = 0; + lapic->lvt_lint0 = 0; + lapic->lvt_lint1 = 0; + lapic->lvt_error = 0; + vlapic_mask_lvts(vlapic); } void diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/i86pc/io/vmm/io/vlapic.h --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.h Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h Mon Jan 11 08:50:27 2021 -0500 @@ -30,6 +30,7 @@ /* * Copyright 2018 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #ifndef _VLAPIC_H_ @@ -38,6 +39,8 @@ struct vm; enum x2apic_state; +void vlapic_reset(struct vlapic *vlapic); + int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t data); int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h --- a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h Mon Jan 11 08:50:27 2021 -0500 @@ -137,12 +137,6 @@ VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ } while (0) -enum boot_state { - BS_INIT, - BS_SIPI, - BS_RUNNING -}; - /* * 16 priority levels with at most one vector injected per level. 
*/ @@ -182,7 +176,6 @@ struct mtx timer_mtx; uint64_t msr_apicbase; - enum boot_state boot_state; /* * Copies of some registers in the virtual APIC page. We do this for diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h --- a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h Mon Jan 11 08:50:27 2021 -0500 @@ -64,18 +64,12 @@ struct vm_guest_paging; struct pmap; -struct vm_eventinfo { - uint_t *rptr; /* runblock cookie */ - int *sptr; /* suspend cookie */ - int *iptr; /* reqidle cookie */ -}; - typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vmi, int vcpu, uint64_t rip, - struct pmap *pmap, struct vm_eventinfo *info); + struct pmap *pmap); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); @@ -84,7 +78,7 @@ typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, - struct seg_desc *desc); + const struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); typedef struct vmspace *(*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); @@ -169,9 +163,13 @@ int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, - struct seg_desc *ret_desc); + struct seg_desc *ret_desc); int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, - struct seg_desc *desc); + const struct seg_desc *desc); +int vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, + uint8_t *sipi_vec); +int vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, + uint8_t sipi_vec); int vm_run(struct vm *vm, int vcpuid, const struct vm_entry *); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); @@ -180,6 +178,8 @@ int vm_inject_extint(struct vm *vm, int vcpu); int vm_extint_pending(struct vm *vm, int vcpuid); void vm_extint_clear(struct vm *vm, int vcpuid); +int vm_inject_init(struct vm *vm, int vcpuid); +int vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vec); struct vlapic *vm_lapic(struct vm *vm, int cpu); struct vioapic *vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); @@ -195,14 +195,13 @@ struct vie *vm_vie_ctx(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); -void vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip); int vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval, int rsize); int vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval, int wsize); -void vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip); #ifdef _SYS__CPUSET_H_ cpuset_t vm_active_cpus(struct vm *vm); @@ -210,28 +209,9 @@ cpuset_t vm_suspended_cpus(struct vm *vm); #endif /* _SYS__CPUSET_H_ */ -static __inline int -vcpu_runblocked(struct 
vm_eventinfo *info) -{ - - return (*info->rptr != 0); -} - -static __inline int -vcpu_suspended(struct vm_eventinfo *info) -{ - - return (*info->sptr); -} - -static __inline int -vcpu_reqidle(struct vm_eventinfo *info) -{ - - return (*info->iptr); -} - -int vcpu_debugged(struct vm *vm, int vcpuid); +bool vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip); +bool vcpu_run_state_pending(struct vm *vm, int vcpuid); +int vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only); /* * Return true if device indicated by bus/slot/func is supposed to be a diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/i86pc/io/vmm/vmm.c --- a/usr/src/uts/i86pc/io/vmm/vmm.c Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/i86pc/io/vmm/vmm.c Mon Jan 11 08:50:27 2021 -0500 @@ -109,17 +109,15 @@ * (x) initialized before use */ struct vcpu { - struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ + /* (o) protects state, run_state, hostcpu, sipi_vector */ + struct mtx mtx; + enum vcpu_state state; /* (o) vcpu state */ -#ifndef __FreeBSD__ + enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */ kcondvar_t vcpu_cv; /* (o) cpu waiter cv */ kcondvar_t state_cv; /* (o) IDLE-transition cv */ -#endif /* __FreeBSD__ */ int hostcpu; /* (o) vcpu's current host cpu */ -#ifndef __FreeBSD__ int lastloccpu; /* (o) last host cpu localized to */ -#endif - uint_t runblock; /* (i) block vcpu from run state */ int reqidle; /* (i) request vcpu to idle */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ @@ -130,6 +128,7 @@ int exc_vector; /* (x) exception collateral */ int exc_errcode_valid; uint32_t exc_errcode; + uint8_t sipi_vector; /* (i) SIPI vector */ struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ @@ -200,15 +199,6 @@ uint16_t maxcpus; /* (o) max pluggable cpus */ struct ioport_config ioports; /* (o) ioport handling */ - - bool sipi_req; /* (i) SIPI requested */ - int sipi_req_vcpu; /* (i) SIPI destination */ - uint64_t sipi_req_rip; /* (i) SIPI start %rip */ - - /* Miscellaneous VM-wide statistics and counters */ - struct vm_wide_stats { - uint64_t sipi_supersede; - } stats; }; static int vmm_initialized; @@ -249,8 +239,8 @@ #define VMM_RESUME() ((*ops->resume)()) #define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap)) -#define VMRUN(vmi, vcpu, rip, pmap, evinfo) \ - ((*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo)) +#define VMRUN(vmi, vcpu, rip, pmap) \ + ((*ops->vmrun)(vmi, vcpu, rip, pmap)) #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi)) #define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max)) #define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace)) @@ -292,6 +282,8 @@ static void vm_free_memmap(struct vm *vm, int ident); static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t); +static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid); +static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector); #ifndef __FreeBSD__ static void vm_clear_memseg(struct vm *, int); @@ -370,9 +362,9 @@ bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo)); } + vcpu->run_state = VRS_HALT; vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); - vcpu->runblock = 0; vcpu->reqidle = 0; vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; @@ -1233,7 +1225,7 @@ } int -vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) 
+vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); @@ -1244,6 +1236,49 @@ return (VMSETDESC(vm->cookie, vcpu, reg, desc)); } +int +vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) { + return (EINVAL); + } + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + *state = vcpu->run_state; + *sipi_vec = vcpu->sipi_vector; + vcpu_unlock(vcpu); + + return (0); +} + +int +vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) { + return (EINVAL); + } + if (!VRS_IS_VALID(state)) { + return (EINVAL); + } + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu->run_state = state; + vcpu->sipi_vector = sipi_vec; + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + vcpu_unlock(vcpu); + + return (0); +} + + static void restore_guest_fpustate(struct vcpu *vcpu) { @@ -1354,16 +1389,6 @@ break; } - if (newstate == VCPU_RUNNING) { - while (vcpu->runblock != 0) { -#ifdef __FreeBSD__ - msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); -#else - cv_wait(&vcpu->state_cv, &vcpu->mtx.m); -#endif - } - } - if (error) return (EBUSY); @@ -1376,8 +1401,7 @@ else vcpu->hostcpu = NOCPU; - if (newstate == VCPU_IDLE || - (newstate == VCPU_FROZEN && vcpu->runblock != 0)) { + if (newstate == VCPU_IDLE) { #ifdef __FreeBSD__ wakeup(&vcpu->state); #else @@ -1413,12 +1437,8 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) { struct vcpu *vcpu; -#ifdef __FreeBSD__ - const char *wmesg; -#else - const char *wmesg __unused; -#endif int t, vcpu_halted, vm_halted; + bool userspace_exit = false; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); @@ -1429,17 +1449,12 @@ vcpu_lock(vcpu); while (1) { /* - * Do a final check for pending NMI or interrupts before - * really putting this thread to sleep. Also check for - * software events that would cause this vcpu to wakeup. - * - * These interrupts/events could have happened after the - * vcpu returned from VMRUN() and before it acquired the - * vcpu lock above. + * Do a final check for pending interrupts (including NMI and + * INIT) before putting this thread to sleep. */ - if (vm->suspend || vcpu->reqidle) + if (vm_nmi_pending(vm, vcpuid)) break; - if (vm_nmi_pending(vm, vcpuid)) + if (vcpu_run_state_pending(vm, vcpuid)) break; if (!intr_disabled) { if (vm_extint_pending(vm, vcpuid) || @@ -1448,12 +1463,15 @@ } } - /* Don't go to sleep if the vcpu thread needs to yield */ - if (vcpu_should_yield(vm, vcpuid)) + /* + * Also check for software events which would cause a wake-up. + * This will set the appropriate exitcode directly, rather than + * requiring a trip through VM_RUN(). + */ + if (vcpu_sleep_bailout_checks(vm, vcpuid)) { + userspace_exit = true; break; - - if (vcpu_debugged(vm, vcpuid)) - break; + } /* * Some Linux guests implement "halt" by having all vcpus @@ -1462,8 +1480,6 @@ * vcpus enter the halted state the virtual machine is halted. 
*/ if (intr_disabled) { - wmesg = "vmhalt"; - VCPU_CTR0(vm, vcpuid, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); @@ -1472,25 +1488,11 @@ vm_halted = 1; break; } - } else { - wmesg = "vmidle"; } t = ticks; vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); -#ifdef __FreeBSD__ - /* - * XXX msleep_spin() cannot be interrupted by signals so - * wake up periodically to check pending signals. - */ - msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); -#else - /* - * Fortunately, cv_wait_sig can be interrupted by signals, so - * there is no need to periodically wake up. - */ (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m); -#endif vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } @@ -1503,7 +1505,7 @@ if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); - return (0); + return (userspace_exit ? -1 : 0); } static int @@ -1832,6 +1834,62 @@ return (-1); } +static int +vm_handle_run_state(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + bool handled = false; + + vcpu_lock(vcpu); + while (1) { + if ((vcpu->run_state & VRS_PEND_INIT) != 0) { + vcpu_unlock(vcpu); + VERIFY0(vcpu_arch_reset(vm, vcpuid, true)); + vcpu_lock(vcpu); + + vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT); + vcpu->run_state |= VRS_INIT; + } + + if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) == + (VRS_INIT | VRS_PEND_SIPI)) { + const uint8_t vector = vcpu->sipi_vector; + + vcpu_unlock(vcpu); + VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector)); + vcpu_lock(vcpu); + + vcpu->run_state &= ~VRS_PEND_SIPI; + vcpu->run_state |= VRS_RUN; + } + + /* + * If the vCPU is now in the running state, there is no need to + * wait for anything prior to re-entry. + */ + if ((vcpu->run_state & VRS_RUN) != 0) { + handled = true; + break; + } + + /* + * Also check for software events which would cause a wake-up. + * This will set the appropriate exitcode directly, rather than + * requiring a trip through VM_RUN(). + */ + if (vcpu_sleep_bailout_checks(vm, vcpuid)) { + break; + } + + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); + (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m); + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + } + vcpu_unlock(vcpu); + + return (handled ? 0 : -1); +} + #ifndef __FreeBSD__ static int vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) @@ -1850,18 +1908,6 @@ } #endif /* __FreeBSD__ */ -void -vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip) -{ - if (vm->sipi_req) { - /* This should never occur if userspace is doing its job. 
*/ - vm->stats.sipi_supersede++; - } - vm->sipi_req = true; - vm->sipi_req_vcpu = req_vcpuid; - vm->sipi_req_rip = req_rip; -} - int vm_suspend(struct vm *vm, enum vm_suspend_how how) { @@ -1890,66 +1936,17 @@ } void -vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) -{ - struct vm_exit *vmexit; - - KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, - ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); - - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->rip = rip; - vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_SUSPENDED; - vmexit->u.suspended.how = vm->suspend; -} - -void -vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip) +vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_DEBUG; -} - -void -vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip) -{ - struct vm_exit *vmexit; - - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->rip = rip; - vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_RUNBLOCK; - vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1); + vmexit->exitcode = VM_EXITCODE_RUN_STATE; + vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1); } -void -vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip) -{ - struct vm_exit *vmexit; - - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->rip = rip; - vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_REQIDLE; - vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); -} - -void -vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) -{ - struct vm_exit *vmexit; - - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->rip = rip; - vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_BOGUS; - vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); -} #ifndef __FreeBSD__ /* @@ -2072,7 +2069,7 @@ case VEC_DISCARD_INSTR: vie_reset(vie); return (0); - case VEC_COMPLETE_MMIO: + case VEC_FULFILL_MMIO: err = vie_fulfill_mmio(vie, &entry->u.mmio); if (err == 0) { err = vie_emulate_mmio(vie, vm, vcpuid); @@ -2091,7 +2088,7 @@ } } break; - case VEC_COMPLETE_INOUT: + case VEC_FULFILL_INOUT: err = vie_fulfill_inout(vie, &entry->u.inout); if (err == 0) { err = vie_emulate_inout(vie, vm, vcpuid); @@ -2132,25 +2129,12 @@ return (-1); } - if (vcpuid == 0 && vm->sipi_req) { - /* The boot vCPU has sent a SIPI to one of the other CPUs */ - vme->exitcode = VM_EXITCODE_SPINUP_AP; - vme->u.spinup_ap.vcpu = vm->sipi_req_vcpu; - vme->u.spinup_ap.rip = vm->sipi_req_rip; - - vm->sipi_req = false; - vm->sipi_req_vcpu = 0; - vm->sipi_req_rip = 0; - return (-1); - } - return (0); } int vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) { - struct vm_eventinfo evinfo; int error; struct vcpu *vcpu; #ifdef __FreeBSD__ @@ -2177,9 +2161,6 @@ pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; - evinfo.rptr = &vcpu->runblock; - evinfo.sptr = &vm->suspend; - evinfo.iptr = &vcpu->reqidle; #ifndef __FreeBSD__ vtc.vtc_vm = vm; @@ -2242,7 +2223,7 @@ #endif vcpu_require_state(vm, vcpuid, VCPU_RUNNING); - error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); #ifdef __FreeBSD__ @@ -2273,6 +2254,9 @@ case VM_EXITCODE_REQIDLE: error = vm_handle_reqidle(vm, vcpuid); break; + case VM_EXITCODE_RUN_STATE: + error = vm_handle_run_state(vm, vcpuid); + break; case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vm, vcpuid); break; @@ -2280,8 +2264,6 @@ 
vioapic_process_eoi(vm, vcpuid, vme->u.ioapic_eoi.vector); break; - case VM_EXITCODE_RUNBLOCK: - break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled); @@ -2792,6 +2774,196 @@ } int +vm_inject_init(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + vcpu_lock(vcpu); + vcpu->run_state |= VRS_PEND_INIT; + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + vcpu_unlock(vcpu); + return (0); +} + +int +vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + vcpu_lock(vcpu); + vcpu->run_state |= VRS_PEND_SIPI; + vcpu->sipi_vector = vector; + /* SIPI is only actionable if the CPU is waiting in INIT state */ + if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + } + vcpu_unlock(vcpu); + return (0); +} + +bool +vcpu_run_state_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); + vcpu = &vm->vcpu[vcpuid]; + + /* Of interest: vCPU not in running state or with pending INIT */ + return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); +} + +int +vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) +{ + struct seg_desc desc; + const enum vm_reg_name clear_regs[] = { + VM_REG_GUEST_CR2, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_DR0, + VM_REG_GUEST_DR1, + VM_REG_GUEST_DR2, + VM_REG_GUEST_DR3, + VM_REG_GUEST_EFER, + }; + const enum vm_reg_name data_segs[] = { + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_ES, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + }; + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + for (uint_t i = 0; i < nitems(clear_regs); i++) { + VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0)); + } + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CR0, 0x60000010)); + + /* + * The prescribed contents of %rdx differ slightly between the Intel and + * AMD architectural definitions. The former expects the Extended Model + * in bits 16-19 where the latter expects all the Family, Model, and + * Stepping be there. Common boot ROMs appear to disregard this + * anyways, so we stick with a compromise value similar to what is + * spelled out in the Intel SDM. 
+ */ + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600)); + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400)); + + /* CS: Present, R/W, Accessed */ + desc.access = 0x0093; + desc.base = 0xffff0000; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000)); + + /* SS, DS, ES, FS, GS: Present, R/W, Accessed */ + desc.access = 0x0093; + desc.base = 0; + desc.limit = 0xffff; + for (uint_t i = 0; i < nitems(data_segs); i++) { + VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc)); + VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0)); + } + + /* GDTR, IDTR */ + desc.base = 0; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc)); + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc)); + + /* LDTR: Present, LDT */ + desc.access = 0x0082; + desc.base = 0; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0)); + + /* TR: Present, 32-bit TSS */ + desc.access = 0x008b; + desc.base = 0; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0)); + + vlapic_reset(vm_lapic(vm, vcpuid)); + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0)); + + vcpu->exitintinfo = 0; + vcpu->exception_pending = 0; + vcpu->nmi_pending = 0; + vcpu->extint_pending = 0; + + /* + * A CPU reset caused by power-on or system reset clears more state than + * one which is trigged from an INIT IPI. + */ + if (!init_only) { + vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; + fpu_save_area_reset(vcpu->guestfpu); + + /* XXX: clear MSRs and other pieces */ + } + + return (0); +} + +static int +vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) +{ + struct seg_desc desc; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + /* CS: Present, R/W, Accessed */ + desc.access = 0x0093; + desc.base = (uint64_t)vector << 12; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, + (uint64_t)vector << 8)); + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); + + return (0); +} + +int vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) { if (vcpu < 0 || vcpu >= vm->maxcpus) @@ -2894,7 +3066,7 @@ struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) - panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + panic("vcpu_set_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -2912,7 +3084,7 @@ enum vcpu_state state; if (vcpuid < 0 || vcpuid >= vm->maxcpus) - panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + panic("vcpu_get_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -2925,54 +3097,6 @@ return (state); } -void -vcpu_block_run(struct vm *vm, int vcpuid) -{ - struct vcpu *vcpu; - - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) - panic("vcpu_block_run: invalid vcpuid %d", vcpuid); - - vcpu = &vm->vcpu[vcpuid]; - - vcpu_lock(vcpu); - vcpu->runblock++; - if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) { - vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); - } - while (vcpu->state == VCPU_RUNNING) { -#ifdef __FreeBSD__ - msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); -#else - cv_wait(&vcpu->state_cv, &vcpu->mtx.m); -#endif - } - vcpu_unlock(vcpu); 
-} - -void -vcpu_unblock_run(struct vm *vm, int vcpuid) -{ - struct vcpu *vcpu; - - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) - panic("vcpu_block_run: invalid vcpuid %d", vcpuid); - - vcpu = &vm->vcpu[vcpuid]; - - vcpu_lock(vcpu); - KASSERT(vcpu->runblock != 0, ("expected non-zero runblock")); - vcpu->runblock--; - if (vcpu->runblock == 0) { -#ifdef __FreeBSD__ - wakeup(&vcpu->state); -#else - cv_broadcast(&vcpu->state_cv); -#endif - } - vcpu_unlock(vcpu); -} - #ifndef __FreeBSD__ uint64_t vcpu_tsc_offset(struct vm *vm, int vcpuid) @@ -3038,11 +3162,93 @@ return (0); } -int -vcpu_debugged(struct vm *vm, int vcpuid) +static bool +vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry, + uint64_t entry_rip) { - - return (CPU_ISSET(vcpuid, &vm->debug_cpus)); + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + struct vm_exit *vme = &vcpu->exitinfo; + bool bail = false; + + ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); + + if (vm->suspend) { + if (on_entry) { + VERIFY(vm->suspend > VM_SUSPEND_NONE && + vm->suspend < VM_SUSPEND_LAST); + + vme->exitcode = VM_EXITCODE_SUSPENDED; + vme->u.suspended.how = vm->suspend; + } else { + /* + * Handling VM suspend is complicated, so if that + * condition is detected outside of VM-entry itself, + * just emit a BOGUS exitcode so we take a lap to pick + * up the event during an entry and are directed into + * the vm_handle_suspend() logic. + */ + vme->exitcode = VM_EXITCODE_BOGUS; + } + bail = true; + } + if (vcpu->reqidle) { + vme->exitcode = VM_EXITCODE_REQIDLE; + vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); + + if (!on_entry) { + /* + * A reqidle request detected outside of VM-entry can be + * handled directly by clearing the request (and taking + * a lap to userspace). + */ + vcpu_assert_locked(vcpu); + vcpu->reqidle = 0; + } + bail = true; + } + if (vcpu_should_yield(vm, vcpuid)) { + vme->exitcode = VM_EXITCODE_BOGUS; + vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); + bail = true; + } + if (CPU_ISSET(vcpuid, &vm->debug_cpus)) { + vme->exitcode = VM_EXITCODE_DEBUG; + bail = true; + } + + if (bail) { + if (on_entry) { + /* + * If bailing out during VM-entry, the current %rip must + * be recorded in the exitinfo. + */ + vme->rip = entry_rip; + } + vme->inst_length = 0; + } + return (bail); +} + +static bool +vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid) +{ + /* + * Bail-out check done prior to sleeping (in vCPU contexts like HLT or + * wait-for-SIPI) expect that %rip is already populated in the vm_exit + * structure, and we would only modify the exitcode. + */ + return (vcpu_bailout_checks(vm, vcpuid, false, 0)); +} + +bool +vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip) +{ + /* + * Bail-out checks done as part of VM entry require an updated %rip to + * populate the vm_exit struct if any of the conditions of interest are + * matched in the check. + */ + return (vcpu_bailout_checks(vm, vcpuid, true, rip)); } cpuset_t diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c Mon Jan 11 08:50:27 2021 -0500 @@ -443,6 +443,9 @@ case VM_RESTART_INSTRUCTION: case VM_SET_KERNEMU_DEV: case VM_GET_KERNEMU_DEV: + case VM_RESET_CPU: + case VM_GET_RUN_STATE: + case VM_SET_RUN_STATE: /* * Copy in the ID of the vCPU chosen for this operation. 
* Since a nefarious caller could update their struct between @@ -989,6 +992,45 @@ } break; } + case VM_RESET_CPU: { + struct vm_vcpu_reset vvr; + + if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { + error = EFAULT; + break; + } + if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { + error = EINVAL; + } + + error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); + break; + } + case VM_GET_RUN_STATE: { + struct vm_run_state vrs; + + bzero(&vrs, sizeof (vrs)); + error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, + &vrs.sipi_vector); + if (error == 0) { + if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { + error = EFAULT; + break; + } + } + break; + } + case VM_SET_RUN_STATE: { + struct vm_run_state vrs; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; + } + error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, + vrs.sipi_vector); + break; + } case VM_SET_KERNEMU_DEV: case VM_GET_KERNEMU_DEV: { diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/i86pc/io/vmm/vmm_stat.c --- a/usr/src/uts/i86pc/io/vmm/vmm_stat.c Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.c Mon Jan 11 08:50:27 2021 -0500 @@ -167,6 +167,5 @@ VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit"); -VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); -VMM_STAT(VMEXIT_RUNBLOCK, "number of times runblock at exit"); VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); +VMM_STAT(VMEXIT_RUN_STATE, "number of vm exits due to run_state change"); diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/i86pc/io/vmm/vmm_stat.h --- a/usr/src/uts/i86pc/io/vmm/vmm_stat.h Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.h Mon Jan 11 08:50:27 2021 -0500 @@ -165,8 +165,7 @@ VMM_STAT_DECLARE(VMEXIT_MMIO_EMUL); VMM_STAT_DECLARE(VMEXIT_UNKNOWN); VMM_STAT_DECLARE(VMEXIT_ASTPENDING); -VMM_STAT_DECLARE(VMEXIT_USERSPACE); -VMM_STAT_DECLARE(VMEXIT_RUNBLOCK); VMM_STAT_DECLARE(VMEXIT_EXCEPTION); VMM_STAT_DECLARE(VMEXIT_REQIDLE); +VMM_STAT_DECLARE(VMEXIT_RUN_STATE); #endif diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/i86pc/sys/vmm.h --- a/usr/src/uts/i86pc/sys/vmm.h Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/i86pc/sys/vmm.h Mon Jan 11 08:50:27 2021 -0500 @@ -217,9 +217,9 @@ VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, - VM_EXITCODE_SPINUP_AP, + VM_EXITCODE_RUN_STATE, VM_EXITCODE_MMIO_EMUL, - VM_EXITCODE_RUNBLOCK, + VM_EXITCODE_DEPRECATED, /* formerly RUNBLOCK */ VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_MMIO, @@ -287,6 +287,18 @@ struct vm_guest_paging paging; }; +enum vcpu_run_state { + VRS_HALT = 0, + VRS_INIT = (1 << 0), + VRS_RUN = (1 << 1), + + VRS_PEND_INIT = (1 << 14), + VRS_PEND_SIPI = (1 << 15), +}; +#define VRS_MASK_VALID(v) \ + ((v) & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI | VRS_PEND_SIPI)) +#define VRS_IS_VALID(v) ((v) == VRS_MASK_VALID(v)) + struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ @@ -348,10 +360,6 @@ uint64_t wval; } msr; struct { - int vcpu; - uint64_t rip; - } spinup_ap; - struct { uint64_t rflags; } hlt; struct { @@ -367,8 +375,8 @@ enum vm_entry_cmds { VEC_DEFAULT = 0, VEC_DISCARD_INSTR, /* discard inst emul state */ - VEC_COMPLETE_MMIO, /* entry includes result for mmio emul */ - VEC_COMPLETE_INOUT, /* entry includes result for inout emul */ + VEC_FULFILL_MMIO, /* entry includes result for mmio 
emul */ + VEC_FULFILL_INOUT, /* entry includes result for inout emul */ }; struct vm_entry { diff -r ce2b70e7aab0 -r 3285827f3d5d usr/src/uts/i86pc/sys/vmm_dev.h --- a/usr/src/uts/i86pc/sys/vmm_dev.h Mon Jan 04 14:49:49 2021 -0500 +++ b/usr/src/uts/i86pc/sys/vmm_dev.h Mon Jan 11 08:50:27 2021 -0500 @@ -259,6 +259,28 @@ }; _Static_assert(sizeof(struct vm_readwrite_kernemu_device) == 24, "ABI"); +enum vcpu_reset_kind { + VRK_RESET = 0, + /* + * The reset performed by an INIT IPI clears much of the CPU state, but + * some portions are left untouched, unlike VRK_RESET, which represents + * a "full" reset as if the system was freshly powered on. + */ + VRK_INIT = 1, +}; + +struct vm_vcpu_reset { + int vcpuid; + uint32_t kind; /* contains: enum vcpu_reset_kind */ +}; + +struct vm_run_state { + int vcpuid; + uint32_t state; /* of enum cpu_init_status type */ + uint8_t sipi_vector; /* vector of SIPI, if any */ + uint8_t _pad[3]; +}; + #define VMMCTL_IOC_BASE (('V' << 16) | ('M' << 8)) #define VMM_IOC_BASE (('v' << 16) | ('m' << 8)) #define VMM_LOCK_IOC_BASE (('v' << 16) | ('l' << 8)) @@ -291,6 +313,9 @@ #define VM_RESTART_INSTRUCTION (VMM_CPU_IOC_BASE | 0x13) #define VM_SET_KERNEMU_DEV (VMM_CPU_IOC_BASE | 0x14) #define VM_GET_KERNEMU_DEV (VMM_CPU_IOC_BASE | 0x15) +#define VM_RESET_CPU (VMM_CPU_IOC_BASE | 0x16) +#define VM_GET_RUN_STATE (VMM_CPU_IOC_BASE | 0x17) +#define VM_SET_RUN_STATE (VMM_CPU_IOC_BASE | 0x18) /* Operations requiring write-locking the VM */ #define VM_REINIT (VMM_LOCK_IOC_BASE | 0x01)
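
For reference, a minimal userspace sketch of the run-state flow introduced above, assuming a vmctx obtained from libvmmapi's vm_open(); park_ap() is a hypothetical helper mirroring the new spinup_halted_ap(), not part of this change:

#include <assert.h>
#include <vmmapi.h>

/*
 * Sketch: reset an AP and leave it halted so the in-kernel INIT/SIPI
 * handling (VRS_PEND_INIT/VRS_PEND_SIPI) can move it to VRS_RUN when the
 * BSP sends the startup IPIs.  Assumes 'ap' is a valid non-BSP vCPU id.
 */
static void
park_ap(struct vmctx *ctx, int ap)
{
	int error;

	assert(ap != 0);

	/* Power-on style reset of the vCPU architectural state (VRK_RESET) */
	error = vcpu_reset(ctx, ap);
	assert(error == 0);

	/* Park the AP halted; it waits in-kernel for INIT/SIPI */
	error = vm_set_run_state(ctx, ap, VRS_HALT, 0);
	assert(error == 0);
}

The BSP, by contrast, is placed directly into VRS_RUN before fbsdrun_addcpu(), as shown in the bhyverun.c hunk above.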