From 5cbb30dda9f1a1fc8ad10c8199d963c88af37fde Mon Sep 17 00:00:00 2001
From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
Date: Tue, 11 Nov 2025 14:05:38 -0800
Subject: [PATCH 1/4] Refactor 'run'

Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
---
 .../src/hypervisor/crashdump.rs               |    8 +-
 .../src/hypervisor/gdb/arch.rs                |   34 +-
 .../src/hypervisor/gdb/hyperv_debug.rs        |   51 +-
 .../src/hypervisor/gdb/kvm_debug.rs           |   47 +-
 src/hyperlight_host/src/hypervisor/gdb/mod.rs |    4 +-
 .../src/hypervisor/gdb/mshv_debug.rs          |   51 +-
 .../src/hypervisor/hyperv_linux.rs            |  410 ++-----
 .../src/hypervisor/hyperv_windows.rs          |  516 +++-----
 src/hyperlight_host/src/hypervisor/kvm.rs     |  375 ++----
 src/hyperlight_host/src/hypervisor/mod.rs     | 1053 +++++++++--------
 .../hypervisor/windows_hypervisor_platform.rs |   50 -
 .../src/sandbox/initialized_multi_use.rs      |   47 +-
 src/hyperlight_host/src/sandbox/outb.rs       |   20 +-
 src/tests/rust_guests/witguest/Cargo.lock     |    8 +-
 14 files changed, 956 insertions(+), 1718 deletions(-)

diff --git a/src/hyperlight_host/src/hypervisor/crashdump.rs b/src/hyperlight_host/src/hypervisor/crashdump.rs
index 86dce75ff..467ca9633 100644
--- a/src/hyperlight_host/src/hypervisor/crashdump.rs
+++ b/src/hyperlight_host/src/hypervisor/crashdump.rs
@@ -23,7 +23,6 @@ use elfcore::{
     ReadProcessMemory, ThreadView, VaProtection, VaRegion,
 };
 
-use super::Hypervisor;
 use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
 use crate::{Result, new_error};
 
@@ -262,12 +261,7 @@ impl ReadProcessMemory for GuestMemReader {
 ///
 /// # Returns
 /// * `Result<()>`: Success or error
-pub(crate) fn generate_crashdump(hv: &dyn Hypervisor) -> Result<()> {
-    // Get crash context from hypervisor
-    let ctx = hv
-        .crashdump_context()
-        .map_err(|e| new_error!("Failed to get crashdump context: {:?}", e))?;
-
+pub(crate) fn generate_crashdump(ctx: Option<CrashDumpContext>) -> Result<()> {
     // Get env variable for core dump directory
     let core_dump_dir = std::env::var("HYPERLIGHT_CORE_DUMP_DIR").ok();
 
diff --git a/src/hyperlight_host/src/hypervisor/gdb/arch.rs b/src/hyperlight_host/src/hypervisor/gdb/arch.rs
index e75eade74..652b3af47 100644
--- a/src/hyperlight_host/src/hypervisor/gdb/arch.rs
+++ b/src/hyperlight_host/src/hypervisor/gdb/arch.rs
@@ -16,9 +16,8 @@ limitations under the License.
 
 //! This file contains architecture specific code for the x86_64
 
-use std::collections::HashMap;
-
 use super::VcpuStopReason;
+use crate::Result;
 
 // Described in Table 6-1. Exceptions and Interrupts at Page 6-13 Vol. 1
 // of Intel 64 and IA-32 Architectures Software Developer's Manual
@@ -51,61 +50,50 @@ pub(crate) const DR6_HW_BP_FLAGS_MASK: u64 = 0x0F << DR6_HW_BP_FLAGS_POS;
 
 /// Determine the reason the vCPU stopped
 /// This is done by checking the DR6 register and the exception id
-/// NOTE: Additional checks are done for the entrypoint, stored hw_breakpoints
-/// and sw_breakpoints to ensure the stop reason is valid with internal state
 pub(crate) fn vcpu_stop_reason(
-    single_step: bool,
     rip: u64,
-    dr6: u64,
     entrypoint: u64,
+    dr6: u64,
     exception: u32,
-    hw_breakpoints: &[u64],
-    sw_breakpoints: &HashMap<u64, [u8; SW_BP_SIZE]>,
-) -> VcpuStopReason {
+) -> Result<VcpuStopReason> {
     if DB_EX_ID == exception {
         // If the BS flag in DR6 register is set, it means a single step
         // instruction triggered the exit
         // Check page 19-4 Vol.
3B of Intel 64 and IA-32 // Architectures Software Developer's Manual - if dr6 & DR6_BS_FLAG_MASK != 0 && single_step { - return VcpuStopReason::DoneStep; + if dr6 & DR6_BS_FLAG_MASK != 0 { + return Ok(VcpuStopReason::DoneStep); } // If any of the B0-B3 flags in DR6 register is set, it means a // hardware breakpoint triggered the exit // Check page 19-4 Vol. 3B of Intel 64 and IA-32 // Architectures Software Developer's Manual - if DR6_HW_BP_FLAGS_MASK & dr6 != 0 && hw_breakpoints.contains(&rip) { + if DR6_HW_BP_FLAGS_MASK & dr6 != 0 { if rip == entrypoint { - return VcpuStopReason::EntryPointBp; + return Ok(VcpuStopReason::EntryPointBp); } - return VcpuStopReason::HwBp; + return Ok(VcpuStopReason::HwBp); } } - if BP_EX_ID == exception && sw_breakpoints.contains_key(&rip) { - return VcpuStopReason::SwBp; + if BP_EX_ID == exception { + return Ok(VcpuStopReason::SwBp); } // Log an error and provide internal debugging info log::error!( r"The vCPU exited because of an unknown reason: - single_step: {:?} rip: {:?} dr6: {:?} entrypoint: {:?} exception: {:?} - hw_breakpoints: {:?} - sw_breakpoints: {:?} ", - single_step, rip, dr6, entrypoint, exception, - hw_breakpoints, - sw_breakpoints, ); - VcpuStopReason::Unknown + Ok(VcpuStopReason::Unknown) } diff --git a/src/hyperlight_host/src/hypervisor/gdb/hyperv_debug.rs b/src/hyperlight_host/src/hypervisor/gdb/hyperv_debug.rs index f5b332d02..f1bb1bc56 100644 --- a/src/hyperlight_host/src/hypervisor/gdb/hyperv_debug.rs +++ b/src/hyperlight_host/src/hypervisor/gdb/hyperv_debug.rs @@ -16,10 +16,8 @@ limitations under the License. use std::collections::HashMap; -use windows::Win32::System::Hypervisor::WHV_VP_EXCEPTION_CONTEXT; - -use super::arch::{MAX_NO_OF_HW_BP, vcpu_stop_reason}; -use super::{GuestDebug, SW_BP_SIZE, VcpuStopReason}; +use super::arch::MAX_NO_OF_HW_BP; +use super::{GuestDebug, SW_BP_SIZE}; use crate::hypervisor::regs::{CommonFpu, CommonRegisters}; use crate::hypervisor::windows_hypervisor_platform::VMProcessor; use crate::hypervisor::wrappers::WHvDebugRegisters; @@ -52,15 +50,6 @@ impl HypervDebug { } } - /// Returns the instruction pointer from the stopped vCPU - fn get_instruction_pointer(&self, vcpu_fd: &VMProcessor) -> Result { - let regs = vcpu_fd - .regs() - .map_err(|e| new_error!("Could not retrieve registers from vCPU: {:?}", e))?; - - Ok(regs.rip) - } - /// This method sets the kvm debugreg fields to enable breakpoints at /// specific addresses /// @@ -120,42 +109,6 @@ impl HypervDebug { Ok(()) } - - /// Get the reason the vCPU has stopped - pub(crate) fn get_stop_reason( - &mut self, - vcpu_fd: &VMProcessor, - exception: WHV_VP_EXCEPTION_CONTEXT, - entrypoint: u64, - ) -> Result { - let rip = self.get_instruction_pointer(vcpu_fd)?; - let rip = self.translate_gva(vcpu_fd, rip)?; - - let debug_regs = vcpu_fd - .get_debug_regs() - .map_err(|e| new_error!("Could not retrieve registers from vCPU: {:?}", e))?; - - // Check if the vCPU stopped because of a hardware breakpoint - let reason = vcpu_stop_reason( - self.single_step, - rip, - debug_regs.dr6, - entrypoint, - exception.ExceptionType as u32, - &self.hw_breakpoints, - &self.sw_breakpoints, - ); - - if let VcpuStopReason::EntryPointBp = reason { - // In case the hw breakpoint is the entry point, remove it to - // avoid hanging here as gdb does not remove breakpoints it - // has not set. - // Gdb expects the target to be stopped when connected. 
- self.remove_hw_breakpoint(vcpu_fd, entrypoint)?; - } - - Ok(reason) - } } impl GuestDebug for HypervDebug { diff --git a/src/hyperlight_host/src/hypervisor/gdb/kvm_debug.rs b/src/hyperlight_host/src/hypervisor/gdb/kvm_debug.rs index ae5996d49..61111a23b 100644 --- a/src/hyperlight_host/src/hypervisor/gdb/kvm_debug.rs +++ b/src/hyperlight_host/src/hypervisor/gdb/kvm_debug.rs @@ -18,12 +18,12 @@ use std::collections::HashMap; use kvm_bindings::{ KVM_GUESTDBG_ENABLE, KVM_GUESTDBG_SINGLESTEP, KVM_GUESTDBG_USE_HW_BP, KVM_GUESTDBG_USE_SW_BP, - kvm_debug_exit_arch, kvm_guest_debug, + kvm_guest_debug, }; use kvm_ioctls::VcpuFd; -use super::arch::{MAX_NO_OF_HW_BP, SW_BP_SIZE, vcpu_stop_reason}; -use super::{GuestDebug, VcpuStopReason}; +use super::GuestDebug; +use super::arch::{MAX_NO_OF_HW_BP, SW_BP_SIZE}; use crate::hypervisor::regs::{CommonFpu, CommonRegisters}; use crate::{HyperlightError, Result, new_error}; @@ -59,15 +59,6 @@ impl KvmDebug { } } - /// Returns the instruction pointer from the stopped vCPU - fn get_instruction_pointer(&self, vcpu_fd: &VcpuFd) -> Result { - let regs = vcpu_fd - .get_regs() - .map_err(|e| new_error!("Could not retrieve registers from vCPU: {:?}", e))?; - - Ok(regs.rip) - } - /// This method sets the kvm debugreg fields to enable breakpoints at /// specific addresses /// @@ -106,38 +97,6 @@ impl KvmDebug { Ok(()) } - - /// Get the reason the vCPU has stopped - pub(crate) fn get_stop_reason( - &mut self, - vcpu_fd: &VcpuFd, - debug_exit: kvm_debug_exit_arch, - entrypoint: u64, - ) -> Result { - let rip = self.get_instruction_pointer(vcpu_fd)?; - let rip = self.translate_gva(vcpu_fd, rip)?; - - // Check if the vCPU stopped because of a hardware breakpoint - let reason = vcpu_stop_reason( - self.single_step, - rip, - debug_exit.dr6, - entrypoint, - debug_exit.exception, - &self.hw_breakpoints, - &self.sw_breakpoints, - ); - - if let VcpuStopReason::EntryPointBp = reason { - // In case the hw breakpoint is the entry point, remove it to - // avoid hanging here as gdb does not remove breakpoints it - // has not set. - // Gdb expects the target to be stopped when connected. - self.remove_hw_breakpoint(vcpu_fd, entrypoint)?; - } - - Ok(reason) - } } impl GuestDebug for KvmDebug { diff --git a/src/hyperlight_host/src/hypervisor/gdb/mod.rs b/src/hyperlight_host/src/hypervisor/gdb/mod.rs index fb4829ef0..e6459d1a3 100644 --- a/src/hyperlight_host/src/hypervisor/gdb/mod.rs +++ b/src/hyperlight_host/src/hypervisor/gdb/mod.rs @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -mod arch; +pub(crate) mod arch; mod event_loop; #[cfg(target_os = "windows")] mod hyperv_debug; @@ -235,7 +235,7 @@ impl DebugMemoryAccess { } /// Defines the possible reasons for which a vCPU can be stopped when debugging -#[derive(Debug)] +#[derive(Debug, PartialEq, Eq)] pub enum VcpuStopReason { Crash, DoneStep, diff --git a/src/hyperlight_host/src/hypervisor/gdb/mshv_debug.rs b/src/hyperlight_host/src/hypervisor/gdb/mshv_debug.rs index 0b5aba47c..f59d593a6 100644 --- a/src/hyperlight_host/src/hypervisor/gdb/mshv_debug.rs +++ b/src/hyperlight_host/src/hypervisor/gdb/mshv_debug.rs @@ -26,8 +26,8 @@ use mshv_bindings::{ }; use mshv_ioctls::VcpuFd; -use super::arch::{MAX_NO_OF_HW_BP, SW_BP_SIZE, vcpu_stop_reason}; -use super::{GuestDebug, VcpuStopReason}; +use super::GuestDebug; +use super::arch::{MAX_NO_OF_HW_BP, SW_BP_SIZE}; use crate::hypervisor::regs::{CommonFpu, CommonRegisters}; use crate::{HyperlightError, Result, new_error}; @@ -55,15 +55,6 @@ impl MshvDebug { } } - /// Returns the instruction pointer from the stopped vCPU - fn get_instruction_pointer(&self, vcpu_fd: &VcpuFd) -> Result { - let regs = vcpu_fd - .get_regs() - .map_err(|e| new_error!("Could not retrieve registers from vCPU: {:?}", e))?; - - Ok(regs.rip) - } - /// This method sets the vCPU debug register fields to enable breakpoints at /// specific addresses /// @@ -121,44 +112,6 @@ impl MshvDebug { Ok(()) } - - /// Returns the vCPU stop reason - pub(crate) fn get_stop_reason( - &mut self, - vcpu_fd: &VcpuFd, - exception: u16, - entrypoint: u64, - ) -> Result { - let regs = vcpu_fd - .get_debug_regs() - .map_err(|e| new_error!("Cannot retrieve debug registers from vCPU: {}", e))?; - - // DR6 register contains debug state related information - let debug_status = regs.dr6; - - let rip = self.get_instruction_pointer(vcpu_fd)?; - let rip = self.translate_gva(vcpu_fd, rip)?; - - let reason = vcpu_stop_reason( - self.single_step, - rip, - debug_status, - entrypoint, - exception as u32, - &self.hw_breakpoints, - &self.sw_breakpoints, - ); - - if let VcpuStopReason::EntryPointBp = reason { - // In case the hw breakpoint is the entry point, remove it to - // avoid hanging here as gdb does not remove breakpoints it - // has not set. - // Gdb expects the target to be stopped when connected. 
- self.remove_hw_breakpoint(vcpu_fd, entrypoint)?; - } - - Ok(reason) - } } impl GuestDebug for MshvDebug { diff --git a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs index 64074b5f9..9c96de39f 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs @@ -18,10 +18,16 @@ extern crate mshv_bindings; extern crate mshv_ioctls; use std::fmt::{Debug, Formatter}; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64}; use std::sync::{Arc, Mutex}; use log::{LevelFilter, error}; +#[cfg(gdb)] +use mshv_bindings::{ + DebugRegisters, HV_INTERCEPT_ACCESS_MASK_EXECUTE, hv_intercept_parameters, + hv_intercept_type_HV_INTERCEPT_TYPE_EXCEPTION, hv_message_type_HVMSG_X64_EXCEPTION_INTERCEPT, + mshv_install_intercept, +}; use mshv_bindings::{ FloatingPointUnit, SpecialRegisters, StandardRegisters, hv_message_type, hv_message_type_HVMSG_GPA_INTERCEPT, hv_message_type_HVMSG_UNMAPPED_GPA, @@ -30,16 +36,8 @@ use mshv_bindings::{ hv_partition_synthetic_processor_features, hv_register_assoc, hv_register_name_HV_X64_REGISTER_RIP, hv_register_value, mshv_user_mem_region, }; -#[cfg(gdb)] -use mshv_bindings::{ - HV_INTERCEPT_ACCESS_MASK_EXECUTE, hv_intercept_parameters, - hv_intercept_type_HV_INTERCEPT_TYPE_EXCEPTION, hv_message_type_HVMSG_X64_EXCEPTION_INTERCEPT, - mshv_install_intercept, -}; use mshv_ioctls::{Mshv, VcpuFd, VmFd}; use tracing::{Span, instrument}; -#[cfg(feature = "trace_guest")] -use tracing_opentelemetry::OpenTelemetrySpanExt; #[cfg(crashdump)] use {super::crashdump, std::path::Path}; @@ -48,18 +46,17 @@ use super::gdb::{ DebugCommChannel, DebugMemoryAccess, DebugMsg, DebugResponse, GuestDebug, MshvDebug, VcpuStopReason, }; -use super::{HyperlightExit, Hypervisor, LinuxInterruptHandle, VirtualCPU}; +use super::{Hypervisor, LinuxInterruptHandle}; #[cfg(gdb)] use crate::HyperlightError; -use crate::hypervisor::get_memory_access_violation; use crate::hypervisor::regs::CommonFpu; +use crate::hypervisor::{InterruptHandle, InterruptHandleImpl, VmExit}; use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags}; use crate::mem::mgr::SandboxMemoryManager; use crate::mem::ptr::{GuestPtr, RawPtr}; use crate::mem::shared_mem::HostSharedMemory; use crate::sandbox::SandboxConfiguration; use crate::sandbox::host_funcs::FunctionRegistry; -use crate::sandbox::outb::handle_outb; #[cfg(feature = "mem_profile")] use crate::sandbox::trace::MemTraceInfo; #[cfg(crashdump)] @@ -68,9 +65,8 @@ use crate::{Result, log_then_return, new_error}; #[cfg(gdb)] mod debug { - use super::mshv_bindings::hv_x64_exception_intercept_message; use super::{HypervLinuxDriver, *}; - use crate::hypervisor::gdb::{DebugMemoryAccess, DebugMsg, DebugResponse, VcpuStopReason}; + use crate::hypervisor::gdb::{DebugMemoryAccess, DebugMsg, DebugResponse}; use crate::{Result, new_error}; impl HypervLinuxDriver { @@ -85,19 +81,6 @@ mod debug { Ok(()) } - /// Get the reason the vCPU has stopped - pub(crate) fn get_stop_reason( - &mut self, - ex_info: hv_x64_exception_intercept_message, - ) -> Result { - let debug = self - .debug - .as_mut() - .ok_or_else(|| new_error!("Debug is not enabled"))?; - - debug.get_stop_reason(&self.vcpu_fd, ex_info.exception_vector, self.entrypoint) - } - pub(crate) fn process_dbg_request( &mut self, req: DebugMsg, @@ -273,9 +256,7 @@ pub(crate) struct HypervLinuxDriver { vcpu_fd: VcpuFd, orig_rsp: GuestPtr, entrypoint: u64, - interrupt_handle: Arc, - 
mem_mgr: Option>, - host_funcs: Option>>, + interrupt_handle: Arc, sandbox_regions: Vec, // Initially mapped regions when sandbox is created mmap_regions: Vec, // Later mapped regions @@ -374,10 +355,8 @@ impl HypervLinuxDriver { vm_fd.map_user_memory(mshv_region) })?; - let interrupt_handle = Arc::new(LinuxInterruptHandle { + let interrupt_handle: Arc = Arc::new(LinuxInterruptHandle { running: AtomicU64::new(0), - cancel_requested: AtomicU64::new(0), - call_active: AtomicBool::new(false), #[cfg(gdb)] debug_interrupt: AtomicBool::new(false), #[cfg(all( @@ -409,8 +388,6 @@ impl HypervLinuxDriver { entrypoint: entrypoint_ptr.absolute()?, orig_rsp: rsp_ptr, interrupt_handle: interrupt_handle.clone(), - mem_mgr: None, - host_funcs: None, #[cfg(gdb)] debug, #[cfg(gdb)] @@ -471,13 +448,11 @@ impl Hypervisor for HypervLinuxDriver { peb_addr: RawPtr, seed: u64, page_size: u32, - mem_mgr: SandboxMemoryManager, + mut mem_mgr: SandboxMemoryManager, host_funcs: Arc>, max_guest_log_level: Option, #[cfg(gdb)] dbg_mem_access_fn: Arc>>, ) -> Result<()> { - self.mem_mgr = Some(mem_mgr); - self.host_funcs = Some(host_funcs); self.page_size = page_size as usize; let max_guest_log_level: u64 = match max_guest_log_level { @@ -500,10 +475,17 @@ impl Hypervisor for HypervLinuxDriver { }; self.vcpu_fd.set_regs(®s)?; - VirtualCPU::run( - self.as_mut_hypervisor(), + self.run( + self.entrypoint, + self.interrupt_handle.clone(), + &self.sandbox_regions.clone(), + &self.mmap_regions.clone(), + &mut mem_mgr, + host_funcs.clone(), #[cfg(gdb)] dbg_mem_access_fn, + #[cfg(crashdump)] + &self.rt_cfg.clone(), ) } @@ -546,6 +528,8 @@ impl Hypervisor for HypervLinuxDriver { fn dispatch_call_from_host( &mut self, dispatch_func_addr: RawPtr, + mem_mgr: &mut SandboxMemoryManager, + host_funcs: Arc>, #[cfg(gdb)] dbg_mem_access_fn: Arc>>, ) -> Result<()> { // Reset general purpose registers, then set RIP and RSP @@ -560,273 +544,85 @@ impl Hypervisor for HypervLinuxDriver { // reset fpu state self.set_fpu(&CommonFpu::default())?; - // run - VirtualCPU::run( - self.as_mut_hypervisor(), + // run - extract values before calling to avoid borrow conflicts + self.run( + self.entrypoint, + self.interrupt_handle.clone(), + &self.sandbox_regions.clone(), + &self.mmap_regions.clone(), + mem_mgr, + host_funcs.clone(), #[cfg(gdb)] dbg_mem_access_fn, - )?; - - Ok(()) - } - - #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")] - fn handle_io( - &mut self, - port: u16, - data: Vec, - rip: u64, - instruction_length: u64, - ) -> Result<()> { - let mut padded = [0u8; 4]; - let copy_len = data.len().min(4); - padded[..copy_len].copy_from_slice(&data[..copy_len]); - let val = u32::from_le_bytes(padded); - - #[cfg(feature = "mem_profile")] - { - // We need to handle the borrow checker issue where we need both: - // - &mut SandboxMemoryManager (from self.mem_mgr) - // - &mut dyn Hypervisor (from self) - // We'll use a temporary approach to extract the mem_mgr temporarily - let mem_mgr_option = self.mem_mgr.take(); - let mut mem_mgr = mem_mgr_option - .ok_or_else(|| new_error!("mem_mgr should be initialized before handling IO"))?; - let host_funcs = self - .host_funcs - .as_ref() - .ok_or_else(|| new_error!("host_funcs should be initialized before handling IO"))? 
- .clone(); - - handle_outb(&mut mem_mgr, host_funcs, self, port, val)?; - - // Put the mem_mgr back - self.mem_mgr = Some(mem_mgr); - } - - #[cfg(not(feature = "mem_profile"))] - { - let mem_mgr = self - .mem_mgr - .as_mut() - .ok_or_else(|| new_error!("mem_mgr should be initialized before handling IO"))?; - let host_funcs = self - .host_funcs - .as_ref() - .ok_or_else(|| new_error!("host_funcs should be initialized before handling IO"))? - .clone(); - - handle_outb(mem_mgr, host_funcs, port, val)?; - } - - // update rip - self.vcpu_fd.set_reg(&[hv_register_assoc { - name: hv_register_name_HV_X64_REGISTER_RIP, - value: hv_register_value { - reg64: rip + instruction_length, - }, - ..Default::default() - }])?; - Ok(()) + #[cfg(crashdump)] + &self.rt_cfg.clone(), + ) } #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")] - fn run( - &mut self, - #[cfg(feature = "trace_guest")] tc: &mut crate::sandbox::trace::TraceContext, - ) -> Result { - const HALT_MESSAGE: hv_message_type = hv_message_type_HVMSG_X64_HALT; - const IO_PORT_INTERCEPT_MESSAGE: hv_message_type = - hv_message_type_HVMSG_X64_IO_PORT_INTERCEPT; - const UNMAPPED_GPA_MESSAGE: hv_message_type = hv_message_type_HVMSG_UNMAPPED_GPA; - const INVALID_GPA_ACCESS_MESSAGE: hv_message_type = hv_message_type_HVMSG_GPA_INTERCEPT; + fn run_vcpu(&mut self) -> Result { + const HALT: hv_message_type = hv_message_type_HVMSG_X64_HALT; + const IO_PORT: hv_message_type = hv_message_type_HVMSG_X64_IO_PORT_INTERCEPT; + const UNMAPPED_GPA: hv_message_type = hv_message_type_HVMSG_UNMAPPED_GPA; + const INVALID_GPA: hv_message_type = hv_message_type_HVMSG_GPA_INTERCEPT; #[cfg(gdb)] const EXCEPTION_INTERCEPT: hv_message_type = hv_message_type_HVMSG_X64_EXCEPTION_INTERCEPT; - self.interrupt_handle - .tid - .store(unsafe { libc::pthread_self() as u64 }, Ordering::Release); - // Note: if `InterruptHandle::kill()` is called while this thread is **here** - // Cast to internal trait for access to internal methods - let interrupt_handle_internal = - self.interrupt_handle.as_ref() as &dyn super::InterruptHandleInternal; - - // (after set_running_bit but before checking cancel_requested): - // - kill() will stamp cancel_requested with the current generation - // - We will check cancel_requested below and skip the VcpuFd::run() call - // - This is the desired behavior - the kill takes effect immediately - let generation = interrupt_handle_internal.set_running_bit(); - - #[cfg(not(gdb))] - let debug_interrupt = false; - #[cfg(gdb)] - let debug_interrupt = self - .interrupt_handle - .debug_interrupt - .load(Ordering::Relaxed); - - // Don't run the vcpu if `cancel_requested` is set for our generation - // - // Note: if `InterruptHandle::kill()` is called while this thread is **here** - // (after checking cancel_requested but before vcpu.run()): - // - kill() will stamp cancel_requested with the current generation - // - We will proceed with vcpu.run(), but signals will be sent to interrupt it - // - The vcpu will be interrupted and return EINTR (handled below) - let exit_reason = if interrupt_handle_internal - .is_cancel_requested_for_generation(generation) - || debug_interrupt - { - Err(mshv_ioctls::MshvError::from(libc::EINTR)) - } else { - #[cfg(feature = "trace_guest")] - tc.setup_guest_trace(Span::current().context()); - - // Note: if a `InterruptHandle::kill()` called while this thread is **here** - // Then the vcpu will run, but we will keep sending signals to this thread - // to interrupt it until `running` is set to false. 
The `vcpu_fd::run()` call will - // return either normally with an exit reason, or from being "kicked" by out signal handler, with an EINTR error, - // both of which are fine. - self.vcpu_fd.run() - }; - // Note: if `InterruptHandle::kill()` is called while this thread is **here** - // (after vcpu.run() returns but before clear_running_bit): - // - kill() continues sending signals to this thread (running bit is still set) - // - The signals are harmless (no-op handler), we just need to check cancel_requested - // - We load cancel_requested below to determine if this run was cancelled - let cancel_requested = - interrupt_handle_internal.is_cancel_requested_for_generation(generation); - #[cfg(gdb)] - let debug_interrupt = self - .interrupt_handle - .debug_interrupt - .load(Ordering::Relaxed); - // Note: if `InterruptHandle::kill()` is called while this thread is **here** - // (after loading cancel_requested but before clear_running_bit): - // - kill() stamps cancel_requested with the CURRENT generation (not the one we just loaded) - // - kill() continues sending signals until running bit is cleared - // - The newly stamped cancel_requested will affect the NEXT vcpu.run() call - // - Signals sent now are harmless (no-op handler) - interrupt_handle_internal.clear_running_bit(); - // At this point, running bit is clear so kill() will stop sending signals. - // However, we may still receive delayed signals that were sent before clear_running_bit. - // These stale signals are harmless because: - // - The signal handler is a no-op - // - We check generation matching in cancel_requested before treating EINTR as cancellation - // - If generation doesn't match, we return Retry instead of Cancelled - let result = match exit_reason { + let run_result = self.vcpu_fd.run(); + let result = match run_result { Ok(m) => match m.header.message_type { - HALT_MESSAGE => { - crate::debug!("mshv - Halt Details : {:#?}", &self); - HyperlightExit::Halt() - } - IO_PORT_INTERCEPT_MESSAGE => { + HALT => VmExit::Halt(), + IO_PORT => { let io_message = m.to_ioport_info().map_err(mshv_ioctls::MshvError::from)?; let port_number = io_message.port_number; - let rip = io_message.header.rip; let rax = io_message.rax; - let instruction_length = io_message.header.instruction_length() as u64; - crate::debug!("mshv IO Details : \nPort : {}\n{:#?}", port_number, &self); - HyperlightExit::IoOut( - port_number, - rax.to_le_bytes().to_vec(), - rip, - instruction_length, - ) + // mshv, unlike kvm, does not automatically increment RIP + self.vcpu_fd.set_reg(&[hv_register_assoc { + name: hv_register_name_HV_X64_REGISTER_RIP, + value: hv_register_value { + reg64: io_message.header.rip + + io_message.header.instruction_length() as u64, + }, + ..Default::default() + }])?; + VmExit::IoOut(port_number, rax.to_le_bytes().to_vec()) } - UNMAPPED_GPA_MESSAGE => { + UNMAPPED_GPA => { let mimo_message = m.to_memory_info().map_err(mshv_ioctls::MshvError::from)?; let addr = mimo_message.guest_physical_address; - crate::debug!( - "mshv MMIO unmapped GPA -Details: Address: {} \n {:#?}", - addr, - &self - ); - HyperlightExit::Mmio(addr) + match MemoryRegionFlags::try_from(mimo_message)? 
{ + MemoryRegionFlags::READ => VmExit::MmioRead(addr), + MemoryRegionFlags::WRITE => VmExit::MmioWrite(addr), + _ => VmExit::Unknown("Unknown MMIO access".to_string()), + } } - INVALID_GPA_ACCESS_MESSAGE => { + INVALID_GPA => { let mimo_message = m.to_memory_info().map_err(mshv_ioctls::MshvError::from)?; let gpa = mimo_message.guest_physical_address; let access_info = MemoryRegionFlags::try_from(mimo_message)?; - crate::debug!( - "mshv MMIO invalid GPA access -Details: Address: {} \n {:#?}", - gpa, - &self - ); - match get_memory_access_violation( - gpa as usize, - self.sandbox_regions.iter().chain(self.mmap_regions.iter()), - access_info, - ) { - Some(access_info_violation) => access_info_violation, - None => HyperlightExit::Mmio(gpa), + match access_info { + MemoryRegionFlags::READ => VmExit::MmioRead(gpa), + MemoryRegionFlags::WRITE => VmExit::MmioWrite(gpa), + _ => VmExit::Unknown("Unknown MMIO access".to_string()), } } - // The only case an intercept exit is expected is when debugging is enabled - // and the intercepts are installed. - // Provide the extra information about the exception to accurately determine - // the stop reason #[cfg(gdb)] EXCEPTION_INTERCEPT => { - // Extract exception info from the message so we can figure out - // more information about the vCPU state - let ex_info = match m.to_exception_info().map_err(mshv_ioctls::MshvError::from) - { - Ok(info) => info, - Err(e) => { - log_then_return!("Error converting to exception info: {:?}", e); - } - }; - - match self.get_stop_reason(ex_info) { - Ok(reason) => HyperlightExit::Debug(reason), - Err(e) => { - log_then_return!("Error getting stop reason: {:?}", e); - } + let exception_message = m + .to_exception_info() + .map_err(mshv_ioctls::MshvError::from)?; + let DebugRegisters { dr6, .. } = self.vcpu_fd.get_debug_regs()?; + VmExit::Debug { + dr6, + exception: exception_message.exception_vector as u32, } } - other => { - crate::debug!("mshv Other Exit: Exit: {:#?} \n {:#?}", other, &self); - #[cfg(crashdump)] - let _ = crashdump::generate_crashdump(self); - log_then_return!("unknown Hyper-V run message type {:?}", other); - } + other => VmExit::Unknown(format!("Unknown MSHV VCPU exit: {:?}", other)), }, Err(e) => match e.errno() { - // We send a signal (SIGRTMIN+offset) to interrupt the vcpu, which causes EINTR - libc::EINTR => { - // Check if cancellation was requested for THIS specific generation. - // If not, the EINTR came from: - // - A debug interrupt (if GDB is enabled) - // - A stale signal from a previous guest call (generation mismatch) - // - A signal meant for a different sandbox on the same thread - // In these cases, we return Retry to continue execution. 
- if cancel_requested { - interrupt_handle_internal.clear_cancel_requested(); - HyperlightExit::Cancelled() - } else { - #[cfg(gdb)] - if debug_interrupt { - self.interrupt_handle - .debug_interrupt - .store(false, Ordering::Relaxed); - - // If the vCPU was stopped because of an interrupt, we need to - // return a special exit reason so that the gdb thread can handle it - // and resume execution - HyperlightExit::Debug(VcpuStopReason::Interrupt) - } else { - HyperlightExit::Retry() - } - - #[cfg(not(gdb))] - HyperlightExit::Retry() - } - } - libc::EAGAIN => HyperlightExit::Retry(), - _ => { - crate::debug!("mshv Error - Details: Error: {} \n {:#?}", e, &self); - log_then_return!("Error running VCPU {:?}", e); - } + libc::EINTR => VmExit::Cancelled(), + libc::EAGAIN => VmExit::Retry(), + _ => VmExit::Unknown(format!("Unknown MSHV VCPU error: {}", e)), }, }; Ok(result) @@ -865,12 +661,7 @@ impl Hypervisor for HypervLinuxDriver { Ok(()) } - #[instrument(skip_all, parent = Span::current(), level = "Trace")] - fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor { - self as &mut dyn Hypervisor - } - - fn interrupt_handle(&self) -> Arc { + fn interrupt_handle(&self) -> Arc { self.interrupt_handle.clone() } @@ -1030,6 +821,16 @@ impl Hypervisor for HypervLinuxDriver { // If the vCPU stopped because of any other reason except a crash, we can handle it // normally _ => { + // Temporary spot to remove hw breakpoints on exit + // TODO: remove in future PR + if stop_reason == VcpuStopReason::EntryPointBp { + #[allow(clippy::unwrap_used)] // we checked this above + self.debug + .as_mut() + .unwrap() + .remove_hw_breakpoint(&self.vcpu_fd, self.entrypoint)?; + } + // Send the stop reason to the gdb thread self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason)) .map_err(|e| { @@ -1074,23 +875,32 @@ impl Hypervisor for HypervLinuxDriver { Ok(()) } - fn check_stack_guard(&self) -> Result { - if let Some(mgr) = self.mem_mgr.as_ref() { - mgr.check_stack_guard() - } else { - Err(new_error!("Memory manager is not initialized")) - } + #[cfg(gdb)] + fn gdb_connection(&self) -> Option<&DebugCommChannel> { + self.gdb_conn.as_ref() + } + + #[cfg(gdb)] + fn translate_gva(&self, gva: u64) -> Result { + use mshv_bindings::{HV_TRANSLATE_GVA_VALIDATE_READ, HV_TRANSLATE_GVA_VALIDATE_WRITE}; + + let flags = (HV_TRANSLATE_GVA_VALIDATE_READ | HV_TRANSLATE_GVA_VALIDATE_WRITE) as u64; + let (addr, _) = self + .vcpu_fd + .translate_gva(gva, flags) + .map_err(|_| HyperlightError::TranslateGuestAddress(gva))?; + + Ok(addr) } #[cfg(feature = "trace_guest")] - fn handle_trace(&mut self, tc: &mut crate::sandbox::trace::TraceContext) -> Result<()> { + fn handle_trace( + &mut self, + tc: &mut crate::sandbox::trace::TraceContext, + mem_mgr: &mut SandboxMemoryManager, + ) -> Result<()> { let regs = self.regs()?; - tc.handle_trace( - ®s, - self.mem_mgr.as_mut().ok_or_else(|| { - new_error!("Memory manager is not initialized before handling trace") - })?, - ) + tc.handle_trace(®s, mem_mgr) } #[cfg(feature = "mem_profile")] @@ -1102,7 +912,7 @@ impl Hypervisor for HypervLinuxDriver { impl Drop for HypervLinuxDriver { #[instrument(skip_all, parent = Span::current(), level = "Trace")] fn drop(&mut self) { - self.interrupt_handle.dropped.store(true, Ordering::Relaxed); + self.interrupt_handle.set_dropped(); for region in self.sandbox_regions.iter().chain(self.mmap_regions.iter()) { let mshv_region: mshv_user_mem_region = region.to_owned().into(); match self.vm_fd.unmap_user_memory(mshv_region) { diff --git 
a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs index ca1f75997..9cb611f67 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs @@ -14,19 +14,19 @@ See the License for the specific language governing permissions and limitations under the License. */ +use std::ffi::c_void; use std::fmt; use std::fmt::{Debug, Formatter}; use std::string::String; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64}; use std::sync::{Arc, Mutex}; use log::LevelFilter; use tracing::{Span, instrument}; -#[cfg(feature = "trace_guest")] -use tracing_opentelemetry::OpenTelemetrySpanExt; use windows::Win32::System::Hypervisor::{ - WHV_MEMORY_ACCESS_TYPE, WHV_PARTITION_HANDLE, WHV_RUN_VP_EXIT_CONTEXT, WHV_RUN_VP_EXIT_REASON, - WHvCancelRunVirtualProcessor, + WHV_MEMORY_ACCESS_TYPE, WHV_REGISTER_VALUE, WHV_RUN_VP_EXIT_CONTEXT, WHV_RUN_VP_EXIT_REASON, + WHvRunVirtualProcessor, WHvRunVpExitReasonCanceled, WHvRunVpExitReasonMemoryAccess, + WHvRunVpExitReasonX64Halt, WHvRunVpExitReasonX64IoPortAccess, WHvX64RegisterRip, }; #[cfg(crashdump)] use {super::crashdump, std::path::Path}; @@ -37,6 +37,10 @@ use { VcpuStopReason, }, crate::HyperlightError, + crate::new_error, + windows::Win32::System::Hypervisor::WHvGetVirtualProcessorRegisters, + windows::Win32::System::Hypervisor::WHvRunVpExitReasonException, + windows::Win32::System::Hypervisor::WHvX64RegisterDr6, }; use super::regs::CommonSpecialRegisters; @@ -44,28 +48,26 @@ use super::surrogate_process::SurrogateProcess; use super::surrogate_process_manager::*; use super::windows_hypervisor_platform::{VMPartition, VMProcessor}; use super::wrappers::HandleWrapper; -use super::{HyperlightExit, Hypervisor, InterruptHandle, VirtualCPU}; -use crate::hypervisor::regs::{CommonFpu, CommonRegisters}; -use crate::hypervisor::{InterruptHandleInternal, get_memory_access_violation}; +use super::{Hypervisor, InterruptHandle}; +use crate::hypervisor::regs::{Align16, CommonFpu, CommonRegisters}; +use crate::hypervisor::{InterruptHandleImpl, VmExit, WindowsInterruptHandle}; use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags}; use crate::mem::mgr::SandboxMemoryManager; use crate::mem::ptr::{GuestPtr, RawPtr}; use crate::mem::shared_mem::HostSharedMemory; use crate::sandbox::host_funcs::FunctionRegistry; -use crate::sandbox::outb::handle_outb; #[cfg(feature = "mem_profile")] use crate::sandbox::trace::MemTraceInfo; #[cfg(crashdump)] use crate::sandbox::uninitialized::SandboxRuntimeConfig; -use crate::{Result, debug, log_then_return, new_error}; +use crate::{Result, log_then_return}; #[cfg(gdb)] mod debug { - use windows::Win32::System::Hypervisor::WHV_VP_EXCEPTION_CONTEXT; use super::{HypervWindowsDriver, *}; use crate::Result; - use crate::hypervisor::gdb::{DebugMemoryAccess, DebugMsg, DebugResponse, VcpuStopReason}; + use crate::hypervisor::gdb::{DebugMemoryAccess, DebugMsg, DebugResponse}; impl HypervWindowsDriver { /// Resets the debug information to disable debugging @@ -79,19 +81,6 @@ mod debug { Ok(()) } - /// Get the reason the vCPU has stopped - pub(crate) fn get_stop_reason( - &mut self, - exception: WHV_VP_EXCEPTION_CONTEXT, - ) -> Result { - let debug = self - .debug - .as_mut() - .ok_or_else(|| new_error!("Debug is not enabled"))?; - - debug.get_stop_reason(&self.processor, exception, self.entrypoint) - } - pub(crate) fn process_dbg_request( &mut self, req: DebugMsg, @@ -263,9 +252,7 
@@ pub(crate) struct HypervWindowsDriver { _surrogate_process: SurrogateProcess, // we need to keep a reference to the SurrogateProcess for the duration of the driver since otherwise it will dropped and the memory mapping will be unmapped and the surrogate process will be returned to the pool entrypoint: u64, orig_rsp: GuestPtr, - interrupt_handle: Arc, - mem_mgr: Option>, - host_funcs: Option>>, + interrupt_handle: Arc, sandbox_regions: Vec, // Initially mapped regions when sandbox is created mmap_regions: Vec, // Later mapped regions @@ -326,12 +313,10 @@ impl HypervWindowsDriver { (None, None) }; - let interrupt_handle = Arc::new(WindowsInterruptHandle { - running: AtomicU64::new(0), - cancel_requested: AtomicU64::new(0), + let interrupt_handle: Arc = Arc::new(WindowsInterruptHandle { + state: AtomicU64::new(0), #[cfg(gdb)] debug_interrupt: AtomicBool::new(false), - call_active: AtomicBool::new(false), partition_handle, dropped: AtomicBool::new(false), }); @@ -342,8 +327,6 @@ impl HypervWindowsDriver { entrypoint, orig_rsp: GuestPtr::try_from(RawPtr::from(rsp))?, interrupt_handle: interrupt_handle.clone(), - mem_mgr: None, - host_funcs: None, sandbox_regions: mem_regions, mmap_regions: Vec::new(), #[cfg(gdb)] @@ -367,17 +350,6 @@ impl HypervWindowsDriver { Ok(hv) } - - #[inline] - #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")] - fn get_exit_details(&self, exit_reason: WHV_RUN_VP_EXIT_REASON) -> Result { - let mut error = String::new(); - error.push_str(&format!( - "Did not receive a halt from Hypervisor as expected - Received {exit_reason:?}!\n" - )); - error.push_str(&format!("Registers: \n{:#?}", self.processor.regs()?)); - Ok(error) - } } impl Debug for HypervWindowsDriver { @@ -415,14 +387,11 @@ impl Hypervisor for HypervWindowsDriver { peb_address: RawPtr, seed: u64, page_size: u32, - mem_mgr: SandboxMemoryManager, + mut mem_mgr: SandboxMemoryManager, host_funcs: Arc>, max_guest_log_level: Option, - #[cfg(gdb)] dbg_mem_access_hdl: Arc>>, + #[cfg(gdb)] dbg_mem_access_fn: Arc>>, ) -> Result<()> { - self.mem_mgr = Some(mem_mgr); - self.host_funcs = Some(host_funcs); - let max_guest_log_level: u64 = match max_guest_log_level { Some(level) => level as u64, None => self.get_max_log_level().into(), @@ -443,10 +412,17 @@ impl Hypervisor for HypervWindowsDriver { }; self.set_regs(®s)?; - VirtualCPU::run( - self.as_mut_hypervisor(), + self.run( + self.entrypoint, + self.interrupt_handle.clone(), + &self.sandbox_regions.clone(), + &self.mmap_regions.clone(), + &mut mem_mgr, + host_funcs.clone(), #[cfg(gdb)] - dbg_mem_access_hdl, + dbg_mem_access_fn, + #[cfg(crashdump)] + &self.rt_cfg.clone(), ) } @@ -468,7 +444,9 @@ impl Hypervisor for HypervWindowsDriver { fn dispatch_call_from_host( &mut self, dispatch_func_addr: RawPtr, - #[cfg(gdb)] dbg_mem_access_hdl: Arc>>, + mem_mgr: &mut SandboxMemoryManager, + host_funcs: Arc>, + #[cfg(gdb)] dbg_mem_access_fn: Arc>>, ) -> Result<()> { // Reset general purpose registers, then set RIP and RSP let regs = CommonRegisters { @@ -482,158 +460,53 @@ impl Hypervisor for HypervWindowsDriver { // reset fpu state self.processor.set_fpu(&CommonFpu::default())?; - VirtualCPU::run( - self.as_mut_hypervisor(), + self.run( + self.entrypoint, + self.interrupt_handle.clone(), + &self.sandbox_regions.clone(), + &self.mmap_regions.clone(), + mem_mgr, + host_funcs.clone(), #[cfg(gdb)] - dbg_mem_access_hdl, - )?; - - Ok(()) - } - - #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")] - fn handle_io( - &mut self, - 
port: u16, - data: Vec, - rip: u64, - instruction_length: u64, - ) -> Result<()> { - let mut padded = [0u8; 4]; - let copy_len = data.len().min(4); - padded[..copy_len].copy_from_slice(&data[..copy_len]); - let val = u32::from_le_bytes(padded); - - #[cfg(feature = "mem_profile")] - { - // We need to handle the borrow checker issue where we need both: - // - &mut SandboxMemoryManager (from self.mem_mgr.as_mut()) - // - &mut dyn Hypervisor (from self) - // We'll use a temporary approach to extract the mem_mgr temporarily - let mem_mgr_option = self.mem_mgr.take(); - let mut mem_mgr = mem_mgr_option - .ok_or_else(|| new_error!("mem_mgr should be initialized before handling IO"))?; - let host_funcs = self - .host_funcs - .as_ref() - .ok_or_else(|| new_error!("host_funcs should be initialized before handling IO"))? - .clone(); - - handle_outb(&mut mem_mgr, host_funcs, self, port, val)?; - - // Put the mem_mgr back - self.mem_mgr = Some(mem_mgr); - } - - #[cfg(not(feature = "mem_profile"))] - { - let mem_mgr = self - .mem_mgr - .as_mut() - .ok_or_else(|| new_error!("mem_mgr should be initialized before handling IO"))?; - let host_funcs = self - .host_funcs - .as_ref() - .ok_or_else(|| new_error!("host_funcs should be initialized before handling IO"))? - .clone(); - - handle_outb(mem_mgr, host_funcs, port, val)?; - } - - let mut regs = self.regs()?; - regs.rip = rip + instruction_length; - self.set_regs(®s) + dbg_mem_access_fn, + #[cfg(crashdump)] + &self.rt_cfg.clone(), + ) } - #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")] - fn run( - &mut self, - #[cfg(feature = "trace_guest")] tc: &mut crate::sandbox::trace::TraceContext, - ) -> Result { - // Cast to internal trait for access to internal methods - let interrupt_handle_internal = - self.interrupt_handle.as_ref() as &dyn super::InterruptHandleInternal; - - // Get current generation and set running bit - let generation = interrupt_handle_internal.set_running_bit(); - - #[cfg(not(gdb))] - let debug_interrupt = false; - #[cfg(gdb)] - let debug_interrupt = self - .interrupt_handle - .debug_interrupt - .load(Ordering::Relaxed); - - // Check if cancellation was requested for THIS generation - let exit_context = if interrupt_handle_internal - .is_cancel_requested_for_generation(generation) - || debug_interrupt - { - WHV_RUN_VP_EXIT_CONTEXT { - ExitReason: WHV_RUN_VP_EXIT_REASON(8193i32), // WHvRunVpExitReasonCanceled - VpContext: Default::default(), - Anonymous: Default::default(), - Reserved: Default::default(), - } - } else { - #[cfg(feature = "trace_guest")] - tc.setup_guest_trace(Span::current().context()); - - self.processor.run()? 
-        };
-
-        // Clear running bit
-        interrupt_handle_internal.clear_running_bit();
-
-        let is_canceled = exit_context.ExitReason == WHV_RUN_VP_EXIT_REASON(8193i32); // WHvRunVpExitReasonCanceled
-
-        // Check if this was a manual cancellation (vs internal Windows cancellation)
-        let cancel_was_requested_manually =
-            interrupt_handle_internal.is_cancel_requested_for_generation(generation);
-
-        // Only clear cancel_requested if we're actually processing a cancellation for this generation
-        if is_canceled && cancel_was_requested_manually {
-            interrupt_handle_internal.clear_cancel_requested();
+    #[expect(non_upper_case_globals, reason = "Windows API constants are lower case")]
+    fn run_vcpu(&mut self) -> Result<VmExit> {
+        let mut exit_context: WHV_RUN_VP_EXIT_CONTEXT = Default::default();
+
+        unsafe {
+            WHvRunVirtualProcessor(
+                self.processor.get_partition_hdl(),
+                0,
+                &mut exit_context as *mut _ as *mut c_void,
+                std::mem::size_of::<WHV_RUN_VP_EXIT_CONTEXT>() as u32,
+            )?;
         }
-        #[cfg(gdb)]
-        let debug_interrupt = self
-            .interrupt_handle
-            .debug_interrupt
-            .load(Ordering::Relaxed);
         let result = match exit_context.ExitReason {
-            // WHvRunVpExitReasonX64IoPortAccess
-            WHV_RUN_VP_EXIT_REASON(2i32) => {
-                // size of current instruction is in lower byte of _bitfield
-                // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvexitcontextdatatypes)
+            WHvRunVpExitReasonX64IoPortAccess => unsafe {
                 let instruction_length = exit_context.VpContext._bitfield & 0xF;
-                unsafe {
-                    debug!(
-                        "HyperV IO Details :\n Port: {:#x} \n {:#?}",
-                        exit_context.Anonymous.IoPortAccess.PortNumber, &self
-                    );
-                    HyperlightExit::IoOut(
-                        exit_context.Anonymous.IoPortAccess.PortNumber,
-                        exit_context
-                            .Anonymous
-                            .IoPortAccess
-                            .Rax
-                            .to_le_bytes()
-                            .to_vec(),
-                        exit_context.VpContext.Rip,
-                        instruction_length as u64,
-                    )
-                }
-            }
-            // HvRunVpExitReasonX64Halt
-            WHV_RUN_VP_EXIT_REASON(8i32) => {
-                debug!("HyperV Halt Details :\n {:#?}", &self);
-                HyperlightExit::Halt()
-            }
-            // WHvRunVpExitReasonMemoryAccess
-            WHV_RUN_VP_EXIT_REASON(1i32) => {
+                let rip = exit_context.VpContext.Rip + instruction_length as u64;
+                self.processor.set_registers(&[(
+                    WHvX64RegisterRip,
+                    Align16(WHV_REGISTER_VALUE { Reg64: rip }),
+                )])?;
+                VmExit::IoOut(
+                    exit_context.Anonymous.IoPortAccess.PortNumber,
+                    exit_context
+                        .Anonymous
+                        .IoPortAccess
+                        .Rax
+                        .to_le_bytes()
+                        .to_vec(),
+                )
+            },
+            WHvRunVpExitReasonX64Halt => VmExit::Halt(),
+            WHvRunVpExitReasonMemoryAccess => {
                 let gpa = unsafe { exit_context.Anonymous.MemoryAccess.Gpa };
                 let access_info = unsafe {
                     WHV_MEMORY_ACCESS_TYPE(
@@ -642,86 +515,44 @@ impl Hypervisor for HypervWindowsDriver {
                     )
                 };
                 let access_info = MemoryRegionFlags::try_from(access_info)?;
-                debug!(
-                    "HyperV Memory Access Details :\n GPA: {:#?}\n Access Info :{:#?}\n {:#?} ",
-                    gpa, access_info, &self
-                );
-
-                match get_memory_access_violation(
-                    gpa as usize,
-                    self.sandbox_regions.iter().chain(self.mmap_regions.iter()),
-                    access_info,
-                ) {
-                    Some(access_info) => access_info,
-                    None => HyperlightExit::Mmio(gpa),
-                }
-            }
-            // WHvRunVpExitReasonCanceled
-            // Execution was cancelled by the host.
- // This will happen when guest code runs for too long - WHV_RUN_VP_EXIT_REASON(8193i32) => { - debug!("HyperV Cancelled Details :\n {:#?}", &self); - #[cfg(gdb)] - if debug_interrupt { - self.interrupt_handle - .debug_interrupt - .store(false, Ordering::Relaxed); - - // If the vCPU was stopped because of an interrupt, we need to - // return a special exit reason so that the gdb thread can handle it - // and resume execution - HyperlightExit::Debug(VcpuStopReason::Interrupt) - } else if !cancel_was_requested_manually { - // This was an internal cancellation - // The virtualization stack can use this function to return the control - // of a virtual processor back to the virtualization stack in case it - // needs to change the state of a VM or to inject an event into the processor - // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks - debug!("Internal cancellation detected, returning Retry error"); - HyperlightExit::Retry() - } else { - HyperlightExit::Cancelled() - } - - #[cfg(not(gdb))] - { - if !cancel_was_requested_manually { - // This was an internal cancellation - // The virtualization stack can use this function to return the control - // of a virtual processor back to the virtualization stack in case it - // needs to change the state of a VM or to inject an event into the processor - // see https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks - debug!("Internal cancellation detected, returning Retry error"); - HyperlightExit::Retry() - } else { - HyperlightExit::Cancelled() - } + match access_info { + MemoryRegionFlags::READ => VmExit::MmioRead(gpa), + MemoryRegionFlags::WRITE => VmExit::MmioWrite(gpa), + _ => VmExit::Unknown("Unknown memory access type".to_string()), } } + // Execution was cancelled by the host. 
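+            // Cancellation can come either from the host (`InterruptHandle::kill()`
+            // or the debugger) or from the virtualization stack itself, which may
+            // return control of the virtual processor to the host to change VM
+            // state or inject an event; see
+            // https://learn.microsoft.com/en-us/virtualization/api/hypervisor-platform/funcs/whvcancelrunvirtualprocessor#remarks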
+ WHvRunVpExitReasonCanceled => VmExit::Cancelled(), #[cfg(gdb)] - WHV_RUN_VP_EXIT_REASON(4098i32) => { - // Get information about the exception that triggered the exit + WHvRunVpExitReasonException => { let exception = unsafe { exit_context.Anonymous.VpException }; - match self.get_stop_reason(exception) { - Ok(reason) => HyperlightExit::Debug(reason), - Err(e) => { - log_then_return!("Error getting stop reason: {}", e); + // Get the DR6 register to see which breakpoint was hit + let dr6 = { + let names = [WHvX64RegisterDr6]; + let mut out: [Align16; 1] = unsafe { std::mem::zeroed() }; + unsafe { + WHvGetVirtualProcessorRegisters( + self.processor.get_partition_hdl(), + 0, + names.as_ptr(), + out.len() as u32, + out.as_mut_ptr() as *mut WHV_REGISTER_VALUE, + )?; } + unsafe { out[0].0.Reg64 } + }; + + VmExit::Debug { + dr6, + exception: exception.ExceptionType as u32, } } - WHV_RUN_VP_EXIT_REASON(_) => { - debug!( - "HyperV Unexpected Exit Details :#nReason {:#?}\n {:#?}", - exit_context.ExitReason, &self - ); - match self.get_exit_details(exit_context.ExitReason) { - Ok(error) => HyperlightExit::Unknown(error), - Err(e) => HyperlightExit::Unknown(format!("Error getting exit details: {}", e)), - } - } + WHV_RUN_VP_EXIT_REASON(_) => VmExit::Unknown(format!( + "Unknown exit reason '{}'", + exit_context.ExitReason.0 + )), }; - Ok(result) } @@ -754,15 +585,10 @@ impl Hypervisor for HypervWindowsDriver { self.processor.set_sregs(sregs) } - fn interrupt_handle(&self) -> Arc { + fn interrupt_handle(&self) -> Arc { self.interrupt_handle.clone() } - #[instrument(skip_all, parent = Span::current(), level = "Trace")] - fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor { - self as &mut dyn Hypervisor - } - #[cfg(crashdump)] fn crashdump_context(&self) -> Result> { if self.rt_cfg.guest_core_dump { @@ -920,6 +746,16 @@ impl Hypervisor for HypervWindowsDriver { // If the vCPU stopped because of any other reason except a crash, we can handle it // normally _ => { + // Temporary spot to remove hw breakpoints on exit + // TODO: remove in future PR + if stop_reason == VcpuStopReason::EntryPointBp { + #[allow(clippy::unwrap_used)] // we checked this above + self.debug + .as_mut() + .unwrap() + .remove_hw_breakpoint(&self.processor, self.entrypoint)?; + } + self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason)) .map_err(|e| { new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e) @@ -963,23 +799,45 @@ impl Hypervisor for HypervWindowsDriver { Ok(()) } - fn check_stack_guard(&self) -> Result { - if let Some(mgr) = self.mem_mgr.as_ref() { - mgr.check_stack_guard() - } else { - Err(new_error!("Memory manager is not initialized")) + #[cfg(gdb)] + fn gdb_connection(&self) -> Option<&DebugCommChannel> { + self.gdb_conn.as_ref() + } + + #[cfg(gdb)] + fn translate_gva(&self, gva: u64) -> Result { + use windows::Win32::System::Hypervisor::{ + WHV_TRANSLATE_GVA_RESULT, WHvTranslateGva, WHvTranslateGvaFlagValidateRead, + }; + + let partition_handle = self.processor.get_partition_hdl(); + let mut gpa = 0; + let mut result = WHV_TRANSLATE_GVA_RESULT::default(); + + unsafe { + WHvTranslateGva( + partition_handle, + 0, + gva, + // Only validate read access because the write access is handled through the + // host memory mapping + WHvTranslateGvaFlagValidateRead, + &mut result, + &mut gpa, + )?; } + + Ok(gpa) } #[cfg(feature = "trace_guest")] - fn handle_trace(&mut self, tc: &mut crate::sandbox::trace::TraceContext) -> Result<()> { + fn handle_trace( + &mut self, + tc: &mut 
crate::sandbox::trace::TraceContext, + mem_mgr: &mut SandboxMemoryManager, + ) -> Result<()> { let regs = self.regs()?; - tc.handle_trace( - ®s, - self.mem_mgr.as_mut().ok_or_else(|| { - new_error!("Memory manager is not initialized before handling trace") - })?, - ) + tc.handle_trace(®s, mem_mgr) } #[cfg(feature = "mem_profile")] @@ -990,86 +848,6 @@ impl Hypervisor for HypervWindowsDriver { impl Drop for HypervWindowsDriver { fn drop(&mut self) { - self.interrupt_handle.dropped.store(true, Ordering::Relaxed); - } -} - -#[derive(Debug)] -pub struct WindowsInterruptHandle { - /// Combined running flag (bit 63) and generation counter (bits 0-62). - /// - /// The generation increments with each guest function call to prevent - /// stale cancellations from affecting new calls (ABA problem). - /// - /// Layout: `[running:1 bit][generation:63 bits]` - running: AtomicU64, - - /// Combined cancel_requested flag (bit 63) and generation counter (bits 0-62). - /// - /// When kill() is called, this stores the current generation along with - /// the cancellation flag. The VCPU only honors the cancellation if the - /// generation matches its current generation. - /// - /// Layout: `[cancel_requested:1 bit][generation:63 bits]` - cancel_requested: AtomicU64, - - // This is used to signal the GDB thread to stop the vCPU - #[cfg(gdb)] - debug_interrupt: AtomicBool, - /// Flag indicating whether a guest function call is currently in progress. - /// - /// **true**: A guest function call is active (between call start and completion) - /// **false**: No guest function call is active - /// - /// # Purpose - /// - /// This flag prevents kill() from having any effect when called outside of a - /// guest function call. This solves the "kill-in-advance" problem where kill() - /// could be called before a guest function starts and would incorrectly cancel it. - call_active: AtomicBool, - partition_handle: WHV_PARTITION_HANDLE, - dropped: AtomicBool, -} - -impl InterruptHandle for WindowsInterruptHandle { - fn kill(&self) -> bool { - // Check if a call is actually active first - if !self.call_active.load(Ordering::Acquire) { - return false; - } - - // Get the current running state and generation - let (running, generation) = self.get_running_and_generation(); - - // Set cancel_requested with the current generation - self.set_cancel_requested(generation); - - // Only call WHvCancelRunVirtualProcessor if VCPU is actually running in guest mode - running && unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() } - } - - #[cfg(gdb)] - fn kill_from_debugger(&self) -> bool { - self.debug_interrupt.store(true, Ordering::Relaxed); - let (running, _) = self.get_running_and_generation(); - running && unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() } - } - - fn dropped(&self) -> bool { - self.dropped.load(Ordering::Relaxed) - } -} - -impl InterruptHandleInternal for WindowsInterruptHandle { - fn get_call_active(&self) -> &AtomicBool { - &self.call_active - } - - fn get_running(&self) -> &AtomicU64 { - &self.running - } - - fn get_cancel_requested(&self) -> &AtomicU64 { - &self.cancel_requested + self.interrupt_handle.set_dropped(); } } diff --git a/src/hyperlight_host/src/hypervisor/kvm.rs b/src/hyperlight_host/src/hypervisor/kvm.rs index 04b8ed60f..647b1338a 100644 --- a/src/hyperlight_host/src/hypervisor/kvm.rs +++ b/src/hyperlight_host/src/hypervisor/kvm.rs @@ -15,7 +15,7 @@ limitations under the License. 
*/ use std::fmt::Debug; -use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU64}; use std::sync::{Arc, Mutex}; use kvm_bindings::{kvm_fpu, kvm_regs, kvm_sregs, kvm_userspace_memory_region}; @@ -23,8 +23,6 @@ use kvm_ioctls::Cap::UserMemory; use kvm_ioctls::{Kvm, VcpuExit, VcpuFd, VmFd}; use log::LevelFilter; use tracing::{Span, instrument}; -#[cfg(feature = "trace_guest")] -use tracing_opentelemetry::OpenTelemetrySpanExt; #[cfg(crashdump)] use {super::crashdump, std::path::Path}; @@ -33,18 +31,17 @@ use super::gdb::{ DebugCommChannel, DebugMemoryAccess, DebugMsg, DebugResponse, GuestDebug, KvmDebug, VcpuStopReason, }; -use super::{HyperlightExit, Hypervisor, LinuxInterruptHandle, VirtualCPU}; +use super::{Hypervisor, LinuxInterruptHandle}; #[cfg(gdb)] use crate::HyperlightError; -use crate::hypervisor::get_memory_access_violation; use crate::hypervisor::regs::{CommonFpu, CommonRegisters}; -use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags}; +use crate::hypervisor::{InterruptHandle, InterruptHandleImpl, VmExit}; +use crate::mem::memory_region::MemoryRegion; use crate::mem::mgr::SandboxMemoryManager; use crate::mem::ptr::{GuestPtr, RawPtr}; use crate::mem::shared_mem::HostSharedMemory; use crate::sandbox::SandboxConfiguration; use crate::sandbox::host_funcs::FunctionRegistry; -use crate::sandbox::outb::handle_outb; #[cfg(feature = "mem_profile")] use crate::sandbox::trace::MemTraceInfo; #[cfg(crashdump)] @@ -75,11 +72,9 @@ pub(crate) fn is_hypervisor_present() -> bool { #[cfg(gdb)] mod debug { - use kvm_bindings::kvm_debug_exit_arch; - use super::KVMDriver; use crate::hypervisor::gdb::{ - DebugMemoryAccess, DebugMsg, DebugResponse, GuestDebug, KvmDebug, VcpuStopReason, + DebugMemoryAccess, DebugMsg, DebugResponse, GuestDebug, KvmDebug, }; use crate::{Result, new_error}; @@ -95,19 +90,6 @@ mod debug { Ok(()) } - /// Get the reason the vCPU has stopped - pub(crate) fn get_stop_reason( - &mut self, - debug_exit: kvm_debug_exit_arch, - ) -> Result { - let debug = self - .debug - .as_mut() - .ok_or_else(|| new_error!("Debug is not enabled"))?; - - debug.get_stop_reason(&self.vcpu_fd, debug_exit, self.entrypoint) - } - pub(crate) fn process_dbg_request( &mut self, req: DebugMsg, @@ -271,9 +253,7 @@ pub(crate) struct KVMDriver { vcpu_fd: VcpuFd, entrypoint: u64, orig_rsp: GuestPtr, - interrupt_handle: Arc, - mem_mgr: Option>, - host_funcs: Option>>, + interrupt_handle: Arc, sandbox_regions: Vec, // Initially mapped regions when sandbox is created mmap_regions: Vec<(MemoryRegion, u32)>, // Later mapped regions (region, slot number) @@ -332,10 +312,8 @@ impl KVMDriver { let rsp_gp = GuestPtr::try_from(RawPtr::from(rsp))?; - let interrupt_handle = Arc::new(LinuxInterruptHandle { + let interrupt_handle: Arc = Arc::new(LinuxInterruptHandle { running: AtomicU64::new(0), - cancel_requested: AtomicU64::new(0), - call_active: AtomicBool::new(false), #[cfg(gdb)] debug_interrupt: AtomicBool::new(false), #[cfg(all( @@ -353,8 +331,8 @@ impl KVMDriver { )))] tid: AtomicU64::new(unsafe { libc::pthread_self() }), retry_delay: config.get_interrupt_retry_delay(), - dropped: AtomicBool::new(false), sig_rt_min_offset: config.get_interrupt_vcpu_sigrtmin_offset(), + dropped: AtomicBool::new(false), }); let mut kvm = Self { @@ -369,8 +347,6 @@ impl KVMDriver { mmap_regions: Vec::new(), freed_slots: Vec::new(), interrupt_handle: interrupt_handle.clone(), - mem_mgr: None, - host_funcs: None, #[cfg(gdb)] debug, #[cfg(gdb)] @@ -432,13 +408,11 @@ impl Hypervisor 
for KVMDriver { peb_addr: RawPtr, seed: u64, page_size: u32, - mem_mgr: SandboxMemoryManager, + mut mem_mgr: SandboxMemoryManager, host_funcs: Arc>, max_guest_log_level: Option, #[cfg(gdb)] dbg_mem_access_fn: Arc>>, ) -> Result<()> { - self.mem_mgr = Some(mem_mgr); - self.host_funcs = Some(host_funcs); self.page_size = page_size as usize; let max_guest_log_level: u64 = match max_guest_log_level { @@ -460,10 +434,21 @@ impl Hypervisor for KVMDriver { }; self.set_regs(®s)?; - VirtualCPU::run( - self.as_mut_hypervisor(), + self.run( + self.entrypoint, + self.interrupt_handle.clone(), + &self.sandbox_regions.clone(), + &self + .mmap_regions + .iter() + .map(|(r, _)| r.clone()) + .collect::>(), + &mut mem_mgr, + host_funcs.clone(), #[cfg(gdb)] dbg_mem_access_fn, + #[cfg(crashdump)] + &self.rt_cfg.clone(), ) } @@ -530,6 +515,8 @@ impl Hypervisor for KVMDriver { fn dispatch_call_from_host( &mut self, dispatch_func_addr: RawPtr, + mem_mgr: &mut SandboxMemoryManager, + host_funcs: Arc>, #[cfg(gdb)] dbg_mem_access_fn: Arc>>, ) -> Result<()> { // Reset general purpose registers, then set RIP and RSP @@ -543,243 +530,50 @@ impl Hypervisor for KVMDriver { // reset fpu state self.set_fpu(&CommonFpu::default())?; - // run - VirtualCPU::run( - self.as_mut_hypervisor(), + self.run( + self.entrypoint, + self.interrupt_handle.clone(), + &self.sandbox_regions.clone(), + &self + .mmap_regions + .iter() + .map(|(r, _)| r.clone()) + .collect::>(), + mem_mgr, + host_funcs.clone(), #[cfg(gdb)] dbg_mem_access_fn, - )?; - - Ok(()) - } - - #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")] - fn handle_io( - &mut self, - port: u16, - data: Vec, - _rip: u64, - _instruction_length: u64, - ) -> Result<()> { - // KVM does not need RIP or instruction length, as it automatically sets the RIP - - // The payload param for the outb_handle_fn is the first byte - // of the data array cast to an u64. Thus, we need to make sure - // the data array has at least one u8, then convert that to an u64 - if data.is_empty() { - log_then_return!("no data was given in IO interrupt"); - } else { - let mut padded = [0u8; 4]; - let copy_len = data.len().min(4); - padded[..copy_len].copy_from_slice(&data[..copy_len]); - let value = u32::from_le_bytes(padded); - - #[cfg(feature = "mem_profile")] - { - // We need to handle the borrow checker issue where we need both: - // - &mut SandboxMemoryManager (from self.mem_mgr.as_mut()) - // - &mut dyn Hypervisor (from self) - // We'll use a temporary approach to extract the mem_mgr temporarily - let mem_mgr_option = self.mem_mgr.take(); - let mut mem_mgr = - mem_mgr_option.ok_or_else(|| new_error!("mem_mgr not initialized"))?; - let host_funcs = self - .host_funcs - .as_ref() - .ok_or_else(|| new_error!("host_funcs not initialized"))? - .clone(); - - handle_outb(&mut mem_mgr, host_funcs, self, port, value)?; - - // Put the mem_mgr back - self.mem_mgr = Some(mem_mgr); - } - - #[cfg(not(feature = "mem_profile"))] - { - let mem_mgr = self - .mem_mgr - .as_mut() - .ok_or_else(|| new_error!("mem_mgr not initialized"))?; - let host_funcs = self - .host_funcs - .as_ref() - .ok_or_else(|| new_error!("host_funcs not initialized"))? 
- .clone(); - - handle_outb(mem_mgr, host_funcs, port, value)?; - } - } - - Ok(()) + #[cfg(crashdump)] + &self.rt_cfg.clone(), + ) } #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")] - fn run( - &mut self, - #[cfg(feature = "trace_guest")] tc: &mut crate::sandbox::trace::TraceContext, - ) -> Result { - self.interrupt_handle - .tid - .store(unsafe { libc::pthread_self() as u64 }, Ordering::Release); - // Note: if `InterruptHandle::kill()` is called while this thread is **here** - // Cast to internal trait for access to internal methods - let interrupt_handle_internal = - self.interrupt_handle.as_ref() as &dyn super::InterruptHandleInternal; - - // (after set_running_bit but before checking cancel_requested): - // - kill() will stamp cancel_requested with the current generation - // - We will check cancel_requested below and skip the VcpuFd::run() call - // - This is the desired behavior - the kill takes effect immediately - let generation = interrupt_handle_internal.set_running_bit(); - - #[cfg(not(gdb))] - let debug_interrupt = false; - #[cfg(gdb)] - let debug_interrupt = self - .interrupt_handle - .debug_interrupt - .load(Ordering::Relaxed); - // Don't run the vcpu if `cancel_requested` is set for our generation - // - // Note: if `InterruptHandle::kill()` is called while this thread is **here** - // (after checking cancel_requested but before vcpu.run()): - // - kill() will stamp cancel_requested with the current generation - // - We will proceed with vcpu.run(), but signals will be sent to interrupt it - // - The vcpu will be interrupted and return EINTR (handled below) - let exit_reason = if interrupt_handle_internal - .is_cancel_requested_for_generation(generation) - || debug_interrupt - { - Err(kvm_ioctls::Error::new(libc::EINTR)) - } else { - #[cfg(feature = "trace_guest")] - tc.setup_guest_trace(Span::current().context()); - - // Note: if `InterruptHandle::kill()` is called while this thread is **here** - // (during vcpu.run() execution): - // - kill() stamps cancel_requested with the current generation - // - kill() sends signals (SIGRTMIN+offset) to this thread repeatedly - // - The signal handler is a no-op, but it causes vcpu.run() to return EINTR - // - We check cancel_requested below and return Cancelled if generation matches - self.vcpu_fd.run() - }; - // Note: if `InterruptHandle::kill()` is called while this thread is **here** - // (after vcpu.run() returns but before clear_running_bit): - // - kill() continues sending signals to this thread (running bit is still set) - // - The signals are harmless (no-op handler), we just need to check cancel_requested - // - We load cancel_requested below to determine if this run was cancelled - let cancel_requested = - interrupt_handle_internal.is_cancel_requested_for_generation(generation); - #[cfg(gdb)] - let debug_interrupt = self - .interrupt_handle - .debug_interrupt - .load(Ordering::Relaxed); - // Note: if `InterruptHandle::kill()` is called while this thread is **here** - // (after loading cancel_requested but before clear_running_bit): - // - kill() stamps cancel_requested with the CURRENT generation (not the one we just loaded) - // - kill() continues sending signals until running bit is cleared - // - The newly stamped cancel_requested will affect the NEXT vcpu.run() call - // - Signals sent now are harmless (no-op handler) - interrupt_handle_internal.clear_running_bit(); - // At this point, running bit is clear so kill() will stop sending signals. 
- // However, we may still receive delayed signals that were sent before clear_running_bit. - // These stale signals are harmless because: - // - The signal handler is a no-op - // - We check generation matching in cancel_requested before treating EINTR as cancellation - // - If generation doesn't match, we return Retry instead of Cancelled - let result = match exit_reason { - Ok(VcpuExit::Hlt) => { - crate::debug!("KVM - Halt Details : {:#?}", &self); - HyperlightExit::Halt() - } - Ok(VcpuExit::IoOut(port, data)) => { - // because vcpufd.run() mutably borrows self we cannot pass self to crate::debug! macro here - crate::debug!("KVM IO Details : \nPort : {}\nData : {:?}", port, data); - // KVM does not need to set RIP or instruction length so these are set to 0 - HyperlightExit::IoOut(port, data.to_vec(), 0, 0) - } - Ok(VcpuExit::MmioRead(addr, _)) => { - crate::debug!("KVM MMIO Read -Details: Address: {} \n {:#?}", addr, &self); - - match get_memory_access_violation( - addr as usize, - self.sandbox_regions - .iter() - .chain(self.mmap_regions.iter().map(|(r, _)| r)), - MemoryRegionFlags::READ, - ) { - Some(access_violation_exit) => access_violation_exit, - None => HyperlightExit::Mmio(addr), - } - } - Ok(VcpuExit::MmioWrite(addr, _)) => { - crate::debug!("KVM MMIO Write -Details: Address: {} \n {:#?}", addr, &self); - - match get_memory_access_violation( - addr as usize, - self.sandbox_regions - .iter() - .chain(self.mmap_regions.iter().map(|(r, _)| r)), - MemoryRegionFlags::WRITE, - ) { - Some(access_violation_exit) => access_violation_exit, - None => HyperlightExit::Mmio(addr), - } - } + fn run_vcpu(&mut self) -> Result { + match self.vcpu_fd.run() { + Ok(VcpuExit::Hlt) => Ok(VmExit::Halt()), + Ok(VcpuExit::IoOut(port, data)) => Ok(VmExit::IoOut(port, data.to_vec())), + Ok(VcpuExit::MmioRead(addr, _)) => Ok(VmExit::MmioRead(addr)), + Ok(VcpuExit::MmioWrite(addr, _)) => Ok(VmExit::MmioWrite(addr)), #[cfg(gdb)] - // KVM provides architecture specific information about the vCPU state when exiting - Ok(VcpuExit::Debug(debug_exit)) => match self.get_stop_reason(debug_exit) { - Ok(reason) => HyperlightExit::Debug(reason), - Err(e) => { - log_then_return!("Error getting stop reason: {:?}", e); - } - }, + Ok(VcpuExit::Debug(debug_exit)) => Ok(VmExit::Debug { + dr6: debug_exit.dr6, + exception: debug_exit.exception, + }), Err(e) => match e.errno() { - // We send a signal (SIGRTMIN+offset) to interrupt the vcpu, which causes EINTR - libc::EINTR => { - // Check if cancellation was requested for THIS specific generation. - // If not, the EINTR came from: - // - A debug interrupt (if GDB is enabled) - // - A stale signal from a previous guest call (generation mismatch) - // - A signal meant for a different sandbox on the same thread - // In these cases, we return Retry to continue execution. 
-                if cancel_requested {
-                    interrupt_handle_internal.clear_cancel_requested();
-                    HyperlightExit::Cancelled()
-                } else {
-                    #[cfg(gdb)]
-                    if debug_interrupt {
-                        self.interrupt_handle
-                            .debug_interrupt
-                            .store(false, Ordering::Relaxed);
-
-                        // If the vCPU was stopped because of an interrupt, we need to
-                        // return a special exit reason so that the gdb thread can handle it
-                        // and resume execution
-                        HyperlightExit::Debug(VcpuStopReason::Interrupt)
-                    } else {
-                        HyperlightExit::Retry()
-                    }
+                libc::EINTR => Ok(VmExit::Cancelled()),
+                libc::EAGAIN => Ok(VmExit::Retry()),
 
-                    #[cfg(not(gdb))]
-                    HyperlightExit::Retry()
-                }
-            }
-            libc::EAGAIN => HyperlightExit::Retry(),
-            _ => {
-                crate::debug!("KVM Error -Details: Address: {} \n {:#?}", e, &self);
-                log_then_return!("Error running VCPU {:?}", e);
-            }
+                other => Ok(VmExit::Unknown(format!(
+                    "Unknown KVM VCPU error: {}",
+                    other
+                ))),
             },
-            Ok(other) => {
-                let err_msg = format!("Unexpected KVM Exit {:?}", other);
-                crate::debug!("KVM Other Exit Details: {:#?}", &self);
-                HyperlightExit::Unknown(err_msg)
-            }
-        };
-        Ok(result)
+            Ok(other) => Ok(VmExit::Unknown(format!(
+                "Unknown KVM VCPU exit: {:?}",
+                other
+            ))),
+        }
     }
 
     fn regs(&self) -> Result<CommonRegisters> {
@@ -815,12 +609,7 @@ impl Hypervisor for KVMDriver {
         Ok(())
     }
 
-    #[instrument(skip_all, parent = Span::current(), level = "Trace")]
-    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor {
-        self as &mut dyn Hypervisor
-    }
-
-    fn interrupt_handle(&self) -> Arc<dyn InterruptHandleInternal> {
+    fn interrupt_handle(&self) -> Arc<dyn InterruptHandle> {
        self.interrupt_handle.clone()
     }
 
@@ -986,6 +775,16 @@ impl Hypervisor for KVMDriver {
             // If the vCPU stopped because of any other reason except a crash, we can handle it
             // normally
             _ => {
+                // Temporary spot to remove hw breakpoints on exit
+                // TODO: remove in future PR
+                if stop_reason == VcpuStopReason::EntryPointBp {
+                    #[allow(clippy::unwrap_used)] // we checked this above
+                    self.debug
+                        .as_mut()
+                        .unwrap()
+                        .remove_hw_breakpoint(&self.vcpu_fd, self.entrypoint)?;
+                }
+
                 // Send the stop reason to the gdb thread
                 self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
                     .map_err(|e| {
@@ -1030,23 +829,33 @@ impl Hypervisor for KVMDriver {
         Ok(())
     }
 
-    fn check_stack_guard(&self) -> Result<bool> {
-        if let Some(mgr) = self.mem_mgr.as_ref() {
-            mgr.check_stack_guard()
+    #[cfg(gdb)]
+    fn gdb_connection(&self) -> Option<&DebugCommChannel<DebugResponse, DebugMsg>> {
+        self.gdb_conn.as_ref()
+    }
+
+    #[cfg(gdb)]
+    fn translate_gva(&self, gva: u64) -> Result<u64> {
+        let tr = self
+            .vcpu_fd
+            .translate_gva(gva)
+            .map_err(|_| HyperlightError::TranslateGuestAddress(gva))?;
+
+        if tr.valid == 0 {
+            Err(HyperlightError::TranslateGuestAddress(gva))
         } else {
-            Err(new_error!("Memory manager is not initialized"))
+            Ok(tr.physical_address)
         }
     }
 
     #[cfg(feature = "trace_guest")]
-    fn handle_trace(&mut self, tc: &mut crate::sandbox::trace::TraceContext) -> Result<()> {
+    fn handle_trace(
+        &mut self,
+        tc: &mut crate::sandbox::trace::TraceContext,
+        mem_mgr: &mut SandboxMemoryManager<HostSharedMemory>,
+    ) -> Result<()> {
         let regs = self.regs()?;
-        tc.handle_trace(
-            &regs,
-            self.mem_mgr.as_mut().ok_or_else(|| {
-                new_error!("Memory manager is not initialized before handling trace")
-            })?,
-        )
+        tc.handle_trace(&regs, mem_mgr)
     }
 
     #[cfg(feature = "mem_profile")]
@@ -1057,6 +866,6 @@ impl Hypervisor for KVMDriver {
 
 impl Drop for KVMDriver {
     fn drop(&mut self) {
-        self.interrupt_handle.dropped.store(true, Ordering::Relaxed);
+        self.interrupt_handle.set_dropped();
     }
 }
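
Background for the EINTR arm above, which this patch relies on without showing the setup: hyperlight installs a no-op handler for the SIGRTMIN+offset signal elsewhere, so that pthread_kill() issued by kill() forces the blocking KVM_RUN ioctl to fail with EINTR, which run_vcpu() then reports as VmExit::Cancelled(). A rough, self-contained sketch of such a registration follows; the function names are illustrative, not the actual hyperlight code:

    extern "C" fn vcpu_kick_handler(_sig: libc::c_int) {
        // Intentionally empty: the signal's only job is to make the blocking
        // KVM_RUN ioctl return EINTR so run_vcpu() observes a cancellation.
    }

    fn install_kick_handler(sig_rt_min_offset: u8) -> std::io::Result<()> {
        let signum = libc::SIGRTMIN() + sig_rt_min_offset as libc::c_int;
        let mut act: libc::sigaction = unsafe { std::mem::zeroed() };
        act.sa_sigaction = vcpu_kick_handler as usize;
        // Deliberately leave SA_RESTART unset: with SA_RESTART the interrupted
        // ioctl would be restarted transparently and vcpu_fd.run() would never
        // return EINTR, defeating the cancellation mechanism.
        act.sa_flags = 0;
        if unsafe { libc::sigaction(signum, &act, std::ptr::null_mut()) } != 0 {
            return Err(std::io::Error::last_os_error());
        }
        Ok(())
    }
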
diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs
index f5575b989..cde789009 100644
--- a/src/hyperlight_host/src/hypervisor/mod.rs
+++ b/src/hyperlight_host/src/hypervisor/mod.rs
@@ -14,19 +14,22 @@ See the License for the specific language governing permissions and
 limitations under the License.
 */
 
-use log::{LevelFilter, debug};
-use tracing::{Span, instrument};
+use log::LevelFilter;
 
-use crate::HyperlightError::StackOverflow;
 use crate::error::HyperlightError::ExecutionCanceledByHost;
+#[cfg(gdb)]
+use crate::hypervisor::gdb::{DebugCommChannel, DebugMsg, DebugResponse, arch};
 use crate::hypervisor::regs::{
     CommonFpu, CommonRegisters, CommonSegmentRegister, CommonSpecialRegisters,
 };
-use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
+use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags, MemoryRegionType};
 use crate::metrics::METRIC_GUEST_CANCELLATION;
+use crate::sandbox::outb::handle_outb;
 #[cfg(feature = "mem_profile")]
 use crate::sandbox::trace::MemTraceInfo;
-use crate::{HyperlightError, Result, log_then_return};
+#[cfg(crashdump)]
+use crate::sandbox::uninitialized::SandboxRuntimeConfig;
+use crate::{HyperlightError, Result, log_then_return, new_error};
 
 /// HyperV-on-linux functionality
 #[cfg(mshv3)]
@@ -97,26 +100,31 @@ cfg_if::cfg_if! {
 
 /// These are the generic exit reasons that we can handle from a hypervisor; each
 /// hypervisor's run method is responsible for mapping the hypervisor-specific exit
 /// reasons to these generic ones
-pub enum HyperlightExit {
-    #[cfg(gdb)]
-    /// The vCPU has exited due to a debug event
-    Debug(VcpuStopReason),
+pub(crate) enum VmExit {
     /// The vCPU has halted
     Halt(),
     /// The vCPU has issued a write to the given port with the given value
-    IoOut(u16, Vec<u8>, u64, u64),
-    /// The vCPU has attempted to read or write from an unmapped address
-    Mmio(u64),
-    /// The vCPU tried to access memory but was missing the required permissions
-    AccessViolation(u64, MemoryRegionFlags, MemoryRegionFlags),
+    IoOut(u16, Vec<u8>),
+    /// The vCPU tried to read from the given (unmapped) addr
+    MmioRead(u64),
+    /// The vCPU tried to write to the given (unmapped) addr
+    MmioWrite(u64),
     /// The vCPU execution has been cancelled
     Cancelled(),
     /// The vCPU has exited for a reason that is not handled by Hyperlight
     Unknown(String),
-    /// The operation should be retried
-    /// On Linux this can happen where a call to run the CPU can return EAGAIN
-    /// On Windows the platform could cause a cancelation of the VM run
+    /// The operation should be retried, for example this can happen on Linux where a call to run the CPU can return EAGAIN
+    #[cfg_attr(
+        target_os = "windows",
+        expect(
+            dead_code,
+            reason = "Retry() is never constructed on Windows, but it is still matched on (which dead_code lint ignores)"
+        )
+    )]
     Retry(),
+    #[cfg(gdb)]
+    /// The vCPU has exited due to a debug event (usually breakpoint)
+    Debug { dr6: u64, exception: u32 },
 }
 
 /// A common set of hypervisor functionality
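
To make the new contract concrete: run_vcpu() reports a single raw exit and performs no handling itself; all policy (outb dispatch, MMIO classification, cancellation) lives in the shared run() loop defined in the trait below. A minimal, hypothetical backend could look like this (MockVm is an illustration, not part of the patch):

    struct MockVm {
        steps: u32,
    }

    impl MockVm {
        fn run_vcpu(&mut self) -> Result<VmExit> {
            self.steps += 1;
            match self.steps {
                // First exit: the guest "wrote" the little-endian bytes of 42u32 to port 0x100
                1 => Ok(VmExit::IoOut(0x100, 42u32.to_le_bytes().to_vec())),
                // Second exit: the guest halted
                _ => Ok(VmExit::Halt()),
            }
        }
    }
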
@@ -159,26 +167,264 @@ pub(crate) trait Hypervisor: Debug + Send {
     fn dispatch_call_from_host(
         &mut self,
         dispatch_func_addr: RawPtr,
+        mem_mgr: &mut SandboxMemoryManager<HostSharedMemory>,
+        host_funcs: Arc<Mutex<FunctionRegistry>>,
         #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<dyn DebugMemoryAccess>>,
     ) -> Result<()>;
 
-    /// Handle an IO exit from the internally stored vCPU.
-    fn handle_io(
-        &mut self,
-        port: u16,
-        data: Vec<u8>,
-        rip: u64,
-        instruction_length: u64,
-    ) -> Result<()>;
+    /// Run the vCPU "one step", until it exits
+    fn run_vcpu(&mut self) -> Result<VmExit>;
 
-    /// Run the vCPU
+    /// Run the vCPU while handling potential exits. Returns when the vCPU halts or an error occurs.
+    #[allow(clippy::too_many_arguments)]
     fn run(
         &mut self,
-        #[cfg(feature = "trace_guest")] tc: &mut crate::sandbox::trace::TraceContext,
-    ) -> Result<HyperlightExit>;
+        _entrypoint: u64,
+        interrupt_handle: Arc<dyn InterruptHandleImpl>,
+        sandbox_regions: &[MemoryRegion],
+        mmap_regions: &[MemoryRegion],
+        mem_mgr: &mut SandboxMemoryManager<HostSharedMemory>,
+        host_funcs: Arc<Mutex<FunctionRegistry>>,
+        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<dyn DebugMemoryAccess>>,
+        #[cfg(crashdump)] rt_cfg: &SandboxRuntimeConfig,
+    ) -> Result<()> {
+        // ===== KILL() TIMING POINT 1: Between guest function calls =====
+        // Clear any stale cancellation from a previous guest function call or if kill() was called too early.
+        // This ensures that kill() called BETWEEN different guest function calls doesn't affect the next call.
+        //
+        // If kill() was called and ran to completion BEFORE this line executes:
+        // - kill() has NO effect on this guest function call because CANCEL_BIT is cleared here.
+        // - NOTE: stale signals can still be delivered, but they will be ignored.
+        interrupt_handle.clear_cancel();
+
+        // Keeps the trace context and open spans
+        #[cfg(feature = "trace_guest")]
+        let mut tc = crate::sandbox::trace::TraceContext::new();
+
+        let result = loop {
+            // ===== KILL() TIMING POINT 2: Before set_tid() =====
+            // If kill() is called and ran to completion BEFORE this line executes:
+            // - CANCEL_BIT will be set and we will return an early VmExit::Cancelled()
+            interrupt_handle.set_tid();
+            interrupt_handle.set_running();
+
+            let exit_reason =
+                if interrupt_handle.is_cancelled() || interrupt_handle.is_debug_interrupted() {
+                    Ok(VmExit::Cancelled())
+                } else {
+                    #[cfg(feature = "trace_guest")]
+                    tc.setup_guest_trace(tracing_opentelemetry::OpenTelemetrySpanExt::context(
+                        &tracing::Span::current(),
+                    ));
 
-    /// Get InterruptHandle to underlying VM (returns internal trait)
-    fn interrupt_handle(&self) -> Arc<dyn InterruptHandleInternal>;
+                    // ===== KILL() TIMING POINT 3: Before calling run_vcpu() =====
+                    // If kill() is called and ran to completion BEFORE this line executes:
+                    // - CANCEL_BIT will be set, but it's too late to prevent entering the guest this iteration
+                    // - Signals will interrupt the guest (RUNNING_BIT=true), causing VmExit::Cancelled()
+                    // - If the guest completes before any signals arrive, kill() may have no effect
+                    // - If there are more iterations to do (IO/host func, etc.), the next iteration will be cancelled
+                    let exit_reason = self.run_vcpu();
+
+                    // End current host trace by closing the current span that captures traces
+                    // happening when a guest exits and re-enters.
+                    #[cfg(feature = "trace_guest")]
+                    tc.end_host_trace();
+
+                    // Handle the guest trace data if any
+                    #[cfg(feature = "trace_guest")]
+                    if let Err(e) = self.handle_trace(&mut tc, mem_mgr) {
+                        // If no trace data is available, we just log a message and continue
+                        // Is this the right thing to do?
+                        log::debug!("Error handling guest trace: {:?}", e);
+                    }
+                    exit_reason
+                };
+
+            // ===== KILL() TIMING POINT 4: Before capturing cancel_requested =====
+            // If kill() is called and ran to completion BEFORE this line executes:
+            // - CANCEL_BIT will be set
+            // - Signals may still be sent (RUNNING_BIT=true) but are harmless no-ops
+            // - kill() will have no effect on this iteration, but CANCEL_BIT will persist
+            // - If the loop continues (e.g., for a host call), the next iteration will be cancelled
+            // - Stale signals from before clear_running() may arrive and kick future iterations,
+            //   but will be filtered out by the cancel_requested check below (and retried).
+            let cancel_requested = interrupt_handle.is_cancelled();
+            let debug_interrupted = interrupt_handle.is_debug_interrupted();
+
+            // ===== KILL() TIMING POINT 5: Before calling clear_running() =====
+            // Same as point 4.
+            interrupt_handle.clear_running();
+
+            // ===== KILL() TIMING POINT 6: After calling clear_running() =====
+            // If kill() is called and ran to completion BEFORE this line executes:
+            // - CANCEL_BIT will be set but won't affect this iteration, it is never read below this comment
+            //   and cleared at next run() start
+            // - RUNNING_BIT=false, so no new signals will be sent
+            // - Stale signals from before clear_running() may arrive and kick future iterations,
+            //   but will be filtered out by the cancel_requested check below (and retried).
+            match exit_reason {
+                #[cfg(gdb)]
+                Ok(VmExit::Debug { dr6, exception }) => {
+                    let regs = self.regs()?;
+                    let rip_gva = self.translate_gva(regs.rip)?;
+                    // Handle debug event (breakpoints)
+                    let stop_reason = arch::vcpu_stop_reason(rip_gva, _entrypoint, dr6, exception)?;
+
+                    if stop_reason == VcpuStopReason::EntryPointBp {
+                        // TODO in next PR: make sure to remove hw breakpoint here
+                        // In case the hw breakpoint is the entry point, remove it to
+                        // avoid hanging here as gdb does not remove breakpoints it
+                        // has not set.
+                        // Gdb expects the target to be stopped when connected.
+                        // self.remove_hw_breakpoint(vcpu_fd, entrypoint)?;
+                    }
+                    if let Err(e) = self.handle_debug(dbg_mem_access_fn.clone(), stop_reason) {
+                        break Err(e);
+                    }
+                }
+
+                Ok(VmExit::Halt()) => {
+                    break Ok(());
+                }
+                Ok(VmExit::IoOut(port, data)) => {
+                    if data.is_empty() {
+                        log_then_return!("no data was given in IO interrupt");
+                    }
+
+                    #[allow(clippy::get_first)]
+                    let val = u32::from_le_bytes([
+                        data.get(0).copied().unwrap_or(0),
+                        data.get(1).copied().unwrap_or(0),
+                        data.get(2).copied().unwrap_or(0),
+                        data.get(3).copied().unwrap_or(0),
+                    ]);
+
+                    let regs = self.regs()?;
+
+                    #[cfg(feature = "mem_profile")]
+                    {
+                        let trace_info = self.trace_info_mut();
+                        handle_outb(mem_mgr, host_funcs.clone(), &regs, port, val, trace_info)?;
+                    }
+
+                    #[cfg(not(feature = "mem_profile"))]
+                    {
+                        handle_outb(mem_mgr, host_funcs.clone(), &regs, port, val)?;
+                    }
+                }
+                Ok(VmExit::MmioRead(addr)) => {
+                    let all_regions = sandbox_regions.iter().chain(mmap_regions.iter());
+                    match get_memory_access_violation(
+                        addr as usize,
+                        MemoryRegionFlags::READ,
+                        all_regions,
+                    ) {
+                        Some(MemoryAccess::StackGuardPageViolation) => {
+                            break Err(HyperlightError::StackOverflow());
+                        }
+                        Some(MemoryAccess::AccessViolation(region_flags)) => {
+                            break Err(HyperlightError::MemoryAccessViolation(
+                                addr,
+                                MemoryRegionFlags::READ,
+                                region_flags,
+                            ));
+                        }
+                        None => {
+                            if !mem_mgr.check_stack_guard()? {
+                                break Err(HyperlightError::StackOverflow());
+                            }
+
+                            break Err(new_error!("MMIO READ access address {:#x}", addr));
+                        }
+                    }
+                }
+                Ok(VmExit::MmioWrite(addr)) => {
+                    let all_regions = sandbox_regions.iter().chain(mmap_regions.iter());
+                    match get_memory_access_violation(
+                        addr as usize,
+                        MemoryRegionFlags::WRITE,
+                        all_regions,
+                    ) {
+                        Some(MemoryAccess::StackGuardPageViolation) => {
+                            break Err(HyperlightError::StackOverflow());
+                        }
+                        Some(MemoryAccess::AccessViolation(region_flags)) => {
+                            break Err(HyperlightError::MemoryAccessViolation(
+                                addr,
+                                MemoryRegionFlags::WRITE,
+                                region_flags,
+                            ));
+                        }
+                        None => {
+                            if !mem_mgr.check_stack_guard()? 
{
+                                break Err(HyperlightError::StackOverflow());
+                            }
+
+                            break Err(new_error!("MMIO WRITE access address {:#x}", addr));
+                        }
+                    }
+                }
+                Ok(VmExit::Cancelled()) => {
+                    // If cancellation was not requested for this specific guest function call,
+                    // the vcpu was interrupted by a stale cancellation from a previous call
+                    if !cancel_requested && !debug_interrupted {
+                        // treat this the same as a VmExit::Retry, the cancel was not meant for this call
+                        continue;
+                    }
+
+                    #[cfg(gdb)]
+                    if debug_interrupted {
+                        // If the vcpu was interrupted by a debugger, we need to handle it
+                        interrupt_handle.clear_debug_interrupt();
+                        if let Err(e) =
+                            self.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Interrupt)
+                        {
+                            break Err(e);
+                        }
+                    }
+
+                    metrics::counter!(METRIC_GUEST_CANCELLATION).increment(1);
+                    break Err(ExecutionCanceledByHost());
+                }
+                Ok(VmExit::Unknown(reason)) => {
+                    break Err(new_error!("Unexpected VM Exit: {:?}", reason));
+                }
+                Ok(VmExit::Retry()) => continue,
+                Err(e) => {
+                    break Err(e);
+                }
+            }
+        };
+
+        match result {
+            Ok(_) => Ok(()),
+            Err(HyperlightError::ExecutionCanceledByHost()) => {
+                // no need to crashdump this
+                Err(HyperlightError::ExecutionCanceledByHost())
+            }
+            Err(e) => {
+                #[cfg(crashdump)]
+                if rt_cfg.guest_core_dump {
+                    let ctx = self
+                        .crashdump_context()
+                        .map_err(|e| new_error!("Failed to get crashdump context: {:?}", e))?;
+                    crashdump::generate_crashdump(ctx)?;
+                }
+
+                // If GDB is enabled, we handle the debug memory access
+                // Disregard return value as we want to return the error
+                #[cfg(gdb)]
+                if self.gdb_connection().is_some() {
+                    self.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash)?;
+                }
+
+                log_then_return!(e);
+            }
+        }
+    }
+
+    /// Get InterruptHandle to underlying VM
+    fn interrupt_handle(&self) -> Arc<dyn InterruptHandle>;
 
     /// Get regs
     #[allow(dead_code)]
@@ -298,9 +544,6 @@ pub(crate) trait Hypervisor: Debug + Send {
         LevelFilter::from_str(level).unwrap_or(LevelFilter::Error) as u32
     }
 
-    /// get a mutable trait object from self
-    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor;
-
     #[cfg(crashdump)]
     fn crashdump_context(&self) -> Result<Option<crashdump::CrashDumpContext>>;
 
@@ -314,201 +557,93 @@ pub(crate) trait Hypervisor: Debug + Send {
         unimplemented!()
     }
 
-    /// Check stack guard to see if the stack is still valid
-    fn check_stack_guard(&self) -> Result<bool>;
+    #[cfg(gdb)]
+    fn gdb_connection(&self) -> Option<&DebugCommChannel<DebugResponse, DebugMsg>>;
+
+    /// Translates the guest virtual address to physical address
+    #[cfg(gdb)]
+    fn translate_gva(&self, gva: u64) -> crate::Result<u64>;
 
     #[cfg(feature = "trace_guest")]
-    fn handle_trace(&mut self, tc: &mut crate::sandbox::trace::TraceContext) -> Result<()>;
+    fn handle_trace(
+        &mut self,
+        tc: &mut crate::sandbox::trace::TraceContext,
+        mem_mgr: &mut SandboxMemoryManager<HostSharedMemory>,
+    ) -> Result<()>;
 
     /// Get a mutable reference of the trace info for the guest
     #[cfg(feature = "mem_profile")]
     fn trace_info_mut(&mut self) -> &mut MemTraceInfo;
 }
 
+/// The vCPU tried to access the given addr
+pub(crate) enum MemoryAccess {
+    /// The accessed region has the given flags
+    AccessViolation(MemoryRegionFlags),
+    /// The accessed region is a stack guard page
+    StackGuardPageViolation,
+}
+
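
A usage sketch for the new helper (the caller and region setup are assumed; only the match shape is the point): given a faulting guest physical address and the access that was attempted, the result distinguishes the three cases the run() loop above cares about:

    fn describe_fault(addr: u64, regions: &[MemoryRegion]) -> String {
        match get_memory_access_violation(addr as usize, MemoryRegionFlags::WRITE, regions.iter()) {
            // The address falls in a stack guard page: report a stack overflow
            Some(MemoryAccess::StackGuardPageViolation) => "stack overflow".to_string(),
            // The region exists but does not permit the attempted access
            Some(MemoryAccess::AccessViolation(flags)) => {
                format!("write to region with flags {:?}", flags)
            }
            // No containing region: a plain unmapped-MMIO exit
            None => format!("unmapped address {:#x}", addr),
        }
    }
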
-/// Returns a Some(HyperlightExit::AccessViolation(..)) if the given gpa doesn't have
-/// access its corresponding region. Returns None otherwise, or if the region is not found.
+/// Returns Some(MemoryAccess) if the given gpa doesn't have access to its
+/// corresponding region. Returns None otherwise, or if the region is not found.
 pub(crate) fn get_memory_access_violation<'a>(
     gpa: usize,
+    tried: MemoryRegionFlags,
     mut mem_regions: impl Iterator<Item = &'a MemoryRegion>,
-    access_info: MemoryRegionFlags,
-) -> Option<HyperlightExit> {
+) -> Option<MemoryAccess> {
     // find the region containing the given gpa
     let region = mem_regions.find(|region| region.guest_region.contains(&gpa));
 
-    if let Some(region) = region
-        && (!region.flags.contains(access_info)
-            || region.flags.contains(MemoryRegionFlags::STACK_GUARD))
-    {
-        return Some(HyperlightExit::AccessViolation(
-            gpa as u64,
-            access_info,
-            region.flags,
-        ));
+    if let Some(region) = region {
+        if region.region_type == MemoryRegionType::GuardPage {
+            return Some(MemoryAccess::StackGuardPageViolation);
+        } else if !region.flags.contains(tried) {
+            return Some(MemoryAccess::AccessViolation(region.flags));
+        }
     }
 
     None
 }
 
-/// A virtual CPU that can be run until an exit occurs
-pub struct VirtualCPU {}
-
-impl VirtualCPU {
-    /// Run the given hypervisor until a halt instruction is reached
-    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
-    pub(crate) fn run(
-        hv: &mut dyn Hypervisor,
-        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<dyn DebugMemoryAccess>>,
-    ) -> Result<()> {
-        // Keeps the trace context and open spans
-        #[cfg(feature = "trace_guest")]
-        let mut tc = crate::sandbox::trace::TraceContext::new();
-
-        loop {
-            #[cfg(feature = "trace_guest")]
-            let result = {
-                let result = hv.run(&mut tc);
-                // End current host trace by closing the current span that captures traces
-                // happening when a guest exits and re-enters.
-                tc.end_host_trace();
-
-                // Handle the guest trace data if any
-                if let Err(e) = hv.handle_trace(&mut tc) {
-                    // If no trace data is available, we just log a message and continue
-                    // Is this the right thing to do?
-                    log::debug!("Error handling guest trace: {:?}", e);
-                }
-
-                result
-            };
-            #[cfg(not(feature = "trace_guest"))]
-            let result = hv.run();
-
-            match result {
-                #[cfg(gdb)]
-                Ok(HyperlightExit::Debug(stop_reason)) => {
-                    if let Err(e) = hv.handle_debug(dbg_mem_access_fn.clone(), stop_reason) {
-                        log_then_return!(e);
-                    }
-                }
-
-                Ok(HyperlightExit::Halt()) => {
-                    break;
-                }
-                Ok(HyperlightExit::IoOut(port, data, rip, instruction_length)) => {
-                    hv.handle_io(port, data, rip, instruction_length)?
-                }
-                Ok(HyperlightExit::Mmio(addr)) => {
-                    #[cfg(crashdump)]
-                    crashdump::generate_crashdump(hv)?;
+/// A trait for platform-specific interrupt handle implementation details
+pub(crate) trait InterruptHandleImpl: InterruptHandle {
+    /// Set the thread ID for the vcpu thread (no-op on Windows)
+    fn set_tid(&self);
 
-                    if !hv.check_stack_guard()? 
{
-                    log_then_return!(StackOverflow());
-                }
+    /// Set the running state and increment the generation counter
+    fn set_running(&self);
 
-                Ok(HyperlightExit::AccessViolation(addr, tried, region_permission)) => {
-                    #[cfg(crashdump)]
-                    crashdump::generate_crashdump(hv)?;
+    /// Clear the running state
+    /// On Windows, this also clears the debug_interrupt flag
+    /// On Linux, this only clears the running bit
+    fn clear_running(&self);
 
-                    // If GDB is enabled, we handle the debug memory access
-                    // Disregard return value as we want to return the error
-                    #[cfg(gdb)]
-                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);
+    /// Mark the handle as dropped
+    fn set_dropped(&self);
 
-                    if region_permission.intersects(MemoryRegionFlags::STACK_GUARD) {
-                        return Err(HyperlightError::StackOverflow());
-                    }
-                    log_then_return!(HyperlightError::MemoryAccessViolation(
-                        addr,
-                        tried,
-                        region_permission
-                    ));
-                }
-                Ok(HyperlightExit::Cancelled()) => {
-                    // Shutdown is returned when the host has cancelled execution
-                    // After termination, the main thread will re-initialize the VM
-                    metrics::counter!(METRIC_GUEST_CANCELLATION).increment(1);
-                    log_then_return!(ExecutionCanceledByHost());
-                }
+    /// Check if cancellation was requested
+    fn is_cancelled(&self) -> bool;
 
-                Ok(HyperlightExit::Unknown(reason)) => {
-                    #[cfg(crashdump)]
-                    crashdump::generate_crashdump(hv)?;
-                    // If GDB is enabled, we handle the debug memory access
-                    // Disregard return value as we want to return the error
-                    #[cfg(gdb)]
-                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);
+    /// Clear the cancellation request flag
+    fn clear_cancel(&self);
 
-                    log_then_return!("Unexpected VM Exit {:?}", reason);
-                }
-                Ok(HyperlightExit::Retry()) => {
-                    debug!("[VCPU] Retry - continuing VM run loop");
-                    continue;
-                }
-                Err(e) => {
-                    #[cfg(crashdump)]
-                    crashdump::generate_crashdump(hv)?;
-                    // If GDB is enabled, we handle the debug memory access
-                    // Disregard return value as we want to return the error
-                    #[cfg(gdb)]
-                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);
+    /// Check if debug interrupt was requested (always returns false when gdb feature is disabled)
+    fn is_debug_interrupted(&self) -> bool;
 
-                    return Err(e);
-                }
-            }
-        }
+    /// Clear the debug interrupt request flag
+    #[cfg(gdb)]
+    fn clear_debug_interrupt(&self);
+}
 
-        Ok(())
-    }
-}
-
-/// A trait for handling interrupts to a sandbox's vcpu (public API)
-pub trait InterruptHandle: Debug + Send + Sync {
+/// A trait for handling interrupts to a sandbox's vcpu
+pub trait InterruptHandle: Send + Sync + Debug {
     /// Interrupt the corresponding sandbox from running.
     ///
-    /// This method attempts to cancel a currently executing guest function call by sending
-    /// a signal to the VCPU thread. It uses generation tracking and call_active flag to
-    /// ensure the interruption is safe and precise.
-    ///
-    /// # Behavior
-    ///
-    /// - **Guest function running**: If called while a guest function is executing (VCPU running
-    ///   or in a host function call), this stamps the current generation into cancel_requested
-    ///   and sends a signal to interrupt the VCPU. Returns `true`.
-    ///
-    /// - **No active call**: If called when no guest function call is in progress (call_active=false),
-    ///   this has no effect and returns `false`. 
This prevents "kill-in-advance" where kill()
-    ///   is called before a guest function starts.
-    ///
-    /// - **During host function**: If the guest call is currently executing a host function
-    ///   (VCPU not running but call_active=true), this stamps cancel_requested. When the
-    ///   host function returns and attempts to re-enter the guest, the cancellation will
-    ///   be detected and the call will abort. Returns `true`.
-    ///
-    /// # Generation Tracking
-    ///
-    /// The method stamps the current generation number along with the cancellation request.
-    /// This ensures that:
-    /// - Stale signals from previous calls are ignored (generation mismatch)
-    /// - Only the intended guest function call is affected
-    /// - Multiple rapid kill() calls on the same generation are idempotent
-    ///
-    /// # Blocking Behavior
-    ///
-    /// This function will block while attempting to deliver the signal to the VCPU thread,
-    /// retrying until either:
-    /// - The signal is successfully delivered (VCPU transitions from running to not running)
-    /// - The VCPU stops running for another reason (e.g., call completes normally)
-    ///
-    /// # Returns
-    ///
-    /// - `true`: Cancellation request was stamped (kill will take effect)
-    /// - `false`: No active call, cancellation request was not stamped (no effect)
+    /// - If this is called while the sandbox is currently executing a guest function call, it will interrupt the sandbox and return `true`.
+    /// - If this is called while the sandbox is not running (for example before or after calling a guest function), it will do nothing and return `false`.
     ///
     /// # Note
-    ///
-    /// To reliably interrupt a guest call, ensure `kill()` is called while the guest
-    /// function is actually executing. Calling kill() before call_guest_function() will
-    /// have no effect.
+    /// This function will block for as long as it takes for the vcpu thread to be interrupted.
     fn kill(&self) -> bool;
 
     /// Used by a debugger to interrupt the corresponding sandbox from running.
@@ -523,310 +658,113 @@
     #[cfg(gdb)]
     fn kill_from_debugger(&self) -> bool;
 
-    /// Check if the corresponding VM has been dropped.
+    /// Returns true if the corresponding sandbox has been dropped
     fn dropped(&self) -> bool;
 }
 
-/// Internal trait for interrupt handle implementation details (private, cross-platform).
-///
-/// This trait contains all the internal atomics access methods and helper functions
-/// that are shared between Linux and Windows implementations. It extends InterruptHandle
-/// to inherit the public API.
-///
-/// This trait should NOT be used outside of hypervisor implementations.
-pub(crate) trait InterruptHandleInternal: InterruptHandle {
-    /// Returns the call_active atomic bool reference for internal implementations.
-    fn get_call_active(&self) -> &AtomicBool;
-
-    /// Returns the running atomic u64 reference for internal implementations.
-    fn get_running(&self) -> &AtomicU64;
-
-    /// Returns the cancel_requested atomic u64 reference for internal implementations.
-    fn get_cancel_requested(&self) -> &AtomicU64;
-
-    /// Set call_active - increments generation and sets flag.
-    ///
-    /// Increments the generation counter and sets the call_active flag to true,
-    /// indicating that a guest function call is now in progress. This allows
-    /// kill() to stamp cancel_requested with the correct generation.
-    ///
-    /// Must be called at the start of call_guest_function_by_name_no_reset(),
-    /// before any VCPU execution begins.
- /// - /// Returns true if call_active was already set (indicating a guard already exists), - /// false otherwise. - fn set_call_active(&self) -> bool { - self.increment_generation(); - self.get_call_active().swap(true, Ordering::AcqRel) - } - - /// Clear call_active - clears the call_active flag. - /// - /// Clears the call_active flag, indicating that no guest function call is - /// in progress. After this, kill() will have no effect and will return false. - /// - /// Must be called at the end of call_guest_function_by_name_no_reset(), - /// after the guest call has fully completed (whether successfully or with error). - fn clear_call_active(&self) { - self.get_call_active().store(false, Ordering::Release) - } - - /// Set cancel_requested to true with the given generation. - /// - /// This stamps the cancellation request with the current generation number, - /// ensuring that only the VCPU running with this exact generation will honor - /// the cancellation. - fn set_cancel_requested(&self, generation: u64) { - const CANCEL_REQUESTED_BIT: u64 = 1 << 63; - const MAX_GENERATION: u64 = CANCEL_REQUESTED_BIT - 1; - let value = CANCEL_REQUESTED_BIT | (generation & MAX_GENERATION); - self.get_cancel_requested().store(value, Ordering::Release); - } - - /// Clear cancel_requested (reset to no cancellation). - /// - /// This is called after a cancellation has been processed to reset the - /// cancellation flag for the next guest call. - fn clear_cancel_requested(&self) { - self.get_cancel_requested().store(0, Ordering::Release); - } - - /// Check if cancel_requested is set for the given generation. - /// - /// Returns true only if BOTH: - /// - The cancellation flag is set - /// - The stored generation matches the provided generation - /// - /// This prevents stale cancellations from affecting new guest calls. - fn is_cancel_requested_for_generation(&self, generation: u64) -> bool { - const CANCEL_REQUESTED_BIT: u64 = 1 << 63; - const MAX_GENERATION: u64 = CANCEL_REQUESTED_BIT - 1; - let raw = self.get_cancel_requested().load(Ordering::Acquire); - let is_set = raw & CANCEL_REQUESTED_BIT != 0; - let stored_generation = raw & MAX_GENERATION; - is_set && stored_generation == generation - } - - /// Set running bit to true, return current generation. - /// - /// This is called when the VCPU is about to enter guest mode. It atomically - /// sets the running flag while preserving the generation counter. - fn set_running_bit(&self) -> u64 { - const RUNNING_BIT: u64 = 1 << 63; - self.get_running() - .fetch_update(Ordering::Release, Ordering::Acquire, |raw| { - Some(raw | RUNNING_BIT) - }) - .map(|raw| raw & !RUNNING_BIT) // Return the current generation - .unwrap_or(0) - } - - /// Increment the generation for a new guest function call. - /// - /// The generation counter wraps around at MAX_GENERATION (2^63 - 1). - /// This is called at the start of each new guest function call to provide - /// a unique identifier that prevents ABA problems with stale cancellations. - /// - /// Returns the NEW generation number (after incrementing). 
- fn increment_generation(&self) -> u64 { - const RUNNING_BIT: u64 = 1 << 63; - const MAX_GENERATION: u64 = RUNNING_BIT - 1; - self.get_running() - .fetch_update(Ordering::Release, Ordering::Acquire, |raw| { - let current_generation = raw & !RUNNING_BIT; - let running_bit = raw & RUNNING_BIT; - if current_generation == MAX_GENERATION { - // Restart generation from 0 - return Some(running_bit); - } - Some((current_generation + 1) | running_bit) - }) - .map(|raw| (raw & !RUNNING_BIT) + 1) // Return the NEW generation - .unwrap_or(1) // If wrapped, return 1 - } - - /// Get the current running state and generation counter. - /// - /// Returns a tuple of (running, generation) where: - /// - running: true if VCPU is currently in guest mode - /// - generation: current generation counter value - fn get_running_and_generation(&self) -> (bool, u64) { - const RUNNING_BIT: u64 = 1 << 63; - let raw = self.get_running().load(Ordering::Acquire); - let running = raw & RUNNING_BIT != 0; - let generation = raw & !RUNNING_BIT; - (running, generation) - } - - /// Clear the running bit and return the old value. - /// - /// This is called when the VCPU exits from guest mode back to host mode. - /// The return value (which includes the generation and the old running bit) - /// is currently unused by all callers. - fn clear_running_bit(&self) -> u64 { - const RUNNING_BIT: u64 = 1 << 63; - self.get_running() - .fetch_and(!RUNNING_BIT, Ordering::Release) - } -} - #[cfg(any(kvm, mshv3))] #[derive(Debug)] pub(super) struct LinuxInterruptHandle { - /// Atomic flag combining running state and generation counter. - /// - /// **Bit 63**: VCPU running state (1 = running, 0 = not running) - /// **Bits 0-62**: Generation counter (incremented once per guest function call) - /// - /// # Generation Tracking - /// - /// The generation counter is incremented once at the start of each guest function call - /// and remains constant throughout that call, even if the VCPU is run multiple times - /// (due to host function calls, retries, etc.). This design solves the race condition - /// where a kill() from a previous call could spuriously cancel a new call. - /// - /// ## Why Generations Are Needed - /// - /// Consider this scenario WITHOUT generation tracking: - /// 1. Thread A starts guest call 1, VCPU runs - /// 2. Thread B calls kill(), sends signal to Thread A - /// 3. Guest call 1 completes before signal arrives - /// 4. Thread A starts guest call 2, VCPU runs again - /// 5. Stale signal from step 2 arrives and incorrectly cancels call 2 - /// - /// WITH generation tracking: - /// 1. Thread A starts guest call 1 (generation N), VCPU runs - /// 2. Thread B calls kill(), stamps cancel_requested with generation N - /// 3. Guest call 1 completes, signal may or may not have arrived yet - /// 4. Thread A starts guest call 2 (generation N+1), VCPU runs again - /// 5. If stale signal arrives, signal handler checks: cancel_requested.generation (N) != current generation (N+1) - /// 6. Stale signal is ignored, call 2 continues normally - /// - /// ## Per-Call vs Per-Run Generation + /// Atomic value packing vcpu execution state. 
/// - /// It's critical that generation is incremented per GUEST FUNCTION CALL, not per vcpu.run(): - /// - A single guest function call may invoke vcpu.run() multiple times (host calls, retries) - /// - All run() calls within the same guest call must share the same generation - /// - This ensures kill() affects the entire guest function call atomically + /// Bit layout: + /// - Bit 63: RUNNING_BIT - set when vcpu is actively running + /// - Bit 62: CANCEL_BIT - set when cancellation has been requested + /// - Bits 61-0: generation counter - tracks vcpu run iterations to prevent ABA problem /// - /// # Invariants - /// - /// - If VCPU is running: bit 63 is set (neither converse nor inverse holds) - /// - If VCPU is running: bits 0-62 match the current guest call's generation + /// CANCEL_BIT persists across vcpu exits/re-entries within a single `HyperlightVm::run()` call + /// (e.g., during host function calls), but is cleared at the start of each new `HyperlightVm::run()` call. running: AtomicU64, - /// Thread ID where the VCPU is currently running. - /// - /// # Invariants + /// Thread ID where the vcpu is running. /// - /// - If VCPU is running: tid contains the thread ID of the executing thread - /// - Multiple VMs may share the same tid, but at most one will have running=true + /// Note: Multiple VMs may have the same `tid` (same thread runs multiple sandboxes sequentially), + /// but at most one VM will have RUNNING_BIT set at any given time. tid: AtomicU64, - /// Generation-aware cancellation request flag. - /// - /// **Bit 63**: Cancellation requested flag (1 = kill requested, 0 = no kill) - /// **Bits 0-62**: Generation number when cancellation was requested - /// - /// # Purpose - /// - /// This flag serves three critical functions: - /// - /// 1. **Prevent stale signals**: A VCPU may only be interrupted if cancel_requested - /// is set AND the generation matches the current call's generation - /// - /// 2. **Handle host function calls**: If kill() is called while a host function is - /// executing (VCPU not running but call is active), cancel_requested is stamped - /// with the current generation. When the host function returns and the VCPU - /// attempts to re-enter the guest, it will see the cancellation and abort. - /// - /// 3. **Detect stale kills**: If cancel_requested.generation doesn't match the - /// current generation, it's from a previous call and should be ignored - /// - /// # States and Transitions - /// - /// - **No cancellation**: cancel_requested = 0 (bit 63 clear) - /// - **Cancellation for generation N**: cancel_requested = (1 << 63) | N - /// - Signal handler checks: (cancel_requested & 0x7FFFFFFFFFFFFFFF) == current_generation - cancel_requested: AtomicU64, - - /// Flag indicating whether a guest function call is currently in progress. - /// - /// **true**: A guest function call is active (between call start and completion) - /// **false**: No guest function call is active - /// - /// # Purpose - /// - /// This flag prevents kill() from having any effect when called outside of a - /// guest function call. This solves the "kill-in-advance" problem where kill() - /// could be called before a guest function starts and would incorrectly cancel it. 
-    ///
-    /// # Behavior
-    ///
-    /// - Set to true at the start of call_guest_function_by_name_no_reset()
-    /// - Cleared at the end of call_guest_function_by_name_no_reset()
-    /// - kill() only stamps cancel_requested if call_active is true
-    /// - If kill() is called when call_active=false, it returns false and has no effect
-    ///
-    /// # Why AtomicBool is Safe
-    ///
-    /// Although there's a theoretical race where:
-    /// 1. Thread A checks call_active (false)
-    /// 2. Thread B sets call_active (true) and starts guest call
-    /// 3. Thread A's kill() returns false (no effect)
-    ///
-    /// This is acceptable because the generation tracking provides an additional
-    /// safety layer. Even if a stale kill somehow stamped cancel_requested, the
-    /// generation mismatch would cause it to be ignored.
-    call_active: AtomicBool,
-
-    /// Debugger interrupt request flag (GDB only).
-    ///
-    /// Set when kill_from_debugger() is called, cleared when VCPU stops running.
-    /// Used to distinguish debugger interrupts from normal kill() interrupts.
+    /// Debugger interrupt flag (gdb feature only).
+    /// Set when `kill_from_debugger()` is called, cleared when vcpu stops running.
     #[cfg(gdb)]
     debug_interrupt: AtomicBool,
 
     /// Whether the corresponding VM has been dropped.
     dropped: AtomicBool,
 
-    /// Delay between retry attempts when sending signals to the VCPU thread.
+    /// Delay between retry attempts when sending signals to interrupt the vcpu.
     retry_delay: Duration,
 
-    /// Offset from SIGRTMIN for the signal used to interrupt the VCPU thread.
+    /// Offset from SIGRTMIN for the signal used to interrupt the vcpu thread.
     sig_rt_min_offset: u8,
 }
 
 #[cfg(any(kvm, mshv3))]
 impl LinuxInterruptHandle {
-    fn send_signal(&self, stamp_generation: bool) -> bool {
+    const RUNNING_BIT: u64 = 1 << 63;
+    const CANCEL_BIT: u64 = 1 << 62;
+    const MAX_GENERATION: u64 = (1 << 62) - 1;
+
+    /// Sets the RUNNING_BIT and increments the generation counter.
+    ///
+    /// # Preserves
+    /// - CANCEL_BIT: The current value of CANCEL_BIT is preserved
+    ///
+    /// # Invariants Maintained
+    /// - Generation increments by 1 (wraps to 0 at MAX_GENERATION)
+    /// - RUNNING_BIT is set
+    /// - CANCEL_BIT remains unchanged
+    ///
+    /// # Memory Ordering
+    /// Uses `Release` ordering to ensure that the `tid` store (which uses `Release`)
+    /// is visible to any thread that observes RUNNING_BIT=true via `Acquire` ordering.
+    /// This prevents the interrupt thread from reading a stale `tid` value.
+    #[expect(clippy::expect_used)]
+    fn set_running_and_increment_generation(&self) -> u64 {
+        self.running
+            .fetch_update(Ordering::Release, Ordering::Relaxed, |raw| {
+                let cancel_bit = raw & Self::CANCEL_BIT; // Preserve CANCEL_BIT
+                let generation = raw & Self::MAX_GENERATION;
+                let new_generation = if generation == Self::MAX_GENERATION {
+                    // restart generation from 0
+                    0
+                } else {
+                    generation + 1
+                };
+                // Set RUNNING_BIT, preserve CANCEL_BIT, increment generation
+                Some(Self::RUNNING_BIT | cancel_bit | new_generation)
+            })
+            .expect("Should never fail since we always return Some")
+    }
+
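The packing can be sanity-checked in isolation. This standalone snippet mirrors the constants above and shows how a raw `running` value decomposes (illustrative only, not part of the patch):

    const RUNNING_BIT: u64 = 1 << 63;
    const CANCEL_BIT: u64 = 1 << 62;
    const MAX_GENERATION: u64 = (1 << 62) - 1;

    // Returns (running, cancel_requested, generation)
    fn unpack(raw: u64) -> (bool, bool, u64) {
        (raw & RUNNING_BIT != 0, raw & CANCEL_BIT != 0, raw & MAX_GENERATION)
    }

    fn main() {
        let raw = RUNNING_BIT | CANCEL_BIT | 7;
        assert_eq!(unpack(raw), (true, true, 7));
        // The generation wraps to 0 once it reaches MAX_GENERATION
        assert_eq!((MAX_GENERATION + 1) & MAX_GENERATION, 0);
    }
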
+    /// Get the current running and cancel bits, together with the generation counter.
+    ///
+    /// # Memory Ordering
+    /// Uses `Acquire` ordering to synchronize with the `Release` in `set_running_and_increment_generation()`.
+    /// This ensures that when we observe RUNNING_BIT=true, we also see the correct `tid` value.
+    fn get_running_cancel_and_generation(&self) -> (bool, bool, u64) {
+        let raw = self.running.load(Ordering::Acquire);
+        let running = raw & Self::RUNNING_BIT != 0;
+        let cancel = raw & Self::CANCEL_BIT != 0;
+        let generation = raw & Self::MAX_GENERATION;
+        (running, cancel, generation)
+    }
+
+    fn send_signal(&self) -> bool {
         let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
         let mut sent_signal = false;
         let mut target_generation: Option<u64> = None;
 
         loop {
-            if !self.call_active.load(Ordering::Acquire) {
-                // No active call, so no need to send signal
-                break;
-            }
+            let (running, cancel, generation) = self.get_running_cancel_and_generation();
 
-            let (running, generation) = self.get_running_and_generation();
-
-            // Stamp generation into cancel_requested if requested and this is the first iteration
-            // We stamp even when running=false to support killing during host function calls
-            // The generation tracking will prevent stale kills from affecting new calls
-            // Only stamp if a call is actually active (call_active=true)
-            if stamp_generation
-                && target_generation.is_none()
-                && self.call_active.load(Ordering::Acquire)
-            {
-                self.set_cancel_requested(generation);
-                target_generation = Some(generation);
-            }
+            // Check if we should continue sending signals
+            // Exit if not running OR if neither cancel nor debug_interrupt is set
+            #[cfg(gdb)]
+            let should_continue =
+                running && (cancel || self.debug_interrupt.load(Ordering::Relaxed));
+            #[cfg(not(gdb))]
+            let should_continue = running && cancel;
 
-            // If not running, we've stamped the generation (if requested), so we're done
-            // This handles the host function call scenario
-            if !running {
+            if !should_continue {
                 break;
             }
 
@@ -839,6 +777,8 @@ impl LinuxInterruptHandle {
             log::info!("Sending signal to kill vcpu thread...");
             sent_signal = true;
+            // Acquire ordering to synchronize with the Release store in set_tid()
+            // This ensures we see the correct tid value for the currently running vcpu
             unsafe {
                 libc::pthread_kill(self.tid.load(Ordering::Acquire) as _, signal_number);
             }
@@ -849,42 +789,189 @@ impl LinuxInterruptHandle {
     }
 }
 
+#[cfg(any(kvm, mshv3))]
+impl InterruptHandleImpl for LinuxInterruptHandle {
+    fn set_tid(&self) {
+        // Release ordering to synchronize with the Acquire load of `running` in send_signal()
+        // This ensures that when send_signal() observes RUNNING_BIT=true (via Acquire),
+        // it also sees the correct tid value stored here
+        self.tid
+            .store(unsafe { libc::pthread_self() as u64 }, Ordering::Release);
+    }
+
+    fn set_running(&self) {
+        self.set_running_and_increment_generation();
+    }
+
+    fn is_cancelled(&self) -> bool {
+        // Acquire ordering to synchronize with the Release in kill()
+        // This ensures we see the CANCEL_BIT set by the interrupt thread
+        self.running.load(Ordering::Acquire) & Self::CANCEL_BIT != 0
+    }
+
+    fn clear_cancel(&self) {
+        // Relaxed is sufficient here - we're the only thread that clears this bit
+        // at the start of run(), and there's no data race on the clear operation itself
+        self.running.fetch_and(!Self::CANCEL_BIT, Ordering::Relaxed);
+    }
+
+    fn clear_running(&self) {
+        // Release ordering to ensure all vcpu operations are visible before clearing RUNNING_BIT
+        self.running
+            .fetch_and(!Self::RUNNING_BIT, Ordering::Release);
+    }
+
+    fn is_debug_interrupted(&self) -> bool {
+        #[cfg(gdb)]
+        {
+            self.debug_interrupt.load(Ordering::Relaxed)
+        }
+        #[cfg(not(gdb))]
+        {
+            false
+        }
+    }
+
+    #[cfg(gdb)]
+    fn clear_debug_interrupt(&self) {
+        self.debug_interrupt.store(false, 
Ordering::Relaxed); + } + + fn set_dropped(&self) { + self.dropped.store(true, Ordering::Relaxed); + } +} + #[cfg(any(kvm, mshv3))] impl InterruptHandle for LinuxInterruptHandle { fn kill(&self) -> bool { - if !(self.call_active.load(Ordering::Acquire)) { - // No active call, so no effect - return false; - } + // Release ordering ensures that any writes before kill() are visible to the vcpu thread + // when it checks is_cancelled() with Acquire ordering + self.running.fetch_or(Self::CANCEL_BIT, Ordering::Release); - // send_signal will stamp the generation into cancel_requested - // right before sending each signal, ensuring they're always in sync - self.send_signal(true) + // Send signals to interrupt the vcpu if it's currently running + self.send_signal() } #[cfg(gdb)] fn kill_from_debugger(&self) -> bool { self.debug_interrupt.store(true, Ordering::Relaxed); - self.send_signal(false) + self.send_signal() } - fn dropped(&self) -> bool { self.dropped.load(Ordering::Relaxed) } } -#[cfg(any(kvm, mshv3))] -impl InterruptHandleInternal for LinuxInterruptHandle { - fn get_call_active(&self) -> &AtomicBool { - &self.call_active +#[cfg(target_os = "windows")] +#[derive(Debug)] +pub(super) struct WindowsInterruptHandle { + /// Atomic value packing vcpu execution state. + /// + /// Bit layout: + /// - Bit 1: RUNNING_BIT - set when vcpu is actively running + /// - Bit 0: CANCEL_BIT - set when cancellation has been requested + /// + /// `WHvCancelRunVirtualProcessor()` will return Ok even if the vcpu is not running, + /// which is why we need the RUNNING_BIT. + /// + /// CANCEL_BIT persists across vcpu exits/re-entries within a single `HyperlightVm::run()` call + /// (e.g., during host function calls), but is cleared at the start of each new `HyperlightVm::run()` call. 
+    state: AtomicU64,
+
+    // This is used to signal the GDB thread to stop the vCPU
+    #[cfg(gdb)]
+    debug_interrupt: AtomicBool,
+    partition_handle: windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE,
+    dropped: AtomicBool,
+}
+
+#[cfg(target_os = "windows")]
+impl WindowsInterruptHandle {
+    const RUNNING_BIT: u64 = 1 << 1;
+    const CANCEL_BIT: u64 = 1 << 0;
+}
+
+#[cfg(target_os = "windows")]
+impl InterruptHandleImpl for WindowsInterruptHandle {
+    fn set_tid(&self) {
+        // No-op on Windows - we don't need to track thread ID
+    }
+
+    fn set_running(&self) {
+        // Release ordering to ensure prior memory operations are visible when another thread observes running=true
+        self.state.fetch_or(Self::RUNNING_BIT, Ordering::Release);
+    }
+
+    fn is_cancelled(&self) -> bool {
+        // Acquire ordering to synchronize with the Release in kill()
+        // This ensures we see the CANCEL_BIT set by the interrupt thread
+        self.state.load(Ordering::Acquire) & Self::CANCEL_BIT != 0
+    }
+
+    fn clear_cancel(&self) {
+        // Relaxed is sufficient here - we're the only thread that clears this bit
+        // at the start of run(), and there's no data race on the clear operation itself
+        self.state.fetch_and(!Self::CANCEL_BIT, Ordering::Relaxed);
+    }
+
+    fn clear_running(&self) {
+        // Release ordering to ensure all vcpu operations are visible before clearing running
+        self.state.fetch_and(!Self::RUNNING_BIT, Ordering::Release);
+        #[cfg(gdb)]
+        self.debug_interrupt.store(false, Ordering::Relaxed);
+    }
+
+    fn is_debug_interrupted(&self) -> bool {
+        #[cfg(gdb)]
+        {
+            self.debug_interrupt.load(Ordering::Relaxed)
+        }
+        #[cfg(not(gdb))]
+        {
+            false
+        }
+    }
+
+    #[cfg(gdb)]
+    fn clear_debug_interrupt(&self) {
+        self.debug_interrupt.store(false, Ordering::Relaxed);
+    }
+
+    fn set_dropped(&self) {
+        self.dropped.store(true, Ordering::Relaxed);
+    }
+}
+
+#[cfg(target_os = "windows")]
+impl InterruptHandle for WindowsInterruptHandle {
+    fn kill(&self) -> bool {
+        use windows::Win32::System::Hypervisor::WHvCancelRunVirtualProcessor;
+
+        // Release ordering ensures that any writes before kill() are visible to the vcpu thread
+        // when it checks is_cancelled() with Acquire ordering
+        self.state.fetch_or(Self::CANCEL_BIT, Ordering::Release);
+
+        // Acquire ordering to synchronize with the Release in set_running()
+        // This ensures we see the running state set by the vcpu thread
+        let state = self.state.load(Ordering::Acquire);
+        (state & Self::RUNNING_BIT != 0)
+            && unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() }
+    }
+
+    #[cfg(gdb)]
+    fn kill_from_debugger(&self) -> bool {
+        use windows::Win32::System::Hypervisor::WHvCancelRunVirtualProcessor;
+
+        self.debug_interrupt.store(true, Ordering::Relaxed);
+        // Acquire ordering to synchronize with the Release in set_running()
+        let state = self.state.load(Ordering::Acquire);
+        (state & Self::RUNNING_BIT != 0)
+            && unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() }
+    }
+
+    fn dropped(&self) -> bool {
+        self.dropped.load(Ordering::Relaxed)
+    }
+}
diff --git a/src/hyperlight_host/src/hypervisor/windows_hypervisor_platform.rs b/src/hyperlight_host/src/hypervisor/windows_hypervisor_platform.rs
index 4f4e82555..750d37d56 100644
--- a/src/hyperlight_host/src/hypervisor/windows_hypervisor_platform.rs
+++ b/src/hyperlight_host/src/hypervisor/windows_hypervisor_platform.rs
@@ -520,56 +520,6 @@ impl VMProcessor {
self.set_registers(®isters) } - - #[cfg(gdb)] - pub(super) fn get_debug_regs(&self) -> Result { - const LEN: usize = 6; - - let names: [WHV_REGISTER_NAME; LEN] = [ - WHvX64RegisterDr0, - WHvX64RegisterDr1, - WHvX64RegisterDr2, - WHvX64RegisterDr3, - WHvX64RegisterDr6, - WHvX64RegisterDr7, - ]; - - let mut out: [Align16; LEN] = unsafe { std::mem::zeroed() }; - unsafe { - WHvGetVirtualProcessorRegisters( - self.get_partition_hdl(), - 0, - names.as_ptr(), - LEN as u32, - out.as_mut_ptr() as *mut WHV_REGISTER_VALUE, - )?; - Ok(WHvDebugRegisters { - dr0: out[0].0.Reg64, - dr1: out[1].0.Reg64, - dr2: out[2].0.Reg64, - dr3: out[3].0.Reg64, - dr6: out[4].0.Reg64, - dr7: out[5].0.Reg64, - }) - } - } - - #[instrument(err(Debug), skip_all, parent = Span::current(), level= "Trace")] - pub(super) fn run(&mut self) -> Result { - let partition_handle = self.get_partition_hdl(); - let mut exit_context: WHV_RUN_VP_EXIT_CONTEXT = Default::default(); - - unsafe { - WHvRunVirtualProcessor( - partition_handle, - 0, - &mut exit_context as *mut _ as *mut c_void, - std::mem::size_of::() as u32, - )?; - } - - Ok(exit_context) - } } impl Drop for VMProcessor { diff --git a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs index 2f83b1f5c..bc81ce172 100644 --- a/src/hyperlight_host/src/sandbox/initialized_multi_use.rs +++ b/src/hyperlight_host/src/sandbox/initialized_multi_use.rs @@ -47,47 +47,11 @@ use crate::mem::shared_mem::HostSharedMemory; use crate::metrics::{ METRIC_GUEST_ERROR, METRIC_GUEST_ERROR_LABEL_CODE, maybe_time_and_emit_guest_call, }; -use crate::{Result, log_then_return, new_error}; +use crate::{Result, log_then_return}; /// Global counter for assigning unique IDs to sandboxes static SANDBOX_ID_COUNTER: AtomicU64 = AtomicU64::new(0); -/// RAII guard that automatically calls `clear_call_active()` when dropped. -/// -/// This ensures that the call_active flag is always cleared when a guest function -/// call completes, even if the function returns early due to an error. -/// -/// Only one guard can exist per interrupt handle at a time - attempting to create -/// a second guard will return an error. -struct CallActiveGuard { - interrupt_handle: Arc, -} - -impl CallActiveGuard { - /// Creates a new guard and marks a guest function call as active. - /// - /// # Errors - /// - /// Returns an error if `call_active` is already true (i.e., another guard already exists). - fn new(interrupt_handle: Arc) -> Result { - // Atomically check that call_active is false and set it to true. - // This prevents creating multiple guards for the same interrupt handle. - let was_active = interrupt_handle.set_call_active(); - if was_active { - return Err(new_error!( - "Attempted to create CallActiveGuard when a call is already active" - )); - } - Ok(Self { interrupt_handle }) - } -} - -impl Drop for CallActiveGuard { - fn drop(&mut self) { - self.interrupt_handle.clear_call_active(); - } -} - /// A fully initialized sandbox that can execute guest functions multiple times. /// /// Guest functions can be called repeatedly while maintaining state between calls. 
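
With CallActiveGuard gone, cross-thread cancellation is driven entirely by the CANCEL_BIT protocol in the hypervisor module. A usage sketch, assuming a public accessor equivalent to the vm.interrupt_handle() used in this file (sandbox construction and the calling code are elided):

    fn kill_after(
        sandbox: &MultiUseSandbox,
        delay: std::time::Duration,
    ) -> std::thread::JoinHandle<bool> {
        let handle = sandbox.interrupt_handle();
        std::thread::spawn(move || {
            std::thread::sleep(delay);
            // Returns true only if a guest call is running when the kill lands;
            // a kill issued between calls is discarded by the next run()'s clear_cancel().
            handle.kill()
        })
    }
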
@@ -611,10 +575,6 @@ impl MultiUseSandbox {
         if self.poisoned {
             return Err(crate::HyperlightError::PoisonedSandbox);
         }
-        // Mark that a guest function call is now active
-        // (This also increments the generation counter internally)
-        // The guard will automatically clear call_active when dropped
-        let _guard = CallActiveGuard::new(self.vm.interrupt_handle())?;
 
         let res = (|| {
             let estimated_capacity = estimate_flatbuffer_capacity(function_name, &args);
@@ -633,6 +593,8 @@ impl MultiUseSandbox {
 
             self.vm.dispatch_call_from_host(
                 self.dispatch_ptr.clone(),
+                &mut self.mem_mgr,
+                self._host_funcs.clone(),
                 #[cfg(gdb)]
                 self.dbg_mem_access_fn.clone(),
             )?;
@@ -746,7 +708,8 @@ impl MultiUseSandbox {
     #[cfg(crashdump)]
     #[instrument(err(Debug), skip_all, parent = Span::current())]
     pub fn generate_crashdump(&self) -> Result<()> {
-        crate::hypervisor::crashdump::generate_crashdump(self.vm.as_ref())
+        let ctx = self.vm.crashdump_context()?;
+        crate::hypervisor::crashdump::generate_crashdump(ctx)
     }
 
     /// Returns whether the sandbox is currently poisoned.
diff --git a/src/hyperlight_host/src/sandbox/outb.rs b/src/hyperlight_host/src/sandbox/outb.rs
index e5e938ae3..6354e308c 100644
--- a/src/hyperlight_host/src/sandbox/outb.rs
+++ b/src/hyperlight_host/src/sandbox/outb.rs
@@ -25,10 +25,11 @@ use tracing::{Span, instrument};
 use tracing_log::format_trace;
 
 use super::host_funcs::FunctionRegistry;
-#[cfg(feature = "mem_profile")]
-use crate::hypervisor::Hypervisor;
+use crate::hypervisor::regs::CommonRegisters;
 use crate::mem::mgr::SandboxMemoryManager;
 use crate::mem::shared_mem::HostSharedMemory;
+#[cfg(feature = "mem_profile")]
+use crate::sandbox::trace::MemTraceInfo;
 use crate::{HyperlightError, Result, new_error};
 
 #[instrument(err(Debug), skip_all, parent = Span::current(), level="Trace")]
@@ -148,9 +149,10 @@ fn outb_abort(mem_mgr: &mut SandboxMemoryManager<HostSharedMemory>, data: u32) -> Result<()> {
 pub(crate) fn handle_outb(
     mem_mgr: &mut SandboxMemoryManager<HostSharedMemory>,
     host_funcs: Arc<Mutex<FunctionRegistry>>,
-    #[cfg(feature = "mem_profile")] _hv: &mut dyn Hypervisor,
+    _regs: &CommonRegisters,
     port: u16,
     data: u32,
+    #[cfg(feature = "mem_profile")] trace_info: &mut MemTraceInfo,
 ) -> Result<()> {
     match port.try_into()? {
         OutBAction::Log => outb_log(mem_mgr),
@@ -185,17 +187,9 @@ pub(crate) fn handle_outb(
         #[cfg(feature = "trace_guest")]
         OutBAction::TraceBatch => Ok(()),
         #[cfg(feature = "mem_profile")]
-        OutBAction::TraceMemoryAlloc => {
-            let regs = _hv.regs()?;
-            let trace_info = _hv.trace_info_mut();
-            trace_info.handle_trace_mem_alloc(&regs, mem_mgr)
-        }
+        OutBAction::TraceMemoryAlloc => trace_info.handle_trace_mem_alloc(_regs, mem_mgr),
         #[cfg(feature = "mem_profile")]
-        OutBAction::TraceMemoryFree => {
-            let regs = _hv.regs()?;
-            let trace_info = _hv.trace_info_mut();
-            trace_info.handle_trace_mem_free(&regs, mem_mgr)
-        }
+        OutBAction::TraceMemoryFree => trace_info.handle_trace_mem_free(_regs, mem_mgr),
     }
 }
 
 #[cfg(test)]
diff --git a/src/tests/rust_guests/witguest/Cargo.lock b/src/tests/rust_guests/witguest/Cargo.lock
index 4a0de8cf2..3dd9fee09 100644
--- a/src/tests/rust_guests/witguest/Cargo.lock
+++ b/src/tests/rust_guests/witguest/Cargo.lock
@@ -378,9 +378,9 @@ dependencies = [
 
 [[package]]
 name = "quote"
-version = "1.0.41"
+version = "1.0.42"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ce25767e7b499d1b604768e7cde645d14cc8584231ea6b295e9c9eb22c02e1d1"
+checksum = "a338cc41d27e6cc6dce6cefc13a0729dfbb81c262b1f519331575dd80ef3067f"
 dependencies = [
  "proc-macro2",
 ]
@@ -499,9 +499,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.108"
+version = "2.0.110"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "da58917d35242480a05c2897064da0a80589a2a0476c9a3f2fdc83b53502e917"
+checksum = "a99801b5bd34ede4cf3fc688c5919368fea4e4814a4664359503e6015b280aea"
 dependencies = [
  "proc-macro2",
  "quote",

From 0360070143a158b9d133acf56030917cdbf7057a Mon Sep 17 00:00:00 2001
From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
Date: Thu, 13 Nov 2025 11:40:16 -0800
Subject: [PATCH 2/4] Make sure run_vcpu does not create spans

Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
---
 src/hyperlight_host/src/hypervisor/hyperv_linux.rs   | 2 +-
 src/hyperlight_host/src/hypervisor/hyperv_windows.rs | 1 +
 src/hyperlight_host/src/hypervisor/kvm.rs            | 2 +-
 src/hyperlight_host/src/hypervisor/mod.rs            | 2 +-
 4 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs
index 9c96de39f..323359d2d 100644
--- a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs
+++ b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs
@@ -559,7 +559,7 @@ impl Hypervisor for HypervLinuxDriver {
         )
     }
 
-    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
+    // Note, this function should not be instrumented with a span as it is called after setting up guest trace span
     fn run_vcpu(&mut self) -> Result<VmExit> {
         const HALT: hv_message_type = hv_message_type_HVMSG_X64_HALT;
         const IO_PORT: hv_message_type = hv_message_type_HVMSG_X64_IO_PORT_INTERCEPT;
diff --git a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs
index 9cb611f67..9160e2a64 100644
--- a/src/hyperlight_host/src/hypervisor/hyperv_windows.rs
+++ b/src/hyperlight_host/src/hypervisor/hyperv_windows.rs
@@ -474,6 +474,7 @@ impl Hypervisor for HypervWindowsDriver {
         )
     }
 
+    // Note, this function should not be instrumented with a span as it is called after setting up guest trace span
     #[expect(non_upper_case_globals, reason = "Windows API constant are lower case")]
     fn run_vcpu(&mut self) -> Result<VmExit> {
         let mut exit_context: WHV_RUN_VP_EXIT_CONTEXT = Default::default();
diff --git a/src/hyperlight_host/src/hypervisor/kvm.rs b/src/hyperlight_host/src/hypervisor/kvm.rs
index 647b1338a..82b03f228 100644
--- a/src/hyperlight_host/src/hypervisor/kvm.rs
+++ b/src/hyperlight_host/src/hypervisor/kvm.rs
@@ -548,7 +548,7 @@ impl Hypervisor for KVMDriver {
         )
     }
 
-    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
+    // Note, this function should not be instrumented with a span as it is called after setting up guest trace span
    fn run_vcpu(&mut self) -> Result<VmExit> {
         match self.vcpu_fd.run() {
             Ok(VcpuExit::Hlt) => Ok(VmExit::Halt()),
diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs
index cde789009..09829a7cf 100644
--- a/src/hyperlight_host/src/hypervisor/mod.rs
+++ b/src/hyperlight_host/src/hypervisor/mod.rs
@@ -223,7 +223,7 @@ pub(crate) trait Hypervisor: Debug + Send {
         // - Signals will interrupt the guest (RUNNING_BIT=true), causing VmExit::Cancelled()
         // - If the guest completes before any signals arrive, kill() may have no effect
         // - If there are more iterations to do (IO/host func, etc.), the next iteration will be cancelled
-        let exit_reason = self.run_vcpu();
+        let exit_reason = self.run_vcpu(); // Note, this function must not create any spans as it is called after setting up guest trace span
 
         // End current host trace by closing the current span that captures traces
         // happening when a guest exits and re-enters.

From 8ba0396631b5659202c13b5e3da8ba6f91359f54 Mon Sep 17 00:00:00 2001
From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
Date: Fri, 14 Nov 2025 14:17:53 -0800
Subject: [PATCH 3/4] Some PR feedback fixes + docs

Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
---
 docs/cancellation.md                          | 444 ++++++++++++++++++
 .../src/hypervisor/gdb/arch.rs                |  13 +-
 src/hyperlight_host/src/hypervisor/mod.rs     |   4 +-
 3 files changed, 452 insertions(+), 9 deletions(-)
 create mode 100644 docs/cancellation.md

diff --git a/docs/cancellation.md b/docs/cancellation.md
new file mode 100644
index 000000000..5a3ba8b01
--- /dev/null
+++ b/docs/cancellation.md
@@ -0,0 +1,444 @@
+# Cancellation and Memory Ordering in Hyperlight (Linux)
+
+This document describes the cancellation mechanism and memory ordering guarantees for Hyperlight on Linux platforms (KVM and mshv3).
+
+## Overview
+
+Hyperlight provides a mechanism to forcefully interrupt guest execution through the `InterruptHandle::kill()` method. This involves coordination between multiple threads using atomic operations and POSIX signals to ensure safe and reliable cancellation.
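As a usage sketch of the mechanism this doc describes - assuming only that the embedder holds the call's `InterruptHandle` before dispatching the guest call (the trait is shown as a local stand-in, not the crate's full API):

```rust
use std::sync::Arc;
use std::thread;
use std::time::Duration;

// Stand-in for the crate's InterruptHandle trait; the real trait also
// carries dropped() and (with gdb) kill_from_debugger().
trait InterruptHandle: Send + Sync {
    fn kill(&self) -> bool;
}

/// Spawn a watchdog that cancels the in-flight guest call after `timeout`.
/// kill() is best-effort: if the guest finishes first it is a no-op, and a
/// kill() that lands between guest calls is cleared by the next run().
fn cancel_after(
    handle: Arc<dyn InterruptHandle>,
    timeout: Duration,
) -> thread::JoinHandle<bool> {
    thread::spawn(move || {
        thread::sleep(timeout);
        handle.kill()
    })
}
```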
+
+## Key Components
+
+### LinuxInterruptHandle State
+
+The `LinuxInterruptHandle` uses an atomic `u64` value to pack multiple pieces of state:
+
+```
+Bit Layout of running: AtomicU64
+┌─────────────┬──────────────┬───────────────────────────┐
+│   Bit 63    │    Bit 62    │         Bits 61-0         │
+│ RUNNING_BIT │  CANCEL_BIT  │    Generation Counter     │
+└─────────────┴──────────────┴───────────────────────────┘
+```
+
+- **RUNNING_BIT (bit 63)**: Set when vCPU is actively running in guest mode
+- **CANCEL_BIT (bit 62)**: Set when cancellation has been requested via `kill()`
+- **Generation Counter (bits 0-61)**: Incremented on each vCPU run to prevent ABA problems
+- **tid (AtomicU64)**: Thread ID where the vCPU is running
+- **debug_interrupt (AtomicBool)**: Set when debugger interrupt is requested (gdb feature only)
+- **dropped (AtomicBool)**: Set when the corresponding VM has been dropped
+
+### Signal Mechanism
+
+On Linux, Hyperlight uses `SIGRTMIN + offset` (the offset is configurable and defaults to 0) to interrupt the vCPU thread. The signal handler is intentionally a no-op - the signal's only purpose is to cause a VM exit via `EINTR` from the `ioctl` call that runs the vCPU.
+
+## Run Loop Flow
+
+The main execution loop in `HyperlightVm::run()` coordinates vCPU execution with potential interrupts. Here's the detailed flow:
+
+```mermaid
+sequenceDiagram
+    participant VM as run() Loop
+    participant Guest as vCPU (Guest)
+    participant IH as InterruptHandle
+
+    Note over VM: === TIMING POINT 1 ===
+    VM->>IH: clear_cancel()
+    Note right of VM: Clear premature kill()<br/>requests
+
+    loop Run Loop
+        Note over VM: === TIMING POINT 2 ===
+        VM->>IH: set_tid()
+        Note right of VM: Store current thread ID
+        VM->>IH: set_running()
+        Note right of VM: Set RUNNING_BIT=true<br/>Increment generation
+
+        VM->>IH: is_cancelled()
+
+        alt is_cancelled() == true
+            VM->>VM: return Cancelled()
+        else is_cancelled() == false
+            Note over VM: === TIMING POINT 3 ===
+            VM->>Guest: run_vcpu()
+            activate Guest
+            Note right of Guest: Guest code executes<br/>in vCPU
+
+            alt Guest completes normally
+                Guest-->>VM: VmExit::Halt()
+            else Guest performs I/O
+                Guest-->>VM: VmExit::IoOut()/MmioRead()
+            else Signal received
+                Guest-->>VM: VmExit::Cancelled()
+            end
+            deactivate Guest
+        end
+
+        Note over VM: === TIMING POINT 4 ===
+        VM->>IH: is_cancelled()
+        IH-->>VM: cancel_requested (bool)
+        Note right of VM: Capture for filtering<br/>stale signals later
+
+        Note over VM: === TIMING POINT 5 ===
+        VM->>IH: clear_running()
+        Note right of VM: Clear RUNNING_BIT
+
+        Note over VM: === TIMING POINT 6 ===
+
+        alt Exit reason is Halt
+            VM->>VM: break Ok(())
+        else Exit reason is Cancelled AND cancel_requested==true
+            VM->>VM: break Err(ExecutionCanceledByHost)
+        else Exit reason is Cancelled AND cancel_requested==false
+            Note right of VM: Stale signal, retry
+            VM->>VM: continue (retry iteration)
+        else Exit reason is I/O or host call
+            VM->>VM: Handle and continue loop
+        else Other exit reasons
+            VM->>VM: Handle appropriately
+        end
+    end
+```
+
+### Detailed Run Loop Steps
+
+1. **Timing Point 1** - Between guest function calls:
+   - `clear_cancel()` is called to clear any stale CANCEL_BIT
+   - If `kill()` completes before this point, it has NO effect on this call
+   - Ensures that `kill()` between different guest function calls doesn't affect the next call
+
+2. **Timing Point 2** - Before entering run loop iteration:
+   - `set_tid()` stores the current thread ID
+   - `set_running()` sets RUNNING_BIT and increments generation counter
+   - If `kill()` completes before this, an early `Cancelled()` is returned
+
+3. **Timing Point 3** - Before calling `run_vcpu()`:
+   - If `kill()` completes before this, CANCEL_BIT is set but too late to prevent entering the guest
+   - Signals will interrupt the guest (RUNNING_BIT=true), causing `VmExit::Cancelled()`
+   - If guest completes before signals arrive, `kill()` may have no effect on this iteration
+
+4. **Timing Point 4** - After vCPU exits, before capturing `cancel_requested`:
+   - CANCEL_BIT is captured for filtering stale signals
+   - If `kill()` completes before this, CANCEL_BIT persists for next iteration
+
+5. **Timing Point 5** - Before calling `clear_running()`:
+   - Same as point 4
+
+6. **Timing Point 6** - After calling `clear_running()`:
+   - RUNNING_BIT is now false, no new signals will be sent
+   - CANCEL_BIT may be set but won't affect this iteration
+   - Stale signals may arrive but are filtered by the `cancel_requested` check
+
+## Kill Operation Flow
+
+The `kill()` operation involves setting the CANCEL_BIT and sending signals to interrupt the vCPU:
+
+```mermaid
+sequenceDiagram
+    participant Caller as Caller Thread
+    participant IH as InterruptHandle
+    participant Signal as Signal Delivery
+    participant VM as vCPU Thread
+
+    Caller->>IH: kill()
+    activate IH
+
+    IH->>IH: fetch_or(CANCEL_BIT, Release)
+    Note right of IH: Atomically set CANCEL_BIT<br/>with Release ordering
+
+    IH->>IH: send_signal()
+    activate IH
+
+    loop Retry Loop
+        IH->>IH: get_running_cancel_and_generation()
+        Note right of IH: Load with Acquire ordering
+
+        alt Not running OR not cancelled
+            IH-->>IH: break (sent_signal=false)
+        else Generation changed (ABA prevention)
+            IH-->>IH: break (sent_signal=true)
+        else Running AND cancelled
+            IH->>IH: tid.load(Acquire)
+            IH->>Signal: pthread_kill(tid, SIGRTMIN+offset)
+            activate Signal
+            Note right of Signal: Send signal to vCPU thread
+            Signal->>VM: SIGRTMIN+offset delivered
+            Note right of VM: Signal handler is no-op<br/>Purpose is to cause EINTR
+            deactivate Signal
+
+            alt Signal arrives during ioctl
+                VM->>VM: ioctl returns EINTR
+                VM->>VM: return VmExit::Cancelled()
+            else Signal arrives between ioctls
+                Note right of VM: Signal is harmless
+            end
+
+            IH->>IH: sleep(retry_delay)
+            Note right of IH: Default ~1μs between retries
+        end
+    end
+
+    deactivate IH
+    IH-->>Caller: sent_signal
+    deactivate IH
+```
+
+### Kill Operation Steps
+
+1. **Set CANCEL_BIT**: Atomically set the CANCEL_BIT using `fetch_or` with `Release` ordering
+   - Ensures all writes before `kill()` are visible when vCPU thread checks `is_cancelled()` with `Acquire`
+
+2. **Send Signals**: Enter retry loop to send signals
+   - Check if vCPU is running and cancellation/debug interrupt is requested
+   - Use `Acquire` ordering to read `running` to synchronize with `Release` in `set_running()`
+   - This ensures we see the correct `tid` value
+
+3. **Signal Delivery**: Send `SIGRTMIN+offset` via `pthread_kill`
+   - Signal interrupts the `ioctl` that runs the vCPU, causing `EINTR`
+   - Signal handler is intentionally a no-op
+   - Returns `VmExit::Cancelled()` when `EINTR` is received
+
+4. **Retry Loop**: Continue sending signals until:
+   - vCPU is no longer running (RUNNING_BIT cleared)
+   - Generation counter changes (prevents ABA problem)
+   - Cancellation/debug interrupt is cleared
+
+## Memory Ordering Guarantees
+
+### Release-Acquire Semantics Overview
+
+A **synchronizes-with** relationship is established when:
+1. Thread A performs an atomic operation with `Release` ordering that writes a value
+2. Thread B performs an atomic operation with `Acquire` ordering on the same atomic variable
+3. Thread B's `Acquire` load reads the exact value that Thread A's `Release` operation wrote
+
+When this occurs, all memory operations that happened-before the `Release` in Thread A become visible to Thread B after the `Acquire`. This creates a **happens-before** relationship that ensures memory consistency across threads.
+
+### Synchronization in Hyperlight
+
+Hyperlight uses careful memory ordering to ensure correctness across threads:
+
+```mermaid
+graph TB
+    subgraph "vCPU Thread"
+        A[set_tid<br/>Store tid with Release]
+        B[set_running<br/>fetch_update RUNNING_BIT<br/>with Release]
+        C[is_cancelled<br/>Load with Acquire]
+        D[clear_running<br/>fetch_and with Release]
+    end
+
+    subgraph "Interrupt Thread"
+        E[kill<br/>fetch_or CANCEL_BIT<br/>with Release]
+        F[send_signal<br/>Load running with Acquire]
+        G[Load tid with Acquire]
+        H[pthread_kill]
+    end
+
+    B -->|Synchronizes-with| F
+    A -->|Happens-before via B→F| G
+    E -->|Synchronizes-with| C
+    D -->|Synchronizes-with| F
+```
+
+### Ordering Rules
+
+1. **tid Store → running Load Synchronization**:
+   - `set_tid()`: Stores `tid` with `Release` ordering
+   - `set_running()`: Uses `Release` ordering
+   - `send_signal()`: Loads `running` with `Acquire` ordering
+   - **Guarantee**: When interrupt thread observes RUNNING_BIT=true, it sees the correct `tid` value
+
+2. **CANCEL_BIT Synchronization**:
+   - `kill()`: Sets CANCEL_BIT with `Release` ordering
+   - `is_cancelled()`: Loads with `Acquire` ordering
+   - **Guarantee**: When vCPU thread observes CANCEL_BIT=true, it sees all writes before `kill()`
+
+3. **clear_running Synchronization**:
+   - `clear_running()`: Clears RUNNING_BIT with `Release` ordering
+   - `send_signal()`: Loads with `Acquire` ordering
+   - **Guarantee**: When interrupt thread observes RUNNING_BIT=false, all vCPU operations are complete
+
+4. **clear_cancel**:
+   - Uses `Relaxed` ordering
+   - **Rationale**: Only vCPU thread clears this bit at the start of `run()`, no race condition
+
+### Happens-Before Relationships
+
+```mermaid
+sequenceDiagram
+    participant VT as vCPU Thread
+    participant IT as Interrupt Thread
+
+    VT->>VT: set_tid() (Release)
+    Note right of VT: Store thread ID
+
+    VT->>VT: set_running() (Release)
+    Note right of VT: Set RUNNING_BIT=true<br/>Increment generation
+
+    Note over VT,IT: Sync #1: set_running(Release) → send_signal(Acquire)
+    VT-->>IT: synchronizes-with
+    Note over IT: send_signal()
+    IT->>IT: Load running (Acquire)
+    Note right of IT: Observes RUNNING_BIT=true
+
+    IT->>IT: Load tid (Acquire)
+    Note right of IT: Sees correct tid value<br/>(from set_tid Release)
+
+    VT->>VT: run_vcpu()
+    Note right of VT: Guest executes
+
+    IT->>IT: pthread_kill()
+    Note right of IT: Send signal to tid
+
+    par Concurrent Operations
+        Note over IT: kill()
+        IT->>IT: Set CANCEL_BIT (Release)
+        Note right of IT: Request cancellation
+    and
+        Note over VT: Guest interrupted
+        VT->>VT: is_cancelled() (Acquire)
+        Note right of VT: Observes CANCEL_BIT=true<br/>Sees all writes before kill()
+    end
+
+    Note over IT,VT: Sync #2: kill(Release) → is_cancelled(Acquire)
+    IT-->>VT: synchronizes-with
+
+    VT->>VT: clear_running() (Release)
+    Note right of VT: Clear RUNNING_BIT<br/>All vCPU ops complete
+
+    Note over VT,IT: Sync #3: clear_running(Release) → send_signal(Acquire)
+    VT-->>IT: synchronizes-with
+
+    IT->>IT: send_signal() observes
+    Note right of IT: RUNNING_BIT=false<br/>Stop sending signals
+```
+
+## Interaction with Host Function Calls
+
+When a guest performs a host function call, the vCPU exits and the host function executes with `RUNNING_BIT=false`, preventing signal delivery during host execution. The `CANCEL_BIT` persists across this exit and re-entry, so if `kill()` was called, cancellation will be detected when the guest attempts to resume execution. This ensures cancellation takes effect even if it occurs during a host call, while avoiding signals during non-guest code execution.
+
+## ABA Problem Prevention
+
+The generation counter prevents the ABA problem where:
+
+1. vCPU runs with generation N (e.g., in guest code)
+2. Interrupt thread reads generation N and RUNNING_BIT=true, begins sending signals
+3. vCPU exits (e.g., for a host call), RUNNING_BIT=false
+4. vCPU runs again with generation N+1 (re-entering guest), RUNNING_BIT=true
+5. Without generation counter, interrupt thread would continue sending signals to the new iteration
+
+**Solution**: The interrupt thread captures the generation when it first observes RUNNING_BIT=true, and stops sending signals if the generation changes. This ensures signals are only sent during the intended vCPU run iteration within a single `run()` call.
+
+```mermaid
+stateDiagram-v2
+    [*] --> NotRunning: Generation=N
+    NotRunning --> Running1: set_running()<br/>Generation=N+1
+    Running1 --> NotRunning: clear_running()<br/>Generation=N+1
+    NotRunning --> Running2: set_running()<br/>Generation=N+2
+
+    note right of Running1
+        Interrupt thread captures
+        target_generation=N+1
+    end note
+
+    note right of Running2
+        Interrupt thread detects
+        generation changed:
+        N+2 ≠ N+1
+        Stops sending signals
+    end note
+```
+
+## Signal Safety
+
+The signal handler is designed to be async-signal-safe:
+
+```rust
+extern "C" fn vm_kill_signal(_: libc::c_int, _: *mut libc::siginfo_t, _: *mut libc::c_void) {
+    // Do nothing. SIGRTMIN is just used to issue a VM exit to the underlying VM.
+}
+```
+
+**Why no-op?**
+- Signal's purpose is to interrupt the `ioctl` system call via `EINTR`
+- No unsafe operations (allocation, locks, etc.) in signal handler
+- Actual cancellation handling occurs in the main vCPU thread after `ioctl` returns
+
+**Signal Chaining Note**: Hyperlight does not provide signal chaining for `SIGRTMIN+offset`. Since Hyperlight may issue ~200 signals back-to-back during the cancellation retry loop, it's unlikely embedders want to handle these signals.
+
+## Race Conditions and Edge Cases
+
+### Race 1: kill() called between guest function calls
+
+```
+Timeline:
+t1: Guest function #1 completes, run() returns
+t2: kill() is called (sets CANCEL_BIT)
+t3: Guest function #2 starts, run() is called
+t4: clear_cancel() clears CANCEL_BIT
+
+Result: Guest function #2 executes normally (not cancelled)
+```
+
+**This is by design** - cancellation is scoped to a single guest function call.
+
+### Race 2: kill() called just before run_vcpu()
+
+```
+Timeline:
+t1: set_running() sets RUNNING_BIT
+t2: kill() sets CANCEL_BIT and sends signals
+t3: run_vcpu() enters guest
+
+Result: Signals interrupt the guest, causing VmExit::Cancelled()
+```
+
+**Handled correctly** - signals cause VM exit.
+
+### Race 3: Guest completes before signal arrives
+
+```
+Timeline:
+t1: kill() sets CANCEL_BIT and sends signal
+t2: Guest completes naturally
+t3: clear_running() clears RUNNING_BIT
+t4: Signal arrives (too late)
+
+Result: If guest completes normally (Halt), returns Ok()
+        If guest exits for I/O, next iteration will be cancelled
+```
+
+**Acceptable behavior** - cancellation is best-effort.
+
+### Race 4: Stale signals
+
+```
+Timeline:
+t1: kill() sends signals while RUNNING_BIT=true
+t2: clear_running() clears RUNNING_BIT
+t3: Stale signal arrives
+
+Result: is_cancelled() returns true, causing VmExit::Cancelled()
+        But cancel_requested was captured at timing point 4
+        Stale signal is filtered out and iteration continues
+```
+
+**Handled correctly** - `cancel_requested` flag filters stale signals.
+
+## Performance Considerations
+
+- **Signal Retry**: Default retry delay is ~1μs between signals
+- **Signal Volume**: May send up to ~200 signals during cancellation
+- **Memory Ordering**: Acquire/Release ordering has minimal overhead on modern x86_64 CPUs
+- **Atomic Operations**: All atomic operations are lock-free on 64-bit platforms
+
+## Summary
+
+Hyperlight's cancellation mechanism on Linux provides:
+
+1. **Thread-safe cancellation** via atomic operations and memory ordering
+2. **Signal-based interruption** using `SIGRTMIN+offset` to cause VM exits
+3. **ABA problem prevention** through generation counters
+4. **Scoped cancellation** that applies to single guest function calls
+5. **Race condition handling** for various timing scenarios
+6. **Async-signal-safe implementation** with no-op signal handler
+
+The design balances correctness, performance, and usability for forceful guest interruption.
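To tie the timing points and race cases together, here is a condensed, illustrative sketch of the loop shape the document describes (stub types and simplified names, not the crate's actual `run()` implementation):

```rust
enum Exit { Halt, Cancelled, Io }
enum Error { Cancelled }

trait Handle {
    fn clear_cancel(&self);
    fn set_tid(&self);
    fn set_running(&self);
    fn clear_running(&self);
    fn is_cancelled(&self) -> bool;
}

trait Vcpu {
    fn run_vcpu(&mut self) -> Exit;
}

// Condensed sketch of the run loop and its six timing points.
fn run(vm: &mut dyn Vcpu, handle: &dyn Handle) -> Result<(), Error> {
    handle.clear_cancel(); // Timing point 1: drop premature kill() requests
    loop {
        handle.set_tid(); // Timing point 2
        handle.set_running();
        if handle.is_cancelled() {
            handle.clear_running();
            return Err(Error::Cancelled); // early exit before entering guest
        }
        let exit = vm.run_vcpu(); // Timing point 3: guest executes
        let cancel_requested = handle.is_cancelled(); // Timing point 4
        handle.clear_running(); // Timing point 5
        match exit { // Timing point 6
            Exit::Halt => return Ok(()),
            Exit::Cancelled if cancel_requested => return Err(Error::Cancelled),
            Exit::Cancelled => continue, // stale signal: retry the iteration
            Exit::Io => { /* perform I/O or host call, then re-enter guest */ }
        }
    }
}
```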
diff --git a/src/hyperlight_host/src/hypervisor/gdb/arch.rs b/src/hyperlight_host/src/hypervisor/gdb/arch.rs
index 652b3af47..5ec86298b 100644
--- a/src/hyperlight_host/src/hypervisor/gdb/arch.rs
+++ b/src/hyperlight_host/src/hypervisor/gdb/arch.rs
@@ -17,7 +17,6 @@ limitations under the License.
 //! This file contains architecture specific code for the x86_64
 
 use super::VcpuStopReason;
-use crate::Result;
 
 // Described in Table 6-1. Exceptions and Interrupts at Page 6-13 Vol. 1
 // of Intel 64 and IA-32 Architectures Software Developer's Manual
@@ -55,14 +54,14 @@ pub(crate) fn vcpu_stop_reason(
     entrypoint: u64,
     dr6: u64,
     exception: u32,
-) -> Result<VcpuStopReason> {
+) -> VcpuStopReason {
     if DB_EX_ID == exception {
         // If the BS flag in DR6 register is set, it means a single step
         // instruction triggered the exit
         // Check page 19-4 Vol. 3B of Intel 64 and IA-32
         // Architectures Software Developer's Manual
         if dr6 & DR6_BS_FLAG_MASK != 0 {
-            return Ok(VcpuStopReason::DoneStep);
+            return VcpuStopReason::DoneStep;
         }
 
         // If any of the B0-B3 flags in DR6 register is set, it means a
@@ -71,14 +70,14 @@ pub(crate) fn vcpu_stop_reason(
         // Architectures Software Developer's Manual
         if DR6_HW_BP_FLAGS_MASK & dr6 != 0 {
             if rip == entrypoint {
-                return Ok(VcpuStopReason::EntryPointBp);
+                return VcpuStopReason::EntryPointBp;
             }
-            return Ok(VcpuStopReason::HwBp);
+            return VcpuStopReason::HwBp;
         }
     }
 
     if BP_EX_ID == exception {
-        return Ok(VcpuStopReason::SwBp);
+        return VcpuStopReason::SwBp;
     }
 
     // Log an error and provide internal debugging info
@@ -95,5 +94,5 @@ pub(crate) fn vcpu_stop_reason(
         exception,
     );
 
-    Ok(VcpuStopReason::Unknown)
+    VcpuStopReason::Unknown
 }
diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs
index 09829a7cf..bd3752a6c 100644
--- a/src/hyperlight_host/src/hypervisor/mod.rs
+++ b/src/hyperlight_host/src/hypervisor/mod.rs
@@ -268,7 +268,7 @@ pub(crate) trait Hypervisor: Debug + Send {
         let regs = self.regs()?;
         let rip_gva = self.translate_gva(regs.rip)?;
         // Handle debug event (breakpoints)
-        let stop_reason = arch::vcpu_stop_reason(rip_gva, _entrypoint, dr6, exception)?;
+        let stop_reason = arch::vcpu_stop_reason(rip_gva, _entrypoint, dr6, exception);
 
         if stop_reason == VcpuStopReason::EntryPointBp {
             // TODO in next PR: make sure to remove hw breakpoint here
@@ -714,7 +714,7 @@ impl LinuxInterruptHandle {
     /// - CANCEL_BIT remains unchanged
     ///
     /// # Memory Ordering
-    /// Uses `Release` ordering to ensure that the `tid` store (which uses `Relaxed`)
+    /// Uses `Release` ordering to ensure that the `tid` store (which uses `Release`)
     /// is visible to any thread that observes RUNNING_BIT=true via `Acquire` ordering.
     /// This prevents the interrupt thread from reading a stale `tid` value.
     #[expect(clippy::expect_used)]

From d7344f0eec877fd14faf70f70cb4e223f83f1adc Mon Sep 17 00:00:00 2001
From: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
Date: Fri, 14 Nov 2025 16:31:07 -0800
Subject: [PATCH 4/4] Remove interrupt generation number + add docs

Signed-off-by: Ludvig Liljenberg <4257730+ludfjig@users.noreply.github.com>
---
 docs/cancellation.md                               | 159 ++++++-------------
 .../src/hypervisor/hyperv_linux.rs                 |   2 +-
 src/hyperlight_host/src/hypervisor/kvm.rs          |   2 +-
 src/hyperlight_host/src/hypervisor/mod.rs          | 117 +++++--------
 4 files changed, 97 insertions(+), 183 deletions(-)

diff --git a/docs/cancellation.md b/docs/cancellation.md
index 5a3ba8b01..527e65fbb 100644
--- a/docs/cancellation.md
+++ b/docs/cancellation.md
@@ -10,23 +10,17 @@ Hyperlight provides a mechanism to forcefully interrupt guest execution through
 
 ### LinuxInterruptHandle State
 
-The `LinuxInterruptHandle` uses an atomic `u64` value to pack multiple pieces of state:
+The `LinuxInterruptHandle` uses a packed atomic u64 to track execution state:
 
-```
-Bit Layout of running: AtomicU64
-┌─────────────┬──────────────┬───────────────────────────┐
-│   Bit 63    │    Bit 62    │         Bits 61-0         │
-│ RUNNING_BIT │  CANCEL_BIT  │    Generation Counter     │
-└─────────────┴──────────────┴───────────────────────────┘
-```
-
-- **RUNNING_BIT (bit 63)**: Set when vCPU is actively running in guest mode
-- **CANCEL_BIT (bit 62)**: Set when cancellation has been requested via `kill()`
-- **Generation Counter (bits 0-61)**: Incremented on each vCPU run to prevent ABA problems
+- **state (AtomicU64)**: Packs two bits:
+  - **Bit 1 (RUNNING_BIT)**: Set when vCPU is actively running in guest mode
+  - **Bit 0 (CANCEL_BIT)**: Set when cancellation has been requested via `kill()`
 - **tid (AtomicU64)**: Thread ID where the vCPU is running
 - **debug_interrupt (AtomicBool)**: Set when debugger interrupt is requested (gdb feature only)
 - **dropped (AtomicBool)**: Set when the corresponding VM has been dropped
 
+The packed state allows atomic snapshots of both RUNNING_BIT and CANCEL_BIT via `get_running_and_cancel()`. The CANCEL_BIT persists across vcpu exits/re-entries within a single `run()` call (e.g., during host function calls), but is cleared at the start of each new `run()` call.
+
 ### Signal Mechanism
 
 On Linux, Hyperlight uses `SIGRTMIN + offset` (the offset is configurable and defaults to 0) to interrupt the vCPU thread. The signal handler is intentionally a no-op - the signal's only purpose is to cause a VM exit via `EINTR` from the `ioctl` call that runs the vCPU.
@@ -50,7 +44,7 @@ sequenceDiagram
         VM->>IH: set_tid()
         Note right of VM: Store current thread ID
         VM->>IH: set_running()
-        Note right of VM: Set RUNNING_BIT=true<br/>Increment generation
+        Note right of VM: Set running=true
 
         VM->>IH: is_cancelled()
 
@@ -107,7 +101,7 @@ sequenceDiagram
 
 2. **Timing Point 2** - Before entering run loop iteration:
    - `set_tid()` stores the current thread ID
-   - `set_running()` sets RUNNING_BIT and increments generation counter
+   - `set_running()` sets running to true
    - If `kill()` completes before this, an early `Cancelled()` is returned
 
 3. **Timing Point 3** - Before calling `run_vcpu()`:
@@ -142,19 +136,17 @@ sequenceDiagram
     IH->>IH: fetch_or(CANCEL_BIT, Release)
-    Note right of IH: Atomically set CANCEL_BIT<br/>with Release ordering
+    Note right of IH: Atomically set cancel=true<br/>with Release ordering
 
     IH->>IH: send_signal()
     activate IH
 
     loop Retry Loop
-        IH->>IH: get_running_cancel_and_generation()
+        IH->>IH: get_running_and_cancel()
         Note right of IH: Load with Acquire ordering
 
         alt Not running OR not cancelled
-            IH-->>IH: break (sent_signal=false)
-        else Generation changed (ABA prevention)
-            IH-->>IH: break (sent_signal=true)
+            IH-->>IH: break (sent_signal=false/true)
         else Running AND cancelled
             IH->>IH: tid.load(Acquire)
             IH->>Signal: pthread_kill(tid, SIGRTMIN+offset)
@@ -183,23 +175,23 @@ sequenceDiagram
 
 ### Kill Operation Steps
 
-1. **Set CANCEL_BIT**: Atomically set the CANCEL_BIT using `fetch_or` with `Release` ordering
+1. **Set Cancel Flag**: Atomically set the CANCEL_BIT using `fetch_or(CANCEL_BIT)` with `Release` ordering
    - Ensures all writes before `kill()` are visible when vCPU thread checks `is_cancelled()` with `Acquire`
 
-2. **Send Signals**: Enter retry loop to send signals
-   - Check if vCPU is running and cancellation/debug interrupt is requested
-   - Use `Acquire` ordering to read `running` to synchronize with `Release` in `set_running()`
-   - This ensures we see the correct `tid` value
+2. **Send Signals**: Enter retry loop via `send_signal()`
+   - Atomically load both running and cancel flags via `get_running_and_cancel()` with `Acquire` ordering
+   - Continue if `running=true AND cancel=true` (or `running=true AND debug_interrupt=true` with gdb)
+   - Exit loop immediately if `running=false OR cancel=false`
 
 3. **Signal Delivery**: Send `SIGRTMIN+offset` via `pthread_kill`
    - Signal interrupts the `ioctl` that runs the vCPU, causing `EINTR`
    - Signal handler is intentionally a no-op
    - Returns `VmExit::Cancelled()` when `EINTR` is received
 
-4. **Retry Loop**: Continue sending signals until:
-   - vCPU is no longer running (RUNNING_BIT cleared)
-   - Generation counter changes (prevents ABA problem)
-   - Cancellation/debug interrupt is cleared
+4. **Loop Termination**: The signal loop terminates when:
+   - vCPU is no longer running (`running=false`), OR
+   - Cancellation is no longer requested (`cancel=false`)
+   - See the loop termination proof in the source code for rigorous correctness analysis
 
 ## Memory Ordering Guarantees
@@ -242,23 +234,28 @@ graph TB
 
 1. **tid Store → running Load Synchronization**:
    - `set_tid()`: Stores `tid` with `Release` ordering
-   - `set_running()`: Uses `Release` ordering
-   - `send_signal()`: Loads `running` with `Acquire` ordering
+   - `set_running()`: Sets RUNNING_BIT with `Release` ordering (via `fetch_or`)
+   - `send_signal()`: Loads `state` with `Acquire` ordering via `get_running_and_cancel()`
    - **Guarantee**: When interrupt thread observes RUNNING_BIT=true, it sees the correct `tid` value
 
 2. **CANCEL_BIT Synchronization**:
-   - `kill()`: Sets CANCEL_BIT with `Release` ordering
-   - `is_cancelled()`: Loads with `Acquire` ordering
+   - `kill()`: Sets CANCEL_BIT with `Release` ordering (via `fetch_or`)
+   - `is_cancelled()`: Loads `state` with `Acquire` ordering
    - **Guarantee**: When vCPU thread observes CANCEL_BIT=true, it sees all writes before `kill()`
 
 3. **clear_running Synchronization**:
-   - `clear_running()`: Clears RUNNING_BIT with `Release` ordering
-   - `send_signal()`: Loads with `Acquire` ordering
+   - `clear_running()`: Clears RUNNING_BIT with `Release` ordering (via `fetch_and`)
+   - `send_signal()`: Loads `state` with `Acquire` ordering via `get_running_and_cancel()`
    - **Guarantee**: When interrupt thread observes RUNNING_BIT=false, all vCPU operations are complete
 
-4. **clear_cancel**:
-   - Uses `Relaxed` ordering
-   - **Rationale**: Only vCPU thread clears this bit at the start of `run()`, no race condition
+4. **clear_cancel Synchronization**:
+   - `clear_cancel()`: Clears CANCEL_BIT with `Release` ordering (via `fetch_and`)
+   - **Rationale**: Uses Release because the VM can move between threads across guest calls, ensuring operations from previous run() are visible to other threads
+
+5. **dropped flag**:
+   - `set_dropped()`: Uses `Release` ordering
+   - `dropped()`: Uses `Acquire` ordering
+   - **Guarantee**: All VM cleanup operations are visible when `dropped()` returns true
 
 ### Happens-Before Relationships
@@ -271,13 +268,13 @@ sequenceDiagram
     VT->>VT: set_tid() (Release)
     Note right of VT: Store thread ID
 
     VT->>VT: set_running() (Release)
-    Note right of VT: Set RUNNING_BIT=true<br/>Increment generation
+    Note right of VT: Set running=true
 
     Note over VT,IT: Sync #1: set_running(Release) → send_signal(Acquire)
     VT-->>IT: synchronizes-with
     Note over IT: send_signal()
-    IT->>IT: Load running (Acquire)
-    Note right of IT: Observes RUNNING_BIT=true
+    IT->>IT: get_running_and_cancel() (Acquire)
+    Note right of IT: Atomically load both bits<br/>Observes running=true
 
     IT->>IT: Load tid (Acquire)
     Note right of IT: Sees correct tid value<br/>(from set_tid Release)
@@ -290,79 +287,43 @@ sequenceDiagram
 
     par Concurrent Operations
         Note over IT: kill()
-        IT->>IT: Set CANCEL_BIT (Release)
-        Note right of IT: Request cancellation
+        IT->>IT: fetch_or(CANCEL_BIT, Release)
+        Note right of IT: Atomically set CANCEL_BIT
     and
         Note over VT: Guest interrupted
        VT->>VT: is_cancelled() (Acquire)
-        Note right of VT: Observes CANCEL_BIT=true<br/>Sees all writes before kill()
+        Note right of VT: Observes cancel=true<br/>Sees all writes before kill()
     end
 
     Note over IT,VT: Sync #2: kill(Release) → is_cancelled(Acquire)
     IT-->>VT: synchronizes-with
 
     VT->>VT: clear_running() (Release)
-    Note right of VT: Clear RUNNING_BIT<br/>All vCPU ops complete
+    Note right of VT: fetch_and to clear RUNNING_BIT<br/>All vCPU ops complete
 
     Note over VT,IT: Sync #3: clear_running(Release) → send_signal(Acquire)
     VT-->>IT: synchronizes-with
 
     IT->>IT: send_signal() observes
-    Note right of IT: RUNNING_BIT=false<br/>Stop sending signals
+    Note right of IT: running=false<br/>Stop sending signals
 ```
 
 ## Interaction with Host Function Calls
 
 When a guest performs a host function call, the vCPU exits and the host function executes with `RUNNING_BIT=false`, preventing signal delivery during host execution. The `CANCEL_BIT` persists across this exit and re-entry, so if `kill()` was called, cancellation will be detected when the guest attempts to resume execution. This ensures cancellation takes effect even if it occurs during a host call, while avoiding signals during non-guest code execution.
 
-## ABA Problem Prevention
+## Signal Behavior Across Loop Iterations
 
-The generation counter prevents the ABA problem where:
+When the run loop iterates (e.g., for host calls or IO operations):
 
-1. vCPU runs with generation N (e.g., in guest code)
-2. Interrupt thread reads generation N and RUNNING_BIT=true, begins sending signals
-3. vCPU exits (e.g., for a host call), RUNNING_BIT=false
-4. vCPU runs again with generation N+1 (re-entering guest), RUNNING_BIT=true
-5. Without generation counter, interrupt thread would continue sending signals to the new iteration
+1. Before host call: `clear_running()` sets `running=false`
+2. `send_signal()` loop checks `running && cancel` - exits immediately when `running=false`
+3. After host call: `set_running()` sets `running=true` again
+4. `is_cancelled()` check detects persistent `cancel` flag and returns early
 
-**Solution**: The interrupt thread captures the generation when it first observes RUNNING_BIT=true, and stops sending signals if the generation changes. This ensures signals are only sent during the intended vCPU run iteration within a single `run()` call.
-
-```mermaid
-stateDiagram-v2
-    [*] --> NotRunning: Generation=N
-    NotRunning --> Running1: set_running()<br/>Generation=N+1
-    Running1 --> NotRunning: clear_running()<br/>Generation=N+1
-    NotRunning --> Running2: set_running()<br/>Generation=N+2
-
-    note right of Running1
-        Interrupt thread captures
-        target_generation=N+1
-    end note
-
-    note right of Running2
-        Interrupt thread detects
-        generation changed:
-        N+2 ≠ N+1
-        Stops sending signals
-    end note
-```
+**Key insight**: The `running && cancel` check is sufficient. When `running` becomes false (host call starts), the signal loop exits immediately. When the vCPU would resume, the early `is_cancelled()` check catches the persistent `cancel` flag before entering the guest.
 
-## Signal Safety
-
-The signal handler is designed to be async-signal-safe:
-
-```rust
-extern "C" fn vm_kill_signal(_: libc::c_int, _: *mut libc::siginfo_t, _: *mut libc::c_void) {
-    // Do nothing. SIGRTMIN is just used to issue a VM exit to the underlying VM.
-}
-```
-
-**Why no-op?**
-- Signal's purpose is to interrupt the `ioctl` system call via `EINTR`
-- No unsafe operations (allocation, locks, etc.) in signal handler
-- Actual cancellation handling occurs in the main vCPU thread after `ioctl` returns
-
-**Signal Chaining Note**: Hyperlight does not provide signal chaining for `SIGRTMIN+offset`. Since Hyperlight may issue ~200 signals back-to-back during the cancellation retry loop, it's unlikely embedders want to handle these signals.
+**Signal Chaining Note**: Hyperlight does not provide signal chaining for `SIGRTMIN+offset`. Since Hyperlight may issue signals back-to-back during the cancellation retry loop, it's unlikely embedders want to handle these signals.
 
 ## Race Conditions and Edge Cases
 
@@ -423,22 +384,6 @@ Result: is_cancelled() returns true, causing VmExit::Cancelled()
 
 **Handled correctly** - `cancel_requested` flag filters stale signals.
 
-## Performance Considerations
-
-- **Signal Retry**: Default retry delay is ~1μs between signals
-- **Signal Volume**: May send up to ~200 signals during cancellation
-- **Memory Ordering**: Acquire/Release ordering has minimal overhead on modern x86_64 CPUs
-- **Atomic Operations**: All atomic operations are lock-free on 64-bit platforms
-
-## Summary
-
-Hyperlight's cancellation mechanism on Linux provides:
-
-1. **Thread-safe cancellation** via atomic operations and memory ordering
-2. **Signal-based interruption** using `SIGRTMIN+offset` to cause VM exits
-3. **ABA problem prevention** through generation counters
-4. **Scoped cancellation** that applies to single guest function calls
-5. **Race condition handling** for various timing scenarios
-6. **Async-signal-safe implementation** with no-op signal handler
+### Race 5: ABA Problem
 
-The design balances correctness, performance, and usability for forceful guest interruption.
+The ABA problem (where a new guest call starts during the InterruptHandle's `send_signal()` loop, potentially causing the loop to send signals to a different guest call) is prevented by clearing CANCEL_BIT at the start of each `run()` call, ensuring each guest call starts with a clean cancellation state. This breaks out of any ongoing slow `send_signal()` loop from a previous call that had not yet observed the cleared CANCEL_BIT.
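A condensed sketch of the termination condition Race 5 relies on, using illustrative constants matching the bit layout described above (not the exact crate code):

```rust
use std::sync::atomic::{AtomicU64, Ordering};

const RUNNING_BIT: u64 = 1 << 1;
const CANCEL_BIT: u64 = 1 << 0;

// Illustrative send_signal() loop: a stale loop from a previous guest call
// exits as soon as it observes either running=false or cancel=false;
// clear_cancel() at the start of the next run() guarantees the latter.
fn send_signal(state: &AtomicU64, signal_vcpu_thread: impl Fn()) -> bool {
    let mut sent = false;
    loop {
        let s = state.load(Ordering::Acquire); // atomic snapshot of both bits
        let running = s & RUNNING_BIT != 0;
        let cancel = s & CANCEL_BIT != 0;
        if !running || !cancel {
            break; // vCPU left guest mode, or a new run() cleared the cancel flag
        }
        sent = true;
        signal_vcpu_thread(); // pthread_kill(tid, SIGRTMIN + offset) in the real code
        std::thread::sleep(std::time::Duration::from_micros(1));
    }
    sent
}
```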
diff --git a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs index 323359d2d..88ffbd22a 100644 --- a/src/hyperlight_host/src/hypervisor/hyperv_linux.rs +++ b/src/hyperlight_host/src/hypervisor/hyperv_linux.rs @@ -356,7 +356,7 @@ impl HypervLinuxDriver { })?; let interrupt_handle: Arc = Arc::new(LinuxInterruptHandle { - running: AtomicU64::new(0), + state: AtomicU64::new(0), #[cfg(gdb)] debug_interrupt: AtomicBool::new(false), #[cfg(all( diff --git a/src/hyperlight_host/src/hypervisor/kvm.rs b/src/hyperlight_host/src/hypervisor/kvm.rs index 82b03f228..78322dd30 100644 --- a/src/hyperlight_host/src/hypervisor/kvm.rs +++ b/src/hyperlight_host/src/hypervisor/kvm.rs @@ -313,7 +313,7 @@ impl KVMDriver { let rsp_gp = GuestPtr::try_from(RawPtr::from(rsp))?; let interrupt_handle: Arc = Arc::new(LinuxInterruptHandle { - running: AtomicU64::new(0), + state: AtomicU64::new(0), #[cfg(gdb)] debug_interrupt: AtomicBool::new(false), #[cfg(all( diff --git a/src/hyperlight_host/src/hypervisor/mod.rs b/src/hyperlight_host/src/hypervisor/mod.rs index bd3752a6c..9c3c917c8 100644 --- a/src/hyperlight_host/src/hypervisor/mod.rs +++ b/src/hyperlight_host/src/hypervisor/mod.rs @@ -668,13 +668,12 @@ pub(super) struct LinuxInterruptHandle { /// Atomic value packing vcpu execution state. /// /// Bit layout: - /// - Bit 63: RUNNING_BIT - set when vcpu is actively running - /// - Bit 62: CANCEL_BIT - set when cancellation has been requested - /// - Bits 61-0: generation counter - tracks vcpu run iterations to prevent ABA problem + /// - Bit 1: RUNNING_BIT - set when vcpu is actively running + /// - Bit 0: CANCEL_BIT - set when cancellation has been requested /// /// CANCEL_BIT persists across vcpu exits/re-entries within a single `HyperlightVm::run()` call /// (e.g., during host function calls), but is cleared at the start of each new `HyperlightVm::run()` call. - running: AtomicU64, + state: AtomicU64, /// Thread ID where the vcpu is running. /// @@ -699,62 +698,27 @@ pub(super) struct LinuxInterruptHandle { #[cfg(any(kvm, mshv3))] impl LinuxInterruptHandle { - const RUNNING_BIT: u64 = 1 << 63; - const CANCEL_BIT: u64 = 1 << 62; - const MAX_GENERATION: u64 = (1 << 62) - 1; - - /// Sets the RUNNING_BIT and increments the generation counter. - /// - /// # Preserves - /// - CANCEL_BIT: The current value of CANCEL_BIT is preserved - /// - /// # Invariants Maintained - /// - Generation increments by 1 (wraps to 0 at MAX_GENERATION) - /// - RUNNING_BIT is set - /// - CANCEL_BIT remains unchanged - /// - /// # Memory Ordering - /// Uses `Release` ordering to ensure that the `tid` store (which uses `Release`) - /// is visible to any thread that observes RUNNING_BIT=true via `Acquire` ordering. - /// This prevents the interrupt thread from reading a stale `tid` value. 
- #[expect(clippy::expect_used)] - fn set_running_and_increment_generation(&self) -> u64 { - self.running - .fetch_update(Ordering::Release, Ordering::Relaxed, |raw| { - let cancel_bit = raw & Self::CANCEL_BIT; // Preserve CANCEL_BIT - let generation = raw & Self::MAX_GENERATION; - let new_generation = if generation == Self::MAX_GENERATION { - // restart generation from 0 - 0 - } else { - generation + 1 - }; - // Set RUNNING_BIT, preserve CANCEL_BIT, increment generation - Some(Self::RUNNING_BIT | cancel_bit | new_generation) - }) - .expect("Should never fail since we always return Some") - } + const RUNNING_BIT: u64 = 1 << 1; + const CANCEL_BIT: u64 = 1 << 0; - /// Get the running and cancel bits, return the previous value. + /// Get the running and cancel flags atomically. /// /// # Memory Ordering - /// Uses `Acquire` ordering to synchronize with the `Release` in `set_running_and_increment_generation()`. - /// This ensures that when we observe RUNNING_BIT=true, we also see the correct `tid` value. - fn get_running_cancel_and_generation(&self) -> (bool, bool, u64) { - let raw = self.running.load(Ordering::Acquire); - let running = raw & Self::RUNNING_BIT != 0; - let cancel = raw & Self::CANCEL_BIT != 0; - let generation = raw & Self::MAX_GENERATION; - (running, cancel, generation) + /// Uses `Acquire` ordering to synchronize with the `Release` in `set_running()` and `kill()`. + /// This ensures that when we observe running=true, we also see the correct `tid` value. + fn get_running_and_cancel(&self) -> (bool, bool) { + let state = self.state.load(Ordering::Acquire); + let running = state & Self::RUNNING_BIT != 0; + let cancel = state & Self::CANCEL_BIT != 0; + (running, cancel) } fn send_signal(&self) -> bool { let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int; let mut sent_signal = false; - let mut target_generation: Option = None; loop { - let (running, cancel, generation) = self.get_running_cancel_and_generation(); + let (running, cancel) = self.get_running_and_cancel(); // Check if we should continue sending signals // Exit if not running OR if neither cancel nor debug_interrupt is set @@ -768,13 +732,6 @@ impl LinuxInterruptHandle { break; } - match target_generation { - None => target_generation = Some(generation), - // prevent ABA problem - Some(expected) if expected != generation => break, - _ => {} - } - log::info!("Sending signal to kill vcpu thread..."); sent_signal = true; // Acquire ordering to synchronize with the Release store in set_tid() @@ -800,25 +757,28 @@ impl InterruptHandleImpl for LinuxInterruptHandle { } fn set_running(&self) { - self.set_running_and_increment_generation(); + // Release ordering to ensure that the tid store (which uses Release) + // is visible to any thread that observes running=true via Acquire ordering. + // This prevents the interrupt thread from reading a stale tid value. 
+ self.state.fetch_or(Self::RUNNING_BIT, Ordering::Release); } fn is_cancelled(&self) -> bool { // Acquire ordering to synchronize with the Release in kill() - // This ensures we see the CANCEL_BIT set by the interrupt thread - self.running.load(Ordering::Acquire) & Self::CANCEL_BIT != 0 + // This ensures we see the cancel flag set by the interrupt thread + self.state.load(Ordering::Acquire) & Self::CANCEL_BIT != 0 } fn clear_cancel(&self) { - // Relaxed is sufficient here - we're the only thread that clears this bit - // at the start of run(), and there's no data race on the clear operation itself - self.running.fetch_and(!Self::CANCEL_BIT, Ordering::Relaxed); + // Release ordering to ensure that any operations from the previous run() + // are visible to other threads. While this is typically called by the vcpu thread + // at the start of run(), the VM itself can move between threads across guest calls. + self.state.fetch_and(!Self::CANCEL_BIT, Ordering::Release); } fn clear_running(&self) { - // Release ordering to ensure all vcpu operations are visible before clearing RUNNING_BIT - self.running - .fetch_and(!Self::RUNNING_BIT, Ordering::Release); + // Release ordering to ensure all vcpu operations are visible before clearing running + self.state.fetch_and(!Self::RUNNING_BIT, Ordering::Release); } fn is_debug_interrupted(&self) -> bool { @@ -838,7 +798,9 @@ impl InterruptHandleImpl for LinuxInterruptHandle { } fn set_dropped(&self) { - self.dropped.store(true, Ordering::Relaxed); + // Release ordering to ensure all VM cleanup operations are visible + // to any thread that checks dropped() via Acquire + self.dropped.store(true, Ordering::Release); } } @@ -847,7 +809,7 @@ impl InterruptHandle for LinuxInterruptHandle { fn kill(&self) -> bool { // Release ordering ensures that any writes before kill() are visible to the vcpu thread // when it checks is_cancelled() with Acquire ordering - self.running.fetch_or(Self::CANCEL_BIT, Ordering::Release); + self.state.fetch_or(Self::CANCEL_BIT, Ordering::Release); // Send signals to interrupt the vcpu if it's currently running self.send_signal() @@ -859,7 +821,9 @@ impl InterruptHandle for LinuxInterruptHandle { self.send_signal() } fn dropped(&self) -> bool { - self.dropped.load(Ordering::Relaxed) + // Acquire ordering to synchronize with the Release in set_dropped() + // This ensures we see all VM cleanup operations that happened before drop + self.dropped.load(Ordering::Acquire) } } @@ -910,9 +874,10 @@ impl InterruptHandleImpl for WindowsInterruptHandle { } fn clear_cancel(&self) { - // Relaxed is sufficient here - we're the only thread that clears this bit - // at the start of run(), and there's no data race on the clear operation itself - self.state.fetch_and(!Self::CANCEL_BIT, Ordering::Relaxed); + // Release ordering to ensure that any operations from the previous run() + // are visible to other threads. While this is typically called by the vcpu thread + // at the start of run(), the VM itself can move between threads across guest calls. 
+ self.state.fetch_and(!Self::CANCEL_BIT, Ordering::Release); } fn clear_running(&self) { @@ -940,7 +905,9 @@ impl InterruptHandleImpl for WindowsInterruptHandle { } fn set_dropped(&self) { - self.dropped.store(true, Ordering::Relaxed); + // Release ordering to ensure all VM cleanup operations are visible + // to any thread that checks dropped() via Acquire + self.dropped.store(true, Ordering::Release); } } @@ -971,7 +938,9 @@ impl InterruptHandle for WindowsInterruptHandle { } fn dropped(&self) -> bool { - self.dropped.load(Ordering::Relaxed) + // Acquire ordering to synchronize with the Release in set_dropped() + // This ensures we see all VM cleanup operations that happened before drop + self.dropped.load(Ordering::Acquire) } }
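To illustrate the Release/Acquire pairing on the `dropped` flag that this final hunk introduces, a minimal stand-alone sketch with hypothetical `Handle`/`Vm` types:

```rust
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};

// Stand-in for the interrupt handle; the real handles carry more state.
struct Handle {
    dropped: AtomicBool,
}

struct Vm {
    interrupt_handle: Arc<Handle>,
}

impl Drop for Vm {
    fn drop(&mut self) {
        // Release: all VM teardown sequenced before this store is visible
        // to any thread that later observes dropped() == true via Acquire.
        self.interrupt_handle.dropped.store(true, Ordering::Release);
    }
}

impl Handle {
    fn dropped(&self) -> bool {
        self.dropped.load(Ordering::Acquire)
    }
}
```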