diff --git a/Cargo.lock b/Cargo.lock
index 26eb4b46d..8f40b3066 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -823,6 +823,7 @@ name = "svsm"
 version = "0.1.0"
 dependencies = [
  "aes-gcm",
+ "bitfield-struct",
  "bitflags 2.4.2",
  "bootlib",
  "cpuarch",
diff --git a/Cargo.toml b/Cargo.toml
index 6a9f8ea9c..761d9520f 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -30,6 +30,7 @@ syscall = { path = "syscall" }
 # crates.io
 aes-gcm = { version = "0.10.3", default-features = false }
 arbitrary = "1.3.0"
+bitfield-struct = "0.5"
 bitflags = "2.4"
 clap = { version = "4.4.14", default-features = false}
 gdbstub = { version = "0.6.6", default-features = false }
diff --git a/kernel/Cargo.toml b/kernel/Cargo.toml
index b1f489ded..3031edf9c 100644
--- a/kernel/Cargo.toml
+++ b/kernel/Cargo.toml
@@ -24,6 +24,7 @@ elf.workspace = true
 syscall.workspace = true
 
 aes-gcm = { workspace = true, features = ["aes", "alloc"] }
+bitfield-struct.workspace = true
 bitflags.workspace = true
 gdbstub = { workspace = true, optional = true }
 gdbstub_arch = { workspace = true, optional = true }
diff --git a/kernel/src/cpu/idt/common.rs b/kernel/src/cpu/idt/common.rs
index 6a7fc8a74..87a0e3832 100644
--- a/kernel/src/cpu/idt/common.rs
+++ b/kernel/src/cpu/idt/common.rs
@@ -42,7 +42,6 @@ pub const PF_ERROR_WRITE: usize = 2;
 
 #[derive(Default, Debug, Clone, Copy)]
 pub struct X86ExceptionContext {
     pub regs: X86GeneralRegs,
-    pub vector: usize,
     pub error_code: usize,
     pub frame: X86InterruptFrame,
 }
@@ -257,7 +256,7 @@ global_asm!(
         popq %rbx
         popq %rax
 
-        addq $16, %rsp /* Skip vector and error code */
+        addq $8, %rsp /* Skip error code */
 
         iretq
     "#,
diff --git a/kernel/src/cpu/idt/entry.S b/kernel/src/cpu/idt/entry.S
index f78f18b80..b6bcb6b15 100644
--- a/kernel/src/cpu/idt/entry.S
+++ b/kernel/src/cpu/idt/entry.S
@@ -5,6 +5,12 @@
 // Authors: Joerg Roedel
 
 .code64
+
+.section .data
+.globl HV_DOORBELL_ADDR
+HV_DOORBELL_ADDR:
+    .quad 0
+
 .section .text
 
 .macro push_regs
@@ -25,51 +31,232 @@
     pushq %r15
 .endm
 
-.macro pop_regs
-    popq %r15
-    popq %r14
-    popq %r13
-    popq %r12
-    popq %r11
-    popq %r10
-    popq %r9
-    popq %r8
-    popq %rbp
-    popq %rdi
-    popq %rsi
-    popq %rdx
-    popq %rcx
-    popq %rbx
-    popq %rax
-
-    addq $16, %rsp /* Skip vector and error code */
-.endm
-
 .macro default_entry_no_ist name: req handler:req error_code:req vector:req
 .globl asm_entry_\name
 asm_entry_\name:
     .if \error_code == 0
     pushq $0
     .endif
-    pushq $\vector
     push_regs
+    movl $\vector, %esi
     movq %rsp, %rdi
     call ex_handler_\handler
     jmp default_return
 .endm
 
+// The #HV handler is coded specially in order to deal with control flow
+// alterations that may be required based on when the #HV arrives. If the #HV
+// arrives from a context in which interrupts are enabled, then the #HV can
+// be handled immediately. In general, if the #HV arrives from a context in
+// which interrupts are disabled, processing is postponed to a point in time
+// when interrupt processing is safe. However, there are two cases in which
+// #HV processing is required even when interrupts are disabled.
+// 1. The #HV arrives just before a return to the guest VMPL. In this case,
+//    the return to the guest VMPL must be cancelled so the #HV can be handled
+//    immediately. Otherwise, if the return to the guest occurs while the #HV
+//    remains pending, it will remain pending until the next time the SVSM
+//    is reentered, which could block delivery of critical events while the
+//    guest is executing.
+// 2. The #HV arrives while preparing to execute IRET to return to a context
+//    in which interrupts are enabled. If such an #HV is not handled, then
+//    it will remain pending indefinitely, which could block delivery of
+//    critical events. When an #HV arrives at a time that the IRET is
+//    committed to complete, the #HV handler will "take over" the
+//    exception context established previously (the one from which the IRET
+//    intends to return). In this case, the #HV handler will complete
+//    processing and will perform the IRET to the point of the original
+//    exception.
+.globl asm_entry_hv
+asm_entry_hv:
+    // Push a dummy error code, and only three registers. If no #HV
+    // processing is required, then only these three registers will need to
+    // be popped.
+    pushq $0
+    pushq %rax
+    pushq %rbx
+    pushq %rcx
+    // Check whether interrupts were enabled at the time of #HV. If so,
+    // commit to processing all #HV events immediately.
+    testl $0x200, 0x30(%rsp)
+    jnz continue_hv
+    // Check whether the trap RIP is within the guest VMPL return window.
+    movq 0x20(%rsp), %rax // fetch RIP from the trap frame.
+    leaq switch_vmpl_window_start(%rip), %rbx
+    leaq switch_vmpl_window_end(%rip), %rcx
+    cmp %rbx, %rax
+    jb hv_not_vmpl_switch
+    cmp %rcx, %rax
+    jae hv_not_vmpl_switch
+    // RIP is in the return window, so update RIP to the cancel point.
+    leaq switch_vmpl_cancel(%rip), %rbx
+    movq %rbx, 0x20(%rsp)
+    // Defer any further processing until interrupts can be processed.
+    jmp postpone_hv
+hv_not_vmpl_switch:
+    // Load the RSP value that was live at the time of the #HV.
+    movq 0x38(%rsp), %rcx
+    // Check to see whether this interrupt occurred on the IRET path.
+    leaq iret_return_window(%rip), %rbx
+    cmp %rbx, %rax
+    jb postpone_hv
+    leaq default_iret(%rip), %rbx
+    cmp %rbx, %rax
+    ja postpone_hv
+    // RIP is within the IRET sequence, so the IRET should be aborted, and
+    // the previous exception should be handled as if it were #HV. At this
+    // point, there are two possibilities. If RIP is before the IRET
+    // instruction itself, then the RSP at the time of the #HV exception
+    // points to the register context that was established for the previous
+    // exception. In that case, the current RSP can be changed to point
+    // to that exception context, the #HV can be handled using that
+    // register context, and when #HV processing completes, the subsequent
+    // end-of-interrupt flow will restore the context at the time of the
+    // previous exception. On the other hand, if RIP has advanced to the
+    // point of the IRET instruction itself, then all of the registers
+    // have already been reloaded with the previous exception context,
+    // and the RSP at the time of #HV points at the stack frame that
+    // would be consumed by the IRET instruction. In that case, a new
+    // exception context will need to be constructed. At this point,
+    // EFLAGS.ZF=1 if the previous RIP was at the IRET instruction.
+    jz restart_hv
+    // Check to see whether interrupts were enabled at the time the
+    // previous exception was taken. If not, no further processing is
+    // required. This could not be performed before the RIP check because
+    // the previous RIP determines where to find the previous EFLAGS.IF
+    // value on the stack.
+    testl $0x200, 18*8(%rcx)
+    jz postpone_hv
+    // Switch to the stack pointer from the previous exception, which
+    // points to the register save area, and continue with #HV
+    // processing.
+    movq %rcx, %rsp
+    jmp handle_as_hv
+
+postpone_hv:
+    popq %rcx
+    popq %rbx
+    popq %rax
+    addq $8, %rsp
+    iretq
+
+restart_hv:
+    // The previous RIP was on an IRET instruction. Before moving forward
+    // with #HV processing, check to see whether interrupts were enabled at
+    // the time the previous exception was taken. If not, no further
+    // processing is required. This could not be done when RIP was
+    // checked because the stack location of the previous EFLAGS.IF value
+    // was not known until RIP was determined to be at the IRET
+    // instruction.
+    testl $0x200, 0x10(%rcx)
+    jz postpone_hv
+    // Since interrupts were enabled in the previous exception frame,
+    // #HV processing is now required. The previous RSP points to the
+    // exception frame (minus error code) as it would be consumed by
+    // IRET. In order to set up a new exception context, the three
+    // registers that were saved upon entry to the #HV handler will need to
+    // be copied to the top of the stack (adjacent to the space for a
+    // dummy error code). Then, the stack pointer will be loaded with
+    // the previous RSP and the remaining register state will be pushed
+    // normally to create a complete exception context reflecting the
+    // register state at the time of the exception that was returning at
+    // the time the #HV arrived.
+    // At this point, RCX holds the stack pointer at the time of the
+    // IRET that was aborted. The first QWORD below that pointer is
+    // reserved for the dummy error code, then the three QWORDs below that
+    // will hold the RAX, RBX, and RCX values, which are presently stored
+    // in the top three QWORDs of the current stack.
+    movq 0*8(%rsp), %rax
+    movq %rax, -4*8(%rcx)
+    movq 1*8(%rsp), %rax
+    movq %rax, -3*8(%rcx)
+    movq 2*8(%rsp), %rax
+    movq %rax, -2*8(%rcx)
+    leaq -4*8(%rcx), %rsp
+
+continue_hv:
+    // At this point, only the dummy error code and first three registers
+    // have been pushed onto the stack. Push the remainder to construct a
+    // full exception context.
+    pushq %rdx
+    pushq %rsi
+    pushq %rdi
+    pushq %rbp
+    pushq %r8
+    pushq %r9
+    pushq %r10
+    pushq %r11
+    pushq %r12
+    pushq %r13
+    pushq %r14
+    pushq %r15
+    // Load the address of the #HV doorbell page. The global address
+    // might not yet be configured, and the per-CPU page might also not
+    // yet be configured, so only process events if there is a valid
+    // doorbell page.
+    movq HV_DOORBELL_ADDR(%rip), %rsi
+    testq %rsi, %rsi
+    jz default_return
+    movq (%rsi), %rdi
+    testq %rdi, %rdi
+    jz default_return
+handle_as_hv:
+    call process_hv_events
+    // fall through to default_return
+
 .globl default_return
 default_return:
-    testb $3, 18*8(%rsp) // Check CS in exception frame
-    jnz return_user
-    pop_regs
+    // Ensure that interrupts are disabled before attempting any return.
+    cli
+    testb $3, 17*8(%rsp) // Check CS in exception frame
+    jnz return_user
+return_all_paths:
+    // If interrupts were previously enabled, then check whether any #HV
+    // events are pending. If so, proceed as if the original trap was
+    // #HV.
+    testl $0x200, 18*8(%rsp) // check EFLAGS.IF in exception frame
+    jz begin_iret_return
+    movq HV_DOORBELL_ADDR(%rip), %rdi
+    test %rdi, %rdi
+    jz begin_iret_return
+    movq (%rdi), %rdi
+    test %rdi, %rdi
+    jz begin_iret_return
+    testw $0x8000, (%rdi)
+    // The memory access to the NoFurtherSignal bit must be the last
+    // instruction prior to the IRET RIP window checked by the #HV entry
+    // code above. After this point, all code must execute within this
+    // instruction range to ensure that the #HV handler will be able to
+    // detect any #HV that arrives after the check above, except for
+    // the specific case of processing pending #HV events.
+iret_return_window:
+    jnz handle_as_hv
+begin_iret_return:
+    // Reload registers without modifying the stack pointer so that if #HV
+    // occurs within this window, the saved registers are still intact.
+    movq 0*8(%rsp), %r15
+    movq 1*8(%rsp), %r14
+    movq 2*8(%rsp), %r13
+    movq 3*8(%rsp), %r12
+    movq 4*8(%rsp), %r11
+    movq 5*8(%rsp), %r10
+    movq 6*8(%rsp), %r9
+    movq 7*8(%rsp), %r8
+    movq 8*8(%rsp), %rbp
+    movq 9*8(%rsp), %rdi
+    movq 10*8(%rsp), %rsi
+    movq 11*8(%rsp), %rdx
+    movq 12*8(%rsp), %rcx
+    movq 13*8(%rsp), %rbx
+    movq 14*8(%rsp), %rax
+
+    addq $16*8, %rsp
+
 default_iret:
     iretq
 
 return_user:
     // Put user-mode specific return code here
-    pop_regs
-    jmp default_iret
+    jmp return_all_paths
 
 // #DE Divide-by-Zero-Error Exception (Vector 0)
 default_entry_no_ist name=de handler=panic error_code=0 vector=0
@@ -137,9 +324,6 @@ default_entry_no_ist name=cp handler=panic error_code=1 vector=21
 
 // Vectors 22-27 not defined
 
-// #HV Hypervisor Injection Exception (Vector 28)
-default_entry_no_ist name=hv handler=hypervisor_injection error_code=0 vector=28
-
 // #VC VMM Communication Exception (Vector 29)
 default_entry_no_ist name=vc handler=vmm_communication error_code=1 vector=29
diff --git a/kernel/src/cpu/idt/stage2.rs b/kernel/src/cpu/idt/stage2.rs
index 701c7585d..5db173d49 100644
--- a/kernel/src/cpu/idt/stage2.rs
+++ b/kernel/src/cpu/idt/stage2.rs
@@ -28,8 +28,8 @@ pub fn early_idt_init() {
 }
 
 #[no_mangle]
-pub extern "C" fn stage2_generic_idt_handler(ctx: &mut X86ExceptionContext) {
-    match ctx.vector {
+pub extern "C" fn stage2_generic_idt_handler(ctx: &mut X86ExceptionContext, vector: usize) {
+    match vector {
         DF_VECTOR => {
             let cr2 = read_cr2();
             let rip = ctx.frame.rip;
@@ -46,20 +46,19 @@ pub extern "C" fn stage2_generic_idt_handler(ctx: &mut X86ExceptionContext) {
         {}
         _ => {
             let err = ctx.error_code;
-            let vec = ctx.vector;
             let rip = ctx.frame.rip;
 
             panic!(
                 "Unhandled exception {} RIP {:#018x} error code: {:#018x}",
-                vec, rip, err
+                vector, rip, err
             );
         }
     }
 }
 
 #[no_mangle]
-pub extern "C" fn stage2_generic_idt_handler_no_ghcb(ctx: &mut X86ExceptionContext) {
-    match ctx.vector {
+pub extern "C" fn stage2_generic_idt_handler_no_ghcb(ctx: &mut X86ExceptionContext, vector: usize) {
+    match vector {
         DF_VECTOR => {
             let cr2 = read_cr2();
             let rip = ctx.frame.rip;
@@ -72,12 +71,11 @@ pub extern "C" fn stage2_generic_idt_handler_no_ghcb(ctx: &mut X86ExceptionConte
         VC_VECTOR => stage2_handle_vc_exception_no_ghcb(ctx).expect("Failed to handle #VC"),
         _ => {
             let err = ctx.error_code;
-            let vec = ctx.vector;
             let rip = ctx.frame.rip;
 
             panic!(
                 "Unhandled exception {} RIP {:#018x} error code: {:#018x}",
-                vec, rip, err
+                vector, rip, err
             );
         }
     }
@@ -93,11 +91,12 @@ global_asm!(
     /* Early stage 2 handler array setup */
     .text
 push_regs_no_ghcb:
-    pushq %rax
     pushq %rbx
     pushq %rcx
     pushq %rdx
     pushq %rsi
+    movq 0x20(%rsp), %rsi
+    movq %rax, 0x20(%rsp)
     pushq %rdi
     pushq %rbp
     pushq %r8
@@ -131,11 +130,12 @@ global_asm!(
     /* Stage 2 handler array setup */
     .text
 push_regs_stage2:
-    pushq %rax
    pushq %rbx
     pushq %rcx
     pushq %rdx
     pushq %rsi
+    movq 0x20(%rsp), %rsi
+    movq %rax, 0x20(%rsp)
     pushq %rdi
     pushq %rbp
     pushq %r8
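With the vector field gone from X86ExceptionContext, the entry stubs above pass the vector to handlers as a second C argument: the context pointer stays in %rdi (the first System V argument register) and the stub loads the vector into %esi (the second). A minimal standalone sketch of the resulting handler shape, using a stand-in ExceptionContext type rather than the kernel's X86ExceptionContext:

#[repr(C)]
struct ExceptionContext {
    error_code: usize,
    rip: usize,
}

// Mirrors the shape of the ex_handler_* functions after this change: the
// vector is an explicit parameter, not a field of the saved context.
extern "C" fn handler(ctx: &mut ExceptionContext, vector: usize) {
    println!("vector {vector} rip {:#x} err {:#x}", ctx.rip, ctx.error_code);
}

fn main() {
    // In the kernel the asm stub performs this call; invoking it directly
    // here only demonstrates the signature.
    let mut ctx = ExceptionContext { error_code: 0, rip: 0x1000 };
    handler(&mut ctx, 29);
}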
diff --git a/kernel/src/cpu/idt/svsm.rs b/kernel/src/cpu/idt/svsm.rs
index 51cd728d7..2c9478ba7 100644
--- a/kernel/src/cpu/idt/svsm.rs
+++ b/kernel/src/cpu/idt/svsm.rs
@@ -16,8 +16,10 @@ use super::common::{
     OF_VECTOR, PF_VECTOR, SS_VECTOR, SX_VECTOR, TS_VECTOR, UD_VECTOR, VC_VECTOR, XF_VECTOR,
 };
 use crate::address::VirtAddr;
+use crate::cpu::percpu::this_cpu_unsafe;
 use crate::cpu::X86ExceptionContext;
 use crate::debug::gdbstub::svsm_gdbstub::handle_debug_exception;
+use crate::platform::SVSM_PLATFORM;
 use crate::task::{is_task_fault, terminate};
 use core::arch::global_asm;
@@ -50,6 +52,8 @@ extern "C" {
     fn asm_entry_vc();
     fn asm_entry_sx();
     fn asm_entry_int80();
+
+    pub static mut HV_DOORBELL_ADDR: usize;
 }
 
 fn init_ist_vectors() {
@@ -94,6 +98,19 @@ pub fn early_idt_init() {
 pub fn idt_init() {
     // Set IST vectors
     init_ist_vectors();
+
+    // Capture an address that can be used by assembly code to read the #HV
+    // doorbell page. The address of each CPU's doorbell page may be
+    // different, but the address of the field in the PerCpu structure that
+    // holds the actual pointer is constant across all CPUs, so that is the
+    // pointer that is actually captured. The address that is captured is
+    // stored as a usize instead of a typed value, because the declarations
+    // required for type safety here are cumbersome, and the assembly code
+    // that uses the value is not type safe in any case, so enforcing type
+    // safety on the pointer would offer no meaningful value.
+    unsafe {
+        HV_DOORBELL_ADDR = (*this_cpu_unsafe()).hv_doorbell_addr();
+    };
 }
 
 // Debug handler
@@ -153,7 +170,7 @@ extern "C" fn ex_handler_general_protection(ctxt: &mut X86ExceptionContext) {
 
 // Page-Fault handler
 #[no_mangle]
-extern "C" fn ex_handler_page_fault(ctxt: &mut X86ExceptionContext) {
+extern "C" fn ex_handler_page_fault(ctxt: &mut X86ExceptionContext, vector: usize) {
     let cr2 = read_cr2();
     let rip = ctxt.frame.rip;
     let err = ctxt.error_code;
@@ -178,7 +195,7 @@ extern "C" fn ex_handler_page_fault(ctxt: &mut X86ExceptionContext) {
         .is_err()
         && !handle_exception_table(ctxt)
     {
-        handle_debug_exception(ctxt, ctxt.vector);
+        handle_debug_exception(ctxt, vector);
         panic!(
             "Unhandled Page-Fault at RIP {:#018x} CR2: {:#018x} error code: {:#018x}",
             rip, cr2, err
@@ -186,22 +203,13 @@ extern "C" fn ex_handler_page_fault(ctxt: &mut X86ExceptionContext) {
     }
 }
 
-// Hypervisor Injection handler
-#[no_mangle]
-extern "C" fn ex_handler_hypervisor_injection(_ctxt: &mut X86ExceptionContext) {
-    // #HV processing is not required in the SVSM. If a maskable
-    // interrupt occurs, it will be processed prior to the next exit.
-    // There are no NMI sources, and #MC cannot be handled anyway
-    // and can safely be ignored.
-}
-
 // VMM Communication handler
 #[no_mangle]
-extern "C" fn ex_handler_vmm_communication(ctxt: &mut X86ExceptionContext) {
+extern "C" fn ex_handler_vmm_communication(ctxt: &mut X86ExceptionContext, vector: usize) {
     let rip = ctxt.frame.rip;
     let code = ctxt.error_code;
 
-    if let Err(err) = handle_vc_exception(ctxt) {
+    if let Err(err) = handle_vc_exception(ctxt, vector) {
         log::error!("#VC handling error: {:?}", err);
         if user_mode(ctxt) {
             log::error!("Failed to handle #VC from user-mode at RIP {:#018x} code: {:#018x} - Terminating task", rip, code);
@@ -235,16 +243,21 @@ extern "C" fn ex_handler_system_call(ctxt: &mut X86ExceptionContext) {
 }
 
 #[no_mangle]
-pub extern "C" fn ex_handler_panic(ctx: &mut X86ExceptionContext) {
-    let vec = ctx.vector;
+pub extern "C" fn ex_handler_panic(ctx: &mut X86ExceptionContext, vector: usize) {
     let rip = ctx.frame.rip;
     let err = ctx.error_code;
     let rsp = ctx.frame.rsp;
     let ss = ctx.frame.ss;
 
     panic!(
         "Unhandled exception {} RIP {:#018x} error code: {:#018x} RSP: {:#018x} SS: {:#x}",
-        vec, rip, err, rsp, ss
+        vector, rip, err, rsp, ss
     );
 }
 
+#[no_mangle]
+pub extern "C" fn common_isr_handler(_vector: usize) {
+    // Treat any unhandled interrupt as a spurious interrupt.
+    SVSM_PLATFORM.as_dyn_ref().eoi();
+}
+
 global_asm!(include_str!("entry.S"), options(att_syntax));
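HV_DOORBELL_ADDR stores the address of the per-CPU field that holds the doorbell pointer, not the doorbell pointer itself, which is why the assembly performs two loads (and two null checks) before touching the page. A minimal sketch of that double indirection, with illustrative stand-in types:

struct Doorbell {
    vector: u8,
}

fn main() {
    let mut page = Doorbell { vector: 42 };
    // The per-CPU field holding the doorbell pointer; null until the
    // doorbell page has been set up.
    let mut field: *mut Doorbell = &mut page;
    // What HV_DOORBELL_ADDR captures: the address of the field itself,
    // which is the same on every CPU even though the field's contents
    // differ per CPU.
    let global: *mut *mut Doorbell = &mut field;

    unsafe {
        // First load yields the doorbell pointer; it must be null-checked,
        // just as the entry code checks both levels before processing.
        let ptr: *mut Doorbell = *global;
        if !ptr.is_null() {
            println!("vector = {}", (*ptr).vector);
        }
    }
}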
diff --git a/kernel/src/cpu/percpu.rs b/kernel/src/cpu/percpu.rs
index 8fd81e84c..9a6dce812 100644
--- a/kernel/src/cpu/percpu.rs
+++ b/kernel/src/cpu/percpu.rs
@@ -9,6 +9,7 @@ extern crate alloc;
 use super::gdt_mut;
 use super::tss::{X86Tss, IST_DF};
 use crate::address::{Address, PhysAddr, VirtAddr};
+use crate::cpu::ghcb::current_ghcb;
 use crate::cpu::tss::TSS_LIMIT;
 use crate::cpu::vmsa::init_guest_vmsa;
 use crate::cpu::vmsa::vmsa_mut_ref_from_vaddr;
@@ -25,8 +26,10 @@ use crate::mm::{
 };
 use crate::platform::SvsmPlatform;
 use crate::sev::ghcb::GHCB;
-use crate::sev::utils::RMPFlags;
+use crate::sev::hv_doorbell::HVDoorbell;
+use crate::sev::msr_protocol::{hypervisor_ghcb_features, GHCBHvFeatures};
 use crate::sev::vmsa::allocate_new_vmsa;
+use crate::sev::RMPFlags;
 use crate::task::{schedule, schedule_task, RunQueue, Task, TaskPointer, WaitQueue};
 use crate::types::{PAGE_SHIFT, PAGE_SHIFT_2M, PAGE_SIZE, PAGE_SIZE_2M, SVSM_TR_FLAGS, SVSM_TSS};
 use crate::utils::MemoryRegion;
@@ -252,6 +255,7 @@ pub struct PerCpuUnsafe {
     shared: PerCpuShared,
     private: RefCell<PerCpu>,
     ghcb: *mut GHCB,
+    hv_doorbell: *mut HVDoorbell,
     init_stack: Option<VirtAddr>,
     ist: IstStacks,
@@ -266,6 +270,7 @@ impl PerCpuUnsafe {
             private: RefCell::new(PerCpu::new(apic_id, cpu_unsafe_ptr)),
             shared: PerCpuShared::new(),
             ghcb: ptr::null_mut(),
+            hv_doorbell: ptr::null_mut(),
             init_stack: None,
             ist: IstStacks::new(),
             current_stack: MemoryRegion::new(VirtAddr::null(), 0),
@@ -317,6 +322,14 @@ impl PerCpuUnsafe {
         self.ghcb
     }
 
+    pub fn hv_doorbell_unsafe(&self) -> *mut HVDoorbell {
+        self.hv_doorbell
+    }
+
+    pub fn hv_doorbell_addr(&self) -> usize {
+        ptr::addr_of!(self.hv_doorbell) as usize
+    }
+
     pub fn get_top_of_stack(&self) -> VirtAddr {
         self.init_stack.unwrap()
     }
@@ -451,6 +464,32 @@ impl PerCpu {
         }
     }
 
+    pub fn setup_hv_doorbell(&self) -> Result<(), SvsmError> {
+        let paddr = allocate_zeroed_page()?;
+        let ghcb = &mut current_ghcb();
+        if let Err(e) = HVDoorbell::init(paddr, ghcb) {
+            free_page(paddr);
+            return Err(e);
+        }
+
+        unsafe {
+            let cpu_unsafe = self.cpu_unsafe as *mut PerCpuUnsafe;
+            (*cpu_unsafe).hv_doorbell = paddr.as_mut_ptr::<HVDoorbell>();
+        }
+
+        Ok(())
+    }
+
+    pub fn configure_hv_doorbell(&self) -> Result<(), SvsmError> {
+        // #HV doorbell configuration is only required if this system will make
+        // use of restricted injection.
+        if hypervisor_ghcb_features().contains(GHCBHvFeatures::SEV_SNP_RESTR_INJ) {
+            self.setup_hv_doorbell()
+        } else {
+            Ok(())
+        }
+    }
+
     fn setup_tss(&mut self) {
         let double_fault_stack = unsafe { (*self.cpu_unsafe).get_top_of_df_stack() };
         self.tss.ist_stacks[IST_DF] = double_fault_stack;
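setup_hv_doorbell() frees the freshly allocated page if registration fails, so a failed initialization leaks nothing. A standalone sketch of the same allocate-then-unwind pattern, with stub functions standing in for allocate_zeroed_page(), free_page(), and HVDoorbell::init():

#[derive(Debug)]
struct Error;

fn allocate() -> Result<usize, Error> {
    Ok(0x1000) // stand-in for allocate_zeroed_page()
}

fn free(_addr: usize) {} // stand-in for free_page()

fn register(_addr: usize) -> Result<(), Error> {
    Err(Error) // pretend registration with the hypervisor failed
}

fn setup() -> Result<(), Error> {
    let addr = allocate()?;
    // If the fallible step fails, release the page before propagating the
    // error, mirroring the structure of setup_hv_doorbell().
    if let Err(e) = register(addr) {
        free(addr);
        return Err(e);
    }
    Ok(())
}

fn main() {
    assert!(setup().is_err());
}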
diff --git a/kernel/src/cpu/smp.rs b/kernel/src/cpu/smp.rs
index f3fb7a524..0907242c0 100644
--- a/kernel/src/cpu/smp.rs
+++ b/kernel/src/cpu/smp.rs
@@ -6,7 +6,7 @@
 
 use crate::acpi::tables::ACPICPUInfo;
 use crate::cpu::ghcb::current_ghcb;
-use crate::cpu::percpu::{this_cpu_mut, this_cpu_shared, PerCpu};
+use crate::cpu::percpu::{this_cpu, this_cpu_mut, this_cpu_shared, PerCpu};
 use crate::cpu::vmsa::init_svsm_vmsa;
 use crate::platform::SvsmPlatform;
 use crate::platform::SVSM_PLATFORM;
@@ -66,6 +66,11 @@ fn start_ap() {
         .setup_on_cpu(SVSM_PLATFORM.as_dyn_ref())
         .expect("setup_on_cpu() failed");
 
+    // Configure the #HV doorbell page as required.
+    this_cpu()
+        .configure_hv_doorbell()
+        .expect("configure_hv_doorbell() failed");
+
     this_cpu_mut()
         .setup_idle_task(ap_request_loop)
         .expect("Failed to allocate idle task for AP");
diff --git a/kernel/src/cpu/vc.rs b/kernel/src/cpu/vc.rs
index 169f2f679..461ab7485 100644
--- a/kernel/src/cpu/vc.rs
+++ b/kernel/src/cpu/vc.rs
@@ -121,7 +121,7 @@ pub fn stage2_handle_vc_exception(ctx: &mut X86ExceptionContext) -> Result<(), S
     Ok(())
 }
 
-pub fn handle_vc_exception(ctx: &mut X86ExceptionContext) -> Result<(), SvsmError> {
+pub fn handle_vc_exception(ctx: &mut X86ExceptionContext, vector: usize) -> Result<(), SvsmError> {
     let error_code = ctx.error_code;
 
     // To handle NAE events, we're supposed to reset the VALID_BITMAP field of
@@ -138,7 +138,7 @@ pub fn handle_vc_exception(ctx: &mut X86ExceptionContext) -> Result<(), SvsmErro
         // will cause either an exception via DB_VECTOR if the DEBUG_SWAP sev_feature is
         // clear, or a VC exception with an error code of X86_TRAP if set.
         (X86_TRAP, _) => {
-            handle_debug_exception(ctx, ctx.vector);
+            handle_debug_exception(ctx, vector);
             Ok(())
         }
         (SVM_EXIT_CPUID, Some(DecodedInsn::Cpuid)) => handle_cpuid(ctx),
diff --git a/kernel/src/mm/page_visibility.rs b/kernel/src/mm/page_visibility.rs
index a09367e24..07401afcc 100644
--- a/kernel/src/mm/page_visibility.rs
+++ b/kernel/src/mm/page_visibility.rs
@@ -6,7 +6,7 @@
 
 use crate::address::VirtAddr;
 use crate::cpu::flush_tlb_global_sync;
-use crate::cpu::percpu::this_cpu_mut;
+use crate::cpu::percpu::{this_cpu, this_cpu_mut};
 use crate::error::SvsmError;
 use crate::mm::validate::{
     valid_bitmap_clear_valid_4k, valid_bitmap_set_valid_4k, valid_bitmap_valid_addr,
@@ -35,7 +35,10 @@ pub fn make_page_shared(vaddr: VirtAddr) -> Result<(), SvsmError> {
     )?;
 
     // Update the page tables to map the page as shared.
-    this_cpu_mut().get_pgtable().set_shared_4k(vaddr)?;
+    this_cpu()
+        .get_pgtable()
+        .set_shared_4k(vaddr)
+        .expect("Failed to remap shared page in page tables");
     flush_tlb_global_sync();
 
     Ok(())
diff --git a/kernel/src/platform/mod.rs b/kernel/src/platform/mod.rs
index da080b0d5..d8248f26d 100644
--- a/kernel/src/platform/mod.rs
+++ b/kernel/src/platform/mod.rs
@@ -77,6 +77,9 @@ pub trait SvsmPlatform {
         region: MemoryRegion<PhysAddr>,
         op: PvalidateOp,
     ) -> Result<(), SvsmError>;
+
+    /// Perform an EOI of the current interrupt.
+    fn eoi(&self);
 }
 
 //FIXME - remove Copy trait
diff --git a/kernel/src/platform/native.rs b/kernel/src/platform/native.rs
index f47d8a637..694735bdc 100644
--- a/kernel/src/platform/native.rs
+++ b/kernel/src/platform/native.rs
@@ -76,4 +76,8 @@ impl SvsmPlatform for NativePlatform {
     ) -> Result<(), SvsmError> {
         Ok(())
     }
+
+    fn eoi(&self) {
+        todo!();
+    }
 }
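Every platform must now supply the new eoi() hook; the native platform stubs it with todo!() above. A hypothetical out-of-tree platform would implement it the same way (MockPlatform and the local Platform trait here are illustrative, not part of the SVSM source):

trait Platform {
    fn eoi(&self);
}

struct MockPlatform;

impl Platform for MockPlatform {
    fn eoi(&self) {
        // A real implementation signals end-of-interrupt to the platform's
        // interrupt controller, as SnpPlatform does below via a
        // GHCB-mediated write to the x2APIC EOI MSR.
        println!("EOI");
    }
}

fn main() {
    MockPlatform.eoi();
}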
diff --git a/kernel/src/platform/snp.rs b/kernel/src/platform/snp.rs
index b25b66a3b..59bc642d2 100644
--- a/kernel/src/platform/snp.rs
+++ b/kernel/src/platform/snp.rs
@@ -50,7 +50,8 @@ impl SvsmPlatform for SnpPlatform {
     }
 
     fn setup_percpu_current(&self, cpu: &mut PerCpu) -> Result<(), SvsmError> {
-        cpu.register_ghcb()
+        cpu.register_ghcb()?;
+        Ok(())
     }
 
     fn get_page_encryption_masks(&self, vtom: usize) -> PageEncryptionMasks {
@@ -112,4 +113,10 @@ impl SvsmPlatform for SnpPlatform {
     ) -> Result<(), SvsmError> {
         pvalidate_range(region, op)
     }
+
+    fn eoi(&self) {
+        // 0x80B is the x2APIC EOI MSR.
+        // Errors here cannot be handled but should not be grounds for panic.
+        let _ = current_ghcb().wrmsr(0x80B, 0);
+    }
 }
diff --git a/kernel/src/requests.rs b/kernel/src/requests.rs
index 2161c0948..935921482 100644
--- a/kernel/src/requests.rs
+++ b/kernel/src/requests.rs
@@ -5,12 +5,12 @@
 // Author: Joerg Roedel
 
 use crate::cpu::flush_tlb_global_sync;
-use crate::cpu::ghcb::current_ghcb;
 use crate::cpu::percpu::{process_requests, this_cpu, wait_for_requests};
 use crate::error::SvsmError;
 use crate::mm::GuestPtr;
 use crate::protocols::core::core_protocol_request;
 use crate::protocols::errors::{SvsmReqError, SvsmResultCode};
+use crate::sev::ghcb::switch_to_vmpl;
 
 #[cfg(all(feature = "mstpm", not(test)))]
 use crate::protocols::{vtpm::vtpm_protocol_request, SVSM_VTPM_PROTOCOL};
@@ -132,9 +132,7 @@ pub fn request_loop() {
 
             flush_tlb_global_sync();
 
-            current_ghcb()
-                .run_vmpl(GUEST_VMPL as u64)
-                .expect("Failed to run guest VMPL");
+            switch_to_vmpl(GUEST_VMPL as u32);
         } else {
             loop {
                 log::debug!("No VMSA or CAA! Halting");
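The eoi() implementation above relies on the GHCB wrmsr() convenience helper added in the ghcb.rs hunk below, which splits a 64-bit MSR value into the EAX/EDX halves that the WRMSR protocol expects. The split is plain masking and shifting:

fn main() {
    let value: u64 = 0xDEAD_BEEF_0123_4567; // illustrative value
    let eax = value & 0xFFFF_FFFF; // low half, as wrmsr() computes it
    let edx = value >> 32; // high half
    assert_eq!((edx << 32) | eax, value);
    // For the EOI case, wrmsr(0x80B, 0) yields eax = 0 and edx = 0.
    println!("eax={eax:#010x} edx={edx:#010x}");
}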
diff --git a/kernel/src/sev/ghcb.rs b/kernel/src/sev/ghcb.rs
index ae67a5b76..9a5fe4486 100644
--- a/kernel/src/sev/ghcb.rs
+++ b/kernel/src/sev/ghcb.rs
@@ -5,9 +5,9 @@
 // Author: Joerg Roedel
 
 use crate::address::{Address, PhysAddr, VirtAddr};
-use crate::cpu::flush_tlb_global_sync;
 use crate::cpu::msr::{write_msr, SEV_GHCB};
-use crate::cpu::X86GeneralRegs;
+use crate::cpu::percpu::this_cpu_unsafe;
+use crate::cpu::{flush_tlb_global_sync, X86GeneralRegs};
 use crate::error::SvsmError;
 use crate::mm::pagetable::get_init_pgtable_locked;
 use crate::mm::validate::{
@@ -15,10 +15,13 @@ use crate::mm::validate::{
 };
 use crate::mm::virt_to_phys;
 use crate::platform::PageStateChangeOp;
+use crate::sev::hv_doorbell::HVDoorbell;
 use crate::sev::sev_snp_enabled;
 use crate::sev::utils::raw_vmgexit;
 use crate::types::{PageSize, PAGE_SIZE_2M};
 use crate::utils::MemoryRegion;
+
+use core::arch::global_asm;
 use core::mem::{self, offset_of};
 use core::ptr;
@@ -115,7 +118,6 @@ impl From<GhcbError> for SvsmError {
     }
 }
 
-#[non_exhaustive]
 enum GHCBExitCode {}
 
 impl GHCBExitCode {
@@ -127,7 +129,7 @@ impl GHCBExitCode {
     pub const GUEST_REQUEST: u64 = 0x8000_0011;
     pub const GUEST_EXT_REQUEST: u64 = 0x8000_0012;
     pub const AP_CREATE: u64 = 0x80000013;
-    pub const RUN_VMPL: u64 = 0x80000018;
+    pub const HV_DOORBELL: u64 = 0x8000_0014;
 }
@@ -229,12 +231,20 @@ impl GHCB {
         Ok(())
     }
 
+    pub fn wrmsr(&mut self, msr_index: u32, value: u64) -> Result<(), SvsmError> {
+        self.wrmsr_raw(msr_index as u64, value & 0xFFFF_FFFF, value >> 32)
+    }
+
     pub fn wrmsr_regs(&mut self, regs: &X86GeneralRegs) -> Result<(), SvsmError> {
+        self.wrmsr_raw(regs.rcx as u64, regs.rax as u64, regs.rdx as u64)
+    }
+
+    pub fn wrmsr_raw(&mut self, rcx: u64, rax: u64, rdx: u64) -> Result<(), SvsmError> {
         self.clear();
 
-        self.set_rcx_valid(regs.rcx as u64);
-        self.set_rax_valid(regs.rax as u64);
-        self.set_rdx_valid(regs.rdx as u64);
+        self.set_rcx_valid(rcx);
+        self.set_rax_valid(rax);
+        self.set_rdx_valid(rdx);
 
         self.vmgexit(GHCBExitCode::MSR, 1, 0)?;
         Ok(())
@@ -519,6 +529,12 @@ impl GHCB {
         Ok(())
     }
 
+    pub fn register_hv_doorbell(&mut self, paddr: PhysAddr) -> Result<(), SvsmError> {
+        self.clear();
+        self.vmgexit(GHCBExitCode::HV_DOORBELL, 1, u64::from(paddr))?;
+        Ok(())
+    }
+
     pub fn guest_request(
         &mut self,
         req_page: VirtAddr,
@@ -568,14 +584,89 @@ impl GHCB {
 
         Ok(())
     }
-
-    pub fn run_vmpl(&mut self, vmpl: u64) -> Result<(), SvsmError> {
-        self.clear();
-        self.vmgexit(GHCBExitCode::RUN_VMPL, vmpl, 0)?;
-        Ok(())
-    }
 }
 
+extern "C" {
+    pub fn switch_to_vmpl_unsafe(hv_doorbell: *mut HVDoorbell, vmpl: u32) -> bool;
+}
+
+pub fn switch_to_vmpl(vmpl: u32) {
+    // The switch to a lower VMPL must be done with an assembly sequence in
+    // order to ensure that any #HV that occurs during the sequence will
+    // correctly block the VMPL switch so that events can be processed.
+    unsafe {
+        let cpu_unsafe = this_cpu_unsafe();
+        let hv_doorbell = (*cpu_unsafe).hv_doorbell_unsafe();
+
+        // Process any pending #HV events before leaving the SVSM. No event
+        // can cancel the request to enter the guest VMPL, so proceed with
+        // guest entry once events have been handled.
+        if !hv_doorbell.is_null() {
+            (*hv_doorbell).process_pending_events();
+        }
+        if !switch_to_vmpl_unsafe(hv_doorbell, vmpl) {
+            panic!("Failed to switch to VMPL {}", vmpl);
+        }
+    }
+}
+
+global_asm!(
+    r#"
+        .globl switch_to_vmpl_unsafe
+    switch_to_vmpl_unsafe:
+
+        /* Upon entry,
+         * rdi = pointer to the HV doorbell page
+         * esi = target VMPL
+         */
+        /* Check if NoFurtherSignal is set (bit 15 of the first word of the
+         * #HV doorbell page). If so, abort the transition. */
+        test %rdi, %rdi
+        jz switch_vmpl_proceed
+        testw $0x8000, (%rdi)
+
+        /* From this point until the vmgexit, if a #HV arrives, the #HV
+         * handler must prevent the VMPL transition. */
+        .globl switch_vmpl_window_start
+    switch_vmpl_window_start:
+        jnz switch_vmpl_cancel
+
+    switch_vmpl_proceed:
+        /* Use the MSR-based VMPL switch request to avoid any need to use
+         * the GHCB page. The Run VMPL request is 0x16 and the response is
+         * 0x17. */
+        movl $0x16, %eax
+        movl %esi, %edx
+        movl $0xC0010130, %ecx
+        wrmsr
+        rep; vmmcall
+
+        .globl switch_vmpl_window_end
+    switch_vmpl_window_end:
+        /* Verify that the request was honored. ECX still contains the MSR
+         * number. */
+        rdmsr
+        andl $0xFFF, %eax
+        cmpl $0x17, %eax
+        jz switch_vmpl_cancel
+        xorl %eax, %eax
+        ret
+
+        /* An aborted VMPL switch is treated as a successful switch. */
+        .globl switch_vmpl_cancel
+    switch_vmpl_cancel:
+        /* Process any pending events if NoFurtherSignal has been set. */
+        test %rdi, %rdi
+        jz no_pending_events
+        testw $0x8000, (%rdi)
+        jz no_pending_events
+        call process_hv_events
+    no_pending_events:
+        movl $1, %eax
+        ret
+    "#,
+    options(att_syntax)
+);
+
 #[cfg(test)]
 mod tests {
     use super::*;
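switch_to_vmpl_unsafe uses the GHCB MSR protocol rather than the GHCB page: the Run VMPL request code 0x16 occupies bits 11:0 of the value written to MSR 0xC001_0130 and the target VMPL occupies bits 39:32, which is why the assembly loads 0x16 into EAX and the VMPL into EDX before the WRMSR. A small sketch of the encoding:

fn main() {
    let vmpl: u64 = 2; // target VMPL, illustrative
    let eax: u64 = 0x16; // Run VMPL request code (the response is 0x17)
    let edx: u64 = vmpl; // lands at bits 39:32 of the MSR value
    let msr_value = (edx << 32) | eax;
    assert_eq!(msr_value & 0xFFF, 0x16); // request code field
    assert_eq!((msr_value >> 32) & 0xFF, vmpl); // VMPL field
    println!("GHCB MSR value: {msr_value:#018x}");
}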
diff --git a/kernel/src/sev/hv_doorbell.rs b/kernel/src/sev/hv_doorbell.rs
new file mode 100644
index 000000000..c62146215
--- /dev/null
+++ b/kernel/src/sev/hv_doorbell.rs
@@ -0,0 +1,89 @@
+// SPDX-License-Identifier: MIT OR Apache-2.0
+//
+// Copyright (c) Microsoft Corporation
+//
+// Author: Jon Lange (jlange@microsoft.com)
+
+use crate::address::VirtAddr;
+use crate::cpu::idt::svsm::common_isr_handler;
+use crate::error::SvsmError;
+use crate::mm::page_visibility::{make_page_private, make_page_shared};
+use crate::mm::virt_to_phys;
+use crate::sev::ghcb::GHCB;
+
+use bitfield_struct::bitfield;
+use core::sync::atomic::{AtomicU8, Ordering};
+
+#[bitfield(u8)]
+pub struct HVDoorbellFlags {
+    pub nmi_pending: bool,
+    pub mc_pending: bool,
+    #[bits(5)]
+    rsvd_6_2: u8,
+    pub no_further_signal: bool,
+}
+
+#[repr(C)]
+#[derive(Debug)]
+pub struct HVDoorbell {
+    pub vector: AtomicU8,
+    pub flags: AtomicU8,
+    pub no_eoi_required: AtomicU8,
+    reserved: u8,
+}
+
+impl HVDoorbell {
+    pub fn init(vaddr: VirtAddr, ghcb: &mut GHCB) -> Result<(), SvsmError> {
+        // The #HV doorbell page must be shared with the hypervisor before
+        // it can be used.
+        make_page_shared(vaddr)?;
+
+        // Register the #HV doorbell page using the GHCB protocol.
+        let paddr = virt_to_phys(vaddr);
+        ghcb.register_hv_doorbell(paddr).map_err(|e| {
+            // Return the page to a private state.
+            make_page_private(vaddr).expect("Failed to restore page visibility");
+            e
+        })?;
+
+        Ok(())
+    }
+
+    pub fn process_pending_events(&self) {
+        // Clear the NoFurtherSignal bit before processing. If any additional
+        // signal comes in after processing has commenced, it may be missed by
+        // this loop, but it will be detected when interrupts are processed
+        // again. Also clear the NMI bit, since NMIs are not expected.
+        let no_further_signal_mask: u8 = HVDoorbellFlags::new()
+            .with_no_further_signal(true)
+            .with_nmi_pending(true)
+            .into();
+        let flags = HVDoorbellFlags::from(
+            self.flags
+                .fetch_and(!no_further_signal_mask, Ordering::Relaxed),
+        );
+
+        // #MC handling is not possible, so panic if a machine check has
+        // occurred.
+        if flags.mc_pending() {
+            panic!("#MC exception delivered via #HV");
+        }
+
+        // Consume interrupts as long as they are available.
+        loop {
+            // Consume any interrupt that may be present.
+            let vector = self.vector.swap(0, Ordering::Relaxed);
+            if vector == 0 {
+                break;
+            }
+            common_isr_handler(vector as usize);
+        }
+    }
+}
+
+/// # Safety
+/// This function takes a raw pointer to the #HV doorbell page because it is
+/// called directly from assembly, and should not be invoked directly from
+/// Rust code.
+#[no_mangle]
+pub unsafe extern "C" fn process_hv_events(hv_doorbell: *mut HVDoorbell) {
+    unsafe {
+        (*hv_doorbell).process_pending_events();
+    }
+}
diff --git a/kernel/src/sev/mod.rs b/kernel/src/sev/mod.rs
index a3ef7bfab..02d3c50ac 100644
--- a/kernel/src/sev/mod.rs
+++ b/kernel/src/sev/mod.rs
@@ -5,6 +5,7 @@
 // Author: Joerg Roedel
 
 pub mod ghcb;
+pub mod hv_doorbell;
 pub mod msr_protocol;
 pub mod secrets_page;
 pub mod status;
diff --git a/kernel/src/svsm.rs b/kernel/src/svsm.rs
index b1465e2b7..486d43d1b 100755
--- a/kernel/src/svsm.rs
+++ b/kernel/src/svsm.rs
@@ -397,6 +397,10 @@ pub extern "C" fn svsm_main() {
 
     init_hypervisor_ghcb_features().expect("Failed to obtain hypervisor GHCB features");
 
+    this_cpu()
+        .configure_hv_doorbell()
+        .expect("Failed to configure #HV doorbell");
+
     let launch_info = &*LAUNCH_INFO;
     let config = if launch_info.igvm_params_virt_addr != 0 {
         let igvm_params = IgvmParams::new(VirtAddr::from(launch_info.igvm_params_virt_addr))
diff --git a/kernel/src/task/tasks.rs b/kernel/src/task/tasks.rs
index fa5b86d17..edd87f570 100644
--- a/kernel/src/task/tasks.rs
+++ b/kernel/src/task/tasks.rs
@@ -366,11 +366,12 @@ impl Task {
 
         // 'Push' the task frame onto the stack
         unsafe {
-            // Setup IRQ return frame
+            // Setup IRQ return frame. User-mode tasks always run with
+            // interrupts enabled.
             let mut iret_frame = X86ExceptionContext::default();
             iret_frame.frame.rip = user_entry;
             iret_frame.frame.cs = (SVSM_USER_CS | 3).into();
-            iret_frame.frame.flags = 0;
+            iret_frame.frame.flags = 0x202;
             iret_frame.frame.rsp = (USER_MEM_END - 8).into();
             iret_frame.frame.ss = (SVSM_USER_DS | 3).into();
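A closing note on the doorbell page layout introduced above: vector is byte 0 of the page and flags is byte 1, so no_further_signal (bit 7 of the flags byte) lands at bit 15 of the first 16-bit word. That is exactly the bit the entry and VMPL-switch assembly probe with testw $0x8000, (%rdi). A sketch of the correspondence on a little-endian x86 layout:

fn main() {
    // First two bytes of the doorbell page: [vector, flags].
    let vector: u8 = 0;
    let flags: u8 = 0x80; // no_further_signal = bit 7 of the flags byte
    let first_word = u16::from_le_bytes([vector, flags]);
    // Matches the asm check: testw $0x8000, (%rdi)
    assert_eq!(first_word & 0x8000, 0x8000);
    println!("first word = {first_word:#06x}");
}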