
Reading the arceos hypervisor-x86 source code

Source code: https://github.com/arceos-hypervisor/arceos/tree/hypervisor-x86

apps/hv/src/main.rs

The overall flow of creating and running a virtual machine:

fn main(hart_id: usize) {
    println!("Hello, hv!");
    #[cfg(target_arch = "x86_64")]
    {
        println!("into main {}", hart_id);

        // Step 1: enable VMX
        let mut p = PerCpu::<HyperCraftHalImpl>::new(hart_id);
        p.hardware_enable().unwrap();

        // Step 2: set up the page tables
        let gpm = x64::setup_gpm(hart_id).unwrap();
        let npt = gpm.nest_page_table_root(); // physical address of the nested page-table root
        info!("{:#x?}", gpm);

        // Step 3: create the vCPUs the VM needs
        let mut vcpus = VmCpus::<HyperCraftHalImpl, X64VcpuDevices<HyperCraftHalImpl>>::new();
        vcpus.add_vcpu(VCpu::new(0, p.vmcs_revision_id(), 0x7c00, npt).unwrap());

        let mut vm = VM::<HyperCraftHalImpl, X64VcpuDevices<HyperCraftHalImpl>, X64VmDevices<HyperCraftHalImpl>>::new(vcpus);
        vm.bind_vcpu(0);

        if hart_id == 0 { // executed on the primary CPU only
            let (_, dev) = vm.get_vcpu_and_device(0).unwrap();
            *(dev.console.lock().backend()) = device::device_emu::MultiplexConsoleBackend::Primary;

            for v in 0..256 {
                libax::hv::set_host_irq_enabled(v, true);
            }
        }

        println!("Running guest...");
        println!("{:?}", vm.run_vcpu(0));

        p.hardware_disable().unwrap();

        panic!("done");

        return;
    }
}

Step 1: Enable VMX

First, the data structures behind PerCpu:

// arceos/crates/hypercraft/src/arch/x86_64/percpu.rs
/// Host per-CPU states to run the guest. All methods must be called on the corresponding CPU.
pub struct PerCpu<H: HyperCraftHal> {
    cpu_id: usize, // physical CPU id
    arch: VmxPerCpuState<H>,
}

// arceos/crates/hypercraft/src/arch/x86_64/vmx/percpu.rs
/// State per vmx physical cpu.
pub struct VmxPerCpuState<H: HyperCraftHal> {
    pub(super) vmcs_revision_id: u32,
    vmx_region: VmxRegion<H>,
}

// arceos/crates/hypercraft/src/arch/x86_64/vmx/region.rs
/// VMCS/VMXON region in 4K size. (SDM Vol. 3C, Section 24.2)
#[derive(Debug)]
pub struct VmxRegion<H: HyperCraftHal> {
    frame: PhysFrame<H>, // one 4 KiB frame
}

A PerCpu instance is bound to one logical CPU and is responsible for operations such as enabling and disabling VMX on that CPU.
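The `is_enabled` check in the impl below boils down to reading the VMXE bit of CR4. A minimal, repo-independent sketch of that check (inline assembly here is illustrative; the repo goes through `VmxPerCpuState` instead):

// Sketch: bit 13 of CR4 is VMXE; it must be set before VMXON can execute.
// Reading CR4 is privileged, so this only works in ring 0 (as the hypervisor does).
fn cr4_vmxe_set() -> bool {
    let cr4: u64;
    unsafe { core::arch::asm!("mov {}, cr4", out(reg) cr4, options(nomem, nostack)) };
    (cr4 >> 13) & 1 == 1
}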

impl<H: HyperCraftHal> PerCpu<H> {
    /// Create an uninitialized instance.
    pub fn new(cpu_id: usize) -> Self {
        Self {
            cpu_id: cpu_id,              // physical CPU id
            arch: VmxPerCpuState::new(), // vmcs_revision_id is 0, VmxRegion not yet allocated
        }
    }

    /// Whether the current CPU has hardware virtualization enabled.
    pub fn is_enabled(&self) -> bool {
        self.arch.is_enabled() // checks whether the VMXE bit of CR4 is set
    }

    /// Enable hardware virtualization on the current CPU.
    pub fn hardware_enable(&mut self) -> HyperResult {
        match self.arch.hardware_enable() {
            Ok(_) => {
                info!("VMX enabled on cpu {}.", self.cpu_id);
                Ok(())
            },
            e @ Err(_) => {
                e
            }
        }
    }

    /// Disable hardware virtualization on the current CPU.
    pub fn hardware_disable(&mut self) -> HyperResult {
        match self.arch.hardware_disable() {
            Ok(_) => {
                info!("VMX disabled on cpu {}.", self.cpu_id);
                Ok(())
            },
            e @ Err(_) => {
                e
            }
        }
    }

    /// Get current vmcs revision id.
    pub fn vmcs_revision_id(&self) -> u32 {
        self.arch.vmcs_revision_id()
    }
}

impl<H: HyperCraftHal> Drop for PerCpu<H> {
    fn drop(&mut self) {
        if self.is_enabled() {
            self.hardware_disable().unwrap();
        }
    }
}
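`hardware_enable` ultimately sets CR4.VMXE and executes VMXON on the 4 KiB `vmx_region`. Per the SDM, that region must start with the VMCS revision identifier read from the IA32_VMX_BASIC MSR, which is the same `vmcs_revision_id` later handed to `VCpu::new` in main. A sketch of that preparation (illustrative only; the repo wraps it in `VmxRegion`/`PhysFrame`):

// Read the VMCS revision identifier (bits 30:0 of IA32_VMX_BASIC, MSR 0x480)
// and store it in the first 4 bytes of the VMXON region, as required before VMXON.
const IA32_VMX_BASIC: u32 = 0x480;

unsafe fn init_vmxon_region(region: *mut u32) {
    let (lo, hi): (u32, u32);
    core::arch::asm!("rdmsr", in("ecx") IA32_VMX_BASIC, out("eax") lo, out("edx") hi);
    let _ = hi; // the upper 32 bits of IA32_VMX_BASIC are not needed here
    region.write_volatile(lo & 0x7fff_ffff); // bits 30:0 = VMCS revision identifier
}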

Step 2: Set up the page tables

Only two guest VMs need to be supported for now, so guest memory is statically allocated for exactly two VMs.

#[repr(align(4096))]
pub(super) struct AlignedMemory<const LEN: usize>([u8; LEN]);

// Physical memory for the two guest VMs, each of size GUEST_PHYS_MEMORY_SIZE (16 MiB)
pub(super) static mut GUEST_PHYS_MEMORY: [AlignedMemory<GUEST_PHYS_MEMORY_SIZE>; 2] =
    [AlignedMemory([0; GUEST_PHYS_MEMORY_SIZE]), AlignedMemory([0; GUEST_PHYS_MEMORY_SIZE])];

// Get the host virtual address corresponding to guest physical address `guest_paddr` of VM `id`
fn gpa_as_mut_ptr(id: usize, guest_paddr: GuestPhysAddr) -> *mut u8 {
    let offset = unsafe { &(GUEST_PHYS_MEMORY[id]) as *const _ as usize };
    let host_vaddr = guest_paddr + offset;
    host_vaddr as *mut u8
}

// Copy the image at host physical address `hpa` (length `size`) to guest physical address `load_gpa` of VM `id`
#[cfg(target_arch = "x86_64")]
fn load_guest_image(id: usize, hpa: HostPhysAddr, load_gpa: GuestPhysAddr, size: usize) {
    let image_ptr = usize::from(phys_to_virt(hpa.into())) as *const u8; // host physical address -> host virtual address
    let image = unsafe { core::slice::from_raw_parts(image_ptr, size) };

    trace!("loading to guest memory: host {:#x} to guest {:#x}, size {:#x}", image_ptr as usize, load_gpa, size);

    unsafe {
        core::slice::from_raw_parts_mut(gpa_as_mut_ptr(id, load_gpa), size).copy_from_slice(image)
    }
}

#[cfg(target_arch = "x86_64")]
pub fn setup_gpm(id: usize) -> HyperResult<GuestPhysMemorySet> {
    // copy BIOS and guest images

    load_guest_image(id, BIOS_PADDR, BIOS_ENTRY, BIOS_SIZE);
    #[cfg(feature = "guest_nimbos")]
    {
        load_guest_image(id, GUEST_IMAGE_PADDR, GUEST_ENTRY, GUEST_IMAGE_SIZE);
    }

    // create nested page table and add mapping
    let mut gpm = GuestPhysMemorySet::new()?;
    let guest_memory_regions = [
        GuestMemoryRegion {
            // Low RAM
            gpa: GUEST_PHYS_MEMORY_BASE,
            hpa: virt_to_phys((gpa_as_mut_ptr(id, GUEST_PHYS_MEMORY_BASE) as HostVirtAddr).into()).into(),
            size: GUEST_PHYS_MEMORY_SIZE,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::EXECUTE,
        },
        #[cfg(feature = "guest_linux")]
        GuestMemoryRegion {
            // Low RAM2
            gpa: 0x100_0000,
            hpa: 0x6100_0000,
            size: 0xf00_0000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::EXECUTE,
        },
        #[cfg(feature = "guest_linux")]
        GuestMemoryRegion {
            // RAM
            gpa: 0x7000_0000,
            hpa: 0x7000_0000,
            size: 0x1000_0000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::EXECUTE,
        },
        GuestMemoryRegion {
            // PCI
            gpa: 0x8000_0000,
            hpa: 0x8000_0000,
            size: 0x1000_0000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::DEVICE,
        },
        GuestMemoryRegion {
            gpa: 0xfe00_0000,
            hpa: 0xfe00_0000,
            size: 0x1_0000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::DEVICE,
        },
        GuestMemoryRegion {
            gpa: 0xfeb0_0000,
            hpa: 0xfeb0_0000,
            size: 0x10_0000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::DEVICE,
        },
        GuestMemoryRegion {
            // IO APIC
            gpa: 0xfec0_0000,
            hpa: 0xfec0_0000,
            size: 0x1000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::DEVICE,
        },
        GuestMemoryRegion {
            // HPET
            gpa: 0xfed0_0000,
            hpa: 0xfed0_0000,
            size: 0x1000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::DEVICE,
        },
        GuestMemoryRegion {
            // Local APIC
            gpa: 0xfee0_0000,
            hpa: 0xfee0_0000,
            size: 0x1000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::DEVICE,
        },
    ];
    for r in guest_memory_regions.into_iter() {
        gpm.map_region(r.into())?; // install the guest memory mapping
    }
    Ok(gpm)
}
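Each `GuestMemoryRegion` describes a contiguous guest-physical range and the host-physical range backing it; `map_region` installs these mappings into the nested (EPT) page table whose root is later passed to `VCpu::new`. A simplified, self-contained sketch of the address arithmetic such a region list encodes (types here are illustrative, not the repo's):

// Translate a guest physical address to the host physical address backing it,
// using a flat list of regions like the one built in setup_gpm.
struct Region {
    gpa: usize,
    hpa: usize,
    size: usize,
}

fn gpa_to_hpa(regions: &[Region], gpa: usize) -> Option<usize> {
    regions
        .iter()
        .find(|r| gpa >= r.gpa && gpa < r.gpa + r.size)
        .map(|r| r.hpa + (gpa - r.gpa))
}

At run time this translation is performed by the EPT hardware; the region list only determines which mappings setup_gpm installs.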

Step 3: Create the vCPUs needed by the VM

First, the data structures behind VmCpus.

/// The maximum number of CPUs we can support.
pub const MAX_CPUS: usize = 8;

pub const VM_CPUS_MAX: usize = MAX_CPUS;

/// The set of vCPUs in a VM.
#[derive(Default)]
pub struct VmCpus<H: HyperCraftHal, PD: PerCpuDevices<H>> {
    inner: [Once<VCpu<H>>; VM_CPUS_MAX], // for an introduction to Once, see https://zhuanlan.zhihu.com/p/272848155
    device: [Once<PD>; VM_CPUS_MAX],
}

impl<H: HyperCraftHal, PD: PerCpuDevices<H>> VmCpus<H, PD> {
    /// Creates a new vCPU tracking structure.
    pub fn new() -> Self {
        Self {
            inner: [Once::INIT; VM_CPUS_MAX],
            device: [Once::INIT; VM_CPUS_MAX],
        }
    }

    /// Adds the given vCPU to the set of vCPUs.
    pub fn add_vcpu(&mut self, vcpu: VCpu<H>) -> HyperResult<()> {
        let vcpu_id = vcpu.vcpu_id();
        let once_entry = self.inner.get(vcpu_id).ok_or(HyperError::BadState)?;

        let real_vcpu = once_entry.call_once(|| vcpu); // initialize the vCPU in slot vcpu_id
        let device_once_entry = self.device.get(vcpu_id).ok_or(HyperError::BadState)?;

        device_once_entry.call_once(|| PD::new(real_vcpu).unwrap()); // initialize the per-vCPU devices for slot vcpu_id

        Ok(())
    }

    /// Returns a reference to the vCPU with `vcpu_id` if it exists.
    pub fn get_vcpu_and_device(&mut self, vcpu_id: usize) -> HyperResult<(&mut VCpu<H>, &mut PD)> {
        let vcpu = self
            .inner
            .get_mut(vcpu_id)
            .and_then(|once| once.get_mut())
            .ok_or(HyperError::NotFound)?;
        let device = self
            .device
            .get_mut(vcpu_id)
            .and_then(|once| once.get_mut())
            .ok_or(HyperError::NotFound)?;
        Ok((vcpu, device))
    }
}

// Safety: Each VCpu is wrapped with a Mutex to provide safe concurrent access to VCpu.
unsafe impl<H: HyperCraftHal, PD: PerCpuDevices<H>> Sync for VmCpus<H, PD> {}
unsafe impl<H: HyperCraftHal, PD: PerCpuDevices<H>> Send for VmCpus<H, PD> {}
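The `Once` cells (from the `spin` crate) guarantee that each vCPU slot and its device set are initialized at most once; later calls simply observe the stored value. A minimal standalone usage example (not from the repo):

use spin::Once;

static SLOT: Once<u32> = Once::new();

fn demo() {
    // The closure runs only on the first call; the second call ignores its
    // closure and returns the value that is already stored.
    let first = SLOT.call_once(|| 1);
    let second = SLOT.call_once(|| 2);
    assert_eq!((*first, *second), (1, 1));
}

On x86, the `VCpu` stored in those slots is the `VmxVcpu` shown next.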
/// A virtual CPU within a guest.
#[repr(C)]
pub struct VmxVcpu<H: HyperCraftHal> {
    // DO NOT modify `guest_regs` and `host_stack_top` and their order unless you do know what you are doing!
    // DO NOT add anything before or between them unless you do know what you are doing!
    guest_regs: GeneralRegisters, // guest general-purpose registers
    host_stack_top: u64,
    vcpu_id: usize, // vCPU id; only a single core is supported for now, so it must be 0
    launched: bool,
    vmcs: VmxRegion<H>,
    msr_bitmap: MsrBitmap<H>,
    pending_events: VecDeque<(u8, Option<u32>)>,
    xstate: XState,
}

impl<H: HyperCraftHal> VmxVcpu<H> {
    /// Create a new [`VmxVcpu`].
    pub fn new(
        vcpu_id: usize,
        vmcs_revision_id: u32,
        entry: GuestPhysAddr,
        ept_root: HostPhysAddr,
    ) -> HyperResult<Self> {
        XState::enable_xsave();
        let mut vcpu = Self {
            guest_regs: GeneralRegisters::default(),
            host_stack_top: 0,
            vcpu_id,
            launched: false,
            vmcs: VmxRegion::new(vmcs_revision_id, false)?,
            msr_bitmap: MsrBitmap::passthrough_all()?,
            pending_events: VecDeque::with_capacity(8),
            xstate: XState::new(),
        };
        vcpu.setup_msr_bitmap()?; // set up msr_bitmap so the VMM intercepts IA32_APIC_BASE and all x2APIC MSR accesses
        vcpu.setup_vmcs(entry, ept_root)?; // set up the VMCS
        info!("[HV] created VmxVcpu(vmcs: {:#x})", vcpu.vmcs.phys_addr());
        Ok(vcpu)
    }
}
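`setup_msr_bitmap` starts from `MsrBitmap::passthrough_all` and then marks the MSRs that must trap (IA32_APIC_BASE and the x2APIC range). The 4 KiB bitmap layout is fixed by the SDM: a 1 KiB read bitmap for low MSRs (0x0000_0000..=0x0000_1FFF), a 1 KiB read bitmap for high MSRs (0xC000_0000..=0xC000_1FFF), followed by the two corresponding write bitmaps. A sketch of setting a read-intercept bit in that layout (not the repo's `MsrBitmap` API):

// Mark MSR `msr` so that a guest RDMSR of it causes a VM exit, following the
// SDM's MSR-bitmap layout. Write intercepts live in the second half of the page.
// Callers are expected to pass only MSRs in the two architecturally defined ranges.
fn set_read_intercept(bitmap: &mut [u8; 4096], msr: u32) {
    let (base, idx) = if msr < 0x2000 {
        (0usize, msr as usize)                    // low-MSR read bitmap
    } else {
        (1024usize, (msr - 0xc000_0000) as usize) // high-MSR read bitmap
    };
    bitmap[base + idx / 8] |= 1 << (idx % 8);
}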

Step 4: Add the newly created vCPU to the VM

/// VM define.
pub struct VM<H: HyperCraftHal, PD: PerCpuDevices<H>, VD: PerVmDevices<H>> {
    vcpus: VmCpus<H, PD>, // the vCPUs owned by this VM
    vcpu_bond: BitSet,    // bitmap indexed by vcpu_id; marks the vCPUs currently bound to a physical CPU
    device: VD,           // the devices owned by this VM
}

impl<H: HyperCraftHal, PD: PerCpuDevices<H>, VD: PerVmDevices<H>> VM<H, PD, VD> {
    /// Create a new [`VM`].
    pub fn new(vcpus: VmCpus<H, PD>) -> Self {
        Self { vcpus, vcpu_bond: BitSet::new(), device: VD::new().unwrap() }
    }

    /// Bind the specified [`VCpu`] to current physical processor.
    pub fn bind_vcpu(&mut self, vcpu_id: usize) -> HyperResult<(&mut VCpu<H>, &mut PD)> {
        if self.vcpu_bond.contains(vcpu_id) {
            Err(HyperError::InvalidParam) // already bound
        } else {
            match self.vcpus.get_vcpu_and_device(vcpu_id) {
                Ok((vcpu, device)) => {
                    self.vcpu_bond.insert(vcpu_id); // set bit vcpu_id
                    vcpu.bind_to_current_processor()?; // bind to the current physical CPU via vmptrld
                    Ok((vcpu, device))
                },
                e @ Err(_) => e,
            }
        }
    }
}
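`bind_vcpu` does two things: it records in `vcpu_bond` that the vCPU is taken, and it executes `vmptrld` (via `bind_to_current_processor`) to make this vCPU's VMCS the current VMCS on the physical CPU. The bookkeeping half in isolation (illustrative names, not the repo's `BitSet`):

// A reduced model of the vcpu_bond bitmap: bit i set means vCPU i is already
// bound to some physical CPU and must not be bound again.
struct VcpuBond(u64);

impl VcpuBond {
    fn try_bind(&mut self, vcpu_id: usize) -> Result<(), ()> {
        if self.0 & (1 << vcpu_id) != 0 {
            Err(()) // already bound, like HyperError::InvalidParam above
        } else {
            self.0 |= 1 << vcpu_id;
            Ok(())
        }
    }
}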

Step 5: Run the VM

impl<H: HyperCraftHal, PD: PerCpuDevices<H>, VD: PerVmDevices<H>> VM<H, PD, VD> {
    #[allow(unreachable_code)]
    /// Run a specified [`VCpu`] on current logical vcpu.
    pub fn run_vcpu(&mut self, vcpu_id: usize) -> HyperResult {
        let (vcpu, vcpu_device) = self.vcpus.get_vcpu_and_device(vcpu_id).unwrap();

        loop {
            if let Some(exit_info) = vcpu.run() {
                // we need to handle this vm-exit ourselves
                let result = vcpu_device.vmexit_handler(vcpu, &exit_info)
                    .or_else(|| self.device.vmexit_handler(vcpu, &exit_info));

                match result {
                    Some(result) => {
                        if result.is_err() {
                            panic!("VM failed to handle a vm-exit: {:?}, error {:?}, vcpu: {:#x?}", exit_info.exit_reason, result.unwrap_err(), vcpu);
                        }
                    },
                    None => {
                        panic!("nobody wants to handle this vm-exit: {:?}, vcpu: {:#x?}", exit_info, vcpu);
                    },
                }
            }

            vcpu_device.check_events(vcpu)?;
        }

        Ok(())
    }
}
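`vcpu.run()` (shown next) returns `Some(exit_info)` only for VM exits it cannot handle internally. The dispatch above then gives the per-vCPU devices the first chance and falls back to the per-VM devices; a handler claims an exit by returning `Some(result)` and declines by returning `None`. The pattern in isolation (placeholder closure types, not the repo's API):

// Two-level dispatch via Option::or_else: the second handler runs only if the
// first one declined the exit by returning None.
fn dispatch(
    per_vcpu: impl FnOnce() -> Option<Result<(), ()>>,
    per_vm: impl FnOnce() -> Option<Result<(), ()>>,
) {
    match per_vcpu().or_else(per_vm) {
        Some(Ok(())) => {}                                  // handled, keep running
        Some(Err(e)) => panic!("handler failed: {:?}", e),  // handled but failed
        None => panic!("nobody wants to handle this exit"), // unclaimed exit
    }
}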

impl<H: HyperCraftHal> VmxVcpu<H> {
    /// Run the guest. It returns when a vm-exit happens and returns the vm-exit if it cannot be handled by this [`VmxVcpu`] itself.
    pub fn run(&mut self) -> Option<VmxExitInfo> {
        // Inject pending events
        if self.launched {
            self.inject_pending_events().unwrap();
        }

        // Run guest
        self.load_guest_xstate();
        unsafe {
            if self.launched {
                self.vmx_resume();
            } else {
                self.launched = true;
                VmcsHostNW::RSP.write(&self.host_stack_top as *const _ as usize).unwrap();

                self.vmx_launch();
            }
        }
        self.load_host_xstate();

        // Handle vm-exits
        let exit_info = self.exit_info().unwrap();
        trace!("VM exit: {:#x?}", exit_info);

        let cr4 = VmcsGuestNW::CR4.read().unwrap();
        if cr4.get_bit(18) {
            // panic!("osxsave dead!");
        }

        match self.builtin_vmexit_handler(&exit_info) {
            Some(result) => {
                if result.is_err() {
                    panic!("VmxVcpu failed to handle a VM-exit that should be handled by itself: {:?}, error {:?}, vcpu: {:#x?}", exit_info.exit_reason, result.unwrap_err(), self);
                }

                None
            },
            None => Some(exit_info),
        }
    }
}
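The `launched` flag implements the VMX rule that the very first VM entry on a VMCS must use VMLAUNCH and every later entry must use VMRESUME. Reduced to its essence (illustrative names, not the repo's):

// Decide which VM-entry instruction the next entry must use, flipping the flag
// on the first entry exactly as VmxVcpu::run does above.
enum VmEntry {
    Launch, // first entry on this VMCS -> VMLAUNCH
    Resume, // subsequent entries -> VMRESUME
}

fn next_entry(launched: &mut bool) -> VmEntry {
    if *launched {
        VmEntry::Resume
    } else {
        *launched = true;
        VmEntry::Launch
    }
}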