
Reading the arceos hypervisor-x86 source code

Source code: https://github.com/arceos-hypervisor/arceos/tree/hypervisor-x86

apps/hv/src/main.rs

The overall flow of creating and running a virtual machine:

fn main(hart_id: usize) {
    println!("Hello, hv!");
    #[cfg(target_arch = "x86_64")]
    {
        println!("into main {}", hart_id);

        // Step 1: enable VMX
        let mut p = PerCpu::<HyperCraftHalImpl>::new(hart_id);
        p.hardware_enable().unwrap();

        // Step 2: set up the page tables
        let gpm = x64::setup_gpm(hart_id).unwrap();
        let npt = gpm.nest_page_table_root(); // physical address of the nested page-table root
        info!("{:#x?}", gpm);

        // Step 3: create the vCPUs the VM needs
        let mut vcpus = VmCpus::<HyperCraftHalImpl, X64VcpuDevices<HyperCraftHalImpl>>::new();
        vcpus.add_vcpu(VCpu::new(0, p.vmcs_revision_id(), 0x7c00, npt).unwrap());

        let mut vm = VM::<HyperCraftHalImpl, X64VcpuDevices<HyperCraftHalImpl>, X64VmDevices<HyperCraftHalImpl>>::new(vcpus);
        vm.bind_vcpu(0);

        if hart_id == 0 { // executed on the primary CPU only
            let (_, dev) = vm.get_vcpu_and_device(0).unwrap();
            *(dev.console.lock().backend()) = device::device_emu::MultiplexConsoleBackend::Primary;

            for v in 0..256 {
                libax::hv::set_host_irq_enabled(v, true);
            }
        }

        println!("Running guest...");
        println!("{:?}", vm.run_vcpu(0));

        p.hardware_disable().unwrap();

        panic!("done");

        return;
    }
}

Step 1: Enable VMX

First, the data structures behind PerCpu:

// arceos/crates/hypercraft/src/arch/x86_64/percpu.rs
/// Host per-CPU states to run the guest. All methods must be called on the corresponding CPU.
pub struct PerCpu<H: HyperCraftHal> {
    cpu_id: usize, // physical CPU id
    arch: VmxPerCpuState<H>,
}

// arceos/crates/hypercraft/src/arch/x86_64/vmx/percpu.rs
/// State per vmx physical cpu.
pub struct VmxPerCpuState<H: HyperCraftHal> {
    pub(super) vmcs_revision_id: u32,
    vmx_region: VmxRegion<H>,
}

// arceos/crates/hypercraft/src/arch/x86_64/vmx/region.rs
/// VMCS/VMXON region in 4K size. (SDM Vol. 3C, Section 24.2)
#[derive(Debug)]
pub struct VmxRegion<H: HyperCraftHal> {
    frame: PhysFrame<H>, // one 4 KiB frame
}

A PerCpu instance is bound to one logical CPU and is responsible for operations such as enabling and disabling VMX on that CPU.
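The `is_enabled` check in the impl below boils down to reading the VMXE bit of CR4. A minimal, repo-independent sketch of that check (inline assembly here is illustrative; the repo goes through `VmxPerCpuState` instead):

// Sketch: bit 13 of CR4 is VMXE; it must be set before VMXON can execute.
// Reading CR4 is privileged, so this only works in ring 0 (as the hypervisor does).
fn cr4_vmxe_set() -> bool {
    let cr4: u64;
    unsafe { core::arch::asm!("mov {}, cr4", out(reg) cr4, options(nomem, nostack)) };
    (cr4 >> 13) & 1 == 1
}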

impl<H: HyperCraftHal> PerCpu<H> {
    /// Create an uninitialized instance.
    pub fn new(cpu_id: usize) -> Self {
        Self {
            cpu_id: cpu_id,              // physical CPU id
            arch: VmxPerCpuState::new(), // vmcs_revision_id is 0, VmxRegion not yet allocated
        }
    }

    /// Whether the current CPU has hardware virtualization enabled.
    pub fn is_enabled(&self) -> bool {
        self.arch.is_enabled() // checks whether the VMXE bit of CR4 is set
    }

    /// Enable hardware virtualization on the current CPU.
    pub fn hardware_enable(&mut self) -> HyperResult {
        match self.arch.hardware_enable() {
            Ok(_) => {
                info!("VMX enabled on cpu {}.", self.cpu_id);
                Ok(())
            },
            e @ Err(_) => {
                e
            }
        }
    }

    /// Disable hardware virtualization on the current CPU.
    pub fn hardware_disable(&mut self) -> HyperResult {
        match self.arch.hardware_disable() {
            Ok(_) => {
                info!("VMX disabled on cpu {}.", self.cpu_id);
                Ok(())
            },
            e @ Err(_) => {
                e
            }
        }
    }

    /// Get current vmcs revision id.
    pub fn vmcs_revision_id(&self) -> u32 {
        self.arch.vmcs_revision_id()
    }
}

impl<H: HyperCraftHal> Drop for PerCpu<H> {
    fn drop(&mut self) {
        if self.is_enabled() {
            self.hardware_disable().unwrap();
        }
    }
}
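`hardware_enable` ultimately sets CR4.VMXE and executes VMXON on the 4 KiB `vmx_region`. Per the SDM, that region must start with the VMCS revision identifier read from the IA32_VMX_BASIC MSR, which is the same `vmcs_revision_id` later handed to `VCpu::new` in main. A sketch of that preparation (illustrative only; the repo wraps it in `VmxRegion`/`PhysFrame`):

// Read the VMCS revision identifier (bits 30:0 of IA32_VMX_BASIC, MSR 0x480)
// and store it in the first 4 bytes of the VMXON region, as required before VMXON.
const IA32_VMX_BASIC: u32 = 0x480;

unsafe fn init_vmxon_region(region: *mut u32) {
    let (lo, hi): (u32, u32);
    core::arch::asm!("rdmsr", in("ecx") IA32_VMX_BASIC, out("eax") lo, out("edx") hi);
    let _ = hi; // the upper 32 bits of IA32_VMX_BASIC are not needed here
    region.write_volatile(lo & 0x7fff_ffff); // bits 30:0 = VMCS revision identifier
}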

Step 2: Set up the page tables

Only two guest VMs need to be supported for now, so guest memory is statically allocated for exactly two VMs.

#[repr(align(4096))]
pub(super) struct AlignedMemory<const LEN: usize>([u8; LEN]);

// Physical memory for the two guest VMs, each of size GUEST_PHYS_MEMORY_SIZE (16 MiB)
pub(super) static mut GUEST_PHYS_MEMORY: [AlignedMemory<GUEST_PHYS_MEMORY_SIZE>; 2] =
    [AlignedMemory([0; GUEST_PHYS_MEMORY_SIZE]), AlignedMemory([0; GUEST_PHYS_MEMORY_SIZE])];

// Get the host virtual address corresponding to guest physical address `guest_paddr` of VM `id`
fn gpa_as_mut_ptr(id: usize, guest_paddr: GuestPhysAddr) -> *mut u8 {
    let offset = unsafe { &(GUEST_PHYS_MEMORY[id]) as *const _ as usize };
    let host_vaddr = guest_paddr + offset;
    host_vaddr as *mut u8
}

// Copy the image at host physical address `hpa` (length `size`) to guest physical address `load_gpa` of VM `id`
#[cfg(target_arch = "x86_64")]
fn load_guest_image(id: usize, hpa: HostPhysAddr, load_gpa: GuestPhysAddr, size: usize) {
    let image_ptr = usize::from(phys_to_virt(hpa.into())) as *const u8; // host physical address -> host virtual address
    let image = unsafe { core::slice::from_raw_parts(image_ptr, size) };

    trace!("loading to guest memory: host {:#x} to guest {:#x}, size {:#x}", image_ptr as usize, load_gpa, size);

    unsafe {
        core::slice::from_raw_parts_mut(gpa_as_mut_ptr(id, load_gpa), size).copy_from_slice(image)
    }
}

#[cfg(target_arch = "x86_64")]
pub fn setup_gpm(id: usize) -> HyperResult<GuestPhysMemorySet> {
    // copy BIOS and guest images

    load_guest_image(id, BIOS_PADDR, BIOS_ENTRY, BIOS_SIZE);
    #[cfg(feature = "guest_nimbos")]
    {
        load_guest_image(id, GUEST_IMAGE_PADDR, GUEST_ENTRY, GUEST_IMAGE_SIZE);
    }

    // create nested page table and add mapping
    let mut gpm = GuestPhysMemorySet::new()?;
    let guest_memory_regions = [
        GuestMemoryRegion {
            // Low RAM
            gpa: GUEST_PHYS_MEMORY_BASE,
            hpa: virt_to_phys((gpa_as_mut_ptr(id, GUEST_PHYS_MEMORY_BASE) as HostVirtAddr).into()).into(),
            size: GUEST_PHYS_MEMORY_SIZE,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::EXECUTE,
        },
        #[cfg(feature = "guest_linux")]
        GuestMemoryRegion {
            // Low RAM2
            gpa: 0x100_0000,
            hpa: 0x6100_0000,
            size: 0xf00_0000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::EXECUTE,
        },
        #[cfg(feature = "guest_linux")]
        GuestMemoryRegion {
            // RAM
            gpa: 0x7000_0000,
            hpa: 0x7000_0000,
            size: 0x1000_0000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::EXECUTE,
        },
        GuestMemoryRegion {
            // PCI
            gpa: 0x8000_0000,
            hpa: 0x8000_0000,
            size: 0x1000_0000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::DEVICE,
        },
        GuestMemoryRegion {
            gpa: 0xfe00_0000,
            hpa: 0xfe00_0000,
            size: 0x1_0000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::DEVICE,
        },
        GuestMemoryRegion {
            gpa: 0xfeb0_0000,
            hpa: 0xfeb0_0000,
            size: 0x10_0000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::DEVICE,
        },
        GuestMemoryRegion {
            // IO APIC
            gpa: 0xfec0_0000,
            hpa: 0xfec0_0000,
            size: 0x1000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::DEVICE,
        },
        GuestMemoryRegion {
            // HPET
            gpa: 0xfed0_0000,
            hpa: 0xfed0_0000,
            size: 0x1000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::DEVICE,
        },
        GuestMemoryRegion {
            // Local APIC
            gpa: 0xfee0_0000,
            hpa: 0xfee0_0000,
            size: 0x1000,
            flags: MappingFlags::READ | MappingFlags::WRITE | MappingFlags::DEVICE,
        },
    ];
    for r in guest_memory_regions.into_iter() {
        gpm.map_region(r.into())?; // install the guest memory mapping
    }
    Ok(gpm)
}
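Each `GuestMemoryRegion` describes a contiguous guest-physical range and the host-physical range backing it; `map_region` installs these mappings into the nested (EPT) page table whose root is later passed to `VCpu::new`. A simplified, self-contained sketch of the address arithmetic such a region list encodes (types here are illustrative, not the repo's):

// Translate a guest physical address to the host physical address backing it,
// using a flat list of regions like the one built in setup_gpm.
struct Region {
    gpa: usize,
    hpa: usize,
    size: usize,
}

fn gpa_to_hpa(regions: &[Region], gpa: usize) -> Option<usize> {
    regions
        .iter()
        .find(|r| gpa >= r.gpa && gpa < r.gpa + r.size)
        .map(|r| r.hpa + (gpa - r.gpa))
}

At run time this translation is performed by the EPT hardware; the region list only determines which mappings setup_gpm installs.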

Step 3: Create the vCPUs needed by the VM

First, the data structures behind VmCpus.

/// The maximum number of CPUs we can support.
pub const MAX_CPUS: usize = 8;

pub const VM_CPUS_MAX: usize = MAX_CPUS;

/// The set of vCPUs in a VM.
#[derive(Default)]
pub struct VmCpus<H: HyperCraftHal, PD: PerCpuDevices<H>> {
    inner: [Once<VCpu<H>>; VM_CPUS_MAX], // for an introduction to Once, see https://zhuanlan.zhihu.com/p/272848155
    device: [Once<PD>; VM_CPUS_MAX],
}

impl<H: HyperCraftHal, PD: PerCpuDevices<H>> VmCpus<H, PD> {
    /// Creates a new vCPU tracking structure.
    pub fn new() -> Self {
        Self {
            inner: [Once::INIT; VM_CPUS_MAX],
            device: [Once::INIT; VM_CPUS_MAX],
        }
    }

    /// Adds the given vCPU to the set of vCPUs.
    pub fn add_vcpu(&mut self, vcpu: VCpu<H>) -> HyperResult<()> {
        let vcpu_id = vcpu.vcpu_id();
        let once_entry = self.inner.get(vcpu_id).ok_or(HyperError::BadState)?;

        let real_vcpu = once_entry.call_once(|| vcpu); // initialize the vCPU in slot vcpu_id
        let device_once_entry = self.device.get(vcpu_id).ok_or(HyperError::BadState)?;

        device_once_entry.call_once(|| PD::new(real_vcpu).unwrap()); // initialize the per-vCPU devices for slot vcpu_id

        Ok(())
    }

    /// Returns a reference to the vCPU with `vcpu_id` if it exists.
    pub fn get_vcpu_and_device(&mut self, vcpu_id: usize) -> HyperResult<(&mut VCpu<H>, &mut PD)> {
        let vcpu = self
            .inner
            .get_mut(vcpu_id)
            .and_then(|once| once.get_mut())
            .ok_or(HyperError::NotFound)?;
        let device = self
            .device
            .get_mut(vcpu_id)
            .and_then(|once| once.get_mut())
            .ok_or(HyperError::NotFound)?;
        Ok((vcpu, device))
    }
}

// Safety: Each VCpu is wrapped with a Mutex to provide safe concurrent access to VCpu.
unsafe impl<H: HyperCraftHal, PD: PerCpuDevices<H>> Sync for VmCpus<H, PD> {}
unsafe impl<H: HyperCraftHal, PD: PerCpuDevices<H>> Send for VmCpus<H, PD> {}
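The `Once` cells (from the `spin` crate) guarantee that each vCPU slot and its device set are initialized at most once; later calls simply observe the stored value. A minimal standalone usage example (not from the repo):

use spin::Once;

static SLOT: Once<u32> = Once::new();

fn demo() {
    // The closure runs only on the first call; the second call ignores its
    // closure and returns the value that is already stored.
    let first = SLOT.call_once(|| 1);
    let second = SLOT.call_once(|| 2);
    assert_eq!((*first, *second), (1, 1));
}

On x86, the `VCpu` stored in those slots is the `VmxVcpu` shown next.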
/// A virtual CPU within a guest.
#[repr(C)]
pub struct VmxVcpu<H: HyperCraftHal> {
    // DO NOT modify `guest_regs` and `host_stack_top` and their order unless you do know what you are doing!
    // DO NOT add anything before or between them unless you do know what you are doing!
    guest_regs: GeneralRegisters, // guest general-purpose registers
    host_stack_top: u64,
    vcpu_id: usize, // vCPU id; only a single core is supported for now, so it must be 0
    launched: bool,
    vmcs: VmxRegion<H>,
    msr_bitmap: MsrBitmap<H>,
    pending_events: VecDeque<(u8, Option<u32>)>,
    xstate: XState,
}

impl<H: HyperCraftHal> VmxVcpu<H> {
    /// Create a new [`VmxVcpu`].
    pub fn new(
        vcpu_id: usize,
        vmcs_revision_id: u32,
        entry: GuestPhysAddr,
        ept_root: HostPhysAddr,
    ) -> HyperResult<Self> {
        XState::enable_xsave();
        let mut vcpu = Self {
            guest_regs: GeneralRegisters::default(),
            host_stack_top: 0,
            vcpu_id,
            launched: false,
            vmcs: VmxRegion::new(vmcs_revision_id, false)?,
            msr_bitmap: MsrBitmap::passthrough_all()?,
            pending_events: VecDeque::with_capacity(8),
            xstate: XState::new(),
        };
        vcpu.setup_msr_bitmap()?; // set up msr_bitmap so the VMM intercepts IA32_APIC_BASE and all x2APIC MSR accesses
        vcpu.setup_vmcs(entry, ept_root)?; // set up the VMCS
        info!("[HV] created VmxVcpu(vmcs: {:#x})", vcpu.vmcs.phys_addr());
        Ok(vcpu)
    }
}
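`setup_msr_bitmap` starts from `MsrBitmap::passthrough_all` and then marks the MSRs that must trap (IA32_APIC_BASE and the x2APIC range). The 4 KiB bitmap layout is fixed by the SDM: a 1 KiB read bitmap for low MSRs (0x0000_0000..=0x0000_1FFF), a 1 KiB read bitmap for high MSRs (0xC000_0000..=0xC000_1FFF), followed by the two corresponding write bitmaps. A sketch of setting a read-intercept bit in that layout (not the repo's `MsrBitmap` API):

// Mark MSR `msr` so that a guest RDMSR of it causes a VM exit, following the
// SDM's MSR-bitmap layout. Write intercepts live in the second half of the page.
// Callers are expected to pass only MSRs in the two architecturally defined ranges.
fn set_read_intercept(bitmap: &mut [u8; 4096], msr: u32) {
    let (base, idx) = if msr < 0x2000 {
        (0usize, msr as usize)                    // low-MSR read bitmap
    } else {
        (1024usize, (msr - 0xc000_0000) as usize) // high-MSR read bitmap
    };
    bitmap[base + idx / 8] |= 1 << (idx % 8);
}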

Step 4: Add the newly created vCPU to the VM

/// VM define.
pub struct VM<H: HyperCraftHal, PD: PerCpuDevices<H>, VD: PerVmDevices<H>> {
    vcpus: VmCpus<H, PD>, // the vCPUs owned by this VM
    vcpu_bond: BitSet,    // bitmap indexed by vcpu_id; marks the vCPUs currently bound to a physical CPU
    device: VD,           // the devices owned by this VM
}

impl<H: HyperCraftHal, PD: PerCpuDevices<H>, VD: PerVmDevices<H>> VM<H, PD, VD> {
    /// Create a new [`VM`].
    pub fn new(vcpus: VmCpus<H, PD>) -> Self {
        Self { vcpus, vcpu_bond: BitSet::new(), device: VD::new().unwrap() }
    }

    /// Bind the specified [`VCpu`] to current physical processor.
    pub fn bind_vcpu(&mut self, vcpu_id: usize) -> HyperResult<(&mut VCpu<H>, &mut PD)> {
        if self.vcpu_bond.contains(vcpu_id) {
            Err(HyperError::InvalidParam) // already bound
        } else {
            match self.vcpus.get_vcpu_and_device(vcpu_id) {
                Ok((vcpu, device)) => {
                    self.vcpu_bond.insert(vcpu_id); // set bit vcpu_id
                    vcpu.bind_to_current_processor()?; // bind to the current physical CPU via vmptrld
                    Ok((vcpu, device))
                },
                e @ Err(_) => e,
            }
        }
    }
}
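`bind_vcpu` does two things: it records in `vcpu_bond` that the vCPU is taken, and it executes `vmptrld` (via `bind_to_current_processor`) to make this vCPU's VMCS the current VMCS on the physical CPU. The bookkeeping half in isolation (illustrative names, not the repo's `BitSet`):

// A reduced model of the vcpu_bond bitmap: bit i set means vCPU i is already
// bound to some physical CPU and must not be bound again.
struct VcpuBond(u64);

impl VcpuBond {
    fn try_bind(&mut self, vcpu_id: usize) -> Result<(), ()> {
        if self.0 & (1 << vcpu_id) != 0 {
            Err(()) // already bound, like HyperError::InvalidParam above
        } else {
            self.0 |= 1 << vcpu_id;
            Ok(())
        }
    }
}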

Step 5: Run the VM

impl<H: HyperCraftHal, PD: PerCpuDevices<H>, VD: PerVmDevices<H>> VM<H, PD, VD> {
    #[allow(unreachable_code)]
    /// Run a specified [`VCpu`] on current logical vcpu.
    pub fn run_vcpu(&mut self, vcpu_id: usize) -> HyperResult {
        let (vcpu, vcpu_device) = self.vcpus.get_vcpu_and_device(vcpu_id).unwrap();

        loop {
            if let Some(exit_info) = vcpu.run() {
                // we need to handle this vm-exit ourselves
                let result = vcpu_device.vmexit_handler(vcpu, &exit_info)
                    .or_else(|| self.device.vmexit_handler(vcpu, &exit_info));

                match result {
                    Some(result) => {
                        if result.is_err() {
                            panic!("VM failed to handle a vm-exit: {:?}, error {:?}, vcpu: {:#x?}", exit_info.exit_reason, result.unwrap_err(), vcpu);
                        }
                    },
                    None => {
                        panic!("nobody wants to handle this vm-exit: {:?}, vcpu: {:#x?}", exit_info, vcpu);
                    },
                }
            }

            vcpu_device.check_events(vcpu)?;
        }

        Ok(())
    }
}
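`vcpu.run()` (shown next) returns `Some(exit_info)` only for VM exits it cannot handle internally. The dispatch above then gives the per-vCPU devices the first chance and falls back to the per-VM devices; a handler claims an exit by returning `Some(result)` and declines by returning `None`. The pattern in isolation (placeholder closure types, not the repo's API):

// Two-level dispatch via Option::or_else: the second handler runs only if the
// first one declined the exit by returning None.
fn dispatch(
    per_vcpu: impl FnOnce() -> Option<Result<(), ()>>,
    per_vm: impl FnOnce() -> Option<Result<(), ()>>,
) {
    match per_vcpu().or_else(per_vm) {
        Some(Ok(())) => {}                                  // handled, keep running
        Some(Err(e)) => panic!("handler failed: {:?}", e),  // handled but failed
        None => panic!("nobody wants to handle this exit"), // unclaimed exit
    }
}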

impl<H: HyperCraftHal> VmxVcpu<H> {
    /// Run the guest. It returns when a vm-exit happens and returns the vm-exit if it cannot be handled by this [`VmxVcpu`] itself.
    pub fn run(&mut self) -> Option<VmxExitInfo> {
        // Inject pending events
        if self.launched {
            self.inject_pending_events().unwrap();
        }

        // Run guest
        self.load_guest_xstate();
        unsafe {
            if self.launched {
                self.vmx_resume();
            } else {
                self.launched = true;
                VmcsHostNW::RSP.write(&self.host_stack_top as *const _ as usize).unwrap();

                self.vmx_launch();
            }
        }
        self.load_host_xstate();

        // Handle vm-exits
        let exit_info = self.exit_info().unwrap();
        trace!("VM exit: {:#x?}", exit_info);

        let cr4 = VmcsGuestNW::CR4.read().unwrap();
        if cr4.get_bit(18) {
            // panic!("osxsave dead!");
        }

        match self.builtin_vmexit_handler(&exit_info) {
            Some(result) => {
                if result.is_err() {
                    panic!("VmxVcpu failed to handle a VM-exit that should be handled by itself: {:?}, error {:?}, vcpu: {:#x?}", exit_info.exit_reason, result.unwrap_err(), self);
                }

                None
            },
            None => Some(exit_info),
        }
    }
}
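The `launched` flag implements the VMX rule that the very first VM entry on a VMCS must use VMLAUNCH and every later entry must use VMRESUME. Reduced to its essence (illustrative names, not the repo's):

// Decide which VM-entry instruction the next entry must use, flipping the flag
// on the first entry exactly as VmxVcpu::run does above.
enum VmEntry {
    Launch, // first entry on this VMCS -> VMLAUNCH
    Resume, // subsequent entries -> VMRESUME
}

fn next_entry(launched: &mut bool) -> VmEntry {
    if *launched {
        VmEntry::Resume
    } else {
        *launched = true;
        VmEntry::Launch
    }
}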