0%

rvm1.5

源代码仓库

RVM1.5:https://github.com/rvm-rtos/RVM1.5/tree/main
Jailhouse:https://github.com/siemens/jailhouse.git
Jailhouse patch:https://github.com/rvm-rtos/RVM1.5/blob/main/scripts/guest/jailhouse.patch

运行

源代码阅读

jailhouse_init

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
// jailhouse/driver/main.c
static int __init jailhouse_init(void)
{
int err;

#if defined(CONFIG_KALLSYMS_ALL) && LINUX_VERSION_CODE < KERNEL_VERSION(5,7,0)
/*
 * kallsyms_lookup_name() is a Linux kernel function that looks up the kernel
 * address associated with a given symbol name. This variant resolves the
 * symbol by name and makes the enclosing function return -EINVAL when the
 * lookup yields NULL.
 *
 * The note is kept above the #define on purpose: a // comment placed after a
 * line-continuation backslash would swallow the rest of the macro body.
 */
#define __RESOLVE_EXTERNAL_SYMBOL(symbol) \
symbol##_sym = (void *)kallsyms_lookup_name(#symbol); \
if (!symbol##_sym) \
return -EINVAL
#else
/* Otherwise the symbol's address is taken directly. */
#define __RESOLVE_EXTERNAL_SYMBOL(symbol) \
symbol##_sym = &symbol
#endif
#define RESOLVE_EXTERNAL_SYMBOL(symbol...) __RESOLVE_EXTERNAL_SYMBOL(symbol)
// ioremap_page_range 是 Linux 内核中的函数,用于将物理地址映射到内核虚拟地址空间的页表中。
RESOLVE_EXTERNAL_SYMBOL(ioremap_page_range);
#ifdef CONFIG_X86
// lapic_timer_period 通常是指在多处理器系统中的 LAPIC 中定时器的周期。
RESOLVE_EXTERNAL_SYMBOL(lapic_timer_period);
#endif
...
// 在 sysfs 中有这样一个目录:/sys/devices,系统中所有的设备,都归集在该目录下。
// 有些设备,是通过 device_register 注册到 Kernel 并体现在 /sys/devices/xxx/ 下。
// 但有时候我们仅仅需要在 /sys/devices/ 下注册一个目录,该目录不代表任何的实体设备,这时可以使用 root_device_register 接口。
jailhouse_dev = root_device_register("jailhouse");
...
err = jailhouse_sysfs_init(jailhouse_dev);
...
// misc_register 是 Linux 内核中用于注册杂项字符设备(Miscellaneous Character Device)的函数。
// Miscellaneous Character Devices 是一类特殊用途的字符设备,通常用于表示不属于其他特定类别(如块设备或网络设备)的小型设备。
// 通过这样的注册,Linux 内核会在 /dev 目录下创建一个相应的设备文件,用户空间的应用程序可以通过该文件进行与设备的交互。
err = misc_register(&jailhouse_misc_dev);
...
err = jailhouse_pci_register();
...
// register_reboot_notifier 是 Linux 内核中用于注册重启通知回调函数的函数。这个函数允许内核模块或子系统在系统即将重启时执行自定义的操作。
register_reboot_notifier(&jailhouse_shutdown_nb);

init_hypercall();

return 0;
...
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
// jailhouse/driver/sysfs.c
/*
 * Attributes attached to the jailhouse root device. The array is
 * NULL-terminated and is defined first because the group initializer
 * below refers to it by name.
 */
static struct attribute *jailhouse_sysfs_entries[] = {
&dev_attr_console.attr,
&dev_attr_enabled.attr,
&dev_attr_mem_pool_size.attr,
&dev_attr_mem_pool_used.attr,
&dev_attr_remap_pool_size.attr,
&dev_attr_remap_pool_used.attr,
NULL
};

/* Unnamed attribute group (.name = NULL) wrapping the list above. */
static struct attribute_group jailhouse_attribute_group = {
.name = NULL,
.attrs = jailhouse_sysfs_entries,
};
/*
 * Create the driver's sysfs entries under @dev.
 *
 * Registers jailhouse_attribute_group on the device's kobject and then adds
 * a "cells" kobject beneath it. If the "cells" object cannot be created, the
 * attribute group is removed again before returning.
 *
 * Returns 0 on success, the error from sysfs_create_group(), or -ENOMEM when
 * kobject_create_and_add() fails.
 */
int jailhouse_sysfs_init(struct device *dev)
{
int err;

err = sysfs_create_group(&dev->kobj, &jailhouse_attribute_group);
if (err)
return err;

cells_dir = kobject_create_and_add("cells", &dev->kobj);
if (!cells_dir) {
/* Undo the group registration so nothing is left half-initialized. */
sysfs_remove_group(&dev->kobj, &jailhouse_attribute_group);
return -ENOMEM;
}

return 0;
}

sysfs_create_group 是 Linux 内核中用于在 sysfs(系统文件系统)中创建属性组(attribute group)的函数。Sysfs 提供了一种将内核信息以文件和目录的形式暴露给用户空间的机制,而属性组则允许将相关的属性组织在一起。

kobject_create_and_add 是 Linux 内核中用于创建并添加内核对象(kobject)的函数。Sysfs 提供了一种将内核中的对象以文件和目录的形式暴露给用户空间的机制。
这样创建的内核对象可以在 sysfs 中表示为一个目录,用户空间可以通过 sysfs 接口访问其中的文件和属性。

1
2
3
4
5
6
7
8
9
10
// jailhouse/driver/pci.c
/**
* Register jailhouse as a PCI device driver so it can claim assigned devices.
*
* Thin wrapper: forwards jailhouse_pci_stub_driver to pci_register_driver()
* and passes its result through unchanged.
*
* @return 0 on success, or error code
*/
int jailhouse_pci_register(void)
{
return pci_register_driver(&jailhouse_pci_stub_driver);
}

pci_register_driver 是 Linux 内核中用于注册 PCI 驱动程序的函数。

1
2
3
4
/*
 * Record in jailhouse_use_vmcall whether the boot CPU reports the VMX
 * feature (X86_FEATURE_VMX).
 */
static void init_hypercall(void)
{
jailhouse_use_vmcall = boot_cpu_has(X86_FEATURE_VMX);
}

因为默认支持 vmx,所以 jailhouse_use_vmcall 为 true。

这样,初始化的工作就完成了。

jailhouse_ioctl

jailhouse_ioctl 会根据传递进来的参数,判断调用哪个函数并执行。从这里可以看出,jailhouse 主要的函数有:
jailhouse_cmd_enable,jailhouse_cmd_disable,jailhouse_cmd_cell_create,jailhouse_cmd_cell_load,
jailhouse_cmd_cell_start,jailhouse_cmd_cell_destroy。

在 jailhouse/tools/jailhouse.c 中可以看到 jailhouse 的一些使用方法。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/*
 * ioctl entry point of the jailhouse device: dispatch on the request code.
 *
 * @file:  unused by this function
 * @ioctl: request code (JAILHOUSE_ENABLE, JAILHOUSE_DISABLE,
 *         JAILHOUSE_CELL_CREATE, JAILHOUSE_CELL_LOAD, JAILHOUSE_CELL_START,
 *         JAILHOUSE_CELL_DESTROY)
 * @arg:   request argument; cast to the user-space pointer type the selected
 *         handler expects (ignored for JAILHOUSE_DISABLE)
 *
 * Returns the handler's result, or -EINVAL for an unknown request code.
 */
static long jailhouse_ioctl(struct file *file, unsigned int ioctl,
unsigned long arg)
{
long err;

switch (ioctl) {
case JAILHOUSE_ENABLE:
err = jailhouse_cmd_enable(
(struct jailhouse_system __user *)arg);
break;
case JAILHOUSE_DISABLE:
err = jailhouse_cmd_disable();
break;
case JAILHOUSE_CELL_CREATE:
err = jailhouse_cmd_cell_create(
(struct jailhouse_cell_create __user *)arg);
break;
case JAILHOUSE_CELL_LOAD:
err = jailhouse_cmd_cell_load(
(struct jailhouse_cell_load __user *)arg);
break;
case JAILHOUSE_CELL_START:
err = jailhouse_cmd_cell_start((const char __user *)arg);
break;
case JAILHOUSE_CELL_DESTROY:
err = jailhouse_cmd_cell_destroy((const char __user *)arg);
break;
default:
err = -EINVAL;
break;
}

return err;
}

jailhouse_cmd_enable

在对代码分析之前,需要看如何使用 enable 功能。查看 RVM1.5 中的 enable-rvm.sh(https://github.com/rvm-rtos/RVM1.5/blob/main/scripts/guest/enable-rvm.sh)文件,可以看到使用的命令为:

1
2
3
JH_DIR=~/jailhouse
JH=$JH_DIR/tools/jailhouse
sudo $JH enable $JH_DIR/configs/x86/qemu-ubuntu.cell

这里面 enable 后面附带一个参数: $JH_DIR/configs/x86/qemu-ubuntu.cell。在 gen-config 中有这么一行:

1
sudo python3 ./tools/jailhouse-config-create --mem-hv 512M ./configs/x86/qemu-ubuntu.c

这个会生成 qemu-ubuntu.c 文件,里面是关于 jailhouse 的一些配置。在后续 make 时,会根据该文件生成对应的 qemu-ubuntu.cell 文件。具体可以查看 jailhouse/configs/Makefile 文件。

接着,看 jailhouse_cmd_enable 的函数签名:

1
static int jailhouse_cmd_enable(struct jailhouse_system __user *arg);

我们可以看 jailhouse_cmd_enable 的调用路线:enable(argc, argv)(tools/jailhouse.c) -> ioctl(fd, JAILHOUSE_ENABLE, config) -> jailhouse_cmd_enable((struct jailhouse_system __user *)arg)。
第一个 enable 函数中 argc 和 argv 跟 main 函数的一致。ioctl 中的 fd 是打开 jailhouse 的文件描述符。config 是读取 argv 第三个参数所对应的文件内容。最后的 arg 就跟 config 对应。
根据以上内容可知,arg 对应的就是 $JH_DIR/configs/x86/qemu-ubuntu.cell 文件的内容,即系统配置文件。

内存布局

在开始阅读代码之前,了解 hypervisor image 的内存布局。RVM 1.5 的内存布局如下图所示。

接下来详细分析一下 enable 的全过程。

第一步,将 arg 中的前 sizeof(jailhouse_system) 个字节拷贝到 config_header

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
static int jailhouse_cmd_enable(struct jailhouse_system __user *arg)
{
...
if (copy_from_user(&config_header, arg, sizeof(config_header)))
return -EFAULT;
// 检查 config_header.signature 和 config_header.revision
if (memcmp(config_header.signature, JAILHOUSE_SYSTEM_SIGNATURE,
sizeof(config_header.signature)) != 0) {
pr_err("jailhouse: Not a system configuration\n");
return -EINVAL;
}
if (config_header.revision != JAILHOUSE_CONFIG_REVISION) {
pr_err("jailhouse: Configuration revision mismatch\n");
return -EINVAL;
}

config_header.root_cell.name[JAILHOUSE_CELL_NAME_MAXLEN] = 0;
...
}

第二步,上锁,增加模块引用计数,检查 vmx 功能是否打开

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
static int jailhouse_cmd_enable(struct jailhouse_system __user *arg)
{
...
// mutex_lock_interruptible 函数是一种获取互斥锁的方式,允许在等待锁的过程中被信号中断。
// 这样的机制有助于避免在用户空间中调用的操作导致内核中断等待的死锁情况。
if (mutex_lock_interruptible(&jailhouse_lock) != 0)
return -EINTR;

err = -EBUSY;
// try_module_get(THIS_MODULE) 是 Linux 内核中用于尝试增加模块引用计数的函数。
// 模块引用计数是跟踪内核模块被使用的计数器,当模块被使用时,引用计数会增加,当不再使用时,引用计数会减少。
// 引用计数为零时,模块可以被卸载。
// jailhouse_enabled 代表是否已经启用,所以必须要为 false
if (jailhouse_enabled || !try_module_get(THIS_MODULE))
goto error_unlock;

#ifdef CONFIG_X86
if (boot_cpu_has(X86_FEATURE_VMX)) {
u64 features;

rdmsrl(MSR_IA32_FEAT_CTL, features);
if ((features & FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX) == 0) {
pr_err("jailhouse: VT-x disabled by Firmware/BIOS\n");
err = -ENODEV;
goto error_put_module;
}
}
#endif
...
}

第三步,加载 hypervisor 镜像

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
static int jailhouse_cmd_enable(struct jailhouse_system __user *arg)
{
...
fw_name = jailhouse_get_fw_name(); // rvm-intel.bin
if (!fw_name) {
pr_err("jailhouse: Missing or unsupported HVM technology\n");
return -ENODEV;
}
...
// request_firmware 是用于请求固件文件的函数。它允许内核驱动程序在运行时动态地获取与硬件设备相关的固件信息。
// 这对于支持一些设备而无需将固件硬编码到内核中的情况非常有用。
// request_firmware 用于请求名为 "rvm-intel.bin" 的固件文件。
// 如果请求成功,固件数据将包含在 hypervisor->data 中,然后你可以在这个数据上执行你的操作。
// 最后,使用 release_firmware 函数释放固件数据。
err = request_firmware(&hypervisor, fw_name, jailhouse_dev);
if (err) {
pr_err("jailhouse: Missing hypervisor image %s\n", fw_name);
goto error_put_module;
}
// 内容在 rvm-intel.bin 中的 .header 节中,具体在 RVM1.5/src/header.rs/HvHeaderStuff 中。
header = (struct jailhouse_header *)hypervisor->data;

err = -EINVAL;
// 检查签名
if (memcmp(header->signature, JAILHOUSE_SIGNATURE,
sizeof(header->signature)) != 0 ||
hypervisor->size >= hv_mem->size)
goto error_release_fw;
...
}

第四步,分配内存,建立虚拟映射,并用 hypervisor 初始化 hypervisor_mem

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
static int jailhouse_cmd_enable(struct jailhouse_system __user *arg)
{
...
struct jailhouse_memory *hv_mem = &config_header.hypervisor_memory;

jailhouse_firmware_free(); // 释放 hypervisor_mem_res

hypervisor_mem_res = request_mem_region(hv_mem->phys_start,
hv_mem->size,
"Jailhouse hypervisor"); // 分配物理内存
if (!hypervisor_mem_res) {
pr_err("jailhouse: request_mem_region failed for hypervisor "
"memory.\n");
pr_notice("jailhouse: Did you reserve the memory with "
"\"memmap=\" or \"mem=\"?\n");
goto error_release_fw;
}

/* Map physical memory region reserved for Jailhouse. */
hypervisor_mem = jailhouse_ioremap(hv_mem->phys_start, remap_addr,
hv_mem->size); // 建立虚拟映射
if (!hypervisor_mem) {
pr_err("jailhouse: Unable to map RAM reserved for hypervisor "
"at %08lx\n", (unsigned long)hv_mem->phys_start);
goto error_release_memreg;
}

console_page = (struct jailhouse_virt_console*)
(hypervisor_mem + header->console_page);
last_console.valid = false;

/* Copy hypervisor's binary image at beginning of the memory region
* and clear the rest to zero. */
// 用 hypervisor 初始化 hypervisor_mem。
memcpy(hypervisor_mem, hypervisor->data, hypervisor->size);
memset(hypervisor_mem + hypervisor->size, 0,
hv_mem->size - hypervisor->size);
...
}

第五步,完善 hypervisor_mem 数据

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
static int jailhouse_cmd_enable(struct jailhouse_system __user *arg)
{
...
max_cpus = get_max_cpus(config_header.root_cell.cpu_set_size, arg);
header = (struct jailhouse_header *)hypervisor_mem;
header->max_cpus = max_cpus;
// 在 sysfs 中创建二进制文件。成功创建后,用户空间可以通过文件系统接口来读取该二进制文件。
err = jailhouse_sysfs_core_init(jailhouse_dev, header->core_size);
if (err)
goto error_unmap;

/*
* ARMv8 requires to clean D-cache and invalidate I-cache for memory
* containing new instructions. On x86 this is a NOP. On ARMv7 the
* firmware does its own cache maintenance, so it is an
* extraneous (but harmless) flush.
*/
flush_icache_range((unsigned long)hypervisor_mem,
(unsigned long)(hypervisor_mem + header->core_size));

/* Copy system configuration to its target address in hypervisor memory
* region. */
config = (struct jailhouse_system *)
(hypervisor_mem + hv_core_and_percpu_size);
// 将 qemu-ubuntu.c 中的 config 写入 hypervisor_mem 中的 systemconfig 区域
if (copy_from_user(config, arg, config_size)) {
err = -EFAULT;
goto error_unmap;
}

if (config->debug_console.clock_reg) {
clock_reg = ioremap(config->debug_console.clock_reg,
sizeof(clock_gates));
if (!clock_reg) {
err = -EINVAL;
pr_err("jailhouse: Unable to map clock register at "
"%08lx\n",
(unsigned long)config->debug_console.clock_reg);
goto error_unmap;
}

clock_gates = readl(clock_reg);
if (CON_HAS_INVERTED_GATE(config->debug_console.flags))
clock_gates &= ~(1 << config->debug_console.gate_nr);
else
clock_gates |= (1 << config->debug_console.gate_nr);
writel(clock_gates, clock_reg);

iounmap(clock_reg);
}

#ifdef JAILHOUSE_BORROW_ROOT_PT
if (CON_IS_MMIO(config->debug_console.flags)) {
console = ioremap(config->debug_console.address,
config->debug_console.size);
if (!console) {
err = -EINVAL;
pr_err("jailhouse: Unable to map hypervisor debug "
"console at %08lx\n",
(unsigned long)config->debug_console.address);
goto error_unmap;
}
/* The hypervisor has no notion of address spaces, so we need
* to enforce conversion. */
header->debug_console_base = (void * __force)console;
}
#endif

console_available = SYS_FLAGS_VIRTUAL_DEBUG_CONSOLE(config->flags);

#ifdef CONFIG_X86
if (config->platform_info.x86.tsc_khz == 0)
config->platform_info.x86.tsc_khz = tsc_khz;
if (config->platform_info.x86.apic_khz == 0)
config->platform_info.x86.apic_khz =
*lapic_timer_period_sym / (1000 / HZ);
#endif
...
}

第六步,创建 cell

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
/*
 * Allocate a driver-side cell object and fill it in from @cell_desc.
 *
 * Picks the lowest id not used by any entry of the global "cells" list,
 * copies the CPU set, name and memory-region table out of the descriptor,
 * then runs the PCI and sysfs setup for the new cell.
 *
 * Returns the new cell, or an ERR_PTR(): -EINVAL when the region count would
 * overflow the size computation, -ENOMEM on allocation failure, or the error
 * reported by jailhouse_pci_cell_setup() / jailhouse_sysfs_cell_create().
 */
static struct cell *cell_create(const struct jailhouse_cell_desc *cell_desc)
{
struct cell *cell;
unsigned int id;
int err;

/* Reject counts for which count * sizeof(region) could overflow. */
if (cell_desc->num_memory_regions >=
ULONG_MAX / sizeof(struct jailhouse_memory))
return ERR_PTR(-EINVAL);

/* determine cell id */
id = 0;
retry:
/* Restart the scan after every collision until an unused id is found. */
list_for_each_entry(cell, &cells, entry)
if (cell->id == id) {
id++;
goto retry;
}

cell = kzalloc(sizeof(*cell), GFP_KERNEL);
if (!cell)
return ERR_PTR(-ENOMEM);

INIT_LIST_HEAD(&cell->entry);
// Initialize the cell from cell_desc.
cell->id = id;

/* Copy no more bits than either the kernel cpumask or the descriptor holds. */
bitmap_copy(cpumask_bits(&cell->cpus_assigned),
jailhouse_cell_cpu_set(cell_desc),
min((unsigned int)nr_cpumask_bits,
cell_desc->cpu_set_size * 8));

cell->num_memory_regions = cell_desc->num_memory_regions;
cell->memory_regions = vmalloc(sizeof(struct jailhouse_memory) *
cell->num_memory_regions);
if (!cell->memory_regions) {
kfree(cell);
return ERR_PTR(-ENOMEM);
}

/* Copy the name and force termination at index JAILHOUSE_CELL_ID_NAMELEN. */
memcpy(cell->name, cell_desc->name, JAILHOUSE_CELL_ID_NAMELEN);
cell->name[JAILHOUSE_CELL_ID_NAMELEN] = 0;

memcpy(cell->memory_regions, jailhouse_cell_mem_regions(cell_desc),
sizeof(struct jailhouse_memory) * cell->num_memory_regions);

err = jailhouse_pci_cell_setup(cell, cell_desc);
if (err) {
vfree(cell->memory_regions);
kfree(cell);
return ERR_PTR(err);
}
// Article annotation (callee body not shown here): jailhouse_sysfs_cell_create()
// initializes and adds a kernel object with kobject_init_and_add(), which makes
// the object reachable from user space through sysfs, and then calls
// sysfs_create_group() to create the attribute group for it.
err = jailhouse_sysfs_cell_create(cell);
if (err)
/* cleanup done by jailhouse_sysfs_cell_create */
return ERR_PTR(err);

return cell;
}

/*
 * Create the root cell from @cell_desc and store it in the global root_cell.
 *
 * After creation the cell's assigned CPU mask is intersected with
 * cpu_online_mask, so only currently online CPUs remain assigned.
 *
 * Returns 0 on success or the error encoded by cell_create().
 */
int jailhouse_cell_prepare_root(const struct jailhouse_cell_desc *cell_desc)
{
root_cell = cell_create(cell_desc);
if (IS_ERR(root_cell))
return PTR_ERR(root_cell);

cpumask_and(&root_cell->cpus_assigned, &root_cell->cpus_assigned,
cpu_online_mask);

return 0;
}

static int jailhouse_cmd_enable(struct jailhouse_system __user *arg)
{
...
err = jailhouse_cell_prepare_root(&config->root_cell);
if (err)
goto error_unmap;
...
}

第七步,跳转到 RVM1.5 中执行

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
static int jailhouse_cmd_enable(struct jailhouse_system __user *arg)
{
...
preempt_disable();

header->online_cpus = num_online_cpus();

atomic_set(&call_done, 0);
// on_each_cpu 是 Linux 内核中用于在所有 CPU 上执行指定函数的宏。
// 该宏的目的是在对称多处理(SMP)系统中并行地执行相同的函数,以便在所有 CPU 上进行某些全局操作。
// 第一个参数:指向将在每个 CPU 上执行的函数的指针。
// 第二个参数:传递给 func 函数的额外参数。
// 第三个参数:指定是否等待所有 CPU 上的函数执行完成。如果为 1,等待;如果为 0,不等待。
on_each_cpu(enter_hypervisor, header, 0);
// 等待所有 cpu 完成
while (atomic_read(&call_done) != num_online_cpus())
cpu_relax();

preempt_enable();
...
}

/*
* Called for each cpu by the JAILHOUSE_ENABLE ioctl.
* It jumps to the entry point set in the header, reports the result and
* signals completion to the main thread that invoked it.
*
* @info: the struct jailhouse_header of the loaded hypervisor image.
*/
static void enter_hypervisor(void *info)
{
struct jailhouse_header *header = info;
unsigned int cpu = smp_processor_id();
int (*entry)(unsigned int);
int err;
// Article annotation: see RVM1.5/linker.lds, where
// __entry_offset = arch_entry - BASE_ADDRESS; the header's entry value is an
// offset that is rebased onto the hypervisor_mem mapping here.
entry = header->entry + (unsigned long) hypervisor_mem;

if (cpu < header->max_cpus)
/* either returns 0 or the same error code across all CPUs */
// Enter RVM1.5.
err = entry(cpu);
else
err = -EINVAL;
// Article annotation: from here on, execution is already running as the guest.
if (err)
error_code = err;

#if defined(CONFIG_X86) && LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)
/* on Intel, VMXE is now on - update the shadow */
cr4_init_shadow();
#endif

/* Tell the waiting caller that this CPU is done. */
atomic_inc(&call_done);
}

第八步,进入 RVM 1.5,从 arch_entry 到 main 函数

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
/// Assembly entry point of the hypervisor image.
///
/// Disables interrupts, pushes `rbp`, `rbx` and `r12`–`r15` plus one zero
/// placeholder slot (later filled with the GS base), hands the resulting stack
/// pointer to [`switch_stack`] in `rdi`, and afterwards pops the same slots
/// again and returns to the caller with `ret`.
///
/// # Safety
///
/// NOTE(review): presumably this must only be reached through the driver's
/// call into the image entry offset, with the caller's return address on top
/// of the stack — confirm against the linker script and the driver.
pub unsafe extern "C" fn arch_entry() -> i32 {
core::arch::asm!("
// rip is pushed
cli
push rbp
push rbx
push r12
push r13
push r14
push r15
push 0 // skip gs_base

mov rdi, rsp // switch_stack 参数为此时的栈指针
call {0} // 调用 switch_stack

pop r15 // skip gs_base
pop r15
pop r14
pop r13
pop r12
pop rbx
pop rbp
ret
// rip will pop when return",
sym switch_stack,
options(noreturn),
);
}
// `extern "sysv64"` is the default calling convention used for C code reached
// through FFI on non-Windows x86_64 platforms.
/// Run [`crate::entry`] on the per-CPU hypervisor stack.
///
/// `linux_sp` is the stack pointer passed in by `arch_entry`, i.e. the address
/// of the saved-register area on the Linux stack. The current GS base is read,
/// stored into the slot at `linux_sp`, and written back to the MSR after
/// `entry` returns. The Linux `rsp` is pushed on the new stack and popped back
/// once `entry` is done.
///
/// Returns the value `entry` leaves in `rax`, or the error code of
/// `PerCpu::new()` when per-CPU data cannot be set up.
///
/// # Safety
///
/// NOTE(review): assumes `linux_sp` points at writable stack memory laid out
/// by `arch_entry` — confirm.
unsafe extern "sysv64" fn switch_stack(linux_sp: usize) -> i32 {
let linux_tp = Msr::IA32_GS_BASE.read();
let cpu_data = match PerCpu::new() { // set up cpu_data
Ok(c) => c,
Err(e) => return e.code(),
};
let hv_sp = cpu_data.stack_top();
let ret;
core::arch::asm!("
mov [rsi], {linux_tp} // save gs_base to stack
mov rcx, rsp // 保存 rsp
mov rsp, {hv_sp} // 更换 rsp
push rcx
call {entry}
pop rsp",
entry = sym crate::entry,
linux_tp = in(reg) linux_tp,
hv_sp = in(reg) hv_sp,
in("rdi") cpu_data, // first argument of entry
in("rsi") linux_sp, // second argument of entry
lateout("rax") ret, // rax carries the return value, bound to ret
out("rcx") _,
);
Msr::IA32_GS_BASE.write(linux_tp);
ret
}

/// Rust-side entry called from `switch_stack` on the hypervisor stack.
///
/// Runs `main`; if it fails, the error is logged and its code is stored in the
/// shared `ERROR_NUM`. The value then loaded from `ERROR_NUM` is printed
/// together with the CPU id and returned to the caller.
extern "sysv64" fn entry(cpu_data: &mut PerCpu, linux_sp: usize) -> i32 {
if let Err(e) = main(cpu_data, linux_sp) {
error!("{:?}", e);
ERROR_NUM.store(e.code(), Ordering::Release);
}
// Load after the possible store so the returned code reflects ERROR_NUM,
// not just this CPU's own result.
let code = ERROR_NUM.load(Ordering::Acquire);
println!(
"CPU {} return back to driver with code {}.",
cpu_data.id, code
);
code
}

第九步,primary_init_early

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
fn main(cpu_data: &mut PerCpu, linux_sp: usize) -> HvResult {
// 此时 cpu_data 状态为:state 为 HvDisabled,LinuxContext 为空
let is_primary = cpu_data.id == 0; // primary cpu
let online_cpus = HvHeader::get().online_cpus; //HvHeaderStuff 的数据
wait_for(|| PerCpu::entered_cpus() < online_cpus)?; // 同步

if is_primary {
primary_init_early()?;
} else { // 等待 primary_init_early 完成
wait_for_counter(&INIT_EARLY_OK, 1)?
}
......
}
/// Early one-time initialization; completion is published by storing 1 into
/// `INIT_EARLY_OK`.
///
/// The per-line notes are translated from the article's annotations and
/// describe callees whose bodies are not shown here.
fn primary_init_early() -> HvResult {
logging::init();

let system_config = HvSystemConfig::get(); // fetch the system configuration

memory::init_heap(); // per the article: allocates a 32 MB region and initializes PHYS_VIRT_OFFSET
system_config.check()?; // per the article: checks signature and revision

memory::init_frame_allocator(); // per the article: initializes the physical memory allocator
memory::init_hv_page_table()?; // per the article: initializes HV_PT, the hypervisor's own page table
cell::init()?; // per the article: initializes ROOT_CELL, including its page table and CellConfig

INIT_EARLY_OK.store(1, Ordering::Release);
Ok(())
}

第十步,更新 cpu_data

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
fn main(cpu_data: &mut PerCpu, linux_sp: usize) -> HvResult {
cpu_data.init(linux_sp, cell::root_cell())?; // 初始化 Vcpu,state,linux
INITED_CPUS.fetch_add(1, Ordering::SeqCst);
wait_for_counter(&INITED_CPUS, online_cpus)?;
......
}
impl PerCpu {
/// Prepare this CPU's data for running under the hypervisor.
///
/// Captures the Linux context found at `linux_sp`, initializes the
/// architecture-specific part, activates the hypervisor page table and
/// constructs the vCPU for `cell`. `state` is `HvDisabled` while this runs
/// and becomes `HvEnabled` only after every step succeeded.
///
/// # Errors
///
/// Propagates the error from `Vcpu::new`.
pub fn init(&mut self, linux_sp: usize, cell: &Cell) -> HvResult {
// Save CPU state used for linux.
self.state = CpuState::HvDisabled;
self.linux = LinuxContext::load_from(linux_sp);
self.arch.init();

// Activate hypervisor page table on each cpu.
// SAFETY: NOTE(review) — presumably sound because the hypervisor page table
// was fully built before any CPU gets here; confirm.
unsafe { crate::memory::hv_page_table().read().activate() };

// Initialize vCPU. Use `ptr::write()` to avoid dropping
unsafe { core::ptr::write(&mut self.vcpu, Vcpu::new(&self.linux, cell)?) };

self.state = CpuState::HvEnabled;
Ok(())
}
}
impl LinuxContext {
/// Load linux callee-saved registers from the stack, and other system registers.
///
/// `linux_sp` is read as `SAVED_LINUX_REGS` consecutive `u64` slots. Slot 0
/// supplies the GS base, slots 1–6 supply `r15`, `r14`, `r13`, `r12`, `rbx`
/// and `rbp`, and slot 7 supplies `rip`. Segment descriptors, control
/// registers and MSRs are read from the running CPU.
pub fn load_from(linux_sp: usize) -> Self {
// Note: linux_sp is the stack pointer from before the stack switch.
// SAFETY: NOTE(review) — assumes linux_sp points at SAVED_LINUX_REGS readable
// u64 slots pushed by the entry stub; confirm.
let regs = unsafe { core::slice::from_raw_parts(linux_sp as *const u64, SAVED_LINUX_REGS) };
let gdt = GdtStruct::sgdt();
let mut fs = Segment::from_selector(segmentation::fs(), &gdt);
let mut gs = Segment::from_selector(segmentation::gs(), &gdt);
fs.base = Msr::IA32_FS_BASE.read();
gs.base = regs[0];

Self {
// Note: rsp is not linux_sp itself but the end of regs, i.e. the stack
// pointer as if everything from r15 up to rip had been popped from the
// stack that linux_sp refers to.
rsp: regs.as_ptr_range().end as _,
r15: regs[1],
r14: regs[2],
r13: regs[3],
r12: regs[4],
rbx: regs[5],
rbp: regs[6],
// The return address is pushed before arch_entry runs, so rip should hold
// the instruction following entry(cpu) in enter_hypervisor.
rip: regs[7],
es: Segment::from_selector(segmentation::es(), &gdt),
cs: Segment::from_selector(segmentation::cs(), &gdt),
ss: Segment::from_selector(segmentation::ss(), &gdt),
ds: Segment::from_selector(segmentation::ds(), &gdt),
fs,
gs,
tss: Segment::from_selector(unsafe { task::tr() }, &gdt),
gdt,
idt: IdtStruct::sidt(),
cr0: Cr0::read(),
cr3: Cr3::read().0.start_address().as_u64(),
cr4: Cr4::read(),
efer: Msr::IA32_EFER.read(),
star: Msr::IA32_STAR.read(),
lstar: Msr::IA32_LSTAR.read(),
cstar: Msr::IA32_CSTAR.read(),
fmask: Msr::IA32_FMASK.read(),
kernel_gsbase: Msr::IA32_KERNEL_GSBASE.read(),
pat: Msr::IA32_PAT.read(),
mtrr_def_type: Msr::IA32_MTRR_DEF_TYPE.read(),
}
}
}

第十一步,activate_vmm

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
fn main(cpu_data: &mut PerCpu, linux_sp: usize) -> HvResult {
if is_primary {
primary_init_late();
} else { // 等待 primary_init_late 完成
wait_for_counter(&INIT_LATE_OK, 1)?
}

cpu_data.activate_vmm()
}
impl PerCpu {
/// Count this CPU as activated and enter the vCPU with the saved Linux context.
///
/// # Errors
///
/// Propagates the error from `Vcpu::enter`. The code after the `?` is marked
/// unreachable, so no `Ok` value is ever produced here.
pub fn activate_vmm(&mut self) -> HvResult {
println!("Activating hypervisor on CPU {}...", self.id);
ACTIVATED_CPUS.fetch_add(1, Ordering::SeqCst);

self.vcpu.enter(&self.linux)?;
unreachable!()
}
}
impl Vcpu {
/// Seed the guest register block from the saved Linux context and launch the guest.
///
/// `rax` is set to 0; `rbx`, `rbp` and `r12`–`r15` are copied from `linux`.
/// The stack pointer is then pointed at the register block, the registers
/// are restored from it and `vmlaunch` is executed.
///
/// # Errors
///
/// Reaching the code after the asm block means the launch failed: the VMCS
/// instruction error is logged and an `EIO` error is returned.
pub fn enter(&mut self, linux: &LinuxContext) -> HvResult {
let regs = self.regs_mut();
regs.rax = 0;
regs.rbx = linux.rbx;
regs.rbp = linux.rbp;
regs.r12 = linux.r12;
regs.r13 = linux.r13;
regs.r14 = linux.r14;
regs.r15 = linux.r15;
unsafe {
asm!(
"mov rsp, {0}",
restore_regs_from_stack!(),
"vmlaunch", // enter the guest; per the article it resumes at the rip captured earlier
in(reg) regs as * const _ as usize,
);
}
// Never return if successful
error!(
"Activate hypervisor failed: {:?}",
Vmcs::instruction_error()
);
hv_result_err!(EIO)
}
}

第十二步,guest 运行

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
/*
 * Per-CPU callback used when enabling the hypervisor.
 *
 * Calls the image entry point (header->entry rebased onto hypervisor_mem)
 * with this CPU's id, records a non-zero result in error_code and bumps
 * call_done so the waiting caller can see this CPU has finished.
 *
 * @info: the struct jailhouse_header of the loaded hypervisor image.
 */
static void enter_hypervisor(void *info)
{
struct jailhouse_header *header = info;
unsigned int cpu = smp_processor_id();
int (*entry)(unsigned int);
int err;

entry = header->entry + (unsigned long) hypervisor_mem;

if (cpu < header->max_cpus)
/* either returns 0 or the same error code across all CPUs */
err = entry(cpu);
else
err = -EINVAL;

if (err) // article annotation: the first instruction executed as the guest
error_code = err;

#if defined(CONFIG_X86) && LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)
/* on Intel, VMXE is now on - update the shadow */
cr4_init_shadow();
#endif

atomic_inc(&call_done);
}

/*
 * Tail of the enable path as quoted by the article (earlier steps and local
 * declarations are not part of this excerpt).
 *
 * Runs enter_hypervisor() on every online CPU with preemption disabled,
 * spins until all of them have reported in, and on success releases the
 * temporary resources, registers the root cell and marks jailhouse enabled.
 */
static int jailhouse_cmd_enable(struct jailhouse_system __user *arg)
{
preempt_disable();

header->online_cpus = num_online_cpus();

atomic_set(&call_done, 0);
/* The call is issued with wait == 0; completion is tracked via call_done below. */
on_each_cpu(enter_hypervisor, header, 0);
while (atomic_read(&call_done) != num_online_cpus())
cpu_relax();

preempt_enable();

if (error_code) {
err = error_code;
goto error_free_cell;
}
// Now running as the guest: release resources that are no longer needed.
if (console)
iounmap(console);

release_firmware(hypervisor);
// Article annotation: registers a cell whose cell id is 0.
jailhouse_cell_register_root();
// Add the PCI devices.
jailhouse_pci_virtual_root_devices_add(&config_header);

jailhouse_enabled = true;

mutex_unlock(&jailhouse_lock);

pr_info("The Jailhouse is opening.\n");

return 0;
}

jailhouse_cmd_disable

jailhouse 部分

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/*
 * Handler for JAILHOUSE_DISABLE: shut the hypervisor down on all CPUs.
 *
 * Returns 0 on success, -EINTR if waiting for jailhouse_lock was interrupted,
 * -EINVAL if jailhouse is not enabled, -EBUSY if the number of online CPUs
 * differs from the number assigned to the root cell, or the error from
 * destroying the non-root cells / from the disable hypercall.
 */
static int jailhouse_cmd_disable(void)
{
int err;
// Take the lock.
if (mutex_lock_interruptible(&jailhouse_lock) != 0)
return -EINTR;
// jailhouse_enabled has to be true.
if (!jailhouse_enabled) {
err = -EINVAL;
goto unlock_out;
}
// Reclaim jailhouse-related resources.
err = jailhouse_cmd_cell_destroy_non_root();
if (err)
goto unlock_out;

jailhouse_pci_virtual_root_devices_remove();

error_code = 0;

preempt_disable();

if (num_online_cpus() != cpumask_weight(&root_cell->cpus_assigned)) {
/*
* Not all assigned CPUs are currently online. If we disable
* now, we will lose the offlined ones.
*/

preempt_enable();

err = -EBUSY;
goto unlock_out;
}

atomic_set(&call_done, 0);
on_each_cpu(leave_hypervisor, NULL, 0);
/* Spin until every online CPU has reported in through call_done. */
while (atomic_read(&call_done) != num_online_cpus())
cpu_relax();

preempt_enable();

err = error_code;
if (err) {
pr_warn("jailhouse: Failed to disable hypervisor: %d\n", err);
/*
 * NOTE(review): as quoted, a failed disable still falls through to
 * the teardown below and reports "closed"; confirm whether a
 * "goto unlock_out;" was elided from this excerpt.
 */
}

jailhouse_cell_delete_root();
jailhouse_enabled = false;
module_put(THIS_MODULE);

pr_info("The Jailhouse was closed.\n");

unlock_out:
mutex_unlock(&jailhouse_lock);

return err;
}

/*
 * Per-CPU callback used when disabling the hypervisor.
 *
 * Reads one word from every page in the first hv_core_and_percpu_size bytes
 * of hypervisor_mem, issues the JAILHOUSE_HC_DISABLE hypercall, records a
 * non-zero result in error_code and bumps call_done for the waiting caller.
 *
 * @info: unused.
 */
static void leave_hypervisor(void *info)
{
void *page;
int err;

/* Touch each hypervisor page we may need during the switch so that
* the active mm definitely contains all mappings. At least x86 does
* not support taking any faults while switching worlds. */
for (page = hypervisor_mem;
page < hypervisor_mem + hv_core_and_percpu_size;
page += PAGE_SIZE)
readl((void __iomem *)page);

/* either returns 0 or the same error code across all CPUs */
err = jailhouse_call(JAILHOUSE_HC_DISABLE);
if (err)
error_code = err;

#if defined(CONFIG_X86) && LINUX_VERSION_CODE >= KERNEL_VERSION(4,0,0)
/* on Intel, VMXE is now off - update the shadow */
cr4_init_shadow();
#endif

atomic_inc(&call_done);
}
/*
 * Hypercall instruction sequence.
 *
 * The cmpb tests whether jailhouse_use_vmcall equals 1. If it does, vmcall
 * is executed (per the article this triggers a VM exit); otherwise control
 * jumps to label 1 and vmmcall is executed instead.
 *
 * The notes are kept up here on purpose: a // comment after a
 * line-continuation backslash would break the macro definition.
 */
#define JAILHOUSE_CALL_CODE \
"cmpb $0x01, %[use_vmcall]\n\t"\
"jne 1f\n\t"\
"vmcall\n\t"\
"jmp 2f\n\t"\
"1: vmmcall\n\t"\
"2:"

/* Operand fragments spliced into the asm statement of the hypercall helper. */
#define JAILHOUSE_CALL_RESULT "=a" (result)
#define JAILHOUSE_USE_VMCALL [use_vmcall] "m" (jailhouse_use_vmcall)
#define JAILHOUSE_CALL_NUM "a" (num)

/*
 * Issue a hypercall that takes no arguments.
 *
 * @num: hypercall number, passed in the "a" register.
 *
 * Returns the value found in the "a" register after the instruction
 * sequence in JAILHOUSE_CALL_CODE has run. The asm statement declares a
 * "memory" clobber.
 */
static inline __u32 jailhouse_call(__u32 num)
{
__u32 result;

asm volatile(JAILHOUSE_CALL_CODE
: JAILHOUSE_CALL_RESULT
: JAILHOUSE_USE_VMCALL, JAILHOUSE_CALL_NUM
: "memory");
return result;
}

RVM1.5 部分

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
impl VmExit<'_> {
pub fn handle_exit(&mut self) -> HvResult {
...
let res = match exit_info.exit_reason {
VmxExitReason::EXCEPTION_NMI => self.handle_exception_nmi(&exit_info),
VmxExitReason::CPUID => self.handle_cpuid(),
VmxExitReason::VMCALL => self.handle_hypercall(), // vmcall 触发
VmxExitReason::MSR_READ => self.handle_msr_read(),
VmxExitReason::MSR_WRITE => self.handle_msr_write(),
VmxExitReason::EPT_VIOLATION => self.handle_ept_violation(&exit_info),
VmxExitReason::TRIPLE_FAULT => {
error!("Triple fault: {:#x?}", exit_info);
self.cpu_data.vcpu.inject_fault()?;
Ok(())
}
_ => hv_result_err!(ENOSYS),
};
...
}

pub fn handle_hypercall(&mut self) -> HvResult {
use crate::hypercall::HyperCall;
self.cpu_data.vcpu.advance_rip(VM_EXIT_LEN_HYPERCALL)?;
let guest_regs = self.cpu_data.vcpu.regs();
let (code, arg0, arg1) = (guest_regs.rax, guest_regs.rdi, guest_regs.rsi);
HyperCall::new(self.cpu_data).hypercall(code as _, arg0, arg1)?; // 调用 hypercall
Ok(())
}
}

impl<'a> HyperCall<'a> {
pub fn hypercall(&mut self, code: u32, arg0: u64, _arg1: u64) -> HvResult {
......
let ret = match code {
HyperCallCode::HypervisorDisable => self.hypervisor_disable(),
};
......
}

fn hypervisor_disable(&mut self) -> HyperCallResult {
let cpus = PerCpu::activated_cpus();

static TRY_DISABLE_CPUS: AtomicU32 = AtomicU32::new(0);
TRY_DISABLE_CPUS.fetch_add(1, Ordering::SeqCst);
while TRY_DISABLE_CPUS.load(Ordering::Acquire) < cpus {
core::hint::spin_loop();
}

self.cpu_data.deactivate_vmm(0)?; // 关闭 vmm
unreachable!()
}
}
impl PerCpu {
/// Leave hypervisor mode on this CPU and hand control back to Linux.
///
/// Decrements the activated-CPU counter, sets `ret_code` as the vCPU's
/// return value, shuts the vCPU down via `Vcpu::exit`, restores the Linux
/// context, marks the CPU `HvDisabled` and finally calls
/// `LinuxContext::return_to_linux`.
///
/// # Errors
///
/// Propagates the error from `Vcpu::exit`.
pub fn deactivate_vmm(&mut self, ret_code: usize) -> HvResult {
println!("Deactivating hypervisor on CPU {}...", self.id);
ACTIVATED_CPUS.fetch_sub(1, Ordering::SeqCst);

self.vcpu.set_return_val(ret_code);
self.vcpu.exit(&mut self.linux)?;
self.linux.restore(); // restore the system registers from self.linux
self.state = CpuState::HvDisabled;
self.linux.return_to_linux(self.vcpu.regs());
}
}
impl Vcpu {
/// Tear down VMX operation for this vCPU.
///
/// Calls `load_vmcs_guest` with `linux`, clears the VMCS at this vCPU's
/// region address and executes `vmxoff`.
///
/// # Errors
///
/// Propagates a failure from any of those three steps.
pub fn exit(&self, linux: &mut LinuxContext) -> HvResult {
self.load_vmcs_guest(linux)?; // per the article: saves the guest state into linux
Vmcs::clear(self.vmcs_region.paddr())?;
// SAFETY: NOTE(review) — presumably valid because VMX is on for this CPU
// whenever this path is reached; confirm.
unsafe { vmx::vmxoff()? };
info!("successed to turn off VMX.");
Ok(())
}
}
impl LinuxContext {
/// Restore linux general-purpose registers and stack, then return back to linux.
///
/// Writes the saved GS base back to the MSR, pushes the saved `rip` onto the
/// saved Linux stack, stores the resulting stack pointer in the slot located
/// `size_of::<GeneralRegisters>()` bytes past `guest_regs`, restores the
/// general registers from `guest_regs`, then pops that stack pointer and
/// executes `ret`. Never returns to the caller.
pub fn return_to_linux(&self, guest_regs: &GeneralRegisters) -> ! {
// Restore Linux from the guest state: switch the stack to linux_rsp and
// jump to linux_rip.
unsafe {
Msr::IA32_GS_BASE.write(self.gs.base);
core::arch::asm!(
"mov rsp, {linux_rsp}",
"push {linux_rip}",
"mov rcx, rsp",
"mov rsp, {guest_regs}",
"mov [rsp + {guest_regs_size}], rcx",
restore_regs_from_stack!(),
"pop rsp",
"ret",
linux_rsp = in(reg) self.rsp,
linux_rip = in(reg) self.rip,
guest_regs = in(reg) guest_regs,
guest_regs_size = const core::mem::size_of::<GeneralRegisters>(),
options(noreturn),
);
}
}
}