Developing hypervisor from scratch: Part 4 - Setting up HOST and GUEST State

In this article series you will learn how to develop your own hypervisor for virtualization in the Linux ecosystem. In this part we set up the HOST and GUEST state areas and finally execute vmlaunch.

7 min read
Developing hypervisor from scratch: Part 4 - Setting up HOST and GUEST State

In the last part of the series we set up the following fields in the VMCS structure:

  • VM-execution control fields
  • VM-exit control fields
  • VM-entry control fields

In this part we are going to initialize HOST state area and GUEST state area in VMCS.

HOST state area

The host state area holds the state of the HOST that the CPU loads when it returns after a VM exit. We need to set up the following fields in the host state area for a successful return after a VM exit.

VMCS Host state area

We will load the current host state value of our system into the host state area and it should work.

Let's first set the Control registers.

...
// VMCS field encodings of the host-state control registers (CR0, CR3, CR4);
// passed as the first argument to vmwrite().
#define HOST_CR0						0x00006c00
#define	HOST_CR3						0x00006c02
#define	HOST_CR4						0x00006c04
/* Read and return the current value of control register CR0. */
static inline uint64_t get_cr0(void)
{
	uint64_t value;

	__asm__ __volatile__("mov %%cr0, %0" : "=r"(value));
	return value;
}

/* Read and return the current value of control register CR3. */
static inline uint64_t get_cr3(void)
{
	uint64_t value;

	__asm__ __volatile__("mov %%cr3, %0" : "=r"(value));
	return value;
}

/* Read and return the current value of control register CR4. */
static inline uint64_t get_cr4(void)
{
	uint64_t value;

	__asm__ __volatile__("mov %%cr4, %0" : "=r"(value));
	return value;
}

...
bool initVmcsControlField(void) {
...
	// Copy the control registers currently live on this CPU into the
	// host-state fields of the VMCS.
	vmwrite(HOST_CR0, get_cr0());
	vmwrite(HOST_CR3, get_cr3());
	vmwrite(HOST_CR4, get_cr4());
...
}

Now, let's initialize the selectors (CS, SS, DS, ES, FS, GS, and TR) and the base fields (FS, GS, TR, GDTR, and IDTR).


...
/* VMCS field encodings of the host-state segment selectors. */
#define HOST_ES_SELECTOR				0x00000c00
#define HOST_CS_SELECTOR				0x00000c02
#define HOST_SS_SELECTOR				0x00000c04
#define HOST_DS_SELECTOR				0x00000c06
#define HOST_FS_SELECTOR				0x00000c08
#define HOST_GS_SELECTOR				0x00000c0a
#define HOST_TR_SELECTOR				0x00000c0c
/* VMCS field encodings of the host-state base addresses. */
#define HOST_FS_BASE					0x00006c06
#define HOST_GS_BASE					0x00006c08
#define HOST_TR_BASE					0x00006c0a
#define HOST_GDTR_BASE					0x00006c0c
#define HOST_IDTR_BASE					0x00006c0e
...
/* Return the selector currently loaded in the ES segment register. */
static inline uint16_t get_es1(void)
{
	uint16_t selector;

	__asm__ __volatile__("mov %%es, %0" : "=rm"(selector));
	return selector;
}

/* Return the selector currently loaded in the CS segment register. */
static inline uint16_t get_cs1(void)
{
	uint16_t selector;

	__asm__ __volatile__("mov %%cs, %0" : "=rm"(selector));
	return selector;
}

/* Return the selector currently loaded in the SS segment register. */
static inline uint16_t get_ss1(void)
{
	uint16_t selector;

	__asm__ __volatile__("mov %%ss, %0" : "=rm"(selector));
	return selector;
}

/* Return the selector currently loaded in the DS segment register. */
static inline uint16_t get_ds1(void)
{
	uint16_t selector;

	__asm__ __volatile__("mov %%ds, %0" : "=rm"(selector));
	return selector;
}

/* Return the selector currently loaded in the FS segment register. */
static inline uint16_t get_fs1(void)
{
	uint16_t selector;

	__asm__ __volatile__("mov %%fs, %0" : "=rm"(selector));
	return selector;
}

/* Return the selector currently loaded in the GS segment register. */
static inline uint16_t get_gs1(void)
{
	uint16_t selector;

	__asm__ __volatile__("mov %%gs, %0" : "=rm"(selector));
	return selector;
}

/* Return the task register selector as stored by the STR instruction. */
static inline uint16_t get_tr1(void)
{
	uint16_t selector;

	__asm__ __volatile__("str %0" : "=rm"(selector));
	return selector;
}

static inline uint64_t get_gdt_base1(void)
{
	struct desc_ptr gdt;
	__asm__ __volatile__("sgdt %[gdt]"
			     : /* output */ [gdt]"=m"(gdt));
	return gdt.address;
}

static inline uint64_t get_idt_base1(void)
{
	struct desc_ptr idt;
	__asm__ __volatile__("sidt %[idt]"
			     : /* output */ [idt]"=m"(idt));
	return idt.address;
}

/*
 * Reassemble the 64-bit base address that a descriptor stores in four pieces.
 *
 * As the shifts show, base0 supplies the bits starting at 0, base1 the bits
 * starting at 16, base2 the bits starting at 24 and base3 the bits starting
 * at 32.
 *
 * Every piece is widened to uint64_t before it is shifted. If base1/base2 are
 * narrower than int (presumably 8-bit fields - confirm against struct desc64),
 * a plain `base2 << 24` is evaluated as a signed int: a base2 with bit 7 set
 * would yield a negative value that sign-extends into bits 63:32 of the
 * result and corrupts the returned base.
 */
static inline uint64_t get_desc64_base(const struct desc64 *desc)
{
	return ((uint64_t)desc->base3 << 32) |
	       ((uint64_t)desc->base2 << 24) |
	       ((uint64_t)desc->base1 << 16) |
	       (uint64_t)desc->base0;
}

...
bool initVmcsControlField(void) {
...
	// Host segment selectors: copied from the registers live on this CPU.
	vmwrite(HOST_ES_SELECTOR, get_es1());
	vmwrite(HOST_CS_SELECTOR, get_cs1());
	vmwrite(HOST_SS_SELECTOR, get_ss1());
	vmwrite(HOST_DS_SELECTOR, get_ds1());
	vmwrite(HOST_FS_SELECTOR, get_fs1());
	vmwrite(HOST_GS_SELECTOR, get_gs1());
	vmwrite(HOST_TR_SELECTOR, get_tr1());
	// FS/GS bases are taken from __rdmsr1() on the FS/GS base MSR numbers.
	vmwrite(HOST_FS_BASE, __rdmsr1(MSR_FS_BASE));
	vmwrite(HOST_GS_BASE, __rdmsr1(MSR_GS_BASE));
	// TR base: decode the descriptor located at GDT base + TR selector.
	// NOTE(review): the selector is used as a raw byte offset, which relies
	// on its TI/RPL bits (bits 2:0) being zero - confirm, or mask them off.
	vmwrite(HOST_TR_BASE, get_desc64_base((struct desc64 *)(get_gdt_base1() + get_tr1())));
	vmwrite(HOST_GDTR_BASE, get_gdt_base1());
	vmwrite(HOST_IDTR_BASE, get_idt_base1());
...
}

Finally, let's initialize the MSRs.

...
/* VMCS field encodings of the host-state SYSENTER MSR fields. */
#define HOST_IA32_SYSENTER_ESP			0x00006c10
#define HOST_IA32_SYSENTER_EIP			0x00006c12
#define HOST_IA32_SYSENTER_CS			0x00004c00
...
bool initVmcsControlField(void) {
...
	// Host SYSENTER state: copy the current MSR values into the VMCS.
	vmwrite(HOST_IA32_SYSENTER_ESP, __rdmsr1(MSR_IA32_SYSENTER_ESP));
	vmwrite(HOST_IA32_SYSENTER_EIP, __rdmsr1(MSR_IA32_SYSENTER_EIP));
	// NOTE(review): this line calls __rdmsr() while every other MSR read in
	// these snippets uses __rdmsr1() - confirm whether that is intentional.
	vmwrite(HOST_IA32_SYSENTER_CS, __rdmsr(MSR_IA32_SYSENTER_CS));
...
}

What is left now is to set up RSP and RIP. We will do that just before calling vmlaunch so that we return to the correct stack and code location.

GUEST state area

The guest state area contains the processor state that is loaded on every VM entry and stored back into these fields on every VM exit. The guest state area is divided into

  • Guest Register state
  • Guest Non-Register state

You can consult https://www.sandpile.org/x86/initial.htm to set the initial state values on VM entry.

Guest Register state

Registers and Fields that are required to set

Let's set all the required guest state fields:

We want the guest to continue execution with a CPU state as similar as possible to the host's. Hence, we copy the CR0, CR3 and CR4 values from the host state:

...
/* VMCS field encodings of the guest-state control registers. */
#define GUEST_CR0						0x00006800
#define GUEST_CR3						0x00006802
#define GUEST_CR4						0x00006804
...
bool initVmcsControlField(void) {
...    
	// Guest control registers mirror the values already written to the
	// host-state fields (read back with vmreadz()).
    vmwrite(GUEST_CR0, vmreadz(HOST_CR0));
	vmwrite(GUEST_CR3, vmreadz(HOST_CR3));
	vmwrite(GUEST_CR4, vmreadz(HOST_CR4));
}

We don't need DR7 for now and will set the guest RSP and RIP later. So now let's set the segment registers' selectors, base addresses, segment limits and access rights:

...
/* VMCS field encodings of the guest-state segment selectors. */
#define GUEST_ES_SELECTOR				0x00000800
#define GUEST_CS_SELECTOR				0x00000802
#define GUEST_SS_SELECTOR				0x00000804
#define GUEST_DS_SELECTOR				0x00000806
#define GUEST_FS_SELECTOR				0x00000808
#define GUEST_GS_SELECTOR				0x0000080a
#define GUEST_LDTR_SELECTOR				0x0000080c
#define GUEST_TR_SELECTOR				0x0000080e
/* VMCS field encodings of the guest-state segment and table limits. */
#define GUEST_ES_LIMIT					0x00004800
#define GUEST_CS_LIMIT					0x00004802
#define GUEST_SS_LIMIT					0x00004804
#define GUEST_DS_LIMIT					0x00004806
#define GUEST_FS_LIMIT					0x00004808
#define GUEST_GS_LIMIT					0x0000480a
#define GUEST_LDTR_LIMIT				0x0000480c
#define GUEST_TR_LIMIT					0x0000480e
#define GUEST_GDTR_LIMIT				0x00004810
#define GUEST_IDTR_LIMIT				0x00004812
/* VMCS field encodings of the guest-state segment access rights. */
#define GUEST_ES_AR_BYTES				0x00004814
#define GUEST_CS_AR_BYTES				0x00004816
#define GUEST_SS_AR_BYTES				0x00004818
#define GUEST_DS_AR_BYTES				0x0000481a
#define GUEST_FS_AR_BYTES				0x0000481c
#define GUEST_GS_AR_BYTES				0x0000481e
#define GUEST_LDTR_AR_BYTES				0x00004820
#define GUEST_TR_AR_BYTES				0x00004822
/* VMCS field encodings of the guest-state segment and table bases. */
#define GUEST_ES_BASE					0x00006806
#define GUEST_CS_BASE					0x00006808
#define GUEST_SS_BASE					0x0000680a
#define GUEST_DS_BASE					0x0000680c
#define GUEST_FS_BASE					0x0000680e
#define GUEST_GS_BASE					0x00006810
#define GUEST_LDTR_BASE					0x00006812
#define GUEST_TR_BASE					0x00006814
#define GUEST_GDTR_BASE					0x00006816
#define GUEST_IDTR_BASE					0x00006818
...
bool initVmcsControlField(void) {
...    
	// Selectors: reuse the values stored in the host-state fields; the
	// LDTR selector is written as 0.
    vmwrite(GUEST_ES_SELECTOR, vmreadz(HOST_ES_SELECTOR));
	vmwrite(GUEST_CS_SELECTOR, vmreadz(HOST_CS_SELECTOR));
	vmwrite(GUEST_SS_SELECTOR, vmreadz(HOST_SS_SELECTOR));
	vmwrite(GUEST_DS_SELECTOR, vmreadz(HOST_DS_SELECTOR));
	vmwrite(GUEST_FS_SELECTOR, vmreadz(HOST_FS_SELECTOR));
	vmwrite(GUEST_GS_SELECTOR, vmreadz(HOST_GS_SELECTOR));
	vmwrite(GUEST_LDTR_SELECTOR, 0);
	vmwrite(GUEST_TR_SELECTOR, vmreadz(HOST_TR_SELECTOR));
	// Limits: -1 requests an all-ones (maximum) limit for the flat segments.
	// 0x67 (103) is the limit of a minimal 104-byte 64-bit TSS.
    vmwrite(GUEST_ES_LIMIT, -1);
	vmwrite(GUEST_CS_LIMIT, -1);
	vmwrite(GUEST_SS_LIMIT, -1);
	vmwrite(GUEST_DS_LIMIT, -1);
	vmwrite(GUEST_FS_LIMIT, -1);
	vmwrite(GUEST_GS_LIMIT, -1);
	vmwrite(GUEST_LDTR_LIMIT, -1);
	vmwrite(GUEST_TR_LIMIT, 0x67);
	vmwrite(GUEST_GDTR_LIMIT, 0xffff);
	vmwrite(GUEST_IDTR_LIMIT, 0xffff);
	// Access rights, using the VMCS access-rights layout from the Intel SDM:
	//   0x10000 - bit 16 set: segment is marked unusable (chosen here when
	//             the selector is 0, and always for LDTR)
	//   0xc093  - present, DPL 0, read/write accessed data, D/B=1, G=1
	//   0xa09b  - present, DPL 0, execute/read accessed code, L=1 (64-bit), G=1
	//   0x8b    - present, busy TSS system descriptor
	vmwrite(GUEST_ES_AR_BYTES,
		vmreadz(GUEST_ES_SELECTOR) == 0 ? 0x10000 : 0xc093);
	vmwrite(GUEST_CS_AR_BYTES, 0xa09b);
	vmwrite(GUEST_SS_AR_BYTES, 0xc093);
	vmwrite(GUEST_DS_AR_BYTES,
		vmreadz(GUEST_DS_SELECTOR) == 0 ? 0x10000 : 0xc093);
	vmwrite(GUEST_FS_AR_BYTES,
		vmreadz(GUEST_FS_SELECTOR) == 0 ? 0x10000 : 0xc093);
	vmwrite(GUEST_GS_AR_BYTES,
		vmreadz(GUEST_GS_SELECTOR) == 0 ? 0x10000 : 0xc093);
	vmwrite(GUEST_LDTR_AR_BYTES, 0x10000);
	vmwrite(GUEST_TR_AR_BYTES, 0x8b);
	// Bases: 0 for ES/CS/SS/DS/LDTR; FS, GS, TR, GDTR and IDTR reuse the
	// values stored in the host-state fields.
    vmwrite(GUEST_ES_BASE, 0);
	vmwrite(GUEST_CS_BASE, 0);
	vmwrite(GUEST_SS_BASE, 0);
	vmwrite(GUEST_DS_BASE, 0);
	vmwrite(GUEST_FS_BASE, vmreadz(HOST_FS_BASE));
	vmwrite(GUEST_GS_BASE, vmreadz(HOST_GS_BASE));
	vmwrite(GUEST_LDTR_BASE, 0);
	vmwrite(GUEST_TR_BASE, vmreadz(HOST_TR_BASE));
	vmwrite(GUEST_GDTR_BASE, vmreadz(HOST_GDTR_BASE));
	vmwrite(GUEST_IDTR_BASE, vmreadz(HOST_IDTR_BASE));
}

Let's set the required MSRs now.

...
/* VMCS field encodings of the guest-state MSR fields. */
#define GUEST_IA32_DEBUGCTL				0x00002802
#define GUEST_IA32_PAT					0x00002804
#define GUEST_IA32_EFER					0x00002806
#define GUEST_IA32_PERF_GLOBAL_CTRL		0x00002808
#define GUEST_SYSENTER_CS				0x0000482A
#define GUEST_SYSENTER_ESP				0x00006824
#define GUEST_SYSENTER_EIP				0x00006826
...
bool initVmcsControlField(void) {
...    
	// Guest MSR fields: DEBUGCTL is cleared, the rest mirror the host-state
	// fields.
	// NOTE(review): HOST_IA32_PAT, HOST_IA32_EFER and
	// HOST_IA32_PERF_GLOBAL_CTRL are read here, but their definitions and the
	// writes that populate them are not among the snippets shown - confirm
	// they are set up elsewhere before this runs.
    vmwrite(GUEST_IA32_DEBUGCTL, 0);
	vmwrite(GUEST_IA32_PAT, vmreadz(HOST_IA32_PAT));
	vmwrite(GUEST_IA32_EFER, vmreadz(HOST_IA32_EFER));
	vmwrite(GUEST_IA32_PERF_GLOBAL_CTRL,
		vmreadz(HOST_IA32_PERF_GLOBAL_CTRL));
    vmwrite(GUEST_SYSENTER_CS, vmreadz(HOST_IA32_SYSENTER_CS));
    vmwrite(GUEST_SYSENTER_ESP, vmreadz(HOST_IA32_SYSENTER_ESP));
	vmwrite(GUEST_SYSENTER_EIP, vmreadz(HOST_IA32_SYSENTER_EIP));
 }

Guest Non Register state

It consists of required fields that do not correspond to processor registers.

We are only required to set a few fields from the above list to make things work.

...
/* VMCS field encodings of the guest non-register state fields. */
#define GUEST_ACTIVITY_STATE			0X00004826
#define VMX_PREEMPTION_TIMER_VALUE		0x0000482E
#define VMCS_LINK_POINTER				0x00002800
#define GUEST_INTR_STATUS				0x00000810
#define GUEST_PML_INDEX					0x00000812
...
bool initVmcsControlField(void) {
...    
	// Guest non-register state: every field is zeroed except the VMCS link
	// pointer, which is set to all ones (-1ll).
	vmwrite(GUEST_ACTIVITY_STATE, 0);
    vmwrite(VMCS_LINK_POINTER, -1ll);
    vmwrite(VMX_PREEMPTION_TIMER_VALUE, 0);
    vmwrite(GUEST_INTR_STATUS, 0);
	vmwrite(GUEST_PML_INDEX, 0);
}

What remains now is setting the guest RIP and RSP. These select the code and the stack that will be used after VMLAUNCH as the virtual machine's code.

...
/* Number of unsigned long slots in the guest stack array. */
#define GUEST_STACK_SIZE 				64
...
/*
 * Code the guest starts executing after VM entry; initVmcsControlField()
 * writes this function's address to GUEST_RIP.
 *
 * It issues a single VMCALL, which hands control back to the hypervisor.
 * The __asm__ __volatile__ spelling matches the other inline-asm helpers in
 * this file and also compiles in strict ISO C modes, where the bare `asm`
 * keyword is not available.
 */
static void guest_code(void)
{
	__asm__ __volatile__("vmcall");
}

bool initVmcsControlField(void) {
...    
	void *costum_rip;
	void *costum_rsp;
	
	unsigned long guest_stack[GUEST_STACK_SIZE];
	costum_rsp = &guest_stack[GUEST_STACK_SIZE];
	costum_rip = guest_code;
	vmwrite(GUEST_RSP, (uint64_t)costum_rsp);
	vmwrite(GUEST_RIP, (uint64_t)costum_rip);
}

Here, the rsp inside the guest will point to the end of the guest_stack buffer (the stack grows downward from there) and rip will point to the code of guest_code. So, whatever inline assembly we put inside guest_code will run inside our virtual environment.

Launching the Guest

Now it is time to run vmlaunch. But before that, we need to push our register state and set up the host rip and rsp, which we left out earlier.

/* VMCS field encodings of the host stack pointer and instruction pointer. */
#define HOST_RSP						0x00006c14
#define	HOST_RIP						0x00006c16

/*
 * Execute VMLAUNCH with HOST_RSP/HOST_RIP pointing back into this function.
 *
 * Returns the value popped into rax at label 1:
 *   0 - label 1 was reached with the pushed 0 untouched, i.e. through
 *       HOST_RIP after a VM exit;
 *   1 - VMLAUNCH fell through to the incq, i.e. the launch itself failed.
 */
static inline int _vmlaunch(void)
{
	int ret;

	/* Save the registers that the pops after label 1 restore. */
	__asm__ __volatile__("push %%rbp;"
			     "push %%rcx;"
			     "push %%rdx;"
			     "push %%rsi;"
			     "push %%rdi;"
			     /* Stack slot that becomes the return value. */
			     "push $0;"
			     /* The host resumes on this stack ... */
			     "vmwrite %%rsp, %[host_rsp];"
			     /* ... and at label 1 below. */
			     "lea 1f(%%rip), %%rax;"
			     "vmwrite %%rax, %[host_rip];"
			     "vmlaunch;"
			     /* Only reached when VMLAUNCH fails: 0 -> 1. */
			     "incq (%%rsp);"
			     "1: pop %%rax;"
			     "pop %%rdi;"
			     "pop %%rsi;"
			     "pop %%rdx;"
			     "pop %%rcx;"
			     "pop %%rbp;"
			     : [ret]"=&a"(ret)
			     : [host_rsp]"r"((uint64_t)HOST_RSP),
			       [host_rip]"r"((uint64_t)HOST_RIP)
			     /* Registers not saved by the pushes are declared clobbered
			      * (presumably because the guest may leave any value in
			      * them). */
			     : "memory", "cc", "rbx", "r8", "r9", "r10",
			       "r11", "r12", "r13", "r14", "r15");
	return ret;
}

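Before vmlaunch we write the current rsp to HOST_RSP and the address of label 1 (the first pop after vmlaunch) to HOST_RIP, so that after a VM exit execution resumes with the code following vmlaunch. The incq (%%rsp) instruction is reached only when vmlaunch itself fails; it turns the pushed 0 into 1, which then becomes the return value.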

Now, let's call that function and check the VM exit reason. The exit reason tells us what caused the VM exit. It is very useful for debugging.

/* VMCS field encoding of the exit-reason field. */
#define VM_EXIT_REASON			 		0x00004402

// Basic VM exit reason (CH 27.2.1, Vol 3).
// Reads the exit-reason field and keeps only its low 16 bits.
uint32_t vmExit_reason(void) {
	return (uint32_t)vmreadz(VM_EXIT_REASON) & 0xffff;
}

/*
 * Launch the guest and log the reason of the resulting VM exit.
 * Returns false when _vmlaunch() reports a non-zero status, true otherwise.
 */
bool initVmLaunchProcess(void){
	if (_vmlaunch() != 0)
		return false;

	printk(KERN_INFO "VM exit reason is %lu!\n", (unsigned long)vmExit_reason());
	return true;
}

By now we should have running guest VM code. You should get a VM exit reason of 0x12 (printed as 18 in decimal), which corresponds to VMCALL. If not, look up the exit reason in the exit reason table in Appendix C of the Intel 64 and IA-32 Architectures Software Developer's Manual, Vol. 3, to identify the cause.

VMXOFF operation

If you are able to get a successful vmlaunch then you can free all the allocated memory and execute vmxoff before unloading the module.

// Dealloc vmxon region.
// Returns true when a region was allocated and has been freed, false when
// there was nothing to free. The pointer is reset to NULL afterwards so a
// repeated call cannot free the same memory twice.
bool deallocate_vmxon_region(void) {
	if (!vmxonRegion)
		return false;

	kfree(vmxonRegion);
	vmxonRegion = NULL;
	return true;
}

/*
 * Dealloc vmcs guest region.
 * Returns true when a region was allocated and has been freed, false when
 * there was nothing to free. The pointer is reset to NULL afterwards so a
 * repeated call cannot free the same memory twice.
 */
bool deallocate_vmcs_region(void) {
	if (!vmcsRegion)
		return false;

	printk(KERN_INFO "Freeing allocated vmcs region!\n");
	kfree(vmcsRegion);
	vmcsRegion = NULL;
	return true;
}

/*
 * Leave VMX operation and release the VMXON and VMCS regions.
 *
 * VMXOFF is executed first: while the processor is still in VMX operation it
 * may keep using the VMXON region and the current VMCS, so that memory must
 * not be returned to the allocator until VMX operation has ended. The
 * "memory" clobber keeps the compiler from moving memory accesses across
 * the instruction.
 *
 * Always returns true; the outcome of each free is only logged.
 */
bool vmxoffOperation(void)
{
	__asm__ __volatile__("vmxoff\n" : : : "cc", "memory");

	if (deallocate_vmxon_region()) {
		printk(KERN_INFO "Successfully freed allocated vmxon region!\n");
	}
	else {
		printk(KERN_INFO "Error freeing allocated vmxon region!\n");
	}
	if (deallocate_vmcs_region()) {
		printk(KERN_INFO "Successfully freed allocated vmcs region!\n");
	}
	else {
		printk(KERN_INFO "Error freeing allocated vmcs region!\n");
	}
	return true;
}

You can see the complete code here.

Conclusion

So, our minimal hypervisor is ready to execute the instructions defined in the guest_code function. There is a lot of further functionality that you can add, like EPT, VT-d (for I/O support), etc., to make your hypervisor as useful as and comparable to the hypervisors on the market.
I may continue the series with further features.