Skip to content

Commit

Permalink
KVM: nVMX: Prepare vmcs02 from vmcs01 and vmcs12
Browse files Browse the repository at this point in the history
This patch contains code to prepare the VMCS which can be used to actually
run the L2 guest, vmcs02. prepare_vmcs02 appropriately merges the information
in vmcs12 (the vmcs that L1 built for L2) and in vmcs01 (our desires for our
own guests).

Signed-off-by: Nadav Har'El <nyh@il.ibm.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
  • Loading branch information
Nadav Har'El authored and Avi Kivity committed Jul 12, 2011
1 parent bf8179a commit fe3ef05
Showing 1 changed file with 279 additions and 0 deletions.
279 changes: 279 additions & 0 deletions arch/x86/kvm/vmx.c
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,12 @@ struct nested_vmx {
/* vmcs02_list cache of VMCSs recently used to run L2 guests */
struct list_head vmcs02_pool;
int vmcs02_num;
u64 vmcs01_tsc_offset;
/*
* Guest pages referred to in vmcs02 with host-physical pointers, so
* we must keep them pinned while L2 runs.
*/
struct page *apic_access_page;
};

struct vcpu_vmx {
Expand Down Expand Up @@ -847,6 +853,18 @@ static inline bool report_flexpriority(void)
return flexpriority_enabled;
}

static inline bool nested_cpu_has(struct vmcs12 *vmcs12, u32 bit)
{
return vmcs12->cpu_based_vm_exec_control & bit;
}

static inline bool nested_cpu_has2(struct vmcs12 *vmcs12, u32 bit)
{
return (vmcs12->cpu_based_vm_exec_control &
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) &&
(vmcs12->secondary_vm_exec_control & bit);
}

static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
{
int i;
Expand Down Expand Up @@ -1441,6 +1459,22 @@ static void vmx_fpu_activate(struct kvm_vcpu *vcpu)

static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);

/*
* Return the cr0 value that a nested guest would read. This is a combination
* of the real cr0 used to run the guest (guest_cr0), and the bits shadowed by
* its hypervisor (cr0_read_shadow).
*/
static inline unsigned long nested_read_cr0(struct vmcs12 *fields)
{
return (fields->guest_cr0 & ~fields->cr0_guest_host_mask) |
(fields->cr0_read_shadow & fields->cr0_guest_host_mask);
}
static inline unsigned long nested_read_cr4(struct vmcs12 *fields)
{
return (fields->guest_cr4 & ~fields->cr4_guest_host_mask) |
(fields->cr4_read_shadow & fields->cr4_guest_host_mask);
}

static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
{
vmx_decache_cr0_guest_bits(vcpu);
Expand Down Expand Up @@ -3438,6 +3472,9 @@ static void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
if (enable_ept)
vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
if (is_guest_mode(&vmx->vcpu))
vmx->vcpu.arch.cr4_guest_owned_bits &=
~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
}

Expand Down Expand Up @@ -4736,6 +4773,11 @@ static void free_nested(struct vcpu_vmx *vmx)
vmx->nested.current_vmptr = -1ull;
vmx->nested.current_vmcs12 = NULL;
}
/* Unpin physical memory we referred to in current vmcs02 */
if (vmx->nested.apic_access_page) {
nested_release_page(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = 0;
}

nested_free_all_saved_vmcss(vmx);
}
Expand Down Expand Up @@ -5810,6 +5852,243 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
}

/*
* prepare_vmcs02 is called when the L1 guest hypervisor runs its nested
* L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it
* with L0's requirements for its guest (a.k.a. vmsc01), so we can run the L2
* guest in a way that will both be appropriate to L1's requests, and our
* needs. In addition to modifying the active vmcs (which is vmcs02), this
* function also has additional necessary side-effects, like setting various
* vcpu->arch fields.
*/
static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 exec_control;

vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector);
vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector);
vmcs_write16(GUEST_SS_SELECTOR, vmcs12->guest_ss_selector);
vmcs_write16(GUEST_DS_SELECTOR, vmcs12->guest_ds_selector);
vmcs_write16(GUEST_FS_SELECTOR, vmcs12->guest_fs_selector);
vmcs_write16(GUEST_GS_SELECTOR, vmcs12->guest_gs_selector);
vmcs_write16(GUEST_LDTR_SELECTOR, vmcs12->guest_ldtr_selector);
vmcs_write16(GUEST_TR_SELECTOR, vmcs12->guest_tr_selector);
vmcs_write32(GUEST_ES_LIMIT, vmcs12->guest_es_limit);
vmcs_write32(GUEST_CS_LIMIT, vmcs12->guest_cs_limit);
vmcs_write32(GUEST_SS_LIMIT, vmcs12->guest_ss_limit);
vmcs_write32(GUEST_DS_LIMIT, vmcs12->guest_ds_limit);
vmcs_write32(GUEST_FS_LIMIT, vmcs12->guest_fs_limit);
vmcs_write32(GUEST_GS_LIMIT, vmcs12->guest_gs_limit);
vmcs_write32(GUEST_LDTR_LIMIT, vmcs12->guest_ldtr_limit);
vmcs_write32(GUEST_TR_LIMIT, vmcs12->guest_tr_limit);
vmcs_write32(GUEST_GDTR_LIMIT, vmcs12->guest_gdtr_limit);
vmcs_write32(GUEST_IDTR_LIMIT, vmcs12->guest_idtr_limit);
vmcs_write32(GUEST_ES_AR_BYTES, vmcs12->guest_es_ar_bytes);
vmcs_write32(GUEST_CS_AR_BYTES, vmcs12->guest_cs_ar_bytes);
vmcs_write32(GUEST_SS_AR_BYTES, vmcs12->guest_ss_ar_bytes);
vmcs_write32(GUEST_DS_AR_BYTES, vmcs12->guest_ds_ar_bytes);
vmcs_write32(GUEST_FS_AR_BYTES, vmcs12->guest_fs_ar_bytes);
vmcs_write32(GUEST_GS_AR_BYTES, vmcs12->guest_gs_ar_bytes);
vmcs_write32(GUEST_LDTR_AR_BYTES, vmcs12->guest_ldtr_ar_bytes);
vmcs_write32(GUEST_TR_AR_BYTES, vmcs12->guest_tr_ar_bytes);
vmcs_writel(GUEST_ES_BASE, vmcs12->guest_es_base);
vmcs_writel(GUEST_CS_BASE, vmcs12->guest_cs_base);
vmcs_writel(GUEST_SS_BASE, vmcs12->guest_ss_base);
vmcs_writel(GUEST_DS_BASE, vmcs12->guest_ds_base);
vmcs_writel(GUEST_FS_BASE, vmcs12->guest_fs_base);
vmcs_writel(GUEST_GS_BASE, vmcs12->guest_gs_base);
vmcs_writel(GUEST_LDTR_BASE, vmcs12->guest_ldtr_base);
vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);

vmcs_write64(GUEST_IA32_DEBUGCTL, vmcs12->guest_ia32_debugctl);
vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
vmcs12->vm_entry_intr_info_field);
vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
vmcs12->vm_entry_exception_error_code);
vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
vmcs12->vm_entry_instruction_len);
vmcs_write32(GUEST_INTERRUPTIBILITY_INFO,
vmcs12->guest_interruptibility_info);
vmcs_write32(GUEST_ACTIVITY_STATE, vmcs12->guest_activity_state);
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->guest_sysenter_cs);
vmcs_writel(GUEST_DR7, vmcs12->guest_dr7);
vmcs_writel(GUEST_RFLAGS, vmcs12->guest_rflags);
vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
vmcs12->guest_pending_dbg_exceptions);
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->guest_sysenter_esp);
vmcs_writel(GUEST_SYSENTER_EIP, vmcs12->guest_sysenter_eip);

vmcs_write64(VMCS_LINK_POINTER, -1ull);

vmcs_write32(PIN_BASED_VM_EXEC_CONTROL,
(vmcs_config.pin_based_exec_ctrl |
vmcs12->pin_based_vm_exec_control));

/*
* Whether page-faults are trapped is determined by a combination of
* 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
* If enable_ept, L0 doesn't care about page faults and we should
* set all of these to L1's desires. However, if !enable_ept, L0 does
* care about (at least some) page faults, and because it is not easy
* (if at all possible?) to merge L0 and L1's desires, we simply ask
* to exit on each and every L2 page fault. This is done by setting
* MASK=MATCH=0 and (see below) EB.PF=1.
* Note that below we don't need special code to set EB.PF beyond the
* "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
* vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
* !enable_ept, EB.PF is 1, so the "or" will always be 1.
*
* A problem with this approach (when !enable_ept) is that L1 may be
* injected with more page faults than it asked for. This could have
* caused problems, but in practice existing hypervisors don't care.
* To fix this, we will need to emulate the PFEC checking (on the L1
* page tables), using walk_addr(), when injecting PFs to L1.
*/
vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
enable_ept ? vmcs12->page_fault_error_code_mask : 0);
vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
enable_ept ? vmcs12->page_fault_error_code_match : 0);

if (cpu_has_secondary_exec_ctrls()) {
u32 exec_control = vmx_secondary_exec_control(vmx);
if (!vmx->rdtscp_enabled)
exec_control &= ~SECONDARY_EXEC_RDTSCP;
/* Take the following fields only from vmcs12 */
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS))
exec_control |= vmcs12->secondary_vm_exec_control;

if (exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES) {
/*
* Translate L1 physical address to host physical
* address for vmcs02. Keep the page pinned, so this
* physical address remains valid. We keep a reference
* to it so we can release it later.
*/
if (vmx->nested.apic_access_page) /* shouldn't happen */
nested_release_page(vmx->nested.apic_access_page);
vmx->nested.apic_access_page =
nested_get_page(vcpu, vmcs12->apic_access_addr);
/*
* If translation failed, no matter: This feature asks
* to exit when accessing the given address, and if it
* can never be accessed, this feature won't do
* anything anyway.
*/
if (!vmx->nested.apic_access_page)
exec_control &=
~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
else
vmcs_write64(APIC_ACCESS_ADDR,
page_to_phys(vmx->nested.apic_access_page));
}

vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
}


/*
* Set host-state according to L0's settings (vmcs12 is irrelevant here)
* Some constant fields are set here by vmx_set_constant_host_state().
* Other fields are different per CPU, and will be set later when
* vmx_vcpu_load() is called, and when vmx_save_host_state() is called.
*/
vmx_set_constant_host_state();

/*
* HOST_RSP is normally set correctly in vmx_vcpu_run() just before
* entry, but only if the current (host) sp changed from the value
* we wrote last (vmx->host_rsp). This cache is no longer relevant
* if we switch vmcs, and rather than hold a separate cache per vmcs,
* here we just force the write to happen on entry.
*/
vmx->host_rsp = 0;

exec_control = vmx_exec_control(vmx); /* L0's desires */
exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
exec_control &= ~CPU_BASED_TPR_SHADOW;
exec_control |= vmcs12->cpu_based_vm_exec_control;
/*
* Merging of IO and MSR bitmaps not currently supported.
* Rather, exit every time.
*/
exec_control &= ~CPU_BASED_USE_MSR_BITMAPS;
exec_control &= ~CPU_BASED_USE_IO_BITMAPS;
exec_control |= CPU_BASED_UNCOND_IO_EXITING;

vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, exec_control);

/* EXCEPTION_BITMAP and CR0_GUEST_HOST_MASK should basically be the
* bitwise-or of what L1 wants to trap for L2, and what we want to
* trap. Note that CR0.TS also needs updating - we do this later.
*/
update_exception_bitmap(vcpu);
vcpu->arch.cr0_guest_owned_bits &= ~vmcs12->cr0_guest_host_mask;
vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);

/* Note: IA32_MODE, LOAD_IA32_EFER are modified by vmx_set_efer below */
vmcs_write32(VM_EXIT_CONTROLS,
vmcs12->vm_exit_controls | vmcs_config.vmexit_ctrl);
vmcs_write32(VM_ENTRY_CONTROLS, vmcs12->vm_entry_controls |
(vmcs_config.vmentry_ctrl & ~VM_ENTRY_IA32E_MODE));

if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmcs12->guest_ia32_pat);
else if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);


set_cr4_guest_host_mask(vmx);

vmcs_write64(TSC_OFFSET,
vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);

if (enable_vpid) {
/*
* Trivially support vpid by letting L2s share their parent
* L1's vpid. TODO: move to a more elaborate solution, giving
* each L2 its own vpid and exposing the vpid feature to L1.
*/
vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
vmx_flush_tlb(vcpu);
}

if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER)
vcpu->arch.efer = vmcs12->guest_ia32_efer;
if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE)
vcpu->arch.efer |= (EFER_LMA | EFER_LME);
else
vcpu->arch.efer &= ~(EFER_LMA | EFER_LME);
/* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */
vmx_set_efer(vcpu, vcpu->arch.efer);

/*
* This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified
* TS bit (for lazy fpu) and bits which we consider mandatory enabled.
* The CR0_READ_SHADOW is what L2 should have expected to read given
* the specifications by L1; It's not enough to take
* vmcs12->cr0_read_shadow because on our cr0_guest_host_mask we we
* have more bits than L1 expected.
*/
vmx_set_cr0(vcpu, vmcs12->guest_cr0);
vmcs_writel(CR0_READ_SHADOW, nested_read_cr0(vmcs12));

vmx_set_cr4(vcpu, vmcs12->guest_cr4);
vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12));

/* shadow page tables on either EPT or shadow page tables */
kvm_set_cr3(vcpu, vmcs12->guest_cr3);
kvm_mmu_reset_context(vcpu);

kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp);
kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip);
}

static int vmx_check_intercept(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
Expand Down

0 comments on commit fe3ef05

Please sign in to comment.