int vmx_add_host_load_msr(struct vcpu *v, u32 msr)
{
    unsigned int i, msr_count = v->arch.hvm_vmx.host_msr_count;
    struct vmx_msr_entry *msr_area = v->arch.hvm_vmx.host_msr_area;

    for ( i = 0; i < msr_count; i++ )
        if ( msr_area[i].index == msr )
            return 0;

    if ( msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
        return -ENOSPC;

    if ( msr_area == NULL )
    {
        if ( (msr_area = alloc_xenheap_page()) == NULL )
            return -ENOMEM;
        v->arch.hvm_vmx.host_msr_area = msr_area;
        __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(msr_area));
    }

    msr_area[msr_count].index = msr;
    msr_area[msr_count].mbz   = 0;
    rdmsrl(msr, msr_area[msr_count].data);
    v->arch.hvm_vmx.host_msr_count = ++msr_count;
    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, msr_count);

    return 0;
}
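/*
 * For reference: the exit-MSR-load entries filled in above follow the VMX
 * MSR load/store area format (a sketch for illustration; the field names
 * match those used by the code). At 16 bytes per entry, one 4 KiB page
 * holds PAGE_SIZE / sizeof(struct vmx_msr_entry) = 4096 / 16 = 256
 * entries, which is the -ENOSPC bound checked above.
 */
struct vmx_msr_entry {
    u32 index;   /* MSR number */
    u32 mbz;     /* reserved, must be zero */
    u64 data;    /* value loaded into the MSR on VM exit */
};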
static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
{
    struct vcpu *curr = current;
    int rc;

    perfc_incr(realmode_emulations);

    rc = hvm_emulate_one(hvmemul_ctxt);

    if ( rc == X86EMUL_UNHANDLEABLE )
    {
        gdprintk(XENLOG_ERR, "Failed to emulate insn.\n");
        goto fail;
    }

    if ( rc == X86EMUL_EXCEPTION )
    {
        if ( !hvmemul_ctxt->exn_pending )
        {
            unsigned long intr_info;

            __vmread(VM_ENTRY_INTR_INFO, &intr_info);
            __vmwrite(VM_ENTRY_INTR_INFO, 0);
            if ( !(intr_info & INTR_INFO_VALID_MASK) )
            {
                gdprintk(XENLOG_ERR, "Exception pending but no info.\n");
                goto fail;
            }
            hvmemul_ctxt->trap.vector = (uint8_t)intr_info;
            hvmemul_ctxt->trap.insn_len = 0;
        }

        if ( unlikely(curr->domain->debugger_attached) &&
             ((hvmemul_ctxt->trap.vector == TRAP_debug) ||
              (hvmemul_ctxt->trap.vector == TRAP_int3)) )
        {
            domain_pause_for_debugger();
        }
        else if ( curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE )
        {
            gdprintk(XENLOG_ERR, "Exception %02x in protected mode.\n",
                     hvmemul_ctxt->trap.vector);
            goto fail;
        }
        else
        {
            realmode_deliver_exception(
                hvmemul_ctxt->trap.vector,
                hvmemul_ctxt->trap.insn_len,
                hvmemul_ctxt);
        }
    }

    return;

 fail:
    hvm_dump_emulation_state(XENLOG_G_ERR "Real-mode", hvmemul_ctxt);
    domain_crash(curr->domain);
}
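/*
 * For reference: the truncation "(uint8_t)intr_info" above relies on the
 * layout of the VMX VM-entry interruption-information field (SDM Vol. 3);
 * a sketch of the relevant bits:
 *
 *   bits  7:0  - vector
 *   bits 10:8  - interruption type (ext. interrupt, NMI, HW/SW exception)
 *   bit  11    - deliver error code
 *   bit  31    - valid (INTR_INFO_VALID_MASK)
 */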
static void vmx_set_host_env(struct vcpu *v)
{
    unsigned int cpu = smp_processor_id();

    __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);

    __vmwrite(HOST_TR_SELECTOR, __TSS(cpu) << 3);
    __vmwrite(HOST_TR_BASE, (unsigned long)&init_tss[cpu]);

    __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());

    /*
     * Skip the end of cpu_user_regs when entering the hypervisor because
     * the CPU does not save that context onto the stack: SS, RSP, CS, RIP,
     * RFLAGS, etc. all get saved into the VMCS instead.
     */
    __vmwrite(HOST_RSP,
              (unsigned long)&get_cpu_info()->guest_cpu_user_regs.error_code);
}
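/*
 * A sketch of the assumption behind that HOST_RSP value (layout abridged;
 * illustrative, not the full Xen definition): in struct cpu_user_regs the
 * software-saved GPRs sit below error_code, while the fields above it
 * (rip, cs, rflags, rsp, ss) are exactly the ones a VM exit restores from
 * the VMCS host-state area rather than pushing onto the stack. The host
 * stack pointer can therefore start right at &error_code, and the exit
 * handler's register pushes land in the GPR slots below it.
 */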
static void enable_intr_window(struct vcpu *v, struct hvm_intack intack)
{
    u32 *cpu_exec_control = &v->arch.hvm_vmx.exec_control;
    u32 ctl = CPU_BASED_VIRTUAL_INTR_PENDING;

    ASSERT(intack.source != hvm_intsrc_none);

    if ( unlikely(tb_init_done) )
    {
        unsigned int intr = __vmread(VM_ENTRY_INTR_INFO);
        HVMTRACE_3D(INTR_WINDOW, intack.vector, intack.source,
                    (intr & INTR_INFO_VALID_MASK) ? intr & 0xff : -1);
    }

    if ( (intack.source == hvm_intsrc_nmi) && cpu_has_vmx_vnmi )
    {
        /*
         * We set MOV-SS blocking in lieu of STI blocking when delivering an
         * NMI, because it is processor-specific whether STI-blocking blocks
         * NMIs. Hence we *must* clear STI-blocking on NMI delivery
         * (otherwise vmentry will fail on processors that check for STI-
         * blocking), but if the processor does not check for STI-blocking
         * then we may immediately vmexit and hence make no progress!
         * (See SDM 3B 21.3, "Other Causes of VM Exits".)
         */
        u32 intr_shadow = __vmread(GUEST_INTERRUPTIBILITY_INFO);

        if ( intr_shadow & VMX_INTR_SHADOW_STI )
        {
            /* Having both STI-blocking and MOV-SS-blocking fails vmentry. */
            intr_shadow &= ~VMX_INTR_SHADOW_STI;
            intr_shadow |= VMX_INTR_SHADOW_MOV_SS;
            __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
        }
        ctl = CPU_BASED_VIRTUAL_NMI_PENDING;
    }

    if ( !(*cpu_exec_control & ctl) )
    {
        *cpu_exec_control |= ctl;
        __vmwrite(CPU_BASED_VM_EXEC_CONTROL, *cpu_exec_control);
    }
}
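/*
 * For reference, the guest interruptibility-state bits manipulated above,
 * as defined by the SDM's interruptibility-state format (shown here for
 * illustration):
 */
#define VMX_INTR_SHADOW_STI     0x00000001  /* blocking by STI            */
#define VMX_INTR_SHADOW_MOV_SS  0x00000002  /* blocking by MOV SS/POP SS  */
#define VMX_INTR_SHADOW_SMI     0x00000004  /* blocking by SMI            */
#define VMX_INTR_SHADOW_NMI     0x00000008  /* blocking by NMI            */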
void vmx_do_resume(struct vcpu *v)
{
    bool_t debug_state;

    if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
    {
        if ( v->arch.hvm_vmx.vmcs != this_cpu(current_vmcs) )
            vmx_load_vmcs(v);
    }
    else
    {
        /*
         * For a pass-through domain, the guest's PCI-E device driver may
         * use "Non-Snoop" I/O and explicitly WBINVD or CLFLUSH a RAM range.
         * Since migration may occur before the WBINVD or CLFLUSH, we must
         * maintain data consistency either by:
         *  1: flushing the cache (wbinvd) when the guest is scheduled out
         *     if there is no wbinvd exit, or
         *  2: executing wbinvd on all dirty pCPUs when the guest's wbinvd
         *     exits.
         */
        if ( !list_empty(&(domain_hvm_iommu(v->domain)->pdev_list)) &&
             !cpu_has_wbinvd_exiting )
        {
            int cpu = v->arch.hvm_vmx.active_cpu;
            if ( cpu != -1 )
                on_selected_cpus(cpumask_of_cpu(cpu), wbinvd_ipi, NULL, 1, 1);
        }

        vmx_clear_vmcs(v);
        vmx_load_vmcs(v);
        hvm_migrate_timers(v);
        vmx_set_host_env(v);
    }

    debug_state = v->domain->debugger_attached;
    if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
    {
        unsigned long intercepts = __vmread(EXCEPTION_BITMAP);
        unsigned long mask = (1U << TRAP_debug) | (1U << TRAP_int3);
        v->arch.hvm_vcpu.debug_state_latch = debug_state;
        if ( debug_state )
            intercepts |= mask;
        else
            intercepts &= ~mask;
        __vmwrite(EXCEPTION_BITMAP, intercepts);
    }

    hvm_do_resume(v);
    reset_stack_and_jump(vmx_asm_do_vmentry);
}
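/*
 * A minimal sketch of the IPI handler referenced above (the real Xen
 * helper is equally trivial): it simply writes back and invalidates the
 * cache on the pCPU the vCPU last ran on.
 */
static void wbinvd_ipi(void *info)
{
    wbinvd();
}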
asmlinkage void vmx_intr_assist(void)
{
    struct hvm_intack intack;
    struct vcpu *v = current;
    unsigned int tpr_threshold = 0;
    enum hvm_intblk intblk;

    /* Block event injection when single-stepping with MTF. */
    if ( unlikely(v->arch.hvm_vcpu.single_step) )
    {
        v->arch.hvm_vmx.exec_control |= CPU_BASED_MONITOR_TRAP_FLAG;
        __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
        return;
    }

    /* Crank the handle on interrupt state. */
    pt_update_irq(v);
    hvm_dirq_assist(v);

    do {
        intack = hvm_vcpu_has_pending_irq(v);
        if ( likely(intack.source == hvm_intsrc_none) )
            goto out;

        intblk = hvm_interrupt_blocked(v, intack);
        if ( intblk == hvm_intblk_tpr )
        {
            ASSERT(vlapic_enabled(vcpu_vlapic(v)));
            ASSERT(intack.source == hvm_intsrc_lapic);
            tpr_threshold = intack.vector >> 4;
            goto out;
        }

        if ( (intblk != hvm_intblk_none) ||
             (__vmread(VM_ENTRY_INTR_INFO) & INTR_INFO_VALID_MASK) )
        {
            enable_intr_window(v, intack);
            goto out;
        }

        intack = hvm_vcpu_ack_pending_irq(v, intack);
    } while ( intack.source == hvm_intsrc_none );

    if ( intack.source == hvm_intsrc_nmi )
    {
        vmx_inject_nmi(v);
    }
    else
    {
        HVMTRACE_2D(INJ_VIRQ, intack.vector, /*fake=*/ 0);
        vmx_inject_extint(v, intack.vector);
        pt_intr_post(v, intack);
    }

    /* Is there another IRQ to queue up behind this one? */
    intack = hvm_vcpu_has_pending_irq(v);
    if ( unlikely(intack.source != hvm_intsrc_none) )
        enable_intr_window(v, intack);

 out:
    if ( cpu_has_vmx_tpr_shadow )
        __vmwrite(TPR_THRESHOLD, tpr_threshold);
}
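/*
 * A sketch of the interrupt-acknowledge token passed around above (field
 * names as used by the code; the hvm_intsrc_pic enumerator is shown for
 * illustration): "source" identifies the injecting agent and "vector" the
 * vector to deliver.
 */
struct hvm_intack {
    uint8_t source;  /* hvm_intsrc_none / _pic / _lapic / _nmi */
    uint8_t vector;
};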
static int construct_vmcs(struct vcpu *v)
{
    uint16_t sysenter_cs;
    unsigned long sysenter_eip;

    vmx_vmcs_enter(v);

    /* VMCS controls. */
    __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
    __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control);
    __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control);
    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmx_cpu_based_exec_control);
    v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
    if ( vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
        __vmwrite(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control);

    /* MSR access bitmap. */
    if ( cpu_has_vmx_msr_bitmap )
    {
        char *msr_bitmap = alloc_xenheap_page();

        if ( msr_bitmap == NULL )
        {
            vmx_vmcs_exit(v);
            return -ENOMEM;
        }
        memset(msr_bitmap, ~0, PAGE_SIZE);
        v->arch.hvm_vmx.msr_bitmap = msr_bitmap;
        __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap));
        vmx_disable_intercept_for_msr(v, MSR_FS_BASE);
        vmx_disable_intercept_for_msr(v, MSR_GS_BASE);
        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_CS);
        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_ESP);
        vmx_disable_intercept_for_msr(v, MSR_IA32_SYSENTER_EIP);
    }

    /* I/O access bitmap. */
    __vmwrite(IO_BITMAP_A, virt_to_maddr(hvm_io_bitmap));
    __vmwrite(IO_BITMAP_B, virt_to_maddr(hvm_io_bitmap + PAGE_SIZE));

    /* Host GDTR base. */
    __vmwrite(HOST_GDTR_BASE, GDT_VIRT_START(v));

    /* Host data selectors. */
    __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
    __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
    __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS);
    __vmwrite(HOST_FS_SELECTOR, 0);
    __vmwrite(HOST_GS_SELECTOR, 0);
    __vmwrite(HOST_FS_BASE, 0);
    __vmwrite(HOST_GS_BASE, 0);

    /* Host control registers. */
    v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
    __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
    __vmwrite(HOST_CR4, mmu_cr4_features);

    /* Host CS:RIP. */
    __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
    __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler);

    /* Host SYSENTER CS:RIP. */
    rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs);
    __vmwrite(HOST_SYSENTER_CS, sysenter_cs);
    rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip);
    __vmwrite(HOST_SYSENTER_EIP, sysenter_eip);

    /* MSR intercepts. */
    __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
    __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
    __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);

    __vmwrite(VM_ENTRY_INTR_INFO, 0);

    __vmwrite(CR0_GUEST_HOST_MASK, ~0UL);
    __vmwrite(CR4_GUEST_HOST_MASK, ~0UL);

    __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
    __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);

    __vmwrite(CR3_TARGET_COUNT, 0);

    __vmwrite(GUEST_ACTIVITY_STATE, 0);

    /* Guest segment bases. */
    __vmwrite(GUEST_ES_BASE, 0);
    __vmwrite(GUEST_SS_BASE, 0);
    __vmwrite(GUEST_DS_BASE, 0);
    __vmwrite(GUEST_FS_BASE, 0);
    __vmwrite(GUEST_GS_BASE, 0);
    __vmwrite(GUEST_CS_BASE, 0);

    /* Guest segment limits. */
    __vmwrite(GUEST_ES_LIMIT, ~0u);
    __vmwrite(GUEST_SS_LIMIT, ~0u);
    __vmwrite(GUEST_DS_LIMIT, ~0u);
    __vmwrite(GUEST_FS_LIMIT, ~0u);
    __vmwrite(GUEST_GS_LIMIT, ~0u);
    __vmwrite(GUEST_CS_LIMIT, ~0u);

    /* Guest segment AR bytes. */
    __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */
    __vmwrite(GUEST_SS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_DS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_FS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_GS_AR_BYTES, 0xc093);
    __vmwrite(GUEST_CS_AR_BYTES, 0xc09b); /* exec/read, accessed */

    /* Guest IDT. */
    __vmwrite(GUEST_IDTR_BASE, 0);
    __vmwrite(GUEST_IDTR_LIMIT, 0);

    /* Guest GDT. */
    __vmwrite(GUEST_GDTR_BASE, 0);
    __vmwrite(GUEST_GDTR_LIMIT, 0);

    /* Guest LDT. */
    __vmwrite(GUEST_LDTR_AR_BYTES, 0x0082); /* LDT */
    __vmwrite(GUEST_LDTR_SELECTOR, 0);
    __vmwrite(GUEST_LDTR_BASE, 0);
    __vmwrite(GUEST_LDTR_LIMIT, 0);

    /* Guest TSS. */
    __vmwrite(GUEST_TR_AR_BYTES, 0x008b); /* 32-bit TSS (busy) */
    __vmwrite(GUEST_TR_BASE, 0);
    __vmwrite(GUEST_TR_LIMIT, 0xff);

    __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
    __vmwrite(GUEST_DR7, 0);
    __vmwrite(VMCS_LINK_POINTER, ~0UL);
#if defined(__i386__)
    __vmwrite(VMCS_LINK_POINTER_HIGH, ~0UL);
#endif

    __vmwrite(EXCEPTION_BITMAP, (HVM_TRAP_MASK |
                                 (1U << TRAP_page_fault) |
                                 (1U << TRAP_no_device)));

    v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
    hvm_update_guest_cr(v, 0);

    v->arch.hvm_vcpu.guest_cr[4] = 0;
    hvm_update_guest_cr(v, 4);

    if ( cpu_has_vmx_tpr_shadow )
    {
        __vmwrite(VIRTUAL_APIC_PAGE_ADDR,
                  page_to_maddr(vcpu_vlapic(v)->regs_page));
        __vmwrite(TPR_THRESHOLD, 0);
    }

    vmx_vmcs_exit(v);

    paging_update_paging_modes(v); /* will update HOST & GUEST_CR3 as reqd */

    vmx_vlapic_msr_changed(v);

    return 0;
}
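/*
 * For reference, how the access-rights (AR) constants used above decode
 * under the VMX segment AR-byte layout (type in bits 3:0, S in bit 4, DPL
 * in bits 6:5, P in bit 7, AVL/L/DB/G in bits 12-15):
 *
 *   0xc093: G=1, D/B=1, P=1, DPL=0, S=1, type=3  (data, read/write, accessed)
 *   0xc09b: G=1, D/B=1, P=1, DPL=0, S=1, type=11 (code, exec/read, accessed)
 *   0x0082: P=1, S=0, type=2                     (LDT)
 *   0x008b: P=1, S=0, type=11                    (32-bit busy TSS)
 */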
static void realmode_deliver_exception(
    unsigned int vector,
    unsigned int insn_len,
    struct hvm_emulate_ctxt *hvmemul_ctxt)
{
    struct segment_register *idtr, *csr;
    struct cpu_user_regs *regs = hvmemul_ctxt->ctxt.regs;
    uint32_t cs_eip, pstk;
    uint16_t frame[3];
    unsigned int last_byte;

    idtr = hvmemul_get_seg_reg(x86_seg_idtr, hvmemul_ctxt);
    csr  = hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt);
    __set_bit(x86_seg_cs, &hvmemul_ctxt->seg_reg_dirty);

 again:
    last_byte = (vector * 4) + 3;
    if ( idtr->limit < last_byte ||
         hvm_copy_from_guest_phys(&cs_eip, idtr->base + vector * 4, 4) !=
         HVMCOPY_okay )
    {
        /* Software interrupt? */
        if ( insn_len != 0 )
        {
            insn_len = 0;
            vector = TRAP_gp_fault;
            goto again;
        }

        /* Exception or hardware interrupt. */
        switch ( vector )
        {
        case TRAP_double_fault:
            hvm_triple_fault();
            return;
        case TRAP_gp_fault:
            vector = TRAP_double_fault;
            goto again;
        default:
            vector = TRAP_gp_fault;
            goto again;
        }
    }

    frame[0] = regs->eip + insn_len;
    frame[1] = csr->sel;
    frame[2] = regs->eflags & ~X86_EFLAGS_RF;

    /* We can't test hvmemul_ctxt->ctxt.sp_size: it may not be initialised. */
    if ( hvmemul_ctxt->seg_reg[x86_seg_ss].attr.fields.db )
    {
        regs->esp -= 6;
        pstk = regs->esp;
    }
    else
    {
        pstk = (uint16_t)(regs->esp - 6);
        regs->esp &= ~0xffff;
        regs->esp |= pstk;
    }

    pstk += hvmemul_get_seg_reg(x86_seg_ss, hvmemul_ctxt)->base;
    (void)hvm_copy_to_guest_phys(pstk, frame, sizeof(frame));

    csr->sel  = cs_eip >> 16;
    csr->base = (uint32_t)csr->sel << 4;
    regs->eip = (uint16_t)cs_eip;
    regs->eflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_IF | X86_EFLAGS_RF);

    /* Exception delivery clears STI and MOV-SS blocking. */
    if ( hvmemul_ctxt->intr_shadow &
         (VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS) )
    {
        hvmemul_ctxt->intr_shadow &=
            ~(VMX_INTR_SHADOW_STI | VMX_INTR_SHADOW_MOV_SS);
        __vmwrite(GUEST_INTERRUPTIBILITY_INFO, hvmemul_ctxt->intr_shadow);
    }
}
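/*
 * A worked example of the IVT lookup above (values are illustrative): for
 * vector 0x08 the handler pointer lives at idtr->base + 0x08 * 4 = 0x20,
 * stored as a little-endian offset:segment dword. If that dword reads
 * 0xf000e2f3, the code above sets CS = 0xf000 (high word) and IP = 0xe2f3
 * (low word), i.e. delivery jumps to linear address
 * (0xf000 << 4) + 0xe2f3 = 0xfe2f3.
 */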
void vmx_realmode(struct cpu_user_regs *regs)
{
    struct vcpu *curr = current;
    struct hvm_emulate_ctxt hvmemul_ctxt;
    struct segment_register *sreg;
    struct hvm_vcpu_io *vio = &curr->arch.hvm_vcpu.hvm_io;
    unsigned long intr_info;
    unsigned int emulations = 0;

    /* Get-and-clear VM_ENTRY_INTR_INFO. */
    __vmread(VM_ENTRY_INTR_INFO, &intr_info);
    if ( intr_info & INTR_INFO_VALID_MASK )
        __vmwrite(VM_ENTRY_INTR_INFO, 0);

    hvm_emulate_prepare(&hvmemul_ctxt, regs);

    if ( vio->io_state == HVMIO_completed )
        realmode_emulate_one(&hvmemul_ctxt);

    /* Only deliver interrupts into emulated real mode. */
    if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) &&
         (intr_info & INTR_INFO_VALID_MASK) )
    {
        realmode_deliver_exception((uint8_t)intr_info, 0, &hvmemul_ctxt);
        intr_info = 0;
    }

    curr->arch.hvm_vmx.vmx_emulate = 1;
    while ( curr->arch.hvm_vmx.vmx_emulate &&
            !softirq_pending(smp_processor_id()) &&
            (vio->io_state == HVMIO_none) )
    {
        /*
         * Check for pending interrupts only every 16 instructions, because
         * hvm_local_events_need_delivery() is moderately expensive, and
         * only in real mode, because we don't emulate protected-mode IDT
         * vectoring.
         */
        if ( unlikely(!(++emulations & 15)) &&
             curr->arch.hvm_vmx.vmx_realmode &&
             hvm_local_events_need_delivery(curr) )
            break;

        realmode_emulate_one(&hvmemul_ctxt);

        /* Keep emulating only while our segment state is unsafe. */
        if ( curr->arch.hvm_vmx.vmx_realmode )
            curr->arch.hvm_vmx.vmx_emulate =
                (curr->arch.hvm_vmx.vm86_segment_mask != 0);
        else
            curr->arch.hvm_vmx.vmx_emulate =
                ((hvmemul_ctxt.seg_reg[x86_seg_cs].sel & 3) ||
                 (hvmemul_ctxt.seg_reg[x86_seg_ss].sel & 3));
    }

    /* Need to emulate next time round if we've started an I/O operation. */
    if ( vio->io_state != HVMIO_none )
        curr->arch.hvm_vmx.vmx_emulate = 1;

    if ( !curr->arch.hvm_vmx.vmx_emulate && !curr->arch.hvm_vmx.vmx_realmode )
    {
        /*
         * Cannot enter protected mode with bogus selector RPLs and DPLs.
         * At this point CS.RPL == SS.RPL == CS.DPL == SS.DPL == 0. For
         * DS, ES, FS and GS the least invasive trick is to set DPL == RPL.
         */
        sreg = hvmemul_get_seg_reg(x86_seg_ds, &hvmemul_ctxt);
        sreg->attr.fields.dpl = sreg->sel & 3;
        sreg = hvmemul_get_seg_reg(x86_seg_es, &hvmemul_ctxt);
        sreg->attr.fields.dpl = sreg->sel & 3;
        sreg = hvmemul_get_seg_reg(x86_seg_fs, &hvmemul_ctxt);
        sreg->attr.fields.dpl = sreg->sel & 3;
        sreg = hvmemul_get_seg_reg(x86_seg_gs, &hvmemul_ctxt);
        sreg->attr.fields.dpl = sreg->sel & 3;
        hvmemul_ctxt.seg_reg_dirty |=
            (1ul << x86_seg_ds) | (1ul << x86_seg_es) |
            (1ul << x86_seg_fs) | (1ul << x86_seg_gs);
    }

    hvm_emulate_writeback(&hvmemul_ctxt);

    /* Re-instate VM_ENTRY_INTR_INFO if we did not discharge it. */
    if ( intr_info & INTR_INFO_VALID_MASK )
        __vmwrite(VM_ENTRY_INTR_INFO, intr_info);
}
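/*
 * Note on the loop-exit condition above (a reading of the code, not a
 * normative description): while the guest is in real mode Xen runs it in
 * vm86 mode where possible, and vm86_segment_mask has a bit set for each
 * segment register whose current state cannot be expressed under vm86
 * constraints. Emulation therefore continues until the mask clears.
 */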
static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
{
    struct vcpu *curr = current;
    uint32_t intr_info;
    int rc;

    perfc_incr(realmode_emulations);

    rc = hvm_emulate_one(hvmemul_ctxt);

    if ( rc == X86EMUL_UNHANDLEABLE )
    {
        gdprintk(XENLOG_ERR, "Failed to emulate insn.\n");
        goto fail;
    }

    if ( rc == X86EMUL_EXCEPTION )
    {
        if ( !hvmemul_ctxt->exn_pending )
        {
            intr_info = __vmread(VM_ENTRY_INTR_INFO);
            __vmwrite(VM_ENTRY_INTR_INFO, 0);
            if ( !(intr_info & INTR_INFO_VALID_MASK) )
            {
                gdprintk(XENLOG_ERR, "Exception pending but no info.\n");
                goto fail;
            }
            hvmemul_ctxt->exn_vector = (uint8_t)intr_info;
            hvmemul_ctxt->exn_insn_len = 0;
        }

        if ( unlikely(curr->domain->debugger_attached) &&
             ((hvmemul_ctxt->exn_vector == TRAP_debug) ||
              (hvmemul_ctxt->exn_vector == TRAP_int3)) )
        {
            domain_pause_for_debugger();
        }
        else if ( curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE )
        {
            gdprintk(XENLOG_ERR, "Exception %02x in protected mode.\n",
                     hvmemul_ctxt->exn_vector);
            goto fail;
        }
        else
        {
            realmode_deliver_exception(
                hvmemul_ctxt->exn_vector,
                hvmemul_ctxt->exn_insn_len,
                hvmemul_ctxt);
        }
    }

    return;

 fail:
    gdprintk(XENLOG_ERR,
             "Real-mode emulation failed @ %04x:%08lx: "
             "%02x %02x %02x %02x %02x %02x\n",
             hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt)->sel,
             hvmemul_ctxt->insn_buf_eip,
             hvmemul_ctxt->insn_buf[0], hvmemul_ctxt->insn_buf[1],
             hvmemul_ctxt->insn_buf[2], hvmemul_ctxt->insn_buf[3],
             hvmemul_ctxt->insn_buf[4], hvmemul_ctxt->insn_buf[5]);
    domain_crash(curr->domain);
}