int set_mmio_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn)
{
    int rc = 0;
    p2m_access_t a;
    p2m_type_t ot;
    mfn_t omfn;
    struct p2m_domain *p2m = p2m_get_hostp2m(d);

    if ( !paging_mode_translate(d) )
        return 0;

    gfn_lock(p2m, gfn, 0);
    omfn = p2m->get_entry(p2m, gfn, &ot, &a, 0, NULL);
    if ( p2m_is_grant(ot) )
    {
        p2m_unlock(p2m);
        domain_crash(d);
        return 0;
    }
    else if ( p2m_is_ram(ot) )
    {
        ASSERT(mfn_valid(omfn));
        set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
    }

    P2M_DEBUG("set mmio %lx %lx\n", gfn, mfn_x(mfn));
    rc = set_p2m_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2m_mmio_direct,
                       p2m->default_access);
    gfn_unlock(p2m, gfn, 0);
    if ( 0 == rc )
        gdprintk(XENLOG_ERR,
                 "set_mmio_p2m_entry: set_p2m_entry failed! mfn=%08lx\n",
                 mfn_x(get_gfn_query_unlocked(p2m->domain, gfn, &ot)));
    return rc;
}
static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
{
    struct vcpu *curr = current;
    int rc;

    perfc_incr(realmode_emulations);

    rc = hvm_emulate_one(hvmemul_ctxt);

    if ( rc == X86EMUL_UNHANDLEABLE )
    {
        gdprintk(XENLOG_ERR, "Failed to emulate insn.\n");
        goto fail;
    }

    if ( rc == X86EMUL_EXCEPTION )
    {
        if ( !hvmemul_ctxt->exn_pending )
        {
            unsigned long intr_info;

            __vmread(VM_ENTRY_INTR_INFO, &intr_info);
            __vmwrite(VM_ENTRY_INTR_INFO, 0);
            if ( !(intr_info & INTR_INFO_VALID_MASK) )
            {
                gdprintk(XENLOG_ERR, "Exception pending but no info.\n");
                goto fail;
            }
            hvmemul_ctxt->trap.vector = (uint8_t)intr_info;
            hvmemul_ctxt->trap.insn_len = 0;
        }

        if ( unlikely(curr->domain->debugger_attached) &&
             ((hvmemul_ctxt->trap.vector == TRAP_debug) ||
              (hvmemul_ctxt->trap.vector == TRAP_int3)) )
        {
            domain_pause_for_debugger();
        }
        else if ( curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE )
        {
            gdprintk(XENLOG_ERR, "Exception %02x in protected mode.\n",
                     hvmemul_ctxt->trap.vector);
            goto fail;
        }
        else
        {
            realmode_deliver_exception(
                hvmemul_ctxt->trap.vector,
                hvmemul_ctxt->trap.insn_len,
                hvmemul_ctxt);
        }
    }

    return;

 fail:
    hvm_dump_emulation_state(XENLOG_G_ERR "Real-mode", hvmemul_ctxt);
    domain_crash(curr->domain);
}
unsigned long hap_gva_to_gfn(GUEST_PAGING_LEVELS)(
    struct vcpu *v, unsigned long gva, uint32_t *pfec)
{
    gdprintk(XENLOG_ERR,
             "Guest paging level is greater than host paging level!\n");
    domain_crash(v->domain);
    return INVALID_GFN;
}
unsigned long do_iret(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct iret_context iret_saved;
    struct vcpu *v = current;

    if ( unlikely(copy_from_user(&iret_saved, (void *)regs->rsp,
                                 sizeof(iret_saved))) )
    {
        gprintk(XENLOG_ERR,
                "Fault while reading IRET context from guest stack\n");
        goto exit_and_crash;
    }

    /* Returning to user mode? */
    if ( (iret_saved.cs & 3) == 3 )
    {
        if ( unlikely(pagetable_is_null(v->arch.guest_table_user)) )
        {
            gprintk(XENLOG_ERR,
                    "Guest switching to user mode with no user page tables\n");
            goto exit_and_crash;
        }
        toggle_guest_mode(v);
    }

    if ( VM_ASSIST(v->domain, architectural_iopl) )
        v->arch.pv_vcpu.iopl = iret_saved.rflags & X86_EFLAGS_IOPL;

    regs->rip = iret_saved.rip;
    regs->cs = iret_saved.cs | 3; /* force guest privilege */
    regs->rflags = ((iret_saved.rflags & ~(X86_EFLAGS_IOPL|X86_EFLAGS_VM))
                    | X86_EFLAGS_IF);
    regs->rsp = iret_saved.rsp;
    regs->ss = iret_saved.ss | 3; /* force guest privilege */

    if ( !(iret_saved.flags & VGCF_in_syscall) )
    {
        regs->entry_vector &= ~TRAP_syscall;
        regs->r11 = iret_saved.r11;
        regs->rcx = iret_saved.rcx;
    }

    /* Restore upcall mask from supplied EFLAGS.IF. */
    vcpu_info(v, evtchn_upcall_mask) = !(iret_saved.rflags & X86_EFLAGS_IF);

    async_exception_cleanup(v);

    /* Saved %rax gets written back to regs->rax in entry.S. */
    return iret_saved.rax;

 exit_and_crash:
    domain_crash(v->domain);
    return 0;
}
struct hvm_io_handler *hvm_next_io_handler(struct domain *d)
{
    unsigned int i = d->arch.hvm_domain.io_handler_count++;

    if ( i == NR_IO_HANDLERS )
    {
        domain_crash(d);
        return NULL;
    }

    return &d->arch.hvm_domain.io_handler[i];
}
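/*
 * A minimal sketch of how a caller might consume hvm_next_io_handler().
 * It mirrors the shape of a registration helper such as Xen's
 * register_mmio_handler(), but the function name below is hypothetical and
 * the field assignments are assumptions inferred from the handlers shown in
 * this listing, not a verbatim copy of the upstream code. Note that a NULL
 * return means the handler table was full and the domain has already been
 * crashed, so the caller just bails out.
 */
static void example_register_mmio_handler(struct domain *d,
                                          const struct hvm_mmio_ops *ops)
{
    struct hvm_io_handler *handler = hvm_next_io_handler(d);

    if ( !handler )
        return; /* table full: domain_crash() already invoked */

    handler->type = IOREQ_TYPE_COPY;  /* MMIO accesses are "copy" requests */
    handler->mmio.ops = ops;          /* per-device check/read/write hooks */
}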
static void initialize_apic_assist(struct vcpu *v)
{
    struct domain *d = v->domain;
    unsigned long gmfn = v->arch.hvm_vcpu.viridian.apic_assist.msr.fields.pfn;
    struct page_info *page = get_page_from_gfn(d, gmfn, NULL, P2M_ALLOC);
    void *va;

    /*
     * See section 13.3.4.1 of the specification for details of this
     * enlightenment.
     */

    if ( !page )
        goto fail;

    if ( !get_page_type(page, PGT_writable_page) )
    {
        put_page(page);
        goto fail;
    }

    va = __map_domain_page_global(page);
    if ( !va )
    {
        put_page_and_type(page);
        goto fail;
    }

    *(uint32_t *)va = 0;

    if ( viridian_feature_mask(v->domain) & HVMPV_apic_assist )
    {
        /*
         * If we overwrite an existing address here then something has
         * gone wrong and a domain page will leak. Instead crash the
         * domain to make the problem obvious.
         */
        if ( v->arch.hvm_vcpu.viridian.apic_assist.va )
            domain_crash(d);

        v->arch.hvm_vcpu.viridian.apic_assist.va = va;
        return;
    }

    unmap_domain_page_global(va);
    put_page_and_type(page);
    return;

 fail:
    gdprintk(XENLOG_WARNING, "Bad GMFN %#"PRI_gfn" (MFN %#"PRI_mfn")\n", gmfn,
             page ? page_to_mfn(page) : mfn_x(INVALID_MFN));
}
static bool_t hvm_mmio_accept(const struct hvm_io_handler *handler,
                              const ioreq_t *p)
{
    paddr_t first = hvm_mmio_first_byte(p);
    paddr_t last = hvm_mmio_last_byte(p);

    BUG_ON(handler->type != IOREQ_TYPE_COPY);

    if ( !handler->mmio.ops->check(current, first) )
        return 0;

    /* Make sure the handler will accept the whole access */
    if ( p->size > 1 &&
         !handler->mmio.ops->check(current, last) )
        domain_crash(current->domain);

    return 1;
}
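/*
 * For reference, a sketch of what hvm_mmio_first_byte()/hvm_mmio_last_byte()
 * are assumed to compute for the accept check above: the lowest and highest
 * byte address touched by a (possibly descending, p->df set) rep access.
 * This mirrors the helpers in Xen's asm-x86/hvm/io.h, but treat the exact
 * definitions here as assumptions rather than the authoritative ones.
 */
static inline paddr_t hvm_mmio_first_byte(const ioreq_t *p)
{
    return unlikely(p->df) ?
           p->addr - (p->count - 1ul) * p->size : /* descending access */
           p->addr;
}

static inline paddr_t hvm_mmio_last_byte(const ioreq_t *p)
{
    unsigned long count = p->count;

    return unlikely(p->df) ?
           p->addr + p->size - 1 :                /* descending access */
           p->addr + (count * p->size) - 1;
}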
mfn_t __get_gfn_type_access(struct p2m_domain *p2m, unsigned long gfn,
                            p2m_type_t *t, p2m_access_t *a, p2m_query_t q,
                            unsigned int *page_order, bool_t locked)
{
    mfn_t mfn;

    /* Unshare makes no sense without populate. */
    if ( q & P2M_UNSHARE )
        q |= P2M_ALLOC;

    if ( !p2m || !paging_mode_translate(p2m->domain) )
    {
        /* Not necessarily true, but for non-translated guests, we claim
         * it's the most generic kind of memory */
        *t = p2m_ram_rw;
        return _mfn(gfn);
    }

    if ( locked )
        /* Grab the lock here, don't release until put_gfn */
        gfn_lock(p2m, gfn, 0);

    mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order);

    if ( (q & P2M_UNSHARE) && p2m_is_shared(*t) )
    {
        ASSERT(!p2m_is_nestedp2m(p2m));
        /* Try to unshare. If we fail, communicate ENOMEM without
         * sleeping. */
        if ( mem_sharing_unshare_page(p2m->domain, gfn, 0) < 0 )
            (void)mem_sharing_notify_enomem(p2m->domain, gfn, 0);
        mfn = p2m->get_entry(p2m, gfn, t, a, q, page_order);
    }

    if (unlikely((p2m_is_broken(*t))))
    {
        /* Return invalid_mfn to avoid caller's access */
        mfn = _mfn(INVALID_MFN);
        if ( q & P2M_ALLOC )
            domain_crash(p2m->domain);
    }

    return mfn;
}
void viridian_start_apic_assist(struct vcpu *v, int vector)
{
    uint32_t *va = v->arch.hvm_vcpu.viridian.apic_assist.va;

    if ( !va )
        return;

    if ( vector < 0x10 )
        return;

    /*
     * If there is already an assist pending then something has gone
     * wrong and the VM will most likely hang so force a crash now
     * to make the problem clear.
     */
    if ( v->arch.hvm_vcpu.viridian.apic_assist.vector )
        domain_crash(v->domain);

    v->arch.hvm_vcpu.viridian.apic_assist.vector = vector;
    *va |= 1u;
}
void mc_memerr_dhandler(struct mca_binfo *binfo,
                        enum mce_result *result,
                        struct cpu_user_regs *regs)
{
    struct mcinfo_bank *bank = binfo->mib;
    struct mcinfo_global *global = binfo->mig;
    struct domain *d;
    unsigned long mfn, gfn;
    uint32_t status;
    int vmce_vcpuid;

    if (!mc_check_addr(bank->mc_status, bank->mc_misc, MC_ADDR_PHYSICAL)) {
        dprintk(XENLOG_WARNING,
                "No physical address provided for memory error\n");
        return;
    }

    mfn = bank->mc_addr >> PAGE_SHIFT;
    if (offline_page(mfn, 1, &status)) {
        dprintk(XENLOG_WARNING,
                "Failed to offline page %lx for MCE error\n", mfn);
        return;
    }

    mci_action_add_pageoffline(binfo->bank, binfo->mi, mfn, status);

    /* This is a free page */
    if (status & PG_OFFLINE_OFFLINED)
        *result = MCER_RECOVERED;
    else if (status & PG_OFFLINE_AGAIN)
        *result = MCER_CONTINUE;
    else if (status & PG_OFFLINE_PENDING) {
        /* This page has an owner */
        if (status & PG_OFFLINE_OWNED) {
            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
            mce_printk(MCE_QUIET, "MCE: This error page is owned"
                       " by DOM %d\n", bank->mc_domid);
            /* XXX: Cannot handle shared pages yet
             * (this should identify all domains and gfn mapping to
             * the mfn in question) */
            BUG_ON( bank->mc_domid == DOMID_COW );
            if ( bank->mc_domid != DOMID_XEN ) {
                d = get_domain_by_id(bank->mc_domid);
                ASSERT(d);
                gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);

                if ( !is_vmce_ready(bank, d) ) {
                    printk("DOM%d not ready for vMCE\n", d->domain_id);
                    goto vmce_failed;
                }

                if ( unmmap_broken_page(d, _mfn(mfn), gfn) ) {
                    printk("Unmap broken memory %lx for DOM%d failed\n",
                           mfn, d->domain_id);
                    goto vmce_failed;
                }

                bank->mc_addr = gfn << PAGE_SHIFT |
                                (bank->mc_addr & (PAGE_SIZE - 1));
                if ( fill_vmsr_data(bank, d, global->mc_gstatus) == -1 ) {
                    mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
                               "failed\n", bank->mc_domid);
                    goto vmce_failed;
                }

                if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
                    vmce_vcpuid = VMCE_INJECT_BROADCAST;
                else
                    vmce_vcpuid = global->mc_vcpuid;

                /* We will inject vMCE to DOMU */
                if ( inject_vmce(d, vmce_vcpuid) < 0 ) {
                    mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
                               " failed\n", d->domain_id);
                    goto vmce_failed;
                }

                /* The impacted domain goes on with its own recovery job
                 * if it has its own MCA handler.
                 * For Xen, it has contained the error and finished
                 * its own recovery job.
                 */
                *result = MCER_RECOVERED;
                put_domain(d);

                return;
vmce_failed:
                put_domain(d);
                domain_crash(d);
            }
        }
    }
}
/* Shared #MC handler. */
void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
                        struct mca_banks *bankmask)
{
    int xen_state_lost, dom0_state_lost, domU_state_lost;
    struct vcpu *v = current;
    struct domain *curdom = v->domain;
    domid_t domid = curdom->domain_id;
    int ctx_xen, ctx_dom0, ctx_domU;
    uint32_t dom_state = DOM_NORMAL;
    mctelem_cookie_t mctc = NULL;
    struct mca_summary bs;
    struct mc_info *mci = NULL;
    int irqlocked = 0;
    uint64_t gstatus;
    int ripv;

    /* This handler runs as an interrupt gate, so IPIs from the
     * polling service routine are deferred until we're finished.
     */

    /* Disable interrupts for the _vcpu_. It may not be re-scheduled to
     * another physical CPU. */
    vcpu_schedule_lock_irq(v);
    irqlocked = 1;

    /* Read global status; if it does not indicate machine check
     * in progress then bail as long as we have a valid ip to return to. */
    gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
    ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
    if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
        add_taint(TAINT_MACHINE_CHECK); /* questionable */
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;
        goto cmn_handler_done;
    }

    /* Go and grab error telemetry. We must choose whether to commit
     * for logging or dismiss the cookie that is returned, and must not
     * reference the cookie after that action.
     */
    mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs, NULL);
    if (mctc != NULL)
        mci = (struct mc_info *)mctelem_dataptr(mctc);

    /* Clear MCIP or another #MC will enter shutdown state */
    gstatus &= ~MCG_STATUS_MCIP;
    mca_wrmsr(MSR_IA32_MCG_STATUS, gstatus);
    wmb();

    /* If no valid errors and our stack is intact, we're done */
    if (ripv && bs.errcnt == 0) {
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;
        goto cmn_handler_done;
    }

    if (bs.uc || bs.pcc)
        add_taint(TAINT_MACHINE_CHECK);

    /* Machine check exceptions will usually be for UC and/or PCC errors,
     * but it is possible to configure machine check for some classes
     * of corrected error.
     *
     * UC errors could compromise any domain or the hypervisor
     * itself - for example a cache writeback of modified data that
     * turned out to be bad could be for data belonging to anyone, not
     * just the current domain. In the absence of known data poisoning
     * to prevent consumption of such bad data in the system we regard
     * all UC errors as terminal. It may be possible to attempt some
     * heuristics based on the address affected, which guests have
     * mappings to that mfn etc.
     *
     * PCC errors apply to the current context.
     *
     * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
     * and not PCC is terminal - the return instruction pointer
     * pushed onto the stack is bogus. If the interrupt context is
     * the hypervisor or dom0 the game is over, otherwise we can
     * limit the impact to a single domU but only if we trampoline
     * somewhere safely - we can't return and unwind the stack.
     * Since there is no trampoline in place we will treat !RIPV
     * as terminal for any context.
     */
    ctx_xen = SEG_PL(regs->cs) == 0;
    ctx_dom0 = !ctx_xen && (domid == 0);
    ctx_domU = !ctx_xen && !ctx_dom0;

    xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
        !ripv;
    dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
    domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));

    if (xen_state_lost) {
        /* Now we are going to panic anyway. Allow interrupts, so that
         * printk on serial console can work. */
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;

        printk("Terminal machine check exception occurred in "
               "hypervisor context.\n");

        /* If MCG_STATUS_EIPV indicates that the IP on the stack is related
         * to the error then it makes sense to print a stack trace.
         * That can be useful for more detailed error analysis and/or
         * error case studies to figure out whether we can clear
         * xen_impacted and kill a DomU instead
         * (i.e. if a guest-only control structure is affected, but then
         * we must ensure the bad pages are not re-used again).
         */
        if (bs.eipv & MCG_STATUS_EIPV) {
            printk("MCE: Instruction Pointer is related to the "
                   "error, therefore print the execution state.\n");
            show_execution_state(regs);
        }

        /* Commit the telemetry so that panic flow can find it. */
        if (mctc != NULL) {
            x86_mcinfo_dump(mci);
            mctelem_commit(mctc);
        }
        mc_panic("Hypervisor state lost due to machine check "
                 "exception.\n");
        /*NOTREACHED*/
    }

    /*
     * Xen hypervisor state is intact. If dom0 state is lost then
     * give it a chance to decide what to do if it has registered
     * a handler for this event, otherwise panic.
     *
     * XXFM Could add some Solaris dom0 contract kill here?
     */
    if (dom0_state_lost) {
        if (dom0 && dom0->max_vcpus && dom0->vcpu[0] &&
            guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
            dom_state = DOM0_TRAP;
            send_guest_trap(dom0, 0, TRAP_machine_check);
            /* XXFM case of return with !ripv ??? */
        } else {
            /* Commit telemetry for panic flow. */
            if (mctc != NULL) {
                x86_mcinfo_dump(mci);
                mctelem_commit(mctc);
            }
            mc_panic("Dom0 state lost due to machine check "
                     "exception\n");
            /*NOTREACHED*/
        }
    }

    /*
     * If a domU has lost state then send it a trap if it has registered
     * a handler, otherwise crash the domain.
     * XXFM Revisit this functionality.
     */
    if (domU_state_lost) {
        if (guest_has_trap_callback(v->domain, v->vcpu_id,
                                    TRAP_machine_check)) {
            dom_state = DOMU_TRAP;
            send_guest_trap(curdom, v->vcpu_id, TRAP_machine_check);
        } else {
            dom_state = DOMU_KILLED;
            /* Enable interrupts. This basically results in
             * calling sti on the *physical* cpu. But after
             * domain_crash() the vcpu pointer is invalid.
             * Therefore, we must unlock the irqs before killing
             * it. */
            vcpu_schedule_unlock_irq(v);
            irqlocked = 0;

            /* DomU is impacted. Kill it and continue. */
            domain_crash(curdom);
        }
    }

    switch (dom_state) {
    case DOM0_TRAP:
    case DOMU_TRAP:
        /* Enable interrupts. */
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;

        /* guest softirqs and event callbacks are scheduled
         * immediately after this handler exits. */
        break;
    case DOMU_KILLED:
        /* Nothing to do here. */
        break;

    case DOM_NORMAL:
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;
        break;
    }

cmn_handler_done:
    BUG_ON(irqlocked);
    BUG_ON(!ripv);

    if (bs.errcnt) {
        /* Not panicking, so forward telemetry to dom0 now if it
         * is interested. */
        if (dom0_vmce_enabled()) {
            if (mctc != NULL)
                mctelem_commit(mctc);
            send_guest_global_virq(dom0, VIRQ_MCA);
        } else {
            x86_mcinfo_dump(mci);
            if (mctc != NULL)
                mctelem_dismiss(mctc);
        }
    } else if (mctc != NULL) {
        mctelem_dismiss(mctc);
    }
}
/* Machine Check Handler for AMD K8 family series */
void k8_machine_check(struct cpu_user_regs *regs, long error_code)
{
    struct vcpu *vcpu = current;
    struct domain *curdom;
    struct mc_info *mc_data;
    struct mcinfo_global mc_global;
    struct mcinfo_bank mc_info;
    uint64_t status, addrv, miscv, uc;
    uint32_t i;
    unsigned int cpu_nr;
    uint32_t xen_impacted = 0;
#define DOM_NORMAL  0
#define DOM0_TRAP   1
#define DOMU_TRAP   2
#define DOMU_KILLED 4
    uint32_t dom_state = DOM_NORMAL;

    /* This handler runs as an interrupt gate, so IPIs from the
     * polling service routine are deferred until we're finished.
     */

    /* Disable interrupts for the _vcpu_. It may not be re-scheduled to
     * another physical CPU, or the impacted process in the guest
     * continues running with corrupted data otherwise. */
    vcpu_schedule_lock_irq(vcpu);

    mc_data = x86_mcinfo_getptr();
    cpu_nr = smp_processor_id();
    curdom = vcpu->domain;

    memset(&mc_global, 0, sizeof(mc_global));
    mc_global.common.type = MC_TYPE_GLOBAL;
    mc_global.common.size = sizeof(mc_global);

    mc_global.mc_domid = curdom->domain_id; /* impacted domain */
    mc_global.mc_coreid = vcpu->processor;  /* impacted physical cpu */
    BUG_ON(cpu_nr != vcpu->processor);
    mc_global.mc_core_threadid = 0;
    mc_global.mc_vcpuid = vcpu->vcpu_id;    /* impacted vcpu */
#if 0
    /* TODO: on which socket is this physical core?
       It's not clear to me how to figure this out. */
    mc_global.mc_socketid = ???;
#endif
    mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
    rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);

    /* Quick check, who is impacted */
    xen_impacted = is_idle_domain(curdom);

    /* Dom0 */
    x86_mcinfo_clear(mc_data);
    x86_mcinfo_add(mc_data, &mc_global);

    for (i = 0; i < nr_mce_banks; i++) {
        struct domain *d;

        rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);

        if (!(status & MCi_STATUS_VAL))
            continue;

        /* An error happened in this bank.
         * This is expected to be an uncorrectable error,
         * since correctable errors get polled.
         */
        uc = status & MCi_STATUS_UC;

        memset(&mc_info, 0, sizeof(mc_info));
        mc_info.common.type = MC_TYPE_BANK;
        mc_info.common.size = sizeof(mc_info);
        mc_info.mc_bank = i;
        mc_info.mc_status = status;

        addrv = 0;
        if (status & MCi_STATUS_ADDRV) {
            rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);

            d = maddr_get_owner(addrv);
            if (d != NULL)
                mc_info.mc_domid = d->domain_id;
        }

        miscv = 0;
        if (status & MCi_STATUS_MISCV)
            rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);

        mc_info.mc_addr = addrv;
        mc_info.mc_misc = miscv;

        x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */

        if (mc_callback_bank_extended)
            mc_callback_bank_extended(mc_data, i, status);

        /* clear status */
        wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
        wmb();
        add_taint(TAINT_MACHINE_CHECK);
    }

    status = mc_global.mc_gstatus;

    /* clear MCIP or cpu enters shutdown state
     * in case another MCE occurs. */
    status &= ~MCG_STATUS_MCIP;
    wrmsrl(MSR_IA32_MCG_STATUS, status);
    wmb();

    /* For the details see the discussion "MCE/MCA concept" on xen-devel.
     * The thread started here:
     * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
     */

    /* MCG_STATUS_RIPV:
     * When this bit is not set, then the instruction pointer onto the stack
     * to resume at is not valid. If xen is interrupted, then we panic anyway
     * right below. Otherwise it is up to the guest to figure out if
     * guest kernel or guest userland is affected and should kill either
     * itself or the affected process.
     */

    /* MCG_STATUS_EIPV:
     * Evaluation of EIPV is the job of the guest.
     */

    if (xen_impacted) {
        /* Now we are going to panic anyway. Allow interrupts, so that
         * printk on serial console can work. */
        vcpu_schedule_unlock_irq(vcpu);

        /* Uh, that means a machine check exception
         * occurred inside Xen. */
        printk("Machine check exception occurred in Xen.\n");

        /* If MCG_STATUS_EIPV indicates that the IP on the stack is related
         * to the error then it makes sense to print a stack trace.
         * That can be useful for more detailed error analysis and/or
         * error case studies to figure out whether we can clear
         * xen_impacted and kill a DomU instead
         * (i.e. if a guest-only control structure is affected, but then
         * we must ensure the bad pages are not re-used again).
         */
        if (status & MCG_STATUS_EIPV) {
            printk("MCE: Instruction Pointer is related to the error. "
                   "Therefore, print the execution state.\n");
            show_execution_state(regs);
        }
        x86_mcinfo_dump(mc_data);
        mc_panic("End of MCE. Use mcelog to decode above error codes.\n");
    }

    /* If Dom0 registered a machine check handler, which is only possible
     * with a PV MCA driver, then ... */
    if ( guest_has_trap_callback(dom0, 0, TRAP_machine_check) ) {
        dom_state = DOM0_TRAP;

        /* ... deliver machine check trap to Dom0. */
        send_guest_trap(dom0, 0, TRAP_machine_check);

        /* Xen may tell Dom0 now to notify the DomU.
         * But this will happen through a hypercall. */
    } else
    /* Dom0 did not register a machine check handler, but if DomU
     * did so, then... */
    if ( guest_has_trap_callback(curdom, vcpu->vcpu_id,
                                 TRAP_machine_check) ) {
        dom_state = DOMU_TRAP;

        /* ... deliver machine check trap to DomU */
        send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
    } else {
        /* Hmm... no one feels responsible for handling the error.
         * So, do a quick check if a DomU is impacted or not. */
        if (curdom == dom0) {
            /* Dom0 is impacted. Since no one can handle
             * this error, panic! */
            x86_mcinfo_dump(mc_data);
            mc_panic("MCE occurred in Dom0, which it can't handle\n");

            /* UNREACHED */
        } else {
            dom_state = DOMU_KILLED;

            /* Enable interrupts. This basically results in
             * calling sti on the *physical* cpu. But after
             * domain_crash() the vcpu pointer is invalid.
             * Therefore, we must unlock the irqs before killing
             * it. */
            vcpu_schedule_unlock_irq(vcpu);

            /* DomU is impacted. Kill it and continue. */
            domain_crash(curdom);
        }
    }

    switch (dom_state) {
    case DOM0_TRAP:
    case DOMU_TRAP:
        /* Enable interrupts. */
        vcpu_schedule_unlock_irq(vcpu);

        /* guest softirqs and event callbacks are scheduled
         * immediately after this handler exits. */
        break;
    case DOMU_KILLED:
        /* Nothing to do here. */
        break;
    default:
        BUG();
    }
}
int hvm_process_io_intercept(const struct hvm_io_handler *handler,
                             ioreq_t *p)
{
    const struct hvm_io_ops *ops = handler->ops;
    int rc = X86EMUL_OKAY, i, step = p->df ? -p->size : p->size;
    uint64_t data;
    uint64_t addr;

    if ( p->dir == IOREQ_READ )
    {
        for ( i = 0; i < p->count; i++ )
        {
            addr = (p->type == IOREQ_TYPE_COPY) ?
                   p->addr + step * i :
                   p->addr;
            rc = ops->read(handler, addr, p->size, &data);
            if ( rc != X86EMUL_OKAY )
                break;

            if ( p->data_is_ptr )
            {
                switch ( hvm_copy_to_guest_phys(p->data + step * i,
                                                &data, p->size) )
                {
                case HVMCOPY_okay:
                    break;
                case HVMCOPY_bad_gfn_to_mfn:
                    /* Drop the write as real hardware would. */
                    continue;
                case HVMCOPY_bad_gva_to_gfn:
                case HVMCOPY_gfn_paged_out:
                case HVMCOPY_gfn_shared:
                    ASSERT_UNREACHABLE();
                    /* fall through */
                default:
                    domain_crash(current->domain);
                    return X86EMUL_UNHANDLEABLE;
                }
            }
            else
                p->data = data;
        }
    }
    else /* p->dir == IOREQ_WRITE */
    {
        for ( i = 0; i < p->count; i++ )
        {
            if ( p->data_is_ptr )
            {
                switch ( hvm_copy_from_guest_phys(&data, p->data + step * i,
                                                  p->size) )
                {
                case HVMCOPY_okay:
                    break;
                case HVMCOPY_bad_gfn_to_mfn:
                    data = ~0;
                    break;
                case HVMCOPY_bad_gva_to_gfn:
                case HVMCOPY_gfn_paged_out:
                case HVMCOPY_gfn_shared:
                    ASSERT_UNREACHABLE();
                    /* fall through */
                default:
                    domain_crash(current->domain);
                    return X86EMUL_UNHANDLEABLE;
                }
            }
            else
                data = p->data;

            addr = (p->type == IOREQ_TYPE_COPY) ?
                   p->addr + step * i :
                   p->addr;
            rc = ops->write(handler, addr, p->size, data);
            if ( rc != X86EMUL_OKAY )
                break;
        }
    }

    if ( i )
    {
        p->count = i;
        rc = X86EMUL_OKAY;
    }
    else if ( rc == X86EMUL_UNHANDLEABLE )
    {
        /*
         * Don't forward entire batches to the device model: This would
         * prevent the internal handlers from seeing subsequent iterations
         * of the request.
         */
        p->count = 1;
    }

    return rc;
}
bool_t p2m_mem_access_check(paddr_t gpa, vaddr_t gla, const struct npfec npfec)
{
    int rc;
    bool_t violation;
    xenmem_access_t xma;
    vm_event_request_t *req;
    struct vcpu *v = current;
    struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);

    /* Mem_access is not in use. */
    if ( !p2m->mem_access_enabled )
        return true;

    rc = p2m_get_mem_access(v->domain, _gfn(paddr_to_pfn(gpa)), &xma);
    if ( rc )
        return true;

    /* Now check for mem_access violation. */
    switch ( xma )
    {
    case XENMEM_access_rwx:
        violation = false;
        break;
    case XENMEM_access_rw:
        violation = npfec.insn_fetch;
        break;
    case XENMEM_access_wx:
        violation = npfec.read_access;
        break;
    case XENMEM_access_rx:
    case XENMEM_access_rx2rw:
        violation = npfec.write_access;
        break;
    case XENMEM_access_x:
        violation = npfec.read_access || npfec.write_access;
        break;
    case XENMEM_access_w:
        violation = npfec.read_access || npfec.insn_fetch;
        break;
    case XENMEM_access_r:
        violation = npfec.write_access || npfec.insn_fetch;
        break;
    default:
    case XENMEM_access_n:
    case XENMEM_access_n2rwx:
        violation = true;
        break;
    }

    if ( !violation )
        return true;

    /* First, handle rx2rw and n2rwx conversion automatically. */
    if ( npfec.write_access && xma == XENMEM_access_rx2rw )
    {
        rc = p2m_set_mem_access(v->domain, _gfn(paddr_to_pfn(gpa)), 1,
                                0, ~0, XENMEM_access_rw, 0);
        return false;
    }
    else if ( xma == XENMEM_access_n2rwx )
    {
        rc = p2m_set_mem_access(v->domain, _gfn(paddr_to_pfn(gpa)), 1,
                                0, ~0, XENMEM_access_rwx, 0);
    }

    /* Otherwise, check if there is a vm_event monitor subscriber */
    if ( !vm_event_check_ring(&v->domain->vm_event->monitor) )
    {
        /* No listener */
        if ( p2m->access_required )
        {
            gdprintk(XENLOG_INFO, "Memory access permissions failure, "
                     "no vm_event listener VCPU %d, dom %d\n",
                     v->vcpu_id, v->domain->domain_id);
            domain_crash(v->domain);
        }
        else
        {
            /* n2rwx was already handled */
            if ( xma != XENMEM_access_n2rwx )
            {
                /* A listener is not required, so clear the access
                 * restrictions. */
                rc = p2m_set_mem_access(v->domain, _gfn(paddr_to_pfn(gpa)),
                                        1, 0, ~0, XENMEM_access_rwx, 0);
            }
        }

        /* No need to reinject */
        return false;
    }

    req = xzalloc(vm_event_request_t);
    if ( req )
    {
        req->reason = VM_EVENT_REASON_MEM_ACCESS;

        /* Send request to mem access subscriber */
        req->u.mem_access.gfn = gpa >> PAGE_SHIFT;
        req->u.mem_access.offset = gpa & ((1 << PAGE_SHIFT) - 1);
        if ( npfec.gla_valid )
        {
            req->u.mem_access.flags |= MEM_ACCESS_GLA_VALID;
            req->u.mem_access.gla = gla;

            if ( npfec.kind == npfec_kind_with_gla )
                req->u.mem_access.flags |= MEM_ACCESS_FAULT_WITH_GLA;
            else if ( npfec.kind == npfec_kind_in_gpt )
                req->u.mem_access.flags |= MEM_ACCESS_FAULT_IN_GPT;
        }
        req->u.mem_access.flags |= npfec.read_access ? MEM_ACCESS_R : 0;
        req->u.mem_access.flags |= npfec.write_access ? MEM_ACCESS_W : 0;
        req->u.mem_access.flags |= npfec.insn_fetch ? MEM_ACCESS_X : 0;

        if ( monitor_traps(v, (xma != XENMEM_access_n2rwx), req) < 0 )
            domain_crash(v->domain);

        xfree(req);
    }

    return false;
}
int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
                            unsigned long mfn, unsigned int page_order,
                            p2m_type_t t)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
    unsigned long i, ogfn;
    p2m_type_t ot;
    p2m_access_t a;
    mfn_t omfn;
    int pod_count = 0;
    int rc = 0;

    if ( !paging_mode_translate(d) )
    {
        if ( need_iommu(d) && t == p2m_ram_rw )
        {
            for ( i = 0; i < (1 << page_order); i++ )
            {
                rc = iommu_map_page(
                    d, mfn + i, mfn + i, IOMMUF_readable|IOMMUF_writable);
                if ( rc != 0 )
                {
                    while ( i-- > 0 )
                        iommu_unmap_page(d, mfn + i);
                    return rc;
                }
            }
        }
        return 0;
    }

    p2m_lock(p2m);

    P2M_DEBUG("adding gfn=%#lx mfn=%#lx\n", gfn, mfn);

    /* First, remove m->p mappings for existing p->m mappings */
    for ( i = 0; i < (1UL << page_order); i++ )
    {
        omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL);
        if ( p2m_is_shared(ot) )
        {
            /* Do an unshare to cleanly take care of all corner
             * cases. */
            int rc;
            rc = mem_sharing_unshare_page(p2m->domain, gfn + i, 0);
            if ( rc )
            {
                p2m_unlock(p2m);
                /* NOTE: Should a guest domain bring this upon itself,
                 * there is not a whole lot we can do. We are buried
                 * deep in locks from most code paths by now. So, fail
                 * the call and don't try to sleep on a wait queue
                 * while placing the mem event.
                 *
                 * However, all current (changeset 3432abcf9380) code
                 * paths avoid this unsavoury situation. For now.
                 *
                 * Foreign domains are okay to place an event as they
                 * won't go to sleep. */
                (void)mem_sharing_notify_enomem(p2m->domain, gfn + i, 0);
                return rc;
            }
            omfn = p2m->get_entry(p2m, gfn + i, &ot, &a, 0, NULL);
            ASSERT(!p2m_is_shared(ot));
        }
        if ( p2m_is_grant(ot) )
        {
            /* Really shouldn't be unmapping grant maps this way */
            domain_crash(d);
            p2m_unlock(p2m);
            return -EINVAL;
        }
        else if ( p2m_is_ram(ot) && !p2m_is_paged(ot) )
        {
            ASSERT(mfn_valid(omfn));
            set_gpfn_from_mfn(mfn_x(omfn), INVALID_M2P_ENTRY);
        }
        else if ( ot == p2m_populate_on_demand )
        {
            /* Count how many PoD entries we'll be replacing if successful */
            pod_count++;
        }
        else if ( p2m_is_paging(ot) && (ot != p2m_ram_paging_out) )
        {
            /* We're plugging a hole in the physmap where a paged out
             * page was */
            atomic_dec(&d->paged_pages);
        }
    }

    /* Then, look for m->p mappings for this range and deal with them */
    for ( i = 0; i < (1UL << page_order); i++ )
    {
        if ( page_get_owner(mfn_to_page(_mfn(mfn + i))) == dom_cow )
        {
            /* This is no way to add a shared page to your physmap! */
            gdprintk(XENLOG_ERR, "Adding shared mfn %lx directly to dom %hu "
                     "physmap not allowed.\n", mfn+i, d->domain_id);
            p2m_unlock(p2m);
            return -EINVAL;
        }
        if ( page_get_owner(mfn_to_page(_mfn(mfn + i))) != d )
            continue;
        ogfn = mfn_to_gfn(d, _mfn(mfn+i));
        if ( (ogfn != INVALID_M2P_ENTRY) && (ogfn != gfn + i) )
        {
            /* This machine frame is already mapped at another physical
             * address */
            P2M_DEBUG("aliased! mfn=%#lx, old gfn=%#lx, new gfn=%#lx\n",
                      mfn + i, ogfn, gfn + i);
            omfn = p2m->get_entry(p2m, ogfn, &ot, &a, 0, NULL);
            if ( p2m_is_ram(ot) && !p2m_is_paged(ot) )
            {
                ASSERT(mfn_valid(omfn));
                P2M_DEBUG("old gfn=%#lx -> mfn %#lx\n",
                          ogfn , mfn_x(omfn));
                if ( mfn_x(omfn) == (mfn + i) )
                    p2m_remove_page(p2m, ogfn, mfn + i, 0);
            }
        }
    }

    /* Now, actually do the two-way mapping */
    if ( mfn_valid(_mfn(mfn)) )
    {
        if ( !set_p2m_entry(p2m, gfn, _mfn(mfn), page_order, t,
                            p2m->default_access) )
        {
            rc = -EINVAL;
            goto out; /* Failed to update p2m, bail without updating m2p. */
        }
        if ( !p2m_is_grant(t) )
        {
            for ( i = 0; i < (1UL << page_order); i++ )
                set_gpfn_from_mfn(mfn+i, gfn+i);
        }
    }
    else
    {
        gdprintk(XENLOG_WARNING, "Adding bad mfn to p2m map (%#lx -> %#lx)\n",
                 gfn, mfn);
        if ( !set_p2m_entry(p2m, gfn, _mfn(INVALID_MFN), page_order,
                            p2m_invalid, p2m->default_access) )
            rc = -EINVAL;
        else
        {
            pod_lock(p2m);
            p2m->pod.entry_count -= pod_count;
            BUG_ON(p2m->pod.entry_count < 0);
            pod_unlock(p2m);
        }
    }

out:
    p2m_unlock(p2m);

    return rc;
}
static void realmode_emulate_one(struct hvm_emulate_ctxt *hvmemul_ctxt)
{
    struct vcpu *curr = current;
    uint32_t intr_info;
    int rc;

    perfc_incr(realmode_emulations);

    rc = hvm_emulate_one(hvmemul_ctxt);

    if ( rc == X86EMUL_UNHANDLEABLE )
    {
        gdprintk(XENLOG_ERR, "Failed to emulate insn.\n");
        goto fail;
    }

    if ( rc == X86EMUL_EXCEPTION )
    {
        if ( !hvmemul_ctxt->exn_pending )
        {
            intr_info = __vmread(VM_ENTRY_INTR_INFO);
            __vmwrite(VM_ENTRY_INTR_INFO, 0);
            if ( !(intr_info & INTR_INFO_VALID_MASK) )
            {
                gdprintk(XENLOG_ERR, "Exception pending but no info.\n");
                goto fail;
            }
            hvmemul_ctxt->exn_vector = (uint8_t)intr_info;
            hvmemul_ctxt->exn_insn_len = 0;
        }

        if ( unlikely(curr->domain->debugger_attached) &&
             ((hvmemul_ctxt->exn_vector == TRAP_debug) ||
              (hvmemul_ctxt->exn_vector == TRAP_int3)) )
        {
            domain_pause_for_debugger();
        }
        else if ( curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE )
        {
            gdprintk(XENLOG_ERR, "Exception %02x in protected mode.\n",
                     hvmemul_ctxt->exn_vector);
            goto fail;
        }
        else
        {
            realmode_deliver_exception(
                hvmemul_ctxt->exn_vector,
                hvmemul_ctxt->exn_insn_len,
                hvmemul_ctxt);
        }
    }

    return;

 fail:
    gdprintk(XENLOG_ERR,
             "Real-mode emulation failed @ %04x:%08lx: "
             "%02x %02x %02x %02x %02x %02x\n",
             hvmemul_get_seg_reg(x86_seg_cs, hvmemul_ctxt)->sel,
             hvmemul_ctxt->insn_buf_eip,
             hvmemul_ctxt->insn_buf[0], hvmemul_ctxt->insn_buf[1],
             hvmemul_ctxt->insn_buf[2], hvmemul_ctxt->insn_buf[3],
             hvmemul_ctxt->insn_buf[4], hvmemul_ctxt->insn_buf[5]);
    domain_crash(curr->domain);
}
unsigned int compat_iret(void)
{
    struct cpu_user_regs *regs = guest_cpu_user_regs();
    struct vcpu *v = current;
    u32 eflags;

    /* Trim stack pointer to 32 bits. */
    regs->rsp = (u32)regs->rsp;

    /* Restore EAX (clobbered by hypercall). */
    if ( unlikely(__get_user(regs->eax, (u32 *)regs->rsp)) )
    {
        domain_crash(v->domain);
        return 0;
    }

    /* Restore CS and EIP. */
    if ( unlikely(__get_user(regs->eip, (u32 *)regs->rsp + 1)) ||
         unlikely(__get_user(regs->cs, (u32 *)regs->rsp + 2)) )
    {
        domain_crash(v->domain);
        return 0;
    }

    /*
     * Fix up and restore EFLAGS. We fix up in a local staging area
     * to avoid firing the BUG_ON(IOPL) check in arch_get_info_guest.
     */
    if ( unlikely(__get_user(eflags, (u32 *)regs->rsp + 3)) )
    {
        domain_crash(v->domain);
        return 0;
    }

    if ( VM_ASSIST(v->domain, architectural_iopl) )
        v->arch.pv_vcpu.iopl = eflags & X86_EFLAGS_IOPL;

    regs->eflags = (eflags & ~X86_EFLAGS_IOPL) | X86_EFLAGS_IF;

    if ( unlikely(eflags & X86_EFLAGS_VM) )
    {
        /*
         * Cannot return to VM86 mode: inject a GP fault instead. Note that
         * the GP fault is reported on the first VM86 mode instruction, not on
         * the IRET (which is why we can simply leave the stack frame as-is
         * (except for perhaps having to copy it), which in turn seems better
         * than teaching create_bounce_frame() to needlessly deal with vm86
         * mode frames).
         */
        const struct trap_info *ti;
        u32 x, ksp = v->arch.pv_vcpu.kernel_sp - 40;
        unsigned int i;
        int rc = 0;

        gdprintk(XENLOG_ERR, "VM86 mode unavailable (ksp:%08X->%08X)\n",
                 regs->esp, ksp);
        if ( ksp < regs->esp )
        {
            for ( i = 1; i < 10; ++i )
            {
                rc |= __get_user(x, (u32 *)regs->rsp + i);
                rc |= __put_user(x, (u32 *)(unsigned long)ksp + i);
            }
        }
        else if ( ksp > regs->esp )
        {
            for ( i = 9; i > 0; --i )
            {
                rc |= __get_user(x, (u32 *)regs->rsp + i);
                rc |= __put_user(x, (u32 *)(unsigned long)ksp + i);
            }
        }
        if ( rc )
        {
            domain_crash(v->domain);
            return 0;
        }
        regs->esp = ksp;
        regs->ss = v->arch.pv_vcpu.kernel_ss;

        ti = &v->arch.pv_vcpu.trap_ctxt[TRAP_gp_fault];
        if ( TI_GET_IF(ti) )
            eflags &= ~X86_EFLAGS_IF;
        regs->eflags &= ~(X86_EFLAGS_VM|X86_EFLAGS_RF|
                          X86_EFLAGS_NT|X86_EFLAGS_TF);
        if ( unlikely(__put_user(0, (u32 *)regs->rsp)) )
        {
            domain_crash(v->domain);
            return 0;
        }
        regs->eip = ti->address;
        regs->cs = ti->cs;
    }
    else if ( unlikely(ring_0(regs)) )
    {
        domain_crash(v->domain);
        return 0;
    }
    else if ( ring_1(regs) )
        regs->esp += 16;
    /* Return to ring 2/3: restore ESP and SS. */
    else if ( __get_user(regs->ss, (u32 *)regs->rsp + 5) ||
              __get_user(regs->esp, (u32 *)regs->rsp + 4) )
    {
        domain_crash(v->domain);
        return 0;
    }

    /* Restore upcall mask from supplied EFLAGS.IF. */
    vcpu_info(v, evtchn_upcall_mask) = !(eflags & X86_EFLAGS_IF);

    async_exception_cleanup(v);

    /*
     * The hypercall exit path will overwrite EAX with this return
     * value.
     */
    return regs->eax;
}
bool_t p2m_mem_access_check(paddr_t gpa, unsigned long gla,
                            struct npfec npfec,
                            vm_event_request_t **req_ptr)
{
    struct vcpu *v = current;
    unsigned long gfn = gpa >> PAGE_SHIFT;
    struct domain *d = v->domain;
    struct p2m_domain *p2m = NULL;
    mfn_t mfn;
    p2m_type_t p2mt;
    p2m_access_t p2ma;
    vm_event_request_t *req;
    int rc;

    if ( altp2m_active(d) )
        p2m = p2m_get_altp2m(v);
    if ( !p2m )
        p2m = p2m_get_hostp2m(d);

    /* First, handle rx2rw conversion automatically.
     * These calls to p2m->set_entry() must succeed: we have the gfn
     * locked and just did a successful get_entry(). */
    gfn_lock(p2m, gfn, 0);
    mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);

    if ( npfec.write_access && p2ma == p2m_access_rx2rw )
    {
        rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2mt,
                            p2m_access_rw, -1);
        ASSERT(rc == 0);
        gfn_unlock(p2m, gfn, 0);
        return 1;
    }
    else if ( p2ma == p2m_access_n2rwx )
    {
        ASSERT(npfec.write_access || npfec.read_access || npfec.insn_fetch);
        rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K,
                            p2mt, p2m_access_rwx, -1);
        ASSERT(rc == 0);
    }
    gfn_unlock(p2m, gfn, 0);

    /* Otherwise, check if there is a memory event listener, and send the
     * message along */
    if ( !vm_event_check_ring(&d->vm_event->monitor) || !req_ptr )
    {
        /* No listener */
        if ( p2m->access_required )
        {
            gdprintk(XENLOG_INFO, "Memory access permissions failure, "
                     "no vm_event listener VCPU %d, dom %d\n",
                     v->vcpu_id, d->domain_id);
            domain_crash(v->domain);
            return 0;
        }
        else
        {
            gfn_lock(p2m, gfn, 0);
            mfn = p2m->get_entry(p2m, gfn, &p2mt, &p2ma, 0, NULL, NULL);
            if ( p2ma != p2m_access_n2rwx )
            {
                /* A listener is not required, so clear the access
                 * restrictions. This set must succeed: we have the
                 * gfn locked and just did a successful get_entry(). */
                rc = p2m->set_entry(p2m, gfn, mfn, PAGE_ORDER_4K, p2mt,
                                    p2m_access_rwx, -1);
                ASSERT(rc == 0);
            }
            gfn_unlock(p2m, gfn, 0);
            return 1;
        }
    }

    *req_ptr = NULL;
    req = xzalloc(vm_event_request_t);
    if ( req )
    {
        *req_ptr = req;

        req->reason = VM_EVENT_REASON_MEM_ACCESS;
        req->u.mem_access.gfn = gfn;
        req->u.mem_access.offset = gpa & ((1 << PAGE_SHIFT) - 1);
        if ( npfec.gla_valid )
        {
            req->u.mem_access.flags |= MEM_ACCESS_GLA_VALID;
            req->u.mem_access.gla = gla;

            if ( npfec.kind == npfec_kind_with_gla )
                req->u.mem_access.flags |= MEM_ACCESS_FAULT_WITH_GLA;
            else if ( npfec.kind == npfec_kind_in_gpt )
                req->u.mem_access.flags |= MEM_ACCESS_FAULT_IN_GPT;
        }
        req->u.mem_access.flags |= npfec.read_access ? MEM_ACCESS_R : 0;
        req->u.mem_access.flags |= npfec.write_access ? MEM_ACCESS_W : 0;
        req->u.mem_access.flags |= npfec.insn_fetch ? MEM_ACCESS_X : 0;
    }

    /* Return whether vCPU pause is required (aka. sync event) */
    return (p2ma != p2m_access_n2rwx);
}