/*
 * For Intel MCE, broadcast the vMCE to all vcpus;
 * for AMD MCE, inject the vMCE to vcpu0 only.
 *
 * @d:    domain to which the vMCE is injected
 * @vcpu: -1 (VMCE_INJECT_BROADCAST) to broadcast the vMCE to all vcpus,
 *        >= 0 for the vcpu the vMCE is injected to
 */
int inject_vmce(struct domain *d, int vcpu)
{
    struct vcpu *v;
    int ret = -ESRCH;

    for_each_vcpu ( d, v )
    {
        if ( vcpu != VMCE_INJECT_BROADCAST && vcpu != v->vcpu_id )
            continue;

        if ( (has_hvm_container_domain(d) ||
              guest_has_trap_callback(d, v->vcpu_id, TRAP_machine_check)) &&
             !test_and_set_bool(v->mce_pending) )
        {
            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to d%d:v%d\n",
                       d->domain_id, v->vcpu_id);
            vcpu_kick(v);
            ret = 0;
        }
        else
        {
            mce_printk(MCE_QUIET, "Failed to inject vMCE to d%d:v%d\n",
                       d->domain_id, v->vcpu_id);
            ret = -EBUSY;
            break;
        }

        if ( vcpu != VMCE_INJECT_BROADCAST )
            break;
    }

    return ret;
}
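/*
 * Illustrative usage sketch, not part of the original source: how a
 * caller might pick inject_vmce()'s vcpu argument based on the vendor
 * policy described above (broadcast on Intel, vcpu0 on AMD).  The
 * function name example_inject_policy() is hypothetical; it mirrors
 * the real caller mc_memerr_dhandler() further down in this section.
 */
static void example_inject_policy(struct domain *d)
{
    /* Intel #MC is broadcast to all vcpus; AMD targets a single vcpu. */
    int target = (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
                 ? VMCE_INJECT_BROADCAST
                 : 0;    /* vcpu0 */

    if ( inject_vmce(d, target) < 0 )
        mce_printk(MCE_QUIET, "vMCE injection to d%d refused\n",
                   d->domain_id);
}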
/* * < 0: Unsupported and will #GP fault to guest * = 0: Not handled, should be handled by other components * > 0: Success */ int vmce_wrmsr(uint32_t msr, uint64_t val) { struct vcpu *cur = current; int ret = 1; spin_lock(&cur->arch.vmce.lock); switch ( msr ) { case MSR_IA32_MCG_CTL: /* If MCG_CTL exists then stick to all 1's, else ignore. */ break; case MSR_IA32_MCG_STATUS: cur->arch.vmce.mcg_status = val; mce_printk(MCE_VERBOSE, "MCE: wr MCG_STATUS %"PRIx64"\n", val); break; case MSR_IA32_MCG_CAP: /* * According to Intel SDM, IA32_MCG_CAP is a read-only register, * the effect of writing to the IA32_MCG_CAP is undefined. Here we * treat writing as 'write not change'. Guest would not surprise. */ mce_printk(MCE_VERBOSE, "MCE: MCG_CAP is r/o\n"); break; default: ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0; break; } spin_unlock(&cur->arch.vmce.lock); return ret; }
/* * < 0: Unsupported and will #GP fault to guest * = 0: Not handled, should be handled by other components * > 0: Success */ int vmce_rdmsr(uint32_t msr, uint64_t *val) { struct vcpu *cur = current; int ret = 1; *val = 0; spin_lock(&cur->arch.vmce.lock); switch ( msr ) { case MSR_IA32_MCG_STATUS: *val = cur->arch.vmce.mcg_status; if (*val) mce_printk(MCE_VERBOSE, "MCE: rd MCG_STATUS %#"PRIx64"\n", *val); break; case MSR_IA32_MCG_CAP: *val = cur->arch.vmce.mcg_cap; mce_printk(MCE_VERBOSE, "MCE: rd MCG_CAP %#"PRIx64"\n", *val); break; case MSR_IA32_MCG_CTL: if ( cur->arch.vmce.mcg_cap & MCG_CTL_P ) *val = ~0ULL; mce_printk(MCE_VERBOSE, "MCE: rd MCG_CTL %#"PRIx64"\n", *val); break; default: ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0; break; } spin_unlock(&cur->arch.vmce.lock); return ret; }
/*
 * For historical reasons, the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to the
 * new one.
 */
static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

    *val = 0;

    switch ( msr & (MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /* stick all 1's to MCi_CTL */
        *val = ~0UL;
        mce_printk(MCE_VERBOSE, "MCE: rd MC%u_CTL %#"PRIx64"\n",
                   bank, *val);
        break;

    case MSR_IA32_MC0_STATUS:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_status;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: rd MC%u_STATUS %#"PRIx64"\n",
                           bank, *val);
        }
        break;

    case MSR_IA32_MC0_ADDR:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_addr;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: rd MC%u_ADDR %#"PRIx64"\n",
                           bank, *val);
        }
        break;

    case MSR_IA32_MC0_MISC:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_misc;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: rd MC%u_MISC %#"PRIx64"\n",
                           bank, *val);
        }
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
        case X86_VENDOR_INTEL:
            ret = vmce_intel_rdmsr(v, msr, val);
            break;

        case X86_VENDOR_AMD:
            ret = vmce_amd_rdmsr(v, msr, val);
            break;

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}
/*
 * For historical reasons, the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to the
 * new one.
 */
static int bank_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

    switch ( msr & (MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /*
         * If the guest clears any bit of MCi_CTL, treat the bank as not
         * implemented and ignore the write (no change).
         */
        break;

    case MSR_IA32_MC0_STATUS:
        mce_printk(MCE_VERBOSE, "MCE: wr MC%u_STATUS %#"PRIx64"\n",
                   bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_status = val;
        break;

    case MSR_IA32_MC0_ADDR:
        mce_printk(MCE_VERBOSE, "MCE: wr MC%u_ADDR %#"PRIx64"\n", bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_addr = val;
        break;

    case MSR_IA32_MC0_MISC:
        mce_printk(MCE_VERBOSE, "MCE: wr MC%u_MISC %#"PRIx64"\n", bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_misc = val;
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
        case X86_VENDOR_INTEL:
            ret = vmce_intel_wrmsr(v, msr, val);
            break;

        case X86_VENDOR_AMD:
            ret = vmce_amd_wrmsr(v, msr, val);
            break;

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}
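/*
 * Worked example of the decoding used by bank_mce_rdmsr() and
 * bank_mce_wrmsr() above, under the architectural MSR layout (Intel
 * SDM): MSR_IA32_MC0_CTL is 0x400 and each bank owns four consecutive
 * MSRs -- CTL, STATUS, ADDR, MISC.  For msr = 0x409 (MC2_STATUS):
 *
 *   bank = (0x409 - 0x400) / 4     = 2
 *   msr & (MSR_IA32_MC0_CTL | 3)   = 0x409 & 0x403 = 0x401
 *                                  = MSR_IA32_MC0_STATUS
 *
 * i.e. the switch always matches on the bank-0 alias of the register,
 * while 'bank' indexes the per-bank storage.
 */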
int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
                   uint64_t gstatus)
{
    struct vcpu *v = d->vcpu[0];

    if ( mc_bank->mc_domid != (uint16_t)~0 )
    {
        if ( v->arch.vmce.mcg_status & MCG_STATUS_MCIP )
        {
            mce_printk(MCE_QUIET, "MCE: guest has not handled previous"
                       " vMCE yet!\n");
            return -1;
        }

        spin_lock(&v->arch.vmce.lock);

        v->arch.vmce.mcg_status = gstatus;
        /*
         * 1. Skip bank 0 to avoid the 'bank 0 quirk' of old processors.
         * 2. Filter the MCi_STATUS MSCOD model-specific error code
         *    presented to the guest.
         */
        v->arch.vmce.bank[1].mci_status = mc_bank->mc_status &
                                              MCi_STATUS_MSCOD_MASK;
        v->arch.vmce.bank[1].mci_addr = mc_bank->mc_addr;
        v->arch.vmce.bank[1].mci_misc = mc_bank->mc_misc;

        spin_unlock(&v->arch.vmce.lock);
    }

    return 0;
}
void mc_memerr_dhandler(struct mca_binfo *binfo,
                        enum mce_result *result,
                        struct cpu_user_regs *regs)
{
    struct mcinfo_bank *bank = binfo->mib;
    struct mcinfo_global *global = binfo->mig;
    struct domain *d;
    unsigned long mfn, gfn;
    uint32_t status;
    int vmce_vcpuid;

    if ( !mc_check_addr(bank->mc_status, bank->mc_misc, MC_ADDR_PHYSICAL) )
    {
        dprintk(XENLOG_WARNING,
                "No physical address provided for memory error\n");
        return;
    }

    mfn = bank->mc_addr >> PAGE_SHIFT;
    if ( offline_page(mfn, 1, &status) )
    {
        dprintk(XENLOG_WARNING,
                "Failed to offline page %lx for MCE error\n", mfn);
        return;
    }

    mci_action_add_pageoffline(binfo->bank, binfo->mi, mfn, status);

    /* This is a free page */
    if ( status & PG_OFFLINE_OFFLINED )
        *result = MCER_RECOVERED;
    else if ( status & PG_OFFLINE_AGAIN )
        *result = MCER_CONTINUE;
    else if ( status & PG_OFFLINE_PENDING )
    {
        /* This page has an owner */
        if ( status & PG_OFFLINE_OWNED )
        {
            bank->mc_domid = status >> PG_OFFLINE_OWNER_SHIFT;
            mce_printk(MCE_QUIET, "MCE: This error page is owned"
                       " by DOM %d\n", bank->mc_domid);
            /*
             * XXX: Cannot handle shared pages yet
             * (this should identify all domains and gfn mapping to
             * the mfn in question)
             */
            BUG_ON( bank->mc_domid == DOMID_COW );
            if ( bank->mc_domid != DOMID_XEN )
            {
                d = get_domain_by_id(bank->mc_domid);
                ASSERT(d);
                gfn = get_gpfn_from_mfn((bank->mc_addr) >> PAGE_SHIFT);

                if ( !is_vmce_ready(bank, d) )
                {
                    printk("DOM%d not ready for vMCE\n", d->domain_id);
                    goto vmce_failed;
                }

                if ( unmmap_broken_page(d, _mfn(mfn), gfn) )
                {
                    printk("Unmap broken memory %lx for DOM%d failed\n",
                           mfn, d->domain_id);
                    goto vmce_failed;
                }

                bank->mc_addr = gfn << PAGE_SHIFT |
                                (bank->mc_addr & (PAGE_SIZE - 1));
                if ( fill_vmsr_data(bank, d, global->mc_gstatus) == -1 )
                {
                    mce_printk(MCE_QUIET, "Fill vMCE# data for DOM%d "
                               "failed\n", bank->mc_domid);
                    goto vmce_failed;
                }

                if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
                    vmce_vcpuid = VMCE_INJECT_BROADCAST;
                else
                    vmce_vcpuid = global->mc_vcpuid;

                /* We will inject vMCE to DOMU */
                if ( inject_vmce(d, vmce_vcpuid) < 0 )
                {
                    mce_printk(MCE_QUIET, "inject vMCE to DOM%d"
                               " failed\n", d->domain_id);
                    goto vmce_failed;
                }

                /*
                 * The impacted domain goes on with its own recovery job
                 * if it has its own MCA handler.  For Xen, the error has
                 * been contained and its own recovery job is finished.
                 */
                *result = MCER_RECOVERED;
                put_domain(d);

                return;
vmce_failed:
                put_domain(d);
                domain_crash(d);
            }
        }
    }
}
/* Shared #MC handler. */
void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
                        struct mca_banks *bankmask,
                        struct mca_banks *clear_bank)
{
    uint64_t gstatus;
    mctelem_cookie_t mctc = NULL;
    struct mca_summary bs;

    mce_spin_lock(&mce_logout_lock);

    if ( clear_bank != NULL )
        memset(clear_bank->bank_map, 0x0,
               sizeof(long) * BITS_TO_LONGS(clear_bank->num));
    mctc = mcheck_mca_logout(MCA_MCE_SCAN, bankmask, &bs, clear_bank);

    if ( bs.errcnt )
    {
        /*
         * Uncorrected errors must be dealt with in softirq context.
         */
        if ( bs.uc || bs.pcc )
        {
            add_taint(TAINT_MACHINE_CHECK);
            if ( mctc != NULL )
                mctelem_defer(mctc);
            /*
             * For PCC=1 errors that cannot be recovered, context is
             * lost, so reboot now without clearing the banks, and deal
             * with the telemetry after reboot (the MSRs are sticky).
             */
            if ( bs.pcc || !bs.recoverable )
                cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
        }
        else if ( mctc != NULL )
            mctelem_commit(mctc);
        atomic_set(&found_error, 1);

        /* The last CPU will take care of the check/clean-up etc. */
        atomic_set(&severity_cpu, smp_processor_id());

        if ( clear_bank != NULL )
        {
            mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
                       *((unsigned long *)clear_bank), smp_processor_id());
            mcheck_mca_clearbanks(clear_bank);
        }
    }
    else if ( mctc != NULL )
        mctelem_dismiss(mctc);
    mce_spin_unlock(&mce_logout_lock);

    mce_barrier_enter(&mce_trap_bar);
    if ( mctc != NULL && mce_urgent_action(regs, mctc) )
        cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
    mce_barrier_exit(&mce_trap_bar);

    /*
     * Wait until everybody has processed the trap.
     */
    mce_barrier_enter(&mce_trap_bar);
    if ( atomic_read(&severity_cpu) == smp_processor_id() )
    {
        /*
         * According to the SDM, if no error bank was found on any CPU,
         * something unexpected happened; we cannot do any recovery job
         * other than resetting the system.
         */
        if ( atomic_read(&found_error) == 0 )
            mc_panic("MCE: No CPU found valid MCE, need reset\n");
        if ( !cpumask_empty(&mce_fatal_cpus) )
        {
            char *ebufp, ebuf[96] = "MCE: Fatal error happened on CPUs ";

            ebufp = ebuf + strlen(ebuf);
            cpumask_scnprintf(ebufp, 95 - strlen(ebuf), &mce_fatal_cpus);
            mc_panic(ebuf);
        }
        atomic_set(&found_error, 0);
    }
    mce_barrier_exit(&mce_trap_bar);

    /* Clear flags after the fatal check above */
    mce_barrier_enter(&mce_trap_bar);
    gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
    if ( (gstatus & MCG_STATUS_MCIP) != 0 )
    {
        mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step");
        mca_wrmsr(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
    }
    mce_barrier_exit(&mce_trap_bar);

    raise_softirq(MACHINE_CHECK_SOFTIRQ);
}
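/*
 * Standalone sketch (assumption, C11 atomics) of the rendezvous idiom
 * used by mcheck_cmn_handler() above: every CPU that sees an error
 * publishes it and bids for 'severity' ownership, then exactly one
 * elected CPU performs the global fatal check between the barriers.
 * barrier_enter()/barrier_exit()/panic() stand in for the Xen
 * primitives and are hypothetical names for this sketch.
 */
#include <stdatomic.h>

static atomic_int severity_owner;
static atomic_int error_found;

static void rendezvous(int cpu, int saw_error)
{
    if ( saw_error )
    {
        atomic_store(&error_found, 1);
        atomic_store(&severity_owner, cpu);  /* last writer wins */
    }

    barrier_enter();                         /* all CPUs meet here */
    if ( atomic_load(&severity_owner) == cpu )
    {
        /* only the elected CPU runs the global check */
        if ( !atomic_load(&error_found) )
            panic("no CPU saw a valid error");
        atomic_store(&error_found, 0);       /* reset for the next #MC */
    }
    barrier_exit();
}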
/* AMD specific MCA MSRs */
int vmce_amd_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    /* Do nothing as we don't emulate this MC bank currently */
    mce_printk(MCE_VERBOSE, "MCE: wr msr %#x, val %#"PRIx64"\n", msr, val);
    return 1;
}