static void mcabank_clear(int banknum)
{
    uint64_t status;

    status = mca_rdmsr(MSR_IA32_MCx_STATUS(banknum));

    if (status & MCi_STATUS_ADDRV)
        mca_wrmsr(MSR_IA32_MCx_ADDR(banknum), 0x0ULL);
    if (status & MCi_STATUS_MISCV)
        mca_wrmsr(MSR_IA32_MCx_MISC(banknum), 0x0ULL);
    mca_wrmsr(MSR_IA32_MCx_STATUS(banknum), 0x0ULL);
}
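/*
 * Illustrative sketch (not part of the original file): how the per-bank MSR
 * accessor macros used in mcabank_clear() are conventionally laid out, i.e.
 * four architectural MSRs per bank starting at MSR_IA32_MC0_CTL (0x400).
 * The real definitions live in the MSR index header; the EXAMPLE_* names
 * below are placeholders for demonstration only.
 */
#ifndef EXAMPLE_MCX_MSR_LAYOUT
#define EXAMPLE_MCX_MSR_LAYOUT
#define EXAMPLE_MCx_CTL(x)    (0x400 + 4 * (x)) /* cf. MSR_IA32_MCx_CTL(x)    */
#define EXAMPLE_MCx_STATUS(x) (0x401 + 4 * (x)) /* cf. MSR_IA32_MCx_STATUS(x) */
#define EXAMPLE_MCx_ADDR(x)   (0x402 + 4 * (x)) /* cf. MSR_IA32_MCx_ADDR(x)   */
#define EXAMPLE_MCx_MISC(x)   (0x403 + 4 * (x)) /* cf. MSR_IA32_MCx_MISC(x)   */
#endif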
void mcheck_mca_clearbanks(struct mca_banks *bankmask)
{
    int i;
    uint64_t status;

    for (i = 0; i < 32 && i < nr_mce_banks; i++) {
        if (!mcabanks_test(i, bankmask))
            continue;
        status = mca_rdmsr(MSR_IA32_MCx_STATUS(i));
        if (!(status & MCi_STATUS_VAL))
            continue;
        mca_wrmsr(MSR_IA32_MCx_STATUS(i), 0x0ULL);
    }
}
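/*
 * Illustrative sketch (not part of the original file): how a caller might
 * build a bank mask covering every implemented bank and hand it to
 * mcheck_mca_clearbanks().  mcabanks_alloc()/mcabanks_free() are assumed to
 * be the mca_banks allocation helpers available alongside mcabanks_set();
 * adjust the names to the local API if they differ.
 */
static void example_clear_all_banks(void)
{
    struct mca_banks *mask = mcabanks_alloc(); /* assumed allocator */
    int i;

    if (mask == NULL)
        return;

    /* Select every implemented bank. */
    for (i = 0; i < nr_mce_banks; i++)
        mcabanks_set(i, mask);

    /* Clear MCi_STATUS for every selected bank that reports VAL=1. */
    mcheck_mca_clearbanks(mask);

    mcabanks_free(mask); /* assumed to exist */
}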
/* Shared #MC handler. */
void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
                        struct mca_banks *bankmask, struct mca_banks *clear_bank)
{
    uint64_t gstatus;
    mctelem_cookie_t mctc = NULL;
    struct mca_summary bs;

    mce_spin_lock(&mce_logout_lock);

    if (clear_bank != NULL) {
        memset(clear_bank->bank_map, 0x0,
               sizeof(long) * BITS_TO_LONGS(clear_bank->num));
    }
    mctc = mcheck_mca_logout(MCA_MCE_SCAN, bankmask, &bs, clear_bank);

    if (bs.errcnt) {
        /*
         * Uncorrected errors must be dealt with in softirq context.
         */
        if (bs.uc || bs.pcc) {
            add_taint(TAINT_MACHINE_CHECK);
            if (mctc != NULL)
                mctelem_defer(mctc);
            /*
             * For a PCC=1 error that cannot be recovered, context is lost,
             * so reboot now without clearing the banks, and deal with the
             * telemetry after reboot (the MSRs are sticky).
             */
            if (bs.pcc || !bs.recoverable)
                cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
        } else {
            if (mctc != NULL)
                mctelem_commit(mctc);
        }
        atomic_set(&found_error, 1);

        /* The last CPU will take care of the check/clean-up etc. */
        atomic_set(&severity_cpu, smp_processor_id());

        if (clear_bank != NULL) {
            mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
                       *clear_bank->bank_map, smp_processor_id());
            mcheck_mca_clearbanks(clear_bank);
        }
    } else {
        if (mctc != NULL)
            mctelem_dismiss(mctc);
    }
    mce_spin_unlock(&mce_logout_lock);

    mce_barrier_enter(&mce_trap_bar);
    if (mctc != NULL && mce_urgent_action(regs, mctc))
        cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
    mce_barrier_exit(&mce_trap_bar);

    /*
     * Wait until everybody has processed the trap.
     */
    mce_barrier_enter(&mce_trap_bar);
    if (atomic_read(&severity_cpu) == smp_processor_id()) {
        /* According to the SDM, if no error bank was found on any CPU,
         * something unexpected has happened; we cannot do any recovery
         * other than resetting the system. */
        if (atomic_read(&found_error) == 0)
            mc_panic("MCE: No CPU found valid MCE, need reset\n");
        if (!cpumask_empty(&mce_fatal_cpus)) {
            char *ebufp, ebuf[96] = "MCE: Fatal error happened on CPUs ";
            ebufp = ebuf + strlen(ebuf);
            cpumask_scnprintf(ebufp, 95 - strlen(ebuf), &mce_fatal_cpus);
            mc_panic(ebuf);
        }
        atomic_set(&found_error, 0);
    }
    mce_barrier_exit(&mce_trap_bar);

    /* Clear flags after the fatal check above. */
    mce_barrier_enter(&mce_trap_bar);
    gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
    if ((gstatus & MCG_STATUS_MCIP) != 0) {
        mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step");
        mca_wrmsr(MSR_IA32_MCG_STATUS, gstatus & ~MCG_STATUS_MCIP);
    }
    mce_barrier_exit(&mce_trap_bar);

    raise_softirq(MACHINE_CHECK_SOFTIRQ);
}
/* Shared #MC handler. */
void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
                        struct mca_banks *bankmask)
{
    int xen_state_lost, dom0_state_lost, domU_state_lost;
    struct vcpu *v = current;
    struct domain *curdom = v->domain;
    domid_t domid = curdom->domain_id;
    int ctx_xen, ctx_dom0, ctx_domU;
    uint32_t dom_state = DOM_NORMAL;
    mctelem_cookie_t mctc = NULL;
    struct mca_summary bs;
    struct mc_info *mci = NULL;
    int irqlocked = 0;
    uint64_t gstatus;
    int ripv;

    /* This handler runs as an interrupt gate, so IPIs from the
     * polling service routine are deferred until we're finished. */

    /* Disable interrupts for the _vcpu_. It must not be re-scheduled to
     * another physical CPU. */
    vcpu_schedule_lock_irq(v);
    irqlocked = 1;

    /* Read global status; if it does not indicate machine check
     * in progress then bail as long as we have a valid ip to return to. */
    gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
    ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
    if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
        add_taint(TAINT_MACHINE_CHECK); /* questionable */
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;
        goto cmn_handler_done;
    }

    /* Go and grab error telemetry. We must choose whether to commit
     * for logging or dismiss the cookie that is returned, and must not
     * reference the cookie after that action. */
    mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs, NULL);
    if (mctc != NULL)
        mci = (struct mc_info *)mctelem_dataptr(mctc);

    /* Clear MCIP or another #MC will enter shutdown state */
    gstatus &= ~MCG_STATUS_MCIP;
    mca_wrmsr(MSR_IA32_MCG_STATUS, gstatus);
    wmb();

    /* If no valid errors and our stack is intact, we're done */
    if (ripv && bs.errcnt == 0) {
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;
        goto cmn_handler_done;
    }

    if (bs.uc || bs.pcc)
        add_taint(TAINT_MACHINE_CHECK);

    /* Machine check exceptions will usually be for UC and/or PCC errors,
     * but it is possible to configure machine check for some classes
     * of corrected error.
     *
     * UC errors could compromise any domain or the hypervisor
     * itself - for example a cache writeback of modified data that
     * turned out to be bad could be for data belonging to anyone, not
     * just the current domain. In the absence of known data poisoning
     * to prevent consumption of such bad data in the system we regard
     * all UC errors as terminal. It may be possible to attempt some
     * heuristics based on the address affected, which guests have
     * mappings to that mfn etc.
     *
     * PCC errors apply to the current context.
     *
     * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
     * and not PCC is terminal - the return instruction pointer
     * pushed onto the stack is bogus. If the interrupt context is
     * the hypervisor or dom0 the game is over, otherwise we can
     * limit the impact to a single domU but only if we trampoline
     * somewhere safely - we can't return and unwind the stack.
     * Since there is no trampoline in place we will treat !RIPV
     * as terminal for any context. */
    ctx_xen = SEG_PL(regs->cs) == 0;
    ctx_dom0 = !ctx_xen && (domid == 0);
    ctx_domU = !ctx_xen && !ctx_dom0;

    xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
        !ripv;
    dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
    domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));

    if (xen_state_lost) {
        /* Now we are going to panic anyway. Allow interrupts, so that
         * printk on the serial console can work. */
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;

        printk("Terminal machine check exception occurred in "
               "hypervisor context.\n");

        /* If MCG_STATUS_EIPV indicates that the IP on the stack is related
         * to the error, then it makes sense to print a stack trace.
         * That can be useful for more detailed error analysis and/or
         * error case studies to figure out whether we can clear
         * xen_impacted and kill a DomU instead
         * (i.e. if a guest-only control structure is affected, but then
         * we must ensure the bad pages are not re-used again). */
        if (bs.eipv) {
            printk("MCE: Instruction Pointer is related to the "
                   "error, therefore print the execution state.\n");
            show_execution_state(regs);
        }

        /* Commit the telemetry so that panic flow can find it. */
        if (mctc != NULL) {
            x86_mcinfo_dump(mci);
            mctelem_commit(mctc);
        }
        mc_panic("Hypervisor state lost due to machine check "
                 "exception.\n");
        /*NOTREACHED*/
    }

    /*
     * Xen hypervisor state is intact. If dom0 state is lost then
     * give it a chance to decide what to do if it has registered
     * a handler for this event, otherwise panic.
     *
     * XXFM Could add some Solaris dom0 contract kill here?
     */
    if (dom0_state_lost) {
        if (dom0 && dom0->max_vcpus && dom0->vcpu[0] &&
            guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
            dom_state = DOM0_TRAP;
            send_guest_trap(dom0, 0, TRAP_machine_check);
            /* XXFM case of return with !ripv ??? */
        } else {
            /* Commit telemetry for panic flow. */
            if (mctc != NULL) {
                x86_mcinfo_dump(mci);
                mctelem_commit(mctc);
            }
            mc_panic("Dom0 state lost due to machine check "
                     "exception\n");
            /*NOTREACHED*/
        }
    }

    /*
     * If a domU has lost state then send it a trap if it has registered
     * a handler, otherwise crash the domain.
     * XXFM Revisit this functionality.
     */
    if (domU_state_lost) {
        if (guest_has_trap_callback(v->domain, v->vcpu_id,
                                    TRAP_machine_check)) {
            dom_state = DOMU_TRAP;
            send_guest_trap(curdom, v->vcpu_id, TRAP_machine_check);
        } else {
            dom_state = DOMU_KILLED;
            /* Enable interrupts. This basically results in
             * calling sti on the *physical* cpu. But after
             * domain_crash() the vcpu pointer is invalid.
             * Therefore, we must unlock the irqs before killing
             * it. */
            vcpu_schedule_unlock_irq(v);
            irqlocked = 0;

            /* DomU is impacted. Kill it and continue. */
            domain_crash(curdom);
        }
    }

    switch (dom_state) {
    case DOM0_TRAP:
    case DOMU_TRAP:
        /* Enable interrupts. */
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;

        /* Guest softirqs and event callbacks are scheduled
         * immediately after this handler exits. */
        break;
    case DOMU_KILLED:
        /* Nothing to do here. */
        break;

    case DOM_NORMAL:
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;
        break;
    }

cmn_handler_done:
    BUG_ON(irqlocked);
    BUG_ON(!ripv);

    if (bs.errcnt) {
        /* Not panicking, so forward telemetry to dom0 now if it
         * is interested. */
        if (dom0_vmce_enabled()) {
            if (mctc != NULL)
                mctelem_commit(mctc);
            send_guest_global_virq(dom0, VIRQ_MCA);
        } else {
            x86_mcinfo_dump(mci);
            if (mctc != NULL)
                mctelem_dismiss(mctc);
        }
    } else if (mctc != NULL) {
        mctelem_dismiss(mctc);
    }
}
/* Add out_param clear_bank for the machine check handler caller.
 * For the latest Intel CPUs, whether to clear the error bank status needs
 * to be decided by the callback function defined above. */
mctelem_cookie_t mcheck_mca_logout(enum mca_source who, struct mca_banks *bankmask,
                                   struct mca_summary *sp, struct mca_banks *clear_bank)
{
    uint64_t gstatus, status;
    struct mcinfo_global *mig = NULL; /* on stack */
    mctelem_cookie_t mctc = NULL;
    uint32_t uc = 0, pcc = 0, recover, need_clear = 1, mc_flags = 0;
    struct mc_info *mci = NULL;
    mctelem_class_t which = MC_URGENT; /* XXXgcc */
    int errcnt = 0;
    int i;

    gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
    switch (who) {
    case MCA_MCE_HANDLER:
    case MCA_MCE_SCAN:
        mc_flags = MC_FLAG_MCE;
        which = MC_URGENT;
        break;
    case MCA_POLLER:
    case MCA_RESET:
        mc_flags = MC_FLAG_POLLED;
        which = MC_NONURGENT;
        break;
    case MCA_CMCI_HANDLER:
        mc_flags = MC_FLAG_CMCI;
        which = MC_NONURGENT;
        break;
    default:
        BUG();
    }

    /* If no mc_recoverable_scan callback handler is registered,
     * this error is not recoverable. */
    recover = (mc_recoverable_scan) ? 1 : 0;

    for (i = 0; i < 32 && i < nr_mce_banks; i++) {
        struct mcinfo_bank *mib; /* on stack */

        /* Skip bank if corresponding bit in bankmask is clear */
        if (!mcabanks_test(i, bankmask))
            continue;

        status = mca_rdmsr(MSR_IA32_MCx_STATUS(i));
        if (!(status & MCi_STATUS_VAL))
            continue; /* this bank has no valid telemetry */

        /* For the latest Intel CPUs, the CMCI/MCE handler caller decides
         * whether to clear the bank from MCi_STATUS bit values such as
         * OVER/UC/EN/PCC/S/AR. */
        if (mc_need_clearbank_scan)
            need_clear = mc_need_clearbank_scan(who, status);

        /* If this is the first bank with valid MCA DATA, then
         * try to reserve an entry from the urgent/nonurgent queue
         * depending on whether we are called from an exception or
         * a poller; this can fail (for example dom0 may not
         * yet have consumed past telemetry). */
        if (errcnt++ == 0) {
            if ((mctc = mctelem_reserve(which)) != NULL) {
                mci = mctelem_dataptr(mctc);
                mcinfo_clear(mci);
                mig = (struct mcinfo_global *)x86_mcinfo_reserve
                    (mci, sizeof(struct mcinfo_global));
                /* mc_info should at least hold the global information */
                ASSERT(mig);
                mca_init_global(mc_flags, mig);
                /* A hook here to get global extended msrs */
                {
                    struct mcinfo_extended *intel_get_extended_msrs(
                        struct mcinfo_global *mig, struct mc_info *mi);

                    if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
                        intel_get_extended_msrs(mig, mci);
                }
            }
        }

        /* form a mask of which banks have logged uncorrected errors */
        if ((status & MCi_STATUS_UC) != 0)
            uc |= (1 << i);

        /* likewise for those with processor context corrupt */
        if ((status & MCi_STATUS_PCC) != 0)
            pcc |= (1 << i);

        if (recover && uc)
            /* uc = 1, recover = 1, we need not panic. */
            recover = mc_recoverable_scan(status);

        mib = mca_init_bank(who, mci, i);

        if (mc_callback_bank_extended)
            mc_callback_bank_extended(mci, i, status);

        /* By default, need_clear = 1 */
        if (who != MCA_MCE_SCAN && need_clear)
            /* Clear status */
            mca_wrmsr(MSR_IA32_MCx_STATUS(i), 0x0ULL);
        else if (who == MCA_MCE_SCAN && need_clear)
            mcabanks_set(i, clear_bank);

        wmb();
    }

    if (mig && errcnt > 0) {
        if (pcc)
            mig->mc_flags |= MC_FLAG_UNCORRECTABLE;
        else if (uc)
            mig->mc_flags |= MC_FLAG_RECOVERABLE;
        else
            mig->mc_flags |= MC_FLAG_CORRECTABLE;
    }

    if (sp) {
        sp->errcnt = errcnt;
        sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
        sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
        sp->uc = uc;
        sp->pcc = pcc;
        sp->recoverable = recover;
    }

    return mci != NULL ? mctc : NULL; /* may be NULL */
}
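/*
 * Illustrative sketch only: a simplified mc_need_clearbank_scan-style
 * callback showing how the MCi_STATUS bits mentioned above (OVER/UC/EN/
 * PCC/S/AR) could drive the "should the caller clear this bank?" decision.
 * The vendor callback actually installed by Xen is more involved; the
 * policy below and the assumption that MCi_STATUS_S is defined alongside
 * the other MCi_STATUS_* macros are for demonstration purposes only.
 */
static int example_need_clearbank_scan(enum mca_source who, uint64_t status)
{
    int uc = !!(status & MCi_STATUS_UC);
    int pcc = !!(status & MCi_STATUS_PCC);
    int s = !!(status & MCi_STATUS_S); /* assumed MCi_STATUS_* macro */

    if (who == MCA_CMCI_HANDLER || who == MCA_POLLER) {
        /* Corrected errors (UC=0) can be cleared right away. */
        if (!uc)
            return 1;
        /* UCNA-like errors (UC=1, PCC=0, S=0) were only logged, not
         * signalled, so the poller/CMCI path may clear them as well. */
        if (!pcc && !s)
            return 1;
        /* Anything signalled is left for the #MC handler to deal with. */
        return 0;
    }

    /* #MC scan: keep fatal (PCC=1) banks sticky so the telemetry survives
     * a reboot; clear the rest once it has been logged. */
    return !pcc;
}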