예제 #1
0
파일: non-fatal.c 프로젝트: amodj/Utopia
/*
 * Poll the machine check banks selected by this CPU's poll_bankmask and
 * decide what to do with any telemetry that was captured.
 *
 * info is not used by this function.
 *
 * Outcomes for a non-NULL telemetry cookie:
 *  - no errors were counted          -> cookie is dismissed;
 *  - dom0 has enabled VIRQ_MCA       -> cookie is committed and the virq
 *                                       is sent to dom0;
 *  - otherwise                       -> cookie is dismissed, and from the
 *                                       tenth such call onwards the
 *                                       telemetry is first dumped via
 *                                       x86_mcinfo_dump().
 */
static void mce_checkregs (void *info)
{
	/* Counts polls that found errors while dom0 was not listening.
	 * It may be bumped on several cpus at once and is indicative
	 * only: a simple-minded way to avoid spamming the console with
	 * corrected errors during early startup. */
	static uint64_t dumpcount = 0;
	struct mca_summary bs;
	mctelem_cookie_t mctc;

	mctc = mcheck_mca_logout(MCA_POLLER, __get_cpu_var(poll_bankmask), &bs);

	/* Without a cookie there is nothing to commit or dismiss. */
	if (mctc == NULL)
		return;

	/* A cookie was handed out but no errors were counted. */
	if (!bs.errcnt) {
		mctelem_dismiss(mctc);
		return;
	}

	/* adjust is defined outside this function; it is bumped once for
	 * every poll that found errors and obtained telemetry. */
	adjust++;

	/* Dom0 is listening for VIRQ_MCA: hand the telemetry over. */
	if (guest_enabled_event(dom0->vcpu[0], VIRQ_MCA)) {
		mctelem_commit(mctc);
		send_guest_global_virq(dom0, VIRQ_MCA);
		return;
	}

	/* Dom0 has had plenty of time to register the virq handler but
	 * still has not: dump the telemetry before dropping it. */
	if (++dumpcount >= 10)
		x86_mcinfo_dump((struct mc_info *)mctelem_dataptr(mctc));

	mctelem_dismiss(mctc);
}
예제 #2
0
파일: mce.c 프로젝트: dzan/xenOnArm
/*
 * Scan the machine check banks selected by bankmask, log telemetry for
 * every bank holding valid data and summarise the findings.
 *
 * who        - identifies the caller; selects the telemetry flags and
 *              the urgent/nonurgent queue.  Unknown values hit BUG().
 * bankmask   - banks whose bit is clear are skipped.
 * sp         - if non-NULL, receives the error count, the RIPV/EIPV bits
 *              of MCG_STATUS and the uc/pcc/recoverable verdicts.
 * clear_bank - out parameter for the MCA_MCE_SCAN caller: banks that
 *              need clearing are recorded here instead of being cleared
 *              directly.  For the latest Intel CPUs whether a bank is to
 *              be cleared is decided by the mc_need_clearbank_scan
 *              callback when one is registered.
 *
 * Returns the reserved telemetry cookie, or NULL when no bank held valid
 * data or no telemetry entry could be reserved.
 */
mctelem_cookie_t
mcheck_mca_logout(enum mca_source who, struct mca_banks *bankmask,
                  struct mca_summary *sp, struct mca_banks *clear_bank)
{
    mctelem_cookie_t cookie = NULL;
    struct mc_info *info = NULL;
    /* Obtained from x86_mcinfo_reserve() on the telemetry buffer. */
    struct mcinfo_global *global = NULL;
    mctelem_class_t queue = MC_URGENT; /* XXXgcc */
    uint32_t flags = 0;
    uint64_t global_status, bank_status;
    bool_t saw_uc = 0, saw_pcc = 0, recoverable, clear_it = 1;
    int nvalid = 0;
    int bank;

    global_status = mca_rdmsr(MSR_IA32_MCG_STATUS);

    /* Map the caller onto telemetry flags and the queue to reserve from. */
    if (who == MCA_MCE_SCAN) {
        flags = MC_FLAG_MCE;
        queue = MC_URGENT;
    } else if (who == MCA_POLLER || who == MCA_RESET) {
        flags = MC_FLAG_POLLED;
        queue = MC_NONURGENT;
    } else if (who == MCA_CMCI_HANDLER) {
        flags = MC_FLAG_CMCI;
        queue = MC_NONURGENT;
    } else {
        BUG();
    }

    /* With no mc_recoverable_scan callback registered, an error is
     * never considered recoverable. */
    recoverable = (mc_recoverable_scan != NULL);

    for (bank = 0; bank < nr_mce_banks; bank++) {
        /* Banks masked out by the caller are not even read. */
        if (!mcabanks_test(bank, bankmask))
            continue;

        bank_status = mca_rdmsr(MSR_IA32_MCx_STATUS(bank));

        /* Nothing valid logged in this bank. */
        if (!(bank_status & MCi_STATUS_VAL))
            continue;

        /* For the latest Intel CPUs the CMCI/MCE handler callers decide
         * per bank, from MCi_STATUS bits such as OVER/UC/EN/PCC/S/AR,
         * whether the bank is to be cleared.  Without the callback the
         * default (clear) stays in force. */
        if (mc_need_clearbank_scan)
            clear_it = mc_need_clearbank_scan(who, bank_status);

        /* First bank with valid MCA data: try to reserve a telemetry
         * entry from the queue chosen above.  This can fail (for
         * example dom0 may not yet have consumed past telemetry), in
         * which case cookie and info stay NULL for the whole scan. */
        if (nvalid == 0) {
            cookie = mctelem_reserve(queue);
            if (cookie != NULL) {
                info = mctelem_dataptr(cookie);
                mcinfo_clear(info);
                global = x86_mcinfo_reserve(info, sizeof(*global));
                /* mc_info must at least hold the global information. */
                ASSERT(global);
                mca_init_global(flags, global);
                /* Hook to collect the global extended MSRs. */
                if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
                    intel_get_extended_msrs(global, info);
            }
        }
        nvalid++;

        /* Sticky flags: once any bank reports UC / PCC they stay set. */
        if (bank_status & MCi_STATUS_UC)
            saw_uc = 1;
        if (bank_status & MCi_STATUS_PCC)
            saw_pcc = 1;

        /* Still considered recoverable but an uncorrected error has been
         * seen: let the callback judge this bank's status. */
        if (recoverable && saw_uc)
            recoverable = mc_recoverable_scan(bank_status);

        mca_init_bank(who, info, bank);

        if (mc_callback_bank_extended)
            mc_callback_bank_extended(info, bank, bank_status);

        /* The MCE scan only records which banks to clear; every other
         * caller clears the bank right away. */
        if (clear_it) {
            if (who == MCA_MCE_SCAN)
                mcabanks_set(bank, clear_bank);
            else
                mcabank_clear(bank);
        }

        wmb();
    }

    /* Classify the whole event by the most severe condition seen. */
    if (global && nvalid > 0)
        global->mc_flags |= saw_pcc ? MC_FLAG_UNCORRECTABLE
                          : saw_uc  ? MC_FLAG_RECOVERABLE
                                    : MC_FLAG_CORRECTABLE;

    if (sp) {
        sp->errcnt = nvalid;
        sp->ripv = !!(global_status & MCG_STATUS_RIPV);
        sp->eipv = !!(global_status & MCG_STATUS_EIPV);
        sp->uc = saw_uc;
        sp->pcc = saw_pcc;
        sp->recoverable = recoverable;
    }

    /* May be NULL. */
    if (info == NULL)
        return NULL;
    return cookie;
}
예제 #3
0
/* Shared #MC handler. */
void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
                        struct mca_banks *bankmask)
{
    int xen_state_lost, dom0_state_lost, domU_state_lost;
    struct vcpu *v = current;
    struct domain *curdom = v->domain;
    domid_t domid = curdom->domain_id;
    int ctx_xen, ctx_dom0, ctx_domU;
    uint32_t dom_state = DOM_NORMAL;
    mctelem_cookie_t mctc = NULL;
    struct mca_summary bs;
    struct mc_info *mci = NULL;
    int irqlocked = 0;
    uint64_t gstatus;
    int ripv;

    /* This handler runs as interrupt gate. So IPIs from the
     * polling service routine are defered until we're finished.
     */

    /* Disable interrupts for the _vcpu_. It may not re-scheduled to
     * another physical CPU. */
    vcpu_schedule_lock_irq(v);
    irqlocked = 1;

    /* Read global status;  if it does not indicate machine check
     * in progress then bail as long as we have a valid ip to return to. */
    gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
    ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
    if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
        add_taint(TAINT_MACHINE_CHECK); /* questionable */
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;
        goto cmn_handler_done;
    }

    /* Go and grab error telemetry.  We must choose whether to commit
     * for logging or dismiss the cookie that is returned, and must not
     * reference the cookie after that action.
     */
    mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs, NULL);
    if (mctc != NULL)
        mci = (struct mc_info *)mctelem_dataptr(mctc);

    /* Clear MCIP or another #MC will enter shutdown state */
    gstatus &= ~MCG_STATUS_MCIP;
    mca_wrmsr(MSR_IA32_MCG_STATUS, gstatus);
    wmb();

    /* If no valid errors and our stack is intact, we're done */
    if (ripv && bs.errcnt == 0) {
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;
        goto cmn_handler_done;
    }

    if (bs.uc || bs.pcc)
        add_taint(TAINT_MACHINE_CHECK);

    /* Machine check exceptions will usually be for UC and/or PCC errors,
     * but it is possible to configure machine check for some classes
     * of corrected error.
     *
     * UC errors could compromise any domain or the hypervisor
     * itself - for example a cache writeback of modified data that
     * turned out to be bad could be for data belonging to anyone, not
     * just the current domain.  In the absence of known data poisoning
     * to prevent consumption of such bad data in the system we regard
     * all UC errors as terminal.  It may be possible to attempt some
     * heuristics based on the address affected, which guests have
     * mappings to that mfn etc.
     *
     * PCC errors apply to the current context.
     *
     * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
     * and not PCC is terminal - the return instruction pointer
     * pushed onto the stack is bogus.  If the interrupt context is
     * the hypervisor or dom0 the game is over, otherwise we can
     * limit the impact to a single domU but only if we trampoline
     * somewhere safely - we can't return and unwind the stack.
     * Since there is no trampoline in place we will treat !RIPV
     * as terminal for any context.
     */
    ctx_xen = SEG_PL(regs->cs) == 0;
    ctx_dom0 = !ctx_xen && (domid == 0);
    ctx_domU = !ctx_xen && !ctx_dom0;

    xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
        !ripv;
    dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
    domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));

    if (xen_state_lost) {
        /* Now we are going to panic anyway. Allow interrupts, so that
         * printk on serial console can work. */
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;

        printk("Terminal machine check exception occurred in "
               "hypervisor context.\n");

        /* If MCG_STATUS_EIPV indicates, the IP on the stack is related
         * to the error then it makes sense to print a stack trace.
         * That can be useful for more detailed error analysis and/or
         * error case studies to figure out, if we can clear
         * xen_impacted and kill a DomU instead
         * (i.e. if a guest only control structure is affected, but then
         * we must ensure the bad pages are not re-used again).
         */
        if (bs.eipv & MCG_STATUS_EIPV) {
            printk("MCE: Instruction Pointer is related to the "
                   "error, therefore print the execution state.\n");
            show_execution_state(regs);
        }

        /* Commit the telemetry so that panic flow can find it. */
        if (mctc != NULL) {
            x86_mcinfo_dump(mci);
            mctelem_commit(mctc);
        }
        mc_panic("Hypervisor state lost due to machine check "
                 "exception.\n");
        /*NOTREACHED*/
    }

    /*
     * Xen hypervisor state is intact.  If dom0 state is lost then
     * give it a chance to decide what to do if it has registered
     * a handler for this event, otherwise panic.
     *
     * XXFM Could add some Solaris dom0 contract kill here?
     */
    if (dom0_state_lost) {
        if (dom0 && dom0->max_vcpus && dom0->vcpu[0] &&
            guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
            dom_state = DOM0_TRAP;
            send_guest_trap(dom0, 0, TRAP_machine_check);
            /* XXFM case of return with !ripv ??? */
        } else {
            /* Commit telemetry for panic flow. */
            if (mctc != NULL) {
                x86_mcinfo_dump(mci);
                mctelem_commit(mctc);
            }
            mc_panic("Dom0 state lost due to machine check "
                     "exception\n");
            /*NOTREACHED*/
        }
    }

    /*
     * If a domU has lost state then send it a trap if it has registered
     * a handler, otherwise crash the domain.
     * XXFM Revisit this functionality.
     */
    if (domU_state_lost) {
        if (guest_has_trap_callback(v->domain, v->vcpu_id,
                                    TRAP_machine_check)) {
            dom_state = DOMU_TRAP;
            send_guest_trap(curdom, v->vcpu_id,
                            TRAP_machine_check);
        } else {
            dom_state = DOMU_KILLED;
            /* Enable interrupts. This basically results in
             * calling sti on the *physical* cpu. But after
             * domain_crash() the vcpu pointer is invalid.
             * Therefore, we must unlock the irqs before killing
             * it. */
            vcpu_schedule_unlock_irq(v);
            irqlocked = 0;

            /* DomU is impacted. Kill it and continue. */
            domain_crash(curdom);
        }
    }

    switch (dom_state) {
    case DOM0_TRAP:
    case DOMU_TRAP:
        /* Enable interrupts. */
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;

        /* guest softirqs and event callbacks are scheduled
         * immediately after this handler exits. */
        break;
    case DOMU_KILLED:
        /* Nothing to do here. */
        break;

    case DOM_NORMAL:
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;
        break;
    }

 cmn_handler_done:
    BUG_ON(irqlocked);
    BUG_ON(!ripv);

    if (bs.errcnt) {
        /* Not panicing, so forward telemetry to dom0 now if it
         * is interested. */
        if (dom0_vmce_enabled()) {
            if (mctc != NULL)
                mctelem_commit(mctc);
            send_guest_global_virq(dom0, VIRQ_MCA);
        } else {
            x86_mcinfo_dump(mci);
            if (mctc != NULL)
                mctelem_dismiss(mctc);
        }
    } else if (mctc != NULL) {
        mctelem_dismiss(mctc);
    }
}