Example #1
0
/*
 * Shared #MC handler.
 *
 * Common machine check exception path: collects error telemetry, clears
 * MCG_STATUS.MCIP, decides which context (hypervisor, dom0 or a domU) has
 * lost state, and then panics, traps the affected guest, crashes the
 * affected domU, or simply returns.
 *
 * regs:       register state at the time of the exception.  regs->cs is
 *             used to tell hypervisor context from guest context, and the
 *             whole frame is handed to show_execution_state() on panic.
 * error_code: not referenced by this function.
 * bankmask:   handed unchanged to mcheck_mca_logout().
 *
 * Does not return if hypervisor state is lost, or if dom0 state is lost
 * and dom0 has no machine check callback (mc_panic() is called).
 */
void mcheck_cmn_handler(struct cpu_user_regs *regs, long error_code,
                        struct mca_banks *bankmask)
{
    int xen_state_lost, dom0_state_lost, domU_state_lost;
    struct vcpu *v = current;
    struct domain *curdom = v->domain;
    domid_t domid = curdom->domain_id;
    int ctx_xen, ctx_dom0, ctx_domU;
    uint32_t dom_state = DOM_NORMAL;
    mctelem_cookie_t mctc = NULL;
    struct mca_summary bs;
    struct mc_info *mci = NULL;
    int irqlocked = 0;
    uint64_t gstatus;
    int ripv;

    /* bs is only filled in by mcheck_mca_logout() below, but the
     * "no machine check in progress" early exit jumps to
     * cmn_handler_done before that call and bs.errcnt is read there.
     * Give it a defined value so that path does not read garbage.
     */
    bs.errcnt = 0;

    /* This handler runs as interrupt gate. So IPIs from the
     * polling service routine are deferred until we're finished.
     */

    /* Disable interrupts for the _vcpu_. It may not be re-scheduled to
     * another physical CPU. */
    vcpu_schedule_lock_irq(v);
    irqlocked = 1;

    /* Read global status;  if it does not indicate machine check
     * in progress then bail as long as we have a valid ip to return to. */
    gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
    ripv = ((gstatus & MCG_STATUS_RIPV) != 0);
    if (!(gstatus & MCG_STATUS_MCIP) && ripv) {
        add_taint(TAINT_MACHINE_CHECK); /* questionable */
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;
        goto cmn_handler_done;
    }

    /* Go and grab error telemetry.  We must choose whether to commit
     * for logging or dismiss the cookie that is returned, and must not
     * reference the cookie after that action.
     */
    mctc = mcheck_mca_logout(MCA_MCE_HANDLER, bankmask, &bs, NULL);
    if (mctc != NULL)
        mci = (struct mc_info *)mctelem_dataptr(mctc);

    /* Clear MCIP or another #MC will enter shutdown state */
    gstatus &= ~MCG_STATUS_MCIP;
    mca_wrmsr(MSR_IA32_MCG_STATUS, gstatus);
    wmb();

    /* If no valid errors and our stack is intact, we're done */
    if (ripv && bs.errcnt == 0) {
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;
        goto cmn_handler_done;
    }

    if (bs.uc || bs.pcc)
        add_taint(TAINT_MACHINE_CHECK);

    /* Machine check exceptions will usually be for UC and/or PCC errors,
     * but it is possible to configure machine check for some classes
     * of corrected error.
     *
     * UC errors could compromise any domain or the hypervisor
     * itself - for example a cache writeback of modified data that
     * turned out to be bad could be for data belonging to anyone, not
     * just the current domain.  In the absence of known data poisoning
     * to prevent consumption of such bad data in the system we regard
     * all UC errors as terminal.  It may be possible to attempt some
     * heuristics based on the address affected, which guests have
     * mappings to that mfn etc.
     *
     * PCC errors apply to the current context.
     *
     * If MCG_STATUS indicates !RIPV then even a #MC that is not UC
     * and not PCC is terminal - the return instruction pointer
     * pushed onto the stack is bogus.  If the interrupt context is
     * the hypervisor or dom0 the game is over, otherwise we can
     * limit the impact to a single domU but only if we trampoline
     * somewhere safely - we can't return and unwind the stack.
     * Since there is no trampoline in place we will treat !RIPV
     * as terminal for any context.
     */
    ctx_xen = SEG_PL(regs->cs) == 0;
    ctx_dom0 = !ctx_xen && (domid == 0);
    ctx_domU = !ctx_xen && !ctx_dom0;

    /* Note: because !ripv and UC both force xen_state_lost, the dom0 and
     * domU cases below are only reached for a PCC error in that context,
     * and at most one of them can be set. */
    xen_state_lost = bs.uc != 0 || (ctx_xen && (bs.pcc || !ripv)) ||
        !ripv;
    dom0_state_lost = bs.uc != 0 || (ctx_dom0 && (bs.pcc || !ripv));
    domU_state_lost = bs.uc != 0 || (ctx_domU && (bs.pcc || !ripv));

    if (xen_state_lost) {
        /* Now we are going to panic anyway. Allow interrupts, so that
         * printk on serial console can work. */
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;

        printk("Terminal machine check exception occurred in "
               "hypervisor context.\n");

        /* If MCG_STATUS_EIPV indicates, the IP on the stack is related
         * to the error then it makes sense to print a stack trace.
         * That can be useful for more detailed error analysis and/or
         * error case studies to figure out, if we can clear
         * xen_impacted and kill a DomU instead
         * (i.e. if a guest only control structure is affected, but then
         * we must ensure the bad pages are not re-used again).
         *
         * Test the bit in the MCG_STATUS value read above (only MCIP
         * has been cleared in it).  Masking bs.eipv with the bit mask
         * would never be true if the summary stores EIPV as a 0/1 flag,
         * the way ripv is computed in this function.
         */
        if (gstatus & MCG_STATUS_EIPV) {
            printk("MCE: Instruction Pointer is related to the "
                   "error, therefore print the execution state.\n");
            show_execution_state(regs);
        }

        /* Commit the telemetry so that panic flow can find it. */
        if (mctc != NULL) {
            x86_mcinfo_dump(mci);
            mctelem_commit(mctc);
        }
        mc_panic("Hypervisor state lost due to machine check "
                 "exception.\n");
        /*NOTREACHED*/
    }

    /*
     * Xen hypervisor state is intact.  If dom0 state is lost then
     * give it a chance to decide what to do if it has registered
     * a handler for this event, otherwise panic.
     *
     * XXFM Could add some Solaris dom0 contract kill here?
     */
    if (dom0_state_lost) {
        if (dom0 && dom0->max_vcpus && dom0->vcpu[0] &&
            guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
            dom_state = DOM0_TRAP;
            send_guest_trap(dom0, 0, TRAP_machine_check);
            /* XXFM case of return with !ripv ??? */
        } else {
            /* Commit telemetry for panic flow. */
            if (mctc != NULL) {
                x86_mcinfo_dump(mci);
                mctelem_commit(mctc);
            }
            mc_panic("Dom0 state lost due to machine check "
                     "exception\n");
            /*NOTREACHED*/
        }
    }

    /*
     * If a domU has lost state then send it a trap if it has registered
     * a handler, otherwise crash the domain.
     * XXFM Revisit this functionality.
     */
    if (domU_state_lost) {
        if (guest_has_trap_callback(v->domain, v->vcpu_id,
                                    TRAP_machine_check)) {
            dom_state = DOMU_TRAP;
            send_guest_trap(curdom, v->vcpu_id,
                            TRAP_machine_check);
        } else {
            dom_state = DOMU_KILLED;
            /* Enable interrupts. This basically results in
             * calling sti on the *physical* cpu. But after
             * domain_crash() the vcpu pointer is invalid.
             * Therefore, we must unlock the irqs before killing
             * it. */
            vcpu_schedule_unlock_irq(v);
            irqlocked = 0;

            /* DomU is impacted. Kill it and continue. */
            domain_crash(curdom);
        }
    }

    /* Drop the scheduler lock on every path that still holds it. */
    switch (dom_state) {
    case DOM0_TRAP:
    case DOMU_TRAP:
        /* Enable interrupts. */
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;

        /* guest softirqs and event callbacks are scheduled
         * immediately after this handler exits. */
        break;
    case DOMU_KILLED:
        /* Already unlocked before domain_crash(). */
        break;

    case DOM_NORMAL:
        vcpu_schedule_unlock_irq(v);
        irqlocked = 0;
        break;
    }

 cmn_handler_done:
    /* Every path reaching here must have released the lock and must
     * have a valid instruction pointer to return to. */
    BUG_ON(irqlocked);
    BUG_ON(!ripv);

    if (bs.errcnt) {
        /* Not panicing, so forward telemetry to dom0 now if it
         * is interested. */
        if (dom0_vmce_enabled()) {
            if (mctc != NULL)
                mctelem_commit(mctc);
            send_guest_global_virq(dom0, VIRQ_MCA);
        } else if (mctc != NULL) {
            /* mci is only set when a telemetry cookie was obtained, so
             * dump only in that case, as the panic paths do. */
            x86_mcinfo_dump(mci);
            mctelem_dismiss(mctc);
        }
    } else if (mctc != NULL) {
        mctelem_dismiss(mctc);
    }
}
Example #2
0
/*
 * Machine Check Handler for AMD K8 family series.
 *
 * Records a global entry plus one entry per valid MCA bank into the
 * buffer returned by x86_mcinfo_getptr(), clears the bank status
 * registers and MCG_STATUS.MCIP, and then:
 *   - panics if the exception hit while the idle domain was current
 *     (treated as "Xen impacted"),
 *   - otherwise delivers a machine check trap to Dom0 if it registered
 *     a callback,
 *   - otherwise delivers it to the current domain's vcpu if that
 *     registered a callback,
 *   - otherwise panics if the current domain is Dom0, or crashes the
 *     current DomU.
 *
 * regs:       register state at the time of the exception; only used for
 *             show_execution_state() on the panic path.
 * error_code: not referenced by this function.
 */
void k8_machine_check(struct cpu_user_regs *regs, long error_code)
{
	struct vcpu *vcpu = current;
	struct domain *curdom;
	struct mc_info *mc_data;
	struct mcinfo_global mc_global;
	struct mcinfo_bank mc_info;
	uint64_t status, addrv, miscv;
	uint32_t i;
	unsigned int cpu_nr;
	uint32_t xen_impacted = 0;
/* Outcome of the handler.  These macros are not #undef'd, so they stay
 * defined for the remainder of the translation unit. */
#define DOM_NORMAL	0
#define DOM0_TRAP	1
#define DOMU_TRAP	2
#define DOMU_KILLED	4
	uint32_t dom_state = DOM_NORMAL;

	/* This handler runs as interrupt gate. So IPIs from the
	 * polling service routine are deferred until we finished.
	 */

	/* Disable interrupts for the _vcpu_. It may not be re-scheduled to
	 * another physical CPU or the impacted process in the guest
	 * continues running with corrupted data, otherwise. */
	vcpu_schedule_lock_irq(vcpu);

	mc_data = x86_mcinfo_getptr();
	cpu_nr = smp_processor_id();
	curdom = vcpu->domain;

	memset(&mc_global, 0, sizeof(mc_global));
	mc_global.common.type = MC_TYPE_GLOBAL;
	mc_global.common.size = sizeof(mc_global);

	mc_global.mc_domid = curdom->domain_id; /* impacted domain */
	mc_global.mc_coreid = vcpu->processor; /* impacted physical cpu */
	BUG_ON(cpu_nr != vcpu->processor);
	mc_global.mc_core_threadid = 0;
	mc_global.mc_vcpuid = vcpu->vcpu_id; /* impacted vcpu */
	/* TODO: mc_global.mc_socketid is left at zero; it is not clear how
	 * to figure out on which socket this physical core is. */
	mc_global.mc_flags |= MC_FLAG_UNCORRECTABLE;
	rdmsrl(MSR_IA32_MCG_STATUS, mc_global.mc_gstatus);

	/* Quick check, who is impacted */
	xen_impacted = is_idle_domain(curdom);

	/* Dom0 */
	x86_mcinfo_clear(mc_data);
	x86_mcinfo_add(mc_data, &mc_global);

	for (i = 0; i < nr_mce_banks; i++) {
		rdmsrl(MSR_IA32_MC0_STATUS + 4 * i, status);

		if (!(status & MCi_STATUS_VAL))
			continue;

		/* An error happened in this bank.
		 * This is expected to be an uncorrectable error,
		 * since correctable errors get polled.
		 */

		memset(&mc_info, 0, sizeof(mc_info));
		mc_info.common.type = MC_TYPE_BANK;
		mc_info.common.size = sizeof(mc_info);
		mc_info.mc_bank = i;
		mc_info.mc_status = status;

		addrv = 0;
		if (status & MCi_STATUS_ADDRV) {
			struct domain *d;

			rdmsrl(MSR_IA32_MC0_ADDR + 4 * i, addrv);

			/* NOTE(review): when the address has no owner, or
			 * ADDRV is clear, mc_domid keeps the zero from the
			 * memset above -- confirm consumers do not read
			 * that as "Dom0". */
			d = maddr_get_owner(addrv);
			if (d != NULL)
				mc_info.mc_domid = d->domain_id;
		}

		miscv = 0;
		if (status & MCi_STATUS_MISCV)
			rdmsrl(MSR_IA32_MC0_MISC + 4 * i, miscv);

		mc_info.mc_addr = addrv;
		mc_info.mc_misc = miscv;

		x86_mcinfo_add(mc_data, &mc_info); /* Dom0 */

		if (mc_callback_bank_extended)
			mc_callback_bank_extended(mc_data, i, status);

		/* clear status */
		wrmsrl(MSR_IA32_MC0_STATUS + 4 * i, 0x0ULL);
		wmb();
		add_taint(TAINT_MACHINE_CHECK);
	}

	status = mc_global.mc_gstatus;

	/* clear MCIP or cpu enters shutdown state
	 * in case another MCE occurs. */
	status &= ~MCG_STATUS_MCIP;
	wrmsrl(MSR_IA32_MCG_STATUS, status);
	wmb();

	/* For the details see the discussion "MCE/MCA concept" on xen-devel.
	 * The thread started here:
	 * http://lists.xensource.com/archives/html/xen-devel/2007-05/msg01015.html
	 */

	/* MCG_STATUS_RIPV:
	 * When this bit is not set, then the instruction pointer onto the stack
	 * to resume at is not valid. If xen is interrupted, then we panic anyway
	 * right below. Otherwise it is up to the guest to figure out if
	 * guest kernel or guest userland is affected and should kill either
	 * itself or the affected process.
	 */

	/* MCG_STATUS_EIPV:
	 * Evaluation of EIPV is the job of the guest.
	 */

	if (xen_impacted) {
		/* Now we are going to panic anyway. Allow interrupts, so that
		 * printk on serial console can work. */
		vcpu_schedule_unlock_irq(vcpu);

		/* Uh, that means, machine check exception
		 * inside Xen occurred. */
		printk("Machine check exception occured in Xen.\n");

		/* if MCG_STATUS_EIPV indicates, the IP on the stack is related
		 * to the error then it makes sense to print a stack trace.
		 * That can be useful for more detailed error analysis and/or
		 * error case studies to figure out, if we can clear
		 * xen_impacted and kill a DomU instead
		 * (i.e. if a guest only control structure is affected, but then
		 * we must ensure the bad pages are not re-used again).
		 */
		if (status & MCG_STATUS_EIPV) {
			printk("MCE: Instruction Pointer is related to the error. "
				"Therefore, print the execution state.\n");
			show_execution_state(regs);
		}
		x86_mcinfo_dump(mc_data);
		mc_panic("End of MCE. Use mcelog to decode above error codes.\n");
	}

	if (dom0 != NULL &&
	    guest_has_trap_callback(dom0, 0, TRAP_machine_check)) {
		/* Dom0 registered a machine check handler, which is only
		 * possible with a PV MCA driver: deliver the trap to Dom0.
		 * (dom0 is checked for NULL before it is handed to
		 * guest_has_trap_callback().) */
		dom_state = DOM0_TRAP;
		send_guest_trap(dom0, 0, TRAP_machine_check);

		/* Xen may tell Dom0 now to notify the DomU.
		 * But this will happen through a hypercall. */
	} else if (guest_has_trap_callback(curdom, vcpu->vcpu_id,
					   TRAP_machine_check)) {
		/* Dom0 did not register a machine check handler, but the
		 * current domain did: deliver the trap to it. */
		dom_state = DOMU_TRAP;
		send_guest_trap(curdom, vcpu->vcpu_id, TRAP_machine_check);
	} else if (curdom == dom0) {
		/* Nobody registered a handler and Dom0 is impacted.
		 * Since nobody can handle this error, panic! */
		x86_mcinfo_dump(mc_data);
		mc_panic("MCE occured in Dom0, which it can't handle\n");

		/* UNREACHED */
	} else {
		/* Nobody registered a handler and a DomU is impacted. */
		dom_state = DOMU_KILLED;

		/* Enable interrupts. This basically results in
		 * calling sti on the *physical* cpu. But after
		 * domain_crash() the vcpu pointer is invalid.
		 * Therefore, we must unlock the irqs before killing
		 * it. */
		vcpu_schedule_unlock_irq(vcpu);

		/* DomU is impacted. Kill it and continue. */
		domain_crash(curdom);
	}

	switch (dom_state) {
	case DOM0_TRAP:
	case DOMU_TRAP:
		/* Enable interrupts. */
		vcpu_schedule_unlock_irq(vcpu);

		/* guest softirqs and event callbacks are scheduled
		 * immediately after this handler exits. */
		break;
	case DOMU_KILLED:
		/* Already unlocked before domain_crash(). */
		break;
	default:
		/* Every non-panic path above sets dom_state. */
		BUG();
	}
}