Beispiel #1
0
/*
 * Inspect the machine check section of an RTAS error log and, where the
 * kernel can repair the fault itself, do so.
 *
 * Returns the disposition reported in @errp, or
 * RTAS_DISP_FULLY_RECOVERED when a not-recovered SLB/ERAT error was
 * handled here by flushing and reloading the SLB (in which case @errp
 * is also marked as recovered).
 */
static int mce_handle_error(struct rtas_error_log *errp)
{
	int disposition = rtas_error_disposition(errp);
	struct pseries_errorlog *sect;
	struct pseries_mc_errorlog *mc;
	u8 type;

	/* Without an extended log there is no MCE section to look at. */
	if (!rtas_error_extended(errp))
		return disposition;

	sect = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
	if (!sect)
		return disposition;

	mc = (struct pseries_mc_errorlog *)sect->data;
	type = mc->error_type;

#ifdef CONFIG_PPC_BOOK3S_64
	if (disposition == RTAS_DISP_NOT_RECOVERED &&
	    (type == MC_ERROR_TYPE_SLB || type == MC_ERROR_TYPE_ERAT)) {
		/*
		 * Snapshot the current SLB contents into the paca before
		 * they are flushed, so they can be printed once we are in
		 * virtual mode. Reading an SLB entry with a parity error
		 * may itself raise another MCE, so the snapshot is only
		 * taken at the first level of MCE nesting.
		 */
		if (local_paca->in_mce == 1)
			slb_save_contents(local_paca->mce_faulty_slbs);

		flush_and_reload_slb();
		disposition = RTAS_DISP_FULLY_RECOVERED;
		rtas_set_disposition_recovered(errp);
	}
#endif

	return disposition;
}
Beispiel #2
0
/*
 * See if we can recover from a machine check exception.
 * This is only called on power4 (or above) and only via
 * the Firmware Non-Maskable Interrupts (fwnmi) handler
 * which provides the error analysis for us.
 *
 * Return 1 if corrected (or delivered a signal).
 * Return 0 if there is nothing we can do.
 */
static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
{
	int recovered = 0;
	int disposition = rtas_error_disposition(err);

	pseries_print_mce_info(regs, err);

	if (!(regs->msr & MSR_RI)) {
		/* If MSR_RI isn't set, we cannot recover */
		pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n");
		recovered = 0;

	} else if (disposition == RTAS_DISP_FULLY_RECOVERED) {
		/* Platform corrected itself */
		recovered = 1;

	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
		/* Platform corrected itself but could be degraded */
		printk(KERN_ERR "MCE: limited recovery, system may "
		       "be degraded\n");
		recovered = 1;

	} else if (user_mode(regs) && !is_global_init(current) &&
		   rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) {

		/*
		 * If we received a synchronous error when in userspace
		 * kill the task. Firmware may report details of the fail
		 * asynchronously, so we can't rely on the target and type
		 * fields being valid here.
		 */
		printk(KERN_ERR "MCE: uncorrectable error, killing task "
		       "%s:%d\n", current->comm, current->pid);

		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
		recovered = 1;
	}

	pseries_process_ue(regs, err);

	/* Queue irq work to log this rtas event later. */
	irq_work_queue(&mce_errlog_process_work);

	return recovered;
}
Beispiel #3
0
/*
 * See if we can recover from a machine check exception.
 * This is only called on power4 (or above) and only via
 * the Firmware Non-Maskable Interrupts (fwnmi) handler
 * which provides the error analysis for us.
 *
 * Return 1 if corrected (or delivered a signal).
 * Return 0 if there is nothing we can do.
 */
static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err)
{
	int recovered = 0;
	int disposition = rtas_error_disposition(err);

	if (!(regs->msr & MSR_RI)) {
		/* If MSR_RI isn't set, we cannot recover */
		recovered = 0;

	} else if (disposition == RTAS_DISP_FULLY_RECOVERED) {
		/* Platform corrected itself */
		recovered = 1;

	} else if (disposition == RTAS_DISP_LIMITED_RECOVERY) {
		/* Platform corrected itself but could be degraded */
		printk(KERN_ERR "MCE: limited recovery, system may "
		       "be degraded\n");
		recovered = 1;

	} else if (user_mode(regs) && !is_global_init(current) &&
		   rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) {

		/*
		 * If we received a synchronous error when in userspace
		 * kill the task. Firmware may report details of the fail
		 * asynchronously, so we can't rely on the target and type
		 * fields being valid here.
		 */
		printk(KERN_ERR "MCE: uncorrectable error, killing task "
		       "%s:%d\n", current->comm, current->pid);

		_exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip);
		recovered = 1;
	}

	log_error((char *)err, ERR_TYPE_RTAS_LOG, 0);

	return recovered;
}
Beispiel #4
0
/*
 * Print a human-readable summary of a machine check reported through an
 * RTAS error log: severity, recovery status, the interrupted NIP (plus
 * task details when the fault came from userspace), the initiator, the
 * error type/sub-type and, when firmware supplied one, the effective
 * address.
 *
 * @regs: register state at the time of the machine check.
 * @errp: RTAS error log describing the machine check.
 *
 * If the log has no extended part a single error line is printed; if it
 * has no MCE section nothing is printed at all.
 */
static void pseries_print_mce_info(struct pt_regs *regs,
				   struct rtas_error_log *errp)
{
	const char *level, *sevstr;
	struct pseries_errorlog *pseries_log;
	struct pseries_mc_errorlog *mce_log;
	u8 error_type, err_sub_type;
	u64 addr;
	u8 initiator = rtas_error_initiator(errp);
	int disposition = rtas_error_disposition(errp);

	static const char * const initiators[] = {
		"Unknown",
		"CPU",
		"PCI",
		"ISA",
		"Memory",
		"Power Mgmt",
	};

	/*
	 * Indexed directly by the machine check error type code
	 * (MC_ERROR_TYPE_*). Per PAPR, TLB is 0x04, D-Cache is 0x05 and
	 * I-Cache is 0x07; codes 0x03 and 0x06 are not assigned, so they
	 * need placeholder entries to keep the later strings lined up
	 * with their codes.
	 */
	static const char * const mc_err_types[] = {
		"UE",
		"SLB",
		"ERAT",
		"Unknown",
		"TLB",
		"D-Cache",
		"Unknown",
		"I-Cache",
	};
	static const char * const mc_ue_types[] = {
		"Indeterminate",
		"Instruction fetch",
		"Page table walk ifetch",
		"Load/Store",
		"Page table walk Load/Store",
	};

	/* SLB sub errors valid values are 0x0, 0x1, 0x2 */
	static const char * const mc_slb_types[] = {
		"Parity",
		"Multihit",
		"Indeterminate",
	};

	/* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */
	static const char * const mc_soft_types[] = {
		"Unknown",
		"Parity",
		"Multihit",
		"Indeterminate",
	};

	if (!rtas_error_extended(errp)) {
		pr_err("Machine check interrupt: Missing extended error log\n");
		return;
	}

	pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE);
	if (pseries_log == NULL)
		return;

	mce_log = (struct pseries_mc_errorlog *)pseries_log->data;

	error_type = mce_log->error_type;
	err_sub_type = rtas_mc_error_sub_type(mce_log);

	/* Map the RTAS severity onto a printk level and a label. */
	switch (rtas_error_severity(errp)) {
	case RTAS_SEVERITY_NO_ERROR:
		level = KERN_INFO;
		sevstr = "Harmless";
		break;
	case RTAS_SEVERITY_WARNING:
		level = KERN_WARNING;
		sevstr = "";
		break;
	case RTAS_SEVERITY_ERROR:
	case RTAS_SEVERITY_ERROR_SYNC:
		level = KERN_ERR;
		sevstr = "Severe";
		break;
	case RTAS_SEVERITY_FATAL:
	default:
		level = KERN_ERR;
		sevstr = "Fatal";
		break;
	}

#ifdef CONFIG_PPC_BOOK3S_64
	/* Display faulty slb contents for SLB errors. */
	if (error_type == MC_ERROR_TYPE_SLB)
		slb_dump_contents(local_paca->mce_faulty_slbs);
#endif

	/* The level string is passed as the first argument of each line. */
	printk("%s%s Machine check interrupt [%s]\n", level, sevstr,
	       disposition == RTAS_DISP_FULLY_RECOVERED ?
	       "Recovered" : "Not recovered");
	if (user_mode(regs)) {
		printk("%s  NIP: [%016lx] PID: %d Comm: %s\n", level,
		       regs->nip, current->pid, current->comm);
	} else {
		printk("%s  NIP [%016lx]: %pS\n", level, regs->nip,
		       (void *)regs->nip);
	}
	printk("%s  Initiator: %s\n", level,
	       VAL_TO_STRING(initiators, initiator));

	/* Each error type has its own sub-type table; others have none. */
	switch (error_type) {
	case MC_ERROR_TYPE_UE:
		printk("%s  Error type: %s [%s]\n", level,
		       VAL_TO_STRING(mc_err_types, error_type),
		       VAL_TO_STRING(mc_ue_types, err_sub_type));
		break;
	case MC_ERROR_TYPE_SLB:
		printk("%s  Error type: %s [%s]\n", level,
		       VAL_TO_STRING(mc_err_types, error_type),
		       VAL_TO_STRING(mc_slb_types, err_sub_type));
		break;
	case MC_ERROR_TYPE_ERAT:
	case MC_ERROR_TYPE_TLB:
		printk("%s  Error type: %s [%s]\n", level,
		       VAL_TO_STRING(mc_err_types, error_type),
		       VAL_TO_STRING(mc_soft_types, err_sub_type));
		break;
	default:
		printk("%s  Error type: %s\n", level,
		       VAL_TO_STRING(mc_err_types, error_type));
		break;
	}

	/* A zero effective address is not printed. */
	addr = rtas_mc_get_effective_addr(mce_log);
	if (addr)
		printk("%s    Effective address: %016llx\n", level, addr);
}