static int mce_handle_error(struct rtas_error_log *errp) { struct pseries_errorlog *pseries_log; struct pseries_mc_errorlog *mce_log; int disposition = rtas_error_disposition(errp); u8 error_type; if (!rtas_error_extended(errp)) goto out; pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); if (pseries_log == NULL) goto out; mce_log = (struct pseries_mc_errorlog *)pseries_log->data; error_type = mce_log->error_type; #ifdef CONFIG_PPC_BOOK3S_64 if (disposition == RTAS_DISP_NOT_RECOVERED) { switch (error_type) { case MC_ERROR_TYPE_SLB: case MC_ERROR_TYPE_ERAT: /* * Store the old slb content in paca before flushing. * Print this when we go to virtual mode. * There are chances that we may hit MCE again if there * is a parity error on the SLB entry we trying to read * for saving. Hence limit the slb saving to single * level of recursion. */ if (local_paca->in_mce == 1) slb_save_contents(local_paca->mce_faulty_slbs); flush_and_reload_slb(); disposition = RTAS_DISP_FULLY_RECOVERED; rtas_set_disposition_recovered(errp); break; default: break; } } #endif out: return disposition; }
/* * See if we can recover from a machine check exception. * This is only called on power4 (or above) and only via * the Firmware Non-Maskable Interrupts (fwnmi) handler * which provides the error analysis for us. * * Return 1 if corrected (or delivered a signal). * Return 0 if there is nothing we can do. */ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) { int recovered = 0; int disposition = rtas_error_disposition(err); pseries_print_mce_info(regs, err); if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ pr_err("Machine check interrupt unrecoverable: MSR(RI=0)\n"); recovered = 0; } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { /* Platform corrected itself */ recovered = 1; } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { /* Platform corrected itself but could be degraded */ printk(KERN_ERR "MCE: limited recovery, system may " "be degraded\n"); recovered = 1; } else if (user_mode(regs) && !is_global_init(current) && rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { /* * If we received a synchronous error when in userspace * kill the task. Firmware may report details of the fail * asynchronously, so we can't rely on the target and type * fields being valid here. */ printk(KERN_ERR "MCE: uncorrectable error, killing task " "%s:%d\n", current->comm, current->pid); _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); recovered = 1; } pseries_process_ue(regs, err); /* Queue irq work to log this rtas event later. */ irq_work_queue(&mce_errlog_process_work); return recovered; }
/* * See if we can recover from a machine check exception. * This is only called on power4 (or above) and only via * the Firmware Non-Maskable Interrupts (fwnmi) handler * which provides the error analysis for us. * * Return 1 if corrected (or delivered a signal). * Return 0 if there is nothing we can do. */ static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) { int recovered = 0; int disposition = rtas_error_disposition(err); if (!(regs->msr & MSR_RI)) { /* If MSR_RI isn't set, we cannot recover */ recovered = 0; } else if (disposition == RTAS_DISP_FULLY_RECOVERED) { /* Platform corrected itself */ recovered = 1; } else if (disposition == RTAS_DISP_LIMITED_RECOVERY) { /* Platform corrected itself but could be degraded */ printk(KERN_ERR "MCE: limited recovery, system may " "be degraded\n"); recovered = 1; } else if (user_mode(regs) && !is_global_init(current) && rtas_error_severity(err) == RTAS_SEVERITY_ERROR_SYNC) { /* * If we received a synchronous error when in userspace * kill the task. Firmware may report details of the fail * asynchronously, so we can't rely on the target and type * fields being valid here. */ printk(KERN_ERR "MCE: uncorrectable error, killing task " "%s:%d\n", current->comm, current->pid); _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); recovered = 1; } log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); return recovered; }
static void pseries_print_mce_info(struct pt_regs *regs, struct rtas_error_log *errp) { const char *level, *sevstr; struct pseries_errorlog *pseries_log; struct pseries_mc_errorlog *mce_log; u8 error_type, err_sub_type; u64 addr; u8 initiator = rtas_error_initiator(errp); int disposition = rtas_error_disposition(errp); static const char * const initiators[] = { "Unknown", "CPU", "PCI", "ISA", "Memory", "Power Mgmt", }; static const char * const mc_err_types[] = { "UE", "SLB", "ERAT", "TLB", "D-Cache", "Unknown", "I-Cache", }; static const char * const mc_ue_types[] = { "Indeterminate", "Instruction fetch", "Page table walk ifetch", "Load/Store", "Page table walk Load/Store", }; /* SLB sub errors valid values are 0x0, 0x1, 0x2 */ static const char * const mc_slb_types[] = { "Parity", "Multihit", "Indeterminate", }; /* TLB and ERAT sub errors valid values are 0x1, 0x2, 0x3 */ static const char * const mc_soft_types[] = { "Unknown", "Parity", "Multihit", "Indeterminate", }; if (!rtas_error_extended(errp)) { pr_err("Machine check interrupt: Missing extended error log\n"); return; } pseries_log = get_pseries_errorlog(errp, PSERIES_ELOG_SECT_ID_MCE); if (pseries_log == NULL) return; mce_log = (struct pseries_mc_errorlog *)pseries_log->data; error_type = mce_log->error_type; err_sub_type = rtas_mc_error_sub_type(mce_log); switch (rtas_error_severity(errp)) { case RTAS_SEVERITY_NO_ERROR: level = KERN_INFO; sevstr = "Harmless"; break; case RTAS_SEVERITY_WARNING: level = KERN_WARNING; sevstr = ""; break; case RTAS_SEVERITY_ERROR: case RTAS_SEVERITY_ERROR_SYNC: level = KERN_ERR; sevstr = "Severe"; break; case RTAS_SEVERITY_FATAL: default: level = KERN_ERR; sevstr = "Fatal"; break; } #ifdef CONFIG_PPC_BOOK3S_64 /* Display faulty slb contents for SLB errors. */ if (error_type == MC_ERROR_TYPE_SLB) slb_dump_contents(local_paca->mce_faulty_slbs); #endif printk("%s%s Machine check interrupt [%s]\n", level, sevstr, disposition == RTAS_DISP_FULLY_RECOVERED ? "Recovered" : "Not recovered"); if (user_mode(regs)) { printk("%s NIP: [%016lx] PID: %d Comm: %s\n", level, regs->nip, current->pid, current->comm); } else { printk("%s NIP [%016lx]: %pS\n", level, regs->nip, (void *)regs->nip); } printk("%s Initiator: %s\n", level, VAL_TO_STRING(initiators, initiator)); switch (error_type) { case MC_ERROR_TYPE_UE: printk("%s Error type: %s [%s]\n", level, VAL_TO_STRING(mc_err_types, error_type), VAL_TO_STRING(mc_ue_types, err_sub_type)); break; case MC_ERROR_TYPE_SLB: printk("%s Error type: %s [%s]\n", level, VAL_TO_STRING(mc_err_types, error_type), VAL_TO_STRING(mc_slb_types, err_sub_type)); break; case MC_ERROR_TYPE_ERAT: case MC_ERROR_TYPE_TLB: printk("%s Error type: %s [%s]\n", level, VAL_TO_STRING(mc_err_types, error_type), VAL_TO_STRING(mc_soft_types, err_sub_type)); break; default: printk("%s Error type: %s\n", level, VAL_TO_STRING(mc_err_types, error_type)); break; } addr = rtas_mc_get_effective_addr(mce_log); if (addr) printk("%s Effective address: %016llx\n", level, addr); }