static int ioda_eeh_event(struct notifier_block *nb, unsigned long events, void *change) { uint64_t changed_evts = (uint64_t)change; /* We simply send special EEH event */ if ((changed_evts & OPAL_EVENT_PCI_ERROR) && (events & OPAL_EVENT_PCI_ERROR)) eeh_send_failure_event(NULL); return 0; }
/* * On PowerNV platform, we might already have fenced PHB there. * For that case, it's meaningless to recover frozen PE. Intead, * We have to handle fenced PHB firstly. */ static int eeh_phb_check_failure(struct eeh_pe *pe) { struct eeh_pe *phb_pe; unsigned long flags; int ret; if (!eeh_has_flag(EEH_PROBE_MODE_DEV)) return -EPERM; /* Find the PHB PE */ phb_pe = eeh_phb_pe_get(pe->phb); if (!phb_pe) { pr_warn("%s Can't find PE for PHB#%d\n", __func__, pe->phb->global_number); return -EEXIST; } /* If the PHB has been in problematic state */ eeh_serialize_lock(&flags); if (phb_pe->state & EEH_PE_ISOLATED) { ret = 0; goto out; } /* Check PHB state */ ret = eeh_ops->get_state(phb_pe, NULL); if ((ret < 0) || (ret == EEH_STATE_NOT_SUPPORT) || (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) { ret = 0; goto out; } /* Isolate the PHB and send event */ eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); eeh_serialize_unlock(flags); pr_err("EEH: PHB#%x failure detected, location: %s\n", phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe)); dump_stack(); eeh_send_failure_event(phb_pe); return 1; out: eeh_serialize_unlock(flags); return ret; }
/** * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze * @dn device node * @dev pci device, if known * * Check for an EEH failure for the given device node. Call this * routine if the result of a read was all 0xff's and you want to * find out if this is due to an EEH slot freeze. This routine * will query firmware for the EEH status. * * Returns 0 if there has not been an EEH error; otherwise returns * a non-zero value and queues up a slot isolation event notification. * * It is safe to call this routine in an interrupt context. */ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) { int ret; int rets[3]; unsigned long flags; struct pci_dn *pdn; enum pci_channel_state state; int rc = 0; total_mmio_ffs++; if (!eeh_subsystem_enabled) return 0; if (!dn) { no_dn++; return 0; } pdn = PCI_DN(dn); /* Access to IO BARs might get this far and still not want checking. */ if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || pdn->eeh_mode & EEH_MODE_NOCHECK) { ignored_check++; #ifdef DEBUG printk ("EEH:ignored check (%x) for %s %s\n", pdn->eeh_mode, pci_name (dev), dn->full_name); #endif return 0; } if (!pdn->eeh_config_addr && !pdn->eeh_pe_config_addr) { no_cfg_addr++; return 0; } /* If we already have a pending isolation event for this * slot, we know it's bad already, we don't need to check. * Do this checking under a lock; as multiple PCI devices * in one slot might report errors simultaneously, and we * only want one error recovery routine running. */ spin_lock_irqsave(&confirm_error_lock, flags); rc = 1; if (pdn->eeh_mode & EEH_MODE_ISOLATED) { pdn->eeh_check_count ++; if (pdn->eeh_check_count >= EEH_MAX_FAILS) { printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n", pdn->eeh_check_count); dump_stack(); msleep(5000); /* re-read the slot reset state */ if (read_slot_reset_state(pdn, rets) != 0) rets[0] = -1; /* reset state unknown */ /* If we are here, then we hit an infinite loop. Stop. */ panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], pci_name(dev)); } goto dn_unlock; } /* * Now test for an EEH failure. This is VERY expensive. * Note that the eeh_config_addr may be a parent device * in the case of a device behind a bridge, or it may be * function zero of a multi-function device. * In any case they must share a common PHB. */ ret = read_slot_reset_state(pdn, rets); /* If the call to firmware failed, punt */ if (ret != 0) { printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n", ret, dn->full_name); false_positives++; rc = 0; goto dn_unlock; } /* If EEH is not supported on this device, punt. */ if (rets[1] != 1) { printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n", ret, dn->full_name); false_positives++; rc = 0; goto dn_unlock; } /* If not the kind of error we know about, punt. */ if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) { false_positives++; rc = 0; goto dn_unlock; } /* Note that config-io to empty slots may fail; * we recognize empty because they don't have children. */ if ((rets[0] == 5) && (dn->child == NULL)) { false_positives++; rc = 0; goto dn_unlock; } slot_resets++; /* Avoid repeated reports of this failure, including problems * with other functions on this device, and functions under * bridges. */ eeh_mark_slot (dn, EEH_MODE_ISOLATED); spin_unlock_irqrestore(&confirm_error_lock, flags); state = pci_channel_io_normal; if ((rets[0] == 2) || (rets[0] == 4)) state = pci_channel_io_frozen; if (rets[0] == 5) state = pci_channel_io_perm_failure; eeh_send_failure_event (dn, dev, state, rets[2]); /* Most EEH events are due to device driver bugs. Having * a stack trace will help the device-driver authors figure * out what happened. So print that out. */ if (rets[0] != 5) dump_stack(); return 1; dn_unlock: spin_unlock_irqrestore(&confirm_error_lock, flags); return rc; }