/** * eeh_report_resume - Tell device to resume normal operations * @data: eeh device * @userdata: return value * * This routine must be called to notify the device driver that it * could resume so that the device driver can do some initialization * to make the recovered device work again. */ static void *eeh_report_resume(void *data, void *userdata) { struct eeh_dev *edev = (struct eeh_dev *)data; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); struct pci_driver *driver; if (!dev || eeh_dev_removed(edev)) return NULL; dev->error_state = pci_channel_io_normal; driver = eeh_pcid_get(dev); if (!driver) return NULL; eeh_enable_irq(dev); if (!driver->err_handler || !driver->err_handler->resume || (edev->mode & EEH_DEV_NO_HANDLER)) { edev->mode &= ~EEH_DEV_NO_HANDLER; eeh_pcid_put(dev); return NULL; } driver->err_handler->resume(dev); eeh_pcid_put(dev); return NULL; }
/** * eeh_report_failure - Tell device driver that device is dead. * @data: eeh device * @userdata: return value * * This informs the device driver that the device is permanently * dead, and that no further recovery attempts will be made on it. */ static void *eeh_report_failure(void *data, void *userdata) { struct eeh_dev *edev = (struct eeh_dev *)data; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); struct pci_driver *driver; if (!dev || eeh_dev_removed(edev)) return NULL; dev->error_state = pci_channel_io_perm_failure; driver = eeh_pcid_get(dev); if (!driver) return NULL; eeh_disable_irq(dev); if (!driver->err_handler || !driver->err_handler->error_detected) { eeh_pcid_put(dev); return NULL; } driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); eeh_pcid_put(dev); return NULL; }
/** * eeh_report_mmio_enabled - Tell drivers that MMIO has been enabled * @data: eeh device * @userdata: return value * * Tells each device driver that IO ports, MMIO and config space I/O * are now enabled. Collects up and merges the device driver responses. * Cumulative response passed back in "userdata". */ static void *eeh_report_mmio_enabled(void *data, void *userdata) { struct eeh_dev *edev = (struct eeh_dev *)data; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); enum pci_ers_result rc, *res = userdata; struct pci_driver *driver; if (!dev || eeh_dev_removed(edev)) return NULL; driver = eeh_pcid_get(dev); if (!driver) return NULL; if (!driver->err_handler || !driver->err_handler->mmio_enabled || (edev->mode & EEH_DEV_NO_HANDLER)) { eeh_pcid_put(dev); return NULL; } rc = driver->err_handler->mmio_enabled(dev); /* A driver that needs a reset trumps all others */ if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; if (*res == PCI_ERS_RESULT_NONE) *res = rc; eeh_pcid_put(dev); return NULL; }
/** * eeh_report_reset - Tell device that slot has been reset * @data: eeh device * @userdata: return value * * This routine must be called while EEH tries to reset particular * PCI device so that the associated PCI device driver could take * some actions, usually to save data the driver needs so that the * driver can work again while the device is recovered. */ static void *eeh_report_reset(void *data, void *userdata) { struct eeh_dev *edev = (struct eeh_dev *)data; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); enum pci_ers_result rc, *res = userdata; struct pci_driver *driver; if (!dev || eeh_dev_removed(edev)) return NULL; dev->error_state = pci_channel_io_normal; driver = eeh_pcid_get(dev); if (!driver) return NULL; eeh_enable_irq(dev); if (!driver->err_handler || !driver->err_handler->slot_reset || (edev->mode & EEH_DEV_NO_HANDLER)) { eeh_pcid_put(dev); return NULL; } rc = driver->err_handler->slot_reset(dev); if ((*res == PCI_ERS_RESULT_NONE) || (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc; if (*res == PCI_ERS_RESULT_DISCONNECT && rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; eeh_pcid_put(dev); return NULL; }
/** * eeh_report_error - Report pci error to each device driver * @data: eeh device * @userdata: return value * * Report an EEH error to each device driver, collect up and * merge the device driver responses. Cumulative response * passed back in "userdata". */ static void *eeh_report_error(void *data, void *userdata) { struct eeh_dev *edev = (struct eeh_dev *)data; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); enum pci_ers_result rc, *res = userdata; struct pci_driver *driver; if (!dev || eeh_dev_removed(edev)) return NULL; dev->error_state = pci_channel_io_frozen; driver = eeh_pcid_get(dev); if (!driver) return NULL; eeh_disable_irq(dev); if (!driver->err_handler || !driver->err_handler->error_detected) { eeh_pcid_put(dev); return NULL; } rc = driver->err_handler->error_detected(dev, pci_channel_io_frozen); /* A driver that needs a reset trumps all others */ if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; if (*res == PCI_ERS_RESULT_NONE) *res = rc; eeh_pcid_put(dev); return NULL; }
static void *eeh_rmv_device(void *data, void *userdata) { struct pci_driver *driver; struct eeh_dev *edev = (struct eeh_dev *)data; struct pci_dev *dev = eeh_dev_to_pci_dev(edev); int *removed = (int *)userdata; /* * Actually, we should remove the PCI bridges as well. * However, that's lots of complexity to do that, * particularly some of devices under the bridge might * support EEH. So we just care about PCI devices for * simplicity here. */ if (!dev || (dev->hdr_type & PCI_HEADER_TYPE_BRIDGE)) return NULL; /* * We rely on count-based pcibios_release_device() to * detach permanently offlined PEs. Unfortunately, that's * not reliable enough. We might have the permanently * offlined PEs attached, but we needn't take care of * them and their child devices. */ if (eeh_dev_removed(edev)) return NULL; driver = eeh_pcid_get(dev); if (driver) { eeh_pcid_put(dev); if (driver->err_handler) return NULL; } /* Remove it from PCI subsystem */ pr_debug("EEH: Removing %s without EEH sensitive driver\n", pci_name(dev)); edev->bus = dev->bus; edev->mode |= EEH_DEV_DISCONNECTED; (*removed)++; pci_lock_rescan_remove(); pci_stop_and_remove_bus_device(dev); pci_unlock_rescan_remove(); return NULL; }
/** * ioda_eeh_reset - Reset the indicated PE * @pe: EEH PE * @option: reset option * * Do reset on the indicated PE. For PCI bus sensitive PE, * we need to reset the parent p2p bridge. The PHB has to * be reinitialized if the p2p bridge is root bridge. For * PCI device sensitive PE, we will try to reset the device * through FLR. For now, we don't have OPAL APIs to do HARD * reset yet, so all reset would be SOFT (HOT) reset. */ static int ioda_eeh_reset(struct eeh_pe *pe, int option) { struct pci_controller *hose = pe->phb; struct eeh_dev *edev; struct pci_dev *dev; int ret; /* * Anyway, we have to clear the problematic state for the * corresponding PE. However, we needn't do it if the PE * is PHB associated. That means the PHB is having fatal * errors and it needs reset. Further more, the AIB interface * isn't reliable any more. */ if (!(pe->type & EEH_PE_PHB) && (option == EEH_RESET_HOT || option == EEH_RESET_FUNDAMENTAL)) { ret = ioda_eeh_pe_clear(pe); if (ret) return -EIO; } /* * The rules applied to reset, either fundamental or hot reset: * * We always reset the direct upstream bridge of the PE. If the * direct upstream bridge isn't root bridge, we always take hot * reset no matter what option (fundamental or hot) is. Otherwise, * we should do the reset according to the required option. */ if (pe->type & EEH_PE_PHB) { ret = ioda_eeh_phb_reset(hose, option); } else { if (pe->type & EEH_PE_DEVICE) { /* * If it's device PE, we didn't refer to the parent * PCI bus yet. So we have to figure it out indirectly. */ edev = list_first_entry(&pe->edevs, struct eeh_dev, list); dev = eeh_dev_to_pci_dev(edev); dev = dev->bus->self; } else {
struct eeh_dev *handle_eeh_events(struct eeh_event *event) { struct device_node *frozen_dn; struct eeh_dev *frozen_edev; struct pci_bus *frozen_bus; int rc = 0; enum pci_ers_result result = PCI_ERS_RESULT_NONE; const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str; frozen_dn = eeh_find_device_pe(eeh_dev_to_of_node(event->edev)); if (!frozen_dn) { location = of_get_property(eeh_dev_to_of_node(event->edev), "ibm,loc-code", NULL); location = location ? location : "unknown"; printk(KERN_ERR "EEH: Error: Cannot find partition endpoint " "for location=%s pci addr=%s\n", location, eeh_pci_name(eeh_dev_to_pci_dev(event->edev))); return NULL; } frozen_bus = pcibios_find_pci_bus(frozen_dn); location = of_get_property(frozen_dn, "ibm,loc-code", NULL); location = location ? location : "unknown"; if (!frozen_bus) frozen_bus = pcibios_find_pci_bus(frozen_dn->parent); if (!frozen_bus) { printk(KERN_ERR "EEH: Cannot find PCI bus " "for location=%s dn=%s\n", location, frozen_dn->full_name); return NULL; } frozen_edev = of_node_to_eeh_dev(frozen_dn); frozen_edev->freeze_count++; pci_str = eeh_pci_name(eeh_dev_to_pci_dev(event->edev)); drv_str = eeh_pcid_name(eeh_dev_to_pci_dev(event->edev)); if (frozen_edev->freeze_count > EEH_MAX_ALLOWED_FREEZES) goto excess_failures; printk(KERN_WARNING "EEH: This PCI device has failed %d times in the last hour:\n", frozen_edev->freeze_count); if (frozen_edev->pdev) { bus_pci_str = pci_name(frozen_edev->pdev); bus_drv_str = eeh_pcid_name(frozen_edev->pdev); printk(KERN_WARNING "EEH: Bus location=%s driver=%s pci addr=%s\n", location, bus_drv_str, bus_pci_str); } printk(KERN_WARNING "EEH: Device location=%s driver=%s pci addr=%s\n", location, drv_str, pci_str); pci_walk_bus(frozen_bus, eeh_report_error, &result); rc = eeh_ops->wait_state(eeh_dev_to_of_node(frozen_edev), MAX_WAIT_FOR_RECOVERY*1000); if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { printk(KERN_WARNING "EEH: Permanent failure\n"); goto hard_fail; } eeh_slot_error_detail(frozen_edev, EEH_LOG_TEMP); if (result == PCI_ERS_RESULT_NONE) { rc = eeh_reset_device(frozen_edev, frozen_bus); if (rc) { printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc); goto hard_fail; } } if (result == PCI_ERS_RESULT_CAN_RECOVER) { rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_MMIO); if (rc < 0) goto hard_fail; if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { result = PCI_ERS_RESULT_NONE; pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result); } } if (result == PCI_ERS_RESULT_CAN_RECOVER) { rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_DMA); if (rc < 0) goto hard_fail; if (rc) result = PCI_ERS_RESULT_NEED_RESET; else result = PCI_ERS_RESULT_RECOVERED; } if (result == PCI_ERS_RESULT_DISCONNECT) { printk(KERN_WARNING "EEH: Device driver gave up\n"); goto hard_fail; } if (result == PCI_ERS_RESULT_NEED_RESET) { rc = eeh_reset_device(frozen_edev, NULL); if (rc) { printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc); goto hard_fail; } result = PCI_ERS_RESULT_NONE; pci_walk_bus(frozen_bus, eeh_report_reset, &result); } if ((result != PCI_ERS_RESULT_RECOVERED) && (result != PCI_ERS_RESULT_NONE)) { printk(KERN_WARNING "EEH: Not recovered\n"); goto hard_fail; } pci_walk_bus(frozen_bus, eeh_report_resume, NULL); return frozen_edev; excess_failures: printk(KERN_ERR "EEH: PCI device at location=%s driver=%s pci addr=%s\n" "has failed %d times in the last hour " "and has been permanently disabled.\n" "Please try reseating this device or replacing it.\n", location, drv_str, pci_str, frozen_edev->freeze_count); goto perm_error; hard_fail: printk(KERN_ERR "EEH: Unable to recover from failure of PCI device " "at location=%s driver=%s pci addr=%s\n" "Please try reseating this device or replacing it.\n", location, drv_str, pci_str); perm_error: eeh_slot_error_detail(frozen_edev, EEH_LOG_PERM); pci_walk_bus(frozen_bus, eeh_report_failure, NULL); pcibios_remove_pci_devices(frozen_bus); return NULL; }
/** * eeh_dev_check_failure - Check if all 1's data is due to EEH slot freeze * @edev: eeh device * * Check for an EEH failure for the given device node. Call this * routine if the result of a read was all 0xff's and you want to * find out if this is due to an EEH slot freeze. This routine * will query firmware for the EEH status. * * Returns 0 if there has not been an EEH error; otherwise returns * a non-zero value and queues up a slot isolation event notification. * * It is safe to call this routine in an interrupt context. */ int eeh_dev_check_failure(struct eeh_dev *edev) { int ret; int active_flags = (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE); unsigned long flags; struct pci_dn *pdn; struct pci_dev *dev; struct eeh_pe *pe, *parent_pe, *phb_pe; int rc = 0; const char *location = NULL; eeh_stats.total_mmio_ffs++; if (!eeh_enabled()) return 0; if (!edev) { eeh_stats.no_dn++; return 0; } dev = eeh_dev_to_pci_dev(edev); pe = eeh_dev_to_pe(edev); /* Access to IO BARs might get this far and still not want checking. */ if (!pe) { eeh_stats.ignored_check++; pr_debug("EEH: Ignored check for %s\n", eeh_pci_name(dev)); return 0; } if (!pe->addr && !pe->config_addr) { eeh_stats.no_cfg_addr++; return 0; } /* * On PowerNV platform, we might already have fenced PHB * there and we need take care of that firstly. */ ret = eeh_phb_check_failure(pe); if (ret > 0) return ret; /* * If the PE isn't owned by us, we shouldn't check the * state. Instead, let the owner handle it if the PE has * been frozen. */ if (eeh_pe_passed(pe)) return 0; /* If we already have a pending isolation event for this * slot, we know it's bad already, we don't need to check. * Do this checking under a lock; as multiple PCI devices * in one slot might report errors simultaneously, and we * only want one error recovery routine running. */ eeh_serialize_lock(&flags); rc = 1; if (pe->state & EEH_PE_ISOLATED) { pe->check_count++; if (pe->check_count % EEH_MAX_FAILS == 0) { pdn = eeh_dev_to_pdn(edev); if (pdn->node) location = of_get_property(pdn->node, "ibm,loc-code", NULL); printk(KERN_ERR "EEH: %d reads ignored for recovering device at " "location=%s driver=%s pci addr=%s\n", pe->check_count, location ? location : "unknown", eeh_driver_name(dev), eeh_pci_name(dev)); printk(KERN_ERR "EEH: Might be infinite loop in %s driver\n", eeh_driver_name(dev)); dump_stack(); } goto dn_unlock; } /* * Now test for an EEH failure. This is VERY expensive. * Note that the eeh_config_addr may be a parent device * in the case of a device behind a bridge, or it may be * function zero of a multi-function device. * In any case they must share a common PHB. */ ret = eeh_ops->get_state(pe, NULL); /* Note that config-io to empty slots may fail; * they are empty when they don't have children. * We will punt with the following conditions: Failure to get * PE's state, EEH not support and Permanently unavailable * state, PE is in good state. */ if ((ret < 0) || (ret == EEH_STATE_NOT_SUPPORT) || ((ret & active_flags) == active_flags)) { eeh_stats.false_positives++; pe->false_positives++; rc = 0; goto dn_unlock; } /* * It should be corner case that the parent PE has been * put into frozen state as well. We should take care * that at first. */ parent
/** * eeh_handle_event - Reset a PCI device after hard lockup. * @event: EEH event * * While PHB detects address or data parity errors on particular PCI * slot, the associated PE will be frozen. Besides, DMA's occurring * to wild addresses (which usually happen due to bugs in device * drivers or in PCI adapter firmware) can cause EEH error. #SERR, * #PERR or other misc PCI-related errors also can trigger EEH errors. * * Recovery process consists of unplugging the device driver (which * generated hotplug events to userspace), then issuing a PCI #RST to * the device, then reconfiguring the PCI config space for all bridges * & devices under this slot, and then finally restarting the device * drivers (which cause a second set of hotplug events to go out to * userspace). */ struct eeh_dev *handle_eeh_events(struct eeh_event *event) { struct device_node *frozen_dn; struct eeh_dev *frozen_edev; struct pci_bus *frozen_bus; int rc = 0; enum pci_ers_result result = PCI_ERS_RESULT_NONE; const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str; frozen_dn = eeh_find_device_pe(eeh_dev_to_of_node(event->edev)); if (!frozen_dn) { location = of_get_property(eeh_dev_to_of_node(event->edev), "ibm,loc-code", NULL); location = location ? location : "unknown"; printk(KERN_ERR "EEH: Error: Cannot find partition endpoint " "for location=%s pci addr=%s\n", location, eeh_pci_name(eeh_dev_to_pci_dev(event->edev))); return NULL; } frozen_bus = pcibios_find_pci_bus(frozen_dn); location = of_get_property(frozen_dn, "ibm,loc-code", NULL); location = location ? location : "unknown"; /* There are two different styles for coming up with the PE. * In the old style, it was the highest EEH-capable device * which was always an EADS pci bridge. In the new style, * there might not be any EADS bridges, and even when there are, * the firmware marks them as "EEH incapable". So another * two-step is needed to find the pci bus.. */ if (!frozen_bus) frozen_bus = pcibios_find_pci_bus(frozen_dn->parent); if (!frozen_bus) { printk(KERN_ERR "EEH: Cannot find PCI bus " "for location=%s dn=%s\n", location, frozen_dn->full_name); return NULL; } frozen_edev = of_node_to_eeh_dev(frozen_dn); frozen_edev->freeze_count++; pci_str = eeh_pci_name(eeh_dev_to_pci_dev(event->edev)); drv_str = eeh_pcid_name(eeh_dev_to_pci_dev(event->edev)); if (frozen_edev->freeze_count > EEH_MAX_ALLOWED_FREEZES) goto excess_failures; printk(KERN_WARNING "EEH: This PCI device has failed %d times in the last hour:\n", frozen_edev->freeze_count); if (frozen_edev->pdev) { bus_pci_str = pci_name(frozen_edev->pdev); bus_drv_str = eeh_pcid_name(frozen_edev->pdev); printk(KERN_WARNING "EEH: Bus location=%s driver=%s pci addr=%s\n", location, bus_drv_str, bus_pci_str); } printk(KERN_WARNING "EEH: Device location=%s driver=%s pci addr=%s\n", location, drv_str, pci_str); /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs * to accomplish the reset. Each child gets a report of the * status ... if any child can't handle the reset, then the entire * slot is dlpar removed and added. */ pci_walk_bus(frozen_bus, eeh_report_error, &result); /* Get the current PCI slot state. This can take a long time, * sometimes over 3 seconds for certain systems. */ rc = eeh_ops->wait_state(eeh_dev_to_of_node(frozen_edev), MAX_WAIT_FOR_RECOVERY*1000); if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) { printk(KERN_WARNING "EEH: Permanent failure\n"); goto hard_fail; } /* Since rtas may enable MMIO when posting the error log, * don't post the error log until after all dev drivers * have been informed. */ eeh_slot_error_detail(frozen_edev, EEH_LOG_TEMP); /* If all device drivers were EEH-unaware, then shut * down all of the device drivers, and hope they * go down willingly, without panicing the system. */ if (result == PCI_ERS_RESULT_NONE) { rc = eeh_reset_device(frozen_edev, frozen_bus); if (rc) { printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc); goto hard_fail; } } /* If all devices reported they can proceed, then re-enable MMIO */ if (result == PCI_ERS_RESULT_CAN_RECOVER) { rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_MMIO); if (rc < 0) goto hard_fail; if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { result = PCI_ERS_RESULT_NONE; pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result); } } /* If all devices reported they can proceed, then re-enable DMA */ if (result == PCI_ERS_RESULT_CAN_RECOVER) { rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_DMA); if (rc < 0) goto hard_fail; if (rc) result = PCI_ERS_RESULT_NEED_RESET; else result = PCI_ERS_RESULT_RECOVERED; } /* If any device has a hard failure, then shut off everything. */ if (result == PCI_ERS_RESULT_DISCONNECT) { printk(KERN_WARNING "EEH: Device driver gave up\n"); goto hard_fail; } /* If any device called out for a reset, then reset the slot */ if (result == PCI_ERS_RESULT_NEED_RESET) { rc = eeh_reset_device(frozen_edev, NULL); if (rc) { printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc); goto hard_fail; } result = PCI_ERS_RESULT_NONE; pci_walk_bus(frozen_bus, eeh_report_reset, &result); } /* All devices should claim they have recovered by now. */ if ((result != PCI_ERS_RESULT_RECOVERED) && (result != PCI_ERS_RESULT_NONE)) { printk(KERN_WARNING "EEH: Not recovered\n"); goto hard_fail; } /* Tell all device drivers that they can resume operations */ pci_walk_bus(frozen_bus, eeh_report_resume, NULL); return frozen_edev; excess_failures: /* * About 90% of all real-life EEH failures in the field * are due to poorly seated PCI cards. Only 10% or so are * due to actual, failed cards. */ printk(KERN_ERR "EEH: PCI device at location=%s driver=%s pci addr=%s\n" "has failed %d times in the last hour " "and has been permanently disabled.\n" "Please try reseating this device or replacing it.\n", location, drv_str, pci_str, frozen_edev->freeze_count); goto perm_error; hard_fail: printk(KERN_ERR "EEH: Unable to recover from failure of PCI device " "at location=%s driver=%s pci addr=%s\n" "Please try reseating this device or replacing it.\n", location, drv_str, pci_str); perm_error: eeh_slot_error_detail(frozen_edev, EEH_LOG_PERM); /* Notify all devices that they're about to go down. */ pci_walk_bus(frozen_bus, eeh_report_failure, NULL); /* Shut down the device drivers for good. */ pcibios_remove_pci_devices(frozen_bus); return NULL; }