/**
 * eeh_slot_error_detail - Generate combined log including driver log and error log
 * @pe: EEH PE
 * @severity: temporary or permanent error log
 *
 * This routine should be called to generate the combined log, which
 * is comprised of driver log and error log. The driver log is figured
 * out from the config space of the corresponding PCI device, while
 * the error log is fetched through platform dependent function call.
 */
void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
{
	size_t loglen = 0;
	struct eeh_dev *edev, *tmp;

	/*
	 * When the PHB is fenced or dead, it's pointless to collect
	 * the data from PCI config space because it should return
	 * 0xFF's. For ER, we still retrieve the data from the PCI
	 * config space.
	 *
	 * For pHyp, we have to enable IO for log retrieval. Otherwise,
	 * 0xFF's is always returned from PCI config space.
	 */
	if (!(pe->type & EEH_PE_PHB)) {
		if (eeh_has_flag(EEH_ENABLE_IO_FOR_LOG))
			eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
		eeh_ops->configure_bridge(pe);
		eeh_pe_restore_bars(pe);

		pci_regs_buf[0] = 0;
		eeh_pe_for_each_dev(pe, edev, tmp) {
			loglen += eeh_gather_pci_data(edev, pci_regs_buf + loglen,
						      EEH_PCI_REGS_LOG_LEN - loglen);
		}
	}

	eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
}
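The buffer and per-device dump helper referenced above are defined elsewhere in this file and are not shown in this excerpt. For a self-contained reading, here is a minimal sketch of their likely shapes; the buffer size and exact prototypes are assumptions, not the file's actual definitions.

/*
 * Sketch only: assumed shapes of the externals used by
 * eeh_slot_error_detail(). The buffer size below is illustrative;
 * eeh_gather_pci_data() is assumed to append one device's config-space
 * dump to @buf and return the number of bytes written.
 */
#define EEH_PCI_REGS_LOG_LEN	8192
static char pci_regs_buf[EEH_PCI_REGS_LOG_LEN];

static size_t eeh_gather_pci_data(struct eeh_dev *edev, char *buf, size_t len);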
/*
 * Explicitly clear PE's frozen state for PowerNV where
 * we have frozen PE until BAR restore is completed. It's
 * harmless to clear it for pSeries. To be consistent with
 * PE reset (for 3 times), we try to clear the frozen state
 * for 3 times as well.
 */
static void *__eeh_clear_pe_frozen_state(void *data, void *flag)
{
	struct eeh_pe *pe = (struct eeh_pe *)data;
	int i, rc;

	for (i = 0; i < 3; i++) {
		rc = eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);
		if (rc)
			continue;
		rc = eeh_pci_enable(pe, EEH_OPT_THAW_DMA);
		if (!rc)
			break;
	}

	/* The PE has been isolated, clear it */
	if (rc) {
		pr_warn("%s: Can't clear frozen PHB#%x-PE#%x (%d)\n",
			__func__, pe->phb->global_number, pe->addr, rc);
		return (void *)pe;
	}

	return NULL;
}
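Given the (void *data, void *flag) signature, __eeh_clear_pe_frozen_state() looks like a PE-traversal callback rather than something called directly. Below is a minimal sketch of a wrapper that drives it over the whole PE tree; eeh_pe_traverse() and eeh_pe_state_clear() are assumed to behave as they do elsewhere in this code (a non-NULL callback return stops the walk), so treat the wrapper as illustrative rather than the canonical implementation.

/*
 * Sketch only: drive __eeh_clear_pe_frozen_state() across the PE and
 * its children. eeh_pe_traverse() and eeh_pe_state_clear() are assumed
 * to exist with these shapes; a non-NULL return from the callback is
 * taken to mean "stop, this PE could not be thawed".
 */
static int eeh_clear_pe_frozen_state(struct eeh_pe *pe)
{
	void *rc;

	rc = eeh_pe_traverse(pe, __eeh_clear_pe_frozen_state, NULL);
	if (!rc)
		eeh_pe_state_clear(pe, EEH_PE_ISOLATED);

	return rc ? -EIO : 0;
}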
/**
 * eeh_slot_error_detail - Generate combined log including driver log and error log
 * @pe: EEH PE
 * @severity: temporary or permanent error log
 *
 * This routine should be called to generate the combined log, which
 * is comprised of driver log and error log. The driver log is figured
 * out from the config space of the corresponding PCI device, while
 * the error log is fetched through platform dependent function call.
 */
void eeh_slot_error_detail(struct eeh_pe *pe, int severity)
{
	size_t loglen = 0;

	/*
	 * When the PHB is fenced or dead, it's pointless to collect
	 * the data from PCI config space because it should return
	 * 0xFF's. For ER, we still retrieve the data from the PCI
	 * config space.
	 *
	 * For pHyp, we have to enable IO for log retrieval. Otherwise,
	 * 0xFF's is always returned from PCI config space.
	 */
	if (!(pe->type & EEH_PE_PHB)) {
		if (eeh_has_flag(EEH_ENABLE_IO_FOR_LOG))
			eeh_pci_enable(pe, EEH_OPT_THAW_MMIO);

		/*
		 * The config space of some PCI devices can't be accessed
		 * when their PEs are in frozen state. Otherwise, fenced
		 * PHB might be seen. Those PEs are identified with flag
		 * EEH_PE_CFG_RESTRICTED, indicating EEH_PE_CFG_BLOCKED
		 * is set automatically when the PE is put to EEH_PE_ISOLATED.
		 *
		 * Restoring BARs possibly triggers PCI config access in
		 * (OPAL) firmware and then causes fenced PHB. If the
		 * PCI config is blocked with flag EEH_PE_CFG_BLOCKED, it's
		 * pointless to restore BARs and dump config space.
		 */
		eeh_ops->configure_bridge(pe);
		if (!(pe->state & EEH_PE_CFG_BLOCKED)) {
			eeh_pe_restore_bars(pe);

			pci_regs_buf[0] = 0;
			eeh_pe_traverse(pe, eeh_dump_pe_log, &loglen);
		}
	}

	eeh_ops->get_log(pe, severity, pci_regs_buf, loglen);
}
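This later version replaces the inline per-device loop with an eeh_dump_pe_log() traversal callback whose body is not shown in this excerpt. A plausible sketch, assuming it simply mirrors the per-device loop from the earlier version above (accumulating the dump length through the flag pointer), looks like this:

/*
 * Sketch only: a traversal callback matching the eeh_pe_traverse()
 * call above. It is assumed to do what the inline loop in the earlier
 * version did: append each device's config-space dump to pci_regs_buf
 * and accumulate the total length through the flag pointer.
 */
static void *eeh_dump_pe_log(void *data, void *flag)
{
	struct eeh_pe *pe = data;
	struct eeh_dev *edev, *tmp;
	size_t *plen = flag;

	eeh_pe_for_each_dev(pe, edev, tmp)
		*plen += eeh_gather_pci_data(edev, pci_regs_buf + *plen,
					     EEH_PCI_REGS_LOG_LEN - *plen);

	return NULL;
}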
struct eeh_dev *handle_eeh_events(struct eeh_event *event)
{
	struct device_node *frozen_dn;
	struct eeh_dev *frozen_edev;
	struct pci_bus *frozen_bus;
	int rc = 0;
	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
	const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str;

	frozen_dn = eeh_find_device_pe(eeh_dev_to_of_node(event->edev));
	if (!frozen_dn) {
		location = of_get_property(eeh_dev_to_of_node(event->edev),
					   "ibm,loc-code", NULL);
		location = location ? location : "unknown";
		printk(KERN_ERR "EEH: Error: Cannot find partition endpoint "
			"for location=%s pci addr=%s\n",
			location,
			eeh_pci_name(eeh_dev_to_pci_dev(event->edev)));
		return NULL;
	}

	frozen_bus = pcibios_find_pci_bus(frozen_dn);
	location = of_get_property(frozen_dn, "ibm,loc-code", NULL);
	location = location ? location : "unknown";

	if (!frozen_bus)
		frozen_bus = pcibios_find_pci_bus(frozen_dn->parent);

	if (!frozen_bus) {
		printk(KERN_ERR "EEH: Cannot find PCI bus "
			"for location=%s dn=%s\n",
			location, frozen_dn->full_name);
		return NULL;
	}

	frozen_edev = of_node_to_eeh_dev(frozen_dn);
	frozen_edev->freeze_count++;
	pci_str = eeh_pci_name(eeh_dev_to_pci_dev(event->edev));
	drv_str = eeh_pcid_name(eeh_dev_to_pci_dev(event->edev));

	if (frozen_edev->freeze_count > EEH_MAX_ALLOWED_FREEZES)
		goto excess_failures;

	printk(KERN_WARNING
		"EEH: This PCI device has failed %d times in the last hour:\n",
		frozen_edev->freeze_count);

	if (frozen_edev->pdev) {
		bus_pci_str = pci_name(frozen_edev->pdev);
		bus_drv_str = eeh_pcid_name(frozen_edev->pdev);
		printk(KERN_WARNING
			"EEH: Bus location=%s driver=%s pci addr=%s\n",
			location, bus_drv_str, bus_pci_str);
	}

	printk(KERN_WARNING
		"EEH: Device location=%s driver=%s pci addr=%s\n",
		location, drv_str, pci_str);

	pci_walk_bus(frozen_bus, eeh_report_error, &result);

	rc = eeh_ops->wait_state(eeh_dev_to_of_node(frozen_edev),
				 MAX_WAIT_FOR_RECOVERY*1000);
	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
		printk(KERN_WARNING "EEH: Permanent failure\n");
		goto hard_fail;
	}

	eeh_slot_error_detail(frozen_edev, EEH_LOG_TEMP);

	if (result == PCI_ERS_RESULT_NONE) {
		rc = eeh_reset_device(frozen_edev, frozen_bus);
		if (rc) {
			printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
			goto hard_fail;
		}
	}

	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
		rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_MMIO);
		if (rc < 0)
			goto hard_fail;
		if (rc) {
			result = PCI_ERS_RESULT_NEED_RESET;
		} else {
			result = PCI_ERS_RESULT_NONE;
			pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result);
		}
	}

	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
		rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_DMA);
		if (rc < 0)
			goto hard_fail;
		if (rc)
			result = PCI_ERS_RESULT_NEED_RESET;
		else
			result = PCI_ERS_RESULT_RECOVERED;
	}

	if (result == PCI_ERS_RESULT_DISCONNECT) {
		printk(KERN_WARNING "EEH: Device driver gave up\n");
		goto hard_fail;
	}

	if (result == PCI_ERS_RESULT_NEED_RESET) {
		rc = eeh_reset_device(frozen_edev, NULL);
		if (rc) {
			printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
			goto hard_fail;
		}
		result = PCI_ERS_RESULT_NONE;
		pci_walk_bus(frozen_bus, eeh_report_reset, &result);
	}

	if ((result != PCI_ERS_RESULT_RECOVERED) &&
	    (result != PCI_ERS_RESULT_NONE)) {
		printk(KERN_WARNING "EEH: Not recovered\n");
		goto hard_fail;
	}

	pci_walk_bus(frozen_bus, eeh_report_resume, NULL);

	return frozen_edev;

excess_failures:
	printk(KERN_ERR
		"EEH: PCI device at location=%s driver=%s pci addr=%s\n"
		"has failed %d times in the last hour "
		"and has been permanently disabled.\n"
		"Please try reseating this device or replacing it.\n",
		location, drv_str, pci_str, frozen_edev->freeze_count);
	goto perm_error;

hard_fail:
	printk(KERN_ERR
		"EEH: Unable to recover from failure of PCI device "
		"at location=%s driver=%s pci addr=%s\n"
		"Please try reseating this device or replacing it.\n",
		location, drv_str, pci_str);

perm_error:
	eeh_slot_error_detail(frozen_edev, EEH_LOG_PERM);

	pci_walk_bus(frozen_bus, eeh_report_failure, NULL);

	pcibios_remove_pci_devices(frozen_bus);

	return NULL;
}
/**
 * handle_eeh_events - Reset a PCI device after hard lockup.
 * @event: EEH event
 *
 * When the PHB detects address or data parity errors on a particular PCI
 * slot, the associated PE will be frozen. Besides, DMAs occurring
 * to wild addresses (which usually happen due to bugs in device
 * drivers or in PCI adapter firmware) can cause EEH errors. #SERR,
 * #PERR or other misc PCI-related errors can also trigger EEH errors.
 *
 * Recovery process consists of unplugging the device driver (which
 * generated hotplug events to userspace), then issuing a PCI #RST to
 * the device, then reconfiguring the PCI config space for all bridges
 * & devices under this slot, and then finally restarting the device
 * drivers (which cause a second set of hotplug events to go out to
 * userspace).
 */
struct eeh_dev *handle_eeh_events(struct eeh_event *event)
{
	struct device_node *frozen_dn;
	struct eeh_dev *frozen_edev;
	struct pci_bus *frozen_bus;
	int rc = 0;
	enum pci_ers_result result = PCI_ERS_RESULT_NONE;
	const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str;

	frozen_dn = eeh_find_device_pe(eeh_dev_to_of_node(event->edev));
	if (!frozen_dn) {
		location = of_get_property(eeh_dev_to_of_node(event->edev),
					   "ibm,loc-code", NULL);
		location = location ? location : "unknown";
		printk(KERN_ERR "EEH: Error: Cannot find partition endpoint "
			"for location=%s pci addr=%s\n",
			location,
			eeh_pci_name(eeh_dev_to_pci_dev(event->edev)));
		return NULL;
	}

	frozen_bus = pcibios_find_pci_bus(frozen_dn);
	location = of_get_property(frozen_dn, "ibm,loc-code", NULL);
	location = location ? location : "unknown";

	/* There are two different styles for coming up with the PE.
	 * In the old style, it was the highest EEH-capable device
	 * which was always an EADS pci bridge. In the new style,
	 * there might not be any EADS bridges, and even when there are,
	 * the firmware marks them as "EEH incapable". So another
	 * two-step is needed to find the PCI bus.
	 */
	if (!frozen_bus)
		frozen_bus = pcibios_find_pci_bus(frozen_dn->parent);

	if (!frozen_bus) {
		printk(KERN_ERR "EEH: Cannot find PCI bus "
			"for location=%s dn=%s\n",
			location, frozen_dn->full_name);
		return NULL;
	}

	frozen_edev = of_node_to_eeh_dev(frozen_dn);
	frozen_edev->freeze_count++;
	pci_str = eeh_pci_name(eeh_dev_to_pci_dev(event->edev));
	drv_str = eeh_pcid_name(eeh_dev_to_pci_dev(event->edev));

	if (frozen_edev->freeze_count > EEH_MAX_ALLOWED_FREEZES)
		goto excess_failures;

	printk(KERN_WARNING
		"EEH: This PCI device has failed %d times in the last hour:\n",
		frozen_edev->freeze_count);

	if (frozen_edev->pdev) {
		bus_pci_str = pci_name(frozen_edev->pdev);
		bus_drv_str = eeh_pcid_name(frozen_edev->pdev);
		printk(KERN_WARNING
			"EEH: Bus location=%s driver=%s pci addr=%s\n",
			location, bus_drv_str, bus_pci_str);
	}

	printk(KERN_WARNING
		"EEH: Device location=%s driver=%s pci addr=%s\n",
		location, drv_str, pci_str);

	/* Walk the various device drivers attached to this slot through
	 * a reset sequence, giving each an opportunity to do what it needs
	 * to accomplish the reset. Each child gets a report of the
	 * status ... if any child can't handle the reset, then the entire
	 * slot is dlpar removed and added.
	 */
	pci_walk_bus(frozen_bus, eeh_report_error, &result);

	/* Get the current PCI slot state. This can take a long time,
	 * sometimes over 3 seconds for certain systems.
	 */
	rc = eeh_ops->wait_state(eeh_dev_to_of_node(frozen_edev),
				 MAX_WAIT_FOR_RECOVERY*1000);
	if (rc < 0 || rc == EEH_STATE_NOT_SUPPORT) {
		printk(KERN_WARNING "EEH: Permanent failure\n");
		goto hard_fail;
	}

	/* Since rtas may enable MMIO when posting the error log,
	 * don't post the error log until after all dev drivers
	 * have been informed.
	 */
	eeh_slot_error_detail(frozen_edev, EEH_LOG_TEMP);

	/* If all device drivers were EEH-unaware, then shut
	 * down all of the device drivers, and hope they
	 * go down willingly, without panicking the system.
	 */
	if (result == PCI_ERS_RESULT_NONE) {
		rc = eeh_reset_device(frozen_edev, frozen_bus);
		if (rc) {
			printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc);
			goto hard_fail;
		}
	}

	/* If all devices reported they can proceed, then re-enable MMIO */
	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
		rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_MMIO);
		if (rc < 0)
			goto hard_fail;
		if (rc) {
			result = PCI_ERS_RESULT_NEED_RESET;
		} else {
			result = PCI_ERS_RESULT_NONE;
			pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result);
		}
	}

	/* If all devices reported they can proceed, then re-enable DMA */
	if (result == PCI_ERS_RESULT_CAN_RECOVER) {
		rc = eeh_pci_enable(frozen_edev, EEH_OPT_THAW_DMA);
		if (rc < 0)
			goto hard_fail;
		if (rc)
			result = PCI_ERS_RESULT_NEED_RESET;
		else
			result = PCI_ERS_RESULT_RECOVERED;
	}

	/* If any device has a hard failure, then shut off everything. */
	if (result == PCI_ERS_RESULT_DISCONNECT) {
		printk(KERN_WARNING "EEH: Device driver gave up\n");
		goto hard_fail;
	}

	/* If any device called out for a reset, then reset the slot */
	if (result == PCI_ERS_RESULT_NEED_RESET) {
		rc = eeh_reset_device(frozen_edev, NULL);
		if (rc) {
			printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc);
			goto hard_fail;
		}
		result = PCI_ERS_RESULT_NONE;
		pci_walk_bus(frozen_bus, eeh_report_reset, &result);
	}

	/* All devices should claim they have recovered by now. */
	if ((result != PCI_ERS_RESULT_RECOVERED) &&
	    (result != PCI_ERS_RESULT_NONE)) {
		printk(KERN_WARNING "EEH: Not recovered\n");
		goto hard_fail;
	}

	/* Tell all device drivers that they can resume operations */
	pci_walk_bus(frozen_bus, eeh_report_resume, NULL);

	return frozen_edev;

excess_failures:
	/*
	 * About 90% of all real-life EEH failures in the field
	 * are due to poorly seated PCI cards. Only 10% or so are
	 * due to actual, failed cards.
	 */
	printk(KERN_ERR
		"EEH: PCI device at location=%s driver=%s pci addr=%s\n"
		"has failed %d times in the last hour "
		"and has been permanently disabled.\n"
		"Please try reseating this device or replacing it.\n",
		location, drv_str, pci_str, frozen_edev->freeze_count);
	goto perm_error;

hard_fail:
	printk(KERN_ERR
		"EEH: Unable to recover from failure of PCI device "
		"at location=%s driver=%s pci addr=%s\n"
		"Please try reseating this device or replacing it.\n",
		location, drv_str, pci_str);

perm_error:
	eeh_slot_error_detail(frozen_edev, EEH_LOG_PERM);

	/* Notify all devices that they're about to go down. */
	pci_walk_bus(frozen_bus, eeh_report_failure, NULL);

	/* Shut down the device drivers for good. */
	pcibios_remove_pci_devices(frozen_bus);

	return NULL;
}
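For context, handle_eeh_events() is normally invoked from the EEH event-handling thread, which drains a queue of struct eeh_event entries. The following is a minimal sketch of such a dispatcher; the eeh_eventlist/eeh_eventlist_lock names and the caller-frees-the-event convention are assumptions for illustration, not the actual eehd implementation.

/*
 * Sketch only: a simplified event-queue dispatcher for handle_eeh_events().
 * The eeh_eventlist/eeh_eventlist_lock names and the "caller frees the
 * event" convention are illustrative assumptions.
 */
static LIST_HEAD(eeh_eventlist);
static DEFINE_SPINLOCK(eeh_eventlist_lock);

static void eeh_handle_pending_events(void)
{
	struct eeh_event *event;
	unsigned long flags;

	for (;;) {
		/* Pop the next pending event, if any, under the list lock. */
		spin_lock_irqsave(&eeh_eventlist_lock, flags);
		event = list_first_entry_or_null(&eeh_eventlist,
						 struct eeh_event, list);
		if (event)
			list_del(&event->list);
		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);

		if (!event)
			break;

		/* Recover the frozen PE; a NULL return means it was given up on. */
		handle_eeh_events(event);
		kfree(event);
	}
}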