/** * Return the "partitionable endpoint" (pe) under which this device lies */ struct device_node * find_device_pe(struct device_node *dn) { while ((dn->parent) && PCI_DN(dn->parent) && (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { dn = dn->parent; } return dn; }
/** * eeh_enable_irq - enable interrupt for the recovering device */ static void eeh_enable_irq(struct pci_dev *dev) { struct device_node *dn = pci_device_to_OF_node(dev); if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) { PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED; enable_irq(dev->irq); } }
static void __eeh_clear_slot (struct device_node *dn, int mode_flag) { while (dn) { if (PCI_DN(dn)) { PCI_DN(dn)->eeh_mode &= ~mode_flag; PCI_DN(dn)->eeh_check_count = 0; if (dn->child) __eeh_clear_slot (dn->child, mode_flag); } dn = dn->sibling; } }
void eeh_clear_slot (struct device_node *dn, int mode_flag) { unsigned long flags; spin_lock_irqsave(&confirm_error_lock, flags); dn = find_device_pe (dn); /* Back up one, since config addrs might be shared */ if (PCI_DN(dn) && PCI_DN(dn)->eeh_pe_config_addr) dn = dn->parent; PCI_DN(dn)->eeh_mode &= ~mode_flag; PCI_DN(dn)->eeh_check_count = 0; __eeh_clear_slot (dn->child, mode_flag); spin_unlock_irqrestore(&confirm_error_lock, flags); }
static void eeh_report_error(struct pci_dev *dev, void *userdata) { enum pci_ers_result rc, *res = userdata; struct pci_driver *driver = dev->driver; dev->error_state = pci_channel_io_frozen; if (!driver) return; if (irq_in_use (dev->irq)) { struct device_node *dn = pci_device_to_OF_node(dev); PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED; disable_irq_nosync(dev->irq); } if (!driver->err_handler || !driver->err_handler->error_detected) return; rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen); /* A driver that needs a reset trumps all others */ if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; if (*res == PCI_ERS_RESULT_NONE) *res = rc; }
static void iommu_bus_setup_pSeries(struct pci_bus *bus) { struct device_node *dn; struct iommu_table *tbl; struct device_node *isa_dn, *isa_dn_orig; struct device_node *tmp; struct pci_dn *pci; int children; DBG("iommu_bus_setup_pSeries, bus %p, bus->self %p\n", bus, bus->self); dn = pci_bus_to_OF_node(bus); pci = PCI_DN(dn); if (bus->self) { /* This is not a root bus, any setup will be done for the * device-side of the bridge in iommu_dev_setup_pSeries(). */ return; } /* Check if the ISA bus on the system is under * this PHB. */ isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa"); while (isa_dn && isa_dn != dn) isa_dn = isa_dn->parent; if (isa_dn_orig) of_node_put(isa_dn_orig); /* Count number of direct PCI children of the PHB. * All PCI device nodes have class-code property, so it's * an easy way to find them. */ for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling) if (get_property(tmp, "class-code", NULL)) children++; DBG("Children: %d\n", children); /* Calculate amount of DMA window per slot. Each window must be * a power of two (due to pci_alloc_consistent requirements). * * Keep 256MB aside for PHBs with ISA. */ if (!isa_dn) { /* No ISA/IDE - just set window size and return */ pci->phb->dma_window_size = 0x80000000ul; /* To be divided */ while (pci->phb->dma_window_size * children > 0x80000000ul) pci->phb->dma_window_size >>= 1; DBG("No ISA/IDE, window size is 0x%lx\n", pci->phb->dma_window_size); pci->phb->dma_window_base_cur = 0; return; }
static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) { struct device_node *dn; int cnt, rc; /* pcibios will clear the counter; save the value */ cnt = pe_dn->eeh_freeze_count; if (bus) pcibios_remove_pci_devices(bus); /* Reset the pci controller. (Asserts RST#; resets config space). * Reconfigure bridges and devices. Don't try to bring the system * up if the reset failed for some reason. */ rc = rtas_set_slot_reset(pe_dn); if (rc) return rc; /* Walk over all functions on this device. */ dn = pe_dn->node; if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) dn = dn->parent->child; while (dn) { struct pci_dn *ppe = PCI_DN(dn); /* On Power4, always true because eeh_pe_config_addr=0 */ if (pe_dn->eeh_pe_config_addr == ppe->eeh_pe_config_addr) { rtas_configure_bridge(ppe); eeh_restore_bars(ppe); } dn = dn->sibling; } /* Give the system 5 seconds to finish running the user-space * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, * this is a hack, but if we don't do this, and try to bring * the device up before the scripts have taken it down, * potentially weird things happen. */ if (bus) { ssleep (5); pcibios_add_pci_devices(bus); } pe_dn->eeh_freeze_count = cnt; return 0; }
static int of_pci_phb_probe(struct platform_device *dev) { struct pci_controller *phb; /* Check if we can do that ... */ if (ppc_md.pci_setup_phb == NULL) return -ENODEV; pr_info("Setting up PCI bus %s\n", dev->dev.of_node->full_name); /* Alloc and setup PHB data structure */ phb = pcibios_alloc_controller(dev->dev.of_node); if (!phb) return -ENODEV; /* Setup parent in sysfs */ phb->parent = &dev->dev; /* Setup the PHB using arch provided callback */ if (ppc_md.pci_setup_phb(phb)) { pcibios_free_controller(phb); return -ENODEV; } /* Process "ranges" property */ pci_process_bridge_OF_ranges(phb, dev->dev.of_node, 0); /* Init pci_dn data structures */ pci_devs_phb_init_dynamic(phb); /* Create EEH devices for the PHB */ eeh_dev_phb_init_dynamic(phb); /* Register devices with EEH */ if (dev->dev.of_node->child) eeh_add_device_tree_early(PCI_DN(dev->dev.of_node)); /* Scan the bus */ pcibios_scan_phb(phb); if (phb->bus == NULL) return -ENXIO; /* Claim resources. This might need some rework as well depending * whether we are doing probe-only or not, like assigning unassigned * resources etc... */ pcibios_claim_one_bus(phb->bus); /* Finish EEH setup */ eeh_add_device_tree_late(phb->bus); /* Add probed PCI devices to the device model */ pci_bus_add_devices(phb->bus); /* sysfs files should only be added after devices are added */ eeh_add_sysfs_files(phb->bus); return 0; }
/* * Generate a Direct Select Address for the Hypervisor */ static inline u64 iseries_ds_addr(struct device_node *node) { struct pci_dn *pdn = PCI_DN(node); const u32 *sbp = of_get_property(node, "linux,subbus", NULL); return ((u64)pdn->busno << 48) + ((u64)(sbp ? *sbp : 0) << 40) + ((u64)0x10 << 32); }
/** * pseries_eeh_write_config - Write PCI config space * @dn: device node * @where: PCI address * @size: size to write * @val: value to be written * * Write config space to the specified device */ static int pseries_eeh_write_config(struct device_node *dn, int where, int size, u32 val) { struct pci_dn *pdn; pdn = PCI_DN(dn); return rtas_write_config(pdn, where, size, val); }
void eeh_mark_slot (struct device_node *dn, int mode_flag) { struct pci_dev *dev; dn = find_device_pe (dn); /* Back up one, since config addrs might be shared */ if (PCI_DN(dn) && PCI_DN(dn)->eeh_pe_config_addr) dn = dn->parent; PCI_DN(dn)->eeh_mode |= mode_flag; /* Mark the pci device too */ dev = PCI_DN(dn)->pcidev; if (dev) dev->error_state = pci_channel_io_frozen; __eeh_mark_slot (dn->child, mode_flag); }
static void __eeh_mark_slot (struct device_node *dn, int mode_flag) { while (dn) { if (PCI_DN(dn)) { /* Mark the pci device driver too */ struct pci_dev *dev = PCI_DN(dn)->pcidev; PCI_DN(dn)->eeh_mode |= mode_flag; if (dev && dev->driver) dev->error_state = pci_channel_io_frozen; if (dn->child) __eeh_mark_slot (dn->child, mode_flag); } dn = dn->sibling; } }
static void eeh_report_reset(struct pci_dev *dev, void *userdata) { struct pci_driver *driver = dev->driver; struct device_node *dn = pci_device_to_OF_node(dev); if (!driver) return; if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) { PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED; enable_irq(dev->irq); } if (!driver->err_handler) return; if (!driver->err_handler->slot_reset) return; driver->err_handler->slot_reset(dev); }
static void eeh_report_resume(struct pci_dev *dev, void *userdata) { struct pci_driver *driver = dev->driver; struct device_node *dn = pci_device_to_OF_node(dev); dev->error_state = pci_channel_io_normal; if (!driver) return; if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) { PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED; enable_irq(dev->irq); } if (!driver->err_handler || !driver->err_handler->resume) return; driver->err_handler->resume(dev); }
static inline struct iommu_table *devnode_table(struct device *dev) { struct pci_dev *pdev; if (!dev) { pdev = ppc64_isabridge_dev; if (!pdev) return NULL; } else pdev = to_pci_dev(dev); return PCI_DN(PCI_GET_DN(pdev))->iommu_table; }
static void eeh_report_reset(struct pci_dev *dev, void *userdata) { enum pci_ers_result rc, *res = userdata; struct pci_driver *driver = dev->driver; struct device_node *dn = pci_device_to_OF_node(dev); if (!driver) return; if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) { PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED; enable_irq(dev->irq); } if (!driver->err_handler || !driver->err_handler->slot_reset) return; rc = driver->err_handler->slot_reset(dev); if (*res == PCI_ERS_RESULT_NONE) *res = rc; if (*res == PCI_ERS_RESULT_DISCONNECT && rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; }
/** * eeh_restore_bars - restore the PCI config space info * * This routine performs a recursive walk to the children * of this device as well. */ void eeh_restore_bars(struct pci_dn *pdn) { struct device_node *dn; if (!pdn) return; if ((pdn->eeh_mode & EEH_MODE_SUPPORTED) && !IS_BRIDGE(pdn->class_code)) __restore_bars (pdn); dn = pdn->node->child; while (dn) { eeh_restore_bars (PCI_DN(dn)); dn = dn->sibling; } }
static void print_device_node_tree (struct pci_dn *pdn, int dent) { int i; if (!pdn) return; for (i=0;i<dent; i++) printk(" "); printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n", pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr, pdn->eeh_pe_config_addr, pdn->node->full_name); dent += 3; struct device_node *pc = pdn->node->child; while (pc) { print_device_node_tree(PCI_DN(pc), dent); pc = pc->sibling; } }
/* * iommu_table_setparms_lpar * * Function: On pSeries LPAR systems, return TCE table info, given a pci bus. */ static void iommu_table_setparms_lpar(struct pci_controller *phb, struct device_node *dn, struct iommu_table *tbl, unsigned char *dma_window) { unsigned long offset, size; tbl->it_busno = PCI_DN(dn)->bussubno; of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size); tbl->it_base = 0; tbl->it_blocksize = 16; tbl->it_type = TCE_PCI; tbl->it_offset = offset >> PAGE_SHIFT; tbl->it_size = size >> PAGE_SHIFT; }
/** * eeh_disable_irq - disable interrupt for the recovering device */ static void eeh_disable_irq(struct pci_dev *dev) { struct device_node *dn = pci_device_to_OF_node(dev); /* Don't disable MSI and MSI-X interrupts. They are * effectively disabled by the DMA Stopped state * when an EEH error occurs. */ if (dev->msi_enabled || dev->msix_enabled) return; if (!irq_has_action(dev->irq)) return; PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED; disable_irq_nosync(dev->irq); }
static int rtas_pci_write_config(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { struct device_node *busdn, *dn; busdn = pci_bus_to_OF_node(bus); /* Search only direct children of the bus */ for (dn = busdn->child; dn; dn = dn->sibling) { struct pci_dn *pdn = PCI_DN(dn); if (pdn && pdn->devfn == devfn && of_device_is_available(dn)) return rtas_write_config(pdn, where, size, val); } return PCIBIOS_DEVICE_NOT_FOUND; }
/* * iommu_table_setparms_lpar * * Function: On pSeries LPAR systems, return TCE table info, given a pci bus. * * ToDo: properly interpret the ibm,dma-window property. The definition is: * logical-bus-number (1 word) * phys-address (#address-cells words) * size (#cell-size words) * * Currently we hard code these sizes (more or less). */ static void iommu_table_setparms_lpar(struct pci_controller *phb, struct device_node *dn, struct iommu_table *tbl, unsigned int *dma_window) { tbl->it_busno = PCI_DN(dn)->bussubno; /* TODO: Parse field size properties properly. */ tbl->it_size = (((unsigned long)dma_window[4] << 32) | (unsigned long)dma_window[5]) >> PAGE_SHIFT; tbl->it_offset = (((unsigned long)dma_window[2] << 32) | (unsigned long)dma_window[3]) >> PAGE_SHIFT; tbl->it_base = 0; tbl->it_index = dma_window[0]; tbl->it_blocksize = 16; tbl->it_type = TCE_PCI; }
static void eeh_report_failure(struct pci_dev *dev, void *userdata) { struct pci_driver *driver = dev->driver; dev->error_state = pci_channel_io_perm_failure; if (!driver) return; if (irq_in_use (dev->irq)) { struct device_node *dn = pci_device_to_OF_node(dev); PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED; disable_irq_nosync(dev->irq); } if (!driver->err_handler) return; if (!driver->err_handler->error_detected) return; driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); }
static int rtas_pci_read_config(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { struct device_node *busdn, *dn; if (bus->self) busdn = pci_device_to_OF_node(bus->self); else busdn = bus->sysdata; /* must be a phb */ /* Search only direct children of the bus */ for (dn = busdn->child; dn; dn = dn->sibling) { struct pci_dn *pdn = PCI_DN(dn); if (pdn && pdn->devfn == devfn && of_device_available(dn)) return rtas_read_config(pdn, where, size, val); } return PCIBIOS_DEVICE_NOT_FOUND; }
/** * eeh_dev_init - Create EEH device according to OF node * @dn: device node * @data: PHB * * It will create EEH device according to the given OF node. The function * might be called by PCI emunation, DR, PHB hotplug. */ void *eeh_dev_init(struct device_node *dn, void *data) { struct pci_controller *phb = data; struct eeh_dev *edev; /* Allocate EEH device */ edev = kzalloc(sizeof(*edev), GFP_KERNEL); if (!edev) { pr_warn("%s: out of memory\n", __func__); return NULL; } /* Associate EEH device with OF node */ PCI_DN(dn)->edev = edev; edev->dn = dn; edev->phb = phb; INIT_LIST_HEAD(&edev->list); return NULL; }
struct pci_dn * handle_eeh_events (struct eeh_event *event) { struct device_node *frozen_dn; struct pci_dn *frozen_pdn; struct pci_bus *frozen_bus; int rc = 0; enum pci_ers_result result = PCI_ERS_RESULT_NONE; const char *location, *pci_str, *drv_str; frozen_dn = find_device_pe(event->dn); if (!frozen_dn) { location = of_get_property(event->dn, "ibm,loc-code", NULL); location = location ? location : "unknown"; printk(KERN_ERR "EEH: Error: Cannot find partition endpoint " "for location=%s pci addr=%s\n", location, pci_name(event->dev)); return NULL; } frozen_bus = pcibios_find_pci_bus(frozen_dn); location = of_get_property(frozen_dn, "ibm,loc-code", NULL); location = location ? location : "unknown"; /* There are two different styles for coming up with the PE. * In the old style, it was the highest EEH-capable device * which was always an EADS pci bridge. In the new style, * there might not be any EADS bridges, and even when there are, * the firmware marks them as "EEH incapable". So another * two-step is needed to find the pci bus.. */ if (!frozen_bus) frozen_bus = pcibios_find_pci_bus (frozen_dn->parent); if (!frozen_bus) { printk(KERN_ERR "EEH: Cannot find PCI bus " "for location=%s dn=%s\n", location, frozen_dn->full_name); return NULL; } frozen_pdn = PCI_DN(frozen_dn); frozen_pdn->eeh_freeze_count++; if (frozen_pdn->pcidev) { pci_str = pci_name (frozen_pdn->pcidev); drv_str = pcid_name (frozen_pdn->pcidev); } else { pci_str = pci_name (event->dev); drv_str = pcid_name (event->dev); } if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) goto excess_failures; printk(KERN_WARNING "EEH: This PCI device has failed %d times in the last hour:\n", frozen_pdn->eeh_freeze_count); printk(KERN_WARNING "EEH: location=%s driver=%s pci addr=%s\n", location, drv_str, pci_str); /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs * to accomplish the reset. Each child gets a report of the * status ... if any child can't handle the reset, then the entire * slot is dlpar removed and added. */ pci_walk_bus(frozen_bus, eeh_report_error, &result); /* Get the current PCI slot state. This can take a long time, * sometimes over 3 seconds for certain systems. */ rc = eeh_wait_for_slot_status (frozen_pdn, MAX_WAIT_FOR_RECOVERY*1000); if (rc < 0) { printk(KERN_WARNING "EEH: Permanent failure\n"); goto hard_fail; } /* Since rtas may enable MMIO when posting the error log, * don't post the error log until after all dev drivers * have been informed. */ eeh_slot_error_detail(frozen_pdn, EEH_LOG_TEMP_FAILURE); /* If all device drivers were EEH-unaware, then shut * down all of the device drivers, and hope they * go down willingly, without panicing the system. */ if (result == PCI_ERS_RESULT_NONE) { rc = eeh_reset_device(frozen_pdn, frozen_bus); if (rc) { printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc); goto hard_fail; } } /* If all devices reported they can proceed, then re-enable MMIO */ if (result == PCI_ERS_RESULT_CAN_RECOVER) { rc = rtas_pci_enable(frozen_pdn, EEH_THAW_MMIO); if (rc < 0) goto hard_fail; if (rc) { result = PCI_ERS_RESULT_NEED_RESET; } else { result = PCI_ERS_RESULT_NONE; pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result); } } /* If all devices reported they can proceed, then re-enable DMA */ if (result == PCI_ERS_RESULT_CAN_RECOVER) { rc = rtas_pci_enable(frozen_pdn, EEH_THAW_DMA); if (rc < 0) goto hard_fail; if (rc) result = PCI_ERS_RESULT_NEED_RESET; else result = PCI_ERS_RESULT_RECOVERED; } /* If any device has a hard failure, then shut off everything. */ if (result == PCI_ERS_RESULT_DISCONNECT) { printk(KERN_WARNING "EEH: Device driver gave up\n"); goto hard_fail; } /* If any device called out for a reset, then reset the slot */ if (result == PCI_ERS_RESULT_NEED_RESET) { rc = eeh_reset_device(frozen_pdn, NULL); if (rc) { printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc); goto hard_fail; } result = PCI_ERS_RESULT_NONE; pci_walk_bus(frozen_bus, eeh_report_reset, &result); } /* All devices should claim they have recovered by now. */ if ((result != PCI_ERS_RESULT_RECOVERED) && (result != PCI_ERS_RESULT_NONE)) { printk(KERN_WARNING "EEH: Not recovered\n"); goto hard_fail; } /* Tell all device drivers that they can resume operations */ pci_walk_bus(frozen_bus, eeh_report_resume, NULL); return frozen_pdn; excess_failures: /* * About 90% of all real-life EEH failures in the field * are due to poorly seated PCI cards. Only 10% or so are * due to actual, failed cards. */ printk(KERN_ERR "EEH: PCI device at location=%s driver=%s pci addr=%s \n" "has failed %d times in the last hour " "and has been permanently disabled. \n" "Please try reseating this device or replacing it.\n", location, drv_str, pci_str, frozen_pdn->eeh_freeze_count); goto perm_error; hard_fail: printk(KERN_ERR "EEH: Unable to recover from failure of PCI device " "at location=%s driver=%s pci addr=%s \n" "Please try reseating this device or replacing it.\n", location, drv_str, pci_str); perm_error: eeh_slot_error_detail(frozen_pdn, EEH_LOG_PERM_FAILURE); /* Notify all devices that they're about to go down. */ pci_walk_bus(frozen_bus, eeh_report_failure, NULL); /* Shut down the device drivers for good. */ pcibios_remove_pci_devices(frozen_bus); return NULL; }
/** * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze * @dn device node * @dev pci device, if known * * Check for an EEH failure for the given device node. Call this * routine if the result of a read was all 0xff's and you want to * find out if this is due to an EEH slot freeze. This routine * will query firmware for the EEH status. * * Returns 0 if there has not been an EEH error; otherwise returns * a non-zero value and queues up a slot isolation event notification. * * It is safe to call this routine in an interrupt context. */ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) { int ret; int rets[3]; unsigned long flags; struct pci_dn *pdn; enum pci_channel_state state; int rc = 0; total_mmio_ffs++; if (!eeh_subsystem_enabled) return 0; if (!dn) { no_dn++; return 0; } pdn = PCI_DN(dn); /* Access to IO BARs might get this far and still not want checking. */ if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || pdn->eeh_mode & EEH_MODE_NOCHECK) { ignored_check++; #ifdef DEBUG printk ("EEH:ignored check (%x) for %s %s\n", pdn->eeh_mode, pci_name (dev), dn->full_name); #endif return 0; } if (!pdn->eeh_config_addr && !pdn->eeh_pe_config_addr) { no_cfg_addr++; return 0; } /* If we already have a pending isolation event for this * slot, we know it's bad already, we don't need to check. * Do this checking under a lock; as multiple PCI devices * in one slot might report errors simultaneously, and we * only want one error recovery routine running. */ spin_lock_irqsave(&confirm_error_lock, flags); rc = 1; if (pdn->eeh_mode & EEH_MODE_ISOLATED) { pdn->eeh_check_count ++; if (pdn->eeh_check_count >= EEH_MAX_FAILS) { printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n", pdn->eeh_check_count); dump_stack(); msleep(5000); /* re-read the slot reset state */ if (read_slot_reset_state(pdn, rets) != 0) rets[0] = -1; /* reset state unknown */ /* If we are here, then we hit an infinite loop. Stop. */ panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], pci_name(dev)); } goto dn_unlock; } /* * Now test for an EEH failure. This is VERY expensive. * Note that the eeh_config_addr may be a parent device * in the case of a device behind a bridge, or it may be * function zero of a multi-function device. * In any case they must share a common PHB. */ ret = read_slot_reset_state(pdn, rets); /* If the call to firmware failed, punt */ if (ret != 0) { printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n", ret, dn->full_name); false_positives++; rc = 0; goto dn_unlock; } /* If EEH is not supported on this device, punt. */ if (rets[1] != 1) { printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n", ret, dn->full_name); false_positives++; rc = 0; goto dn_unlock; } /* If not the kind of error we know about, punt. */ if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) { false_positives++; rc = 0; goto dn_unlock; } /* Note that config-io to empty slots may fail; * we recognize empty because they don't have children. */ if ((rets[0] == 5) && (dn->child == NULL)) { false_positives++; rc = 0; goto dn_unlock; } slot_resets++; /* Avoid repeated reports of this failure, including problems * with other functions on this device, and functions under * bridges. */ eeh_mark_slot (dn, EEH_MODE_ISOLATED); spin_unlock_irqrestore(&confirm_error_lock, flags); state = pci_channel_io_normal; if ((rets[0] == 2) || (rets[0] == 4)) state = pci_channel_io_frozen; if (rets[0] == 5) state = pci_channel_io_perm_failure; eeh_send_failure_event (dn, dev, state, rets[2]); /* Most EEH events are due to device driver bugs. Having * a stack trace will help the device-driver authors figure * out what happened. So print that out. */ if (rets[0] != 5) dump_stack(); return 1; dn_unlock: spin_unlock_irqrestore(&confirm_error_lock, flags); return rc; }
struct pci_dn * handle_eeh_events (struct eeh_event *event) { struct device_node *frozen_dn; struct pci_dn *frozen_pdn; struct pci_bus *frozen_bus; int rc = 0; enum pci_ers_result result = PCI_ERS_RESULT_NONE; const char *location, *pci_str, *drv_str; frozen_dn = find_device_pe(event->dn); frozen_bus = pcibios_find_pci_bus(frozen_dn); if (!frozen_dn) { location = (char *) get_property(event->dn, "ibm,loc-code", NULL); location = location ? location : "unknown"; printk(KERN_ERR "EEH: Error: Cannot find partition endpoint " "for location=%s pci addr=%s\n", location, pci_name(event->dev)); return NULL; } location = (char *) get_property(frozen_dn, "ibm,loc-code", NULL); location = location ? location : "unknown"; /* There are two different styles for coming up with the PE. * In the old style, it was the highest EEH-capable device * which was always an EADS pci bridge. In the new style, * there might not be any EADS bridges, and even when there are, * the firmware marks them as "EEH incapable". So another * two-step is needed to find the pci bus.. */ if (!frozen_bus) frozen_bus = pcibios_find_pci_bus (frozen_dn->parent); if (!frozen_bus) { printk(KERN_ERR "EEH: Cannot find PCI bus " "for location=%s dn=%s\n", location, frozen_dn->full_name); return NULL; } #if 0 /* We may get "permanent failure" messages on empty slots. * These are false alarms. Empty slots have no child dn. */ if ((event->state == pci_channel_io_perm_failure) && (frozen_device == NULL)) return; #endif frozen_pdn = PCI_DN(frozen_dn); frozen_pdn->eeh_freeze_count++; if (frozen_pdn->pcidev) { pci_str = pci_name (frozen_pdn->pcidev); drv_str = pcid_name (frozen_pdn->pcidev); } else { pci_str = pci_name (event->dev); drv_str = pcid_name (event->dev); } if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) goto excess_failures; /* If the reset state is a '5' and the time to reset is 0 (infinity) * or is more then 15 seconds, then mark this as a permanent failure. */ if ((event->state == pci_channel_io_perm_failure) && ((event->time_unavail <= 0) || (event->time_unavail > MAX_WAIT_FOR_RECOVERY*1000))) goto hard_fail; eeh_slot_error_detail(frozen_pdn, 1 /* Temporary Error */); printk(KERN_WARNING "EEH: This PCI device has failed %d times since last reboot: " "location=%s driver=%s pci addr=%s\n", frozen_pdn->eeh_freeze_count, location, drv_str, pci_str); /* Walk the various device drivers attached to this slot through * a reset sequence, giving each an opportunity to do what it needs * to accomplish the reset. Each child gets a report of the * status ... if any child can't handle the reset, then the entire * slot is dlpar removed and added. */ pci_walk_bus(frozen_bus, eeh_report_error, &result); /* If all device drivers were EEH-unaware, then shut * down all of the device drivers, and hope they * go down willingly, without panicing the system. */ if (result == PCI_ERS_RESULT_NONE) { rc = eeh_reset_device(frozen_pdn, frozen_bus); if (rc) goto hard_fail; } /* If any device called out for a reset, then reset the slot */ if (result == PCI_ERS_RESULT_NEED_RESET) { rc = eeh_reset_device(frozen_pdn, NULL); if (rc) goto hard_fail; pci_walk_bus(frozen_bus, eeh_report_reset, NULL); } /* If all devices reported they can proceed, the re-enable PIO */ if (result == PCI_ERS_RESULT_CAN_RECOVER) { /* XXX Not supported; we brute-force reset the device */ rc = eeh_reset_device(frozen_pdn, NULL); if (rc) goto hard_fail; pci_walk_bus(frozen_bus, eeh_report_reset, NULL); } /* Tell all device drivers that they can resume operations */ pci_walk_bus(frozen_bus, eeh_report_resume, NULL); return frozen_pdn; excess_failures: /* * About 90% of all real-life EEH failures in the field * are due to poorly seated PCI cards. Only 10% or so are * due to actual, failed cards. */ printk(KERN_ERR "EEH: PCI device at location=%s driver=%s pci addr=%s \n" "has failed %d times and has been permanently disabled. \n" "Please try reseating this device or replacing it.\n", location, drv_str, pci_str, frozen_pdn->eeh_freeze_count); goto perm_error; hard_fail: printk(KERN_ERR "EEH: Unable to recover from failure of PCI device " "at location=%s driver=%s pci addr=%s \n" "Please try reseating this device or replacing it.\n", location, drv_str, pci_str); perm_error: eeh_slot_error_detail(frozen_pdn, 2 /* Permanent Error */); /* Notify all devices that they're about to go down. */ pci_walk_bus(frozen_bus, eeh_report_failure, NULL); /* Shut down the device drivers for good. */ pcibios_remove_pci_devices(frozen_bus); return NULL; }