/* * On PowerNV platform, we might already have fenced PHB there. * For that case, it's meaningless to recover frozen PE. Intead, * We have to handle fenced PHB firstly. */ static int eeh_phb_check_failure(struct eeh_pe *pe) { struct eeh_pe *phb_pe; unsigned long flags; int ret; if (!eeh_has_flag(EEH_PROBE_MODE_DEV)) return -EPERM; /* Find the PHB PE */ phb_pe = eeh_phb_pe_get(pe->phb); if (!phb_pe) { pr_warn("%s Can't find PE for PHB#%d\n", __func__, pe->phb->global_number); return -EEXIST; } /* If the PHB has been in problematic state */ eeh_serialize_lock(&flags); if (phb_pe->state & EEH_PE_ISOLATED) { ret = 0; goto out; } /* Check PHB state */ ret = eeh_ops->get_state(phb_pe, NULL); if ((ret < 0) || (ret == EEH_STATE_NOT_SUPPORT) || (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) == (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) { ret = 0; goto out; } /* Isolate the PHB and send event */ eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED); eeh_serialize_unlock(flags); pr_err("EEH: PHB#%x failure detected, location: %s\n", phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe)); dump_stack(); eeh_send_failure_event(phb_pe); return 1; out: eeh_serialize_unlock(flags); return ret; }
/**
 * eeh_event_handler - Dispatch EEH events.
 * @dummy: unused
 *
 * The detection of a frozen slot can occur inside an interrupt,
 * where it can be hard to do anything about it. The goal of this
 * routine is to pull these detection events out of the context
 * of the interrupt handler, and re-dispatch them for processing
 * at a later time in a normal context.
 *
 * Each loop iteration waits on eeh_eventlist_sem, removes the first
 * entry from eeh_eventlist (if any) under eeh_eventlist_lock, passes
 * the event's PE to eeh_handle_event() and then frees the event.
 */
static int eeh_event_handler(void * dummy)
{
	unsigned long flags;
	struct eeh_event *event;
	struct eeh_pe *pe;

	/* Keep dispatching until the kthread is asked to stop */
	while (!kthread_should_stop()) {
		/* A non-zero return from the wait ends the loop */
		if (down_interruptible(&eeh_eventlist_sem))
			break;

		/* Fetch EEH event from the queue */
		spin_lock_irqsave(&eeh_eventlist_lock, flags);
		event = NULL;
		if (!list_empty(&eeh_eventlist)) {
			event = list_entry(eeh_eventlist.next,
					   struct eeh_event, list);
			list_del(&event->list);
		}
		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);

		/* The list was empty: go back to waiting */
		if (!event)
			continue;

		/* We might have event without binding PE */
		pe = event->pe;
		if (pe) {
			/*
			 * The PE carries EEH_PE_RECOVERING for the whole
			 * duration of the eeh_handle_event() call.
			 */
			eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
			if (pe->type & EEH_PE_PHB)
				pr_info("EEH: Detected error on PHB#%d\n",
					 pe->phb->global_number);
			else
				pr_info("EEH: Detected PCI bus error on "
					"PHB#%d-PE#%x\n",
					pe->phb->global_number, pe->addr);
			eeh_handle_event(pe);
			eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
		} else {
			/* No PE bound to the event: the handler is given NULL */
			eeh_handle_event(NULL);
		}

		/* The event was unlinked from the list above; release it */
		kfree(event);
	}
/** * eeh_reset_device - Perform actual reset of a pci slot * @pe: EEH PE * @bus: PCI bus corresponding to the isolcated slot * * This routine must be called to do reset on the indicated PE. * During the reset, udev might be invoked because those affected * PCI devices will be removed and then added. */ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus) { struct pci_bus *frozen_bus = eeh_pe_bus_get(pe); struct timeval tstamp; int cnt, rc, removed = 0; /* pcibios will clear the counter; save the value */ cnt = pe->freeze_count; tstamp = pe->tstamp; /* * We don't remove the corresponding PE instances because * we need the information afterwords. The attached EEH * devices are expected to be attached soon when calling * into pcibios_add_pci_devices(). */ eeh_pe_state_mark(pe, EEH_PE_KEEP); if (bus) { pci_lock_rescan_remove(); pcibios_remove_pci_devices(bus); pci_unlock_rescan_remove(); } else if (frozen_bus) { eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed); } /* * Reset the pci controller. (Asserts RST#; resets config space). * Reconfigure bridges and devices. Don't try to bring the system * up if the reset failed for some reason. * * During the reset, it's very dangerous to have uncontrolled PCI * config accesses. So we prefer to block them. However, controlled * PCI config accesses initiated from EEH itself are allowed. */ eeh_pe_state_mark(pe, EEH_PE_RESET); rc = eeh_reset_pe(pe); if (rc) { eeh_pe_state_clear(pe, EEH_PE_RESET); return rc; } pci_lock_rescan_remove(); /* Restore PE */ eeh_ops->configure_bridge(pe); eeh_pe_restore_bars(pe); eeh_pe_state_clear(pe, EEH_PE_RESET); /* Clear frozen state */ rc = eeh_clear_pe_frozen_state(pe); if (rc) return rc; /* Give the system 5 seconds to finish running the user-space * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, * this is a hack, but if we don't do this, and try to bring * the device up before the scripts have taken it down, * potentially weird things happen. 
*/ if (bus) { pr_info("EEH: Sleep 5s ahead of complete hotplug\n"); ssleep(5); /* * The EEH device is still connected with its parent * PE. We should disconnect it so the binding can be * rebuilt when adding PCI devices. */ eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); pcibios_add_pci_devices(bus); } else if (frozen_bus && removed) { pr_info("EEH: Sleep 5s ahead of partial hotplug\n"); ssleep(5); eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL); pcibios_add_pci_devices(frozen_bus); } eeh_pe_state_clear(pe, EEH_PE_KEEP); pe->tstamp = tstamp; pe->freeze_count = cnt; pci_unlock_rescan_remove(); return 0; }