/**
 * eeh_event_handler - dispatch EEH events.
 * @dummy: unused
 *
 * The detection of a frozen slot can occur inside an interrupt, where
 * it can be hard to do anything about it.  The goal of this routine is
 * to pull these detection events out of the context of the interrupt
 * handler, and re-dispatch them for processing at a later time in a
 * normal context.
 *
 * Each pass of the loop removes the first entry from eeh_eventlist and
 * processes it; the loop ends as soon as the list is found empty.
 */
static int eeh_event_handler(void * dummy)
{
	unsigned long flags;
	struct eeh_event *event;

	daemonize ("eehd");

	while (1) {
		/*
		 * NOTE(review): the task state is set to TASK_INTERRUPTIBLE
		 * on every iteration, but nothing visible in this loop
		 * sleeps on it or sets it back - confirm this is intended.
		 */
		set_current_state(TASK_INTERRUPTIBLE);

		/* The event list is only touched with eeh_eventlist_lock held. */
		spin_lock_irqsave(&eeh_eventlist_lock, flags);
		event = NULL;

		/* Unqueue the first pending event, if any, and get ready to process it. */
		if (!list_empty(&eeh_eventlist)) {
			event = list_entry(eeh_eventlist.next, struct eeh_event, list);
			list_del(&event->list);
		}

		/* Flag the slot as recovering before the list lock is released. */
		if (event)
			eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);

		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);

		/* Nothing left on the list: stop looping. */
		if (event == NULL)
			break;

		printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
		       pci_name(event->dev));

		/* Handling runs with the list lock dropped. */
		handle_eeh_events(event);

		eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);

		/*
		 * Drop the device reference carried by the event (presumably
		 * taken when the event was queued - verify against the code
		 * that queues events), then free the event itself.
		 */
		pci_dev_put(event->dev);
		kfree(event);
	}
/** * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze * @dn device node * @dev pci device, if known * * Check for an EEH failure for the given device node. Call this * routine if the result of a read was all 0xff's and you want to * find out if this is due to an EEH slot freeze. This routine * will query firmware for the EEH status. * * Returns 0 if there has not been an EEH error; otherwise returns * a non-zero value and queues up a slot isolation event notification. * * It is safe to call this routine in an interrupt context. */ int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) { int ret; int rets[3]; unsigned long flags; struct pci_dn *pdn; enum pci_channel_state state; int rc = 0; total_mmio_ffs++; if (!eeh_subsystem_enabled) return 0; if (!dn) { no_dn++; return 0; } pdn = PCI_DN(dn); /* Access to IO BARs might get this far and still not want checking. */ if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || pdn->eeh_mode & EEH_MODE_NOCHECK) { ignored_check++; #ifdef DEBUG printk ("EEH:ignored check (%x) for %s %s\n", pdn->eeh_mode, pci_name (dev), dn->full_name); #endif return 0; } if (!pdn->eeh_config_addr && !pdn->eeh_pe_config_addr) { no_cfg_addr++; return 0; } /* If we already have a pending isolation event for this * slot, we know it's bad already, we don't need to check. * Do this checking under a lock; as multiple PCI devices * in one slot might report errors simultaneously, and we * only want one error recovery routine running. */ spin_lock_irqsave(&confirm_error_lock, flags); rc = 1; if (pdn->eeh_mode & EEH_MODE_ISOLATED) { pdn->eeh_check_count ++; if (pdn->eeh_check_count >= EEH_MAX_FAILS) { printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n", pdn->eeh_check_count); dump_stack(); msleep(5000); /* re-read the slot reset state */ if (read_slot_reset_state(pdn, rets) != 0) rets[0] = -1; /* reset state unknown */ /* If we are here, then we hit an infinite loop. Stop. 
*/ panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], pci_name(dev)); } goto dn_unlock; } /* * Now test for an EEH failure. This is VERY expensive. * Note that the eeh_config_addr may be a parent device * in the case of a device behind a bridge, or it may be * function zero of a multi-function device. * In any case they must share a common PHB. */ ret = read_slot_reset_state(pdn, rets); /* If the call to firmware failed, punt */ if (ret != 0) { printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n", ret, dn->full_name); false_positives++; rc = 0; goto dn_unlock; } /* If EEH is not supported on this device, punt. */ if (rets[1] != 1) { printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n", ret, dn->full_name); false_positives++; rc = 0; goto dn_unlock; } /* If not the kind of error we know about, punt. */ if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) { false_positives++; rc = 0; goto dn_unlock; } /* Note that config-io to empty slots may fail; * we recognize empty because they don't have children. */ if ((rets[0] == 5) && (dn->child == NULL)) { false_positives++; rc = 0; goto dn_unlock; } slot_resets++; /* Avoid repeated reports of this failure, including problems * with other functions on this device, and functions under * bridges. */ eeh_mark_slot (dn, EEH_MODE_ISOLATED); spin_unlock_irqrestore(&confirm_error_lock, flags); state = pci_channel_io_normal; if ((rets[0] == 2) || (rets[0] == 4)) state = pci_channel_io_frozen; if (rets[0] == 5) state = pci_channel_io_perm_failure; eeh_send_failure_event (dn, dev, state, rets[2]); /* Most EEH events are due to device driver bugs. Having * a stack trace will help the device-driver authors figure * out what happened. So print that out. */ if (rets[0] != 5) dump_stack(); return 1; dn_unlock: spin_unlock_irqrestore(&confirm_error_lock, flags); return rc; }