Exemple #1
0
/**
 * eeh_event_handler - dispatch EEH events.  The detection of a frozen
 * slot can occur inside an interrupt, where it can be hard to do
 * anything about it.  The goal of this routine is to pull these
 * detection events out of the context of the interrupt handler, and
 * re-dispatch them for processing at a later time in a normal context.
 *
 * @dummy - unused
 */
static int eeh_event_handler(void * dummy)
{
	unsigned long flags;
	struct eeh_event	*event;

	daemonize ("eehd");

	while (1) {
		set_current_state(TASK_INTERRUPTIBLE);

		spin_lock_irqsave(&eeh_eventlist_lock, flags);
		event = NULL;

		/* Unqueue the event, get ready to process. */
		if (!list_empty(&eeh_eventlist)) {
			event = list_entry(eeh_eventlist.next, struct eeh_event, list);
			list_del(&event->list);
		}
		
		if (event)
			eeh_mark_slot(event->dn, EEH_MODE_RECOVERING);

		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
		if (event == NULL)
			break;

		printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n",
		       pci_name(event->dev));

		handle_eeh_events(event);

		eeh_clear_slot(event->dn, EEH_MODE_RECOVERING);

		pci_dev_put(event->dev);
		kfree(event);
	}
Exemple #2
0
/**
 * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze
 * @dn device node
 * @dev pci device, if known
 *
 * Check for an EEH failure for the given device node.  Call this
 * routine if the result of a read was all 0xff's and you want to
 * find out if this is due to an EEH slot freeze.  This routine
 * will query firmware for the EEH status.
 *
 * Returns 0 if there has not been an EEH error; otherwise returns
 * a non-zero value and queues up a slot isolation event notification.
 *
 * It is safe to call this routine in an interrupt context.
 */
int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev)
{
	int ret;
	int rets[3];
	unsigned long flags;
	struct pci_dn *pdn;
	enum pci_channel_state state;
	int rc = 0;

	total_mmio_ffs++;

	if (!eeh_subsystem_enabled)
		return 0;

	if (!dn) {
		no_dn++;
		return 0;
	}
	pdn = PCI_DN(dn);

	/* Access to IO BARs might get this far and still not want checking. */
	if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) ||
	    pdn->eeh_mode & EEH_MODE_NOCHECK) {
		ignored_check++;
#ifdef DEBUG
		printk ("EEH:ignored check (%x) for %s %s\n", 
		        pdn->eeh_mode, pci_name (dev), dn->full_name);
#endif
		return 0;
	}

	if (!pdn->eeh_config_addr && !pdn->eeh_pe_config_addr) {
		no_cfg_addr++;
		return 0;
	}

	/* If we already have a pending isolation event for this
	 * slot, we know it's bad already, we don't need to check.
	 * Do this checking under a lock; as multiple PCI devices
	 * in one slot might report errors simultaneously, and we
	 * only want one error recovery routine running.
	 */
	spin_lock_irqsave(&confirm_error_lock, flags);
	rc = 1;
	if (pdn->eeh_mode & EEH_MODE_ISOLATED) {
		pdn->eeh_check_count ++;
		if (pdn->eeh_check_count >= EEH_MAX_FAILS) {
			printk (KERN_ERR "EEH: Device driver ignored %d bad reads, panicing\n",
			        pdn->eeh_check_count);
			dump_stack();
			msleep(5000);
			
			/* re-read the slot reset state */
			if (read_slot_reset_state(pdn, rets) != 0)
				rets[0] = -1;	/* reset state unknown */

			/* If we are here, then we hit an infinite loop. Stop. */
			panic("EEH: MMIO halt (%d) on device:%s\n", rets[0], pci_name(dev));
		}
		goto dn_unlock;
	}

	/*
	 * Now test for an EEH failure.  This is VERY expensive.
	 * Note that the eeh_config_addr may be a parent device
	 * in the case of a device behind a bridge, or it may be
	 * function zero of a multi-function device.
	 * In any case they must share a common PHB.
	 */
	ret = read_slot_reset_state(pdn, rets);

	/* If the call to firmware failed, punt */
	if (ret != 0) {
		printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n",
		       ret, dn->full_name);
		false_positives++;
		rc = 0;
		goto dn_unlock;
	}

	/* If EEH is not supported on this device, punt. */
	if (rets[1] != 1) {
		printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n",
		       ret, dn->full_name);
		false_positives++;
		rc = 0;
		goto dn_unlock;
	}

	/* If not the kind of error we know about, punt. */
	if (rets[0] != 2 && rets[0] != 4 && rets[0] != 5) {
		false_positives++;
		rc = 0;
		goto dn_unlock;
	}

	/* Note that config-io to empty slots may fail;
	 * we recognize empty because they don't have children. */
	if ((rets[0] == 5) && (dn->child == NULL)) {
		false_positives++;
		rc = 0;
		goto dn_unlock;
	}

	slot_resets++;
 
	/* Avoid repeated reports of this failure, including problems
	 * with other functions on this device, and functions under
	 * bridges. */
	eeh_mark_slot (dn, EEH_MODE_ISOLATED);
	spin_unlock_irqrestore(&confirm_error_lock, flags);

	state = pci_channel_io_normal;
	if ((rets[0] == 2) || (rets[0] == 4))
		state = pci_channel_io_frozen;
	if (rets[0] == 5)
		state = pci_channel_io_perm_failure;
	eeh_send_failure_event (dn, dev, state, rets[2]);

	/* Most EEH events are due to device driver bugs.  Having
	 * a stack trace will help the device-driver authors figure
	 * out what happened.  So print that out. */
	if (rets[0] != 5) dump_stack();
	return 1;

dn_unlock:
	spin_unlock_irqrestore(&confirm_error_lock, flags);
	return rc;
}