Example #1
0
/*
 * On PowerNV platform, we might already have fenced PHB there.
 * For that case, it's meaningless to recover frozen PE. Intead,
 * We have to handle fenced PHB firstly.
 */
static int eeh_phb_check_failure(struct eeh_pe *pe)
{
	struct eeh_pe *phb_pe;
	unsigned long flags;
	int ret;

	if (!eeh_has_flag(EEH_PROBE_MODE_DEV))
		return -EPERM;

	/* Find the PHB PE */
	phb_pe = eeh_phb_pe_get(pe->phb);
	if (!phb_pe) {
		pr_warn("%s Can't find PE for PHB#%d\n",
			__func__, pe->phb->global_number);
		return -EEXIST;
	}

	/* If the PHB has been in problematic state */
	eeh_serialize_lock(&flags);
	if (phb_pe->state & EEH_PE_ISOLATED) {
		ret = 0;
		goto out;
	}

	/* Check PHB state */
	ret = eeh_ops->get_state(phb_pe, NULL);
	if ((ret < 0) ||
	    (ret == EEH_STATE_NOT_SUPPORT) ||
	    (ret & (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) ==
	    (EEH_STATE_MMIO_ACTIVE | EEH_STATE_DMA_ACTIVE)) {
		ret = 0;
		goto out;
	}

	/* Isolate the PHB and send event */
	eeh_pe_state_mark(phb_pe, EEH_PE_ISOLATED);
	eeh_serialize_unlock(flags);

	pr_err("EEH: PHB#%x failure detected, location: %s\n",
		phb_pe->phb->global_number, eeh_pe_loc_get(phb_pe));
	dump_stack();
	eeh_send_failure_event(phb_pe);

	return 1;
out:
	eeh_serialize_unlock(flags);
	return ret;
}
Example #2
0
/**
 * eeh_event_handler - Dispatch EEH events.
 * @dummy - unused
 *
 * The detection of a frozen slot can occur inside an interrupt,
 * where it can be hard to do anything about it.  The goal of this
 * routine is to pull these detection events out of the context
 * of the interrupt handler, and re-dispatch them for processing
 * at a later time in a normal context.
 */
static int eeh_event_handler(void * dummy)
{
	unsigned long flags;
	struct eeh_event *event;
	struct eeh_pe *pe;

	while (!kthread_should_stop()) {
		if (down_interruptible(&eeh_eventlist_sem))
			break;

		/* Fetch EEH event from the queue */
		spin_lock_irqsave(&eeh_eventlist_lock, flags);
		event = NULL;
		if (!list_empty(&eeh_eventlist)) {
			event = list_entry(eeh_eventlist.next,
					   struct eeh_event, list);
			list_del(&event->list);
		}
		spin_unlock_irqrestore(&eeh_eventlist_lock, flags);
		if (!event)
			continue;

		/* We might have event without binding PE */
		pe = event->pe;
		if (pe) {
			eeh_pe_state_mark(pe, EEH_PE_RECOVERING);
			if (pe->type & EEH_PE_PHB)
				pr_info("EEH: Detected error on PHB#%d\n",
					 pe->phb->global_number);
			else
				pr_info("EEH: Detected PCI bus error on "
					"PHB#%d-PE#%x\n",
					pe->phb->global_number, pe->addr);
			eeh_handle_event(pe);
			eeh_pe_state_clear(pe, EEH_PE_RECOVERING);
		} else {
			eeh_handle_event(NULL);
		}

		kfree(event);
	}
Example #3
0
/**
 * eeh_reset_device - Perform actual reset of a pci slot
 * @pe: EEH PE
 * @bus: PCI bus corresponding to the isolcated slot
 *
 * This routine must be called to do reset on the indicated PE.
 * During the reset, udev might be invoked because those affected
 * PCI devices will be removed and then added.
 */
static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus)
{
	struct pci_bus *frozen_bus = eeh_pe_bus_get(pe);
	struct timeval tstamp;
	int cnt, rc, removed = 0;

	/* pcibios will clear the counter; save the value */
	cnt = pe->freeze_count;
	tstamp = pe->tstamp;

	/*
	 * We don't remove the corresponding PE instances because
	 * we need the information afterwords. The attached EEH
	 * devices are expected to be attached soon when calling
	 * into pcibios_add_pci_devices().
	 */
	eeh_pe_state_mark(pe, EEH_PE_KEEP);
	if (bus) {
		pci_lock_rescan_remove();
		pcibios_remove_pci_devices(bus);
		pci_unlock_rescan_remove();
	} else if (frozen_bus) {
		eeh_pe_dev_traverse(pe, eeh_rmv_device, &removed);
	}

	/*
	 * Reset the pci controller. (Asserts RST#; resets config space).
	 * Reconfigure bridges and devices. Don't try to bring the system
	 * up if the reset failed for some reason.
	 *
	 * During the reset, it's very dangerous to have uncontrolled PCI
	 * config accesses. So we prefer to block them. However, controlled
	 * PCI config accesses initiated from EEH itself are allowed.
	 */
	eeh_pe_state_mark(pe, EEH_PE_RESET);
	rc = eeh_reset_pe(pe);
	if (rc) {
		eeh_pe_state_clear(pe, EEH_PE_RESET);
		return rc;
	}

	pci_lock_rescan_remove();

	/* Restore PE */
	eeh_ops->configure_bridge(pe);
	eeh_pe_restore_bars(pe);
	eeh_pe_state_clear(pe, EEH_PE_RESET);

	/* Clear frozen state */
	rc = eeh_clear_pe_frozen_state(pe);
	if (rc)
		return rc;

	/* Give the system 5 seconds to finish running the user-space
	 * hotplug shutdown scripts, e.g. ifdown for ethernet.  Yes,
	 * this is a hack, but if we don't do this, and try to bring
	 * the device up before the scripts have taken it down,
	 * potentially weird things happen.
	 */
	if (bus) {
		pr_info("EEH: Sleep 5s ahead of complete hotplug\n");
		ssleep(5);

		/*
		 * The EEH device is still connected with its parent
		 * PE. We should disconnect it so the binding can be
		 * rebuilt when adding PCI devices.
		 */
		eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
		pcibios_add_pci_devices(bus);
	} else if (frozen_bus && removed) {
		pr_info("EEH: Sleep 5s ahead of partial hotplug\n");
		ssleep(5);

		eeh_pe_traverse(pe, eeh_pe_detach_dev, NULL);
		pcibios_add_pci_devices(frozen_bus);
	}
	eeh_pe_state_clear(pe, EEH_PE_KEEP);

	pe->tstamp = tstamp;
	pe->freeze_count = cnt;

	pci_unlock_rescan_remove();
	return 0;
}