Example #1
0
static ssize_t version_show(struct device *dev, struct device_attribute *attr,
			    char *buf)
{
	u64 slu_id, app_id;
	struct genwqe_dev *cd = dev_get_drvdata(dev);

	slu_id = __genwqe_readq(cd, IO_SLU_UNITCFG);
	app_id = __genwqe_readq(cd, IO_APP_UNITCFG);

	return sprintf(buf, "%016llx.%016llx\n", slu_id, app_id);
}
Example #2
0
static int genwqe_read_ids(struct genwqe_dev *cd)
{
	int err = 0;
	int slu_id;
	struct pci_dev *pci_dev = cd->pci_dev;

	cd->slu_unitcfg = __genwqe_readq(cd, IO_SLU_UNITCFG);
	if (cd->slu_unitcfg == IO_ILLEGAL_VALUE) {
		dev_err(&pci_dev->dev,
			"err: SLUID=%016llx\n", cd->slu_unitcfg);
		err = -EIO;
		goto out_err;
	}

	slu_id = genwqe_get_slu_id(cd);
	if (slu_id < GENWQE_SLU_ARCH_REQ || slu_id == 0xff) {
		dev_err(&pci_dev->dev,
			"err: incompatible SLU Architecture %u\n", slu_id);
		err = -ENOENT;
		goto out_err;
	}

	cd->app_unitcfg = __genwqe_readq(cd, IO_APP_UNITCFG);
	if (cd->app_unitcfg == IO_ILLEGAL_VALUE) {
		dev_err(&pci_dev->dev,
			"err: APPID=%016llx\n", cd->app_unitcfg);
		err = -EIO;
		goto out_err;
	}
	genwqe_read_app_id(cd, cd->app_name, sizeof(cd->app_name));

	/*
	 * Is access to all registers possible? If we are a VF the
	 * answer is obvious. If we run fully virtualized, we need to
	 * check if we can access all registers. If we do not have
	 * full access we will cause an UR and some informational FIRs
	 * in the PF, but that should not harm.
	 */
	if (pci_dev->is_virtfn)
		cd->is_privileged = 0;
	else
		cd->is_privileged = (__genwqe_readq(cd, IO_SLU_BITSTREAM)
				     != IO_ILLEGAL_VALUE);

 out_err:
	return err;
}
Example #3
0
/**
 * curr_bitstream_show() - Show the current bitstream id
 *
 * There is a bug in some old versions of the CPLD which selects the
 * bitstream, which causes the IO_SLU_BITSTREAM register to report
 * unreliable data in very rare cases. This makes this sysfs
 * unreliable up to the point were a new CPLD version is being used.
 *
 * Unfortunately there is no automatic way yet to query the CPLD
 * version, such that you need to manually ensure via programming
 * tools that you have a recent version of the CPLD software.
 *
 * The proposed circumvention is to use a special recovery bitstream
 * on the backup partition (0) to identify problems while loading the
 * image.
 */
static ssize_t curr_bitstream_show(struct device *dev,
				   struct device_attribute *attr, char *buf)
{
	int curr_bitstream;
	struct genwqe_dev *cd = dev_get_drvdata(dev);

	curr_bitstream = __genwqe_readq(cd, IO_SLU_BITSTREAM) & 0x1;
	return sprintf(buf, "%d\n", curr_bitstream);
}
Example #4
0
static ssize_t tempsens_show(struct device *dev, struct device_attribute *attr,
			     char *buf)
{
	u64 tempsens;
	struct genwqe_dev *cd = dev_get_drvdata(dev);

	tempsens = __genwqe_readq(cd, IO_SLU_TEMPERATURE_SENSOR);
	return sprintf(buf, "%016llx\n", tempsens);
}
Example #5
0
static ssize_t queue_working_time_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	u64 t;
	struct genwqe_dev *cd = dev_get_drvdata(dev);

	t = __genwqe_readq(cd, IO_SLC_QUEUE_WTIME);
	return sprintf(buf, "%016llx\n", t);
}
Example #6
0
static ssize_t freerunning_timer_show(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	u64 t;
	struct genwqe_dev *cd = dev_get_drvdata(dev);

	t = __genwqe_readq(cd, IO_SLC_FREE_RUNNING_TIMER);
	return sprintf(buf, "%016llx\n", t);
}
Example #7
0
/**
 * genwqe_health_thread() - Health checking thread
 *
 * This thread is only started for the PF of the card.
 *
 * This thread monitors the health of the card. A critical situation
 * is when we read registers which contain -1 (IO_ILLEGAL_VALUE). In
 * this case we need to be recovered from outside. Writing to
 * registers will very likely not work either.
 *
 * This thread must only exit if kthread_should_stop() becomes true.
 *
 * Condition for the health-thread to trigger:
 *   a) when a kthread_stop() request comes in or
 *   b) a critical GFIR occured
 *
 * Informational GFIRs are checked and potentially printed in
 * health_check_interval seconds.
 */
static int genwqe_health_thread(void *data)
{
	int rc, should_stop = 0;
	struct genwqe_dev *cd = data;
	struct pci_dev *pci_dev = cd->pci_dev;
	u64 gfir, gfir_masked, slu_unitcfg, app_unitcfg;

 health_thread_begin:
	while (!kthread_should_stop()) {
		rc = wait_event_interruptible_timeout(cd->health_waitq,
			 (genwqe_health_check_cond(cd, &gfir) ||
			  (should_stop = kthread_should_stop())),
				genwqe_health_check_interval * HZ);

		if (should_stop)
			break;

		if (gfir == IO_ILLEGAL_VALUE) {
			dev_err(&pci_dev->dev,
				"[%s] GFIR=%016llx\n", __func__, gfir);
			goto fatal_error;
		}

		slu_unitcfg = __genwqe_readq(cd, IO_SLU_UNITCFG);
		if (slu_unitcfg == IO_ILLEGAL_VALUE) {
			dev_err(&pci_dev->dev,
				"[%s] SLU_UNITCFG=%016llx\n",
				__func__, slu_unitcfg);
			goto fatal_error;
		}

		app_unitcfg = __genwqe_readq(cd, IO_APP_UNITCFG);
		if (app_unitcfg == IO_ILLEGAL_VALUE) {
			dev_err(&pci_dev->dev,
				"[%s] APP_UNITCFG=%016llx\n",
				__func__, app_unitcfg);
			goto fatal_error;
		}

		gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
		if (gfir == IO_ILLEGAL_VALUE) {
			dev_err(&pci_dev->dev,
				"[%s] %s: GFIR=%016llx\n", __func__,
				(gfir & GFIR_ERR_TRIGGER) ? "err" : "info",
				gfir);
			goto fatal_error;
		}

		gfir_masked = genwqe_fir_checking(cd);
		if (gfir_masked == IO_ILLEGAL_VALUE)
			goto fatal_error;

		/*
		 * GFIR ErrorTrigger bits set => reset the card!
		 * Never do this for old/manufacturing images!
		 */
		if ((gfir_masked) && !cd->skip_recovery &&
		    genwqe_recovery_on_fatal_gfir_required(cd)) {

			cd->card_state = GENWQE_CARD_FATAL_ERROR;

			rc = genwqe_recover_card(cd, 0);
			if (rc < 0) {
				/* FIXME Card is unusable and needs unbind! */
				goto fatal_error;
			}
		}

		if (cd->card_state == GENWQE_CARD_RELOAD_BITSTREAM) {
			/* Userspace requested card bitstream reload */
			rc = genwqe_reload_bistream(cd);
			if (rc)
				goto fatal_error;
		}

		cd->last_gfir = gfir;
		cond_resched();
	}

	return 0;

 fatal_error:
	if (cd->use_platform_recovery) {
		/*
		 * Since we use raw accessors, EEH errors won't be detected
		 * by the platform until we do a non-raw MMIO or config space
		 * read
		 */
		readq(cd->mmio + IO_SLC_CFGREG_GFIR);

		/* We do nothing if the card is going over PCI recovery */
		if (pci_channel_offline(pci_dev))
			return -EIO;

		/*
		 * If it's supported by the platform, we try a fundamental reset
		 * to recover from a fatal error. Otherwise, we continue to wait
		 * for an external recovery procedure to take care of it.
		 */
		rc = genwqe_platform_recovery(cd);
		if (!rc)
			goto health_thread_begin;
	}

	dev_err(&pci_dev->dev,
		"[%s] card unusable. Please trigger unbind!\n", __func__);

	/* Bring down logical devices to inform user space via udev remove. */
	cd->card_state = GENWQE_CARD_FATAL_ERROR;
	genwqe_stop(cd);

	/* genwqe_bus_reset failed(). Now wait for genwqe_remove(). */
	while (!kthread_should_stop())
		cond_resched();

	return -EIO;
}
Example #8
0
/**
 * genwqe_fir_checking() - Check the fault isolation registers of the card
 *
 * If this code works ok, can be tried out with help of the genwqe_poke tool:
 *   sudo ./tools/genwqe_poke 0x8 0xfefefefefef
 *
 * Now the relevant FIRs/sFIRs should be printed out and the driver should
 * invoke recovery (devices are removed and readded).
 */
static u64 genwqe_fir_checking(struct genwqe_dev *cd)
{
	int j, iterations = 0;
	u64 mask, fir, fec, uid, gfir, gfir_masked, sfir, sfec;
	u32 fir_addr, fir_clr_addr, fec_addr, sfir_addr, sfec_addr;
	struct pci_dev *pci_dev = cd->pci_dev;

 healthMonitor:
	iterations++;
	if (iterations > 16) {
		dev_err(&pci_dev->dev, "* exit looping after %d times\n",
			iterations);
		goto fatal_error;
	}

	gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
	if (gfir != 0x0)
		dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n",
				    IO_SLC_CFGREG_GFIR, gfir);
	if (gfir == IO_ILLEGAL_VALUE)
		goto fatal_error;

	/*
	 * Avoid printing when to GFIR bit is on prevents contignous
	 * printout e.g. for the following bug:
	 *   FIR set without a 2ndary FIR/FIR cannot be cleared
	 * Comment out the following if to get the prints:
	 */
	if (gfir == 0)
		return 0;

	gfir_masked = gfir & GFIR_ERR_TRIGGER;  /* fatal errors */

	for (uid = 0; uid < GENWQE_MAX_UNITS; uid++) { /* 0..2 in zEDC */

		/* read the primary FIR (pfir) */
		fir_addr = (uid << 24) + 0x08;
		fir = __genwqe_readq(cd, fir_addr);
		if (fir == 0x0)
			continue;  /* no error in this unit */

		dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n", fir_addr, fir);
		if (fir == IO_ILLEGAL_VALUE)
			goto fatal_error;

		/* read primary FEC */
		fec_addr = (uid << 24) + 0x18;
		fec = __genwqe_readq(cd, fec_addr);

		dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n", fec_addr, fec);
		if (fec == IO_ILLEGAL_VALUE)
			goto fatal_error;

		for (j = 0, mask = 1ULL; j < 64; j++, mask <<= 1) {

			/* secondary fir empty, skip it */
			if ((fir & mask) == 0x0)
				continue;

			sfir_addr = (uid << 24) + 0x100 + 0x08 * j;
			sfir = __genwqe_readq(cd, sfir_addr);

			if (sfir == IO_ILLEGAL_VALUE)
				goto fatal_error;
			dev_err(&pci_dev->dev,
				"* 0x%08x 0x%016llx\n", sfir_addr, sfir);

			sfec_addr = (uid << 24) + 0x300 + 0x08 * j;
			sfec = __genwqe_readq(cd, sfec_addr);

			if (sfec == IO_ILLEGAL_VALUE)
				goto fatal_error;
			dev_err(&pci_dev->dev,
				"* 0x%08x 0x%016llx\n", sfec_addr, sfec);

			gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
			if (gfir == IO_ILLEGAL_VALUE)
				goto fatal_error;

			/* gfir turned on during routine! get out and
			   start over. */
			if ((gfir_masked == 0x0) &&
			    (gfir & GFIR_ERR_TRIGGER)) {
				goto healthMonitor;
			}

			/* do not clear if we entered with a fatal gfir */
			if (gfir_masked == 0x0) {

				/* NEW clear by mask the logged bits */
				sfir_addr = (uid << 24) + 0x100 + 0x08 * j;
				__genwqe_writeq(cd, sfir_addr, sfir);

				dev_dbg(&pci_dev->dev,
					"[HM] Clearing  2ndary FIR 0x%08x with 0x%016llx\n",
					sfir_addr, sfir);

				/*
				 * note, these cannot be error-Firs
				 * since gfir_masked is 0 after sfir
				 * was read. Also, it is safe to do
				 * this write if sfir=0. Still need to
				 * clear the primary. This just means
				 * there is no secondary FIR.
				 */

				/* clear by mask the logged bit. */
				fir_clr_addr = (uid << 24) + 0x10;
				__genwqe_writeq(cd, fir_clr_addr, mask);

				dev_dbg(&pci_dev->dev,
					"[HM] Clearing primary FIR 0x%08x with 0x%016llx\n",
					fir_clr_addr, mask);
			}
		}
	}
	gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
	if (gfir == IO_ILLEGAL_VALUE)
		goto fatal_error;

	if ((gfir_masked == 0x0) && (gfir & GFIR_ERR_TRIGGER)) {
		/*
		 * Check once more that it didn't go on after all the
		 * FIRS were cleared.
		 */
		dev_dbg(&pci_dev->dev, "ACK! Another FIR! Recursing %d!\n",
			iterations);
		goto healthMonitor;
	}
	return gfir_masked;

 fatal_error:
	return IO_ILLEGAL_VALUE;
}
Example #9
0
static int genwqe_health_check_cond(struct genwqe_dev *cd, u64 *gfir)
{
	*gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
	return (*gfir & GFIR_ERR_TRIGGER) &&
		genwqe_recovery_on_fatal_gfir_required(cd);
}