/**
 * version_show() - Print the SLU and APP unit configuration values
 * @dev:  device whose driver data is the genwqe device descriptor
 * @attr: device attribute being read (not used here)
 * @buf:  output buffer receiving the formatted text
 *
 * Reads IO_SLU_UNITCFG and IO_APP_UNITCFG (in that order) and prints both
 * as 16-digit hexadecimal numbers separated by a dot, followed by a newline.
 *
 * Return: the value returned by sprintf(), i.e. the number of characters
 * written to @buf.
 */
static ssize_t version_show(struct device *dev, struct device_attribute *attr,
			    char *buf)
{
	struct genwqe_dev *cd = dev_get_drvdata(dev);
	u64 slu_unitcfg = __genwqe_readq(cd, IO_SLU_UNITCFG);
	u64 app_unitcfg = __genwqe_readq(cd, IO_APP_UNITCFG);

	return sprintf(buf, "%016llx.%016llx\n", slu_unitcfg, app_unitcfg);
}
static int genwqe_read_ids(struct genwqe_dev *cd) { int err = 0; int slu_id; struct pci_dev *pci_dev = cd->pci_dev; cd->slu_unitcfg = __genwqe_readq(cd, IO_SLU_UNITCFG); if (cd->slu_unitcfg == IO_ILLEGAL_VALUE) { dev_err(&pci_dev->dev, "err: SLUID=%016llx\n", cd->slu_unitcfg); err = -EIO; goto out_err; } slu_id = genwqe_get_slu_id(cd); if (slu_id < GENWQE_SLU_ARCH_REQ || slu_id == 0xff) { dev_err(&pci_dev->dev, "err: incompatible SLU Architecture %u\n", slu_id); err = -ENOENT; goto out_err; } cd->app_unitcfg = __genwqe_readq(cd, IO_APP_UNITCFG); if (cd->app_unitcfg == IO_ILLEGAL_VALUE) { dev_err(&pci_dev->dev, "err: APPID=%016llx\n", cd->app_unitcfg); err = -EIO; goto out_err; } genwqe_read_app_id(cd, cd->app_name, sizeof(cd->app_name)); /* * Is access to all registers possible? If we are a VF the * answer is obvious. If we run fully virtualized, we need to * check if we can access all registers. If we do not have * full access we will cause an UR and some informational FIRs * in the PF, but that should not harm. */ if (pci_dev->is_virtfn) cd->is_privileged = 0; else cd->is_privileged = (__genwqe_readq(cd, IO_SLU_BITSTREAM) != IO_ILLEGAL_VALUE); out_err: return err; }
/** * curr_bitstream_show() - Show the current bitstream id * * There is a bug in some old versions of the CPLD which selects the * bitstream, which causes the IO_SLU_BITSTREAM register to report * unreliable data in very rare cases. This makes this sysfs * unreliable up to the point were a new CPLD version is being used. * * Unfortunately there is no automatic way yet to query the CPLD * version, such that you need to manually ensure via programming * tools that you have a recent version of the CPLD software. * * The proposed circumvention is to use a special recovery bitstream * on the backup partition (0) to identify problems while loading the * image. */ static ssize_t curr_bitstream_show(struct device *dev, struct device_attribute *attr, char *buf) { int curr_bitstream; struct genwqe_dev *cd = dev_get_drvdata(dev); curr_bitstream = __genwqe_readq(cd, IO_SLU_BITSTREAM) & 0x1; return sprintf(buf, "%d\n", curr_bitstream); }
/**
 * tempsens_show() - Print the raw temperature sensor register
 * @dev:  device whose driver data is the genwqe device descriptor
 * @attr: device attribute being read (not used here)
 * @buf:  output buffer receiving the formatted text
 *
 * Prints the content of IO_SLU_TEMPERATURE_SENSOR as a 16-digit hexadecimal
 * number followed by a newline; the value is not interpreted here.
 *
 * Return: the value returned by sprintf(), i.e. the number of characters
 * written to @buf.
 */
static ssize_t tempsens_show(struct device *dev, struct device_attribute *attr,
			     char *buf)
{
	struct genwqe_dev *cd = dev_get_drvdata(dev);
	u64 raw_sensor = __genwqe_readq(cd, IO_SLU_TEMPERATURE_SENSOR);

	return sprintf(buf, "%016llx\n", raw_sensor);
}
/**
 * queue_working_time_show() - Print the queue working time register
 * @dev:  device whose driver data is the genwqe device descriptor
 * @attr: device attribute being read (not used here)
 * @buf:  output buffer receiving the formatted text
 *
 * Prints the content of IO_SLC_QUEUE_WTIME as a 16-digit hexadecimal number
 * followed by a newline; the value is not interpreted here.
 *
 * Return: the value returned by sprintf(), i.e. the number of characters
 * written to @buf.
 */
static ssize_t queue_working_time_show(struct device *dev,
				       struct device_attribute *attr,
				       char *buf)
{
	struct genwqe_dev *cd = dev_get_drvdata(dev);
	u64 wtime = __genwqe_readq(cd, IO_SLC_QUEUE_WTIME);

	return sprintf(buf, "%016llx\n", wtime);
}
/**
 * freerunning_timer_show() - Print the free running timer register
 * @dev:  device whose driver data is the genwqe device descriptor
 * @attr: device attribute being read (not used here)
 * @buf:  output buffer receiving the formatted text
 *
 * Prints the content of IO_SLC_FREE_RUNNING_TIMER as a 16-digit hexadecimal
 * number followed by a newline; the value is not interpreted here.
 *
 * Return: the value returned by sprintf(), i.e. the number of characters
 * written to @buf.
 */
static ssize_t freerunning_timer_show(struct device *dev,
				      struct device_attribute *attr,
				      char *buf)
{
	struct genwqe_dev *cd = dev_get_drvdata(dev);
	u64 timer = __genwqe_readq(cd, IO_SLC_FREE_RUNNING_TIMER);

	return sprintf(buf, "%016llx\n", timer);
}
/**
 * genwqe_health_thread() - Health checking thread
 * @data: pointer to the genwqe device descriptor (struct genwqe_dev)
 *
 * This thread is only started for the PF of the card.
 *
 * This thread monitors the health of the card. A critical situation
 * is when we read registers which contain -1 (IO_ILLEGAL_VALUE). In
 * this case we need to be recovered from outside. Writing to
 * registers will very likely not work either.
 *
 * This thread must only exit if kthread_should_stop() becomes true.
 * NOTE(review): the fatal_error path also returns -EIO directly when
 * pci_channel_offline() reports true - confirm this is intended.
 *
 * Condition for the health-thread to trigger:
 *   a) when a kthread_stop() request comes in or
 *   b) a critical GFIR occurred
 *
 * Informational GFIRs are checked and potentially printed in
 * health_check_interval seconds.
 *
 * Return: 0 when the loop ends because a stop was requested, -EIO after
 * a fatal error.
 */
static int genwqe_health_thread(void *data)
{
	int rc, should_stop = 0;
	struct genwqe_dev *cd = data;
	struct pci_dev *pci_dev = cd->pci_dev;
	u64 gfir, gfir_masked, slu_unitcfg, app_unitcfg;

	/* Re-entry point after a successful platform recovery. */
 health_thread_begin:
	while (!kthread_should_stop()) {
		/*
		 * gfir is written by genwqe_health_check_cond() as part of
		 * the wait condition; should_stop records whether the
		 * wake-up was caused by a stop request. The return value
		 * of the wait itself is not examined.
		 */
		rc = wait_event_interruptible_timeout(cd->health_waitq,
			 (genwqe_health_check_cond(cd, &gfir) ||
			  (should_stop = kthread_should_stop())),
				genwqe_health_check_interval * HZ);

		if (should_stop)
			break;

		/* Any register reading IO_ILLEGAL_VALUE is treated as fatal. */
		if (gfir == IO_ILLEGAL_VALUE) {
			dev_err(&pci_dev->dev,
				"[%s] GFIR=%016llx\n", __func__, gfir);
			goto fatal_error;
		}

		slu_unitcfg = __genwqe_readq(cd, IO_SLU_UNITCFG);
		if (slu_unitcfg == IO_ILLEGAL_VALUE) {
			dev_err(&pci_dev->dev,
				"[%s] SLU_UNITCFG=%016llx\n",
				__func__, slu_unitcfg);
			goto fatal_error;
		}

		app_unitcfg = __genwqe_readq(cd, IO_APP_UNITCFG);
		if (app_unitcfg == IO_ILLEGAL_VALUE) {
			dev_err(&pci_dev->dev,
				"[%s] APP_UNITCFG=%016llx\n",
				__func__, app_unitcfg);
			goto fatal_error;
		}

		/* Re-read GFIR; this value is what ends up in last_gfir. */
		gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
		if (gfir == IO_ILLEGAL_VALUE) {
			dev_err(&pci_dev->dev,
				"[%s] %s: GFIR=%016llx\n", __func__,
				(gfir & GFIR_ERR_TRIGGER) ? "err" : "info",
				gfir);
			goto fatal_error;
		}

		/* Walk and log the FIRs; non-zero result means fatal bits. */
		gfir_masked = genwqe_fir_checking(cd);
		if (gfir_masked == IO_ILLEGAL_VALUE)
			goto fatal_error;

		/*
		 * GFIR ErrorTrigger bits set => reset the card!
		 * Never do this for old/manufacturing images!
		 */
		if ((gfir_masked) && !cd->skip_recovery &&
		    genwqe_recovery_on_fatal_gfir_required(cd)) {

			cd->card_state = GENWQE_CARD_FATAL_ERROR;

			rc = genwqe_recover_card(cd, 0);
			if (rc < 0) {
				/* FIXME Card is unusable and needs unbind! */
				goto fatal_error;
			}
		}

		if (cd->card_state == GENWQE_CARD_RELOAD_BITSTREAM) {
			/* Userspace requested card bitstream reload */
			rc = genwqe_reload_bistream(cd);
			if (rc)
				goto fatal_error;
		}

		cd->last_gfir = gfir;
		cond_resched();
	}

	return 0;

 fatal_error:
	if (cd->use_platform_recovery) {
		/*
		 * Since we use raw accessors, EEH errors won't be detected
		 * by the platform until we do a non-raw MMIO or config space
		 * read
		 */
		readq(cd->mmio + IO_SLC_CFGREG_GFIR);

		/* We do nothing if the card is going over PCI recovery */
		if (pci_channel_offline(pci_dev))
			return -EIO;

		/*
		 * If it's supported by the platform, we try a fundamental reset
		 * to recover from a fatal error. Otherwise, we continue to wait
		 * for an external recovery procedure to take care of it.
		 */
		rc = genwqe_platform_recovery(cd);
		if (!rc)
			goto health_thread_begin;
	}

	dev_err(&pci_dev->dev,
		"[%s] card unusable. Please trigger unbind!\n", __func__);

	/* Bring down logical devices to inform user space via udev remove. */
	cd->card_state = GENWQE_CARD_FATAL_ERROR;
	genwqe_stop(cd);

	/*
	 * genwqe_bus_reset failed(). Now wait for genwqe_remove().
	 * NOTE(review): this loop polls kthread_should_stop() without
	 * sleeping, only yielding via cond_resched() - consider whether a
	 * sleeping wait would be preferable.
	 */
	while (!kthread_should_stop())
		cond_resched();

	return -EIO;
}
/**
 * genwqe_fir_checking() - Check the fault isolation registers of the card
 * @cd: genwqe device descriptor
 *
 * Reads the GFIR and, if it is non-zero, walks the primary FIR/FEC of each
 * unit and the secondary FIR/FEC behind every set primary FIR bit, logging
 * each value. If no GFIR_ERR_TRIGGER bit was set when the pass started, the
 * logged secondary and primary FIR bits are cleared; if such a bit shows up
 * while the pass is running, the whole pass is restarted (at most 16 passes).
 *
 * If this code works ok, can be tried out with help of the genwqe_poke tool:
 *   sudo ./tools/genwqe_poke 0x8 0xfefefefefef
 *
 * Now the relevant FIRs/sFIRs should be printed out and the driver should
 * invoke recovery (devices are removed and readded).
 *
 * Return: 0 if the GFIR reads zero, otherwise the GFIR_ERR_TRIGGER bits of
 * the GFIR as read at the start of the last pass; IO_ILLEGAL_VALUE if any
 * register read returned IO_ILLEGAL_VALUE or the pass limit was exceeded.
 */
static u64 genwqe_fir_checking(struct genwqe_dev *cd)
{
	int j, iterations = 0;
	u64 mask, fir, fec, uid, gfir, gfir_masked, sfir, sfec;
	u32 fir_addr, fir_clr_addr, fec_addr, sfir_addr, sfec_addr;
	struct pci_dev *pci_dev = cd->pci_dev;

	/* Start (or restart) of one complete pass over all units. */
 healthMonitor:
	iterations++;
	if (iterations > 16) {
		dev_err(&pci_dev->dev, "* exit looping after %d times\n",
			iterations);
		goto fatal_error;
	}

	gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
	if (gfir != 0x0)
		dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n",
				    IO_SLC_CFGREG_GFIR, gfir);
	if (gfir == IO_ILLEGAL_VALUE)
		goto fatal_error;

	/*
	 * Avoid printing when no GFIR bit is on; this prevents continuous
	 * printout e.g. for the following bug:
	 *   FIR set without a 2ndary FIR/FIR cannot be cleared
	 * Comment out the following if to get the prints:
	 */
	if (gfir == 0)
		return 0;

	gfir_masked = gfir & GFIR_ERR_TRIGGER;  /* fatal errors */

	for (uid = 0; uid < GENWQE_MAX_UNITS; uid++) { /* 0..2 in zEDC */

		/* read the primary FIR (pfir) */
		fir_addr = (uid << 24) + 0x08;
		fir = __genwqe_readq(cd, fir_addr);
		if (fir == 0x0)
			continue;  /* no error in this unit */

		dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n", fir_addr, fir);
		if (fir == IO_ILLEGAL_VALUE)
			goto fatal_error;

		/* read primary FEC */
		fec_addr = (uid << 24) + 0x18;
		fec = __genwqe_readq(cd, fec_addr);

		dev_err(&pci_dev->dev, "* 0x%08x 0x%016llx\n", fec_addr, fec);
		if (fec == IO_ILLEGAL_VALUE)
			goto fatal_error;

		/* One secondary FIR/FEC pair per bit of the primary FIR. */
		for (j = 0, mask = 1ULL; j < 64; j++, mask <<= 1) {

			/* secondary fir empty, skip it */
			if ((fir & mask) == 0x0)
				continue;

			sfir_addr = (uid << 24) + 0x100 + 0x08 * j;
			sfir = __genwqe_readq(cd, sfir_addr);

			if (sfir == IO_ILLEGAL_VALUE)
				goto fatal_error;
			dev_err(&pci_dev->dev,
				"* 0x%08x 0x%016llx\n", sfir_addr, sfir);

			sfec_addr = (uid << 24) + 0x300 + 0x08 * j;
			sfec = __genwqe_readq(cd, sfec_addr);

			if (sfec == IO_ILLEGAL_VALUE)
				goto fatal_error;
			dev_err(&pci_dev->dev,
				"* 0x%08x 0x%016llx\n", sfec_addr, sfec);

			gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
			if (gfir == IO_ILLEGAL_VALUE)
				goto fatal_error;

			/*
			 * gfir turned on during routine! get out and
			 * start over.
			 */
			if ((gfir_masked == 0x0) &&
			    (gfir & GFIR_ERR_TRIGGER)) {
				goto healthMonitor;
			}

			/* do not clear if we entered with a fatal gfir */
			if (gfir_masked == 0x0) {

				/* NEW clear by mask the logged bits */
				sfir_addr = (uid << 24) + 0x100 + 0x08 * j;
				__genwqe_writeq(cd, sfir_addr, sfir);

				dev_dbg(&pci_dev->dev,
					"[HM] Clearing 2ndary FIR 0x%08x with 0x%016llx\n",
					sfir_addr, sfir);

				/*
				 * note, these cannot be error-Firs
				 * since gfir_masked is 0 after sfir
				 * was read. Also, it is safe to do
				 * this write if sfir=0. Still need to
				 * clear the primary. This just means
				 * there is no secondary FIR.
				 */

				/* clear by mask the logged bit. */
				fir_clr_addr = (uid << 24) + 0x10;
				__genwqe_writeq(cd, fir_clr_addr, mask);

				dev_dbg(&pci_dev->dev,
					"[HM] Clearing primary FIR 0x%08x with 0x%016llx\n",
					fir_clr_addr, mask);
			}
		}
	}
	gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);
	if (gfir == IO_ILLEGAL_VALUE)
		goto fatal_error;

	if ((gfir_masked == 0x0) && (gfir & GFIR_ERR_TRIGGER)) {
		/*
		 * Check once more that it didn't go on after all the
		 * FIRS were cleared.
		 */
		dev_dbg(&pci_dev->dev, "ACK! Another FIR! Recursing %d!\n",
			iterations);
		goto healthMonitor;
	}
	return gfir_masked;

	/* Single exit for unreadable registers and the pass limit. */
 fatal_error:
	return IO_ILLEGAL_VALUE;
}
/**
 * genwqe_health_check_cond() - Sample the GFIR and test for a fatal trigger
 * @cd:   genwqe device descriptor
 * @gfir: output; always receives the value read from IO_SLC_CFGREG_GFIR
 *
 * Return: 1 if a GFIR_ERR_TRIGGER bit is set in the value read and
 * genwqe_recovery_on_fatal_gfir_required() returns non-zero, 0 otherwise.
 */
static int genwqe_health_check_cond(struct genwqe_dev *cd, u64 *gfir)
{
	*gfir = __genwqe_readq(cd, IO_SLC_CFGREG_GFIR);

	/* No error trigger bit set: the recovery check is not consulted. */
	if (!(*gfir & GFIR_ERR_TRIGGER))
		return 0;

	return genwqe_recovery_on_fatal_gfir_required(cd) ? 1 : 0;
}