int
zfsctl_mount_snapshot(struct path *path, int flags)
{
	struct dentry *dentry = path->dentry;
	struct inode *ip = dentry->d_inode;
	zfs_sb_t *zsb = ITOZSB(ip);
	char *full_name, *full_path;
	zfs_snapentry_t *sep;
	zfs_snapentry_t search;
	char *argv[] = { "/bin/sh", "-c", NULL, NULL };
	char *envp[] = { NULL };
	int error;

	ZFS_ENTER(zsb);

	full_name = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
	full_path = kmem_zalloc(PATH_MAX, KM_SLEEP);

	error = zfsctl_snapshot_zname(ip, dname(dentry), MAXNAMELEN, full_name);
	if (error)
		goto error;

	error = zfsctl_snapshot_zpath(path, PATH_MAX, full_path);
	if (error)
		goto error;

	/*
	 * Attempt to mount the snapshot from user space.  Normally this
	 * would be done using the vfs_kern_mount() function, however that
	 * function is marked GPL-only and cannot be used.  On error we are
	 * careful to log the real error to the console and return EISDIR
	 * to safely abort the automount.  This should be very rare.
	 */
	argv[2] = kmem_asprintf(SET_MOUNT_CMD, full_name, full_path);
	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
	strfree(argv[2]);
	if (error) {
		printk("ZFS: Unable to automount %s at %s: %d\n",
		    full_name, full_path, error);
		error = EISDIR;
		goto error;
	}

	mutex_enter(&zsb->z_ctldir_lock);

	/*
	 * Ensure a previous entry does not exist; if it does, safely remove
	 * it and cancel the outstanding expiration.  This can occur when a
	 * snapshot is manually unmounted and then an automount is triggered.
	 */
	search.se_name = full_name;
	sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
	if (sep) {
		avl_remove(&zsb->z_ctldir_snaps, sep);
		taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
		zfsctl_sep_free(sep);
	}

	sep = zfsctl_sep_alloc();
	sep->se_name = full_name;
	sep->se_path = full_path;
	sep->se_inode = ip;
	avl_add(&zsb->z_ctldir_snaps, sep);

	sep->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
	    zfsctl_expire_snapshot, sep, TQ_SLEEP,
	    ddi_get_lbolt() + zfs_expire_snapshot * HZ);

	mutex_exit(&zsb->z_ctldir_lock);
error:
	if (error) {
		kmem_free(full_name, MAXNAMELEN);
		kmem_free(full_path, PATH_MAX);
	}

	ZFS_EXIT(zsb);

	return (error);
}
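/*
 * A minimal sketch (not standalone; assumes an SPL kernel context) of the
 * deferred-expiration pattern used above: taskq_dispatch_delay() takes an
 * absolute expiration time in lbolt ticks, so rearming an entry means
 * canceling the old task id first.  The entry type and callback below are
 * hypothetical stand-ins; the SPL calls themselves are the ones used above.
 */
typedef struct my_entry {			/* hypothetical entry type */
	taskqid_t	e_taskqid;
} my_entry_t;

extern void my_expire_func(void *arg);		/* hypothetical callback */

static void
sketch_reschedule_expiry(taskq_t *tq, my_entry_t *ep, int expire_secs)
{
	/* Cancel any outstanding expiration before rearming it. */
	(void) taskq_cancel_id(tq, ep->e_taskqid);

	/* The expiration is absolute: now (in ticks) plus the delay. */
	ep->e_taskqid = taskq_dispatch_delay(tq, my_expire_func, ep,
	    TQ_SLEEP, ddi_get_lbolt() + expire_secs * HZ);
}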
/**
 * Worker for rtSemMutexSolRequest that handles the case where we go to sleep.
 *
 * @returns VINF_SUCCESS, VERR_INTERRUPTED, or VERR_SEM_DESTROYED.
 *          Returns without owning the mutex.
 * @param   pThis           The mutex instance.
 * @param   cMillies        The timeout, must be > 0 or RT_INDEFINITE_WAIT.
 * @param   fInterruptible  The wait type.
 *
 * @remarks This needs to be called with the mutex object held!
 */
static int rtSemMutexSolRequestSleep(PRTSEMMUTEXINTERNAL pThis, RTMSINTERVAL cMillies,
                                     bool fInterruptible)
{
    int rc = VERR_GENERAL_FAILURE;
    Assert(cMillies > 0);

    /*
     * Now we wait (sleep; although might spin and then sleep) & reference the mutex.
     */
    ASMAtomicIncU32(&pThis->cWaiters);
    ASMAtomicIncU32(&pThis->cRefs);

    if (cMillies != RT_INDEFINITE_WAIT)
    {
        clock_t cTicks   = drv_usectohz((clock_t)(cMillies * 1000L));
        clock_t cTimeout = ddi_get_lbolt();
        cTimeout        += cTicks;
        if (fInterruptible)
            rc = cv_timedwait_sig(&pThis->Cnd, &pThis->Mtx, cTimeout);
        else
            rc = cv_timedwait(&pThis->Cnd, &pThis->Mtx, cTimeout);
    }
    else
    {
        if (fInterruptible)
            rc = cv_wait_sig(&pThis->Cnd, &pThis->Mtx);
        else
        {
            cv_wait(&pThis->Cnd, &pThis->Mtx);
            rc = 1;
        }
    }

    ASMAtomicDecU32(&pThis->cWaiters);
    if (rc > 0)
    {
        if (pThis->u32Magic == RTSEMMUTEX_MAGIC)
        {
            if (pThis->hOwnerThread == NIL_RTNATIVETHREAD)
            {
                /*
                 * Woken up by a release from another thread.
                 */
                Assert(pThis->cRecursions == 0);
                pThis->cRecursions = 1;
                pThis->hOwnerThread = RTThreadNativeSelf();
                rc = VINF_SUCCESS;
            }
            else
            {
                /*
                 * Interrupted by some signal.
                 */
                rc = VERR_INTERRUPTED;
            }
        }
        else
        {
            /*
             * Awakened due to the destruction-in-progress broadcast.
             * We will clean up if we're the last waiter.
             */
            rc = VERR_SEM_DESTROYED;
        }
    }
    else if (rc == -1)
    {
        /*
         * Timed out.
         */
        rc = VERR_TIMEOUT;
    }
    else
    {
        /*
         * Condition may not have been met, returned due to pending signal.
         */
        rc = VERR_INTERRUPTED;
    }

    if (!ASMAtomicDecU32(&pThis->cRefs))
    {
        Assert(RT_FAILURE_NP(rc));
        mutex_exit(&pThis->Mtx);
        cv_destroy(&pThis->Cnd);
        mutex_destroy(&pThis->Mtx);
        RTMemFree(pThis);
        return rc;
    }

    return rc;
}
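/*
 * A minimal sketch (assumes a Solaris kernel context; names are
 * illustrative) of the condvar return-value convention the function above
 * decodes: cv_timedwait()/cv_timedwait_sig() return -1 on timeout,
 * cv_wait_sig()/cv_timedwait_sig() return 0 when interrupted by a signal,
 * and any value > 0 means the thread was signalled or broadcast to.
 */
static int
sketch_wait_outcome(kcondvar_t *cv, kmutex_t *mtx, clock_t abs_timeout)
{
	int rc = cv_timedwait_sig(cv, mtx, abs_timeout);

	if (rc > 0)
		return (0);	/* woken by cv_signal()/cv_broadcast() */
	else if (rc == -1)
		return (ETIME);	/* absolute timeout reached */
	else
		return (EINTR);	/* pending signal, condition unmet */
}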
static int
splat_taskq_test10(struct file *file, void *arg)
{
	taskq_t *tq;
	splat_taskq_arg_t **tqas;
	atomic_t count;
	int i, j, rc = 0;
	int minalloc = 1;
	int maxalloc = 10;
	int nr_tasks = 100;
	int canceled = 0;
	int completed = 0;
	int blocked = 0;
	clock_t start, cancel;

	tqas = vmalloc(sizeof(*tqas) * nr_tasks);
	if (tqas == NULL)
		return -ENOMEM;
	memset(tqas, 0, sizeof(*tqas) * nr_tasks);

	splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
	    "Taskq '%s' creating (%s dispatch) (%d/%d/%d)\n",
	    SPLAT_TASKQ_TEST10_NAME, "delay", minalloc, maxalloc, nr_tasks);
	if ((tq = taskq_create(SPLAT_TASKQ_TEST10_NAME, 3, maxclsyspri,
	    minalloc, maxalloc, TASKQ_PREPOPULATE)) == NULL) {
		splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
		    "Taskq '%s' create failed\n", SPLAT_TASKQ_TEST10_NAME);
		rc = -EINVAL;
		goto out_free;
	}

	atomic_set(&count, 0);

	for (i = 0; i < nr_tasks; i++) {
		splat_taskq_arg_t *tq_arg;
		uint32_t rnd;

		/* A random timeout in jiffies of at most 5 seconds */
		get_random_bytes((void *)&rnd, 4);
		rnd = rnd % (5 * HZ);

		tq_arg = kmem_alloc(sizeof(splat_taskq_arg_t), KM_SLEEP);
		tq_arg->file = file;
		tq_arg->name = SPLAT_TASKQ_TEST10_NAME;
		tq_arg->count = &count;
		tqas[i] = tq_arg;

		/*
		 * Dispatch every third task immediately to mix it up; the
		 * cancel code is inherently racy and we want to try and
		 * provoke any subtle concurrency issues.
		 */
		if ((i % 3) == 0) {
			tq_arg->expire = ddi_get_lbolt();
			tq_arg->id = taskq_dispatch(tq, splat_taskq_test10_func,
			    tq_arg, TQ_SLEEP);
		} else {
			tq_arg->expire = ddi_get_lbolt() + rnd;
			tq_arg->id = taskq_dispatch_delay(tq,
			    splat_taskq_test10_func,
			    tq_arg, TQ_SLEEP, ddi_get_lbolt() + rnd);
		}

		if (tq_arg->id == 0) {
			splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
			    "Taskq '%s' dispatch failed\n",
			    SPLAT_TASKQ_TEST10_NAME);
			kmem_free(tq_arg, sizeof(splat_taskq_arg_t));
			taskq_wait(tq);
			rc = -EINVAL;
			goto out;
		} else {
			splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
			    "Taskq '%s' dispatch %lu in %lu jiffies\n",
			    SPLAT_TASKQ_TEST10_NAME, (unsigned long)tq_arg->id,
			    !(i % 3) ? 0 : tq_arg->expire - ddi_get_lbolt());
		}
	}

	/*
	 * Start randomly canceling tasks for the duration of the test.  We
	 * happen to know the valid task ids will be in the range 1..nr_tasks
	 * because the taskq is private and was just created.  However, we
	 * have no idea whether a particular task has already executed or not.
	 */
	splat_vprint(file, SPLAT_TASKQ_TEST10_NAME, "Taskq '%s' randomly "
	    "canceling task ids\n", SPLAT_TASKQ_TEST10_NAME);

	start = ddi_get_lbolt();
	i = 0;

	while (ddi_time_before(ddi_get_lbolt(), start + 5 * HZ)) {
		taskqid_t id;
		uint32_t rnd;

		i++;
		cancel = ddi_get_lbolt();
		get_random_bytes((void *)&rnd, 4);
		id = 1 + (rnd % nr_tasks);
		rc = taskq_cancel_id(tq, id);

		/*
		 * Keep track of the results of the random cancels.
		 */
		if (rc == 0) {
			canceled++;
		} else if (rc == ENOENT) {
			completed++;
		} else if (rc == EBUSY) {
			blocked++;
		} else {
			rc = -EINVAL;
			break;
		}

		/*
		 * Verify we never get blocked too long in taskq_cancel_id().
		 * The worst case is 10ms if we happen to cancel the task
		 * which is currently executing.  We allow a factor of 2x.
		 */
		if (ddi_get_lbolt() - cancel > HZ / 50) {
			splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
			    "Taskq '%s' cancel for %lu took %lu\n",
			    SPLAT_TASKQ_TEST10_NAME, (unsigned long)id,
			    ddi_get_lbolt() - cancel);
			rc = -ETIMEDOUT;
			break;
		}

		get_random_bytes((void *)&rnd, 4);
		msleep(1 + (rnd % 100));
		rc = 0;
	}

	taskq_wait(tq);

	/*
	 * Cross check the results of taskq_cancel_id() with the number of
	 * times the dispatched function actually ran successfully.
	 */
	if ((rc == 0) && (nr_tasks - canceled != atomic_read(&count)))
		rc = -EDOM;

	splat_vprint(file, SPLAT_TASKQ_TEST10_NAME, "Taskq '%s' %d attempts, "
	    "%d canceled, %d completed, %d blocked, %d/%d tasks run\n",
	    SPLAT_TASKQ_TEST10_NAME, i, canceled, completed, blocked,
	    atomic_read(&count), nr_tasks);
	splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
	    "Taskq '%s' destroying %d\n", SPLAT_TASKQ_TEST10_NAME, rc);
out:
	taskq_destroy(tq);
out_free:
	for (j = 0; j < nr_tasks && tqas[j] != NULL; j++)
		kmem_free(tqas[j], sizeof(splat_taskq_arg_t));
	vfree(tqas);

	return rc;
}
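/*
 * A minimal sketch (assumes the SPL taskq API used above) of the
 * taskq_cancel_id() result convention the test tallies: 0 means the
 * pending task was canceled, ENOENT means it already ran or the id is
 * unknown, and EBUSY means it was executing and could not be canceled.
 */
static const char *
sketch_cancel_result(taskq_t *tq, taskqid_t id)
{
	switch (taskq_cancel_id(tq, id)) {
	case 0:
		return ("canceled");
	case ENOENT:
		return ("already completed or unknown id");
	case EBUSY:
		return ("currently executing; not canceled");
	default:
		return ("unexpected error");
	}
}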
static void
txg_sync_thread(void *arg)
{
	dsl_pool_t *dp = arg;
	spa_t *spa = dp->dp_spa;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;
	uint64_t start, delta;

	txg_thread_enter(tx, &cpr);

	start = delta = 0;
	for (;;) {
		uint64_t timeout = zfs_txg_timeout * hz;
		uint64_t timer;
		uint64_t txg;

		/*
		 * We sync when we're scanning, there's someone waiting
		 * on us, or the quiesce thread has handed off a txg to
		 * us, or we have reached our timeout.
		 */
		timer = (delta >= timeout ? 0 : timeout - delta);
		while (!dsl_scan_active(dp->dp_scan) &&
		    !tx->tx_exiting && timer > 0 &&
		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
		    tx->tx_quiesced_txg == 0 &&
		    dp->dp_dirty_total < zfs_dirty_data_sync) {
			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
			delta = ddi_get_lbolt() - start;
			timer = (delta > timeout ? 0 : timeout - delta);
		}

		/*
		 * Wait until the quiesce thread hands off a txg to us,
		 * prompting it to do so if necessary.
		 */
		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
			cv_broadcast(&tx->tx_quiesce_more_cv);
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
		}

		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);

		/*
		 * Consume the quiesced txg which has been handed off to
		 * us.  This may cause the quiescing thread to now be
		 * able to quiesce another txg, so we must signal it.
		 */
		txg = tx->tx_quiesced_txg;
		tx->tx_quiesced_txg = 0;
		tx->tx_syncing_txg = txg;
		DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_quiesce_more_cv);

		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
		mutex_exit(&tx->tx_sync_lock);

		start = ddi_get_lbolt();
		spa_sync(spa, txg);
		delta = ddi_get_lbolt() - start;

		mutex_enter(&tx->tx_sync_lock);
		tx->tx_synced_txg = txg;
		tx->tx_syncing_txg = 0;
		DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_sync_done_cv);

		/*
		 * Dispatch commit callbacks to worker threads.
		 */
		txg_dispatch_callbacks(dp, txg);
	}
}
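/*
 * A minimal sketch of the timeout bookkeeping above: the sync thread
 * budgets zfs_txg_timeout seconds per pass, measures how long spa_sync()
 * took in lbolt ticks, and only sleeps for the remainder, never a
 * negative time.  The helper name is illustrative.
 */
static uint64_t
sketch_remaining_timer(uint64_t start, uint64_t timeout)
{
	uint64_t delta = ddi_get_lbolt() - start;	/* ticks already spent */

	return (delta >= timeout ? 0 : timeout - delta);
}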
/*
 * cvc_send_to_iosram()
 *	Flush as much data as possible to the CONO chunk.  If successful, free
 *	any mblks that were completely transmitted, update the b_rptr field in
 *	the first remaining mblk if it was partially transmitted, and update
 *	the caller's pointer to the new head of the mblk chain.  Since the
 *	software that will be pulling this data out of IOSRAM (dxs on the SC)
 *	is just polling at some frequency, we avoid attempts to flush data to
 *	IOSRAM any faster than a large divisor of that polling frequency.
 *
 *	Note that "cvc_buf_t out" is only declared "static" to keep it from
 *	being allocated on the stack.  Allocating 1K+ structures on the stack
 *	seems rather antisocial.
 */
static void
cvc_send_to_iosram(mblk_t **chainpp)
{
	int			rval;
	uint8_t			dvalid;
	uchar_t			*cp;
	mblk_t			*mp;
	mblk_t			*last_empty_mp;
	static clock_t		last_flush = (clock_t)-1;
	static cvc_buf_t	out;	/* see note above about static */

	ASSERT(chainpp != NULL);

	/*
	 * We _do_ have something to do, right?
	 */
	if (*chainpp == NULL) {
		return;
	}

	/*
	 * We can actually increase throughput by throttling back on attempts
	 * to flush data to IOSRAM, since trying to write every little bit of
	 * data as it shows up will actually generate more delays waiting for
	 * the SC to pick up each of those bits.  Instead, we'll avoid
	 * attempting to write data to IOSRAM any faster than half of the
	 * polling frequency we expect the SC to be using.
	 */
	if (ddi_get_lbolt() - last_flush <
	    drv_usectohz(CVC_IOSRAM_POLL_USECS / 2)) {
		return;
	}

	/*
	 * If IOSRAM is inaccessible or the CONO chunk still holds data that
	 * hasn't been picked up by the SC, there's nothing we can do right
	 * now.
	 */
	rval = iosram_get_flag(IOSRAM_KEY_CONO, &dvalid, NULL);
	if ((rval != 0) || (dvalid == IOSRAM_DATA_VALID)) {
		if ((rval != 0) && (rval != EAGAIN)) {
			cmn_err(CE_WARN,
			    "cvc_send_to_iosram: get_flag ret %d", rval);
		}
		return;
	}

	/*
	 * Copy up to MAX_XFER_COUTPUT chars from the mblk chain into a buffer.
	 * Don't change any of the mblks just yet, since we can't be certain
	 * that we'll be successful in writing data to the CONO chunk.
	 */
	out.count = 0;
	mp = *chainpp;
	cp = mp->b_rptr;
	last_empty_mp = NULL;
	while ((mp != NULL) && (out.count < MAX_XFER_COUTPUT)) {
		/*
		 * Process as many of the characters in the current mblk as
		 * possible.
		 */
		while ((cp != mp->b_wptr) && (out.count < MAX_XFER_COUTPUT)) {
			out.buffer[out.count++] = *cp++;
		}

		/*
		 * Did we process that entire mblk?  If so, move on to the
		 * next one.  If not, we're done filling the buffer even if
		 * there's space left, because apparently there wasn't room
		 * to process the next character.
		 */
		if (cp != mp->b_wptr) {
			break;
		}

		/*
		 * When this loop terminates, last_empty_mp will point to the
		 * last mblk that was completely processed, mp will point to
		 * the following mblk (or NULL if no more mblks exist), and cp
		 * will point to the first untransmitted character in the mblk
		 * pointed to by mp.  We'll need this data to update the mblk
		 * chain if all of the data is successfully transmitted.
		 */
		last_empty_mp = mp;
		mp = mp->b_cont;
		cp = (mp != NULL) ? mp->b_rptr : NULL;
	}

	/*
	 * If we succeeded in preparing some data, try to transmit it through
	 * IOSRAM.  First write the count and the data, which can be done in a
	 * single operation thanks to the buffer structure we use, then set
	 * the data_valid flag if the first step succeeded.
	 */
	if (out.count != 0) {
		rval = iosram_wr(IOSRAM_KEY_CONO, COUNT_OFFSET,
		    CONSBUF_COUNT_SIZE + out.count, (caddr_t)&out);
		if ((rval != 0) && (rval != EAGAIN)) {
			cmn_err(CE_WARN, "cvc_putc: write ret %d", rval);
		}

		/* if the data write succeeded, set the data_valid flag */
		if (rval == 0) {
			rval = iosram_set_flag(IOSRAM_KEY_CONO,
			    IOSRAM_DATA_VALID, IOSRAM_INT_NONE);
			if ((rval != 0) && (rval != EAGAIN)) {
				cmn_err(CE_WARN,
				    "cvc_putc: set flags for outbuf ret %d",
				    rval);
			}
		}

		/*
		 * If we successfully transmitted any data, modify the
		 * caller's mblk chain to remove the data that was
		 * transmitted, freeing all mblks that were completely
		 * processed.
		 */
		if (rval == 0) {
			last_flush = ddi_get_lbolt();

			/*
			 * If any data is left over, update the b_rptr field
			 * of the first remaining mblk in case some of its
			 * data was processed.
			 */
			if (mp != NULL) {
				mp->b_rptr = cp;
			}

			/*
			 * If any mblks have been emptied, unlink them from
			 * the residual chain, free them, and update the
			 * caller's mblk pointer.
			 */
			if (last_empty_mp != NULL) {
				last_empty_mp->b_cont = NULL;
				freemsg(*chainpp);
				*chainpp = mp;
			}
		}
	}
}
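/*
 * A minimal sketch of the rate-limit pattern above: remember the lbolt
 * tick of the last successful flush and refuse to try again until at
 * least half of the consumer's polling period has elapsed.  The interval
 * constant and work function here are hypothetical.
 */
extern int my_do_flush(void);			/* hypothetical worker */

static void
sketch_throttled_flush(void)
{
	static clock_t last_flush = (clock_t)-1;

	/* drv_usectohz() converts a microsecond interval to lbolt ticks. */
	if (ddi_get_lbolt() - last_flush <
	    drv_usectohz(MY_POLL_USECS / 2))
		return;

	if (my_do_flush() == 0)		/* only update on success */
		last_flush = ddi_get_lbolt();
}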
static int
splat_taskq_test9(struct file *file, void *arg)
{
	taskq_t *tq;
	atomic_t count;
	int i, rc = 0;
	int minalloc = 1;
	int maxalloc = 10;
	int nr_tasks = 100;

	splat_vprint(file, SPLAT_TASKQ_TEST9_NAME,
	    "Taskq '%s' creating (%s dispatch) (%d/%d/%d)\n",
	    SPLAT_TASKQ_TEST9_NAME, "delay", minalloc, maxalloc, nr_tasks);
	if ((tq = taskq_create(SPLAT_TASKQ_TEST9_NAME, 3, maxclsyspri,
	    minalloc, maxalloc, TASKQ_PREPOPULATE)) == NULL) {
		splat_vprint(file, SPLAT_TASKQ_TEST9_NAME,
		    "Taskq '%s' create failed\n", SPLAT_TASKQ_TEST9_NAME);
		return -EINVAL;
	}

	atomic_set(&count, 0);

	for (i = 1; i <= nr_tasks; i++) {
		splat_taskq_arg_t *tq_arg;
		taskqid_t id;
		uint32_t rnd;

		/* A random timeout in jiffies of at most 5 seconds */
		get_random_bytes((void *)&rnd, 4);
		rnd = rnd % (5 * HZ);

		tq_arg = kmem_alloc(sizeof(splat_taskq_arg_t), KM_SLEEP);
		tq_arg->file = file;
		tq_arg->name = SPLAT_TASKQ_TEST9_NAME;
		tq_arg->expire = ddi_get_lbolt() + rnd;
		tq_arg->count = &count;

		splat_vprint(file, SPLAT_TASKQ_TEST9_NAME,
		    "Taskq '%s' delay dispatch %u jiffies\n",
		    SPLAT_TASKQ_TEST9_NAME, rnd);
		id = taskq_dispatch_delay(tq, splat_taskq_test9_func,
		    tq_arg, TQ_SLEEP, ddi_get_lbolt() + rnd);

		if (id == 0) {
			splat_vprint(file, SPLAT_TASKQ_TEST9_NAME,
			    "Taskq '%s' delay dispatch failed\n",
			    SPLAT_TASKQ_TEST9_NAME);
			kmem_free(tq_arg, sizeof(splat_taskq_arg_t));
			taskq_wait(tq);
			rc = -EINVAL;
			goto out;
		}
	}

	splat_vprint(file, SPLAT_TASKQ_TEST9_NAME, "Taskq '%s' waiting for "
	    "%d delay dispatches\n", SPLAT_TASKQ_TEST9_NAME, nr_tasks);

	taskq_wait(tq);
	if (atomic_read(&count) != nr_tasks)
		rc = -ERANGE;

	splat_vprint(file, SPLAT_TASKQ_TEST9_NAME, "Taskq '%s' %d/%d delay "
	    "dispatches finished on time\n", SPLAT_TASKQ_TEST9_NAME,
	    atomic_read(&count), nr_tasks);
	splat_vprint(file, SPLAT_TASKQ_TEST9_NAME, "Taskq '%s' destroying\n",
	    SPLAT_TASKQ_TEST9_NAME);
out:
	taskq_destroy(tq);

	return rc;
}
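/*
 * A minimal sketch of one plausible shape for the delayed-dispatch worker
 * referenced above (the real splat_taskq_test9_func is not shown here):
 * at run time the recorded absolute expiration should not lie in the
 * future, and only on-time executions bump the shared counter.  The
 * argument fields mirror those set in the dispatch loop above.
 */
static void
sketch_delay_check_func(void *arg)
{
	splat_taskq_arg_t *tq_arg = (splat_taskq_arg_t *)arg;

	/* The task should not run before its absolute expiration tick. */
	if (ddi_get_lbolt() >= tq_arg->expire)
		atomic_inc(tq_arg->count);

	kmem_free(tq_arg, sizeof(splat_taskq_arg_t));
}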
static void
mmp_thread(void *arg)
{
	spa_t *spa = (spa_t *)arg;
	mmp_thread_t *mmp = &spa->spa_mmp;
	boolean_t last_spa_suspended = spa_suspended(spa);
	boolean_t last_spa_multihost = spa_multihost(spa);
	callb_cpr_t cpr;
	hrtime_t max_fail_ns = zfs_multihost_fail_intervals *
	    MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));

	mmp_thread_enter(mmp, &cpr);

	/*
	 * The mmp_write_done() function calculates mmp_delay based on the
	 * prior value of mmp_delay and the elapsed time since the last write.
	 * For the first mmp write, there is no "last write", so we start
	 * with fake, but reasonable, default non-zero values.
	 */
	mmp->mmp_delay = MSEC2NSEC(MAX(zfs_multihost_interval,
	    MMP_MIN_INTERVAL)) / MAX(vdev_count_leaves(spa), 1);
	mmp->mmp_last_write = gethrtime() - mmp->mmp_delay;

	while (!mmp->mmp_thread_exiting) {
		uint64_t mmp_fail_intervals = zfs_multihost_fail_intervals;
		uint64_t mmp_interval = MSEC2NSEC(
		    MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
		boolean_t suspended = spa_suspended(spa);
		boolean_t multihost = spa_multihost(spa);
		hrtime_t start, next_time;

		start = gethrtime();
		if (multihost) {
			next_time = start + mmp_interval /
			    MAX(vdev_count_leaves(spa), 1);
		} else {
			next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL);
		}

		/*
		 * When MMP goes off => on, or spa goes suspended =>
		 * !suspended, we know no writes occurred recently.  We
		 * update mmp_last_write to give us some time to try.
		 */
		if ((!last_spa_multihost && multihost) ||
		    (last_spa_suspended && !suspended)) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_last_write = gethrtime();
			mutex_exit(&mmp->mmp_io_lock);
		} else if (last_spa_multihost && !multihost) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_delay = 0;
			mutex_exit(&mmp->mmp_io_lock);
		}
		last_spa_multihost = multihost;
		last_spa_suspended = suspended;

		/*
		 * Smooth max_fail_ns when its factors are decreased, because
		 * making (max_fail_ns < mmp_interval) results in the pool
		 * being immediately suspended before writes can occur at the
		 * new higher frequency.
		 */
		if ((mmp_interval * mmp_fail_intervals) < max_fail_ns) {
			max_fail_ns = ((31 * max_fail_ns) + (mmp_interval *
			    mmp_fail_intervals)) / 32;
		} else {
			max_fail_ns = mmp_interval * mmp_fail_intervals;
		}

		/*
		 * Suspend the pool if no MMP write has succeeded in over
		 * mmp_interval * mmp_fail_intervals nanoseconds.
		 */
		if (!suspended && mmp_fail_intervals && multihost &&
		    (start - mmp->mmp_last_write) > max_fail_ns) {
			cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
			    "succeeded in over %llus; suspending pool",
			    spa_name(spa),
			    NSEC2SEC(start - mmp->mmp_last_write));
			zio_suspend(spa, NULL);
		}

		if (multihost)
			mmp_write_uberblock(spa);

		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait_sig(&mmp->mmp_thread_cv,
		    &mmp->mmp_thread_lock, ddi_get_lbolt() +
		    ((next_time - gethrtime()) / (NANOSEC / hz)));
		CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
	}

	/* Outstanding writes are allowed to complete. */
	if (mmp->mmp_zio_root)
		zio_wait(mmp->mmp_zio_root);

	mmp->mmp_zio_root = NULL;
	mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
}
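/*
 * A minimal sketch of the smoothing step above: when the configured
 * failure window shrinks, max_fail_ns decays toward the new target as an
 * exponential moving average with a 31/32 weight, rather than dropping
 * instantly and suspending the pool spuriously.  The helper name is
 * illustrative.
 */
static hrtime_t
sketch_smooth_fail_window(hrtime_t max_fail_ns, hrtime_t target_ns)
{
	if (target_ns < max_fail_ns)
		return (((31 * max_fail_ns) + target_ns) / 32);

	return (target_ns);	/* growing the window applies at once */
}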
static void
txg_sync_thread(dsl_pool_t *dp)
{
	spa_t *spa = dp->dp_spa;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;
	vdev_stat_t *vs1, *vs2;
	uint64_t start, delta;

#ifdef _KERNEL
	/*
	 * Annotate this process with a flag that indicates that it is
	 * unsafe to use KM_SLEEP during memory allocations due to the
	 * potential for a deadlock.  KM_PUSHPAGE should be used instead.
	 */
	current->flags |= PF_NOFS;
#endif /* _KERNEL */

	txg_thread_enter(tx, &cpr);

	vs1 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE);
	vs2 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE);

	start = delta = 0;
	for (;;) {
		uint64_t timer, timeout;
		uint64_t txg;

		timeout = zfs_txg_timeout * hz;

		/*
		 * We sync when we're scanning, there's someone waiting
		 * on us, or the quiesce thread has handed off a txg to
		 * us, or we have reached our timeout.
		 */
		timer = (delta >= timeout ? 0 : timeout - delta);
		while (!dsl_scan_active(dp->dp_scan) &&
		    !tx->tx_exiting && timer > 0 &&
		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
		    tx->tx_quiesced_txg == 0) {
			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
			delta = ddi_get_lbolt() - start;
			timer = (delta > timeout ? 0 : timeout - delta);
		}

		/*
		 * Wait until the quiesce thread hands off a txg to us,
		 * prompting it to do so if necessary.
		 */
		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
			cv_broadcast(&tx->tx_quiesce_more_cv);
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
		}

		if (tx->tx_exiting) {
			kmem_free(vs2, sizeof(vdev_stat_t));
			kmem_free(vs1, sizeof(vdev_stat_t));
			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
		}

		vdev_get_stats(spa->spa_root_vdev, vs1);

		/*
		 * Consume the quiesced txg which has been handed off to
		 * us.  This may cause the quiescing thread to now be
		 * able to quiesce another txg, so we must signal it.
		 */
		txg = tx->tx_quiesced_txg;
		tx->tx_quiesced_txg = 0;
		tx->tx_syncing_txg = txg;
		cv_broadcast(&tx->tx_quiesce_more_cv);

		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
		mutex_exit(&tx->tx_sync_lock);

		start = ddi_get_lbolt();
		spa_sync(spa, txg);
		delta = ddi_get_lbolt() - start;

		mutex_enter(&tx->tx_sync_lock);
		tx->tx_synced_txg = txg;
		tx->tx_syncing_txg = 0;
		cv_broadcast(&tx->tx_sync_done_cv);

		/*
		 * Dispatch commit callbacks to worker threads.
		 */
		txg_dispatch_callbacks(dp, txg);

		vdev_get_stats(spa->spa_root_vdev, vs2);
		spa_txg_history_set_io(spa, txg,
		    vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ],
		    vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE],
		    vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ],
		    vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE],
		    dp->dp_space_towrite[txg & TXG_MASK] +
		    dp->dp_tempreserved[txg & TXG_MASK] / 2);
		spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime());
	}
}
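/*
 * A minimal sketch of the per-txg I/O accounting above: snapshot the root
 * vdev's cumulative counters before and after the sync, and record only
 * the difference so each txg's history row reflects its own reads and
 * writes.  The stat fields are the ones used above; the helper name is
 * illustrative.
 */
static void
sketch_txg_io_delta(vdev_stat_t *before, vdev_stat_t *after,
    uint64_t *nread, uint64_t *nwritten)
{
	*nread = after->vs_bytes[ZIO_TYPE_READ] -
	    before->vs_bytes[ZIO_TYPE_READ];
	*nwritten = after->vs_bytes[ZIO_TYPE_WRITE] -
	    before->vs_bytes[ZIO_TYPE_WRITE];
}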
/*
 * For unsolicited exchanges, FCoET is only responsible for allocation of
 * req_payload.  FCT will allocate resp_payload after the exchange is
 * passed on.
 */
static fcoet_exchange_t *
fcoet_create_unsol_exchange(fcoe_frame_t *frm)
{
	uint8_t			 r_ctl;
	int			 cdb_size;
	fcoet_exchange_t	*xch, *xch_tmp;
	fct_cmd_t		*cmd;
	fcoe_fcp_cmnd_t		*ffc;
	uint32_t		 task_expected_len = 0;

	r_ctl = FRM_R_CTL(frm);
	switch (r_ctl) {
	case 0x22:
		/*
		 * FCoET's unsolicited ELS
		 */
		cmd = (fct_cmd_t *)fct_alloc(FCT_STRUCT_CMD_RCVD_ELS,
		    GET_STRUCT_SIZE(fcoet_exchange_t) +
		    frm->frm_payload_size, 0);
		if (cmd == NULL) {
			FCOET_EXT_LOG(0, "can't get cmd");
			return (NULL);
		}
		break;

	case 0x06:
		/*
		 * FCoET's unsolicited SCSI cmd
		 */
		cdb_size = 16;	/* needs improvement later */
		cmd = fct_scsi_task_alloc(FRM2SS(frm)->ss_port,
		    FCT_HANDLE_NONE, FRM_S_ID(frm), frm->frm_payload,
		    cdb_size, STMF_TASK_EXT_NONE);
		if (cmd == NULL) {
			FCOET_EXT_LOG(0, "can't get fcp cmd");
			return (NULL);
		}

		ffc = (fcoe_fcp_cmnd_t *)frm->frm_payload;
		task_expected_len = FCOE_B2V_4(ffc->ffc_fcp_dl);
		break;

	default:
		FCOET_EXT_LOG(0, "unsupported R_CTL: %x", r_ctl);
		return (NULL);
	}

	/*
	 * xch initialization
	 */
	xch = CMD2XCH(cmd);
	xch->xch_oxid = FRM_OXID(frm);
	xch->xch_flags = 0;
	xch->xch_ss = FRM2SS(frm);
	xch->xch_cmd = cmd;
	xch->xch_current_seq = NULL;
	xch->xch_left_data_size = 0;
	if (task_expected_len) {
		xch->xch_dbuf_num =
		    (task_expected_len + FCOET_MAX_DBUF_LEN - 1) /
		    FCOET_MAX_DBUF_LEN;
		xch->xch_dbufs = kmem_zalloc(xch->xch_dbuf_num *
		    sizeof (stmf_data_buf_t *), KM_SLEEP);
	}
	xch->xch_start_time = ddi_get_lbolt();

	do {
		xch->xch_rxid = atomic_add_16_nv(
		    &xch->xch_ss->ss_next_unsol_rxid, 1);
		if (xch->xch_rxid == 0xFFFF) {
			xch->xch_rxid = atomic_add_16_nv(
			    &xch->xch_ss->ss_next_unsol_rxid, 1);
		}
	} while (mod_hash_find(FRM2SS(frm)->ss_unsol_rxid_hash,
	    (mod_hash_key_t)(intptr_t)xch->xch_rxid,
	    (mod_hash_val_t)&xch_tmp) == 0);

	xch->xch_sequence_no = 0;
	xch->xch_ref = 0;
	(void) mod_hash_insert(xch->xch_ss->ss_unsol_rxid_hash,
	    (mod_hash_key_t)(intptr_t)xch->xch_rxid, (mod_hash_val_t)xch);
	xch->xch_flags |= XCH_FLAG_IN_HASH_TABLE;

	/*
	 * cmd initialization
	 */
	cmd->cmd_port = FRM2SS(frm)->ss_port;
	cmd->cmd_rp_handle = FCT_HANDLE_NONE;
	cmd->cmd_rportid = FRM_S_ID(frm);
	cmd->cmd_lportid = FRM_D_ID(frm);
	cmd->cmd_oxid = xch->xch_oxid;
	cmd->cmd_rxid = xch->xch_rxid;

	fcoet_init_tfm(frm, xch);
	return (xch);
}
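/*
 * A minimal sketch of the RX_ID allocation loop above: atomically bump a
 * 16-bit counter, skip the reserved 0xFFFF value (which Fibre Channel
 * uses to mean "unassigned"), and retry while the candidate id is still
 * present in the outstanding-exchange hash.  The counter and hash handle
 * here are illustrative stand-ins.
 */
static uint16_t
sketch_alloc_rxid(volatile uint16_t *next_rxid, mod_hash_t *rxid_hash)
{
	uint16_t	rxid;
	mod_hash_val_t	val;

	do {
		rxid = atomic_add_16_nv(next_rxid, 1);
		if (rxid == 0xFFFF)
			rxid = atomic_add_16_nv(next_rxid, 1);
	} while (mod_hash_find(rxid_hash,
	    (mod_hash_key_t)(intptr_t)rxid, &val) == 0);

	return (rxid);
}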
int
ghd_waitq_process_and_mutex_hold(ccc_t *cccp)
{
	gcmd_t	*gcmdp;
	int	 rc = FALSE;

	ASSERT(mutex_owned(&cccp->ccc_hba_mutex));
	ASSERT(mutex_owned(&cccp->ccc_waitq_mutex));

	for (;;) {
		if (L2_EMPTY(&GHBA_QHEAD(cccp))) {
			/* return if the list is empty */
			GDBG_WAITQ(("ghd_waitq_proc: MT cccp 0x%p qp 0x%p\n",
			    (void *)cccp, (void *)&cccp->ccc_waitq));
			break;
		}

		if (GHBA_NACTIVE(cccp) >= GHBA_MAXACTIVE(cccp)) {
			/* return if the HBA is too active */
			GDBG_WAITQ(("ghd_waitq_proc: N>M cccp 0x%p qp 0x%p"
			    " N %ld max %ld\n", (void *)cccp,
			    (void *)&cccp->ccc_waitq,
			    GHBA_NACTIVE(cccp),
			    GHBA_MAXACTIVE(cccp)));
			break;
		}

		/*
		 * bail out if the wait queue has been
		 * "held" by the HBA driver
		 */
		if (cccp->ccc_waitq_held) {
			GDBG_WAITQ(("ghd_waitq_proc: held"));
			return (rc);
		}

		if (cccp->ccc_waitq_frozen) {
			clock_t lbolt, delay_in_hz, time_to_wait;

			delay_in_hz =
			    drv_usectohz(cccp->ccc_waitq_freezedelay * 1000);

			lbolt = ddi_get_lbolt();
			time_to_wait = delay_in_hz -
			    (lbolt - cccp->ccc_waitq_freezetime);

			if (time_to_wait > 0) {
				/*
				 * stay frozen; we'll be called again
				 * by ghd_timeout_softintr()
				 */
				GDBG_WAITQ(("ghd_waitq_proc: frozen"));
				return (rc);
			} else {
				/* unfreeze and continue */
				GDBG_WAITQ(("ghd_waitq_proc: unfreezing"));
				cccp->ccc_waitq_freezetime = 0;
				cccp->ccc_waitq_freezedelay = 0;
				cccp->ccc_waitq_frozen = 0;
			}
		}

		gcmdp = (gcmd_t *)L2_remove_head(&GHBA_QHEAD(cccp));
		GHBA_NACTIVE(cccp)++;
		gcmdp->cmd_waitq_level++;
		mutex_exit(&cccp->ccc_waitq_mutex);

		/*
		 * Start up the next I/O request
		 */
		ASSERT(gcmdp != NULL);
		gcmdp->cmd_state = GCMD_STATE_ACTIVE;
		if (!(*cccp->ccc_hba_start)(cccp->ccc_hba_handle, gcmdp)) {
			/* if the HBA rejected the request, requeue it */
			gcmdp->cmd_state = GCMD_STATE_WAITQ;
			mutex_enter(&cccp->ccc_waitq_mutex);
			GHBA_NACTIVE(cccp)--;
			gcmdp->cmd_waitq_level--;
			L2_add_head(&GHBA_QHEAD(cccp), &gcmdp->cmd_q, gcmdp);
			GDBG_WAITQ(("ghd_waitq_proc: busy cccp 0x%p gcmdp 0x%p"
			    " handle 0x%p\n", (void *)cccp, (void *)gcmdp,
			    cccp->ccc_hba_handle));
			break;
		}
		rc = TRUE;
		mutex_enter(&cccp->ccc_waitq_mutex);
		GDBG_WAITQ(("ghd_waitq_proc: ++ cccp 0x%p gcmdp 0x%p N %ld\n",
		    (void *)cccp, (void *)gcmdp, GHBA_NACTIVE(cccp)));
	}

	ASSERT(mutex_owned(&cccp->ccc_hba_mutex));
	ASSERT(mutex_owned(&cccp->ccc_waitq_mutex));

	return (rc);
}
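/*
 * A minimal sketch of the freeze-window test above: a queue frozen at
 * freezetime (in lbolt ticks) for freezedelay milliseconds stays frozen
 * until the elapsed ticks exceed the delay converted to ticks.  The
 * helper name and parameters are illustrative.
 */
static int
sketch_still_frozen(clock_t freezetime, long freezedelay_ms)
{
	clock_t delay_in_hz = drv_usectohz(freezedelay_ms * 1000);

	return (delay_in_hz - (ddi_get_lbolt() - freezetime) > 0);
}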
	/*
	 * If adding a new entry would exceed the cache size,
	 * evict the oldest entry (LRU).
	 */
	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
	    zfs_vdev_cache_size) {
		ve = avl_first(&vc->vc_lastused_tree);
		if (ve->ve_fill_io != NULL)
			return (NULL);

		ASSERT3U(ve->ve_hits, !=, 0);
		vdev_cache_evict(vc, ve);
	}

	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
	ve->ve_offset = offset;
	ve->ve_lastused = ddi_get_lbolt();
	ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);

	avl_add(&vc->vc_offset_tree, ve);
	avl_add(&vc->vc_lastused_tree, ve);

	return (ve);
}

static void
vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
{
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);

	ASSERT(MUTEX_HELD(&vc->vc_lock));
	ASSERT3P(ve->ve_fill_io, ==, NULL);