Example #1
int
zfsctl_mount_snapshot(struct path *path, int flags)
{
    struct dentry *dentry = path->dentry;
    struct inode *ip = dentry->d_inode;
    zfs_sb_t *zsb = ITOZSB(ip);
    char *full_name, *full_path;
    zfs_snapentry_t *sep;
    zfs_snapentry_t search;
    char *argv[] = { "/bin/sh", "-c", NULL, NULL };
    char *envp[] = { NULL };
    int error;

    ZFS_ENTER(zsb);

    full_name = kmem_zalloc(MAXNAMELEN, KM_SLEEP);
    full_path = kmem_zalloc(PATH_MAX, KM_SLEEP);

    error = zfsctl_snapshot_zname(ip, dname(dentry), MAXNAMELEN, full_name);
    if (error)
        goto error;

    error = zfsctl_snapshot_zpath(path, PATH_MAX, full_path);
    if (error)
        goto error;

    /*
     * Attempt to mount the snapshot from user space.  Normally this
     * would be done using the vfs_kern_mount() function, however that
     * function is marked GPL-only and cannot be used.  On error we are
     * careful to log the real error to the console and return EISDIR
     * to safely abort the automount.  This should be very rare.
     */
    argv[2] = kmem_asprintf(SET_MOUNT_CMD, full_name, full_path);
    error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
    strfree(argv[2]);
    if (error) {
        printk("ZFS: Unable to automount %s at %s: %d\n",
               full_name, full_path, error);
        error = EISDIR;
        goto error;
    }

    mutex_enter(&zsb->z_ctldir_lock);

    /*
     * Ensure a previous entry does not exist; if it does, safely remove
     * it and cancel the outstanding expiration.  This can occur when a
     * snapshot is manually unmounted and then an automount is triggered.
     */
    search.se_name = full_name;
    sep = avl_find(&zsb->z_ctldir_snaps, &search, NULL);
    if (sep) {
        avl_remove(&zsb->z_ctldir_snaps, sep);
        taskq_cancel_id(zfs_expire_taskq, sep->se_taskqid);
        zfsctl_sep_free(sep);
    }

    sep = zfsctl_sep_alloc();
    sep->se_name = full_name;
    sep->se_path = full_path;
    sep->se_inode = ip;
    avl_add(&zsb->z_ctldir_snaps, sep);

    sep->se_taskqid = taskq_dispatch_delay(zfs_expire_taskq,
                                           zfsctl_expire_snapshot, sep, TQ_SLEEP,
                                           ddi_get_lbolt() + zfs_expire_snapshot * HZ);

    mutex_exit(&zsb->z_ctldir_lock);
error:
    if (error) {
        kmem_free(full_name, MAXNAMELEN);
        kmem_free(full_path, PATH_MAX);
    }

    ZFS_EXIT(zsb);

    return (error);
}
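
The expiration above is armed with an absolute deadline: taskq_dispatch_delay() is given ddi_get_lbolt() plus the relative delay of zfs_expire_snapshot seconds scaled by HZ. The user-space sketch below models only that tick arithmetic; current_ticks(), MY_HZ and the 300-second knob are stand-ins for ddi_get_lbolt(), the kernel tick rate and the real tunable, not part of any DDI.

#include <stdio.h>

#define	MY_HZ	100			/* stand-in for the kernel tick rate HZ */

/* Stand-in for ddi_get_lbolt(): ticks since some arbitrary origin. */
static long
current_ticks(void)
{
	return (500);			/* fixed value keeps the example deterministic */
}

/* Convert a relative delay in seconds into an absolute tick deadline. */
static long
deadline_ticks(long delay_seconds)
{
	return (current_ticks() + delay_seconds * MY_HZ);
}

int
main(void)
{
	long expire_snapshot = 300;	/* hypothetical 300 second expiration knob */

	printf("now=%ld deadline=%ld\n", current_ticks(),
	    deadline_ticks(expire_snapshot));
	return (0);
}
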
Example #2
/**
 * Worker for rtSemMutexSolRequest that handles the case where we go to sleep.
 *
 * @returns VINF_SUCCESS, VERR_INTERRUPTED, or VERR_SEM_DESTROYED.
 *          Returns without owning the mutex.
 * @param   pThis           The mutex instance.
 * @param   cMillies        The timeout, must be > 0 or RT_INDEFINITE_WAIT.
 * @param   fInterruptible  The wait type.
 *
 * @remarks This needs to be called with the mutex object held!
 */
static int rtSemMutexSolRequestSleep(PRTSEMMUTEXINTERNAL pThis, RTMSINTERVAL cMillies,
                                     bool fInterruptible)
{
    int rc = VERR_GENERAL_FAILURE;
    Assert(cMillies > 0);

    /*
     * Now we wait (sleep; though it might spin and then sleep) and reference the mutex.
     */
    ASMAtomicIncU32(&pThis->cWaiters);
    ASMAtomicIncU32(&pThis->cRefs);

    if (cMillies != RT_INDEFINITE_WAIT)
    {
        clock_t cTicks   = drv_usectohz((clock_t)(cMillies * 1000L));
        clock_t cTimeout = ddi_get_lbolt();
        cTimeout        += cTicks;
        if (fInterruptible)
            rc = cv_timedwait_sig(&pThis->Cnd, &pThis->Mtx, cTimeout);
        else
            rc = cv_timedwait(&pThis->Cnd, &pThis->Mtx, cTimeout);
    }
    else
    {
        if (fInterruptible)
            rc = cv_wait_sig(&pThis->Cnd, &pThis->Mtx);
        else
        {
            cv_wait(&pThis->Cnd, &pThis->Mtx);
            rc = 1;
        }
    }

    ASMAtomicDecU32(&pThis->cWaiters);
    if (rc > 0)
    {
        if (pThis->u32Magic == RTSEMMUTEX_MAGIC)
        {
            if (pThis->hOwnerThread == NIL_RTNATIVETHREAD)
            {
                /*
                 * Woken up by a release from another thread.
                 */
                Assert(pThis->cRecursions == 0);
                pThis->cRecursions = 1;
                pThis->hOwnerThread = RTThreadNativeSelf();
                rc = VINF_SUCCESS;
            }
            else
            {
                /*
                 * Interrupted by some signal.
                 */
                rc = VERR_INTERRUPTED;
            }
        }
        else
        {
            /*
             * Awakened due to the destruction-in-progress broadcast.
             * We will clean up if we're the last waiter.
             */
            rc = VERR_SEM_DESTROYED;
        }
    }
    else if (rc == -1)
    {
        /*
         * Timed out.
         */
        rc = VERR_TIMEOUT;
    }
    else
    {
        /*
         * Condition may not have been met, returned due to pending signal.
         */
        rc = VERR_INTERRUPTED;
    }

    if (!ASMAtomicDecU32(&pThis->cRefs))
    {
        Assert(RT_FAILURE_NP(rc));
        mutex_exit(&pThis->Mtx);
        cv_destroy(&pThis->Cnd);
        mutex_destroy(&pThis->Mtx);
        RTMemFree(pThis);
        return rc;
    }

    return rc;
}
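
The rc handling above follows the usual Solaris condition-variable convention: cv_timedwait()/cv_timedwait_sig() return a positive value when the thread was woken (the condition may hold), -1 when the absolute lbolt deadline passed, and 0 (signal-aware variants only) when a pending signal interrupted the wait. Below is a minimal sketch of that mapping, using made-up status names rather than the IPRT VINF_/VERR_ codes.

#include <stdio.h>

/* Illustrative status codes; the real code maps to IPRT VINF_/VERR_ values. */
typedef enum { WAIT_OK, WAIT_TIMEOUT, WAIT_INTERRUPTED } wait_status_t;

/*
 * Classify a cv_timedwait()-style return value: > 0 woken (condition may
 * hold), -1 timed out, 0 interrupted by a signal (cv_*wait_sig only).
 */
static wait_status_t
classify_wait(int rc)
{
	if (rc > 0)
		return (WAIT_OK);
	if (rc == -1)
		return (WAIT_TIMEOUT);
	return (WAIT_INTERRUPTED);
}

int
main(void)
{
	printf("%d %d %d\n", classify_wait(1), classify_wait(-1),
	    classify_wait(0));
	return (0);
}
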
Example #3
static int
splat_taskq_test10(struct file *file, void *arg)
{
	taskq_t *tq;
	splat_taskq_arg_t **tqas;
	atomic_t count;
	int i, j, rc = 0;
	int minalloc = 1;
	int maxalloc = 10;
	int nr_tasks = 100;
	int canceled = 0;
	int completed = 0;
	int blocked = 0;
	clock_t start, cancel;

	tqas = vmalloc(sizeof(*tqas) * nr_tasks);
	if (tqas == NULL)
		return -ENOMEM;
	memset(tqas, 0, sizeof(*tqas) * nr_tasks);

	splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
	    "Taskq '%s' creating (%s dispatch) (%d/%d/%d)\n",
	    SPLAT_TASKQ_TEST10_NAME, "delay", minalloc, maxalloc, nr_tasks);
	if ((tq = taskq_create(SPLAT_TASKQ_TEST10_NAME, 3, maxclsyspri,
	    minalloc, maxalloc, TASKQ_PREPOPULATE)) == NULL) {
		splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
		    "Taskq '%s' create failed\n", SPLAT_TASKQ_TEST10_NAME);
		rc = -EINVAL;
		goto out_free;
	}

	atomic_set(&count, 0);

	for (i = 0; i < nr_tasks; i++) {
		splat_taskq_arg_t *tq_arg;
		uint32_t rnd;

		/* A random timeout in jiffies of at most 5 seconds */
		get_random_bytes((void *)&rnd, 4);
		rnd = rnd % (5 * HZ);

		tq_arg = kmem_alloc(sizeof(splat_taskq_arg_t), KM_SLEEP);
		tq_arg->file = file;
		tq_arg->name = SPLAT_TASKQ_TEST10_NAME;
		tq_arg->count = &count;
		tqas[i] = tq_arg;

		/*
		 * Dispatch every third task immediately to mix things up; the
		 * cancel code is inherently racy and we want to try to provoke
		 * any subtle concurrency issues.
		 */
		if ((i % 3) == 0) {
			tq_arg->expire = ddi_get_lbolt();
			tq_arg->id = taskq_dispatch(tq, splat_taskq_test10_func,
			    tq_arg, TQ_SLEEP);
		} else {
			tq_arg->expire = ddi_get_lbolt() + rnd;
			tq_arg->id = taskq_dispatch_delay(tq,
			    splat_taskq_test10_func,
			    tq_arg, TQ_SLEEP, ddi_get_lbolt() + rnd);
		}

		if (tq_arg->id == 0) {
			splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
			   "Taskq '%s' dispatch failed\n",
			   SPLAT_TASKQ_TEST10_NAME);
			kmem_free(tq_arg, sizeof(splat_taskq_arg_t));
			taskq_wait(tq);
			rc = -EINVAL;
			goto out;
		} else {
			splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
			    "Taskq '%s' dispatch %lu in %lu jiffies\n",
			    SPLAT_TASKQ_TEST10_NAME, (unsigned long)tq_arg->id,
			    !(i % 3) ? 0 : tq_arg->expire - ddi_get_lbolt());
		}
	}

	/*
	 * Start randomly canceling tasks for the duration of the test.  We
	 * happen to know the valid task ids will be in the range 1..nr_tasks
	 * because the taskq is private and was just created.  However, we
	 * have no idea whether a particular task has already executed or not.
	 */
	splat_vprint(file, SPLAT_TASKQ_TEST10_NAME, "Taskq '%s' randomly "
	    "canceling task ids\n", SPLAT_TASKQ_TEST10_NAME);

	start = ddi_get_lbolt();
	i = 0;

	while (ddi_time_before(ddi_get_lbolt(), start + 5 * HZ)) {
		taskqid_t id;
		uint32_t rnd;

		i++;
		cancel = ddi_get_lbolt();
		get_random_bytes((void *)&rnd, 4);
		id = 1 + (rnd % nr_tasks);
		rc = taskq_cancel_id(tq, id);

		/*
		 * Keep track of the results of the random cancels.
		 */
		if (rc == 0) {
			canceled++;
		} else if (rc == ENOENT) {
			completed++;
		} else if (rc == EBUSY) {
			blocked++;
		} else {
			rc = -EINVAL;
			break;
		}

		/*
		 * Verify we never get blocked too long in taskq_cancel_id().
		 * The worst case is 10ms if we happen to cancel the task
		 * which is currently executing.  We allow a factor of 2x.
		 */
		if (ddi_get_lbolt() - cancel > HZ / 50) {
			splat_vprint(file, SPLAT_TASKQ_TEST10_NAME,
			    "Taskq '%s' cancel for %lu took %lu\n",
			    SPLAT_TASKQ_TEST10_NAME, (unsigned long)id,
			    ddi_get_lbolt() - cancel);
			rc = -ETIMEDOUT;
			break;
		}

		get_random_bytes((void *)&rnd, 4);
		msleep(1 + (rnd % 100));
		rc = 0;
	}

	taskq_wait(tq);

	/*
	 * Cross check the results of taskq_cancel_id() with the number of
	 * times the dispatched function actually ran successfully.
	 */
	if ((rc == 0) && (nr_tasks - canceled != atomic_read(&count)))
		rc = -EDOM;

	splat_vprint(file, SPLAT_TASKQ_TEST10_NAME, "Taskq '%s' %d attempts, "
	    "%d canceled, %d completed, %d blocked, %d/%d tasks run\n",
	    SPLAT_TASKQ_TEST10_NAME, i, canceled, completed, blocked,
	    atomic_read(&count), nr_tasks);
	splat_vprint(file, SPLAT_TASKQ_TEST10_NAME, "Taskq '%s' destroying %d\n",
	    SPLAT_TASKQ_TEST10_NAME, rc);
out:
	taskq_destroy(tq);
out_free:
	for (j = 0; j < nr_tasks && tqas[j] != NULL; j++)
		kmem_free(tqas[j], sizeof(splat_taskq_arg_t));
	vfree(tqas);

	return rc;
}
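
The tallies kept above rely on the documented taskq_cancel_id() outcomes: 0 when a pending task was cancelled, ENOENT when no such task is queued (usually because it already ran), and EBUSY when the task is currently executing and cannot be cancelled. Here is a self-contained sketch of the same bookkeeping, fed canned return values instead of a live task queue; cancel_stats_t and record_cancel_result() are illustrative names.

#include <stdio.h>
#include <errno.h>

typedef struct {
	int canceled;
	int completed;
	int blocked;
	int errors;
} cancel_stats_t;

/* Fold one taskq_cancel_id()-style return value into the tally. */
static void
record_cancel_result(cancel_stats_t *st, int rc)
{
	if (rc == 0)
		st->canceled++;
	else if (rc == ENOENT)
		st->completed++;
	else if (rc == EBUSY)
		st->blocked++;
	else
		st->errors++;
}

int
main(void)
{
	/* Canned results standing in for real cancel attempts. */
	int results[] = { 0, ENOENT, EBUSY, 0, ENOENT };
	cancel_stats_t st = { 0 };
	unsigned int i;

	for (i = 0; i < sizeof (results) / sizeof (results[0]); i++)
		record_cancel_result(&st, results[i]);

	printf("canceled=%d completed=%d blocked=%d errors=%d\n",
	    st.canceled, st.completed, st.blocked, st.errors);
	return (0);
}
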
Example #4
static void
txg_sync_thread(void *arg)
{
	dsl_pool_t *dp = arg;
	spa_t *spa = dp->dp_spa;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;
	uint64_t start, delta;

	txg_thread_enter(tx, &cpr);

	start = delta = 0;
	for (;;) {
		uint64_t timeout = zfs_txg_timeout * hz;
		uint64_t timer;
		uint64_t txg;

		/*
		 * We sync when we're scanning, there's someone waiting
		 * on us, or the quiesce thread has handed off a txg to
		 * us, or we have reached our timeout.
		 */
		timer = (delta >= timeout ? 0 : timeout - delta);
		while (!dsl_scan_active(dp->dp_scan) &&
		    !tx->tx_exiting && timer > 0 &&
		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
		    tx->tx_quiesced_txg == 0 &&
		    dp->dp_dirty_total < zfs_dirty_data_sync) {
			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
			delta = ddi_get_lbolt() - start;
			timer = (delta > timeout ? 0 : timeout - delta);
		}

		/*
		 * Wait until the quiesce thread hands off a txg to us,
		 * prompting it to do so if necessary.
		 */
		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
			cv_broadcast(&tx->tx_quiesce_more_cv);
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
		}

		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);

		/*
		 * Consume the quiesced txg which has been handed off to
		 * us.  This may cause the quiescing thread to now be
		 * able to quiesce another txg, so we must signal it.
		 */
		txg = tx->tx_quiesced_txg;
		tx->tx_quiesced_txg = 0;
		tx->tx_syncing_txg = txg;
		DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_quiesce_more_cv);

		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
		mutex_exit(&tx->tx_sync_lock);

		start = ddi_get_lbolt();
		spa_sync(spa, txg);
		delta = ddi_get_lbolt() - start;

		mutex_enter(&tx->tx_sync_lock);
		tx->tx_synced_txg = txg;
		tx->tx_syncing_txg = 0;
		DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg);
		cv_broadcast(&tx->tx_sync_done_cv);

		/*
		 * Dispatch commit callbacks to worker threads.
		 */
		txg_dispatch_callbacks(dp, txg);
	}
}
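
The sync loop keeps re-deriving the remaining wait as the timeout minus the time the last spa_sync() took, clamped at zero so a long sync never yields a negative (or, with unsigned arithmetic, wrapped) timer. A tiny sketch of that clamp follows; remaining_ticks() is an illustrative helper, not something from the ZFS source.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/*
 * Remaining wait before the next forced sync: zero once the elapsed time
 * has reached the timeout, otherwise whatever is left.  The explicit
 * comparison avoids unsigned wrap-around.
 */
static uint64_t
remaining_ticks(uint64_t elapsed, uint64_t timeout)
{
	return ((elapsed >= timeout) ? 0 : timeout - elapsed);
}

int
main(void)
{
	uint64_t timeout = 5 * 100;	/* e.g. 5 seconds at a 100 Hz tick */

	printf("%" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
	    remaining_ticks(0, timeout),
	    remaining_ticks(120, timeout),
	    remaining_ticks(900, timeout));
	return (0);
}
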
Example #5
/*
 * cvc_send_to_iosram()
 *	Flush as much data as possible to the CONO chunk.  If successful, free
 *	any mblks that were completely transmitted, update the b_rptr field in
 *	the first remaining mblk if it was partially transmitted, and update the
 *	caller's pointer to the new head of the mblk chain.  Since the software
 *	that will be pulling this data out of IOSRAM (dxs on the SC) is just
 *	polling at some frequency, we avoid attempts to flush data to IOSRAM any
 *	faster than a large divisor of that polling frequency.
 *
 *	Note that "cvc_buf_t out" is only declared "static" to keep it from
 *	being allocated on the stack.  Allocating 1K+ structures on the stack
 *	seems rather antisocial.
 */
static void
cvc_send_to_iosram(mblk_t **chainpp)
{
	int			rval;
	uint8_t			dvalid;
	uchar_t			*cp;
	mblk_t			*mp;
	mblk_t			*last_empty_mp;
	static clock_t		last_flush = (clock_t)-1;
	static cvc_buf_t	out;   /* see note above about static */

	ASSERT(chainpp != NULL);

	/*
	 * We _do_ have something to do, right?
	 */
	if (*chainpp == NULL) {
		return;
	}

	/*
	 * We can actually increase throughput by throttling back on attempts to
	 * flush data to IOSRAM, since trying to write every little bit of data
	 * as it shows up will actually generate more delays waiting for the SC
	 * to pick up each of those bits.  Instead, we'll avoid attempting to
	 * write data to IOSRAM any faster than half of the polling frequency we
	 * expect the SC to be using.
	 */
	if (ddi_get_lbolt() - last_flush <
	    drv_usectohz(CVC_IOSRAM_POLL_USECS / 2)) {
		return;
	}

	/*
	 * If IOSRAM is inaccessible or the CONO chunk still holds data that
	 * hasn't been picked up by the SC, there's nothing we can do right now.
	 */
	rval = iosram_get_flag(IOSRAM_KEY_CONO, &dvalid, NULL);
	if ((rval != 0) || (dvalid == IOSRAM_DATA_VALID)) {
		if ((rval != 0) && (rval != EAGAIN)) {
			cmn_err(CE_WARN, "cvc_send_to_iosram: get_flag ret %d",
			    rval);
		}
		return;
	}

	/*
	 * Copy up to MAX_XFER_COUTPUT chars from the mblk chain into a buffer.
	 * Don't change any of the mblks just yet, since we can't be certain
	 * that we'll be successful in writing data to the CONO chunk.
	 */
	out.count = 0;
	mp = *chainpp;
	cp = mp->b_rptr;
	last_empty_mp = NULL;
	while ((mp != NULL) && (out.count < MAX_XFER_COUTPUT)) {
		/*
		 * Process as many of the characters in the current mblk as
		 * possible.
		 */
		while ((cp != mp->b_wptr) && (out.count < MAX_XFER_COUTPUT)) {
			out.buffer[out.count++] = *cp++;
		}

		/*
		 * Did we process that entire mblk?  If so, move on to the next
		 * one.  If not, we're done filling the buffer even if there's
		 * space left, because apparently there wasn't room to process
		 * the next character.
		 */
		if (cp != mp->b_wptr) {
			break;
		}

		/*
		 * When this loop terminates, last_empty_mp will point to the
		 * last mblk that was completely processed, mp will point to the
		 * following mblk (or NULL if no more mblks exist), and cp will
		 * point to the first untransmitted character in the mblk
		 * pointed to by mp.  We'll need this data to update the mblk
		 * chain if all of the data is successfully transmitted.
		 */
		last_empty_mp = mp;
		mp = mp->b_cont;
		cp = (mp != NULL) ? mp->b_rptr : NULL;
	}

	/*
	 * If we succeeded in preparing some data, try to transmit it through
	 * IOSRAM.  First write the count and the data, which can be done in a
	 * single operation thanks to the buffer structure we use, then set the
	 * data_valid flag if the first step succeeded.
	 */
	if (out.count != 0) {
		rval = iosram_wr(IOSRAM_KEY_CONO, COUNT_OFFSET,
		    CONSBUF_COUNT_SIZE + out.count, (caddr_t)&out);
		if ((rval != 0) && (rval != EAGAIN)) {
			cmn_err(CE_WARN, "cvc_putc: write ret %d", rval);
		}

		/* if the data write succeeded, set the data_valid flag */
		if (rval == 0) {
			rval = iosram_set_flag(IOSRAM_KEY_CONO,
			    IOSRAM_DATA_VALID, IOSRAM_INT_NONE);
			if ((rval != 0) && (rval != EAGAIN)) {
				cmn_err(CE_WARN,
				    "cvc_putc: set flags for outbuf ret %d",
				    rval);
			}
		}

		/*
		 * If we successfully transmitted any data, modify the caller's
		 * mblk chain to remove the data that was transmitted, freeing
		 * all mblks that were completely processed.
		 */
		if (rval == 0) {
			last_flush = ddi_get_lbolt();

			/*
			 * If any data is left over, update the b_rptr field of
			 * the first remaining mblk in case some of its data was
			 * processed.
			 */
			if (mp != NULL) {
				mp->b_rptr = cp;
			}

			/*
			 * If any mblks have been emptied, unlink them from the
			 * residual chain, free them, and update the caller's
			 * mblk pointer.
			 */
			if (last_empty_mp != NULL) {
				last_empty_mp->b_cont = NULL;
				freemsg(*chainpp);
				*chainpp = mp;
			}
		}
	}
}
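
The last_flush test near the top of cvc_send_to_iosram() is a tick-based rate limiter: bail out if less than half an expected SC polling period has elapsed since the previous successful flush. The sketch below models that check with plain integers in place of ddi_get_lbolt() and drv_usectohz(); note that the driver only updates last_flush after a write actually succeeds, whereas this sketch records it whenever a flush is allowed.

#include <stdio.h>

static long now_ticks = 1000;			/* stand-in for ddi_get_lbolt() */
static const long min_interval_ticks = 25;	/* stand-in for drv_usectohz(poll/2) */
static long last_flush = -1;

/* Return 1 if enough ticks have passed since the last flush, else 0. */
static int
flush_allowed(void)
{
	if (last_flush != -1 && now_ticks - last_flush < min_interval_ticks)
		return (0);
	last_flush = now_ticks;
	return (1);
}

int
main(void)
{
	printf("%d\n", flush_allowed());	/* first call: allowed */
	printf("%d\n", flush_allowed());	/* same tick: throttled */
	now_ticks += 30;
	printf("%d\n", flush_allowed());	/* interval elapsed: allowed again */
	return (0);
}
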
Example #6
static int
splat_taskq_test9(struct file *file, void *arg)
{
	taskq_t *tq;
	atomic_t count;
	int i, rc = 0;
	int minalloc = 1;
	int maxalloc = 10;
	int nr_tasks = 100;

	splat_vprint(file, SPLAT_TASKQ_TEST9_NAME,
	    "Taskq '%s' creating (%s dispatch) (%d/%d/%d)\n",
	    SPLAT_TASKQ_TEST9_NAME, "delay", minalloc, maxalloc, nr_tasks);
	if ((tq = taskq_create(SPLAT_TASKQ_TEST9_NAME, 3, maxclsyspri,
	    minalloc, maxalloc, TASKQ_PREPOPULATE)) == NULL) {
		splat_vprint(file, SPLAT_TASKQ_TEST9_NAME,
		    "Taskq '%s' create failed\n", SPLAT_TASKQ_TEST9_NAME);
		return -EINVAL;
	}

	atomic_set(&count, 0);

	for (i = 1; i <= nr_tasks; i++) {
		splat_taskq_arg_t *tq_arg;
		taskqid_t id;
		uint32_t rnd;

		/* A random timeout in jiffies of at most 5 seconds */
		get_random_bytes((void *)&rnd, 4);
		rnd = rnd % (5 * HZ);

		tq_arg = kmem_alloc(sizeof(splat_taskq_arg_t), KM_SLEEP);
		tq_arg->file = file;
		tq_arg->name = SPLAT_TASKQ_TEST9_NAME;
		tq_arg->expire = ddi_get_lbolt() + rnd;
		tq_arg->count = &count;

		splat_vprint(file, SPLAT_TASKQ_TEST9_NAME,
		    "Taskq '%s' delay dispatch %u jiffies\n",
		    SPLAT_TASKQ_TEST9_NAME, rnd);

		id = taskq_dispatch_delay(tq, splat_taskq_test9_func,
		    tq_arg, TQ_SLEEP, ddi_get_lbolt() + rnd);

		if (id == 0) {
			splat_vprint(file, SPLAT_TASKQ_TEST9_NAME,
			   "Taskq '%s' delay dispatch failed\n",
			   SPLAT_TASKQ_TEST9_NAME);
			kmem_free(tq_arg, sizeof(splat_taskq_arg_t));
			taskq_wait(tq);
			rc = -EINVAL;
			goto out;
		}
	}

	splat_vprint(file, SPLAT_TASKQ_TEST9_NAME, "Taskq '%s' waiting for "
	    "%d delay dispatches\n", SPLAT_TASKQ_TEST9_NAME, nr_tasks);

	taskq_wait(tq);
	if (atomic_read(&count) != nr_tasks)
		rc = -ERANGE;

	splat_vprint(file, SPLAT_TASKQ_TEST9_NAME, "Taskq '%s' %d/%d delay "
	    "dispatches finished on time\n", SPLAT_TASKQ_TEST9_NAME,
	    atomic_read(&count), nr_tasks);
	splat_vprint(file, SPLAT_TASKQ_TEST9_NAME, "Taskq '%s' destroying\n",
	    SPLAT_TASKQ_TEST9_NAME);
out:
	taskq_destroy(tq);

	return rc;
}
Example #7
static void
mmp_thread(void *arg)
{
	spa_t *spa = (spa_t *)arg;
	mmp_thread_t *mmp = &spa->spa_mmp;
	boolean_t last_spa_suspended = spa_suspended(spa);
	boolean_t last_spa_multihost = spa_multihost(spa);
	callb_cpr_t cpr;
	hrtime_t max_fail_ns = zfs_multihost_fail_intervals *
	    MSEC2NSEC(MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));

	mmp_thread_enter(mmp, &cpr);

	/*
	 * The mmp_write_done() function calculates mmp_delay based on the
	 * prior value of mmp_delay and the elapsed time since the last write.
	 * For the first mmp write, there is no "last write", so we start
	 * with fake, but reasonable, default non-zero values.
	 */
	mmp->mmp_delay = MSEC2NSEC(MAX(zfs_multihost_interval,
	    MMP_MIN_INTERVAL)) / MAX(vdev_count_leaves(spa), 1);
	mmp->mmp_last_write = gethrtime() - mmp->mmp_delay;

	while (!mmp->mmp_thread_exiting) {
		uint64_t mmp_fail_intervals = zfs_multihost_fail_intervals;
		uint64_t mmp_interval = MSEC2NSEC(
		    MAX(zfs_multihost_interval, MMP_MIN_INTERVAL));
		boolean_t suspended = spa_suspended(spa);
		boolean_t multihost = spa_multihost(spa);
		hrtime_t start, next_time;

		start = gethrtime();
		if (multihost) {
			next_time = start + mmp_interval /
			    MAX(vdev_count_leaves(spa), 1);
		} else {
			next_time = start + MSEC2NSEC(MMP_DEFAULT_INTERVAL);
		}

		/*
		 * When MMP goes off => on, or spa goes suspended =>
		 * !suspended, we know no writes occurred recently.  We
		 * update mmp_last_write to give us some time to try.
		 */
		if ((!last_spa_multihost && multihost) ||
		    (last_spa_suspended && !suspended)) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_last_write = gethrtime();
			mutex_exit(&mmp->mmp_io_lock);
		} else if (last_spa_multihost && !multihost) {
			mutex_enter(&mmp->mmp_io_lock);
			mmp->mmp_delay = 0;
			mutex_exit(&mmp->mmp_io_lock);
		}
		last_spa_multihost = multihost;
		last_spa_suspended = suspended;

		/*
		 * Smooth max_fail_ns when its factors are decreased, because
		 * making (max_fail_ns < mmp_interval) results in the pool being
		 * immediately suspended before writes can occur at the new
		 * higher frequency.
		 */
		if ((mmp_interval * mmp_fail_intervals) < max_fail_ns) {
			max_fail_ns = ((31 * max_fail_ns) + (mmp_interval *
			    mmp_fail_intervals)) / 32;
		} else {
			max_fail_ns = mmp_interval * mmp_fail_intervals;
		}

		/*
		 * Suspend the pool if no MMP write has succeeded in over
		 * mmp_interval * mmp_fail_intervals nanoseconds.
		 */
		if (!suspended && mmp_fail_intervals && multihost &&
		    (start - mmp->mmp_last_write) > max_fail_ns) {
			cmn_err(CE_WARN, "MMP writes to pool '%s' have not "
			    "succeeded in over %llus; suspending pool",
			    spa_name(spa),
			    NSEC2SEC(start - mmp->mmp_last_write));
			zio_suspend(spa, NULL);
		}

		if (multihost)
			mmp_write_uberblock(spa);

		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait_sig(&mmp->mmp_thread_cv,
		    &mmp->mmp_thread_lock, ddi_get_lbolt() +
		    ((next_time - gethrtime()) / (NANOSEC / hz)));
		CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock);
	}

	/* Outstanding writes are allowed to complete. */
	if (mmp->mmp_zio_root)
		zio_wait(mmp->mmp_zio_root);

	mmp->mmp_zio_root = NULL;
	mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr);
}
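
The max_fail_ns adjustment above is a small exponential moving average: when mmp_interval * mmp_fail_intervals grows, the limit jumps to it immediately, but when it shrinks the code blends 31 parts of the old limit with 1 part of the new target so the failure window contracts gradually instead of suspending the pool on the very next pass. A sketch of that filter with the same 31/32 weights; smooth_limit() is an illustrative name.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/*
 * Move the current limit toward the target: jump up immediately,
 * decay by roughly 1/32 per step when the target is lower.
 */
static uint64_t
smooth_limit(uint64_t current, uint64_t target)
{
	if (target < current)
		return ((31 * current + target) / 32);
	return (target);
}

int
main(void)
{
	uint64_t limit = 1000000;	/* arbitrary units, e.g. nanoseconds */
	uint64_t target = 100000;
	int i;

	for (i = 0; i < 5; i++) {
		limit = smooth_limit(limit, target);
		printf("step %d: limit=%" PRIu64 "\n", i, limit);
	}
	return (0);
}
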
Example #8
static void
txg_sync_thread(dsl_pool_t *dp)
{
	spa_t *spa = dp->dp_spa;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;
	vdev_stat_t *vs1, *vs2;
	uint64_t start, delta;

#ifdef _KERNEL
	/*
	 * Annotate this process with a flag that indicates that it is
	 * unsafe to use KM_SLEEP during memory allocations due to the
	 * potential for a deadlock.  KM_PUSHPAGE should be used instead.
	 */
	current->flags |= PF_NOFS;
#endif /* _KERNEL */

	txg_thread_enter(tx, &cpr);

	vs1 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE);
	vs2 = kmem_alloc(sizeof(vdev_stat_t), KM_PUSHPAGE);

	start = delta = 0;
	for (;;) {
		uint64_t timer, timeout;
		uint64_t txg;

		timeout = zfs_txg_timeout * hz;

		/*
		 * We sync when we're scanning, there's someone waiting
		 * on us, or the quiesce thread has handed off a txg to
		 * us, or we have reached our timeout.
		 */
		timer = (delta >= timeout ? 0 : timeout - delta);
		while (!dsl_scan_active(dp->dp_scan) &&
		    !tx->tx_exiting && timer > 0 &&
		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
		    tx->tx_quiesced_txg == 0) {
			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
			delta = ddi_get_lbolt() - start;
			timer = (delta > timeout ? 0 : timeout - delta);
		}

		/*
		 * Wait until the quiesce thread hands off a txg to us,
		 * prompting it to do so if necessary.
		 */
		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
			cv_broadcast(&tx->tx_quiesce_more_cv);
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
		}

		if (tx->tx_exiting) {
			kmem_free(vs2, sizeof(vdev_stat_t));
			kmem_free(vs1, sizeof(vdev_stat_t));
			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);
		}

		vdev_get_stats(spa->spa_root_vdev, vs1);

		/*
		 * Consume the quiesced txg which has been handed off to
		 * us.  This may cause the quiescing thread to now be
		 * able to quiesce another txg, so we must signal it.
		 */
		txg = tx->tx_quiesced_txg;
		tx->tx_quiesced_txg = 0;
		tx->tx_syncing_txg = txg;
		cv_broadcast(&tx->tx_quiesce_more_cv);

		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
		mutex_exit(&tx->tx_sync_lock);

		start = ddi_get_lbolt();
		spa_sync(spa, txg);
		delta = ddi_get_lbolt() - start;

		mutex_enter(&tx->tx_sync_lock);
		tx->tx_synced_txg = txg;
		tx->tx_syncing_txg = 0;
		cv_broadcast(&tx->tx_sync_done_cv);

		/*
		 * Dispatch commit callbacks to worker threads.
		 */
		txg_dispatch_callbacks(dp, txg);

		vdev_get_stats(spa->spa_root_vdev, vs2);
		spa_txg_history_set_io(spa, txg,
		    vs2->vs_bytes[ZIO_TYPE_READ]-vs1->vs_bytes[ZIO_TYPE_READ],
		    vs2->vs_bytes[ZIO_TYPE_WRITE]-vs1->vs_bytes[ZIO_TYPE_WRITE],
		    vs2->vs_ops[ZIO_TYPE_READ]-vs1->vs_ops[ZIO_TYPE_READ],
		    vs2->vs_ops[ZIO_TYPE_WRITE]-vs1->vs_ops[ZIO_TYPE_WRITE],
		    dp->dp_space_towrite[txg & TXG_MASK] +
		    dp->dp_tempreserved[txg & TXG_MASK] / 2);
		spa_txg_history_set(spa, txg, TXG_STATE_SYNCED, gethrtime());
	}
}
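
The per-txg I/O accounting at the bottom of the loop charges a txg only with the activity between the stat snapshot taken before spa_sync() and the one taken after it. The sketch below models that delta bookkeeping with a stand-in structure; io_stats_t and its fields are illustrative, not the real vdev_stat_t layout.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Stand-in for the handful of vdev_stat_t counters sampled above. */
typedef struct {
	uint64_t read_bytes;
	uint64_t write_bytes;
	uint64_t read_ops;
	uint64_t write_ops;
} io_stats_t;

/* Charge only the activity that happened between the two snapshots. */
static io_stats_t
io_stats_delta(const io_stats_t *before, const io_stats_t *after)
{
	io_stats_t d;

	d.read_bytes = after->read_bytes - before->read_bytes;
	d.write_bytes = after->write_bytes - before->write_bytes;
	d.read_ops = after->read_ops - before->read_ops;
	d.write_ops = after->write_ops - before->write_ops;
	return (d);
}

int
main(void)
{
	io_stats_t before = { 1000, 5000, 10, 50 };
	io_stats_t after = { 1600, 9000, 16, 90 };
	io_stats_t d = io_stats_delta(&before, &after);

	printf("read=%" PRIu64 " write=%" PRIu64 " rops=%" PRIu64
	    " wops=%" PRIu64 "\n",
	    d.read_bytes, d.write_bytes, d.read_ops, d.write_ops);
	return (0);
}
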
Example #9
/*
 * For unsolicited exchanges, FCoET is only responsible for allocation of
 * req_payload. FCT will allocate resp_payload after the exchange is
 * passed on.
 */
static fcoet_exchange_t *
fcoet_create_unsol_exchange(fcoe_frame_t *frm)
{
	uint8_t			 r_ctl;
	int			 cdb_size;
	fcoet_exchange_t	*xch, *xch_tmp;
	fct_cmd_t		*cmd;
	fcoe_fcp_cmnd_t		*ffc;
	uint32_t		task_expected_len = 0;

	r_ctl = FRM_R_CTL(frm);
	switch (r_ctl) {
	case 0x22:
		/*
		 * FCoET's unsolicited ELS
		 */
		cmd = (fct_cmd_t *)fct_alloc(FCT_STRUCT_CMD_RCVD_ELS,
		    GET_STRUCT_SIZE(fcoet_exchange_t) +
		    frm->frm_payload_size, 0);
		if (cmd == NULL) {
			FCOET_EXT_LOG(0, "can't get cmd");
			return (NULL);
		}
		break;

	case 0x06:
		/*
		 * FCoET's unsolicited SCSI cmd
		 */
		cdb_size = 16;	/* needs to be improved later */
		cmd = fct_scsi_task_alloc(FRM2SS(frm)->ss_port, FCT_HANDLE_NONE,
		    FRM_S_ID(frm), frm->frm_payload, cdb_size,
		    STMF_TASK_EXT_NONE);
		if (cmd == NULL) {
			FCOET_EXT_LOG(0, "can't get fcp cmd");
			return (NULL);
		}
		ffc = (fcoe_fcp_cmnd_t *)frm->frm_payload;
		task_expected_len = FCOE_B2V_4(ffc->ffc_fcp_dl);
		break;

	default:
		FCOET_EXT_LOG(0, "unsupported R_CTL: %x", r_ctl);
		return (NULL);
	}

	/*
	 * xch initialization
	 */
	xch = CMD2XCH(cmd);
	xch->xch_oxid = FRM_OXID(frm);
	xch->xch_flags = 0;
	xch->xch_ss = FRM2SS(frm);
	xch->xch_cmd = cmd;
	xch->xch_current_seq = NULL;
	xch->xch_left_data_size = 0;
	if (task_expected_len) {
		xch->xch_dbuf_num =
		    (task_expected_len + FCOET_MAX_DBUF_LEN - 1) /
		    FCOET_MAX_DBUF_LEN;
		xch->xch_dbufs =
		    kmem_zalloc(xch->xch_dbuf_num * sizeof (stmf_data_buf_t *),
		    KM_SLEEP);
	}
	xch->xch_start_time = ddi_get_lbolt();
	do {
		xch->xch_rxid = atomic_add_16_nv(
		    &xch->xch_ss->ss_next_unsol_rxid, 1);
		if (xch->xch_rxid == 0xFFFF) {
			xch->xch_rxid = atomic_add_16_nv(
			    &xch->xch_ss->ss_next_unsol_rxid, 1);
		}
	} while (mod_hash_find(FRM2SS(frm)->ss_unsol_rxid_hash,
	    (mod_hash_key_t)(intptr_t)xch->xch_rxid,
	    (mod_hash_val_t)&xch_tmp) == 0);

	xch->xch_sequence_no = 0;
	xch->xch_ref = 0;
	(void) mod_hash_insert(xch->xch_ss->ss_unsol_rxid_hash,
	    (mod_hash_key_t)(intptr_t)xch->xch_rxid, (mod_hash_val_t)xch);
	xch->xch_flags |= XCH_FLAG_IN_HASH_TABLE;

	/*
	 * cmd initialization
	 */
	cmd->cmd_port = FRM2SS(frm)->ss_port;
	cmd->cmd_rp_handle = FCT_HANDLE_NONE;
	cmd->cmd_rportid = FRM_S_ID(frm);
	cmd->cmd_lportid = FRM_D_ID(frm);
	cmd->cmd_oxid = xch->xch_oxid;
	cmd->cmd_rxid = xch->xch_rxid;

	fcoet_init_tfm(frm, xch);
	return (xch);
}
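
The RXID loop above hands out 16-bit exchange IDs from an atomically incremented counter, skips the reserved value 0xFFFF, and retries while the candidate is still present in the unsolicited-exchange hash. The sketch below keeps only the skip-the-reserved-value step, single-threaded and without the hash lookup; next_rxid() is an illustrative name.

#include <stdio.h>
#include <stdint.h>

static uint16_t rxid_counter;

/* Hand out the next 16-bit exchange ID, skipping the reserved 0xFFFF. */
static uint16_t
next_rxid(void)
{
	uint16_t id = ++rxid_counter;

	if (id == 0xFFFF)
		id = ++rxid_counter;	/* wraps to 0, the next usable value */
	return (id);
}

int
main(void)
{
	rxid_counter = 0xFFFD;
	printf("%#x\n", (unsigned int)next_rxid());	/* 0xfffe */
	printf("%#x\n", (unsigned int)next_rxid());	/* skips 0xffff, yields 0 */
	printf("%#x\n", (unsigned int)next_rxid());	/* 0x1 */
	return (0);
}
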
Example #10
int
ghd_waitq_process_and_mutex_hold(ccc_t *cccp)
{
	gcmd_t	*gcmdp;
	int	 rc = FALSE;

	ASSERT(mutex_owned(&cccp->ccc_hba_mutex));
	ASSERT(mutex_owned(&cccp->ccc_waitq_mutex));

	for (;;) {
		if (L2_EMPTY(&GHBA_QHEAD(cccp))) {
			/* return if the list is empty */
			GDBG_WAITQ(("ghd_waitq_proc: MT cccp 0x%p qp 0x%p\n",
			    (void *)cccp, (void *)&cccp->ccc_waitq));
			break;
		}
		if (GHBA_NACTIVE(cccp) >= GHBA_MAXACTIVE(cccp)) {
			/* return if the HBA is too active */
			GDBG_WAITQ(("ghd_waitq_proc: N>M cccp 0x%p qp 0x%p"
			    " N %ld max %ld\n", (void *)cccp,
			    (void *)&cccp->ccc_waitq,
			    GHBA_NACTIVE(cccp),
			    GHBA_MAXACTIVE(cccp)));
			break;
		}

		/*
		 * bail out if the wait queue has been
		 * "held" by the HBA driver
		 */
		if (cccp->ccc_waitq_held) {
			GDBG_WAITQ(("ghd_waitq_proc: held"));
			return (rc);
		}

		if (cccp->ccc_waitq_frozen) {

			clock_t lbolt, delay_in_hz, time_to_wait;

			delay_in_hz =
			    drv_usectohz(cccp->ccc_waitq_freezedelay * 1000);

			lbolt = ddi_get_lbolt();
			time_to_wait = delay_in_hz -
			    (lbolt - cccp->ccc_waitq_freezetime);

			if (time_to_wait > 0) {
				/*
				 * stay frozen; we'll be called again
				 * by ghd_timeout_softintr()
				 */
				GDBG_WAITQ(("ghd_waitq_proc: frozen"));
				return (rc);
			} else {
				/* unfreeze and continue */
				GDBG_WAITQ(("ghd_waitq_proc: unfreezing"));
				cccp->ccc_waitq_freezetime = 0;
				cccp->ccc_waitq_freezedelay = 0;
				cccp->ccc_waitq_frozen = 0;
			}
		}

		gcmdp = (gcmd_t *)L2_remove_head(&GHBA_QHEAD(cccp));
		GHBA_NACTIVE(cccp)++;
		gcmdp->cmd_waitq_level++;
		mutex_exit(&cccp->ccc_waitq_mutex);

		/*
		 * Start up the next I/O request
		 */
		ASSERT(gcmdp != NULL);
		gcmdp->cmd_state = GCMD_STATE_ACTIVE;
		if (!(*cccp->ccc_hba_start)(cccp->ccc_hba_handle, gcmdp)) {
			/* if the HBA rejected the request, requeue it */
			gcmdp->cmd_state = GCMD_STATE_WAITQ;
			mutex_enter(&cccp->ccc_waitq_mutex);
			GHBA_NACTIVE(cccp)--;
			gcmdp->cmd_waitq_level--;
			L2_add_head(&GHBA_QHEAD(cccp), &gcmdp->cmd_q, gcmdp);
			GDBG_WAITQ(("ghd_waitq_proc: busy cccp 0x%p gcmdp 0x%p"
			    " handle 0x%p\n", (void *)cccp, (void *)gcmdp,
			    cccp->ccc_hba_handle));
			break;
		}
		rc = TRUE;
		mutex_enter(&cccp->ccc_waitq_mutex);
		GDBG_WAITQ(("ghd_waitq_proc: ++ cccp 0x%p gcmdp 0x%p N %ld\n",
		    (void *)cccp, (void *)gcmdp, GHBA_NACTIVE(cccp)));
	}
	ASSERT(mutex_owned(&cccp->ccc_hba_mutex));
	ASSERT(mutex_owned(&cccp->ccc_waitq_mutex));
	return (rc);
}
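
The frozen-queue branch above works out how much of the freeze delay is still outstanding: the delay in milliseconds is converted to ticks (drv_usectohz() of the delay times 1000) and the ticks already elapsed since ccc_waitq_freezetime are subtracted from it; a positive remainder means the queue stays frozen. Below is a compact model of that computation with plain integers standing in for lbolt values and the tick conversion.

#include <stdio.h>

#define	TICKS_PER_SEC	100	/* stand-in for the kernel tick rate */

/* Ticks of freeze time still remaining; <= 0 means the queue may thaw. */
static long
freeze_remaining(long now, long freezetime, long freeze_msec)
{
	long delay_ticks = (freeze_msec * TICKS_PER_SEC) / 1000;

	return (delay_ticks - (now - freezetime));
}

int
main(void)
{
	long freezetime = 2000;		/* tick at which the queue was frozen */
	long freeze_msec = 500;		/* hypothetical 500 ms freeze window */

	printf("%ld\n", freeze_remaining(2010, freezetime, freeze_msec));	/* 40 */
	printf("%ld\n", freeze_remaining(2060, freezetime, freeze_msec));	/* -10 */
	return (0);
}
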
Example #11
	/*
	 * If adding a new entry would exceed the cache size,
	 * evict the oldest entry (LRU).
	 */
	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
	    zfs_vdev_cache_size) {
		ve = avl_first(&vc->vc_lastused_tree);
		if (ve->ve_fill_io != NULL)
			return (NULL);
		ASSERT3U(ve->ve_hits, !=, 0);
		vdev_cache_evict(vc, ve);
	}

	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
	ve->ve_offset = offset;
	ve->ve_lastused = ddi_get_lbolt();
	ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE);

	avl_add(&vc->vc_offset_tree, ve);
	avl_add(&vc->vc_lastused_tree, ve);

	return (ve);
}

static void
vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
{
	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);

	ASSERT(MUTEX_HELD(&vc->vc_lock));
	ASSERT3P(ve->ve_fill_io, ==, NULL);