Example 1
/*
 * __wt_lsm_work_switch --
 *	Do a switch if the LSM tree needs one.
 */
int
__wt_lsm_work_switch(
    WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, bool *ran)
{
	WT_DECL_RET;
	WT_LSM_WORK_UNIT *entry;

	/* We've become responsible for freeing the work unit. */
	entry = *entryp;
	*ran = false;
	*entryp = NULL;

	if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
		WT_WITH_SCHEMA_LOCK(session, ret,
		    ret = __wt_lsm_tree_switch(session, entry->lsm_tree));
		/* Failing to complete the switch is fine */
		if (ret == EBUSY) {
			if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				WT_ERR(__wt_lsm_manager_push_entry(session,
				    WT_LSM_WORK_SWITCH, 0, entry->lsm_tree));
			ret = 0;
		} else
			*ran = true;
	}
err:	__wt_lsm_manager_free_work_unit(session, entry);
	return (ret);
}
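A hedged sketch of the queueing side (an assumption, not taken from the WiredTiger source; the helper name __lsm_queue_switch is hypothetical): callers flag the tree and push a WT_LSM_WORK_SWITCH entry, which __wt_lsm_work_switch above later consumes.

/*
 * __lsm_queue_switch --
 *	Hypothetical helper: flag the LSM tree and queue a switch work unit.
 */
static int
__lsm_queue_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);

	/* A worker thread consumes the entry asynchronously. */
	return (__wt_lsm_manager_push_entry(
	    session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
}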
Example 2
/*
 * __wt_clsm_open_bulk --
 *	WT_SESSION->open_cursor method for LSM bulk cursors.
 */
int
__wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[])
{
	WT_CURSOR *cursor, *bulk_cursor;
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;

	bulk_cursor = NULL;
	cursor = &clsm->iface;
	lsm_tree = clsm->lsm_tree;
	session = (WT_SESSION_IMPL *)clsm->iface.session;

	F_SET(clsm, WT_CLSM_BULK);

	/* Bulk cursors are limited to insert and close. */
	__wt_cursor_set_notsup(cursor);
	cursor->insert = __clsm_insert_bulk;
	cursor->close = __clsm_close_bulk;

	/*
	 * Set up the first chunk in the tree. This is the only time we
	 * switch without using the LSM worker threads; it's safe to do here
	 * because we have an exclusive lock on the LSM tree. We need to do
	 * the switch inline, since a switch needs the schema lock and online
	 * index creation opens a bulk cursor while holding the schema lock.
	 */
	WT_WITH_SCHEMA_LOCK(session, ret,
	    ret = __wt_lsm_tree_switch(session, lsm_tree));
	WT_RET(ret);

	/*
	 * Open a bulk cursor on the first chunk; it's not a regular LSM
	 * chunk cursor, but it uses the standard storage locations. Allocate
	 * the space for a Bloom filter - it makes cleanup simpler. On error,
	 * it is cleaned up by the cursor close.
	 */
	WT_RET(__wt_calloc_one(session, &clsm->blooms));
	clsm->bloom_alloc = 1;
	WT_RET(__wt_calloc_one(session, &clsm->cursors));
	clsm->cursor_alloc = 1;
	clsm->nchunks = 1;

	/*
	 * Open a bulk cursor on the first chunk in the tree - take a read
	 * lock on the LSM tree while we are opening the chunk, to ensure
	 * that the first chunk has been fully created before we succeed.
	 * Pass through the application config to ensure the tree is open
	 * for bulk access.
	 */
	WT_RET(__wt_open_cursor(session,
	    lsm_tree->chunk[0]->uri, &clsm->iface, cfg, &bulk_cursor));
	clsm->cursors[0] = bulk_cursor;
	/* LSM cursors are always raw */
	F_SET(bulk_cursor, WT_CURSTD_RAW);

	return (0);
}
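For context, a usage sketch (an assumption, not from the source) of how an application reaches this function through the public API; the URI "lsm:example" and the surrounding variables are hypothetical:

	WT_CURSOR *bulk;
	WT_SESSION *wt_session;	/* An already-open session. */
	int ret;

	/* Bulk LSM cursors support only insert and close. */
	ret = wt_session->open_cursor(
	    wt_session, "lsm:example", NULL, "bulk", &bulk);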
Example 3
/*
 * __lsm_tree_open --
 *	Open an LSM tree structure.
 */
static int
__lsm_tree_open(
    WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep)
{
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));

	/* Make sure no one beat us to it. */
	TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
		if (strcmp(uri, lsm_tree->name) == 0) {
			*treep = lsm_tree;
			return (0);
		}

	/* Try to open the tree. */
	WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
	WT_ERR(__wt_rwlock_alloc(session, "lsm tree", &lsm_tree->rwlock));
	WT_ERR(__wt_cond_alloc(session, "lsm ckpt", 0, &lsm_tree->work_cond));
	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

	WT_ERR(__wt_lsm_meta_read(session, lsm_tree));

	/*
	 * Sanity check the configuration. Do it now since this is the first
	 * time we have the LSM tree configuration.
	 */
	WT_ERR(__lsm_tree_open_check(session, lsm_tree));

	if (lsm_tree->nchunks == 0) {
		F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
		WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
	}

	/* Set the generation number so cursors are opened on first usage. */
	lsm_tree->dsk_gen = 1;

	/* Now that the tree is set up, make it visible to others. */
	lsm_tree->refcnt = 1;
	TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q);
	F_SET(lsm_tree, WT_LSM_TREE_OPEN);

	WT_ERR(__lsm_tree_start_worker(session, lsm_tree));
	*treep = lsm_tree;

	if (0) {
err:		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	return (ret);
}
Example 4
/*
 * __lsm_tree_open --
 *	Open an LSM tree structure.
 */
static int
__lsm_tree_open(
    WT_SESSION_IMPL *session, const char *uri, WT_LSM_TREE **treep)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;

	conn = S2C(session);
	lsm_tree = NULL;

	WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));

	/* Start the LSM manager thread if it isn't running. */
	if (WT_ATOMIC_CAS4(conn->lsm_manager.lsm_workers, 0, 1))
		WT_RET(__wt_lsm_manager_start(session));

	/* Make sure no one beat us to it. */
	TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
		if (strcmp(uri, lsm_tree->name) == 0) {
			*treep = lsm_tree;
			return (0);
		}

	/* Try to open the tree. */
	WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
	WT_ERR(__wt_rwlock_alloc(session, &lsm_tree->rwlock, "lsm tree"));

	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

	WT_ERR(__wt_lsm_meta_read(session, lsm_tree));

	/*
	 * Sanity check the configuration. Do it now since this is the first
	 * time we have the LSM tree configuration.
	 */
	WT_ERR(__lsm_tree_open_check(session, lsm_tree));

	if (lsm_tree->nchunks == 0) {
		F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
		WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
	}

	/* Set the generation number so cursors are opened on first usage. */
	lsm_tree->dsk_gen = 1;

	/*
	 * Set up reference counting. Use separate reference counts for tree
	 * handles and queue entries, so that queue entries don't interfere
	 * with getting handles exclusive.
	 */
	lsm_tree->refcnt = 1;
	lsm_tree->queue_ref = 0;

	/* Set a flush timestamp as a baseline. */
	WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts));

	/* Now that the tree is set up, make it visible to others. */
	TAILQ_INSERT_HEAD(&S2C(session)->lsmqh, lsm_tree, q);
	F_SET(lsm_tree, WT_LSM_TREE_ACTIVE | WT_LSM_TREE_OPEN);

	*treep = lsm_tree;

	if (0) {
err:		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	return (ret);
}
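A caller-side sketch (an assumption, not from the source): the function asserts WT_SESSION_SCHEMA_LOCKED, so callers wrap the open in the schema lock, shown here with the two-argument WT_WITH_SCHEMA_LOCK form used in Examples 6 and 7 (declarations omitted):

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __lsm_tree_open(session, uri, &lsm_tree));
	WT_RET(ret);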
Example 5
/*
 * __clsm_open_cursors --
 *	Open cursors for the current set of files.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id)
{
	WT_CURSOR *c, **cp, *primary;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;
	const char *checkpoint, *ckpt_cfg[3];
	uint64_t saved_gen;
	u_int i, nchunks, ngood, nupdates;
	int locked;

	c = &clsm->iface;
	session = (WT_SESSION_IMPL *)c->session;
	txn = &session->txn;
	lsm_tree = clsm->lsm_tree;
	chunk = NULL;

	ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
	ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
	ckpt_cfg[2] = NULL;

	/* Copy the key, so we don't lose the cursor position. */
	if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key))
		WT_RET(__wt_buf_set(
		    session, &c->key, c->key.data, c->key.size));

	F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

	if (update) {
		if (txn->isolation == TXN_ISO_SNAPSHOT)
			F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
	} else
		F_SET(clsm, WT_CLSM_OPEN_READ);

	WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
	locked = 1;
	/*
	 * If there is no in-memory chunk in the tree for an update operation,
	 * create one.
	 *
	 * !!!
	 * It is exceedingly unlikely that we get here at all, but if there is
	 * transaction in progress and it rolls back, it would leave the
	 * metadata inconsistent.
	 */
	if (update && (lsm_tree->nchunks == 0 ||
	    (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL ||
	    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))) {
		/* Release our lock because switch will get a write lock. */
		locked = 0;
		WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
		WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
		WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
		locked = 1;
	}
	F_SET(session, WT_SESSION_NO_CACHE_CHECK);

	/* Merge cursors have already figured out how many chunks they need. */
retry:	if (F_ISSET(clsm, WT_CLSM_MERGE)) {
		nchunks = clsm->nchunks;
		ngood = 0;

		/*
		 * We may have raced with another merge completing.  Check that
		 * we're starting at the right offset in the chunk array.
		 */
		if (start_chunk >= lsm_tree->nchunks ||
		    lsm_tree->chunk[start_chunk]->id != start_id) {
			for (start_chunk = 0;
			    start_chunk < lsm_tree->nchunks;
			    start_chunk++) {
				chunk = lsm_tree->chunk[start_chunk];
				if (chunk->id == start_id)
					break;
			}
			/* We have to find the start chunk: merge locked it. */
			WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
		}

		WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
	} else {
		nchunks = lsm_tree->nchunks;

		/*
		 * If we are only opening the cursor for updates, only open the
		 * primary chunk, plus any other chunks that might be required
		 * to detect snapshot isolation conflicts.
		 */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			WT_ERR(__wt_realloc_def(session,
			    &clsm->txnid_alloc, nchunks,
			    &clsm->txnid_max));
		if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
			ngood = nupdates = 0;
		else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
			/*
			 * Keep going until all updates in the next
			 * chunk are globally visible.  Copy the maximum
			 * transaction IDs into the cursor as we go.
			 */
			for (ngood = nchunks - 1, nupdates = 1;
			    ngood > 0;
			    ngood--, nupdates++) {
				chunk = lsm_tree->chunk[ngood - 1];
				clsm->txnid_max[ngood - 1] =
				    chunk->txnid_max;
				if (__wt_txn_visible_all(
				    session, chunk->txnid_max))
					break;
			}
		} else {
			nupdates = 1;
			ngood = nchunks - 1;
		}

		/* Check how many cursors are already open. */
		for (cp = clsm->cursors + ngood;
		    ngood < clsm->nchunks && ngood < nchunks;
		    cp++, ngood++) {
			chunk = lsm_tree->chunk[ngood];

			/* If the cursor isn't open yet, we're done. */
			if (*cp == NULL)
				break;

			/* Easy case: the URIs don't match. */
			if (strcmp((*cp)->uri, chunk->uri) != 0)
				break;

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			if (checkpoint == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty)
				break;

			/* Make sure the Bloom config matches. */
			if (clsm->blooms[ngood] == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
				break;
		}

		/* Spurious generation bump? */
		if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
			clsm->dsk_gen = lsm_tree->dsk_gen;
			goto err;
		}

		/*
		 * Close any cursors we no longer need.
		 *
		 * Drop the LSM tree lock while we do this: if the cache is
		 * full, we may block while closing a cursor.  Save the
		 * generation number and retry if it has changed under us.
		 */
		if (clsm->cursors != NULL && (ngood < clsm->nchunks ||
		    (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0))) {
			saved_gen = lsm_tree->dsk_gen;
			locked = 0;
			WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
			if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0)
				WT_ERR(__clsm_close_cursors(
				    clsm, 0, nchunks - nupdates));
			WT_ERR(__clsm_close_cursors(
			    clsm, ngood, clsm->nchunks));
			WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
			locked = 1;
			if (lsm_tree->dsk_gen != saved_gen)
				goto retry;
		}

		/* Detach from our old primary. */
		clsm->primary_chunk = NULL;
		clsm->current = NULL;
	}

	WT_ERR(__wt_realloc_def(session,
	    &clsm->bloom_alloc, nchunks, &clsm->blooms));
	WT_ERR(__wt_realloc_def(session,
	    &clsm->cursor_alloc, nchunks, &clsm->cursors));

	clsm->nchunks = nchunks;

	/* Open the cursors for chunks that have changed. */
	for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
		chunk = lsm_tree->chunk[i + start_chunk];
		/* Copy the maximum transaction ID. */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			clsm->txnid_max[i] = chunk->txnid_max;

		/*
		 * Read from the checkpoint if the file has been written.
		 * Once all cursors switch, the in-memory tree can be evicted.
		 */
		WT_ASSERT(session, *cp == NULL);
		ret = __wt_open_cursor(session, chunk->uri, c,
		    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
			ckpt_cfg : NULL, cp);

		/*
		 * XXX kludge: we may have an empty chunk where no checkpoint
		 * was written.  If so, try to open the ordinary handle on that
		 * chunk instead.
		 */
		if (ret == WT_NOTFOUND &&
		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
			ret = __wt_open_cursor(
			    session, chunk->uri, c, NULL, cp);
			if (ret == 0)
				chunk->empty = 1;
		}
		WT_ERR(ret);

		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
		    !F_ISSET(clsm, WT_CLSM_MERGE))
			WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
			    lsm_tree->bloom_bit_count,
			    lsm_tree->bloom_hash_count,
			    c, &clsm->blooms[i]));

		/* Child cursors always use overwrite and raw mode. */
		F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
	}

	/* The last chunk is our new primary. */
	if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		clsm->primary_chunk = chunk;
		primary = clsm->cursors[clsm->nchunks - 1];
		WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)(primary))->btree,
		    __wt_btree_evictable(session, 0));
	}

	clsm->dsk_gen = lsm_tree->dsk_gen;
err:	F_CLR(session, WT_SESSION_NO_CACHE_CHECK);
#ifdef HAVE_DIAGNOSTIC
	/* Check that all cursors are open as expected. */
	if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
		for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) {
			chunk = lsm_tree->chunk[i + start_chunk];

			/* Make sure the cursor is open. */
			WT_ASSERT(session, *cp != NULL);

			/* Easy case: the URIs should match. */
			WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0);

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty) ?
			    checkpoint != NULL : checkpoint == NULL);

			/* Make sure the Bloom config matches. */
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
			    !F_ISSET(clsm, WT_CLSM_MERGE)) ?
			    clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
		}
	}
#endif
	if (locked)
		WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	return (ret);
}
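The close-cursors path above relies on a generation-check retry. A distilled restatement of that pattern (no new API; names are as in the function above):

	/* Drop the tree lock around blocking work, then revalidate. */
	saved_gen = lsm_tree->dsk_gen;
	WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
	/* ... blocking work, e.g., closing cursors ... */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
	if (lsm_tree->dsk_gen != saved_gen)
		goto retry;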
Example 6
/*
 * __wt_lsm_merge_worker --
 *	The merge worker thread for an LSM tree, responsible for merging
 *	on-disk trees.
 */
void *
__wt_lsm_merge_worker(void *vargs)
{
	WT_DECL_RET;
	WT_LSM_WORKER_ARGS *args;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	u_int aggressive, chunk_wait, id, old_aggressive, stallms;
	int progress;

	args = vargs;
	lsm_tree = args->lsm_tree;
	id = args->id;
	session = lsm_tree->worker_sessions[id];
	__wt_free(session, args);

	aggressive = stallms = 0;

	while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
		/*
		 * Help out with switching chunks in case the checkpoint worker
		 * is busy.
		 */
		if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			WT_WITH_SCHEMA_LOCK(session, ret =
			    __wt_lsm_tree_switch(session, lsm_tree));
			WT_ERR(ret);
		}

		progress = 0;

		/* Clear any state from previous worker thread iterations. */
		session->dhandle = NULL;

		/* Try to create a Bloom filter. */
		if (__lsm_bloom_work(session, lsm_tree) == 0)
			progress = 1;

		/* If we didn't create a Bloom filter, try to merge. */
		if (progress == 0 &&
		    __wt_lsm_merge(session, lsm_tree, id, aggressive) == 0)
			progress = 1;

		/* Clear any state from previous worker thread iterations. */
		WT_CLEAR_BTREE_IN_SESSION(session);

		/*
		 * Only have one thread freeing old chunks, and only if there
		 * are chunks to free.
		 */
		if (id == 0 && lsm_tree->nold_chunks > 0 &&
		    __lsm_free_chunks(session, lsm_tree) == 0)
			progress = 1;

		if (progress)
			stallms = 0;
		else if (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
		    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			/* Poll 10 times per second. */
			WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
			    session, lsm_tree->work_cond, 100000));
			stallms += 100;

			/*
			 * Get aggressive if more than enough chunks for a
			 * merge should have been created while we waited.
			 * Use 10 seconds as a default if we don't have an
			 * estimate.
			 */
			chunk_wait = stallms / (lsm_tree->chunk_fill_ms == 0 ?
			    10000 : lsm_tree->chunk_fill_ms);
			old_aggressive = aggressive;
			aggressive = chunk_wait / lsm_tree->merge_min;
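			/*
			 * A worked example of the computation above
			 * (illustrative numbers only): with no estimate the
			 * divisor defaults to 10000ms, so after stalling for
			 * 30 seconds chunk_wait is 3; with merge_min 3,
			 * aggressive becomes 1.
			 */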

			if (aggressive > old_aggressive)
				WT_VERBOSE_ERR(session, lsm,
				     "LSM merge got aggressive (%u), "
				     "%u / %" PRIu64,
				     aggressive, stallms,
				     lsm_tree->chunk_fill_ms);
		}
	}

	if (0) {
err:		__wt_err(session, ret, "LSM merge worker failed");
	}

	return (NULL);
}
Example 7
/*
 * __wt_lsm_checkpoint_worker --
 *	A worker thread for an LSM tree, responsible for flushing new chunks to
 *	disk.
 */
void *
__wt_lsm_checkpoint_worker(void *arg)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_LSM_WORKER_COOKIE cookie;
	WT_SESSION_IMPL *session;
	WT_TXN_ISOLATION saved_isolation;
	u_int i, j;
	int locked;

	lsm_tree = arg;
	session = lsm_tree->ckpt_session;

	WT_CLEAR(cookie);

	while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
		if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			WT_WITH_SCHEMA_LOCK(session, ret =
			    __wt_lsm_tree_switch(session, lsm_tree));
			WT_ERR(ret);
		}

		WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

		/* Write checkpoints in all completed files. */
		for (i = 0, j = 0; i < cookie.nchunks - 1; i++) {
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
				goto err;

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			chunk = cookie.chunk_array[i];

			/* Stop if a running transaction needs the chunk. */
			__wt_txn_update_oldest(session);
			if (!__wt_txn_visible_all(session, chunk->txnid_max))
				break;

			/*
			 * If the chunk is already checkpointed, make sure it
			 * is also evicted.  Either way, there is no point
			 * trying to checkpoint it again.
			 */
			if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) {
				if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_EVICTED))
					continue;

				if ((ret = __lsm_discard_handle(
				    session, chunk->uri, NULL)) == 0)
					F_SET_ATOMIC(
					    chunk, WT_LSM_CHUNK_EVICTED);
				else if (ret == EBUSY)
					ret = 0;
				else
					WT_ERR_MSG(session, ret,
					    "discard handle");
				continue;
			}

			WT_VERBOSE_ERR(session, lsm,
			     "LSM worker flushing %u", i);

			/*
			 * Flush the file before checkpointing: this is the
			 * expensive part in terms of I/O: do it without
			 * holding the schema lock.
			 *
			 * Use the special eviction isolation level to avoid
			 * interfering with an application checkpoint: we have
			 * already checked that all of the updates in this
			 * chunk are globally visible.
			 *
			 * !!! We can wait here for checkpoints and fsyncs to
			 * complete, which can be a long time.
			 *
			 * Don't keep waiting for the lock if application
			 * threads are waiting for a switch.  Don't skip
			 * flushing the leaves either: that just means we'll
			 * hold the schema lock for (much) longer, which blocks
			 * the world.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			for (locked = 0;
			    !locked && ret == 0 &&
			    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) {
				if ((ret = __wt_spin_trylock(session,
				    &S2C(session)->checkpoint_lock)) == 0)
					locked = 1;
				else if (ret == EBUSY) {
					__wt_yield();
					ret = 0;
				}
			}
			if (locked) {
				saved_isolation = session->txn.isolation;
				session->txn.isolation = TXN_ISO_EVICTION;
				ret = __wt_bt_cache_op(
				    session, NULL, WT_SYNC_WRITE_LEAVES);
				session->txn.isolation = saved_isolation;
				__wt_spin_unlock(
				    session, &S2C(session)->checkpoint_lock);
			}
			WT_TRET(__wt_session_release_btree(session));
			WT_ERR(ret);

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			WT_VERBOSE_ERR(session, lsm,
			     "LSM worker checkpointing %u", i);

			WT_WITH_SCHEMA_LOCK(session,
			    ret = __wt_schema_worker(session, chunk->uri,
			    __wt_checkpoint, NULL, NULL, 0));

			if (ret != 0) {
				__wt_err(session, ret, "LSM checkpoint");
				break;
			}

			WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
			/*
			 * Clear the "cache resident" flag so the primary can
			 * be evicted and eventually closed.  Only do this once
			 * the checkpoint has succeeded: otherwise, accessing
			 * the leaf page during the checkpoint can trigger
			 * forced eviction.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			__wt_btree_evictable(session, 1);
			WT_ERR(__wt_session_release_btree(session));

			++j;
			WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
			F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK);
			ret = __wt_lsm_meta_write(session, lsm_tree);
			++lsm_tree->dsk_gen;

			/* Update the throttle time. */
			__wt_lsm_tree_throttle(session, lsm_tree);
			WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

			/* Make sure we aren't pinning a transaction ID. */
			__wt_txn_release_snapshot(session);

			if (ret != 0) {
				__wt_err(session, ret,
				    "LSM checkpoint metadata write");
				break;
			}

			WT_VERBOSE_ERR(session, lsm,
			     "LSM worker checkpointed %u", i);
		}
		__lsm_unpin_chunks(session, &cookie);
		if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
		    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
			WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
			    session, lsm_tree->work_cond, 100000));
	}
err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	/*
	 * The thread will only exit with failure if we run out of memory or
 * there is some other system-driven failure. We can't keep going
	 * after such a failure - ensure WiredTiger shuts down.
	 */
	if (ret != 0 && ret != WT_NOTFOUND)
		WT_PANIC_ERR(session, ret,
		    "Shutting down LSM checkpoint utility thread");
	return (NULL);
}
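The trylock loop in the flush path above is worth restating on its own; a distilled sketch (an assumption: stop_requested and lock stand in for the WT_LSM_TREE_NEED_SWITCH check and the connection's checkpoint_lock):

	for (locked = 0; !locked && ret == 0 && !stop_requested;) {
		if ((ret = __wt_spin_trylock(session, &lock)) == 0)
			locked = 1;
		else if (ret == EBUSY) {
			/* Avoid spinning hot while the lock is busy. */
			__wt_yield();
			ret = 0;
		}
	}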