Exemple #1
0
/*
 * __wt_lsm_tree_truncate --
 *	Truncate an LSM tree.
 */
int
__wt_lsm_tree_truncate(
    WT_SESSION_IMPL *session, const char *name, const char *cfg[])
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	int locked;

	WT_UNUSED(cfg);
	chunk = NULL;
	locked = 0;

	/* Get the LSM tree. */
	WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));

	/* Shut down the LSM worker. */
	WT_RET(__lsm_tree_close(session, lsm_tree));

	/* Prevent any new opens. */
	WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 1));
	locked = 1;

	/* Create the new chunk. */
	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = WT_ATOMIC_ADD(lsm_tree->last, 1);
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	/* Mark all chunks old. */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, 0, lsm_tree->nchunks, chunk));

	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	WT_ERR(__lsm_tree_start_worker(session, lsm_tree));
	locked = 0;
	WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
	__wt_lsm_tree_release(session, lsm_tree);

err:	if (locked) 
		WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	if (ret != 0) {
		if (chunk != NULL) {
			(void)__wt_schema_drop(session, chunk->uri, NULL);
			__wt_free(session, chunk);
		}
		/*
		 * Discard the LSM tree structure on error. This will force the
		 * LSM tree to be re-opened the next time it is accessed and
		 * the last good version of the metadata will be used, resulting
		 * in a valid (not truncated) tree.
		 */
		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	return (ret);
}
Exemple #2
0
/*
 * __wt_lsm_tree_drop --
 *	Drop an LSM tree.
 */
int
__wt_lsm_tree_drop(
    WT_SESSION_IMPL *session, const char *name, const char *cfg[])
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	u_int i;
	int locked;

	locked = 0;

	/* Get the LSM tree. */
	WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));

	/* Shut down the LSM worker. */
	WT_ERR(__lsm_tree_close(session, lsm_tree));

	/* Prevent any new opens. */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
	locked = 1;

	/* Drop the chunks. */
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];
		WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
		if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(
			    __wt_schema_drop(session, chunk->bloom_uri, cfg));
	}

	/* Drop any chunks on the obsolete list. */
	for (i = 0; i < lsm_tree->nold_chunks; i++) {
		if ((chunk = lsm_tree->old_chunks[i]) == NULL)
			continue;
		WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
		if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(
			    __wt_schema_drop(session, chunk->bloom_uri, cfg));
	}

	locked = 0;
	WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
	ret = __wt_metadata_remove(session, name);

err:	if (locked)
		WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	WT_TRET(__lsm_tree_discard(session, lsm_tree));
	return (ret);
}
/*
 * __lsm_copy_chunks --
 *	 Take a copy of part of the LSM tree chunk array so that we can work on
 *	 the contents without holding the LSM tree handle lock long term.
 */
static int
__lsm_copy_chunks(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_WORKER_COOKIE *cookie, int old_chunks)
{
	WT_DECL_RET;
	u_int i, nchunks;
	size_t alloc;

	/* Always return zero chunks on error. */
	cookie->nchunks = 0;

	WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
	if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
		return (__wt_lsm_tree_unlock(session, lsm_tree));

	/* Take a copy of the current state of the LSM tree. */
	nchunks = old_chunks ? lsm_tree->nold_chunks : lsm_tree->nchunks;
	alloc = old_chunks ? lsm_tree->old_alloc : lsm_tree->chunk_alloc;

	/*
	 * If the tree array of active chunks is larger than our current buffer,
	 * increase the size of our current buffer to match.
	 */
	if (cookie->chunk_alloc < alloc)
		WT_ERR(__wt_realloc(session,
		    &cookie->chunk_alloc, alloc, &cookie->chunk_array));
	if (nchunks > 0)
		memcpy(cookie->chunk_array,
		    old_chunks ? lsm_tree->old_chunks : lsm_tree->chunk,
		    nchunks * sizeof(*cookie->chunk_array));

	/*
	 * Mark each chunk as active, so we don't drop it until after we know
	 * it's safe.
	 */
	for (i = 0; i < nchunks; i++)
		(void)WT_ATOMIC_ADD(cookie->chunk_array[i]->refcnt, 1);

err:	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

	if (ret == 0)
		cookie->nchunks = nchunks;
	return (ret);
}
Exemple #4
0
/*
 * __wt_lsm_tree_switch --
 *	Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	uint32_t nchunks, new_id;

	WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 1));

	/*
	 * Check if a switch is still needed: we may have raced while waiting
	 * for a lock.
	 */
	if ((nchunks = lsm_tree->nchunks) != 0 &&
	    (chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
	    !F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
		goto err;

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree);

	new_id = WT_ATOMIC_ADD(lsm_tree->last, 1);

	WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
	    nchunks + 1, &lsm_tree->chunk));

	WT_VERBOSE_ERR(session, lsm,
	    "Tree switch to: %" PRIu32 ", throttle %ld",
	    new_id, lsm_tree->throttle_sleep);

	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = new_id;
	chunk->txnid_max = WT_TXN_NONE;
	lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
	F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
	++lsm_tree->dsk_gen;

	lsm_tree->modified = 1;

err:	/* TODO: mark lsm_tree bad on error(?) */
	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	return (ret);
}
Exemple #5
0
/*
 * __wt_lsm_tree_worker --
 *	Run a schema worker operation on each level of a LSM tree.
 */
int
__wt_lsm_tree_worker(WT_SESSION_IMPL *session,
   const char *uri,
   int (*file_func)(WT_SESSION_IMPL *, const char *[]),
   int (*name_func)(WT_SESSION_IMPL *, const char *, int *),
   const char *cfg[], uint32_t open_flags)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	u_int i;

	WT_RET(__wt_lsm_tree_get(session, uri,
	    FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE) ? 1 : 0, &lsm_tree));

	/*
	 * We mark that we're busy using the tree to coordinate
	 * with merges so that merging doesn't change the chunk
	 * array out from underneath us.
	 */
	WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];
		if (file_func == __wt_checkpoint &&
		    F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK))
			continue;
		WT_ERR(__wt_schema_worker(session, chunk->uri,
		    file_func, name_func, cfg, open_flags));
		if (name_func == __wt_backup_list_uri_append &&
		    F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_schema_worker(session, chunk->bloom_uri,
			    file_func, name_func, cfg, open_flags));
	}
err:	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	__wt_lsm_tree_release(session, lsm_tree);
	return (ret);
}
/*
 * __clsm_open_cursors --
 *	Open cursors for the current set of files.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id)
{
	WT_CURSOR *c, **cp, *primary;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;
	const char *checkpoint, *ckpt_cfg[3];
	uint64_t saved_gen;
	u_int i, nchunks, ngood, nupdates;
	int locked;

	c = &clsm->iface;
	session = (WT_SESSION_IMPL *)c->session;
	txn = &session->txn;
	lsm_tree = clsm->lsm_tree;
	chunk = NULL;

	ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
	ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
	ckpt_cfg[2] = NULL;

	/* Copy the key, so we don't lose the cursor position. */
	if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key))
		WT_RET(__wt_buf_set(
		    session, &c->key, c->key.data, c->key.size));

	F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

	if (update) {
		if (txn->isolation == TXN_ISO_SNAPSHOT)
			F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
	} else
		F_SET(clsm, WT_CLSM_OPEN_READ);

	WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
	locked = 1;
	/*
	 * If there is no in-memory chunk in the tree for an update operation,
	 * create one.
	 *
	 * !!!
	 * It is exceeding unlikely that we get here at all, but if there is a
	 * transaction in progress and it rolls back, it would leave the
	 * metadata inconsistent.
	 */
	if (update && (lsm_tree->nchunks == 0 ||
	    (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL ||
	    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))) {
		/* Release our lock because switch will get a write lock. */
		locked = 0;
		WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
		WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
		WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
		locked = 1;
	}
	F_SET(session, WT_SESSION_NO_CACHE_CHECK);

	/* Merge cursors have already figured out how many chunks they need. */
retry:	if (F_ISSET(clsm, WT_CLSM_MERGE)) {
		nchunks = clsm->nchunks;
		ngood = 0;

		/*
		 * We may have raced with another merge completing.  Check that
		 * we're starting at the right offset in the chunk array.
		 */
		if (start_chunk >= lsm_tree->nchunks ||
		    lsm_tree->chunk[start_chunk]->id != start_id) {
			for (start_chunk = 0;
			    start_chunk < lsm_tree->nchunks;
			    start_chunk++) {
				chunk = lsm_tree->chunk[start_chunk];
				if (chunk->id == start_id)
					break;
			}
			/* We have to find the start chunk: merge locked it. */
			WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
		}

		WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
	} else {
		nchunks = lsm_tree->nchunks;

		/*
		 * If we are only opening the cursor for updates, only open the
		 * primary chunk, plus any other chunks that might be required
		 * to detect snapshot isolation conflicts.
		 */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			WT_ERR(__wt_realloc_def(session,
			    &clsm->txnid_alloc, nchunks,
			    &clsm->txnid_max));
		if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
			ngood = nupdates = 0;
		else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
			/*
			 * Keep going until all updates in the next
			 * chunk are globally visible.  Copy the maximum
			 * transaction IDs into the cursor as we go.
			 */
			for (ngood = nchunks - 1, nupdates = 1;
			    ngood > 0;
			    ngood--, nupdates++) {
				chunk = lsm_tree->chunk[ngood - 1];
				clsm->txnid_max[ngood - 1] =
				    chunk->txnid_max;
				if (__wt_txn_visible_all(
				    session, chunk->txnid_max))
					break;
			}
		} else {
			nupdates = 1;
			ngood = nchunks - 1;
		}

		/* Check how many cursors are already open. */
		for (cp = clsm->cursors + ngood;
		    ngood < clsm->nchunks && ngood < nchunks;
		    cp++, ngood++) {
			chunk = lsm_tree->chunk[ngood];

			/* If the cursor isn't open yet, we're done. */
			if (*cp == NULL)
				break;

			/* Easy case: the URIs don't match. */
			if (strcmp((*cp)->uri, chunk->uri) != 0)
				break;

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			if (checkpoint == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty)
				break;

			/* Make sure the Bloom config matches. */
			if (clsm->blooms[ngood] == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
				break;
		}

		/* Spurious generation bump? */
		if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
			clsm->dsk_gen = lsm_tree->dsk_gen;
			goto err;
		}

		/*
		 * Close any cursors we no longer need.
		 *
		 * Drop the LSM tree lock while we do this: if the cache is
		 * full, we may block while closing a cursor.  Save the
		 * generation number and retry if it has changed under us.
		 */
		if (clsm->cursors != NULL && (ngood < clsm->nchunks ||
		    (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0))) {
			saved_gen = lsm_tree->dsk_gen;
			locked = 0;
			WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
			if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0)
				WT_ERR(__clsm_close_cursors(
				    clsm, 0, nchunks - nupdates));
			WT_ERR(__clsm_close_cursors(
			    clsm, ngood, clsm->nchunks));
			WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
			locked = 1;
			if (lsm_tree->dsk_gen != saved_gen)
				goto retry;
		}

		/* Detach from our old primary. */
		clsm->primary_chunk = NULL;
		clsm->current = NULL;
	}

	WT_ERR(__wt_realloc_def(session,
	    &clsm->bloom_alloc, nchunks, &clsm->blooms));
	WT_ERR(__wt_realloc_def(session,
	    &clsm->cursor_alloc, nchunks, &clsm->cursors));

	clsm->nchunks = nchunks;

	/* Open the cursors for chunks that have changed. */
	for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
		chunk = lsm_tree->chunk[i + start_chunk];
		/* Copy the maximum transaction ID. */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			clsm->txnid_max[i] = chunk->txnid_max;

		/*
		 * Read from the checkpoint if the file has been written.
		 * Once all cursors switch, the in-memory tree can be evicted.
		 */
		WT_ASSERT(session, *cp == NULL);
		ret = __wt_open_cursor(session, chunk->uri, c,
		    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
			ckpt_cfg : NULL, cp);

		/*
		 * XXX kludge: we may have an empty chunk where no checkpoint
		 * was written.  If so, try to open the ordinary handle on that
		 * chunk instead.
		 */
		if (ret == WT_NOTFOUND &&
		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
			ret = __wt_open_cursor(
			    session, chunk->uri, c, NULL, cp);
			if (ret == 0)
				chunk->empty = 1;
		}
		WT_ERR(ret);

		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
		    !F_ISSET(clsm, WT_CLSM_MERGE))
			WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
			    lsm_tree->bloom_bit_count,
			    lsm_tree->bloom_hash_count,
			    c, &clsm->blooms[i]));

		/* Child cursors always use overwrite and raw mode. */
		F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
	}

	/* The last chunk is our new primary. */
	if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		clsm->primary_chunk = chunk;
		primary = clsm->cursors[clsm->nchunks - 1];
		WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)(primary))->btree,
		    __wt_btree_evictable(session, 0));
	}

	clsm->dsk_gen = lsm_tree->dsk_gen;
err:	F_CLR(session, WT_SESSION_NO_CACHE_CHECK);
#ifdef HAVE_DIAGNOSTIC
	/* Check that all cursors are open as expected. */
	if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
		for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) {
			chunk = lsm_tree->chunk[i + start_chunk];

			/* Make sure the cursor is open. */
			WT_ASSERT(session, *cp != NULL);

			/* Easy case: the URIs should match. */
			WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0);

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty) ?
			    checkpoint != NULL : checkpoint == NULL);

			/* Make sure the Bloom config matches. */
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
			    !F_ISSET(clsm, WT_CLSM_MERGE)) ?
			    clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
		}
	}
#endif
	if (locked)
		WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	return (ret);
}
Exemple #7
0
/*
 * __wt_lsm_tree_rename --
 *	Rename an LSM tree.
 */
int
__wt_lsm_tree_rename(WT_SESSION_IMPL *session,
    const char *olduri, const char *newuri, const char *cfg[])
{
	WT_DECL_RET;
	WT_ITEM buf;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	const char *old;
	u_int i;
	int locked;

	old = NULL;
	WT_CLEAR(buf);
	locked = 0;

	/* Get the LSM tree. */
	WT_RET(__wt_lsm_tree_get(session, olduri, 1, &lsm_tree));

	/* Shut down the LSM worker. */
	WT_ERR(__lsm_tree_close(session, lsm_tree));

	/* Prevent any new opens. */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
	locked = 1;

	/* Set the new name. */
	WT_ERR(__lsm_tree_set_name(session, lsm_tree, newuri));

	/* Rename the chunks. */
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];
		old = chunk->uri;
		chunk->uri = NULL;

		WT_ERR(__wt_lsm_tree_chunk_name(
		    session, lsm_tree, chunk->id, &buf));
		chunk->uri = __wt_buf_steal(session, &buf, NULL);
		WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg));
		__wt_free(session, old);

		if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM)) {
			old = chunk->bloom_uri;
			chunk->bloom_uri = NULL;
			WT_ERR(__wt_lsm_tree_bloom_name(
			    session, lsm_tree, chunk->id, &buf));
			chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL);
			F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
			WT_ERR(__wt_schema_rename(
			    session, old, chunk->uri, cfg));
			__wt_free(session, old);
		}
	}

	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
	locked = 0;
	WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
	WT_ERR(__wt_metadata_remove(session, olduri));

err:	if (locked)
		WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	if (old != NULL)
		__wt_free(session, old);
	/*
	 * Discard this LSM tree structure. The first operation on the renamed
	 * tree will create a new one.
	 */
	WT_TRET(__lsm_tree_discard(session, lsm_tree));
	return (ret);
}
/*
 * __lsm_free_chunks --
 *	Try to drop chunks from the tree that are no longer required.
 */
static int
__lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_WORKER_COOKIE cookie;
	u_int i, skipped;
	int progress;

	/*
	 * Take a copy of the current state of the LSM tree and look for chunks
	 * to drop.  We do it this way to avoid holding the LSM tree lock while
	 * doing I/O or waiting on the schema lock.
	 *
	 * This is safe because only one thread will be in this function at a
	 * time (the first merge thread).  Merges may complete concurrently,
	 * and the old_chunks array may be extended, but we shuffle down the
	 * pointers each time we free one to keep the non-NULL slots at the
	 * beginning of the array.
	 */
	WT_CLEAR(cookie);
	WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 1));
	for (i = skipped = 0, progress = 0; i < cookie.nchunks; i++) {
		chunk = cookie.chunk_array[i];
		WT_ASSERT(session, chunk != NULL);
		/* Skip the chunk if another worker is using it. */
		if (chunk->refcnt > 1) {
			++skipped;
			continue;
		}

		if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM)) {
			/*
			 * An EBUSY return is acceptable - a cursor may still
			 * be positioned on this old chunk.
			 */
			if ((ret = __lsm_drop_file(
			    session, chunk->bloom_uri)) == EBUSY) {
				WT_VERBOSE_ERR(session, lsm,
				    "LSM worker bloom drop busy: %s.",
				    chunk->bloom_uri);
				++skipped;
				continue;
			} else
				WT_ERR(ret);

			F_CLR_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
		}
		if (chunk->uri != NULL) {
			/*
			 * An EBUSY return is acceptable - a cursor may still
			 * be positioned on this old chunk.
			 */
			if ((ret = __lsm_drop_file(
			    session, chunk->uri)) == EBUSY) {
				WT_VERBOSE_ERR(session, lsm,
				    "LSM worker drop busy: %s.",
				    chunk->uri);
				++skipped;
				continue;
			} else
				WT_ERR(ret);
		}

		progress = 1;

		/* Lock the tree to clear out the old chunk information. */
		WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));

		/*
		 * The chunk we are looking at should be the first one in the
		 * tree that we haven't already skipped over.
		 */
		WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk);
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, lsm_tree->old_chunks[skipped]);

		/* Shuffle down to keep all occupied slots at the beginning. */
		if (--lsm_tree->nold_chunks > skipped) {
			memmove(lsm_tree->old_chunks + skipped,
			    lsm_tree->old_chunks + skipped + 1,
			    (lsm_tree->nold_chunks - skipped) *
			    sizeof(WT_LSM_CHUNK *));
			lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL;
		}
		/*
		 * Clear the chunk in the cookie so we don't attempt to
		 * decrement the reference count.
		 */
		cookie.chunk_array[i] = NULL;

		/*
		 * Update the metadata.  We used to try to optimize by only
		 * updating the metadata once at the end, but the error
		 * handling is not straightforward.
		 */
		WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
		WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
	}

err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);

	/* Returning non-zero means there is no work to do. */
	if (!progress)
		WT_TRET(WT_NOTFOUND);

	return (ret);
}
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet been merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM buf, key;
	WT_SESSION *wt_session;
	uint64_t insert_count;
	int exist;

	/*
	 * Normally, the Bloom URI is populated when the chunk struct is
	 * allocated.  After an open, however, it may not have been.
	 * Deal with that here.
	 */
	if (chunk->bloom_uri == NULL) {
		WT_CLEAR(buf);
		WT_RET(__wt_lsm_tree_bloom_name(
		    session, lsm_tree, chunk->id, &buf));
		chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL);
	}

	/*
	 * Drop the bloom filter first - there may be some content hanging over
	 * from an aborted merge or checkpoint.
	 */
	wt_session = &session->iface;
	WT_RET(__wt_exist(session, chunk->bloom_uri + strlen("file:"), &exist));
	if (exist)
		WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force"));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;
	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	F_SET(session, WT_SESSION_NO_CACHE);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	WT_VERBOSE_ERR(session, lsm,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count);

	/* Ensure the bloom filter is in the metadata. */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
	F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE);
	return (ret);
}
/*
 * __wt_lsm_checkpoint_worker --
 *	A worker thread for an LSM tree, responsible for flushing new chunks to
 *	disk.
 */
void *
__wt_lsm_checkpoint_worker(void *arg)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_LSM_WORKER_COOKIE cookie;
	WT_SESSION_IMPL *session;
	WT_TXN_ISOLATION saved_isolation;
	u_int i, j;
	int locked;

	lsm_tree = arg;
	session = lsm_tree->ckpt_session;

	WT_CLEAR(cookie);

	while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
		if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			WT_WITH_SCHEMA_LOCK(session, ret =
			    __wt_lsm_tree_switch(session, lsm_tree));
			WT_ERR(ret);
		}

		WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

		/* Write checkpoints in all completed files. */
		for (i = 0, j = 0; i < cookie.nchunks - 1; i++) {
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
				goto err;

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			chunk = cookie.chunk_array[i];

			/* Stop if a running transaction needs the chunk. */
			__wt_txn_update_oldest(session);
			if (!__wt_txn_visible_all(session, chunk->txnid_max))
				break;

			/*
			 * If the chunk is already checkpointed, make sure it
			 * is also evicted.  Either way, there is no point
			 * trying to checkpoint it again.
			 */
			if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) {
				if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_EVICTED))
					continue;

				if ((ret = __lsm_discard_handle(
				    session, chunk->uri, NULL)) == 0)
					F_SET_ATOMIC(
					    chunk, WT_LSM_CHUNK_EVICTED);
				else if (ret == EBUSY)
					ret = 0;
				else
					WT_ERR_MSG(session, ret,
					    "discard handle");
				continue;
			}

			WT_VERBOSE_ERR(session, lsm,
			     "LSM worker flushing %u", i);

			/*
			 * Flush the file before checkpointing: this is the
			 * expensive part in terms of I/O: do it without
			 * holding the schema lock.
			 *
			 * Use the special eviction isolation level to avoid
			 * interfering with an application checkpoint: we have
			 * already checked that all of the updates in this
			 * chunk are globally visible.
			 *
			 * !!! We can wait here for checkpoints and fsyncs to
			 * complete, which can be a long time.
			 *
			 * Don't keep waiting for the lock if application
			 * threads are waiting for a switch.  Don't skip
			 * flushing the leaves either: that just means we'll
			 * hold the schema lock for (much) longer, which blocks
			 * the world.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			for (locked = 0;
			    !locked && ret == 0 &&
			    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) {
				if ((ret = __wt_spin_trylock(session,
				    &S2C(session)->checkpoint_lock)) == 0)
					locked = 1;
				else if (ret == EBUSY) {
					__wt_yield();
					ret = 0;
				}
			}
			if (locked) {
				saved_isolation = session->txn.isolation;
				session->txn.isolation = TXN_ISO_EVICTION;
				ret = __wt_bt_cache_op(
				    session, NULL, WT_SYNC_WRITE_LEAVES);
				session->txn.isolation = saved_isolation;
				__wt_spin_unlock(
				    session, &S2C(session)->checkpoint_lock);
			}
			WT_TRET(__wt_session_release_btree(session));
			WT_ERR(ret);

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			WT_VERBOSE_ERR(session, lsm,
			     "LSM worker checkpointing %u", i);

			WT_WITH_SCHEMA_LOCK(session,
			    ret = __wt_schema_worker(session, chunk->uri,
			    __wt_checkpoint, NULL, NULL, 0));

			if (ret != 0) {
				__wt_err(session, ret, "LSM checkpoint");
				break;
			}

			WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
			/*
			 * Clear the "cache resident" flag so the primary can
			 * be evicted and eventually closed.  Only do this once
			 * the checkpoint has succeeded: otherwise, accessing
			 * the leaf page during the checkpoint can trigger
			 * forced eviction.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			__wt_btree_evictable(session, 1);
			WT_ERR(__wt_session_release_btree(session));

			++j;
			WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
			F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK);
			ret = __wt_lsm_meta_write(session, lsm_tree);
			++lsm_tree->dsk_gen;

			/* Update the throttle time. */
			__wt_lsm_tree_throttle(session, lsm_tree);
			WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

			/* Make sure we aren't pinning a transaction ID. */
			__wt_txn_release_snapshot(session);

			if (ret != 0) {
				__wt_err(session, ret,
				    "LSM checkpoint metadata write");
				break;
			}

			WT_VERBOSE_ERR(session, lsm,
			     "LSM worker checkpointed %u", i);
		}
		__lsm_unpin_chunks(session, &cookie);
		if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
		    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
			WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
			    session, lsm_tree->work_cond, 100000));
	}
err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	/*
	 * The thread will only exit with failure if we run out of memory or
	 * there is some other system driven failure. We can't keep going
	 * after such a failure - ensure WiredTiger shuts down.
	 */
	if (ret != 0 && ret != WT_NOTFOUND)
		WT_PANIC_ERR(session, ret,
		    "Shutting down LSM checkpoint utility thread");
	return (NULL);
}