示例#1
0
/*
 * __clsm_open_cursors --
 *	Open cursors for the current set of files.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id)
{
	WT_CURSOR *c, **cp, *primary;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;
	const char *checkpoint, *ckpt_cfg[3];
	uint64_t saved_gen;
	u_int i, nchunks, ngood, nupdates;
	int locked;

	c = &clsm->iface;
	session = (WT_SESSION_IMPL *)c->session;
	txn = &session->txn;
	lsm_tree = clsm->lsm_tree;
	chunk = NULL;

	ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
	ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
	ckpt_cfg[2] = NULL;

	/* Copy the key, so we don't lose the cursor position. */
	if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key))
		WT_RET(__wt_buf_set(
		    session, &c->key, c->key.data, c->key.size));

	F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

	if (update) {
		if (txn->isolation == TXN_ISO_SNAPSHOT)
			F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
	} else
		F_SET(clsm, WT_CLSM_OPEN_READ);

	WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
	locked = 1;
	/*
	 * If there is no in-memory chunk in the tree for an update operation,
	 * create one.
	 *
	 * !!!
	 * It is exceeding unlikely that we get here at all, but if there is a
	 * transaction in progress and it rolls back, it would leave the
	 * metadata inconsistent.
	 */
	if (update && (lsm_tree->nchunks == 0 ||
	    (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL ||
	    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))) {
		/* Release our lock because switch will get a write lock. */
		locked = 0;
		WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
		WT_ERR(__wt_lsm_tree_switch(session, lsm_tree));
		WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
		locked = 1;
	}
	F_SET(session, WT_SESSION_NO_CACHE_CHECK);

	/* Merge cursors have already figured out how many chunks they need. */
retry:	if (F_ISSET(clsm, WT_CLSM_MERGE)) {
		nchunks = clsm->nchunks;
		ngood = 0;

		/*
		 * We may have raced with another merge completing.  Check that
		 * we're starting at the right offset in the chunk array.
		 */
		if (start_chunk >= lsm_tree->nchunks ||
		    lsm_tree->chunk[start_chunk]->id != start_id) {
			for (start_chunk = 0;
			    start_chunk < lsm_tree->nchunks;
			    start_chunk++) {
				chunk = lsm_tree->chunk[start_chunk];
				if (chunk->id == start_id)
					break;
			}
			/* We have to find the start chunk: merge locked it. */
			WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
		}

		WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
	} else {
		nchunks = lsm_tree->nchunks;

		/*
		 * If we are only opening the cursor for updates, only open the
		 * primary chunk, plus any other chunks that might be required
		 * to detect snapshot isolation conflicts.
		 */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			WT_ERR(__wt_realloc_def(session,
			    &clsm->txnid_alloc, nchunks,
			    &clsm->txnid_max));
		if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
			ngood = nupdates = 0;
		else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
			/*
			 * Keep going until all updates in the next
			 * chunk are globally visible.  Copy the maximum
			 * transaction IDs into the cursor as we go.
			 */
			for (ngood = nchunks - 1, nupdates = 1;
			    ngood > 0;
			    ngood--, nupdates++) {
				chunk = lsm_tree->chunk[ngood - 1];
				clsm->txnid_max[ngood - 1] =
				    chunk->txnid_max;
				if (__wt_txn_visible_all(
				    session, chunk->txnid_max))
					break;
			}
		} else {
			nupdates = 1;
			ngood = nchunks - 1;
		}

		/* Check how many cursors are already open. */
		for (cp = clsm->cursors + ngood;
		    ngood < clsm->nchunks && ngood < nchunks;
		    cp++, ngood++) {
			chunk = lsm_tree->chunk[ngood];

			/* If the cursor isn't open yet, we're done. */
			if (*cp == NULL)
				break;

			/* Easy case: the URIs don't match. */
			if (strcmp((*cp)->uri, chunk->uri) != 0)
				break;

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			if (checkpoint == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty)
				break;

			/* Make sure the Bloom config matches. */
			if (clsm->blooms[ngood] == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
				break;
		}

		/* Spurious generation bump? */
		if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
			clsm->dsk_gen = lsm_tree->dsk_gen;
			goto err;
		}

		/*
		 * Close any cursors we no longer need.
		 *
		 * Drop the LSM tree lock while we do this: if the cache is
		 * full, we may block while closing a cursor.  Save the
		 * generation number and retry if it has changed under us.
		 */
		if (clsm->cursors != NULL && (ngood < clsm->nchunks ||
		    (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0))) {
			saved_gen = lsm_tree->dsk_gen;
			locked = 0;
			WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
			if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0)
				WT_ERR(__clsm_close_cursors(
				    clsm, 0, nchunks - nupdates));
			WT_ERR(__clsm_close_cursors(
			    clsm, ngood, clsm->nchunks));
			WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
			locked = 1;
			if (lsm_tree->dsk_gen != saved_gen)
				goto retry;
		}

		/* Detach from our old primary. */
		clsm->primary_chunk = NULL;
		clsm->current = NULL;
	}

	WT_ERR(__wt_realloc_def(session,
	    &clsm->bloom_alloc, nchunks, &clsm->blooms));
	WT_ERR(__wt_realloc_def(session,
	    &clsm->cursor_alloc, nchunks, &clsm->cursors));

	clsm->nchunks = nchunks;

	/* Open the cursors for chunks that have changed. */
	for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
		chunk = lsm_tree->chunk[i + start_chunk];
		/* Copy the maximum transaction ID. */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			clsm->txnid_max[i] = chunk->txnid_max;

		/*
		 * Read from the checkpoint if the file has been written.
		 * Once all cursors switch, the in-memory tree can be evicted.
		 */
		WT_ASSERT(session, *cp == NULL);
		ret = __wt_open_cursor(session, chunk->uri, c,
		    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
			ckpt_cfg : NULL, cp);

		/*
		 * XXX kludge: we may have an empty chunk where no checkpoint
		 * was written.  If so, try to open the ordinary handle on that
		 * chunk instead.
		 */
		if (ret == WT_NOTFOUND &&
		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
			ret = __wt_open_cursor(
			    session, chunk->uri, c, NULL, cp);
			if (ret == 0)
				chunk->empty = 1;
		}
		WT_ERR(ret);

		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
		    !F_ISSET(clsm, WT_CLSM_MERGE))
			WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
			    lsm_tree->bloom_bit_count,
			    lsm_tree->bloom_hash_count,
			    c, &clsm->blooms[i]));

		/* Child cursors always use overwrite and raw mode. */
		F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
	}

	/* The last chunk is our new primary. */
	if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		clsm->primary_chunk = chunk;
		primary = clsm->cursors[clsm->nchunks - 1];
		WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)(primary))->btree,
		    __wt_btree_evictable(session, 0));
	}

	clsm->dsk_gen = lsm_tree->dsk_gen;
err:	F_CLR(session, WT_SESSION_NO_CACHE_CHECK);
#ifdef HAVE_DIAGNOSTIC
	/* Check that all cursors are open as expected. */
	if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
		for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) {
			chunk = lsm_tree->chunk[i + start_chunk];

			/* Make sure the cursor is open. */
			WT_ASSERT(session, *cp != NULL);

			/* Easy case: the URIs should match. */
			WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0);

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty) ?
			    checkpoint != NULL : checkpoint == NULL);

			/* Make sure the Bloom config matches. */
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
			    !F_ISSET(clsm, WT_CLSM_MERGE)) ?
			    clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
		}
	}
#endif
	if (locked)
		WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	return (ret);
}
示例#2
0
/*
 * __clsm_open_cursors --
 *	Open cursors for the current set of files.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id)
{
	WT_BTREE *btree;
	WT_CURSOR *c, **cp, *primary;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;
	const char *checkpoint, *ckpt_cfg[3];
	uint64_t saved_gen;
	u_int i, nchunks, ngood, nupdates;
	u_int close_range_end, close_range_start;
	bool locked;

	c = &clsm->iface;
	session = (WT_SESSION_IMPL *)c->session;
	txn = &session->txn;
	chunk = NULL;
	locked = false;
	lsm_tree = clsm->lsm_tree;

	/*
	 * Ensure that any snapshot update has cursors on the right set of
	 * chunks to guarantee visibility is correct.
	 */
	if (update && txn->isolation == WT_ISO_SNAPSHOT)
		F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);

	/*
	 * Query operations need a full set of cursors. Overwrite cursors
	 * do queries in service of updates.
	 */
	if (!update || !F_ISSET(c, WT_CURSTD_OVERWRITE))
		F_SET(clsm, WT_CLSM_OPEN_READ);

	if (lsm_tree->nchunks == 0)
		return (0);

	ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
	ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
	ckpt_cfg[2] = NULL;

	/*
	 * If the key is pointing to memory that is pinned by a chunk
	 * cursor, take a copy before closing cursors.
	 */
	if (F_ISSET(c, WT_CURSTD_KEY_INT))
		WT_CURSOR_NEEDKEY(c);

	F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

	WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
	locked = true;

	/* Merge cursors have already figured out how many chunks they need. */
retry:	if (F_ISSET(clsm, WT_CLSM_MERGE)) {
		nchunks = clsm->nchunks;
		ngood = 0;

		/*
		 * We may have raced with another merge completing.  Check that
		 * we're starting at the right offset in the chunk array.
		 */
		if (start_chunk >= lsm_tree->nchunks ||
		    lsm_tree->chunk[start_chunk]->id != start_id) {
			for (start_chunk = 0;
			    start_chunk < lsm_tree->nchunks;
			    start_chunk++) {
				chunk = lsm_tree->chunk[start_chunk];
				if (chunk->id == start_id)
					break;
			}
			/* We have to find the start chunk: merge locked it. */
			WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
		}

		WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
	} else {
		nchunks = lsm_tree->nchunks;

		/*
		 * If we are only opening the cursor for updates, only open the
		 * primary chunk, plus any other chunks that might be required
		 * to detect snapshot isolation conflicts.
		 */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			WT_ERR(__wt_realloc_def(session,
			    &clsm->txnid_alloc, nchunks,
			    &clsm->switch_txn));
		if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
			ngood = nupdates = 0;
		else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
			/*
			 * Keep going until all updates in the next
			 * chunk are globally visible.  Copy the maximum
			 * transaction IDs into the cursor as we go.
			 */
			for (ngood = nchunks - 1, nupdates = 1;
			    ngood > 0;
			    ngood--, nupdates++) {
				chunk = lsm_tree->chunk[ngood - 1];
				clsm->switch_txn[ngood - 1] = chunk->switch_txn;
				if (__wt_txn_visible_all(
				    session, chunk->switch_txn))
					break;
			}
		} else {
			nupdates = 1;
			ngood = nchunks - 1;
		}

		/* Check how many cursors are already open. */
		for (cp = clsm->cursors + ngood;
		    ngood < clsm->nchunks && ngood < nchunks;
		    cp++, ngood++) {
			chunk = lsm_tree->chunk[ngood];

			/* If the cursor isn't open yet, we're done. */
			if (*cp == NULL)
				break;

			/* Easy case: the URIs don't match. */
			if (strcmp((*cp)->uri, chunk->uri) != 0)
				break;

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			if (checkpoint == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty)
				break;

			/* Make sure the Bloom config matches. */
			if (clsm->blooms[ngood] == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
				break;
		}

		/* Spurious generation bump? */
		if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
			clsm->dsk_gen = lsm_tree->dsk_gen;
			goto err;
		}

		/*
		 * Close any cursors we no longer need.
		 *
		 * Drop the LSM tree lock while we do this: if the cache is
		 * full, we may block while closing a cursor.  Save the
		 * generation number and retry if it has changed under us.
		 */
		if (clsm->cursors != NULL && ngood < clsm->nchunks) {
			close_range_start = ngood;
			close_range_end = clsm->nchunks;
		} else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0 ) {
			close_range_start = 0;
			close_range_end = WT_MIN(nchunks, clsm->nchunks);
			if (close_range_end > nupdates)
				close_range_end -= nupdates;
			else
				close_range_end = 0;
			WT_ASSERT(session, ngood >= close_range_end);
		} else {
			close_range_end = 0;
			close_range_start = 0;
		}
		if (close_range_end > close_range_start) {
			saved_gen = lsm_tree->dsk_gen;
			locked = false;
			WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree));
			WT_ERR(__clsm_close_cursors(
			    clsm, close_range_start, close_range_end));
			WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
			locked = true;
			if (lsm_tree->dsk_gen != saved_gen)
				goto retry;
		}

		/* Detach from our old primary. */
		clsm->primary_chunk = NULL;
		clsm->current = NULL;
	}

	WT_ERR(__wt_realloc_def(session,
	    &clsm->bloom_alloc, nchunks, &clsm->blooms));
	WT_ERR(__wt_realloc_def(session,
	    &clsm->cursor_alloc, nchunks, &clsm->cursors));

	clsm->nchunks = nchunks;

	/* Open the cursors for chunks that have changed. */
	for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
		chunk = lsm_tree->chunk[i + start_chunk];
		/* Copy the maximum transaction ID. */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			clsm->switch_txn[i] = chunk->switch_txn;

		/*
		 * Read from the checkpoint if the file has been written.
		 * Once all cursors switch, the in-memory tree can be evicted.
		 */
		WT_ASSERT(session, *cp == NULL);
		ret = __wt_open_cursor(session, chunk->uri, c,
		    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
			ckpt_cfg : NULL, cp);

		/*
		 * XXX kludge: we may have an empty chunk where no checkpoint
		 * was written.  If so, try to open the ordinary handle on that
		 * chunk instead.
		 */
		if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
			ret = __wt_open_cursor(
			    session, chunk->uri, c, NULL, cp);
			if (ret == 0)
				chunk->empty = 1;
		}
		WT_ERR(ret);

		/*
		 * Setup all cursors other than the primary to only do conflict
		 * checks on insert operations. This allows us to execute
		 * inserts on non-primary chunks as a way of checking for
		 * write conflicts with concurrent updates.
		 */
		if (i != nchunks - 1)
			(*cp)->insert = __wt_curfile_update_check;

		if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
		    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
			    lsm_tree->bloom_bit_count,
			    lsm_tree->bloom_hash_count,
			    c, &clsm->blooms[i]));

		/* Child cursors always use overwrite and raw mode. */
		F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
	}

	/* The last chunk is our new primary. */
	if (chunk != NULL &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    chunk->switch_txn == WT_TXN_NONE) {
		clsm->primary_chunk = chunk;
		primary = clsm->cursors[clsm->nchunks - 1];
		/*
		 * Disable eviction for the in-memory chunk.  Also clear the
		 * bulk load flag here, otherwise eviction will be enabled by
		 * the first update.
		 */
		btree = ((WT_CURSOR_BTREE *)(primary))->btree;
		if (btree->bulk_load_ok) {
			btree->bulk_load_ok = false;
			WT_WITH_BTREE(session, btree,
			    __wt_btree_evictable(session, false));
		}
	}

	clsm->dsk_gen = lsm_tree->dsk_gen;

err:	
#ifdef HAVE_DIAGNOSTIC
	/* Check that all cursors are open as expected. */
	if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
		for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) {
			chunk = lsm_tree->chunk[i + start_chunk];

			/* Make sure the cursor is open. */
			WT_ASSERT(session, *cp != NULL);

			/* Easy case: the URIs should match. */
			WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0);

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty) ?
			    checkpoint != NULL : checkpoint == NULL);

			/* Make sure the Bloom config matches. */
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
			    !F_ISSET(clsm, WT_CLSM_MERGE)) ?
			    clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
		}
	}
#endif
	if (locked)
		WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
	return (ret);
}