Ejemplo n.º 1
0
/*
 * __curds_txn_enter --
 *	Transactional setup run when an operation starts: bump the session's
 *	cursor count, then invoke the per-operation transaction hook.
 *
 *	Returns whatever __wt_txn_cursor_op returns (0 on success).
 */
static int
__curds_txn_enter(WT_SESSION_IMPL *session)
{
	++session->ncursors;				/* XXX */

	return (__wt_txn_cursor_op(session));
}
Ejemplo n.º 2
0
/*
 * __curds_txn_enter --
 *	Transactional setup run when an operation starts.
 *
 *	session: the session the operation runs in.
 *	update: true when the operation is an update; only then is the
 *	    autocommit check performed.
 *
 *	Returns 0 on success, or the error from the autocommit check (in which
 *	case the session's cursor count is left untouched).
 */
static int
__curds_txn_enter(WT_SESSION_IMPL *session, bool update)
{
	/* An update may first need an autocommit transaction started. */
	if (update)
		WT_RET(__wt_txn_autocommit_check(session));

	++session->ncursors;				/* XXX */
	__wt_txn_cursor_op(session);

	return (0);
}
Ejemplo n.º 3
0
/*
 * __wt_btcur_search_near --
 *	Search for a record in the tree, settling for a neighbor when there is
 *	no valid record at the searched-for position.
 *
 *	cbt: the btree cursor; the search key/recno is taken from it.
 *	exactp: optional output (may be NULL).  It is only written when the
 *	    function returns 0 or WT_NOTFOUND, and receives:
 *	      cbt->compare	a valid record was found by the search itself,
 *	      0			a fixed-length column-store implicit record,
 *	      1			the cursor was moved to the next record,
 *	      -1		the cursor was moved to the previous record.
 *
 *	Returns 0 on success, WT_NOTFOUND if no record could be returned, or
 *	an error from one of the helpers.  On any non-zero return the cursor
 *	is reset.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	int exact;
	bool valid;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	upd = NULL;					/* -Wuninitialized */
	exact = 0;

	WT_STAT_CONN_INCR(session, cursor_search_near);
	WT_STAT_DATA_INCR(session, cursor_search_near);

	/*
	 * If we have a row-store page pinned, search it; if we don't have a
	 * page pinned, or the search of the pinned page doesn't find an exact
	 * match, search from the root. Unlike WT_CURSOR.search, ignore pinned
	 * pages in the case of column-store, search-near isn't an interesting
	 * enough case for column-store to add the complexity needed to avoid
	 * the tree search.
	 *
	 * Set the "insert" flag for the btree row-store search; we may intend
	 * to position the cursor at the end of the tree, rather than match an
	 * existing record.
	 */
	valid = false;
	if (btree->type == BTREE_ROW &&
	    F_ISSET(cbt, WT_CBT_ACTIVE) &&
	    cbt->ref->page->read_gen != WT_READGEN_OLDEST) {
		__wt_txn_cursor_op(session);

		/* Search only the page the cursor already references. */
		WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true));

		/*
		 * Search-near is trickier than search when searching an already
		 * pinned page. If search returns the first or last page slots,
		 * discard the results and search the full tree as the neighbor
		 * pages might offer better matches. This test is simplistic as
		 * we're ignoring append lists (there may be no page slots or we
		 * might be legitimately positioned after the last page slot).
		 * Ignore those cases, it makes things too complicated.
		 */
		if (cbt->slot != 0 &&
		    cbt->slot != cbt->ref->page->pg_row_entries - 1)
			valid = __cursor_valid(cbt, &upd);
	}
	/*
	 * No usable result from the pinned page (or it wasn't tried at all):
	 * re-initialize the cursor and search without a starting page.
	 */
	if (!valid) {
		WT_ERR(__cursor_func_init(cbt, true));
		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, NULL, true) :
		    __cursor_col_search(session, cbt, NULL));
		valid = __cursor_valid(cbt, &upd);
	}

	/*
	 * If we find a valid key, return it.
	 *
	 * Else, creating a record past the end of the tree in a fixed-length
	 * column-store implicitly fills the gap with empty records.  In this
	 * case, we instantiate the empty record, it's an exact match.
	 *
	 * Else, move to the next key in the tree (bias for prefix searches).
	 * Cursor next skips invalid rows, so we don't have to test for them
	 * again.
	 *
	 * Else, redo the search and move to the previous key in the tree.
	 * Cursor previous skips invalid rows, so we don't have to test for
	 * them again.
	 *
	 * If that fails, quit, there's no record to return.
	 */
	if (valid) {
		exact = cbt->compare;
		ret = __wt_kv_return(session, cbt, upd);
	} else if (__cursor_fix_implicit(btree, cbt)) {
		/* The implicit record's value is the single zero byte cbt->v. */
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
		exact = 0;
	} else if ((ret = __wt_btcur_next(cbt, false)) != WT_NOTFOUND)
		/*
		 * Taken for success and for errors other than WT_NOTFOUND; in
		 * the error case "exact" is never copied out (see below).
		 */
		exact = 1;
	else {
		/* Nothing after the search position: search again, then go back. */
		WT_ERR(__cursor_func_init(cbt, true));
		WT_ERR(btree->type == BTREE_ROW ?
		    __cursor_row_search(session, cbt, NULL, true) :
		    __cursor_col_search(session, cbt, NULL));
		if (__cursor_valid(cbt, &upd)) {
			exact = cbt->compare;
			ret = __wt_kv_return(session, cbt, upd);
		} else if ((ret = __wt_btcur_prev(cbt, false)) != WT_NOTFOUND)
			exact = -1;
	}

#ifdef HAVE_DIAGNOSTIC
	/* Diagnostic builds only: set up key-order checking on success. */
	if (ret == 0)
		WT_ERR(__wt_cursor_key_order_init(session, cbt));
#endif

	/* Any failure, WT_NOTFOUND included, resets the cursor. */
err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	/* Only report "exact" for success or a clean not-found. */
	if (exactp != NULL && (ret == 0 || ret == WT_NOTFOUND))
		*exactp = exact;
	return (ret);
}
Ejemplo n.º 4
0
/*
 * __wt_btcur_search --
 *	Position the cursor on the record exactly matching its key.
 *
 *	Returns 0 when a valid matching record (or a fixed-length column-store
 *	implicit record) is found, WT_NOTFOUND when there is no match, or an
 *	error from one of the helpers.  On any non-zero return the cursor is
 *	reset.
 */
int
__wt_btcur_search(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	bool valid;

	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	btree = cbt->btree;
	upd = NULL;					/* -Wuninitialized */
	valid = false;

	WT_STAT_CONN_INCR(session, cursor_search);
	WT_STAT_DATA_INCR(session, cursor_search);

	/*
	 * First choice: the page the cursor already references, as long as
	 * the cursor is active and that page isn't marked WT_READGEN_OLDEST.
	 */
	if (F_ISSET(cbt, WT_CBT_ACTIVE) &&
	    cbt->ref->page->read_gen != WT_READGEN_OLDEST) {
		__wt_txn_cursor_op(session);

		if (btree->type == BTREE_ROW) {
			WT_ERR(__cursor_row_search(
			    session, cbt, cbt->ref, false));
		} else {
			WT_ERR(__cursor_col_search(session, cbt, cbt->ref));
		}

		/* Only an exact comparison counts as a hit. */
		if (cbt->compare == 0)
			valid = __cursor_valid(cbt, &upd);
	}

	/*
	 * Second choice: no page to start from, or the search of that page
	 * missed, so re-initialize the cursor and search the whole tree.
	 */
	if (!valid) {
		WT_ERR(__cursor_func_init(cbt, true));

		if (btree->type == BTREE_ROW) {
			WT_ERR(__cursor_row_search(session, cbt, NULL, false));
		} else {
			WT_ERR(__cursor_col_search(session, cbt, NULL));
		}

		if (cbt->compare == 0)
			valid = __cursor_valid(cbt, &upd);
	}

	if (valid)
		ret = __wt_kv_return(session, cbt, upd);
	else if (!__cursor_fix_implicit(btree, cbt))
		ret = WT_NOTFOUND;
	else {
		/*
		 * A record created past the end of a fixed-length column-store
		 * implicitly fills the gap with empty records: hand back a
		 * single zero byte as the value.
		 */
		cbt->recno = cursor->recno;
		cbt->v = 0;
		cursor->value.data = &cbt->v;
		cursor->value.size = 1;
	}

#ifdef HAVE_DIAGNOSTIC
	if (ret == 0)
		WT_ERR(__wt_cursor_key_order_init(session, cbt));
#endif

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
Ejemplo n.º 5
0
/*
 * __curjoin_init_next --
 *	Initialize the cursor join when the next function is first called.
 *
 *	session: the session owning the join cursor.
 *	cjoin: the join cursor to initialize; must already have at least one
 *	    entry, otherwise EINVAL is returned.
 *	iterable: whether the entries being initialized may be iterated.  It
 *	    is passed unchanged to nested joins and is cleared after the first
 *	    non-nested entry unless the join is a disjunction.
 *
 *	Returns 0 on success (and marks the join WT_CURJOIN_INITIALIZED), or
 *	an error.  The scratch URI buffer and any temporary Bloom filter still
 *	in flight are released on every exit path.
 */
static int
__curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    bool iterable)
{
	WT_BLOOM *bloom, *closing;
	WT_CURSOR *origcur;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
	WT_DECL_RET;
	size_t size;
	uint32_t f, k;
	char *mainbuf;
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), NULL };
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	const char **config, *proj, *urimain;

	/*
	 * "bloom" is non-NULL only while this function owns a temporary
	 * filter, so the error path knows whether there is one to discard.
	 */
	bloom = NULL;
	mainbuf = NULL;
	if (cjoin->entries_next == 0)
		WT_RET_MSG(session, EINVAL,
		    "join cursor has not yet been joined with any other "
		    "cursors");

	/* Get a consistent view of our subordinate cursors if appropriate. */
	__wt_txn_cursor_op(session);

	if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
		config = &raw_cfg[0];
	else
		config = &def_cfg[0];

	/*
	 * Open the main cursor on the table; with a projection, the URI is
	 * the table name immediately followed by the projection string.
	 */
	urimain = cjoin->table->iface.name;
	if ((proj = cjoin->projection) != NULL) {
		size = strlen(urimain) + strlen(proj) + 1;
		WT_ERR(__wt_calloc(session, size, 1, &mainbuf));
		WT_ERR(__wt_snprintf(mainbuf, size, "%s%s", urimain, proj));
		urimain = mainbuf;
	}
	WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config,
	    &cjoin->main));

	jeend = &cjoin->entries[cjoin->entries_next];
	for (je = cjoin->entries; je < jeend; je++) {
		/* Nested joins initialize themselves recursively. */
		if (je->subjoin != NULL) {
			WT_ERR(__curjoin_init_next(session, je->subjoin,
			    iterable));
			continue;
		}
		__wt_stat_join_init_single(&je->stats);
		/*
		 * For a single compare=le/lt endpoint in any entry that may
		 * be iterated, construct a companion compare=ge endpoint
		 * that will actually be iterated.
		 */
		if (iterable && je->ends_next == 1 &&
		    F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) {
			origcur = je->ends[0].cursor;
			WT_ERR(__curjoin_insert_endpoint(session, je, 0, &end));
			WT_ERR(__wt_open_cursor(session, origcur->uri,
			    (WT_CURSOR *)cjoin,
			    F_ISSET(origcur, WT_CURSTD_RAW) ? raw_cfg : def_cfg,
			    &end->cursor));
			end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ |
			    WT_CURJOIN_END_OWN_CURSOR;
			WT_ERR(end->cursor->next(end->cursor));
			F_CLR(je, WT_CURJOIN_ENTRY_DISJUNCTION);
		}
		for (end = &je->ends[0]; end < &je->ends[je->ends_next];
		     end++)
			WT_ERR(__curjoin_endpoint_init_key(session, je, end));

		/*
		 * Do any needed Bloom filter initialization.  Ignore Bloom
		 * filters for entries that will be iterated.  They won't
		 * help since these entries either don't need an inclusion
		 * check or are doing any needed check during the iteration.
		 */
		if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
			if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
				WT_ERR_MSG(session, EINVAL,
				    "join cursors with Bloom filters cannot be "
				    "used with read-uncommitted isolation");
			if (je->bloom == NULL) {
				/*
				 * Look for compatible filters to be shared,
				 * pick compatible numbers for bit counts
				 * and number of hashes.
				 */
				f = je->bloom_bit_count;
				k = je->bloom_hash_count;
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						f = WT_MAX(
						    je2->bloom_bit_count, f);
						k = WT_MAX(
						    je2->bloom_hash_count, k);
					}
				je->bloom_bit_count = f;
				je->bloom_hash_count = k;
				WT_ERR(__wt_bloom_create(session, NULL,
				    NULL, je->count, f, k, &je->bloom));
				F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
				WT_ERR(__curjoin_init_bloom(session, cjoin,
				    je, je->bloom));
				/*
				 * Share the Bloom filter, making all
				 * config info consistent.
				 */
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						WT_ASSERT(session,
						    je2->bloom == NULL);
						je2->bloom = je->bloom;
						je2->bloom_bit_count = f;
						je2->bloom_hash_count = k;
					}
			} else {
				/*
				 * Create a temporary filter that we'll
				 * merge into the shared one.  The Bloom
				 * parameters of the two filters must match.
				 *
				 * NOTE(review): assumes __wt_bloom_create
				 * leaves "bloom" untouched when it fails, so
				 * the cleanup below never sees a stale
				 * pointer -- confirm.
				 */
				WT_ERR(__wt_bloom_create(session, NULL,
				    NULL, je->count, je->bloom_bit_count,
				    je->bloom_hash_count, &bloom));
				WT_ERR(__curjoin_init_bloom(session, cjoin,
				    je, bloom));
				WT_ERR(__wt_bloom_intersection(je->bloom,
				    bloom));

				/*
				 * Give up ownership before closing so the
				 * filter can never be closed a second time
				 * on the error path, whatever the close
				 * returns.
				 */
				closing = bloom;
				bloom = NULL;
				WT_ERR(__wt_bloom_close(closing));
			}
		}
		if (!F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
			iterable = false;
	}
	F_SET(cjoin, WT_CURJOIN_INITIALIZED);

err:	/*
	 * A temporary filter is still owned here only if building or merging
	 * it failed; discard it.  "ret" already holds that failure, so the
	 * close result is deliberately not folded in.
	 */
	if (bloom != NULL)
		(void)__wt_bloom_close(bloom);
	__wt_free(session, mainbuf);
	return (ret);
}
Ejemplo n.º 6
0
/*
 * __clsm_enter --
 *	Start an operation on an LSM cursor, update if the tree has changed.
 *
 *	clsm: the LSM cursor.  Merge cursors (WT_CLSM_MERGE) return
 *	    immediately with no work done.
 *	reset: when true, the underlying cursors are reset first; the cursor
 *	    must not hold an internal key or value (asserted).
 *	update: true when the operation is an update; this additionally runs
 *	    the autocommit and transaction-ID checks and computes
 *	    clsm->nupdates.
 *
 *	Returns 0 on success or the first error from a helper.  On success the
 *	cursor is marked WT_CLSM_ACTIVE.
 */
static inline int
__clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
{
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;
	uint64_t *switch_txnp;
	uint64_t snap_min;

	lsm_tree = clsm->lsm_tree;
	session = (WT_SESSION_IMPL *)clsm->iface.session;
	txn = &session->txn;

	/* Merge cursors never update. */
	if (F_ISSET(clsm, WT_CLSM_MERGE))
		return (0);

	if (reset) {
		WT_ASSERT(session, !F_ISSET(&clsm->iface,
		   WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT));
		WT_RET(__clsm_reset_cursors(clsm, NULL));
	}

	for (;;) {
		/*
		 * If the tree's disk generation no longer matches the one the
		 * cursor recorded and the tree has chunks, the cursor is out
		 * of date: go straight to re-opening the chunk cursors.
		 */
		if (clsm->dsk_gen != lsm_tree->dsk_gen &&
		    lsm_tree->nchunks != 0)
			goto open;

		/* Update the maximum transaction ID in the primary chunk. */
		if (update) {
			/*
			 * Ensure that there is a transaction snapshot active.
			 */
			WT_RET(__wt_txn_autocommit_check(session));
			WT_RET(__wt_txn_id_check(session));

			WT_RET(__clsm_enter_update(clsm));

			/* The generation may differ again after that call. */
			if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
				goto open;

			if (txn->isolation == WT_ISO_SNAPSHOT)
				__wt_txn_cursor_op(session);

			/*
			 * Figure out how many updates are required for
			 * snapshot isolation.
			 *
			 * This is not a normal visibility check on the maximum
			 * transaction ID in each chunk: any transaction ID
			 * that overlaps with our snapshot is a potential
			 * conflict.
			 *
			 * The walk runs backward from the second-to-last
			 * switch transaction.  Because nupdates starts at 1,
			 * the loop body (and so the dereference) only runs
			 * when the cursor has at least two chunks.
			 */
			clsm->nupdates = 1;
			if (txn->isolation == WT_ISO_SNAPSHOT &&
			    F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
				WT_ASSERT(session,
				    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
				snap_min = txn->snap_min;
				for (switch_txnp =
				    &clsm->switch_txn[clsm->nchunks - 2];
				    clsm->nupdates < clsm->nchunks;
				    clsm->nupdates++, switch_txnp--) {
					if (WT_TXNID_LT(*switch_txnp, snap_min))
						break;
					WT_ASSERT(session,
					    !__wt_txn_visible_all(
					    session, *switch_txnp));
				}
			}
		}

		/*
		 * Stop when we are up-to-date, as long as this is:
		 *   - a snapshot isolation update and the cursor is set up for
		 *     that;
		 *   - an update operation with a primary chunk, or
		 *   - a read operation and the cursor is open for reading.
		 */
		if ((!update ||
		    txn->isolation != WT_ISO_SNAPSHOT ||
		    F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
		    ((update && clsm->primary_chunk != NULL) ||
		    (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
			break;

		/* (Re)open the chunk cursors under the schema lock and retry. */
open:		WT_WITH_SCHEMA_LOCK(session,
		    ret = __clsm_open_cursors(clsm, update, 0, 0));
		WT_RET(ret);
	}

	/* Enter the cursor once per active period, not once per call. */
	if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
		WT_RET(__cursor_enter(session));
		F_SET(clsm, WT_CLSM_ACTIVE);
	}

	return (0);
}