Пример #1
0
/*
 * __curfile_remove --
 *	WT_CURSOR->remove method for the btree cursor type.
 */
static int
__curfile_remove(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_REMOVE_API_CALL(cursor, session, cbt->btree);
	WT_ERR(__cursor_checkkey(cursor));

	WT_ERR(__wt_btcur_remove(cbt));

	/*
	 * Remove with a search-key is fire-and-forget, no position and no key.
	 * Remove starting from a position maintains the position and a key.
	 * We don't know which it was at this layer, so can only assert the key
	 * is not set at all, or internal. There's never a value.
	 */
	WT_ASSERT(session,
	    F_MASK(cursor, WT_CURSTD_KEY_SET) == 0 ||
	    F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
	WT_ASSERT(session, F_MASK(cursor, WT_CURSTD_VALUE_SET) == 0);

err:	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
Пример #2
0
/*
 * __curfile_modify --
 *	WT_CURSOR->modify method for the btree cursor type.
 */
static int
__curfile_modify(WT_CURSOR *cursor, WT_MODIFY *entries, int nentries)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_UPDATE_API_CALL_BTREE(cursor, session, modify, cbt->btree);
	WT_ERR(__cursor_checkkey(cursor));

	/* Check for a rational modify vector count. */
	if (nentries <= 0)
		WT_ERR_MSG(session, EINVAL,
		    "Illegal modify vector with %d entries", nentries);

	WT_ERR(__wt_btcur_modify(cbt, entries, nentries));

	/*
	 * Modify maintains a position, key and value. Unlike update, it's not
	 * always an internal value.
	 */
	WT_ASSERT(session,
	    F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
	WT_ASSERT(session, F_MASK(cursor, WT_CURSTD_VALUE_SET) != 0);

err:	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
Пример #3
0
/*
 * __curfile_insert --
 *	WT_CURSOR->insert method for the btree cursor type.
 */
static int
__curfile_insert(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_UPDATE_API_CALL_BTREE(cursor, session, insert, cbt->btree);

	if (!F_ISSET(cursor, WT_CURSTD_APPEND))
		WT_ERR(__cursor_checkkey(cursor));
	WT_ERR(__cursor_checkvalue(cursor));

	WT_ERR(__wt_btcur_insert(cbt));

	/*
	 * Insert maintains no position, key or value (except for column-store
	 * appends, where we are returning a key).
	 */
	WT_ASSERT(session,
	    (F_ISSET(cursor, WT_CURSTD_APPEND) &&
	    F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT) ||
	    (!F_ISSET(cursor, WT_CURSTD_APPEND) &&
	    F_MASK(cursor, WT_CURSTD_KEY_SET) == 0));

err:	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
Пример #4
0
/*
 * __curlog_search --
 *	WT_CURSOR.search method for the log cursor type.
 */
static int
__curlog_search(WT_CURSOR *cursor)
{
	WT_CURSOR_LOG *cl;
	WT_DECL_RET;
	WT_LSN key;
	WT_SESSION_IMPL *session;
	uint32_t counter, key_file, key_offset, raw;

	cl = (WT_CURSOR_LOG *)cursor;
	/* Temporarily turn off raw so we can do direct cursor operations. */
	raw = F_MASK(cursor, WT_CURSTD_RAW);
	F_CLR(cursor, WT_CURSTD_RAW);

	CURSOR_API_CALL(cursor, session, search, NULL);

	/*
	 * !!! We are ignoring the counter and only searching based on the LSN.
	 */
	WT_ERR(__wt_cursor_get_key(cursor, &key_file, &key_offset, &counter));
	WT_SET_LSN(&key, key_file, key_offset);
	ret = __wt_log_scan(session, &key, WT_LOGSCAN_ONE,
	    __curlog_logrec, cl);
	if (ret == ENOENT)
		ret = WT_NOTFOUND;
	WT_ERR(ret);
	WT_ERR(__curlog_kv(session, cursor));
	WT_STAT_CONN_INCR(session, cursor_search);
	WT_STAT_DATA_INCR(session, cursor_search);

err:	F_SET(cursor, raw);
	API_END_RET(session, ret);
}
Пример #5
0
/*
 * __curfile_next --
 *	WT_CURSOR->next method for the btree cursor type.
 */
static int
__curfile_next(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_API_CALL(cursor, session, next, cbt->btree);

	WT_ERR(__wt_btcur_next(cbt, false));

	/* Next maintains a position, key and value. */
	WT_ASSERT(session,
	    F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT &&
	    F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT);

err:	API_END_RET(session, ret);
}
Пример #6
0
/*
 * __curfile_reset --
 *	WT_CURSOR->reset method for the btree cursor type.
 */
static int
__curfile_reset(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_API_CALL(cursor, session, reset, cbt->btree);

	ret = __wt_btcur_reset(cbt);

	/* Reset maintains no position, key or value. */
	WT_ASSERT(session,
	    F_MASK(cursor, WT_CURSTD_KEY_SET) == 0 &&
	    F_MASK(cursor, WT_CURSTD_VALUE_SET) == 0);

err:	API_END_RET(session, ret);
}
Пример #7
0
/*
 * __curfile_search_near --
 *	WT_CURSOR->search_near method for the btree cursor type.
 */
static int
__curfile_search_near(WT_CURSOR *cursor, int *exact)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_API_CALL(cursor, session, search_near, cbt->btree);
	WT_ERR(__cursor_checkkey(cursor));

	WT_ERR(__wt_btcur_search_near(cbt, exact));

	/* Search-near maintains a position, key and value. */
	WT_ASSERT(session,
	    F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT &&
	    F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT);

err:	API_END_RET(session, ret);
}
Пример #8
0
/*
 * __curfile_update --
 *	WT_CURSOR->update method for the btree cursor type.
 */
static int
__curfile_update(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_UPDATE_API_CALL_BTREE(cursor, session, update, cbt->btree);
	WT_ERR(__cursor_checkkey(cursor));
	WT_ERR(__cursor_checkvalue(cursor));

	WT_ERR(__wt_btcur_update(cbt));

	/* Update maintains a position, key and value. */
	WT_ASSERT(session,
	    F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT &&
	    F_MASK(cursor, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT);

err:	CURSOR_UPDATE_API_END(session, ret);
	return (ret);
}
Пример #9
0
/*
 * __curfile_reserve --
 *	WT_CURSOR->reserve method for the btree cursor type.
 */
static int
__curfile_reserve(WT_CURSOR *cursor)
{
	WT_CURSOR_BTREE *cbt;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	cbt = (WT_CURSOR_BTREE *)cursor;
	CURSOR_UPDATE_API_CALL_BTREE(cursor, session, reserve, cbt->btree);
	WT_ERR(__cursor_checkkey(cursor));

	WT_ERR(__wt_txn_context_check(session, true));

	WT_ERR(__wt_btcur_reserve(cbt));

	/*
	 * Reserve maintains a position and key, which doesn't match the library
	 * API, where reserve maintains a value. Fix the API by searching after
	 * each successful reserve operation.
	 */
	WT_ASSERT(session,
	    F_MASK(cursor, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
	WT_ASSERT(session, F_MASK(cursor, WT_CURSTD_VALUE_SET) == 0);

err:	CURSOR_UPDATE_API_END(session, ret);

	/*
	 * The application might do a WT_CURSOR.get_value call when we return,
	 * so we need a value and the underlying functions didn't set one up.
	 * For various reasons, those functions may not have done a search and
	 * any previous value in the cursor might race with WT_CURSOR.reserve
	 * (and in cases like LSM, the reserve never encountered the original
	 * key). For simplicity, repeat the search here.
	 */
	return (ret == 0 ? cursor->search(cursor) : ret);
}
Пример #10
0
/*
 * __curlog_kv --
 *	Set the key and value of the log cursor to return to the user.
 */
static int
__curlog_kv(WT_SESSION_IMPL *session, WT_CURSOR *cursor)
{
	WT_CURSOR_LOG *cl;
	WT_DECL_RET;
	uint32_t fileid, key_count, opsize, optype, raw;

	cl = (WT_CURSOR_LOG *)cursor;
	/* Temporarily turn off raw so we can do direct cursor operations. */
	raw = F_MASK(cursor, WT_CURSTD_RAW);
	F_CLR(cursor, WT_CURSTD_RAW);

	/*
	 * If it is a commit and we have stepped over the header, peek to get
	 * the size and optype and read out any key/value from this operation.
	 */
	if ((key_count = cl->step_count++) > 0) {
		WT_ERR(__wt_logop_read(session,
		    &cl->stepp, cl->stepp_end, &optype, &opsize));
		WT_ERR(__curlog_op_read(session, cl, optype, opsize, &fileid));
		/* Position on the beginning of the next record part. */
		cl->stepp += opsize;
	} else {
		optype = WT_LOGOP_INVALID;
		fileid = 0;
		cl->opkey->data = NULL;
		cl->opkey->size = 0;
		/*
		 * Non-commit records we want to return the record without the
		 * header and the adjusted size.  Add one to skip over the type
		 * which is normally consumed by __wt_logrec_read.
		 */
		cl->opvalue->data = WT_LOG_SKIP_HEADER(cl->logrec->data) + 1;
		cl->opvalue->size = WT_LOG_REC_SIZE(cl->logrec->size) - 1;
	}
	/*
	 * The log cursor sets the LSN and step count as the cursor key and
	 * and log record related data in the value.  The data in the value
	 * contains any operation key/value that was in the log record.
	 */
	__wt_cursor_set_key(cursor, cl->cur_lsn->l.file, cl->cur_lsn->l.offset,
	    key_count);
	__wt_cursor_set_value(cursor, cl->txnid, cl->rectype, optype, fileid,
	    cl->opkey, cl->opvalue);

err:	F_SET(cursor, raw);
	return (ret);
}
Пример #11
0
/*
 * __wt_las_cursor --
 *	Return a lookaside cursor.
 */
void
__wt_las_cursor(
    WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags)
{
	WT_CONNECTION_IMPL *conn;

	*cursorp = NULL;

	/*
	 * We don't want to get tapped for eviction after we start using the
	 * lookaside cursor; save a copy of the current eviction state, we'll
	 * turn eviction off before we return.
	 *
	 * Don't cache lookaside table pages, we're here because of eviction
	 * problems and there's no reason to believe lookaside pages will be
	 * useful more than once.
	 */
	*session_flags =
	    F_MASK(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);

	conn = S2C(session);

	/*
	 * Some threads have their own lookaside table cursors, else lock the
	 * shared lookaside cursor.
	 */
	if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR))
		*cursorp = session->las_cursor;
	else {
		__wt_spin_lock(session, &conn->las_lock);
		*cursorp = conn->las_session->las_cursor;
	}

	/* Turn caching and eviction off. */
	F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
}
Пример #12
0
/*
 * __wt_curjoin_join --
 *	Add a new join to a join cursor.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
	WT_CURSOR_INDEX *cindex;
	WT_CURSOR_JOIN *child;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *entry;
	size_t len;
	uint8_t endrange;
	u_int i, ins, nonbloom;
	bool hasins, needbloom, nested, range_eq;

	entry = NULL;
	hasins = needbloom = false;
	ins = nonbloom = 0;				/* -Wuninitialized */

	if (cjoin->entries_next == 0) {
		if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION))
			F_SET(cjoin, WT_CURJOIN_DISJUNCTION);
	} else if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
	    !F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
		WT_RET_MSG(session, EINVAL,
		    "operation=or does not match previous operation=and");
	else if (!LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
	    F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
		WT_RET_MSG(session, EINVAL,
		    "operation=and does not match previous operation=or");

	nested = WT_PREFIX_MATCH(ref_cursor->uri, "join:");
	if (!nested)
		for (i = 0; i < cjoin->entries_next; i++) {
			if (cjoin->entries[i].index == idx &&
			    cjoin->entries[i].subjoin == NULL) {
				entry = &cjoin->entries[i];
				break;
			}
			if (!needbloom && i > 0 &&
			    !F_ISSET(&cjoin->entries[i],
			    WT_CURJOIN_ENTRY_BLOOM)) {
				needbloom = true;
				nonbloom = i;
			}
		}
	else {
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM))
			WT_RET_MSG(session, EINVAL,
			    "Bloom filters cannot be used with subjoins");
	}

	if (entry == NULL) {
		WT_RET(__wt_realloc_def(session, &cjoin->entries_allocated,
		    cjoin->entries_next + 1, &cjoin->entries));
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
			/*
			 * Reorder the list so that after the first entry,
			 * the Bloom filtered entries come next, followed by
			 * the non-Bloom entries.  Once the Bloom filters
			 * are built, determining membership via Bloom is
			 * faster than without Bloom, so we can answer
			 * membership questions more quickly, and with less
			 * I/O, with the Bloom entries first.
			 */
			entry = &cjoin->entries[nonbloom];
			memmove(entry + 1, entry,
			    (cjoin->entries_next - nonbloom) *
			    sizeof(WT_CURSOR_JOIN_ENTRY));
			memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
		}
		else
			entry = &cjoin->entries[cjoin->entries_next];
		entry->index = idx;
		entry->flags = flags;
		entry->count = count;
		entry->bloom_bit_count = bloom_bit_count;
		entry->bloom_hash_count = bloom_hash_count;
		++cjoin->entries_next;
	} else {
		/* Merge the join into an existing entry for this index */
		if (count != 0 && entry->count != 0 && entry->count != count)
			WT_RET_MSG(session, EINVAL,
			    "count=%" PRIu64 " does not match "
			    "previous count=%" PRIu64 " for this index",
			    count, entry->count);
		if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
			WT_RET_MSG(session, EINVAL,
			    "join has incompatible strategy "
			    "values for the same index");
		if (LF_MASK(WT_CURJOIN_ENTRY_FALSE_POSITIVES) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_FALSE_POSITIVES))
			WT_RET_MSG(session, EINVAL,
			    "join has incompatible bloom_false_positives "
			    "values for the same index");

		/*
		 * Check against other comparisons (we call them endpoints)
		 * already set up for this index.
		 * We allow either:
		 *   - one or more "eq" (with disjunction)
		 *   - exactly one "eq" (with conjunction)
		 *   - exactly one of "gt" or "ge" (conjunction or disjunction)
		 *   - exactly one of "lt" or "le" (conjunction or disjunction)
		 *   - one of "gt"/"ge" along with one of "lt"/"le"
		 *         (currently restricted to conjunction).
		 *
		 * Some other combinations, although expressible either do
		 * not make sense (X == 3 AND X == 5) or are reducible (X <
		 * 7 AND X < 9).  Other specific cases of (X < 7 OR X > 15)
		 * or (X == 4 OR X > 15) make sense but we don't handle yet.
		 */
		for (i = 0; i < entry->ends_next; i++) {
			end = &entry->ends[i];
			range_eq = (range == WT_CURJOIN_END_EQ);
			endrange = WT_CURJOIN_END_RANGE(end);
			if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
			    ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
			    (F_ISSET(end, WT_CURJOIN_END_LT) &&
			    ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
			    (endrange == WT_CURJOIN_END_EQ &&
			    (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
			    != 0))
				WT_RET_MSG(session, EINVAL,
				    "join has overlapping ranges");
			if (range == WT_CURJOIN_END_EQ &&
			    endrange == WT_CURJOIN_END_EQ &&
			    !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
				WT_RET_MSG(session, EINVAL,
				    "compare=eq can only be combined "
				    "using operation=or");

			/*
			 * Sort "gt"/"ge" to the front, followed by any number
			 * of "eq", and finally "lt"/"le".
			 */
			if (!hasins &&
			    ((range & WT_CURJOIN_END_GT) != 0 ||
			    (range == WT_CURJOIN_END_EQ &&
			    endrange != WT_CURJOIN_END_EQ &&
			    !F_ISSET(end, WT_CURJOIN_END_GT)))) {
				ins = i;
				hasins = true;
			}
		}
		/* All checks completed, merge any new configuration now */
		entry->count = count;
		entry->bloom_bit_count =
		    WT_MAX(entry->bloom_bit_count, bloom_bit_count);
		entry->bloom_hash_count =
		    WT_MAX(entry->bloom_hash_count, bloom_hash_count);
	}
	if (nested) {
		child = (WT_CURSOR_JOIN *)ref_cursor;
		entry->subjoin = child;
		child->parent = cjoin;
	} else {
		WT_RET(__curjoin_insert_endpoint(session, entry,
		    hasins ? ins : entry->ends_next, &end));
		end->cursor = ref_cursor;
		F_SET(end, range);

		if (entry->main == NULL && idx != NULL) {
			/*
			 * Open the main file with a projection of the
			 * indexed columns.
			 */
			WT_RET(__curjoin_open_main(session, cjoin, entry));

			/*
			 * When we are repacking index keys to remove the
			 * primary key, we never want to transform trailing
			 * 'u'.  Use no-op padding to force this.
			 */
			cindex = (WT_CURSOR_INDEX *)ref_cursor;
			len = strlen(cindex->iface.key_format) + 3;
			WT_RET(__wt_calloc(session, len, 1,
			    &entry->repack_format));
			WT_RET(__wt_snprintf(entry->repack_format,
			    len, "%s0x", cindex->iface.key_format));
		}
	}
	return (0);
}
Пример #13
0
/*
 * __wt_curjoin_join --
 *	Add a new join to a join cursor.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
	WT_CURSOR_JOIN_ENTRY *entry;
	WT_DECL_RET;
	WT_CURSOR_JOIN_ENDPOINT *end, *newend;
	bool hasins, needbloom, range_eq;
	u_int i, ins, nonbloom;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	char *main_uri;
	size_t namesize, newsize;

	entry = NULL;
	hasins = needbloom = false;
	ins = 0; /* -Wuninitialized */
	main_uri = NULL;
	nonbloom = 0; /* -Wuninitialized */
	namesize = strlen(cjoin->table->name);

	for (i = 0; i < cjoin->entries_next; i++) {
		if (cjoin->entries[i].index == idx) {
			entry = &cjoin->entries[i];
			break;
		}
		if (!needbloom && i > 0 &&
		    !F_ISSET(&cjoin->entries[i], WT_CURJOIN_ENTRY_BLOOM)) {
			needbloom = true;
			nonbloom = i;
		}
	}
	if (entry == NULL) {
		WT_ERR(__wt_realloc_def(session, &cjoin->entries_allocated,
		    cjoin->entries_next + 1, &cjoin->entries));
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
			/*
			 * Reorder the list so that after the first entry,
			 * the Bloom filtered entries come next, followed by
			 * the non-Bloom entries.  Once the Bloom filters
			 * are built, determining membership via Bloom is
			 * faster than without Bloom, so we can answer
			 * membership questions more quickly, and with less
			 * I/O, with the Bloom entries first.
			 */
			entry = &cjoin->entries[nonbloom];
			memmove(entry + 1, entry,
			    (cjoin->entries_next - nonbloom) *
			    sizeof(WT_CURSOR_JOIN_ENTRY));
			memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
		}
		else
			entry = &cjoin->entries[cjoin->entries_next];
		entry->index = idx;
		entry->flags = flags;
		entry->count = count;
		entry->bloom_bit_count = bloom_bit_count;
		entry->bloom_hash_count = bloom_hash_count;
		++cjoin->entries_next;
	} else {
		/* Merge the join into an existing entry for this index */
		if (count != 0 && entry->count != 0 && entry->count != count)
			WT_ERR_MSG(session, EINVAL,
			    "count=%" PRIu64 " does not match "
			    "previous count=%" PRIu64 " for this index",
			    count, entry->count);
		if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
			WT_ERR_MSG(session, EINVAL,
			    "join has incompatible strategy "
			    "values for the same index");

		/*
		 * Check against other comparisons (we call them endpoints)
		 * already set up for this index.
		 * We allow either:
		 *   - one or more "eq" (with disjunction)
		 *   - exactly one "eq" (with conjunction)
		 *   - exactly one of "gt" or "ge" (conjunction or disjunction)
		 *   - exactly one of "lt" or "le" (conjunction or disjunction)
		 *   - one of "gt"/"ge" along with one of "lt"/"le"
		 *         (currently restricted to conjunction).
		 *
		 * Some other combinations, although expressible either do
		 * not make sense (X == 3 AND X == 5) or are reducible (X <
		 * 7 AND X < 9).  Other specific cases of (X < 7 OR X > 15)
		 * or (X == 4 OR X > 15) make sense but we don't handle yet.
		 */
		for (i = 0; i < entry->ends_next; i++) {
			end = &entry->ends[i];
			range_eq = (range == WT_CURJOIN_END_EQ);
			if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
			    ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
			    (F_ISSET(end, WT_CURJOIN_END_LT) &&
			    ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
			    (end->flags == WT_CURJOIN_END_EQ &&
			    (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
			    != 0))
				WT_ERR_MSG(session, EINVAL,
				    "join has overlapping ranges");
			if (range == WT_CURJOIN_END_EQ &&
			    end->flags == WT_CURJOIN_END_EQ &&
			    !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
				WT_ERR_MSG(session, EINVAL,
				    "compare=eq can only be combined "
				    "using operation=or");

			/*
			 * Sort "gt"/"ge" to the front, followed by any number
			 * of "eq", and finally "lt"/"le".
			 */
			if (!hasins &&
			    ((range & WT_CURJOIN_END_GT) != 0 ||
			    (range == WT_CURJOIN_END_EQ &&
			    !F_ISSET(end, WT_CURJOIN_END_GT)))) {
				ins = i;
				hasins = true;
			}
		}
		/* All checks completed, merge any new configuration now */
		entry->count = count;
		entry->bloom_bit_count =
		    WT_MAX(entry->bloom_bit_count, bloom_bit_count);
		entry->bloom_hash_count =
		    WT_MAX(entry->bloom_hash_count, bloom_hash_count);
	}
	WT_ERR(__wt_realloc_def(session, &entry->ends_allocated,
	    entry->ends_next + 1, &entry->ends));
	if (!hasins)
		ins = entry->ends_next;
	newend = &entry->ends[ins];
	memmove(newend + 1, newend,
	    (entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT));
	memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
	entry->ends_next++;
	newend->cursor = ref_cursor;
	F_SET(newend, range);

	/* Open the main file with a projection of the indexed columns. */
	if (entry->main == NULL && entry->index != NULL) {
		namesize = strlen(cjoin->table->name);
		newsize = namesize + entry->index->colconf.len + 1;
		WT_ERR(__wt_calloc(session, 1, newsize, &main_uri));
		snprintf(main_uri, newsize, "%s%.*s",
		    cjoin->table->name, (int)entry->index->colconf.len,
		    entry->index->colconf.str);
		WT_ERR(__wt_open_cursor(session, main_uri,
		    (WT_CURSOR *)cjoin, raw_cfg, &entry->main));
	}

err:	if (main_uri != NULL)
		__wt_free(session, main_uri);
	return (ret);
}