Exemple #1
0
/*
 * __wt_curjoin_join --
 *	Add a new join to a join cursor.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
	WT_CURSOR_INDEX *cindex;
	WT_CURSOR_JOIN *child;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *entry;
	size_t len;
	uint8_t endrange;
	u_int i, ins, nonbloom;
	bool hasins, needbloom, nested, range_eq;

	entry = NULL;
	hasins = needbloom = false;
	ins = nonbloom = 0;				/* -Wuninitialized */

	if (cjoin->entries_next == 0) {
		if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION))
			F_SET(cjoin, WT_CURJOIN_DISJUNCTION);
	} else if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
	    !F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
		WT_RET_MSG(session, EINVAL,
		    "operation=or does not match previous operation=and");
	else if (!LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
	    F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
		WT_RET_MSG(session, EINVAL,
		    "operation=and does not match previous operation=or");

	nested = WT_PREFIX_MATCH(ref_cursor->uri, "join:");
	if (!nested)
		for (i = 0; i < cjoin->entries_next; i++) {
			if (cjoin->entries[i].index == idx &&
			    cjoin->entries[i].subjoin == NULL) {
				entry = &cjoin->entries[i];
				break;
			}
			if (!needbloom && i > 0 &&
			    !F_ISSET(&cjoin->entries[i],
			    WT_CURJOIN_ENTRY_BLOOM)) {
				needbloom = true;
				nonbloom = i;
			}
		}
	else {
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM))
			WT_RET_MSG(session, EINVAL,
			    "Bloom filters cannot be used with subjoins");
	}

	if (entry == NULL) {
		WT_RET(__wt_realloc_def(session, &cjoin->entries_allocated,
		    cjoin->entries_next + 1, &cjoin->entries));
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
			/*
			 * Reorder the list so that after the first entry,
			 * the Bloom filtered entries come next, followed by
			 * the non-Bloom entries.  Once the Bloom filters
			 * are built, determining membership via Bloom is
			 * faster than without Bloom, so we can answer
			 * membership questions more quickly, and with less
			 * I/O, with the Bloom entries first.
			 */
			entry = &cjoin->entries[nonbloom];
			memmove(entry + 1, entry,
			    (cjoin->entries_next - nonbloom) *
			    sizeof(WT_CURSOR_JOIN_ENTRY));
			memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
		}
		else
			entry = &cjoin->entries[cjoin->entries_next];
		entry->index = idx;
		entry->flags = flags;
		entry->count = count;
		entry->bloom_bit_count = bloom_bit_count;
		entry->bloom_hash_count = bloom_hash_count;
		++cjoin->entries_next;
	} else {
		/* Merge the join into an existing entry for this index */
		if (count != 0 && entry->count != 0 && entry->count != count)
			WT_RET_MSG(session, EINVAL,
			    "count=%" PRIu64 " does not match "
			    "previous count=%" PRIu64 " for this index",
			    count, entry->count);
		if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
			WT_RET_MSG(session, EINVAL,
			    "join has incompatible strategy "
			    "values for the same index");
		if (LF_MASK(WT_CURJOIN_ENTRY_FALSE_POSITIVES) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_FALSE_POSITIVES))
			WT_RET_MSG(session, EINVAL,
			    "join has incompatible bloom_false_positives "
			    "values for the same index");

		/*
		 * Check against other comparisons (we call them endpoints)
		 * already set up for this index.
		 * We allow either:
		 *   - one or more "eq" (with disjunction)
		 *   - exactly one "eq" (with conjunction)
		 *   - exactly one of "gt" or "ge" (conjunction or disjunction)
		 *   - exactly one of "lt" or "le" (conjunction or disjunction)
		 *   - one of "gt"/"ge" along with one of "lt"/"le"
		 *         (currently restricted to conjunction).
		 *
		 * Some other combinations, although expressible either do
		 * not make sense (X == 3 AND X == 5) or are reducible (X <
		 * 7 AND X < 9).  Other specific cases of (X < 7 OR X > 15)
		 * or (X == 4 OR X > 15) make sense but we don't handle yet.
		 */
		for (i = 0; i < entry->ends_next; i++) {
			end = &entry->ends[i];
			range_eq = (range == WT_CURJOIN_END_EQ);
			endrange = WT_CURJOIN_END_RANGE(end);
			if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
			    ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
			    (F_ISSET(end, WT_CURJOIN_END_LT) &&
			    ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
			    (endrange == WT_CURJOIN_END_EQ &&
			    (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
			    != 0))
				WT_RET_MSG(session, EINVAL,
				    "join has overlapping ranges");
			if (range == WT_CURJOIN_END_EQ &&
			    endrange == WT_CURJOIN_END_EQ &&
			    !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
				WT_RET_MSG(session, EINVAL,
				    "compare=eq can only be combined "
				    "using operation=or");

			/*
			 * Sort "gt"/"ge" to the front, followed by any number
			 * of "eq", and finally "lt"/"le".
			 */
			if (!hasins &&
			    ((range & WT_CURJOIN_END_GT) != 0 ||
			    (range == WT_CURJOIN_END_EQ &&
			    endrange != WT_CURJOIN_END_EQ &&
			    !F_ISSET(end, WT_CURJOIN_END_GT)))) {
				ins = i;
				hasins = true;
			}
		}
		/* All checks completed, merge any new configuration now */
		entry->count = count;
		entry->bloom_bit_count =
		    WT_MAX(entry->bloom_bit_count, bloom_bit_count);
		entry->bloom_hash_count =
		    WT_MAX(entry->bloom_hash_count, bloom_hash_count);
	}
	if (nested) {
		child = (WT_CURSOR_JOIN *)ref_cursor;
		entry->subjoin = child;
		child->parent = cjoin;
	} else {
		WT_RET(__curjoin_insert_endpoint(session, entry,
		    hasins ? ins : entry->ends_next, &end));
		end->cursor = ref_cursor;
		F_SET(end, range);

		if (entry->main == NULL && idx != NULL) {
			/*
			 * Open the main file with a projection of the
			 * indexed columns.
			 */
			WT_RET(__curjoin_open_main(session, cjoin, entry));

			/*
			 * When we are repacking index keys to remove the
			 * primary key, we never want to transform trailing
			 * 'u'.  Use no-op padding to force this.
			 */
			cindex = (WT_CURSOR_INDEX *)ref_cursor;
			len = strlen(cindex->iface.key_format) + 3;
			WT_RET(__wt_calloc(session, len, 1,
			    &entry->repack_format));
			WT_RET(__wt_snprintf(entry->repack_format,
			    len, "%s0x", cindex->iface.key_format));
		}
	}
	return (0);
}
Exemple #2
0
/*
 * __wt_curjoin_join --
 *	Add a new join to a join cursor.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
	WT_CURSOR_JOIN_ENTRY *entry;
	WT_DECL_RET;
	WT_CURSOR_JOIN_ENDPOINT *end, *newend;
	bool hasins, needbloom, range_eq;
	u_int i, ins, nonbloom;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	char *main_uri;
	size_t namesize, newsize;

	entry = NULL;
	hasins = needbloom = false;
	ins = 0; /* -Wuninitialized */
	main_uri = NULL;
	nonbloom = 0; /* -Wuninitialized */
	namesize = strlen(cjoin->table->name);

	for (i = 0; i < cjoin->entries_next; i++) {
		if (cjoin->entries[i].index == idx) {
			entry = &cjoin->entries[i];
			break;
		}
		if (!needbloom && i > 0 &&
		    !F_ISSET(&cjoin->entries[i], WT_CURJOIN_ENTRY_BLOOM)) {
			needbloom = true;
			nonbloom = i;
		}
	}
	if (entry == NULL) {
		WT_ERR(__wt_realloc_def(session, &cjoin->entries_allocated,
		    cjoin->entries_next + 1, &cjoin->entries));
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
			/*
			 * Reorder the list so that after the first entry,
			 * the Bloom filtered entries come next, followed by
			 * the non-Bloom entries.  Once the Bloom filters
			 * are built, determining membership via Bloom is
			 * faster than without Bloom, so we can answer
			 * membership questions more quickly, and with less
			 * I/O, with the Bloom entries first.
			 */
			entry = &cjoin->entries[nonbloom];
			memmove(entry + 1, entry,
			    (cjoin->entries_next - nonbloom) *
			    sizeof(WT_CURSOR_JOIN_ENTRY));
			memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
		}
		else
			entry = &cjoin->entries[cjoin->entries_next];
		entry->index = idx;
		entry->flags = flags;
		entry->count = count;
		entry->bloom_bit_count = bloom_bit_count;
		entry->bloom_hash_count = bloom_hash_count;
		++cjoin->entries_next;
	} else {
		/* Merge the join into an existing entry for this index */
		if (count != 0 && entry->count != 0 && entry->count != count)
			WT_ERR_MSG(session, EINVAL,
			    "count=%" PRIu64 " does not match "
			    "previous count=%" PRIu64 " for this index",
			    count, entry->count);
		if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
			WT_ERR_MSG(session, EINVAL,
			    "join has incompatible strategy "
			    "values for the same index");

		/*
		 * Check against other comparisons (we call them endpoints)
		 * already set up for this index.
		 * We allow either:
		 *   - one or more "eq" (with disjunction)
		 *   - exactly one "eq" (with conjunction)
		 *   - exactly one of "gt" or "ge" (conjunction or disjunction)
		 *   - exactly one of "lt" or "le" (conjunction or disjunction)
		 *   - one of "gt"/"ge" along with one of "lt"/"le"
		 *         (currently restricted to conjunction).
		 *
		 * Some other combinations, although expressible either do
		 * not make sense (X == 3 AND X == 5) or are reducible (X <
		 * 7 AND X < 9).  Other specific cases of (X < 7 OR X > 15)
		 * or (X == 4 OR X > 15) make sense but we don't handle yet.
		 */
		for (i = 0; i < entry->ends_next; i++) {
			end = &entry->ends[i];
			range_eq = (range == WT_CURJOIN_END_EQ);
			if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
			    ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
			    (F_ISSET(end, WT_CURJOIN_END_LT) &&
			    ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
			    (end->flags == WT_CURJOIN_END_EQ &&
			    (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
			    != 0))
				WT_ERR_MSG(session, EINVAL,
				    "join has overlapping ranges");
			if (range == WT_CURJOIN_END_EQ &&
			    end->flags == WT_CURJOIN_END_EQ &&
			    !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
				WT_ERR_MSG(session, EINVAL,
				    "compare=eq can only be combined "
				    "using operation=or");

			/*
			 * Sort "gt"/"ge" to the front, followed by any number
			 * of "eq", and finally "lt"/"le".
			 */
			if (!hasins &&
			    ((range & WT_CURJOIN_END_GT) != 0 ||
			    (range == WT_CURJOIN_END_EQ &&
			    !F_ISSET(end, WT_CURJOIN_END_GT)))) {
				ins = i;
				hasins = true;
			}
		}
		/* All checks completed, merge any new configuration now */
		entry->count = count;
		entry->bloom_bit_count =
		    WT_MAX(entry->bloom_bit_count, bloom_bit_count);
		entry->bloom_hash_count =
		    WT_MAX(entry->bloom_hash_count, bloom_hash_count);
	}
	WT_ERR(__wt_realloc_def(session, &entry->ends_allocated,
	    entry->ends_next + 1, &entry->ends));
	if (!hasins)
		ins = entry->ends_next;
	newend = &entry->ends[ins];
	memmove(newend + 1, newend,
	    (entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT));
	memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
	entry->ends_next++;
	newend->cursor = ref_cursor;
	F_SET(newend, range);

	/* Open the main file with a projection of the indexed columns. */
	if (entry->main == NULL && entry->index != NULL) {
		namesize = strlen(cjoin->table->name);
		newsize = namesize + entry->index->colconf.len + 1;
		WT_ERR(__wt_calloc(session, 1, newsize, &main_uri));
		snprintf(main_uri, newsize, "%s%.*s",
		    cjoin->table->name, (int)entry->index->colconf.len,
		    entry->index->colconf.str);
		WT_ERR(__wt_open_cursor(session, main_uri,
		    (WT_CURSOR *)cjoin, raw_cfg, &entry->main));
	}

err:	if (main_uri != NULL)
		__wt_free(session, main_uri);
	return (ret);
}