Example #1
0
/*
 * __wt_bloom_open --
 *	Open a Bloom filter object for use by a single session. The filter must
 *	have been created and finalized.
 */
int
__wt_bloom_open(WT_SESSION_IMPL *session,
    const char *uri, uint32_t factor, uint32_t k,
    WT_CURSOR *owner, WT_BLOOM **bloomp)
{
	WT_BLOOM *bloom;
	WT_CURSOR *c;
	WT_DECL_RET;
	uint64_t size;

	WT_RET(__bloom_init(session, uri, NULL, &bloom));
	WT_ERR(__bloom_open_cursor(bloom, owner));
	c = bloom->c;

	/* Find the largest key, to get the size of the filter. */
	WT_ERR(c->prev(c));
	WT_ERR(c->get_key(c, &size));
	WT_ERR(c->reset(c));

	WT_ERR(__bloom_setup(bloom, 0, size, factor, k));

	*bloomp = bloom;
	return (0);

err:	(void)__wt_bloom_close(bloom);
	return (ret);
}
Example #2
0
/*
 * __clsm_close_cursors --
 *	Close any btree cursors that are not needed.
 */
static int
__clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end)
{
	WT_BLOOM *bloom;
	WT_CURSOR *c;
	u_int i;

	if (clsm->cursors == NULL || clsm->nchunks == 0)
		return (0);

	/*
	 * Walk the cursors, closing any we don't need.  Note that the exit
	 * condition here is special, don't use WT_FORALL_CURSORS, and be
	 * careful with unsigned integer wrapping.
	 */
	for (i = start; i < end; i++) {
		if ((c = (clsm)->cursors[i]) != NULL) {
			clsm->cursors[i] = NULL;
			WT_RET(c->close(c));
		}
		if ((bloom = clsm->blooms[i]) != NULL) {
			clsm->blooms[i] = NULL;
			WT_RET(__wt_bloom_close(bloom));
		}
	}

	return (0);
}
Example #3
0
/*
 * __curjoin_close --
 *	WT_CURSOR::close for join cursors.
 */
static int
__curjoin_close(WT_CURSOR *cursor)
{
	WT_CURSOR_JOIN *cjoin;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *entry;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	u_int i;

	cjoin = (WT_CURSOR_JOIN *)cursor;
	JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
err:

	WT_TRET(__wt_schema_release_table(session, &cjoin->table));

	/* This is owned by the table */
	cursor->key_format = NULL;
	if (cjoin->projection != NULL) {
		__wt_free(session, cjoin->projection);
		__wt_free(session, cursor->value_format);
	}

	for (entry = cjoin->entries, i = 0; i < cjoin->entries_next;
		entry++, i++) {
		if (entry->subjoin != NULL) {
			F_CLR(&entry->subjoin->iface, WT_CURSTD_JOINED);
			entry->subjoin->parent = NULL;
		}
		if (entry->main != NULL)
			WT_TRET(entry->main->close(entry->main));
		if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
			WT_TRET(__wt_bloom_close(entry->bloom));
		for (end = &entry->ends[0];
		     end < &entry->ends[entry->ends_next]; end++) {
			F_CLR(end->cursor, WT_CURSTD_JOINED);
			if (F_ISSET(end, WT_CURJOIN_END_OWN_CURSOR))
				WT_TRET(end->cursor->close(end->cursor));
		}
		__wt_free(session, entry->ends);
		__wt_free(session, entry->repack_format);
	}

	if (cjoin->iter != NULL)
		WT_TRET(__curjoin_iter_close_all(cjoin->iter));
	if (cjoin->main != NULL)
		WT_TRET(cjoin->main->close(cjoin->main));

	__wt_free(session, cjoin->entries);
	__wt_cursor_close(cursor);

	API_END_RET(session, ret);
}
Example #4
0
/*
 * __wt_bloom_drop --
 *	Drop a Bloom filter, release any resources.
 */
int
__wt_bloom_drop(WT_BLOOM *bloom, const char *config)
{
	WT_DECL_RET;
	WT_SESSION *wt_session;

	wt_session = (WT_SESSION *)bloom->session;
	if (bloom->c != NULL) {
		ret = bloom->c->close(bloom->c);
		bloom->c = NULL;
	}
	WT_TRET(wt_session->drop(wt_session, bloom->uri, config));
	WT_TRET(__wt_bloom_close(bloom));

	return (ret);
}
Example #5
0
/*
 * __curjoin_close --
 *	WT_CURSOR::close for join cursors.
 */
static int
__curjoin_close(WT_CURSOR *cursor)
{
	WT_CURSOR_JOIN *cjoin;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *entry;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	u_int i;

	cjoin = (WT_CURSOR_JOIN *)cursor;

	CURSOR_API_CALL(cursor, session, close, NULL);

	__wt_schema_release_table(session, cjoin->table);
	/* These are owned by the table */
	cursor->internal_uri = NULL;
	cursor->key_format = NULL;
	if (cjoin->projection != NULL) {
		__wt_free(session, cjoin->projection);
		__wt_free(session, cursor->value_format);
	}

	for (entry = cjoin->entries, i = 0; i < cjoin->entries_next;
		entry++, i++) {
		if (entry->main != NULL)
			WT_TRET(entry->main->close(entry->main));
		if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
			WT_TRET(__wt_bloom_close(entry->bloom));
		for (end = &entry->ends[0];
		     end < &entry->ends[entry->ends_next]; end++) {
			F_CLR(end->cursor, WT_CURSTD_JOINED);
			if (F_ISSET(end, WT_CURJOIN_END_OWN_KEY))
				__wt_free(session, end->key.data);
		}
		__wt_free(session, entry->ends);
	}

	if (cjoin->iter != NULL)
		WT_TRET(__curjoin_entry_iter_close(cjoin->iter));
	__wt_free(session, cjoin->entries);
	WT_TRET(__wt_cursor_close(cursor));

err:	API_END_RET(session, ret);
}
Example #6
0
/*
 * __wt_bloom_create --
 *
 * Creates and configures a WT_BLOOM handle, allocates a bitstring in memory to
 * use while populating the bloom filter.
 *
 * count  - is the expected number of inserted items
 * factor - is the number of bits to use per inserted item
 * k      - is the number of hash values to set or test per item
 */
int
__wt_bloom_create(
    WT_SESSION_IMPL *session, const char *uri, const char *config,
    uint64_t count, uint32_t factor, uint32_t k, WT_BLOOM **bloomp)
{
	WT_BLOOM *bloom;
	WT_DECL_RET;

	WT_RET(__bloom_init(session, uri, config, &bloom));
	WT_ERR(__bloom_setup(bloom, count, 0, factor, k));

	WT_ERR(__bit_alloc(session, bloom->m, &bloom->bitstring));

	*bloomp = bloom;
	return (0);

err:	(void)__wt_bloom_close(bloom);
	return (ret);
}
Example #7
0
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet been merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM key;
	uint64_t insert_count;

	WT_RET(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;
	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	/*
	 * Setup so that we don't hold pages we read into cache, and so
	 * that we don't get stuck if the cache is full. If we allow
	 * ourselves to get stuck creating bloom filters, the entire tree
	 * can stall since there may be no worker threads available to flush.
	 */
	F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count));

	/* Ensure the bloom filter is in the metadata. */
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
Example #8
0
void
run(void)
{
	WT_BLOOM *bloomp;
	WT_ITEM item;
	WT_SESSION_IMPL *sess;
	uint32_t fp, i;
	int ret;
	const char *uri = "file:my_bloom.bf";

	/* Use the internal session handle to access private APIs. */
	sess = (WT_SESSION_IMPL *)g.wt_session;

	testutil_check(__wt_bloom_create(
	    sess, uri, NULL, g.c_ops, g.c_factor, g.c_k, &bloomp));

	item.size = g.c_key_max;
	for (i = 0; i < g.c_ops; i++) {
		item.data = g.entries[i];
		if ((ret = __wt_bloom_insert(bloomp, &item)) != 0)
			testutil_die(ret, "__wt_bloom_insert: %" PRIu32, i);
	}

	testutil_check(__wt_bloom_finalize(bloomp));

	for (i = 0; i < g.c_ops; i++) {
		item.data = g.entries[i];
		if ((ret = __wt_bloom_get(bloomp, &item)) != 0) {
			fprintf(stderr,
			    "get failed at record: %" PRIu32 "\n", i);
			testutil_die(ret, "__wt_bloom_get");
		}
	}
	testutil_check(__wt_bloom_close(bloomp));

	testutil_check(g.wt_session->checkpoint(g.wt_session, NULL));
	testutil_check(__wt_bloom_open(
	    sess, uri, g.c_factor, g.c_k, NULL, &bloomp));

	for (i = 0; i < g.c_ops; i++) {
		item.data = g.entries[i];
		testutil_check(__wt_bloom_get(bloomp, &item));
	}

	/*
	 * Try out some values we didn't insert - choose a different size to
	 * ensure the value doesn't overlap with existing values.
	 */
	item.size = g.c_key_max + 10;
	item.data = dcalloc(item.size, 1);
	memset((void *)item.data, 'a', item.size);
	for (i = 0, fp = 0; i < g.c_ops; i++) {
		((uint8_t *)item.data)[i % item.size] =
		    'a' + ((uint8_t)rand() % 26);
		if ((ret = __wt_bloom_get(bloomp, &item)) == 0)
			++fp;
		if (ret != 0 && ret != WT_NOTFOUND)
			testutil_die(ret, "__wt_bloom_get");
	}
	free((void *)item.data);
	printf(
	    "Out of %" PRIu32 " ops, got %" PRIu32 " false positives, %.4f%%\n",
	    g.c_ops, fp, 100.0 * fp/g.c_ops);
	testutil_check(__wt_bloom_drop(bloomp, NULL));
}
Example #9
0
/*
 * __wt_lsm_merge --
 *	Merge a set of chunks of an LSM tree.
 */
int
__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
{
	WT_BLOOM *bloom;
	WT_CURSOR *dest, *src;
	WT_DECL_RET;
	WT_ITEM key, value;
	WT_LSM_CHUNK *chunk;
	uint32_t generation;
	uint64_t insert_count, record_count;
	u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb;
	int tret;
	bool created_chunk, create_bloom, locked, in_sync;
	const char *cfg[3];
	const char *drop_cfg[] =
	    { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };

	bloom = NULL;
	chunk = NULL;
	dest = src = NULL;
	start_id = 0;
	created_chunk = create_bloom = locked = in_sync = false;

	/* Fast path if it's obvious no merges could be done. */
	if (lsm_tree->nchunks < lsm_tree->merge_min &&
	    lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD)
		return (WT_NOTFOUND);

	/*
	 * Use the lsm_tree lock to read the chunks (so no switches occur), but
	 * avoid holding it while the merge is in progress: that may take a
	 * long time.
	 */
	WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = true;

	WT_ERR(__lsm_merge_span(session,
	    lsm_tree, id, &start_chunk, &end_chunk, &record_count));
	nchunks = (end_chunk + 1) - start_chunk;

	WT_ASSERT(session, nchunks > 0);
	start_id = lsm_tree->chunk[start_chunk]->id;

	/* Find the merge generation. */
	for (generation = 0, i = 0; i < nchunks; i++)
		generation = WT_MAX(generation,
		    lsm_tree->chunk[start_chunk + i]->generation + 1);

	WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
	locked = false;

	/* Allocate an ID for the merge. */
	dest_id = __wt_atomic_add32(&lsm_tree->last, 1);

	/*
	 * We only want to do the chunk loop if we're running with verbose,
	 * so we wrap these statements in the conditional.  Avoid the loop
	 * in the normal path.
	 */
	if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
		    ", generation %" PRIu32,
		    lsm_tree->name,
		    start_chunk, end_chunk, dest_id, record_count, generation));
		for (verb = start_chunk; verb <= end_chunk; verb++)
			WT_ERR(__wt_verbose(session, WT_VERB_LSM,
			    "Merging %s: Chunk[%u] id %u, gen: %" PRIu32
			    ", size: %" PRIu64 ", records: %" PRIu64,
			    lsm_tree->name, verb, lsm_tree->chunk[verb]->id,
			    lsm_tree->chunk[verb]->generation,
			    lsm_tree->chunk[verb]->size,
			    lsm_tree->chunk[verb]->count));
	}

	WT_ERR(__wt_calloc_one(session, &chunk));
	created_chunk = true;
	chunk->id = dest_id;

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
	    (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
	    start_chunk > 0) && record_count > 0)
		create_bloom = true;

	/*
	 * Special setup for the merge cursor:
	 * first, reset to open the dependent cursors;
	 * then restrict the cursor to a specific number of chunks;
	 * then set MERGE so the cursor doesn't track updates to the tree.
	 */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
	WT_ERR(ret);
	if (create_bloom) {
		WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

		WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
		    lsm_tree->bloom_config,
		    record_count, lsm_tree->bloom_bit_count,
		    lsm_tree->bloom_hash_count, &bloom));
	}

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
	cfg[1] = "bulk,raw,skip_sort_check";
	cfg[2] = NULL;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));

#define	LSM_MERGE_CHECK_INTERVAL	WT_THOUSAND
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
				WT_ERR(EINTR);

			WT_STAT_FAST_CONN_INCRV(session,
			    lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
			++lsm_tree->merge_progressing;
		}

		WT_ERR(src->get_key(src, &key));
		dest->set_key(dest, &key);
		WT_ERR(src->get_value(src, &value));
		dest->set_value(dest, &value);
		WT_ERR(dest->insert(dest));
		if (create_bloom)
			WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);

	WT_STAT_FAST_CONN_INCRV(session,
	    lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
	++lsm_tree->merge_progressing;
	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
	    record_count, insert_count));

	/*
	 * Closing and syncing the files can take a while.  Set the
	 * merge_syncing field so that compact knows it is still in
	 * progress.
	 */
	(void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1);
	in_sync = true;
	/*
	 * We've successfully created the new chunk.  Now install it.  We need
	 * to ensure that the NO_CACHE flag is cleared and the bloom filter
	 * is closed (even if a step fails), so track errors but don't return
	 * until we've cleaned up.
	 */
	WT_TRET(src->close(src));
	WT_TRET(dest->close(dest));
	src = dest = NULL;

	F_CLR(session, WT_SESSION_NO_CACHE);

	/*
	 * We're doing advisory reads to fault the new trees into cache.
	 * Don't block if the cache is full: our next unit of work may be to
	 * discard some trees to free space.
	 */
	F_SET(session, WT_SESSION_NO_EVICTION);

	if (create_bloom) {
		if (ret == 0)
			WT_TRET(__wt_bloom_finalize(bloom));

		/*
		 * Read in a key to make sure the Bloom filters btree handle is
		 * open before it becomes visible to application threads.
		 * Otherwise application threads will stall while it is opened
		 * and internal pages are read into cache.
		 */
		if (ret == 0) {
			WT_CLEAR(key);
			WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
		}

		WT_TRET(__wt_bloom_close(bloom));
		bloom = NULL;
	}
	WT_ERR(ret);

	/*
	 * Open a handle on the new chunk before application threads attempt
	 * to access it, opening it pre-loads internal pages into the file
	 * system cache.
	 */
	cfg[1] = "checkpoint=" WT_CHECKPOINT;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
	WT_TRET(dest->close(dest));
	dest = NULL;
	++lsm_tree->merge_progressing;
	(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	in_sync = false;
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = true;

	/*
	 * Check whether we raced with another merge, and adjust the chunk
	 * array offset as necessary.
	 */
	if (start_chunk >= lsm_tree->nchunks ||
	    lsm_tree->chunk[start_chunk]->id != start_id)
		for (start_chunk = 0;
		    start_chunk < lsm_tree->nchunks;
		    start_chunk++)
			if (lsm_tree->chunk[start_chunk]->id == start_id)
				break;

	/*
	 * It is safe to error out here - since the update can only fail
	 * prior to making updates to the tree.
	 */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, start_chunk, nchunks, chunk));

	if (create_bloom)
		F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	chunk->count = insert_count;
	chunk->generation = generation;
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);

	/*
	 * We have no current way of continuing if the metadata update fails,
	 * so we will panic in that case.  Put some effort into cleaning up
	 * after ourselves here - so things have a chance of shutting down.
	 *
	 * Any errors that happened after the tree was locked are
	 * fatal - we can't guarantee the state of the tree.
	 */
	if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
		WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");

	lsm_tree->dsk_gen++;

	/* Update the throttling while holding the tree lock. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);

	/* Schedule a pass to discard old chunks */
	WT_ERR(__wt_lsm_manager_push_entry(
	    session, WT_LSM_WORK_DROP, 0, lsm_tree));

err:	if (locked)
		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	if (in_sync)
		(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	if (src != NULL)
		WT_TRET(src->close(src));
	if (dest != NULL)
		WT_TRET(dest->close(dest));
	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	if (ret != 0 && created_chunk) {
		/* Drop the newly-created files on error. */
		if (chunk->uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session, tret =
			    __wt_schema_drop(session, chunk->uri, drop_cfg));
			WT_TRET(tret);
		}
		if (create_bloom && chunk->bloom_uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session,
			    tret = __wt_schema_drop(
			    session, chunk->bloom_uri, drop_cfg));
			WT_TRET(tret);
		}
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, chunk);

		if (ret == EINTR)
			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
			    "Merge aborted due to close"));
		else
			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
			    "Merge failed with %s",
			   __wt_strerror(session, ret, NULL, 0)));
	}
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
Example #10
0
/*
 * __curjoin_init_next --
 *	Initialize the cursor join when the next function is first called.
 */
static int
__curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    bool iterable)
{
	WT_BLOOM *bloom;
	WT_CURSOR *origcur;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
	WT_DECL_RET;
	size_t size;
	uint32_t f, k;
	char *mainbuf;
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), NULL };
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	const char **config, *proj, *urimain;

	mainbuf = NULL;
	if (cjoin->entries_next == 0)
		WT_RET_MSG(session, EINVAL,
		    "join cursor has not yet been joined with any other "
		    "cursors");

	/* Get a consistent view of our subordinate cursors if appropriate. */
	__wt_txn_cursor_op(session);

	if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
		config = &raw_cfg[0];
	else
		config = &def_cfg[0];
	urimain = cjoin->table->iface.name;
	if ((proj = cjoin->projection) != NULL) {
		size = strlen(urimain) + strlen(proj) + 1;
		WT_ERR(__wt_calloc(session, size, 1, &mainbuf));
		WT_ERR(__wt_snprintf(mainbuf, size, "%s%s", urimain, proj));
		urimain = mainbuf;
	}
	WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config,
	    &cjoin->main));

	jeend = &cjoin->entries[cjoin->entries_next];
	for (je = cjoin->entries; je < jeend; je++) {
		if (je->subjoin != NULL) {
			WT_ERR(__curjoin_init_next(session, je->subjoin,
			    iterable));
			continue;
		}
		__wt_stat_join_init_single(&je->stats);
		/*
		 * For a single compare=le/lt endpoint in any entry that may
		 * be iterated, construct a companion compare=ge endpoint
		 * that will actually be iterated.
		 */
		if (iterable && je->ends_next == 1 &&
		    F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) {
			origcur = je->ends[0].cursor;
			WT_ERR(__curjoin_insert_endpoint(session, je, 0, &end));
			WT_ERR(__wt_open_cursor(session, origcur->uri,
			    (WT_CURSOR *)cjoin,
			    F_ISSET(origcur, WT_CURSTD_RAW) ? raw_cfg : def_cfg,
			    &end->cursor));
			end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ |
			    WT_CURJOIN_END_OWN_CURSOR;
			WT_ERR(end->cursor->next(end->cursor));
			F_CLR(je, WT_CURJOIN_ENTRY_DISJUNCTION);
		}
		for (end = &je->ends[0]; end < &je->ends[je->ends_next];
		     end++)
			WT_ERR(__curjoin_endpoint_init_key(session, je, end));

		/*
		 * Do any needed Bloom filter initialization.  Ignore Bloom
		 * filters for entries that will be iterated.  They won't
		 * help since these entries either don't need an inclusion
		 * check or are doing any needed check during the iteration.
		 */
		if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
			if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
			       WT_ERR_MSG(session, EINVAL,
				    "join cursors with Bloom filters cannot be "
				    "used with read-uncommitted isolation");
			if (je->bloom == NULL) {
				/*
				 * Look for compatible filters to be shared,
				 * pick compatible numbers for bit counts
				 * and number of hashes.
				 */
				f = je->bloom_bit_count;
				k = je->bloom_hash_count;
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						f = WT_MAX(
						    je2->bloom_bit_count, f);
						k = WT_MAX(
						    je2->bloom_hash_count, k);
					}
				je->bloom_bit_count = f;
				je->bloom_hash_count = k;
				WT_ERR(__wt_bloom_create(session, NULL,
				    NULL, je->count, f, k, &je->bloom));
				F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
				WT_ERR(__curjoin_init_bloom(session, cjoin,
				    je, je->bloom));
				/*
				 * Share the Bloom filter, making all
				 * config info consistent.
				 */
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						WT_ASSERT(session,
						    je2->bloom == NULL);
						je2->bloom = je->bloom;
						je2->bloom_bit_count = f;
						je2->bloom_hash_count = k;
					}
			} else {
				/*
				 * Create a temporary filter that we'll
				 * merge into the shared one.  The Bloom
				 * parameters of the two filters must match.
				 */
				WT_ERR(__wt_bloom_create(session, NULL,
				    NULL, je->count, je->bloom_bit_count,
				    je->bloom_hash_count, &bloom));
				WT_ERR(__curjoin_init_bloom(session, cjoin,
				    je, bloom));
				WT_ERR(__wt_bloom_intersection(je->bloom,
				    bloom));
				WT_ERR(__wt_bloom_close(bloom));
			}
		}
		if (!F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
			iterable = false;
	}
	F_SET(cjoin, WT_CURJOIN_INITIALIZED);

err:	__wt_free(session, mainbuf);
	return (ret);
}
Example #11
0
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet been merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM buf, key;
	WT_SESSION *wt_session;
	uint64_t insert_count;
	int exist;

	/*
	 * Normally, the Bloom URI is populated when the chunk struct is
	 * allocated.  After an open, however, it may not have been.
	 * Deal with that here.
	 */
	if (chunk->bloom_uri == NULL) {
		WT_CLEAR(buf);
		WT_RET(__wt_lsm_tree_bloom_name(
		    session, lsm_tree, chunk->id, &buf));
		chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL);
	}

	/*
	 * Drop the bloom filter first - there may be some content hanging over
	 * from an aborted merge or checkpoint.
	 */
	wt_session = &session->iface;
	WT_RET(__wt_exist(session, chunk->bloom_uri + strlen("file:"), &exist));
	if (exist)
		WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force"));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;
	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	F_SET(session, WT_SESSION_NO_CACHE);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	WT_VERBOSE_ERR(session, lsm,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count);

	/* Ensure the bloom filter is in the metadata. */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
	F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE);
	return (ret);
}
Example #12
0
/*
 * __curjoin_init_iter --
 *	Initialize before any iteration.
 */
static int
__curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin)
{
	WT_BLOOM *bloom;
	WT_DECL_RET;
	WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
	WT_CURSOR_JOIN_ENDPOINT *end;
	uint32_t f, k;

	if (cjoin->entries_next == 0)
		WT_RET_MSG(session, EINVAL,
		    "join cursor has not yet been joined with any other "
		    "cursors");

	je = &cjoin->entries[0];
	WT_RET(__curjoin_entry_iter_init(session, cjoin, je, &cjoin->iter));

	jeend = &cjoin->entries[cjoin->entries_next];
	for (je = cjoin->entries; je < jeend; je++) {
		__wt_stat_join_init_single(&je->stats);
		for (end = &je->ends[0]; end < &je->ends[je->ends_next];
		     end++)
			WT_RET(__curjoin_endpoint_init_key(session, je, end));

		/*
		 * The first entry is iterated as the 'outermost' cursor.
		 * For the common GE case, we don't have to test against
		 * the left reference key, we know it will be true since
		 * the btree is ordered.
		 */
		if (je == cjoin->entries && je->ends[0].flags ==
		    (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ))
			F_SET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT);

		if (F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
			if (je->bloom == NULL) {
				/*
				 * Look for compatible filters to be shared,
				 * pick compatible numbers for bit counts
				 * and number of hashes.
				 */
				f = je->bloom_bit_count;
				k = je->bloom_hash_count;
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						f = WT_MAX(
						    je2->bloom_bit_count, f);
						k = WT_MAX(
						    je2->bloom_hash_count, k);
					}
				je->bloom_bit_count = f;
				je->bloom_hash_count = k;
				WT_RET(__wt_bloom_create(session, NULL,
				    NULL, je->count, f, k, &je->bloom));
				F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
				WT_RET(__curjoin_init_bloom(session, cjoin,
				    je, je->bloom));
				/*
				 * Share the Bloom filter, making all
				 * config info consistent.
				 */
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						WT_ASSERT(session,
						    je2->bloom == NULL);
						je2->bloom = je->bloom;
						je2->bloom_bit_count = f;
						je2->bloom_hash_count = k;
					}
			} else {
				/*
				 * Create a temporary filter that we'll
				 * merge into the shared one.  The Bloom
				 * parameters of the two filters must match.
				 */
				WT_RET(__wt_bloom_create(session, NULL,
				    NULL, je->count, je->bloom_bit_count,
				    je->bloom_hash_count, &bloom));
				WT_RET(__curjoin_init_bloom(session, cjoin,
				    je, bloom));
				WT_RET(__wt_bloom_intersection(je->bloom,
				    bloom));
				WT_RET(__wt_bloom_close(bloom));
			}
		}
	}

	F_SET(cjoin, WT_CURJOIN_INITIALIZED);
	return (ret);
}