/*
 * __wt_bloom_open --
 *	Open a previously created and finalized Bloom filter for use by a
 *	single session.
 */
int
__wt_bloom_open(WT_SESSION_IMPL *session, const char *uri,
    uint32_t factor, uint32_t k, WT_CURSOR *owner, WT_BLOOM **bloomp)
{
	WT_BLOOM *bloom;
	WT_CURSOR *cursor;
	WT_DECL_RET;
	uint64_t nbits;

	WT_RET(__bloom_init(session, uri, NULL, &bloom));
	WT_ERR(__bloom_open_cursor(bloom, owner));
	cursor = bloom->c;

	/*
	 * The filter's size is recoverable from the backing table: position
	 * on the last record and read its key, then put the cursor back.
	 */
	WT_ERR(cursor->prev(cursor));
	WT_ERR(cursor->get_key(cursor, &nbits));
	WT_ERR(cursor->reset(cursor));

	WT_ERR(__bloom_setup(bloom, 0, nbits, factor, k));

	*bloomp = bloom;
	return (0);

err:	(void)__wt_bloom_close(bloom);
	return (ret);
}
/*
 * __clsm_close_cursors --
 *	Close any btree cursors that are not needed.
 */
static int
__clsm_close_cursors(WT_CURSOR_LSM *clsm, u_int start, u_int end)
{
	WT_BLOOM *bloom;
	WT_CURSOR *cursor;
	u_int slot;

	/* Nothing to do without an open cursor array. */
	if (clsm->cursors == NULL || clsm->nchunks == 0)
		return (0);

	/*
	 * Walk the slots in [start, end), releasing both the chunk cursor
	 * and any associated Bloom filter.  Note that the exit condition
	 * here is special, don't use WT_FORALL_CURSORS, and be careful with
	 * unsigned integer wrapping.
	 */
	for (slot = start; slot < end; slot++) {
		cursor = clsm->cursors[slot];
		if (cursor != NULL) {
			clsm->cursors[slot] = NULL;
			WT_RET(cursor->close(cursor));
		}
		bloom = clsm->blooms[slot];
		if (bloom != NULL) {
			clsm->blooms[slot] = NULL;
			WT_RET(__wt_bloom_close(bloom));
		}
	}
	return (0);
}
/*
 * __curjoin_close --
 *	WT_CURSOR::close for join cursors.
 */
static int
__curjoin_close(WT_CURSOR *cursor)
{
	WT_CURSOR_JOIN *cjoin;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *entry;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	u_int i;

	cjoin = (WT_CURSOR_JOIN *)cursor;

	/*
	 * The API macro jumps to err on failure; the err label is placed so
	 * all cleanup below runs in either case.
	 */
	JOINABLE_CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
err:	WT_TRET(__wt_schema_release_table(session, &cjoin->table));

	/* This is owned by the table */
	cursor->key_format = NULL;
	if (cjoin->projection != NULL) {
		__wt_free(session, cjoin->projection);
		/* Only freed when a projection was set — allocated with it. */
		__wt_free(session, cursor->value_format);
	}

	/*
	 * Tear down each join entry: detach subjoins, close entry cursors
	 * and owned Bloom filters, then release endpoint cursors.
	 */
	for (entry = cjoin->entries, i = 0; i < cjoin->entries_next;
	    entry++, i++) {
		if (entry->subjoin != NULL) {
			F_CLR(&entry->subjoin->iface, WT_CURSTD_JOINED);
			entry->subjoin->parent = NULL;
		}
		if (entry->main != NULL)
			WT_TRET(entry->main->close(entry->main));
		if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
			WT_TRET(__wt_bloom_close(entry->bloom));
		for (end = &entry->ends[0];
		    end < &entry->ends[entry->ends_next]; end++) {
			F_CLR(end->cursor, WT_CURSTD_JOINED);
			/* Only close cursors this entry opened itself. */
			if (F_ISSET(end, WT_CURJOIN_END_OWN_CURSOR))
				WT_TRET(end->cursor->close(end->cursor));
		}
		__wt_free(session, entry->ends);
		__wt_free(session, entry->repack_format);
	}

	if (cjoin->iter != NULL)
		WT_TRET(__curjoin_iter_close_all(cjoin->iter));
	if (cjoin->main != NULL)
		WT_TRET(cjoin->main->close(cjoin->main));
	__wt_free(session, cjoin->entries);
	__wt_cursor_close(cursor);

	API_END_RET(session, ret);
}
/*
 * __wt_bloom_drop --
 *	Drop a Bloom filter and release any resources it holds.
 */
int
__wt_bloom_drop(WT_BLOOM *bloom, const char *config)
{
	WT_CURSOR *c;
	WT_DECL_RET;
	WT_SESSION *wt_session;

	wt_session = (WT_SESSION *)bloom->session;

	/* Close the backing cursor before dropping the underlying object. */
	c = bloom->c;
	if (c != NULL) {
		bloom->c = NULL;
		ret = c->close(c);
	}

	/* Remove the on-disk object, then free the in-memory handle. */
	WT_TRET(wt_session->drop(wt_session, bloom->uri, config));
	WT_TRET(__wt_bloom_close(bloom));

	return (ret);
}
/*
 * __curjoin_close --
 *	WT_CURSOR::close for join cursors.
 */
static int
__curjoin_close(WT_CURSOR *cursor)
{
	WT_CURSOR_JOIN *cjoin;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *entry;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	u_int i;

	cjoin = (WT_CURSOR_JOIN *)cursor;

	/*
	 * NOTE(review): CURSOR_API_CALL jumps to err on failure, which skips
	 * all of the cleanup below — confirm that is intended here.
	 */
	CURSOR_API_CALL(cursor, session, close, NULL);

	__wt_schema_release_table(session, cjoin->table);
	/* These are owned by the table */
	cursor->internal_uri = NULL;
	cursor->key_format = NULL;
	if (cjoin->projection != NULL) {
		__wt_free(session, cjoin->projection);
		/* Only freed when a projection was set — allocated with it. */
		__wt_free(session, cursor->value_format);
	}

	/* Close entry cursors and Bloom filters, free owned endpoint keys. */
	for (entry = cjoin->entries, i = 0; i < cjoin->entries_next;
	    entry++, i++) {
		if (entry->main != NULL)
			WT_TRET(entry->main->close(entry->main));
		if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM))
			WT_TRET(__wt_bloom_close(entry->bloom));
		for (end = &entry->ends[0];
		    end < &entry->ends[entry->ends_next]; end++) {
			F_CLR(end->cursor, WT_CURSTD_JOINED);
			if (F_ISSET(end, WT_CURJOIN_END_OWN_KEY))
				__wt_free(session, end->key.data);
		}
		__wt_free(session, entry->ends);
	}

	if (cjoin->iter != NULL)
		WT_TRET(__curjoin_entry_iter_close(cjoin->iter));
	__wt_free(session, cjoin->entries);
	WT_TRET(__wt_cursor_close(cursor));

err:	API_END_RET(session, ret);
}
/*
 * __wt_bloom_create --
 *	Create and configure a WT_BLOOM handle, allocating an in-memory
 *	bitstring used while the filter is being populated.
 *
 *	count is the expected number of inserted items, factor the number of
 *	bits per inserted item, and k the number of hash values set or tested
 *	per item.
 */
int
__wt_bloom_create(
    WT_SESSION_IMPL *session, const char *uri, const char *config,
    uint64_t count, uint32_t factor, uint32_t k, WT_BLOOM **bloomp)
{
	WT_BLOOM *bloom;
	WT_DECL_RET;

	WT_RET(__bloom_init(session, uri, config, &bloom));

	/* Derive the filter geometry from the expected item count. */
	WT_ERR(__bloom_setup(bloom, count, 0, factor, k));

	/* The bitstring stays in memory until the filter is finalized. */
	WT_ERR(__bit_alloc(session, bloom->m, &bloom->bitstring));

	*bloomp = bloom;
	return (0);

err:	(void)__wt_bloom_close(bloom);
	return (ret);
}
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet been merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM key;
	uint64_t insert_count;

	WT_RET(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;
	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	/*
	 * Setup so that we don't hold pages we read into cache, and so
	 * that we don't get stuck if the cache is full. If we allow
	 * ourselves to get stuck creating bloom filters, the entire tree
	 * can stall since there may be no worker threads available to flush.
	 */
	F_SET(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);

	/* Insert every key in the chunk into the new filter. */
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count));

	/* Ensure the bloom filter is in the metadata. */
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

	/* Error path: close the filter handle and restore session flags. */
err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
void run(void) { WT_BLOOM *bloomp; WT_ITEM item; WT_SESSION_IMPL *sess; uint32_t fp, i; int ret; const char *uri = "file:my_bloom.bf"; /* Use the internal session handle to access private APIs. */ sess = (WT_SESSION_IMPL *)g.wt_session; testutil_check(__wt_bloom_create( sess, uri, NULL, g.c_ops, g.c_factor, g.c_k, &bloomp)); item.size = g.c_key_max; for (i = 0; i < g.c_ops; i++) { item.data = g.entries[i]; if ((ret = __wt_bloom_insert(bloomp, &item)) != 0) testutil_die(ret, "__wt_bloom_insert: %" PRIu32, i); } testutil_check(__wt_bloom_finalize(bloomp)); for (i = 0; i < g.c_ops; i++) { item.data = g.entries[i]; if ((ret = __wt_bloom_get(bloomp, &item)) != 0) { fprintf(stderr, "get failed at record: %" PRIu32 "\n", i); testutil_die(ret, "__wt_bloom_get"); } } testutil_check(__wt_bloom_close(bloomp)); testutil_check(g.wt_session->checkpoint(g.wt_session, NULL)); testutil_check(__wt_bloom_open( sess, uri, g.c_factor, g.c_k, NULL, &bloomp)); for (i = 0; i < g.c_ops; i++) { item.data = g.entries[i]; testutil_check(__wt_bloom_get(bloomp, &item)); } /* * Try out some values we didn't insert - choose a different size to * ensure the value doesn't overlap with existing values. */ item.size = g.c_key_max + 10; item.data = dcalloc(item.size, 1); memset((void *)item.data, 'a', item.size); for (i = 0, fp = 0; i < g.c_ops; i++) { ((uint8_t *)item.data)[i % item.size] = 'a' + ((uint8_t)rand() % 26); if ((ret = __wt_bloom_get(bloomp, &item)) == 0) ++fp; if (ret != 0 && ret != WT_NOTFOUND) testutil_die(ret, "__wt_bloom_get"); } free((void *)item.data); printf( "Out of %" PRIu32 " ops, got %" PRIu32 " false positives, %.4f%%\n", g.c_ops, fp, 100.0 * fp/g.c_ops); testutil_check(__wt_bloom_drop(bloomp, NULL)); }
/*
 * __wt_lsm_merge --
 *	Merge a set of chunks of an LSM tree.
 */
int
__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
{
	WT_BLOOM *bloom;
	WT_CURSOR *dest, *src;
	WT_DECL_RET;
	WT_ITEM key, value;
	WT_LSM_CHUNK *chunk;
	uint32_t generation;
	uint64_t insert_count, record_count;
	u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb;
	int tret;
	bool created_chunk, create_bloom, locked, in_sync;
	const char *cfg[3];
	const char *drop_cfg[] =
	    { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };

	bloom = NULL;
	chunk = NULL;
	dest = src = NULL;
	start_id = 0;
	/* State flags drive which resources the err path must release. */
	created_chunk = create_bloom = locked = in_sync = false;

	/* Fast path if it's obvious no merges could be done. */
	if (lsm_tree->nchunks < lsm_tree->merge_min &&
	    lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD)
		return (WT_NOTFOUND);

	/*
	 * Use the lsm_tree lock to read the chunks (so no switches occur), but
	 * avoid holding it while the merge is in progress: that may take a
	 * long time.
	 */
	WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = true;

	WT_ERR(__lsm_merge_span(session,
	    lsm_tree, id, &start_chunk, &end_chunk, &record_count));
	nchunks = (end_chunk + 1) - start_chunk;

	WT_ASSERT(session, nchunks > 0);
	start_id = lsm_tree->chunk[start_chunk]->id;

	/* Find the merge generation. */
	for (generation = 0, i = 0; i < nchunks; i++)
		generation = WT_MAX(generation,
		    lsm_tree->chunk[start_chunk + i]->generation + 1);

	WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
	locked = false;

	/* Allocate an ID for the merge. */
	dest_id = __wt_atomic_add32(&lsm_tree->last, 1);

	/*
	 * We only want to do the chunk loop if we're running with verbose,
	 * so we wrap these statements in the conditional. Avoid the loop
	 * in the normal path.
	 */
	if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
		    ", generation %" PRIu32,
		    lsm_tree->name,
		    start_chunk, end_chunk, dest_id, record_count,
		    generation));
		for (verb = start_chunk; verb <= end_chunk; verb++)
			WT_ERR(__wt_verbose(session, WT_VERB_LSM,
			    "Merging %s: Chunk[%u] id %u, gen: %" PRIu32
			    ", size: %" PRIu64 ", records: %" PRIu64,
			    lsm_tree->name, verb, lsm_tree->chunk[verb]->id,
			    lsm_tree->chunk[verb]->generation,
			    lsm_tree->chunk[verb]->size,
			    lsm_tree->chunk[verb]->count));
	}

	WT_ERR(__wt_calloc_one(session, &chunk));
	created_chunk = true;
	chunk->id = dest_id;

	/* Decide whether the merged chunk should carry a Bloom filter. */
	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
	    (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
	    start_chunk > 0) && record_count > 0)
		create_bloom = true;

	/*
	 * Special setup for the merge cursor:
	 * first, reset to open the dependent cursors;
	 * then restrict the cursor to a specific number of chunks;
	 * then set MERGE so the cursor doesn't track updates to the tree.
	 */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
	WT_ERR(ret);
	if (create_bloom) {
		WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

		WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
		    lsm_tree->bloom_config, record_count,
		    lsm_tree->bloom_bit_count,
		    lsm_tree->bloom_hash_count, &bloom));
	}

	/* Discard pages we read as soon as we're done with them. */
	F_SET(session, WT_SESSION_NO_CACHE);

	cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
	cfg[1] = "bulk,raw,skip_sort_check";
	cfg[2] = NULL;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));

/* Check periodically that the merge is still wanted. */
#define	LSM_MERGE_CHECK_INTERVAL	WT_THOUSAND
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
			/* Abort if the tree is being closed. */
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
				WT_ERR(EINTR);

			WT_STAT_FAST_CONN_INCRV(session,
			    lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
			++lsm_tree->merge_progressing;
		}

		WT_ERR(src->get_key(src, &key));
		dest->set_key(dest, &key);
		WT_ERR(src->get_value(src, &value));
		dest->set_value(dest, &value);
		WT_ERR(dest->insert(dest));
		if (create_bloom)
			WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);

	WT_STAT_FAST_CONN_INCRV(session,
	    lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
	++lsm_tree->merge_progressing;
	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
	    record_count, insert_count));

	/*
	 * Closing and syncing the files can take a while. Set the
	 * merge_syncing field so that compact knows it is still in
	 * progress.
	 */
	(void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1);
	in_sync = true;
	/*
	 * We've successfully created the new chunk. Now install it. We need
	 * to ensure that the NO_CACHE flag is cleared and the bloom filter
	 * is closed (even if a step fails), so track errors but don't return
	 * until we've cleaned up.
	 */
	WT_TRET(src->close(src));
	WT_TRET(dest->close(dest));
	src = dest = NULL;

	F_CLR(session, WT_SESSION_NO_CACHE);

	/*
	 * We're doing advisory reads to fault the new trees into cache.
	 * Don't block if the cache is full: our next unit of work may be to
	 * discard some trees to free space.
	 */
	F_SET(session, WT_SESSION_NO_EVICTION);

	if (create_bloom) {
		if (ret == 0)
			WT_TRET(__wt_bloom_finalize(bloom));

		/*
		 * Read in a key to make sure the Bloom filters btree handle is
		 * open before it becomes visible to application threads.
		 * Otherwise application threads will stall while it is opened
		 * and internal pages are read into cache.
		 */
		if (ret == 0) {
			WT_CLEAR(key);
			WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
		}

		WT_TRET(__wt_bloom_close(bloom));
		bloom = NULL;
	}
	WT_ERR(ret);

	/*
	 * Open a handle on the new chunk before application threads attempt
	 * to access it, opening it pre-loads internal pages into the file
	 * system cache.
	 */
	cfg[1] = "checkpoint=" WT_CHECKPOINT;
	WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
	WT_TRET(dest->close(dest));
	dest = NULL;
	++lsm_tree->merge_progressing;
	(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	in_sync = false;
	WT_ERR_NOTFOUND_OK(ret);

	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = true;

	/*
	 * Check whether we raced with another merge, and adjust the chunk
	 * array offset as necessary.
	 */
	if (start_chunk >= lsm_tree->nchunks ||
	    lsm_tree->chunk[start_chunk]->id != start_id)
		for (start_chunk = 0;
		    start_chunk < lsm_tree->nchunks;
		    start_chunk++)
			if (lsm_tree->chunk[start_chunk]->id == start_id)
				break;

	/*
	 * It is safe to error out here - since the update can only fail
	 * prior to making updates to the tree.
	 */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, start_chunk, nchunks, chunk));

	if (create_bloom)
		F_SET(chunk, WT_LSM_CHUNK_BLOOM);
	chunk->count = insert_count;
	chunk->generation = generation;
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);

	/*
	 * We have no current way of continuing if the metadata update fails,
	 * so we will panic in that case. Put some effort into cleaning up
	 * after ourselves here - so things have a chance of shutting down.
	 *
	 * Any errors that happened after the tree was locked are
	 * fatal - we can't guarantee the state of the tree.
	 */
	if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
		WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");

	lsm_tree->dsk_gen++;

	/* Update the throttling while holding the tree lock. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);

	/* Schedule a pass to discard old chunks */
	WT_ERR(__wt_lsm_manager_push_entry(
	    session, WT_LSM_WORK_DROP, 0, lsm_tree));

	/* Cleanup: release everything the state flags say we still hold. */
err:	if (locked)
		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	if (in_sync)
		(void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
	if (src != NULL)
		WT_TRET(src->close(src));
	if (dest != NULL)
		WT_TRET(dest->close(dest));
	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	if (ret != 0 && created_chunk) {
		/* Drop the newly-created files on error. */
		if (chunk->uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session, tret =
			    __wt_schema_drop(session, chunk->uri, drop_cfg));
			WT_TRET(tret);
		}
		if (create_bloom && chunk->bloom_uri != NULL) {
			WT_WITH_SCHEMA_LOCK(session, tret = __wt_schema_drop(
			    session, chunk->bloom_uri, drop_cfg));
			WT_TRET(tret);
		}
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, chunk);

		if (ret == EINTR)
			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
			    "Merge aborted due to close"));
		else
			WT_TRET(__wt_verbose(session, WT_VERB_LSM,
			    "Merge failed with %s",
			    __wt_strerror(session, ret, NULL, 0)));
	}
	F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
	return (ret);
}
/*
 * __curjoin_init_next --
 *	Initialize the cursor join when the next function is first called.
 */
static int
__curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    bool iterable)
{
	WT_BLOOM *bloom;
	WT_CURSOR *origcur;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
	WT_DECL_RET;
	size_t size;
	uint32_t f, k;
	char *mainbuf;
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), NULL };
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    session, WT_SESSION_open_cursor), "raw", NULL };
	const char **config, *proj, *urimain;

	mainbuf = NULL;
	if (cjoin->entries_next == 0)
		WT_RET_MSG(session, EINVAL,
		    "join cursor has not yet been joined with any other "
		    "cursors");

	/* Get a consistent view of our subordinate cursors if appropriate. */
	__wt_txn_cursor_op(session);

	if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
		config = &raw_cfg[0];
	else
		config = &def_cfg[0];

	/* Append the projection, if any, to the main table URI. */
	urimain = cjoin->table->iface.name;
	if ((proj = cjoin->projection) != NULL) {
		size = strlen(urimain) + strlen(proj) + 1;
		WT_ERR(__wt_calloc(session, size, 1, &mainbuf));
		WT_ERR(__wt_snprintf(mainbuf, size, "%s%s", urimain, proj));
		urimain = mainbuf;
	}
	WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin, config,
	    &cjoin->main));

	jeend = &cjoin->entries[cjoin->entries_next];
	for (je = cjoin->entries; je < jeend; je++) {
		/* Subjoins are initialized recursively. */
		if (je->subjoin != NULL) {
			WT_ERR(__curjoin_init_next(session, je->subjoin,
			    iterable));
			continue;
		}
		__wt_stat_join_init_single(&je->stats);
		/*
		 * For a single compare=le/lt endpoint in any entry that may
		 * be iterated, construct a companion compare=ge endpoint
		 * that will actually be iterated.
		 */
		if (iterable && je->ends_next == 1 &&
		    F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) {
			origcur = je->ends[0].cursor;
			WT_ERR(__curjoin_insert_endpoint(session, je, 0, &end));
			WT_ERR(__wt_open_cursor(session, origcur->uri,
			    (WT_CURSOR *)cjoin,
			    F_ISSET(origcur, WT_CURSTD_RAW) ?
			    raw_cfg : def_cfg, &end->cursor));
			end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ |
			    WT_CURJOIN_END_OWN_CURSOR;
			WT_ERR(end->cursor->next(end->cursor));
			F_CLR(je, WT_CURJOIN_ENTRY_DISJUNCTION);
		}
		for (end = &je->ends[0]; end < &je->ends[je->ends_next];
		    end++)
			WT_ERR(__curjoin_endpoint_init_key(session, je, end));

		/*
		 * Do any needed Bloom filter initialization.  Ignore Bloom
		 * filters for entries that will be iterated.  They won't
		 * help since these entries either don't need an inclusion
		 * check or are doing any needed check during the iteration.
		 */
		if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
			if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
				WT_ERR_MSG(session, EINVAL,
				    "join cursors with Bloom filters cannot be "
				    "used with read-uncommitted isolation");
			if (je->bloom == NULL) {
				/*
				 * Look for compatible filters to be shared,
				 * pick compatible numbers for bit counts
				 * and number of hashes.
				 */
				f = je->bloom_bit_count;
				k = je->bloom_hash_count;
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						f = WT_MAX(
						    je2->bloom_bit_count, f);
						k = WT_MAX(
						    je2->bloom_hash_count, k);
					}
				je->bloom_bit_count = f;
				je->bloom_hash_count = k;
				WT_ERR(__wt_bloom_create(session, NULL, NULL,
				    je->count, f, k, &je->bloom));
				F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
				WT_ERR(__curjoin_init_bloom(session, cjoin,
				    je, je->bloom));
				/*
				 * Share the Bloom filter, making all
				 * config info consistent.
				 */
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						WT_ASSERT(session,
						    je2->bloom == NULL);
						je2->bloom = je->bloom;
						je2->bloom_bit_count = f;
						je2->bloom_hash_count = k;
					}
			} else {
				/*
				 * Create a temporary filter that we'll
				 * merge into the shared one.  The Bloom
				 * parameters of the two filters must match.
				 *
				 * NOTE(review): if __curjoin_init_bloom or
				 * __wt_bloom_intersection fails here, the
				 * temporary filter is not closed — confirm
				 * whether that leak is acceptable.
				 */
				WT_ERR(__wt_bloom_create(session, NULL, NULL,
				    je->count, je->bloom_bit_count,
				    je->bloom_hash_count, &bloom));
				WT_ERR(__curjoin_init_bloom(session, cjoin,
				    je, bloom));
				WT_ERR(__wt_bloom_intersection(je->bloom,
				    bloom));
				WT_ERR(__wt_bloom_close(bloom));
			}
		}
		if (!F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
			iterable = false;
	}
	F_SET(cjoin, WT_CURJOIN_INITIALIZED);

err:	__wt_free(session, mainbuf);
	return (ret);
}
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet been merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM buf, key;
	WT_SESSION *wt_session;
	uint64_t insert_count;
	int exist;

	/*
	 * Normally, the Bloom URI is populated when the chunk struct is
	 * allocated.  After an open, however, it may not have been.
	 * Deal with that here.
	 */
	if (chunk->bloom_uri == NULL) {
		WT_CLEAR(buf);
		WT_RET(__wt_lsm_tree_bloom_name(
		    session, lsm_tree, chunk->id, &buf));
		chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL);
	}

	/*
	 * Drop the bloom filter first - there may be some content hanging over
	 * from an aborted merge or checkpoint.
	 */
	wt_session = &session->iface;
	WT_RET(__wt_exist(
	    session, chunk->bloom_uri + strlen("file:"), &exist));
	if (exist)
		WT_RET(wt_session->drop(
		    wt_session, chunk->bloom_uri, "force"));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;
	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	/* Don't hold the pages we read into cache. */
	F_SET(session, WT_SESSION_NO_CACHE);

	/* Insert every key in the chunk into the new filter. */
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	WT_VERBOSE_ERR(session, lsm,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count);

	/* Ensure the bloom filter is in the metadata. */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
	F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

	/* Error path: close the filter handle and restore session flags. */
err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE);
	return (ret);
}
/*
 * __curjoin_init_iter --
 *	Initialize before any iteration.
 */
static int
__curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin)
{
	WT_BLOOM *bloom;
	WT_DECL_RET;
	WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
	WT_CURSOR_JOIN_ENDPOINT *end;
	uint32_t f, k;

	if (cjoin->entries_next == 0)
		WT_RET_MSG(session, EINVAL,
		    "join cursor has not yet been joined with any other "
		    "cursors");

	/* The first entry drives the iteration. */
	je = &cjoin->entries[0];
	WT_RET(__curjoin_entry_iter_init(session, cjoin, je, &cjoin->iter));

	jeend = &cjoin->entries[cjoin->entries_next];
	for (je = cjoin->entries; je < jeend; je++) {
		__wt_stat_join_init_single(&je->stats);
		for (end = &je->ends[0]; end < &je->ends[je->ends_next];
		    end++)
			WT_RET(__curjoin_endpoint_init_key(session, je, end));

		/*
		 * The first entry is iterated as the 'outermost' cursor.
		 * For the common GE case, we don't have to test against
		 * the left reference key, we know it will be true since
		 * the btree is ordered.
		 */
		if (je == cjoin->entries && je->ends[0].flags ==
		    (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ))
			F_SET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT);

		if (F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
			if (je->bloom == NULL) {
				/*
				 * Look for compatible filters to be shared,
				 * pick compatible numbers for bit counts
				 * and number of hashes.
				 */
				f = je->bloom_bit_count;
				k = je->bloom_hash_count;
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						f = WT_MAX(
						    je2->bloom_bit_count, f);
						k = WT_MAX(
						    je2->bloom_hash_count, k);
					}
				je->bloom_bit_count = f;
				je->bloom_hash_count = k;
				WT_RET(__wt_bloom_create(session, NULL, NULL,
				    je->count, f, k, &je->bloom));
				F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
				WT_RET(__curjoin_init_bloom(session, cjoin,
				    je, je->bloom));
				/*
				 * Share the Bloom filter, making all
				 * config info consistent.
				 */
				for (je2 = je + 1; je2 < jeend; je2++)
					if (F_ISSET(je2,
					    WT_CURJOIN_ENTRY_BLOOM) &&
					    je2->count == je->count) {
						WT_ASSERT(session,
						    je2->bloom == NULL);
						je2->bloom = je->bloom;
						je2->bloom_bit_count = f;
						je2->bloom_hash_count = k;
					}
			} else {
				/*
				 * Create a temporary filter that we'll
				 * merge into the shared one.  The Bloom
				 * parameters of the two filters must match.
				 *
				 * NOTE(review): WT_RET returns immediately on
				 * error, so a failure in init/intersection
				 * leaks the temporary filter — confirm
				 * whether that is acceptable.
				 */
				WT_RET(__wt_bloom_create(session, NULL, NULL,
				    je->count, je->bloom_bit_count,
				    je->bloom_hash_count, &bloom));
				WT_RET(__curjoin_init_bloom(session, cjoin,
				    je, bloom));
				WT_RET(__wt_bloom_intersection(je->bloom,
				    bloom));
				WT_RET(__wt_bloom_close(bloom));
			}
		}
	}
	F_SET(cjoin, WT_CURJOIN_INITIALIZED);
	return (ret);
}