/*
 * __wt_clsm_request_switch --
 *    Request an LSM tree switch for a cursor operation.
 */
int
__wt_clsm_request_switch(WT_CURSOR_LSM *clsm)
{
    WT_DECL_RET;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;

    lsm_tree = clsm->lsm_tree;
    session = (WT_SESSION_IMPL *)clsm->iface.session;

    if (!F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
        /*
         * Check that we are up-to-date: don't set the switch if the
         * tree has changed since we last opened cursors: that can lead
         * to switching multiple times when only one switch is
         * required, creating very small chunks.
         */
        WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
        if (lsm_tree->nchunks == 0 ||
            (clsm->dsk_gen == lsm_tree->dsk_gen &&
            !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))) {
            F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
            ret = __wt_lsm_manager_push_entry(
                session, WT_LSM_WORK_SWITCH, 0, lsm_tree);
        }
        WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
    }

    return (ret);
}
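/*
 * __clsm_switch_usage_example --
 *    Illustrative sketch of the caller-side pattern for the function above:
 *    a cursor operation that needs a fresh in-memory chunk pairs the switch
 *    request with a wait for an LSM worker thread to service it. This helper
 *    is hypothetical; __wt_clsm_await_switch is the wait used by the
 *    bulk-open path below.
 */
static int
__clsm_switch_usage_example(WT_CURSOR_LSM *clsm)
{
    /* Queue a work unit asking a worker to create a new in-memory chunk. */
    WT_RET(__wt_clsm_request_switch(clsm));

    /* Block until the switch has actually happened. */
    return (__wt_clsm_await_switch(clsm));
}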
/*
 * __lsm_copy_chunks --
 *    Take a copy of part of the LSM tree chunk array so that we can work on
 *    the contents without holding the LSM tree handle lock long term.
 */
static int
__lsm_copy_chunks(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_WORKER_COOKIE *cookie, bool old_chunks)
{
    WT_DECL_RET;
    u_int i, nchunks;
    size_t alloc;

    /* Always return zero chunks on error. */
    cookie->nchunks = 0;

    WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
    if (!lsm_tree->active)
        return (__wt_lsm_tree_readunlock(session, lsm_tree));

    /* Take a copy of the current state of the LSM tree. */
    nchunks = old_chunks ? lsm_tree->nold_chunks : lsm_tree->nchunks;
    alloc = old_chunks ? lsm_tree->old_alloc : lsm_tree->chunk_alloc;

    /*
     * If the tree's array of active chunks is larger than our current
     * buffer, grow our buffer to match.
     */
    if (cookie->chunk_alloc < alloc)
        WT_ERR(__wt_realloc(session,
            &cookie->chunk_alloc, alloc, &cookie->chunk_array));
    if (nchunks > 0)
        memcpy(cookie->chunk_array,
            old_chunks ? lsm_tree->old_chunks : lsm_tree->chunk,
            nchunks * sizeof(*cookie->chunk_array));

    /*
     * Mark each chunk as active, so we don't drop it until after we know
     * it's safe.
     */
    for (i = 0; i < nchunks; i++)
        (void)__wt_atomic_add32(&cookie->chunk_array[i]->refcnt, 1);

err:    WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));

    if (ret == 0)
        cookie->nchunks = nchunks;
    return (ret);
}
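/*
 * __lsm_unpin_chunks_example --
 *    Illustrative sketch of the counterpart to the pinning done above: a
 *    worker that has finished with the copied chunk array is expected to
 *    drop each reference so chunk drops can proceed. This helper is
 *    hypothetical and assumes __wt_atomic_sub32 mirrors the
 *    __wt_atomic_add32 call used in __lsm_copy_chunks.
 */
static void
__lsm_unpin_chunks_example(WT_LSM_WORKER_COOKIE *cookie)
{
    u_int i;

    /* Release the reference taken on each copied chunk. */
    for (i = 0; i < cookie->nchunks; i++)
        (void)__wt_atomic_sub32(&cookie->chunk_array[i]->refcnt, 1);
    cookie->nchunks = 0;
}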
/*
 * __wt_clsm_open_bulk --
 *    WT_SESSION->open_cursor method for LSM bulk cursors.
 */
int
__wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[])
{
    WT_CURSOR *cursor, *bulk_cursor;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;

    bulk_cursor = NULL;
    cursor = &clsm->iface;
    lsm_tree = clsm->lsm_tree;
    session = (WT_SESSION_IMPL *)clsm->iface.session;

    F_SET(clsm, WT_CLSM_BULK);

    /* Bulk cursors are limited to insert and close. */
    __wt_cursor_set_notsup(cursor);
    cursor->insert = __clsm_insert_bulk;
    cursor->close = __clsm_close_bulk;

    /* Set up the first chunk in the tree. */
    WT_RET(__wt_clsm_request_switch(clsm));
    WT_RET(__wt_clsm_await_switch(clsm));

    /*
     * Grab and release the LSM tree lock to ensure that the first chunk
     * has been fully created before proceeding. We have the LSM tree
     * open exclusive, so that saves us from needing the lock generally.
     */
    WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
    WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree));

    /*
     * The bulk cursor we open on the first chunk is not a regular LSM
     * chunk cursor, but it uses the standard storage locations. Allocate
     * the space for a bloom filter: that makes cleanup simpler, since
     * cursor close handles it on error.
     */
    WT_RET(__wt_calloc_one(session, &clsm->blooms));
    clsm->bloom_alloc = 1;
    WT_RET(__wt_calloc_one(session, &clsm->cursors));
    clsm->cursor_alloc = 1;
    clsm->nchunks = 1;

    /*
     * Open a bulk cursor on the first chunk in the tree, passing through
     * the application config to ensure the tree is open for bulk access.
     * The lock grab-and-release above guarantees the first chunk was
     * fully created before we got here.
     */
    WT_RET(__wt_open_cursor(session,
        lsm_tree->chunk[0]->uri, &clsm->iface, cfg, &bulk_cursor));
    clsm->cursors[0] = bulk_cursor;

    /* LSM cursors are always raw. */
    F_SET(bulk_cursor, WT_CURSTD_RAW);

    return (0);
}
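/*
 * __clsm_bulk_usage_example --
 *    Illustrative sketch of the application-level path that reaches
 *    __wt_clsm_open_bulk: a cursor opened on an LSM URI with the "bulk"
 *    configuration supports only insert and close, per the setup above.
 *    The URI and key/value strings are hypothetical, and the sketch
 *    assumes a string key/value format.
 */
static int
__clsm_bulk_usage_example(WT_SESSION *wt_session)
{
    WT_CURSOR *bulk;

    WT_RET(wt_session->open_cursor(
        wt_session, "lsm:example", NULL, "bulk", &bulk));
    bulk->set_key(bulk, "key0");
    bulk->set_value(bulk, "value0");
    WT_RET(bulk->insert(bulk));
    return (bulk->close(bulk));
}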
/*
 * __wt_lsm_tree_worker --
 *    Run a schema worker operation on each level of an LSM tree.
 */
int
__wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri,
    int (*file_func)(WT_SESSION_IMPL *, const char *[]),
    int (*name_func)(WT_SESSION_IMPL *, const char *, int *),
    const char *cfg[], uint32_t open_flags)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    u_int i;
    int exclusive, locked;

    locked = 0;
    exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE) ? 1 : 0;
    WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));

    /*
     * Mark that we're busy using the tree to coordinate with merges, so
     * that merging doesn't change the chunk array out from under us.
     */
    WT_ERR(exclusive ?
        __wt_lsm_tree_writelock(session, lsm_tree) :
        __wt_lsm_tree_readlock(session, lsm_tree));
    locked = 1;
    for (i = 0; i < lsm_tree->nchunks; i++) {
        chunk = lsm_tree->chunk[i];
        if (file_func == __wt_checkpoint &&
            F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
            continue;
        WT_ERR(__wt_schema_worker(session, chunk->uri,
            file_func, name_func, cfg, open_flags));
        if (name_func == __wt_backup_list_uri_append &&
            F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
            WT_ERR(__wt_schema_worker(session, chunk->bloom_uri,
                file_func, name_func, cfg, open_flags));
    }

err:    if (locked)
        WT_TRET(exclusive ?
            __wt_lsm_tree_writeunlock(session, lsm_tree) :
            __wt_lsm_tree_readunlock(session, lsm_tree));
    __wt_lsm_tree_release(session, lsm_tree);
    return (ret);
}
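/*
 * __lsm_worker_usage_example --
 *    Illustrative sketch of how a schema operation fans out over an LSM
 *    tree through the function above: checkpointing applies __wt_checkpoint
 *    to every chunk that is not already on disk, per the special case in
 *    the loop. The wrapper itself is hypothetical; the open_flags value of
 *    0 requests a shared (read-locked) walk rather than an exclusive one.
 */
static int
__lsm_worker_usage_example(
    WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
{
    return (__wt_lsm_tree_worker(
        session, uri, __wt_checkpoint, NULL, cfg, 0));
}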
/*
 * __wt_lsm_get_chunk_to_flush --
 *    Find and pin a chunk in the LSM tree that is likely to need flushing.
 */
int
__wt_lsm_get_chunk_to_flush(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, bool force, WT_LSM_CHUNK **chunkp)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk, *evict_chunk, *flush_chunk;
    u_int i;

    *chunkp = NULL;
    chunk = evict_chunk = flush_chunk = NULL;

    WT_ASSERT(session, lsm_tree->queue_ref > 0);
    WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
    if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE) || lsm_tree->nchunks == 0)
        return (__wt_lsm_tree_readunlock(session, lsm_tree));

    /* Search for a chunk to evict and/or a chunk to flush. */
    for (i = 0; i < lsm_tree->nchunks; i++) {
        chunk = lsm_tree->chunk[i];
        if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
            /*
             * Normally we don't want to force out the last chunk.
             * But if we're doing a forced flush on behalf of a
             * compact, then we want to include the final chunk.
             */
            if (evict_chunk == NULL &&
                !chunk->evicted &&
                !F_ISSET(chunk, WT_LSM_CHUNK_STABLE))
                evict_chunk = chunk;
        } else if (flush_chunk == NULL &&
            chunk->switch_txn != 0 &&
            (force || i < lsm_tree->nchunks - 1))
            flush_chunk = chunk;
    }

    /*
     * Don't be overly zealous about pushing old chunks from cache:
     * attempting too many drops can interfere with checkpoints.
     *
     * If retrying a discard, push an additional work unit so there are
     * enough to trigger checkpoints.
     */
    if (evict_chunk != NULL && flush_chunk != NULL) {
        chunk = (__wt_random(&session->rnd) & 1) ?
            evict_chunk : flush_chunk;
        WT_ERR(__wt_lsm_manager_push_entry(
            session, WT_LSM_WORK_FLUSH, 0, lsm_tree));
    } else
        chunk = (evict_chunk != NULL) ? evict_chunk : flush_chunk;

    if (chunk != NULL) {
        WT_ERR(__wt_verbose(session, WT_VERB_LSM,
            "Flush%s: return chunk %u of %u: %s",
            force ? " w/ force" : "",
            i, lsm_tree->nchunks, chunk->uri));
        (void)__wt_atomic_add32(&chunk->refcnt, 1);
    }

err:    WT_RET(__wt_lsm_tree_readunlock(session, lsm_tree));

    *chunkp = chunk;
    return (ret);
}
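/*
 * __lsm_flush_usage_example --
 *    Illustrative sketch of the caller-side contract for the function
 *    above: the returned chunk is pinned by its reference count, so the
 *    caller must drop that reference once the flush attempt finishes,
 *    whether or not it succeeded. __lsm_do_flush is a hypothetical
 *    stand-in for the real flush work.
 */
static int
__lsm_flush_usage_example(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;

    WT_RET(__wt_lsm_get_chunk_to_flush(session, lsm_tree, false, &chunk));
    if (chunk == NULL)
        return (0);

    ret = __lsm_do_flush(session, lsm_tree, chunk);

    /* Drop the reference taken by __wt_lsm_get_chunk_to_flush. */
    (void)__wt_atomic_sub32(&chunk->refcnt, 1);
    return (ret);
}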
/*
 * __clsm_open_cursors --
 *    Open cursors for the current set of files.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id)
{
    WT_BTREE *btree;
    WT_CURSOR *c, **cp, *primary;
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;
    WT_TXN *txn;
    const char *checkpoint, *ckpt_cfg[3];
    uint64_t saved_gen;
    u_int i, nchunks, ngood, nupdates;
    u_int close_range_end, close_range_start;
    bool locked;

    c = &clsm->iface;
    session = (WT_SESSION_IMPL *)c->session;
    txn = &session->txn;
    chunk = NULL;
    locked = false;
    lsm_tree = clsm->lsm_tree;

    /*
     * Ensure that any snapshot update has cursors on the right set of
     * chunks to guarantee visibility is correct.
     */
    if (update && txn->isolation == WT_ISO_SNAPSHOT)
        F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);

    /*
     * Query operations need a full set of cursors. Overwrite cursors
     * do queries in service of updates.
     */
    if (!update || !F_ISSET(c, WT_CURSTD_OVERWRITE))
        F_SET(clsm, WT_CLSM_OPEN_READ);

    if (lsm_tree->nchunks == 0)
        return (0);

    ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
    ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
    ckpt_cfg[2] = NULL;

    /*
     * If the key is pointing to memory that is pinned by a chunk
     * cursor, take a copy before closing cursors.
     */
    if (F_ISSET(c, WT_CURSTD_KEY_INT))
        WT_CURSOR_NEEDKEY(c);

    F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

    WT_RET(__wt_lsm_tree_readlock(session, lsm_tree));
    locked = true;

    /* Merge cursors have already figured out how many chunks they need. */
retry:    if (F_ISSET(clsm, WT_CLSM_MERGE)) {
        nchunks = clsm->nchunks;
        ngood = 0;

        /*
         * We may have raced with another merge completing. Check that
         * we're starting at the right offset in the chunk array.
         */
        if (start_chunk >= lsm_tree->nchunks ||
            lsm_tree->chunk[start_chunk]->id != start_id) {
            for (start_chunk = 0;
                start_chunk < lsm_tree->nchunks;
                start_chunk++) {
                chunk = lsm_tree->chunk[start_chunk];
                if (chunk->id == start_id)
                    break;
            }
            /* We have to find the start chunk: merge locked it. */
            WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
        }

        WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
    } else {
        nchunks = lsm_tree->nchunks;

        /*
         * If we are only opening the cursor for updates, only open the
         * primary chunk, plus any other chunks that might be required
         * to detect snapshot isolation conflicts.
         */
        if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
            WT_ERR(__wt_realloc_def(session,
                &clsm->txnid_alloc, nchunks, &clsm->switch_txn));
        if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
            ngood = nupdates = 0;
        else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
            /*
             * Keep going until all updates in the next
             * chunk are globally visible. Copy the maximum
             * transaction IDs into the cursor as we go.
             */
            for (ngood = nchunks - 1, nupdates = 1; ngood > 0;
                ngood--, nupdates++) {
                chunk = lsm_tree->chunk[ngood - 1];
                clsm->switch_txn[ngood - 1] = chunk->switch_txn;
                if (__wt_txn_visible_all(
                    session, chunk->switch_txn))
                    break;
            }
        } else {
            nupdates = 1;
            ngood = nchunks - 1;
        }

        /* Check how many cursors are already open. */
        for (cp = clsm->cursors + ngood;
            ngood < clsm->nchunks && ngood < nchunks;
            cp++, ngood++) {
            chunk = lsm_tree->chunk[ngood];

            /* If the cursor isn't open yet, we're done. */
            if (*cp == NULL)
                break;

            /* Easy case: the URIs don't match. */
            if (strcmp((*cp)->uri, chunk->uri) != 0)
                break;

            /* Make sure the checkpoint config matches. */
            checkpoint = ((WT_CURSOR_BTREE *)*cp)->
                btree->dhandle->checkpoint;
            if (checkpoint == NULL &&
                F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
                !chunk->empty)
                break;

            /* Make sure the Bloom config matches. */
            if (clsm->blooms[ngood] == NULL &&
                F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
                break;
        }

        /* Spurious generation bump? */
        if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
            clsm->dsk_gen = lsm_tree->dsk_gen;
            goto err;
        }

        /*
         * Close any cursors we no longer need.
         *
         * Drop the LSM tree lock while we do this: if the cache is
         * full, we may block while closing a cursor. Save the
         * generation number and retry if it has changed under us.
         */
        if (clsm->cursors != NULL && ngood < clsm->nchunks) {
            close_range_start = ngood;
            close_range_end = clsm->nchunks;
        } else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0) {
            close_range_start = 0;
            close_range_end = WT_MIN(nchunks, clsm->nchunks);
            if (close_range_end > nupdates)
                close_range_end -= nupdates;
            else
                close_range_end = 0;
            WT_ASSERT(session, ngood >= close_range_end);
        } else {
            close_range_end = 0;
            close_range_start = 0;
        }
        if (close_range_end > close_range_start) {
            saved_gen = lsm_tree->dsk_gen;
            locked = false;
            WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree));
            WT_ERR(__clsm_close_cursors(
                clsm, close_range_start, close_range_end));
            WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
            locked = true;
            if (lsm_tree->dsk_gen != saved_gen)
                goto retry;
        }

        /* Detach from our old primary. */
        clsm->primary_chunk = NULL;
        clsm->current = NULL;
    }

    WT_ERR(__wt_realloc_def(session,
        &clsm->bloom_alloc, nchunks, &clsm->blooms));
    WT_ERR(__wt_realloc_def(session,
        &clsm->cursor_alloc, nchunks, &clsm->cursors));

    clsm->nchunks = nchunks;

    /* Open the cursors for chunks that have changed. */
    for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
        chunk = lsm_tree->chunk[i + start_chunk];

        /* Copy the maximum transaction ID. */
        if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
            clsm->switch_txn[i] = chunk->switch_txn;

        /*
         * Read from the checkpoint if the file has been written.
         * Once all cursors switch, the in-memory tree can be evicted.
         */
        WT_ASSERT(session, *cp == NULL);
        ret = __wt_open_cursor(session, chunk->uri, c,
            (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
            ckpt_cfg : NULL, cp);

        /*
         * XXX kludge: we may have an empty chunk where no checkpoint
         * was written. If so, try to open the ordinary handle on that
         * chunk instead.
         */
        if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
            ret = __wt_open_cursor(
                session, chunk->uri, c, NULL, cp);
            if (ret == 0)
                chunk->empty = 1;
        }
        WT_ERR(ret);

        /*
         * Set up all cursors other than the primary to only do conflict
         * checks on insert operations. This allows us to execute
         * inserts on non-primary chunks as a way of checking for
         * write conflicts with concurrent updates.
         */
        if (i != nchunks - 1)
            (*cp)->insert = __wt_curfile_update_check;

        if (!F_ISSET(clsm, WT_CLSM_MERGE) &&
            F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
            WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
                lsm_tree->bloom_bit_count,
                lsm_tree->bloom_hash_count,
                c, &clsm->blooms[i]));

        /* Child cursors always use overwrite and raw mode. */
        F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
    }

    /* The last chunk is our new primary. */
    if (chunk != NULL &&
        !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
        chunk->switch_txn == WT_TXN_NONE) {
        clsm->primary_chunk = chunk;
        primary = clsm->cursors[clsm->nchunks - 1];

        /*
         * Disable eviction for the in-memory chunk. Also clear the
         * bulk load flag here, otherwise eviction will be enabled by
         * the first update.
         */
        btree = ((WT_CURSOR_BTREE *)primary)->btree;
        if (btree->bulk_load_ok) {
            btree->bulk_load_ok = false;
            WT_WITH_BTREE(session, btree,
                __wt_btree_evictable(session, false));
        }
    }

    clsm->dsk_gen = lsm_tree->dsk_gen;

err:
#ifdef HAVE_DIAGNOSTIC
    /* Check that all cursors are open as expected. */
    if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
        for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) {
            chunk = lsm_tree->chunk[i + start_chunk];

            /* Make sure the cursor is open. */
            WT_ASSERT(session, *cp != NULL);

            /* Easy case: the URIs should match. */
            WT_ASSERT(session,
                strcmp((*cp)->uri, chunk->uri) == 0);

            /* Make sure the checkpoint config matches. */
            checkpoint = ((WT_CURSOR_BTREE *)*cp)->
                btree->dhandle->checkpoint;
            WT_ASSERT(session,
                (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
                !chunk->empty) ?
                checkpoint != NULL : checkpoint == NULL);

            /* Make sure the Bloom config matches. */
            WT_ASSERT(session,
                (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
                !F_ISSET(clsm, WT_CLSM_MERGE)) ?
                clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
        }
    }
#endif
    if (locked)
        WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
    return (ret);
}
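/*
 * __clsm_reopen_example --
 *    Illustrative sketch of when the function above is re-run: a cursor
 *    operation compares the disk generation saved by __clsm_open_cursors
 *    against the tree's current generation, and reopens the full cursor
 *    set from the start of the chunk array if the tree has switched or
 *    merged underneath it. This helper is hypothetical; it mirrors the
 *    check made on entry to LSM cursor operations.
 */
static int
__clsm_reopen_example(WT_CURSOR_LSM *clsm, bool update)
{
    if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
        return (__clsm_open_cursors(clsm, update, 0, 0));
    return (0);
}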