/* * __clsm_open_cursors -- * Open cursors for the current set of files. */ static int __clsm_open_cursors( WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id) { WT_CURSOR *c, **cp, *primary; WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; WT_TXN *txn; const char *checkpoint, *ckpt_cfg[3]; uint64_t saved_gen; u_int i, nchunks, ngood, nupdates; int locked; c = &clsm->iface; session = (WT_SESSION_IMPL *)c->session; txn = &session->txn; lsm_tree = clsm->lsm_tree; chunk = NULL; ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor); ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw"; ckpt_cfg[2] = NULL; /* Copy the key, so we don't lose the cursor position. */ if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key)) WT_RET(__wt_buf_set( session, &c->key, c->key.data, c->key.size)); F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); if (update) { if (txn->isolation == TXN_ISO_SNAPSHOT) F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT); } else F_SET(clsm, WT_CLSM_OPEN_READ); WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0)); locked = 1; /* * If there is no in-memory chunk in the tree for an update operation, * create one. * * !!! * It is exceeding unlikely that we get here at all, but if there is a * transaction in progress and it rolls back, it would leave the * metadata inconsistent. */ if (update && (lsm_tree->nchunks == 0 || (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL || F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))) { /* Release our lock because switch will get a write lock. */ locked = 0; WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree)); WT_ERR(__wt_lsm_tree_switch(session, lsm_tree)); WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0)); locked = 1; } F_SET(session, WT_SESSION_NO_CACHE_CHECK); /* Merge cursors have already figured out how many chunks they need. */ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { nchunks = clsm->nchunks; ngood = 0; /* * We may have raced with another merge completing. Check that * we're starting at the right offset in the chunk array. */ if (start_chunk >= lsm_tree->nchunks || lsm_tree->chunk[start_chunk]->id != start_id) { for (start_chunk = 0; start_chunk < lsm_tree->nchunks; start_chunk++) { chunk = lsm_tree->chunk[start_chunk]; if (chunk->id == start_id) break; } /* We have to find the start chunk: merge locked it. */ WT_ASSERT(session, start_chunk < lsm_tree->nchunks); } WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks); } else { nchunks = lsm_tree->nchunks; /* * If we are only opening the cursor for updates, only open the * primary chunk, plus any other chunks that might be required * to detect snapshot isolation conflicts. */ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) WT_ERR(__wt_realloc_def(session, &clsm->txnid_alloc, nchunks, &clsm->txnid_max)); if (F_ISSET(clsm, WT_CLSM_OPEN_READ)) ngood = nupdates = 0; else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { /* * Keep going until all updates in the next * chunk are globally visible. Copy the maximum * transaction IDs into the cursor as we go. */ for (ngood = nchunks - 1, nupdates = 1; ngood > 0; ngood--, nupdates++) { chunk = lsm_tree->chunk[ngood - 1]; clsm->txnid_max[ngood - 1] = chunk->txnid_max; if (__wt_txn_visible_all( session, chunk->txnid_max)) break; } } else { nupdates = 1; ngood = nchunks - 1; } /* Check how many cursors are already open. */ for (cp = clsm->cursors + ngood; ngood < clsm->nchunks && ngood < nchunks; cp++, ngood++) { chunk = lsm_tree->chunk[ngood]; /* If the cursor isn't open yet, we're done. */ if (*cp == NULL) break; /* Easy case: the URIs don't match. */ if (strcmp((*cp)->uri, chunk->uri) != 0) break; /* Make sure the checkpoint config matches. */ checkpoint = ((WT_CURSOR_BTREE *)*cp)-> btree->dhandle->checkpoint; if (checkpoint == NULL && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) break; /* Make sure the Bloom config matches. */ if (clsm->blooms[ngood] == NULL && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) break; } /* Spurious generation bump? */ if (ngood == clsm->nchunks && clsm->nchunks == nchunks) { clsm->dsk_gen = lsm_tree->dsk_gen; goto err; } /* * Close any cursors we no longer need. * * Drop the LSM tree lock while we do this: if the cache is * full, we may block while closing a cursor. Save the * generation number and retry if it has changed under us. */ if (clsm->cursors != NULL && (ngood < clsm->nchunks || (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0))) { saved_gen = lsm_tree->dsk_gen; locked = 0; WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree)); if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0) WT_ERR(__clsm_close_cursors( clsm, 0, nchunks - nupdates)); WT_ERR(__clsm_close_cursors( clsm, ngood, clsm->nchunks)); WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0)); locked = 1; if (lsm_tree->dsk_gen != saved_gen) goto retry; } /* Detach from our old primary. */ clsm->primary_chunk = NULL; clsm->current = NULL; } WT_ERR(__wt_realloc_def(session, &clsm->bloom_alloc, nchunks, &clsm->blooms)); WT_ERR(__wt_realloc_def(session, &clsm->cursor_alloc, nchunks, &clsm->cursors)); clsm->nchunks = nchunks; /* Open the cursors for chunks that have changed. */ for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Copy the maximum transaction ID. */ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) clsm->txnid_max[i] = chunk->txnid_max; /* * Read from the checkpoint if the file has been written. * Once all cursors switch, the in-memory tree can be evicted. */ WT_ASSERT(session, *cp == NULL); ret = __wt_open_cursor(session, chunk->uri, c, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? ckpt_cfg : NULL, cp); /* * XXX kludge: we may have an empty chunk where no checkpoint * was written. If so, try to open the ordinary handle on that * chunk instead. */ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { ret = __wt_open_cursor( session, chunk->uri, c, NULL, cp); if (ret == 0) chunk->empty = 1; } WT_ERR(ret); if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && !F_ISSET(clsm, WT_CLSM_MERGE)) WT_ERR(__wt_bloom_open(session, chunk->bloom_uri, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, c, &clsm->blooms[i])); /* Child cursors always use overwrite and raw mode. */ F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW); } /* The last chunk is our new primary. */ if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { clsm->primary_chunk = chunk; primary = clsm->cursors[clsm->nchunks - 1]; WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)(primary))->btree, __wt_btree_evictable(session, 0)); } clsm->dsk_gen = lsm_tree->dsk_gen; err: F_CLR(session, WT_SESSION_NO_CACHE_CHECK); #ifdef HAVE_DIAGNOSTIC /* Check that all cursors are open as expected. */ if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) { for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Make sure the cursor is open. */ WT_ASSERT(session, *cp != NULL); /* Easy case: the URIs should match. */ WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0); /* Make sure the checkpoint config matches. */ checkpoint = ((WT_CURSOR_BTREE *)*cp)-> btree->dhandle->checkpoint; WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? checkpoint != NULL : checkpoint == NULL); /* Make sure the Bloom config matches. */ WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && !F_ISSET(clsm, WT_CLSM_MERGE)) ? clsm->blooms[i] != NULL : clsm->blooms[i] == NULL); } } #endif if (locked) WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); return (ret); }
/* * __clsm_open_cursors -- * Open cursors for the current set of files. */ static int __clsm_open_cursors( WT_CURSOR_LSM *clsm, bool update, u_int start_chunk, uint32_t start_id) { WT_BTREE *btree; WT_CURSOR *c, **cp, *primary; WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; WT_SESSION_IMPL *session; WT_TXN *txn; const char *checkpoint, *ckpt_cfg[3]; uint64_t saved_gen; u_int i, nchunks, ngood, nupdates; u_int close_range_end, close_range_start; bool locked; c = &clsm->iface; session = (WT_SESSION_IMPL *)c->session; txn = &session->txn; chunk = NULL; locked = false; lsm_tree = clsm->lsm_tree; /* * Ensure that any snapshot update has cursors on the right set of * chunks to guarantee visibility is correct. */ if (update && txn->isolation == WT_ISO_SNAPSHOT) F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT); /* * Query operations need a full set of cursors. Overwrite cursors * do queries in service of updates. */ if (!update || !F_ISSET(c, WT_CURSTD_OVERWRITE)) F_SET(clsm, WT_CLSM_OPEN_READ); if (lsm_tree->nchunks == 0) return (0); ckpt_cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor); ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw"; ckpt_cfg[2] = NULL; /* * If the key is pointing to memory that is pinned by a chunk * cursor, take a copy before closing cursors. */ if (F_ISSET(c, WT_CURSTD_KEY_INT)) WT_CURSOR_NEEDKEY(c); F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV); WT_RET(__wt_lsm_tree_readlock(session, lsm_tree)); locked = true; /* Merge cursors have already figured out how many chunks they need. */ retry: if (F_ISSET(clsm, WT_CLSM_MERGE)) { nchunks = clsm->nchunks; ngood = 0; /* * We may have raced with another merge completing. Check that * we're starting at the right offset in the chunk array. */ if (start_chunk >= lsm_tree->nchunks || lsm_tree->chunk[start_chunk]->id != start_id) { for (start_chunk = 0; start_chunk < lsm_tree->nchunks; start_chunk++) { chunk = lsm_tree->chunk[start_chunk]; if (chunk->id == start_id) break; } /* We have to find the start chunk: merge locked it. */ WT_ASSERT(session, start_chunk < lsm_tree->nchunks); } WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks); } else { nchunks = lsm_tree->nchunks; /* * If we are only opening the cursor for updates, only open the * primary chunk, plus any other chunks that might be required * to detect snapshot isolation conflicts. */ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) WT_ERR(__wt_realloc_def(session, &clsm->txnid_alloc, nchunks, &clsm->switch_txn)); if (F_ISSET(clsm, WT_CLSM_OPEN_READ)) ngood = nupdates = 0; else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) { /* * Keep going until all updates in the next * chunk are globally visible. Copy the maximum * transaction IDs into the cursor as we go. */ for (ngood = nchunks - 1, nupdates = 1; ngood > 0; ngood--, nupdates++) { chunk = lsm_tree->chunk[ngood - 1]; clsm->switch_txn[ngood - 1] = chunk->switch_txn; if (__wt_txn_visible_all( session, chunk->switch_txn)) break; } } else { nupdates = 1; ngood = nchunks - 1; } /* Check how many cursors are already open. */ for (cp = clsm->cursors + ngood; ngood < clsm->nchunks && ngood < nchunks; cp++, ngood++) { chunk = lsm_tree->chunk[ngood]; /* If the cursor isn't open yet, we're done. */ if (*cp == NULL) break; /* Easy case: the URIs don't match. */ if (strcmp((*cp)->uri, chunk->uri) != 0) break; /* Make sure the checkpoint config matches. */ checkpoint = ((WT_CURSOR_BTREE *)*cp)-> btree->dhandle->checkpoint; if (checkpoint == NULL && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) break; /* Make sure the Bloom config matches. */ if (clsm->blooms[ngood] == NULL && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) break; } /* Spurious generation bump? */ if (ngood == clsm->nchunks && clsm->nchunks == nchunks) { clsm->dsk_gen = lsm_tree->dsk_gen; goto err; } /* * Close any cursors we no longer need. * * Drop the LSM tree lock while we do this: if the cache is * full, we may block while closing a cursor. Save the * generation number and retry if it has changed under us. */ if (clsm->cursors != NULL && ngood < clsm->nchunks) { close_range_start = ngood; close_range_end = clsm->nchunks; } else if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0 ) { close_range_start = 0; close_range_end = WT_MIN(nchunks, clsm->nchunks); if (close_range_end > nupdates) close_range_end -= nupdates; else close_range_end = 0; WT_ASSERT(session, ngood >= close_range_end); } else { close_range_end = 0; close_range_start = 0; } if (close_range_end > close_range_start) { saved_gen = lsm_tree->dsk_gen; locked = false; WT_ERR(__wt_lsm_tree_readunlock(session, lsm_tree)); WT_ERR(__clsm_close_cursors( clsm, close_range_start, close_range_end)); WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree)); locked = true; if (lsm_tree->dsk_gen != saved_gen) goto retry; } /* Detach from our old primary. */ clsm->primary_chunk = NULL; clsm->current = NULL; } WT_ERR(__wt_realloc_def(session, &clsm->bloom_alloc, nchunks, &clsm->blooms)); WT_ERR(__wt_realloc_def(session, &clsm->cursor_alloc, nchunks, &clsm->cursors)); clsm->nchunks = nchunks; /* Open the cursors for chunks that have changed. */ for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Copy the maximum transaction ID. */ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) clsm->switch_txn[i] = chunk->switch_txn; /* * Read from the checkpoint if the file has been written. * Once all cursors switch, the in-memory tree can be evicted. */ WT_ASSERT(session, *cp == NULL); ret = __wt_open_cursor(session, chunk->uri, c, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? ckpt_cfg : NULL, cp); /* * XXX kludge: we may have an empty chunk where no checkpoint * was written. If so, try to open the ordinary handle on that * chunk instead. */ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { ret = __wt_open_cursor( session, chunk->uri, c, NULL, cp); if (ret == 0) chunk->empty = 1; } WT_ERR(ret); /* * Setup all cursors other than the primary to only do conflict * checks on insert operations. This allows us to execute * inserts on non-primary chunks as a way of checking for * write conflicts with concurrent updates. */ if (i != nchunks - 1) (*cp)->insert = __wt_curfile_update_check; if (!F_ISSET(clsm, WT_CLSM_MERGE) && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) WT_ERR(__wt_bloom_open(session, chunk->bloom_uri, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, c, &clsm->blooms[i])); /* Child cursors always use overwrite and raw mode. */ F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW); } /* The last chunk is our new primary. */ if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && chunk->switch_txn == WT_TXN_NONE) { clsm->primary_chunk = chunk; primary = clsm->cursors[clsm->nchunks - 1]; /* * Disable eviction for the in-memory chunk. Also clear the * bulk load flag here, otherwise eviction will be enabled by * the first update. */ btree = ((WT_CURSOR_BTREE *)(primary))->btree; if (btree->bulk_load_ok) { btree->bulk_load_ok = false; WT_WITH_BTREE(session, btree, __wt_btree_evictable(session, false)); } } clsm->dsk_gen = lsm_tree->dsk_gen; err: #ifdef HAVE_DIAGNOSTIC /* Check that all cursors are open as expected. */ if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) { for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Make sure the cursor is open. */ WT_ASSERT(session, *cp != NULL); /* Easy case: the URIs should match. */ WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0); /* Make sure the checkpoint config matches. */ checkpoint = ((WT_CURSOR_BTREE *)*cp)-> btree->dhandle->checkpoint; WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? checkpoint != NULL : checkpoint == NULL); /* Make sure the Bloom config matches. */ WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && !F_ISSET(clsm, WT_CLSM_MERGE)) ? clsm->blooms[i] != NULL : clsm->blooms[i] == NULL); } } #endif if (locked) WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree)); return (ret); }