/*
 * __wt_clsm_await_switch --
 *	Wait for a switch to have completed in the LSM tree.
 */
int
__wt_clsm_await_switch(WT_CURSOR_LSM *clsm)
{
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;
    int waited;

    lsm_tree = clsm->lsm_tree;
    session = (WT_SESSION_IMPL *)clsm->iface.session;

    /*
     * If there is no primary chunk, or a chunk has overflowed the hard
     * limit, which either means a worker thread has fallen behind or there
     * has just been a user-level checkpoint, wait until the tree changes.
     *
     * We used to switch chunks in the application thread here, but that is
     * problematic because there is a transaction in progress and it could
     * roll back, leaving the metadata inconsistent.
     */
    for (waited = 0;
        lsm_tree->nchunks == 0 || clsm->dsk_gen == lsm_tree->dsk_gen;
        ++waited) {
        if (waited % 1000 == 0)
            WT_RET(__wt_lsm_manager_push_entry(
                session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
        __wt_sleep(0, 10);
    }
    return (0);
}
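/*
 * Illustrative sketch (not WiredTiger code): __wt_clsm_await_switch() polls a
 * generation number and, every thousandth iteration, re-queues a switch work
 * unit in case the earlier request was lost or the worker has fallen behind.
 * The standalone loop below shows that "wait, but keep nudging the worker"
 * shape with C11 atomics; nudge_worker() and tree_gen are hypothetical names.
 */
#include <stdatomic.h>
#include <time.h>

static atomic_ulong tree_gen;    /* bumped by a worker when it switches */

extern void nudge_worker(void);    /* hypothetical: queue another request */

static void
await_generation_change(unsigned long seen_gen)
{
    struct timespec ten_us = { 0, 10 * 1000 };
    unsigned waited;

    for (waited = 0; atomic_load(&tree_gen) == seen_gen; ++waited) {
        if (waited % 1000 == 0)
            nudge_worker();    /* don't rely on a single queued request */
        (void)nanosleep(&ten_us, NULL);
    }
}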
/*
 * __wt_attach --
 *	A routine to wait for the debugger to attach.
 */
void
__wt_attach(WT_SESSION_IMPL *session)
{
#ifdef HAVE_ATTACH
    __wt_errx(session,
        "process ID %" PRIdMAX ": waiting for debugger...",
        (intmax_t)getpid());

    /* Sleep forever, the debugger will interrupt us when it attaches. */
    for (;;)
        __wt_sleep(100, 0);
#else
    WT_UNUSED(session);
#endif
}
/*
 * compact --
 *	Periodically do a compaction operation.
 */
WT_THREAD_RET
compact(void *arg)
{
    WT_CONNECTION *conn;
    WT_DECL_RET;
    WT_SESSION *session;
    u_int period;

    (void)(arg);

    /* Compaction isn't supported for all data sources. */
    if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
        return (WT_THREAD_RET_VALUE);

    /* Open a session. */
    conn = g.wts_conn;
    testutil_check(conn->open_session(conn, NULL, NULL, &session));

    /*
     * Perform the first compaction within 15 seconds (so we get at least
     * one done), and then at 23-second intervals.
     */
    for (period = mmrand(NULL, 1, 15);; period = 23) {
        /* Sleep for short periods so we don't make the run wait. */
        while (period > 0 && !g.workers_finished) {
            --period;
            __wt_sleep(1, 0);
        }
        if (g.workers_finished)
            break;

        /*
         * Compact can return EBUSY if it's concurrent with an alter, if
         * there is eviction pressure, or if we collide with checkpoints.
         *
         * Compact returns ETIMEDOUT if the compaction doesn't finish in
         * some number of seconds. We don't configure a timeout and
         * occasionally exceed the default of 1200 seconds.
         */
        ret = session->compact(session, g.uri, NULL);
        if (ret != 0 && ret != EBUSY &&
            ret != ETIMEDOUT && ret != WT_ROLLBACK)
            testutil_die(ret, "session.compact");
    }

    testutil_check(session->close(session, NULL));
    return (WT_THREAD_RET_VALUE);
}
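/*
 * Illustrative sketch (not WiredTiger code): the loop above sleeps in
 * one-second slices so a long interval between compactions doesn't delay
 * shutdown when the workers finish. The standalone example below shows the
 * same pattern using only C11 atomics and POSIX nanosleep(); the names
 * interruptible_wait() and stop_requested are hypothetical.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <time.h>

static atomic_bool stop_requested;

/* Sleep for "seconds", waking once a second to notice a stop request. */
static bool
interruptible_wait(unsigned seconds)
{
    struct timespec one_second = { 1, 0 };

    while (seconds > 0 && !atomic_load(&stop_requested)) {
        (void)nanosleep(&one_second, NULL);
        --seconds;
    }
    return (!atomic_load(&stop_requested));    /* true if the full wait ran */
}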
/*
 * __wt_log_slot_wait --
 *	Wait for slot leader to allocate log area and tell us our log offset.
 */
int
__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
    int yield_count;

    yield_count = 0;
    WT_UNUSED(session);

    while (slot->slot_state > WT_LOG_SLOT_DONE)
        if (++yield_count < 1000)
            __wt_yield();
        else
            __wt_sleep(0, 200);
    return (0);
}
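/*
 * Illustrative sketch (not WiredTiger code): __wt_log_slot_wait() spins with a
 * thread yield for roughly a thousand iterations and only then starts issuing
 * short sleeps, so a brief wait stays cheap while a long wait stops burning
 * CPU. A minimal standalone version of that backoff, assuming a hypothetical
 * done flag, using sched_yield() and nanosleep():
 */
#include <sched.h>
#include <stdatomic.h>
#include <time.h>

static atomic_int done;

static void
wait_for_done(void)
{
    struct timespec pause = { 0, 200 * 1000 };    /* 200 microseconds */
    int spins = 0;

    while (!atomic_load(&done))
        if (++spins < 1000)
            (void)sched_yield();    /* short wait: give up the CPU slice */
        else
            (void)nanosleep(&pause, NULL);    /* long wait: stop spinning */
}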
/*
 * __wt_lsm_compact --
 *	Compact an LSM tree called via __wt_schema_worker.
 */
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
{
    WT_DECL_RET;
    WT_LSM_TREE *lsm_tree;
    uint64_t last_merge_progressing;
    time_t begin, end;

    /*
     * This function is applied to all matching sources: ignore anything
     * that is not an LSM tree.
     */
    if (!WT_PREFIX_MATCH(name, "lsm:"))
        return (0);

    /* Tell __wt_schema_worker not to look inside the LSM tree. */
    *skip = 1;

    WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));

    if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE) ||
        lsm_tree->merge_threads == 0)
        WT_RET_MSG(session, EINVAL,
            "LSM compaction requires active merge threads");

    WT_RET(__wt_seconds(session, &begin));

    F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);

    /* Wake up the merge threads. */
    WT_RET(__wt_cond_signal(session, lsm_tree->work_cond));

    /* Now wait for merge activity to stop. */
    do {
        last_merge_progressing = lsm_tree->merge_progressing;
        __wt_sleep(1, 0);
        WT_RET(__wt_seconds(session, &end));
        if (session->compact->max_time > 0 &&
            session->compact->max_time < (uint64_t)(end - begin))
            WT_ERR(ETIMEDOUT);
    } while (lsm_tree->merge_progressing != last_merge_progressing &&
        lsm_tree->nchunks > 1);

err:    F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);

    return (ret);
}
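/*
 * Illustrative sketch (not WiredTiger code): the polling loop above compares
 * elapsed wall-clock time against a caller-supplied maximum and gives up with
 * ETIMEDOUT once the budget is spent. A minimal standalone helper with the
 * same shape; wait_with_deadline() is a hypothetical name and the "done"
 * predicate is supplied by the caller.
 */
#include <errno.h>
#include <stdbool.h>
#include <time.h>
#include <unistd.h>

static int
wait_with_deadline(bool (*done)(void), unsigned max_seconds)
{
    time_t begin, now;

    begin = time(NULL);
    while (!done()) {
        sleep(1);    /* poll once a second, as the loop above does */
        now = time(NULL);
        if (max_seconds > 0 && (unsigned)(now - begin) > max_seconds)
            return (ETIMEDOUT);
    }
    return (0);
}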
/*
 * __compact_checkpoint --
 *	Perform a checkpoint for compaction.
 */
static int
__compact_checkpoint(WT_SESSION_IMPL *session)
{
    WT_DECL_RET;
    WT_TXN_GLOBAL *txn_global;
    uint64_t txn_gen;

    /*
     * Force compaction checkpoints: we don't want to skip it because the
     * work we need to have done is done in the underlying block manager.
     */
    const char *checkpoint_cfg[] = {
        WT_CONFIG_BASE(session, WT_SESSION_checkpoint), "force=1", NULL };

    /* Checkpoints take a lot of time, check if we've run out. */
    WT_RET(__wt_session_compact_check_timeout(session));

    if ((ret = __wt_txn_checkpoint(session, checkpoint_cfg, false)) == 0)
        return (0);
    WT_RET_BUSY_OK(ret);

    /*
     * If there's a checkpoint running, wait for it to complete, checking if
     * we're out of time. If there's no checkpoint running or the checkpoint
     * generation number changes, the checkpoint blocking us has completed.
     */
    txn_global = &S2C(session)->txn_global;
    for (txn_gen = __wt_gen(session, WT_GEN_CHECKPOINT);;) {
        /*
         * This loop only checks objects that are declared volatile,
         * therefore no barriers are needed.
         */
        if (!txn_global->checkpoint_running ||
            txn_gen != __wt_gen(session, WT_GEN_CHECKPOINT))
            break;

        WT_RET(__wt_session_compact_check_timeout(session));
        __wt_sleep(2, 0);
    }

    return (0);
}
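/*
 * Illustrative sketch (not WiredTiger code): the loop above decides that the
 * checkpoint blocking us has finished either because the "running" flag
 * dropped or because the generation counter moved past the value we sampled.
 * A standalone version of that idea with C11 atomics; checkpoint_running,
 * checkpoint_gen and wait_for_blocking_checkpoint() are hypothetical names.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <time.h>

static atomic_bool checkpoint_running;
static atomic_ulong checkpoint_gen;

static void
wait_for_blocking_checkpoint(void)
{
    struct timespec two_seconds = { 2, 0 };
    unsigned long start_gen;

    /* Sample the generation first, then poll until it moves or the flag clears. */
    start_gen = atomic_load(&checkpoint_gen);
    while (atomic_load(&checkpoint_running) &&
        atomic_load(&checkpoint_gen) == start_gen)
        (void)nanosleep(&two_seconds, NULL);
}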
/*
 * thread_ts_run --
 *	Runner function for a timestamp thread.
 */
static WT_THREAD_RET
thread_ts_run(void *arg)
{
    WT_DECL_RET;
    WT_SESSION *session;
    THREAD_DATA *td;
    char tscfg[64], ts_string[WT_TS_HEX_STRING_SIZE];

    td = (THREAD_DATA *)arg;
    testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));

    /* Update the oldest timestamp every millisecond. */
    for (;;) {
        /*
         * We get the last committed timestamp periodically in order to
         * update the oldest timestamp; that requires locking out
         * transactional ops that set or query a timestamp.
         */
        testutil_check(pthread_rwlock_wrlock(&ts_lock));
        ret = td->conn->query_timestamp(
            td->conn, ts_string, "get=all_committed");
        testutil_check(pthread_rwlock_unlock(&ts_lock));
        testutil_assert(ret == 0 || ret == WT_NOTFOUND);
        if (ret == 0) {
            /*
             * Set both the oldest and stable timestamp so that we
             * don't need to maintain read availability at older
             * timestamps.
             */
            testutil_check(__wt_snprintf(
                tscfg, sizeof(tscfg),
                "oldest_timestamp=%s,stable_timestamp=%s",
                ts_string, ts_string));
            testutil_check(
                td->conn->set_timestamp(td->conn, tscfg));
        }
        __wt_sleep(0, 1000);
    }
    /* NOTREACHED */
}
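/*
 * Illustrative sketch (not WiredTiger code): the thread above briefly takes a
 * write lock to serialize against readers of the same state, does a small
 * piece of work, then sleeps before repeating. The standalone pthread version
 * below shows that shape; maintenance_thread(), state_lock and shared_counter
 * are hypothetical names.
 */
#include <pthread.h>
#include <time.h>

static pthread_rwlock_t state_lock = PTHREAD_RWLOCK_INITIALIZER;
static unsigned long shared_counter;

static void *
maintenance_thread(void *arg)
{
    struct timespec one_ms = { 0, 1000 * 1000 };

    (void)arg;
    for (;;) {
        /* Exclude readers while the shared state is updated. */
        (void)pthread_rwlock_wrlock(&state_lock);
        ++shared_counter;
        (void)pthread_rwlock_unlock(&state_lock);

        (void)nanosleep(&one_ms, NULL);
    }
    return (NULL);    /* NOTREACHED */
}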
/*
 * __logmgr_force_ckpt --
 *	Force a checkpoint out, waiting until the checkpoint LSN in the log
 *	is up to the given log number.
 */
static int
__logmgr_force_ckpt(WT_SESSION_IMPL *session, uint32_t lognum)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    WT_SESSION_IMPL *tmp_session;
    int yield;

    conn = S2C(session);
    log = conn->log;
    yield = 0;
    WT_RET(__wt_open_internal_session(conn,
        "compatibility-reconfig", true, 0, &tmp_session));
    while (log->ckpt_lsn.l.file < lognum) {
        /*
         * Force a checkpoint to be written in the new log file and
         * force the archiving of all previous log files. We do the
         * checkpoint in the loop because the checkpoint LSN in the
         * log record could still reflect the previous log file in
         * cases where the write LSN has not yet advanced into the
         * new log file, due to another group of threads still in
         * progress with their slot copies or writes.
         */
        WT_RET(tmp_session->iface.checkpoint(
            &tmp_session->iface, "force=1"));
        WT_RET(WT_SESSION_CHECK_PANIC(tmp_session));
        /*
         * Only sleep in the rare case that we had to come through
         * this loop more than once.
         */
        if (yield++) {
            WT_STAT_CONN_INCR(session, log_force_ckpt_sleep);
            __wt_sleep(0, WT_THOUSAND);
        }
    }
    WT_RET(tmp_session->iface.close(&tmp_session->iface, NULL));
    return (0);
}
/*
 * __wt_ref_out --
 *	Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
{
    /*
     * A version of the page-out function that allows us to make additional
     * diagnostic checks.
     *
     * The WT_REF cannot be the eviction thread's location.
     */
    WT_ASSERT(session, S2BT(session)->evict_ref != ref);

#ifdef HAVE_DIAGNOSTIC
    {
    WT_HAZARD *hp;
    int i;
    /*
     * Make sure no other thread has a hazard pointer on the page we are
     * about to discard. This is complicated by the fact that readers
     * publish their hazard pointer before re-checking the page state, so
     * our check can race with readers without indicating a real problem.
     * Wait for up to a second for hazard pointers to be cleared.
     */
    for (hp = NULL, i = 0; i < 100; i++) {
        if ((hp = __wt_hazard_check(session, ref)) == NULL)
            break;
        __wt_sleep(0, 10000);
    }
    if (hp != NULL)
        __wt_errx(session,
            "discarded page has hazard pointer: (%p: %s, line %d)",
            (void *)hp->ref, hp->file, hp->line);
    WT_ASSERT(session, hp == NULL);
    }
#endif

    __wt_page_out(session, &ref->page);
}
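/*
 * Illustrative sketch (not WiredTiger code): the diagnostic block above polls
 * up to 100 times, 10 milliseconds apart, waiting for racing readers to drop
 * their hazard pointers before the page memory is discarded. A standalone
 * version of that bounded drain, assuming a hypothetical atomic reference
 * count in place of the hazard-pointer table:
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <time.h>

/* Wait up to ~1 second for readers to drain; return false if some remain. */
static bool
drain_readers(atomic_int *active_readers)
{
    struct timespec ten_ms = { 0, 10 * 1000 * 1000 };
    int i;

    for (i = 0; i < 100; i++) {
        if (atomic_load(active_readers) == 0)
            return (true);
        (void)nanosleep(&ten_ms, NULL);
    }
    return (false);    /* caller decides whether this is fatal */
}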
/*
 * __wt_lsm_compact --
 *	Compact an LSM tree called via __wt_schema_worker.
 */
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    time_t begin, end;
    uint64_t progress;
    int i, compacting, flushing, locked, ref;

    compacting = flushing = locked = ref = 0;
    chunk = NULL;

    /*
     * This function is applied to all matching sources: ignore anything
     * that is not an LSM tree.
     */
    if (!WT_PREFIX_MATCH(name, "lsm:"))
        return (0);

    /* Tell __wt_schema_worker not to look inside the LSM tree. */
    *skip = 1;

    WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));

    if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
        WT_ERR_MSG(session, EINVAL,
            "LSM compaction requires active merge threads");

    WT_ERR(__wt_seconds(session, &begin));

    /*
     * Compacting has two distinct phases.
     * 1. All in-memory chunks up to and including the current chunk must
     *    be flushed. Normally, the flush code does not flush the last,
     *    in-use chunk, so we set a force flag to include that last chunk.
     *    We monitor the state of the last chunk and periodically push
     *    another forced flush work unit until it is complete.
     * 2. After all flushing is done, we move on to the merging phase for
     *    compaction. Again, we monitor the state and continue to push
     *    merge work units until all merging is done.
     */

    /* Lock the tree: single-thread compaction. */
    WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
    locked = 1;

    /* Clear any merge throttle: compact throws out that calculation. */
    lsm_tree->merge_throttle = 0;
    lsm_tree->merge_aggressiveness = 0;
    progress = lsm_tree->merge_progressing;

    /* If another thread started a compact on this tree, we're done. */
    if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
        goto err;

    /*
     * Set the switch transaction on the current chunk, if it
     * hasn't been set before. This prevents further writes, so it
     * can be flushed by the checkpoint worker.
     */
    if (lsm_tree->nchunks > 0 &&
        (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL) {
        if (chunk->switch_txn == WT_TXN_NONE)
            chunk->switch_txn = __wt_txn_new_id(session);
        /*
         * If we have a chunk, we want to look for it to be on-disk.
         * So we need to add a reference to keep it available.
         */
        (void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
        ref = 1;
    }

    locked = 0;
    WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));

    if (chunk != NULL) {
        WT_ERR(__wt_verbose(session, WT_VERB_LSM,
            "Compact force flush %s flags 0x%" PRIx32
            " chunk %u flags 0x%" PRIx32,
            name, lsm_tree->flags, chunk->id, chunk->flags));
        flushing = 1;
        /*
         * Make sure the in-memory chunk gets flushed, but don't push a
         * switch, because we don't want to create a new in-memory
         * chunk if the tree is being used read-only now.
         */
        WT_ERR(__wt_lsm_manager_push_entry(session,
            WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
    } else {
        /*
         * If there is no chunk to flush, go straight to the
         * compacting state.
         */
        compacting = 1;
        progress = lsm_tree->merge_progressing;
        F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
        WT_ERR(__wt_verbose(session, WT_VERB_LSM,
            "COMPACT: Start compacting %s", lsm_tree->name));
    }

    /* Wait for the work unit queues to drain. */
    while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
        /*
         * The flush flag is cleared when the chunk has been flushed.
         * Continue to push forced flushes until the chunk is on disk.
         * Once it is on disk, move to the compacting phase.
         */
        if (flushing) {
            WT_ASSERT(session, chunk != NULL);
            if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
                WT_ERR(__wt_verbose(session, WT_VERB_LSM,
                    "Compact flush done %s chunk %u.  "
                    "Start compacting progress %" PRIu64,
                    name, chunk->id,
                    lsm_tree->merge_progressing));
                (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
                flushing = ref = 0;
                compacting = 1;
                F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
                progress = lsm_tree->merge_progressing;
            } else {
                WT_ERR(__wt_verbose(session, WT_VERB_LSM,
                    "Compact flush retry %s chunk %u",
                    name, chunk->id));
                WT_ERR(__wt_lsm_manager_push_entry(session,
                    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
            }
        }

        /*
         * The compacting flag is cleared when no merges can be done.
         * Ensure that we push through some aggressive merges before
         * stopping, otherwise we might not do merges that would
         * span chunks with different generations.
         */
        if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
            if (lsm_tree->merge_aggressiveness < 10 ||
                (progress < lsm_tree->merge_progressing) ||
                lsm_tree->merge_syncing) {
                progress = lsm_tree->merge_progressing;
                F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
                lsm_tree->merge_aggressiveness = 10;
            } else
                break;
        }
        __wt_sleep(1, 0);
        WT_ERR(__wt_seconds(session, &end));
        if (session->compact->max_time > 0 &&
            session->compact->max_time < (uint64_t)(end - begin)) {
            WT_ERR(ETIMEDOUT);
        }
        /*
         * Push merge operations while they are still getting work
         * done. If we are pushing merges, make sure they are
         * aggressive, to avoid duplicating effort.
         */
        if (compacting)
#define COMPACT_PARALLEL_MERGES 5
            for (i = lsm_tree->queue_ref;
                i < COMPACT_PARALLEL_MERGES; i++) {
                lsm_tree->merge_aggressiveness = 10;
                WT_ERR(__wt_lsm_manager_push_entry(
                    session, WT_LSM_WORK_MERGE, 0, lsm_tree));
            }
    }
err:
    /* Ensure anything we set is cleared. */
    if (ref)
        (void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
    if (compacting) {
        F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
        lsm_tree->merge_aggressiveness = 0;
    }
    if (locked)
        WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));

    WT_TRET(__wt_verbose(session, WT_VERB_LSM,
        "Compact %s complete, return %d", name, ret));

    __wt_lsm_tree_release(session, lsm_tree);
    return (ret);
}
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; int busy, force_attempts, oldgen; for (force_attempts = oldgen = 0, wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, attempt to read it. * Make sure there is space in the cache. */ WT_RET(__wt_cache_full_check(session)); WT_RET(__wt_cache_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); WT_STAT_FAST_CONN_INCR(session, page_read_blocked); break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory: get a hazard pointer, update * the page's LRU and return. The expected reason we * can't get a hazard pointer is because the page is * being evicted; yield and try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } page = ref->page; WT_ASSERT(session, page != NULL); /* * Forcibly evict pages that are too big. */ if (force_attempts < 10 && __evict_force_check(session, page, flags)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ if (ret == EBUSY) { ret = 0; wait_cnt += 1000; WT_STAT_FAST_CONN_INCR(session, page_forcible_evict_blocked); break; } else WT_RET(ret); /* * The result of a successful forced eviction * is a page-state transition (potentially to * an in-memory page we can use, or a restart * return for our caller), continue the outer * page-acquisition loop. */ continue; } /* Check if we need an autocommit transaction. */ if ((ret = __wt_txn_autocommit_check(session)) != 0) { WT_TRET(__wt_hazard_clear(session, page)); return (ret); } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_set(session); return (0); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (++wait_cnt < 1000) __wt_yield(); else { sleep_cnt = WT_MIN(wait_cnt, 10000); wait_cnt *= 2; WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } } }
/* * Regularly create, open a cursor and drop a table. * Measure how long each step takes, and flag an error if it exceeds the * configured maximum. */ static void * cycle_idle_tables(void *arg) { struct timespec start, stop; CONFIG *cfg; WT_SESSION *session; WT_CURSOR *cursor; int cycle_count, ret; char uri[512]; cfg = (CONFIG *)arg; cycle_count = 0; if ((ret = cfg->conn->open_session( cfg->conn, NULL, cfg->sess_config, &session)) != 0) { lprintf(cfg, ret, 0, "Error opening a session on %s", cfg->home); return (NULL); } for (cycle_count = 0; cfg->idle_cycle_run; ++cycle_count) { snprintf(uri, 512, "%s_cycle%07d", cfg->uris[0], cycle_count); /* Don't busy cycle in this loop. */ __wt_sleep(1, 0); /* Setup a start timer. */ if ((ret = __wt_epoch(NULL, &start)) != 0) { lprintf(cfg, ret, 0, "Get time failed in cycle_idle_tables."); cfg->error = ret; return (NULL); } /* Create a table. */ if ((ret = session->create( session, uri, cfg->table_config)) != 0) { if (ret == EBUSY) continue; lprintf(cfg, ret, 0, "Table create failed in cycle_idle_tables."); cfg->error = ret; return (NULL); } if (check_timing(cfg, "create", start, &stop) != 0) return (NULL); start = stop; /* Open and close cursor. */ if ((ret = session->open_cursor( session, uri, NULL, NULL, &cursor)) != 0) { lprintf(cfg, ret, 0, "Cursor open failed in cycle_idle_tables."); cfg->error = ret; return (NULL); } if ((ret = cursor->close(cursor)) != 0) { lprintf(cfg, ret, 0, "Cursor close failed in cycle_idle_tables."); cfg->error = ret; return (NULL); } if (check_timing(cfg, "cursor", start, &stop) != 0) return (NULL); start = stop; /* * Drop the table. Keep retrying on EBUSY failure - it is an * expected return when checkpoints are happening. */ while ((ret = session->drop( session, uri, "force,checkpoint_wait=false")) == EBUSY) __wt_sleep(1, 0); if (ret != 0 && ret != EBUSY) { lprintf(cfg, ret, 0, "Table drop failed in cycle_idle_tables."); cfg->error = ret; return (NULL); } if (check_timing(cfg, "drop", start, &stop) != 0) return (NULL); } return (NULL); }
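/*
 * Illustrative sketch (not WiredTiger code): the drop loop above treats EBUSY
 * as an expected, transient answer while checkpoints are running and simply
 * retries after a short sleep. A standalone helper with the same shape; the
 * retry_while_busy() name and the operation callback are hypothetical, and
 * EBUSY is assumed to be the only retriable error.
 */
#include <errno.h>
#include <unistd.h>

static int
retry_while_busy(int (*op)(void *), void *arg)
{
    int ret;

    /* Retry as long as the operation reports it is temporarily blocked. */
    while ((ret = op(arg)) == EBUSY)
        sleep(1);
    return (ret);
}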
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; bool busy, cache_work, evict_soon, stalled; int force_attempts; btree = S2BT(session); /* * Ignore reads of pages already known to be in cache, otherwise the * eviction server can dominate these statistics. */ if (!LF_ISSET(WT_READ_CACHE)) { WT_STAT_FAST_CONN_INCR(session, cache_pages_requested); WT_STAT_FAST_DATA_INCR(session, cache_pages_requested); } for (evict_soon = stalled = false, force_attempts = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DELETED: if (LF_ISSET(WT_READ_NO_EMPTY) && __wt_delete_page_skip(session, ref, false)) return (WT_NOTFOUND); /* FALLTHROUGH */ case WT_REF_DISK: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, read it. If this thread is * allowed to do eviction work, check for space in the * cache. */ if (!LF_ISSET(WT_READ_NO_EVICT)) WT_RET(__wt_cache_eviction_check( session, 1, NULL)); WT_RET(__page_read(session, ref)); /* * If configured to not trash the cache, leave the page * generation unset, we'll set it before returning to * the oldest read generation, so the page is forcibly * evicted as soon as possible. We don't do that set * here because we don't want to evict the page before * we "acquire" it. */ evict_soon = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on another thread's read, stall. */ WT_STAT_FAST_CONN_INCR(session, page_read_blocked); stalled = true; break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on eviction, stall. */ WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); stalled = true; break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory. * * Get a hazard pointer if one is required. We cannot * be evicting if no hazard pointer is required, we're * done. */ if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) goto skip_evict; /* * The expected reason we can't get a hazard pointer is * because the page is being evicted, yield, try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } /* * If eviction is configured for this file, check to see * if the page qualifies for forced eviction and update * the page's generation number. If eviction isn't being * done on this file, we're done. * In-memory split of large pages is allowed while * no_eviction is set on btree, whereas reconciliation * is not allowed. */ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || (F_ISSET(btree, WT_BTREE_NO_EVICTION) && !F_ISSET(btree, WT_BTREE_NO_RECONCILE))) goto skip_evict; /* * Forcibly evict pages that are too big. */ if (force_attempts < 10 && __evict_force_check(session, ref)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. 
*/ if (ret == EBUSY) { ret = 0; WT_STAT_FAST_CONN_INCR(session, page_forcible_evict_blocked); stalled = true; break; } WT_RET(ret); /* * The result of a successful forced eviction * is a page-state transition (potentially to * an in-memory page we can use, or a restart * return for our caller), continue the outer * page-acquisition loop. */ continue; } /* * If we read the page and are configured to not trash * the cache, and no other thread has already used the * page, set the oldest read generation so the page is * forcibly evicted as soon as possible. * * Otherwise, if we read the page, or, if configured to * update the page's read generation and the page isn't * already flagged for forced eviction, update the page * read generation. */ page = ref->page; if (page->read_gen == WT_READGEN_NOTSET) { if (evict_soon) __wt_page_evict_soon(session, ref); else __wt_cache_read_gen_new(session, page); } else if (!LF_ISSET(WT_READ_NO_GEN)) __wt_cache_read_gen_bump(session, page); skip_evict: /* * Check if we need an autocommit transaction. * Starting a transaction can trigger eviction, so skip * it if eviction isn't permitted. */ return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : __wt_txn_autocommit_check(session)); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (stalled) wait_cnt += WT_THOUSAND; else if (++wait_cnt < WT_THOUSAND) { __wt_yield(); continue; } /* * If stalling and this thread is allowed to do eviction work, * check if the cache needs help. If we do work for the cache, * substitute that for a sleep. */ if (!LF_ISSET(WT_READ_NO_EVICT)) { WT_RET( __wt_cache_eviction_check(session, 1, &cache_work)); if (cache_work) continue; } sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000); WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } }
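/*
 * Illustrative sketch (not WiredTiger code): when __wt_page_in_func() stalls,
 * it first offers to do eviction work on behalf of the cache and only sleeps
 * if there was nothing useful to do, with the sleep capped at 10 milliseconds.
 * A standalone rendering of that backoff policy; try_help_cache() is a
 * hypothetical callback that returns true if it made progress.
 */
#include <stdbool.h>
#include <time.h>

#define BACKOFF_STEP_US 1000     /* grow the sleep by a millisecond each time */
#define BACKOFF_MAX_US 10000     /* never sleep longer than 10 milliseconds */

extern bool try_help_cache(void);    /* hypothetical: do a unit of useful work */

static void
backoff(unsigned *sleep_us)
{
    struct timespec ts;

    /* Doing useful work substitutes for a sleep. */
    if (try_help_cache())
        return;

    *sleep_us += BACKOFF_STEP_US;
    if (*sleep_us > BACKOFF_MAX_US)
        *sleep_us = BACKOFF_MAX_US;
    ts.tv_sec = 0;
    ts.tv_nsec = (long)*sleep_us * 1000;
    (void)nanosleep(&ts, NULL);
}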
/* * __wt_log_slot_join -- * Join a consolidated logging slot. */ void __wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize, uint32_t flags, WT_MYSLOT *myslot) { WT_CONNECTION_IMPL *conn; WT_LOG *log; WT_LOGSLOT *slot; uint64_t time_start, time_stop, usecs; int64_t flag_state, new_state, old_state, released; int32_t join_offset, new_join, wait_cnt; bool closed, diag_yield, raced, slept, unbuffered, yielded; conn = S2C(session); log = conn->log; time_start = time_stop = 0; WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); WT_ASSERT(session, mysize != 0); /* * There should almost always be a slot open. */ unbuffered = yielded = false; closed = raced = slept = false; wait_cnt = 0; #ifdef HAVE_DIAGNOSTIC diag_yield = (++log->write_calls % 7) == 0; if ((log->write_calls % WT_THOUSAND) == 0 || mysize > WT_LOG_SLOT_BUF_MAX) { #else diag_yield = false; if (mysize > WT_LOG_SLOT_BUF_MAX) { #endif unbuffered = true; F_SET(myslot, WT_MYSLOT_UNBUFFERED); } for (;;) { WT_BARRIER(); slot = log->active_slot; old_state = slot->slot_state; if (WT_LOG_SLOT_OPEN(old_state)) { /* * Try to join our size into the existing size and * atomically write it back into the state. */ flag_state = WT_LOG_SLOT_FLAGS(old_state); released = WT_LOG_SLOT_RELEASED(old_state); join_offset = WT_LOG_SLOT_JOINED(old_state); if (unbuffered) new_join = join_offset + WT_LOG_SLOT_UNBUFFERED; else new_join = join_offset + (int32_t)mysize; new_state = (int64_t)WT_LOG_SLOT_JOIN_REL( (int64_t)new_join, (int64_t)released, (int64_t)flag_state); /* * Braces used due to potential empty body warning. */ if (diag_yield) { WT_DIAGNOSTIC_YIELD; } /* * Attempt to swap our size into the state. */ if (__wt_atomic_casiv64( &slot->slot_state, old_state, new_state)) break; WT_STAT_CONN_INCR(session, log_slot_races); raced = true; } else { WT_STAT_CONN_INCR(session, log_slot_active_closed); closed = true; ++wait_cnt; } if (!yielded) time_start = __wt_clock(session); yielded = true; /* * The slot is no longer open or we lost the race to * update it. Yield and try again. */ if (wait_cnt < WT_THOUSAND) __wt_yield(); else { __wt_sleep(0, WT_THOUSAND); slept = true; } } /* * We joined this slot. Fill in our information to return to * the caller. */ if (!yielded) WT_STAT_CONN_INCR(session, log_slot_immediate); else { WT_STAT_CONN_INCR(session, log_slot_yield); time_stop = __wt_clock(session); usecs = WT_CLOCKDIFF_US(time_stop, time_start); WT_STAT_CONN_INCRV(session, log_slot_yield_duration, usecs); if (closed) WT_STAT_CONN_INCR(session, log_slot_yield_close); if (raced) WT_STAT_CONN_INCR(session, log_slot_yield_race); if (slept) WT_STAT_CONN_INCR(session, log_slot_yield_sleep); } if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC)) F_SET(slot, WT_SLOT_SYNC_DIR); if (LF_ISSET(WT_LOG_FLUSH)) F_SET(slot, WT_SLOT_FLUSH); if (LF_ISSET(WT_LOG_FSYNC)) F_SET(slot, WT_SLOT_SYNC); if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) { WT_ASSERT(session, slot->slot_unbuffered == 0); WT_STAT_CONN_INCR(session, log_slot_unbuffered); slot->slot_unbuffered = (int64_t)mysize; } myslot->slot = slot; myslot->offset = join_offset; myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize); } /* * __wt_log_slot_release -- * Each thread in a consolidated group releases its portion to * signal it has completed copying its piece of the log into * the memory buffer. 
 */
int64_t
__wt_log_slot_release(WT_MYSLOT *myslot, int64_t size)
{
    WT_LOGSLOT *slot;
    wt_off_t cur_offset, my_start;
    int64_t my_size, rel_size;

    slot = myslot->slot;
    my_start = slot->slot_start_offset + myslot->offset;

    /*
     * We maintain the last starting offset within this slot.
     * This is used to know the offset of the last record that
     * was written rather than the beginning record of the slot.
     */
    while ((cur_offset = slot->slot_last_offset) < my_start) {
        /*
         * Set our offset if we are larger.
         */
        if (__wt_atomic_casiv64(
            &slot->slot_last_offset, cur_offset, my_start))
            break;
        /*
         * If we raced another thread updating this, try again.
         */
        WT_BARRIER();
    }

    /*
     * Add my size into the state and return the new size.
     */
    rel_size = size;
    if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
        rel_size = WT_LOG_SLOT_UNBUFFERED;
    my_size = (int64_t)WT_LOG_SLOT_JOIN_REL((int64_t)0, rel_size, 0);
    return (__wt_atomic_addiv64(&slot->slot_state, my_size));
}
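/*
 * Illustrative sketch (not WiredTiger code): the loop in
 * __wt_log_slot_release() advances slot_last_offset to the largest value any
 * thread has written, retrying the compare-and-swap when it races with a
 * concurrent writer. The same "monotonic maximum" update with C11 atomics;
 * update_max_offset() is a hypothetical name.
 */
#include <stdatomic.h>
#include <stdint.h>

static void
update_max_offset(_Atomic int64_t *max_offset, int64_t my_offset)
{
    int64_t cur;

    cur = atomic_load(max_offset);
    /* Only move the published value forward; losers reload and retry. */
    while (cur < my_offset &&
        !atomic_compare_exchange_weak(max_offset, &cur, my_offset))
        ;
}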
/* * __wt_page_out -- * Discard an in-memory page, freeing all memory associated with it. */ void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) { WT_PAGE *page; WT_PAGE_HEADER *dsk; WT_PAGE_MODIFY *mod; /* * Kill our caller's reference, do our best to catch races. */ page = *pagep; *pagep = NULL; if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) __wt_page_modify_clear(session, page); /* * We should never discard: * - a dirty page, * - a page queued for eviction, or * - a locked page. */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); WT_ASSERT(session, !__wt_fair_islocked(session, &page->page_lock)); #ifdef HAVE_DIAGNOSTIC { WT_HAZARD *hp; int i; /* * Make sure no other thread has a hazard pointer on the page we are * about to discard. This is complicated by the fact that readers * publish their hazard pointer before re-checking the page state, so * our check can race with readers without indicating a real problem. * Wait for up to a second for hazard pointers to be cleared. */ for (hp = NULL, i = 0; i < 100; i++) { if ((hp = __wt_page_hazard_check(session, page)) == NULL) break; __wt_sleep(0, 10000); } if (hp != NULL) __wt_errx(session, "discarded page has hazard pointer: (%p: %s, line %d)", hp->page, hp->file, hp->line); WT_ASSERT(session, hp == NULL); } #endif /* * If a root page split, there may be one or more pages linked from the * page; walk the list, discarding pages. */ switch (page->type) { case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: mod = page->modify; if (mod != NULL && mod->mod_root_split != NULL) __wt_page_out(session, &mod->mod_root_split); break; } /* Update the cache's information. */ __wt_cache_page_evict(session, page); /* * If discarding the page as part of process exit, the application may * configure to leak the memory rather than do the work. */ if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY)) return; /* Free the page modification information. */ if (page->modify != NULL) __free_page_modify(session, page); switch (page->type) { case WT_PAGE_COL_FIX: break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: __free_page_int(session, page); break; case WT_PAGE_COL_VAR: __free_page_col_var(session, page); break; case WT_PAGE_ROW_LEAF: __free_page_row_leaf(session, page); break; } /* Discard any disk image. */ dsk = (WT_PAGE_HEADER *)page->dsk; if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) __wt_overwrite_and_free_len(session, dsk, dsk->mem_size); if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED)) (void)__wt_mmap_discard(session, dsk, dsk->mem_size); __wt_overwrite_and_free(session, page); }
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; int busy, cache_work, force_attempts, oldgen, stalled; btree = S2BT(session); stalled = 0; for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, read it. If this thread is * allowed to do eviction work, check for space in the * cache. */ if (!LF_ISSET(WT_READ_NO_EVICT)) WT_RET(__wt_cache_eviction_check( session, 1, NULL)); WT_RET(__page_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on another thread's read, stall. */ WT_STAT_FAST_CONN_INCR(session, page_read_blocked); stalled = 1; break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on eviction, stall. */ WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); stalled = 1; break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory. * * Get a hazard pointer if one is required. We cannot * be evicting if no hazard pointer is required, we're * done. */ if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) goto skip_evict; /* * The expected reason we can't get a hazard pointer is * because the page is being evicted, yield, try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } /* * If eviction is configured for this file, check to see * if the page qualifies for forced eviction and update * the page's generation number. If eviction isn't being * done on this file, we're done. */ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || F_ISSET(btree, WT_BTREE_NO_EVICTION)) goto skip_evict; /* * Forcibly evict pages that are too big. */ page = ref->page; if (force_attempts < 10 && __evict_force_check(session, page)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ if (ret == EBUSY) { ret = 0; WT_STAT_FAST_CONN_INCR(session, page_forcible_evict_blocked); stalled = 1; break; } WT_RET(ret); /* * The result of a successful forced eviction * is a page-state transition (potentially to * an in-memory page we can use, or a restart * return for our caller), continue the outer * page-acquisition loop. */ continue; } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_bump(session); skip_evict: /* * Check if we need an autocommit transaction. * Starting a transaction can trigger eviction, so skip * it if eviction isn't permitted. 
*/ return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : __wt_txn_autocommit_check(session)); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (stalled) wait_cnt += 1000; else if (++wait_cnt < 1000) { __wt_yield(); continue; } /* * If stalling and this thread is allowed to do eviction work, * check if the cache needs help. If we do work for the cache, * substitute that for a sleep. */ if (!LF_ISSET(WT_READ_NO_EVICT)) { WT_RET( __wt_cache_eviction_check(session, 1, &cache_work)); if (cache_work) continue; } sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000); WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } }
/*
 * __clsm_open_cursors --
 *	Open cursors for the current set of files.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id)
{
    WT_CURSOR *c, **cp, *primary;
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;
    WT_TXN *txn;
    const char *checkpoint, *ckpt_cfg[3];
    uint64_t saved_gen;
    u_int i, nchunks, ngood, nupdates;
    int locked;

    c = &clsm->iface;
    session = (WT_SESSION_IMPL *)c->session;
    txn = &session->txn;
    lsm_tree = clsm->lsm_tree;
    chunk = NULL;

    ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
    ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
    ckpt_cfg[2] = NULL;

    /* Copy the key, so we don't lose the cursor position. */
    if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key))
        WT_RET(__wt_buf_set(
            session, &c->key, c->key.data, c->key.size));

    F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

    if (update) {
        if (txn->isolation == TXN_ISO_SNAPSHOT)
            F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
    } else
        F_SET(clsm, WT_CLSM_OPEN_READ);

    WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
    locked = 1;

    /*
     * If there is no in-memory chunk in the tree for an update operation,
     * create one.
     *
     * !!!
     * It is exceedingly unlikely that we get here at all, but if we were to
     * switch chunks in this thread and our transaction then rolled back, it
     * would leave the metadata inconsistent. Signal for the LSM worker
     * thread to create the chunk instead to avoid the issue.
     */
    if (update && (lsm_tree->nchunks == 0 ||
        (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL ||
        F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))) {
        /* Release our lock because switch will get a write lock. */
        F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
        locked = 0;
        WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
        WT_ERR(__wt_cond_signal(session, lsm_tree->work_cond));

        /*
         * Give the worker thread a chance to run before locking the
         * tree again -- we will loop in __clsm_enter until there is an
         * in-memory chunk in the tree.
         */
        __wt_sleep(0, 1000);
        WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
        locked = 1;
    }

    F_SET(session, WT_SESSION_NO_CACHE_CHECK);

    /* Merge cursors have already figured out how many chunks they need. */
retry:	if (F_ISSET(clsm, WT_CLSM_MERGE)) {
        nchunks = clsm->nchunks;
        ngood = 0;

        /*
         * We may have raced with another merge completing. Check that
         * we're starting at the right offset in the chunk array.
         */
        if (start_chunk >= lsm_tree->nchunks ||
            lsm_tree->chunk[start_chunk]->id != start_id) {
            for (start_chunk = 0;
                start_chunk < lsm_tree->nchunks;
                start_chunk++) {
                chunk = lsm_tree->chunk[start_chunk];
                if (chunk->id == start_id)
                    break;
            }
            /* We have to find the start chunk: merge locked it. */
            WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
        }

        WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
    } else {
        nchunks = lsm_tree->nchunks;

        /*
         * If we are only opening the cursor for updates, only open the
         * primary chunk, plus any other chunks that might be required
         * to detect snapshot isolation conflicts.
         */
        if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
            WT_ERR(__wt_realloc_def(session, &clsm->txnid_alloc,
                nchunks, &clsm->txnid_max));
        if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
            ngood = nupdates = 0;
        else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
            /*
             * Keep going until all updates in the next
             * chunk are globally visible. Copy the maximum
             * transaction IDs into the cursor as we go.
*/ for (ngood = nchunks - 1, nupdates = 1; ngood > 0; ngood--, nupdates++) { chunk = lsm_tree->chunk[ngood - 1]; clsm->txnid_max[ngood - 1] = chunk->txnid_max; if (__wt_txn_visible_all( session, chunk->txnid_max)) break; } } else { nupdates = 1; ngood = nchunks - 1; } /* Check how many cursors are already open. */ for (cp = clsm->cursors + ngood; ngood < clsm->nchunks && ngood < nchunks; cp++, ngood++) { chunk = lsm_tree->chunk[ngood]; /* If the cursor isn't open yet, we're done. */ if (*cp == NULL) break; /* Easy case: the URIs don't match. */ if (strcmp((*cp)->uri, chunk->uri) != 0) break; /* Make sure the checkpoint config matches. */ checkpoint = ((WT_CURSOR_BTREE *)*cp)-> btree->dhandle->checkpoint; if (checkpoint == NULL && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) break; /* Make sure the Bloom config matches. */ if (clsm->blooms[ngood] == NULL && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) break; } /* Spurious generation bump? */ if (ngood == clsm->nchunks && clsm->nchunks == nchunks) { clsm->dsk_gen = lsm_tree->dsk_gen; goto err; } /* * Close any cursors we no longer need. If the cursor is a * pure update cursor, close everything -- we usually only need * a single chunk open in that case and we haven't walked all * of the other slots in the loop above. * * Drop the LSM tree lock while we do this: if the cache is * full, we may block while closing a cursor. Save the * generation number and retry if it has changed under us. */ if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0) ngood = 0; if (clsm->cursors != NULL && ngood < clsm->nchunks) { saved_gen = lsm_tree->dsk_gen; locked = 0; WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree)); WT_ERR(__clsm_close_cursors( clsm, ngood, clsm->nchunks)); WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0)); locked = 1; if (lsm_tree->dsk_gen != saved_gen) goto retry; } /* Detach from our old primary. */ clsm->primary_chunk = NULL; clsm->current = NULL; } WT_ERR(__wt_realloc_def(session, &clsm->bloom_alloc, nchunks, &clsm->blooms)); WT_ERR(__wt_realloc_def(session, &clsm->cursor_alloc, nchunks, &clsm->cursors)); clsm->nchunks = nchunks; /* Open the cursors for chunks that have changed. */ for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Copy the maximum transaction ID. */ if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) clsm->txnid_max[i] = chunk->txnid_max; /* * Read from the checkpoint if the file has been written. * Once all cursors switch, the in-memory tree can be evicted. */ WT_ASSERT(session, *cp == NULL); ret = __wt_open_cursor(session, chunk->uri, c, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? ckpt_cfg : NULL, cp); /* * XXX kludge: we may have an empty chunk where no checkpoint * was written. If so, try to open the ordinary handle on that * chunk instead. */ if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { ret = __wt_open_cursor( session, chunk->uri, c, NULL, cp); if (ret == 0) chunk->empty = 1; } WT_ERR(ret); if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && !F_ISSET(clsm, WT_CLSM_MERGE)) WT_ERR(__wt_bloom_open(session, chunk->bloom_uri, lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, c, &clsm->blooms[i])); /* Child cursors always use overwrite and raw mode. */ F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW); } /* The last chunk is our new primary. 
*/ if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { clsm->primary_chunk = chunk; primary = clsm->cursors[clsm->nchunks - 1]; WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)(primary))->btree, __wt_btree_evictable(session, 0)); } clsm->dsk_gen = lsm_tree->dsk_gen; err: F_CLR(session, WT_SESSION_NO_CACHE_CHECK); #ifdef HAVE_DIAGNOSTIC /* Check that all cursors are open as expected. */ if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) { for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) { chunk = lsm_tree->chunk[i + start_chunk]; /* Make sure the cursor is open. */ WT_ASSERT(session, *cp != NULL); /* Easy case: the URIs should match. */ WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0); /* Make sure the checkpoint config matches. */ checkpoint = ((WT_CURSOR_BTREE *)*cp)-> btree->dhandle->checkpoint; WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ? checkpoint != NULL : checkpoint == NULL); /* Make sure the Bloom config matches. */ WT_ASSERT(session, (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) && !F_ISSET(clsm, WT_CLSM_MERGE)) ? clsm->blooms[i] != NULL : clsm->blooms[i] == NULL); } } #endif if (locked) WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); return (ret); }