예제 #1
0
/*
 * __wt_clsm_await_switch --
 *	Block the caller until a chunk switch has completed in the cursor's
 *	LSM tree.
 *
 *	Returns 0 once the wait condition clears, or the error returned when
 *	queueing a switch work unit fails.
 */
int
__wt_clsm_await_switch(WT_CURSOR_LSM *clsm)
{
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	int waited;

	session = (WT_SESSION_IMPL *)clsm->iface.session;
	lsm_tree = clsm->lsm_tree;

	/*
	 * Keep waiting while the tree has no chunks at all, or while the
	 * tree's disk generation still matches the one recorded in the
	 * cursor (that is, the tree has not changed since the cursor last
	 * looked at it).  That happens when there is no primary chunk or a
	 * chunk has overflowed the hard limit: either a worker thread has
	 * fallen behind or there has just been a user-level checkpoint.
	 *
	 * The switch is requested from the LSM manager rather than done in
	 * the application thread: a transaction is in progress here and it
	 * could roll back, leaving the metadata inconsistent.
	 */
	waited = 0;
	while (lsm_tree->nchunks == 0 ||
	    clsm->dsk_gen == lsm_tree->dsk_gen) {
		/* Queue a switch on the first pass and every 1000th after. */
		if (waited % 1000 == 0)
			WT_RET(__wt_lsm_manager_push_entry(
			    session, WT_LSM_WORK_SWITCH, 0, lsm_tree));
		__wt_sleep(0, 10);
		++waited;
	}
	return (0);
}
예제 #2
0
/*
 * __wt_attach --
 *	Report the process ID and stall so a debugger can be attached.  When
 *	HAVE_ATTACH is not defined this function does nothing.
 */
void
__wt_attach(WT_SESSION_IMPL *session)
{
#ifndef HAVE_ATTACH
	/* Attach support is compiled out: nothing to do. */
	WT_UNUSED(session);
#else
	/* Tell the user which process to attach to. */
	__wt_errx(session, "process ID %" PRIdMAX
	    ": waiting for debugger...", (intmax_t)getpid());

	/*
	 * This loop never exits on its own: the debugger is expected to
	 * interrupt the sleep once it has attached.
	 */
	for (;;) {
		__wt_sleep(100, 0);
	}
#endif
}
예제 #3
0
/*
 * compact --
 *	Thread function that periodically runs a compaction operation until
 *	the worker threads are flagged as finished.
 */
WT_THREAD_RET
compact(void *arg)
{
	WT_CONNECTION *conn;
	WT_DECL_RET;
	WT_SESSION *session;
	u_int period;

	(void)(arg);

	/* Not every data source supports compaction. */
	if (DATASOURCE("helium") || DATASOURCE("kvsbdb"))
		return (WT_THREAD_RET_VALUE);

	/* This thread works in its own session. */
	conn = g.wts_conn;
	testutil_check(conn->open_session(conn, NULL, NULL, &session));

	/*
	 * The first compaction happens somewhere under 15 seconds in (so at
	 * least one gets done), later ones at 23 second intervals.
	 */
	period = mmrand(NULL, 1, 15);
	for (;;) {
		/*
		 * Sleep one second at a time so the end of the run is noticed
		 * promptly instead of being held up by a long sleep.
		 */
		for (; period > 0 && !g.workers_finished; --period)
			__wt_sleep(1, 0);
		if (g.workers_finished)
			break;

		/*
		 * Tolerated results besides success:
		 *  - EBUSY: compact ran concurrently with alter, there is
		 *    eviction pressure, or it collided with a checkpoint;
		 *  - ETIMEDOUT: the compaction didn't finish within the
		 *    timeout.  No timeout is configured here and the default
		 *    of 1200 seconds is occasionally exceeded;
		 *  - WT_ROLLBACK.
		 * Anything else is fatal.
		 */
		ret = session->compact(session, g.uri, NULL);
		if (!(ret == 0 ||
		    ret == EBUSY || ret == ETIMEDOUT || ret == WT_ROLLBACK))
			testutil_die(ret, "session.compact");

		period = 23;
	}

	testutil_check(session->close(session, NULL));

	return (WT_THREAD_RET_VALUE);
}
예제 #4
0
/*
 * __wt_log_slot_wait --
 *	Wait for the slot leader to allocate the log area and tell us our
 *	log offset: spin until the slot state is no longer above
 *	WT_LOG_SLOT_DONE.  Always returns 0.
 */
int
__wt_log_slot_wait(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
	int spins;

	WT_UNUSED(session);

	/*
	 * Yield the CPU for the first 999 checks, after that sleep between
	 * checks instead.
	 */
	for (spins = 0; slot->slot_state > WT_LOG_SLOT_DONE;) {
		if (++spins >= 1000)
			__wt_sleep(0, 200);
		else
			__wt_yield();
	}
	return (0);
}
예제 #5
0
/*
 * __wt_lsm_compact --
 *	Compact an LSM tree called via __wt_schema_worker.
 *
 *	Names without the "lsm:" prefix are ignored (returns 0, *skip is left
 *	untouched).  For an LSM tree, *skip is set to 1, the tree is flagged
 *	as compacting, the merge threads are woken, and the function polls
 *	once a second until merge progress stops or at most one chunk remains.
 *
 *	Returns 0 on success, EINVAL if merging is not enabled on the
 *	connection or the tree has no merge threads, ETIMEDOUT if the
 *	session's compaction time limit is exceeded, or any error from the
 *	helper calls.
 */
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
{
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	uint64_t last_merge_progressing;
	time_t begin, end;

	/*
	 * This function is applied to all matching sources: ignore anything
	 * that is not an LSM tree.
	 */
	if (!WT_PREFIX_MATCH(name, "lsm:"))
		return (0);

	/* Tell __wt_schema_worker not to look inside the LSM tree. */
	*skip = 1;

	WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));

	if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE) ||
	    lsm_tree->merge_threads == 0)
		WT_RET_MSG(session, EINVAL,
		    "LSM compaction requires active merge threads");

	WT_RET(__wt_seconds(session, &begin));

	/*
	 * From here on the compacting flag is set on the tree: every failure
	 * must go through the error label so the flag is cleared again,
	 * never return directly.
	 */
	F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);

	/* Wake up the merge threads. */
	WT_ERR(__wt_cond_signal(session, lsm_tree->work_cond));

	/* Now wait for merge activity to stop. */
	do {
		last_merge_progressing = lsm_tree->merge_progressing;
		__wt_sleep(1, 0);
		WT_ERR(__wt_seconds(session, &end));

		/* A max_time of zero means there is no time limit. */
		if (session->compact->max_time > 0 &&
		    session->compact->max_time < (uint64_t)(end - begin))
			WT_ERR(ETIMEDOUT);
	} while (lsm_tree->merge_progressing != last_merge_progressing &&
	    lsm_tree->nchunks > 1);

err:	F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);

	return (ret);
}
예제 #6
0
/*
 * __compact_checkpoint --
 *     Run a forced checkpoint on behalf of compaction.  If the checkpoint
 *     call reports busy, wait for the checkpoint in the way to finish,
 *     giving up if the compaction timeout expires.
 */
static int
__compact_checkpoint(WT_SESSION_IMPL *session)
{
	WT_DECL_RET;
	WT_TXN_GLOBAL *txn_global;
	uint64_t txn_gen;

	/*
	 * Compaction checkpoints are forced: the work compaction relies on
	 * is done in the underlying block manager, so the checkpoint must
	 * not be skipped.
	 */
	const char *checkpoint_cfg[] = {
	    WT_CONFIG_BASE(session, WT_SESSION_checkpoint), "force=1", NULL };

	/* Checkpoints are slow, make sure there is still time for one. */
	WT_RET(__wt_session_compact_check_timeout(session));

	ret = __wt_txn_checkpoint(session, checkpoint_cfg, false);
	if (ret == 0)
		return (0);
	WT_RET_BUSY_OK(ret);

	/*
	 * Another checkpoint is in the way.  It has completed once no
	 * checkpoint is running or the checkpoint generation has moved on
	 * from the value sampled here; until then, poll every two seconds,
	 * checking the compaction timeout before each sleep.
	 *
	 * The loop only reads objects that are declared volatile, so no
	 * barriers are needed.
	 */
	txn_global = &S2C(session)->txn_global;
	txn_gen = __wt_gen(session, WT_GEN_CHECKPOINT);
	while (txn_global->checkpoint_running &&
	    txn_gen == __wt_gen(session, WT_GEN_CHECKPOINT)) {
		WT_RET(__wt_session_compact_check_timeout(session));
		__wt_sleep(2, 0);
	}

	return (0);
}
예제 #7
0
/*
 * thread_ts_run --
 *	Runner function for the timestamp thread: repeatedly moves the
 *	oldest and stable timestamps up to the all-committed timestamp.
 *	The loop never terminates.
 */
static WT_THREAD_RET
thread_ts_run(void *arg)
{
	WT_DECL_RET;
	WT_SESSION *session;
	THREAD_DATA *td;
	char ts_config[64], ts_hex[WT_TS_HEX_STRING_SIZE];

	td = (THREAD_DATA *)arg;

	testutil_check(td->conn->open_session(td->conn, NULL, NULL, &session));

	/* One update attempt per millisecond. */
	for (;;) {
		/*
		 * Reading the last committed timestamp has to lock out
		 * transactional operations that set or query a timestamp,
		 * so the query is done under the write lock.
		 */
		testutil_check(pthread_rwlock_wrlock(&ts_lock));
		ret = td->conn->query_timestamp(
		    td->conn, ts_hex, "get=all_committed");
		testutil_check(pthread_rwlock_unlock(&ts_lock));

		/* WT_NOTFOUND is tolerated, any other failure is not. */
		testutil_assert(ret == 0 || ret == WT_NOTFOUND);
		if (ret == 0) {
			/*
			 * Move the oldest and stable timestamps together so
			 * read availability at older timestamps doesn't have
			 * to be maintained.
			 */
			testutil_check(__wt_snprintf(
			    ts_config, sizeof(ts_config),
			    "oldest_timestamp=%s,stable_timestamp=%s",
			    ts_hex, ts_hex));
			testutil_check(
			    td->conn->set_timestamp(td->conn, ts_config));
		}

		__wt_sleep(0, 1000);
	}
	/* NOTREACHED */
}
예제 #8
0
파일: conn_log.c 프로젝트: mpobrien/mongo
/*
 * __logmgr_force_ckpt --
 *	Force checkpoints out, waiting until the checkpoint LSN in the log
 *	is up to the given log number.
 *
 *	The checkpoints run in a temporary internal session, which is closed
 *	again on every exit path.  Returns 0 on success, otherwise the first
 *	error from opening the session, checkpointing, the panic check or
 *	closing the session.
 */
static int
__logmgr_force_ckpt(WT_SESSION_IMPL *session, uint32_t lognum)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_SESSION_IMPL *tmp_session;
	int yield;

	conn = S2C(session);
	log = conn->log;
	yield = 0;
	WT_RET(__wt_open_internal_session(conn,
	    "compatibility-reconfig", true, 0, &tmp_session));

	/*
	 * The temporary session is open from here on: failures must go
	 * through the error label so it is closed, not return directly.
	 */
	while (log->ckpt_lsn.l.file < lognum) {
		/*
		 * Force a checkpoint to be written in the new log file and
		 * force the archiving of all previous log files.  We do the
		 * checkpoint in the loop because the checkpoint LSN in the
		 * log record could still reflect the previous log file in
		 * cases such as the write LSN has not yet advanced into the
		 * new log file due to another group of threads still in
		 * progress with their slot copies or writes.
		 */
		WT_ERR(tmp_session->iface.checkpoint(
		    &tmp_session->iface, "force=1"));
		WT_ERR(WT_SESSION_CHECK_PANIC(tmp_session));
		/*
		 * Only sleep in the rare case that we had to come through
		 * this loop more than once.
		 */
		if (yield++) {
			WT_STAT_CONN_INCR(session, log_force_ckpt_sleep);
			__wt_sleep(0, WT_THOUSAND);
		}
	}

err:	/* Close the temporary session, keeping any earlier error. */
	WT_TRET(tmp_session->iface.close(&tmp_session->iface, NULL));
	return (ret);
}
예제 #9
0
파일: bt_discard.c 프로젝트: ksuarz/mongo
/*
 * __wt_ref_out --
 *	Discard the in-memory page referenced by a WT_REF, freeing all memory
 *	associated with it.  A wrapper around __wt_page_out that performs
 *	additional diagnostic checks first.
 */
void
__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
{
	/* The WT_REF cannot be the eviction thread's location. */
	WT_ASSERT(session, S2BT(session)->evict_ref != ref);

#ifdef HAVE_DIAGNOSTIC
	{
	WT_HAZARD *hp;
	int attempt;

	/*
	 * No other thread may hold a hazard pointer on the page about to be
	 * discarded.  Readers publish their hazard pointer before
	 * re-checking the page state, so a single check can race with a
	 * reader without there being a real problem: retry for up to a
	 * second (100 checks with a sleep after each failed one) before
	 * complaining.
	 */
	attempt = 0;
	do {
		hp = __wt_hazard_check(session, ref);
		if (hp == NULL)
			break;
		__wt_sleep(0, 10000);
	} while (++attempt < 100);

	if (hp != NULL)
		__wt_errx(session,
		    "discarded page has hazard pointer: (%p: %s, line %d)",
		    (void *)hp->ref, hp->file, hp->line);
	WT_ASSERT(session, hp == NULL);
	}
#endif

	__wt_page_out(session, &ref->page);
}
예제 #10
0
/*
 * __wt_lsm_compact --
 *	Compact an LSM tree called via __wt_schema_worker.
 *
 *	Names without the "lsm:" prefix are ignored (returns 0, *skip is left
 *	untouched).  For an LSM tree, *skip is set to 1, the last chunk (if
 *	any) is force-flushed, and merge work units are pushed until no more
 *	merging is being done, the tree is no longer active, or the session's
 *	compaction time limit is exceeded (ETIMEDOUT).
 *
 *	Returns 0 on success, EINVAL if merging is not enabled on the
 *	connection, ETIMEDOUT on timeout, or any error from the helper calls.
 */
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	time_t begin, end;
	uint64_t progress;
	/*
	 * State flags, all checked again in the cleanup code:
	 *  compacting: this call is in the merge phase;
	 *  flushing: this call is waiting for the last chunk to reach disk;
	 *  locked: this call holds the tree's write lock;
	 *  ref: this call holds a reference on the last chunk.
	 */
	int i, compacting, flushing, locked, ref;

	compacting = flushing = locked = ref = 0;
	chunk = NULL;
	/*
	 * This function is applied to all matching sources: ignore anything
	 * that is not an LSM tree.
	 */
	if (!WT_PREFIX_MATCH(name, "lsm:"))
		return (0);

	/* Tell __wt_schema_worker not to look inside the LSM tree. */
	*skip = 1;

	WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));

	/*
	 * From here on failures go through the error label, which releases
	 * the tree.
	 */
	if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
		WT_ERR_MSG(session, EINVAL,
		    "LSM compaction requires active merge threads");

	/* Record the start time for the timeout check in the wait loop. */
	WT_ERR(__wt_seconds(session, &begin));

	/*
	 * Compacting has two distinct phases.
	 * 1.  All in-memory chunks up to and including the current
	 * chunk must be flushed.  Normally, the flush code
	 * does not flush the last, in-use chunk, so we set a force
	 * flag to include that last chunk.  We monitor the state of the
	 * last chunk and periodically push another forced flush work
	 * unit until it is complete.
	 * 2.  After all flushing is done, we move onto the merging
	 * phase for compaction.  Again, we monitor the state and
	 * continue to push merge work units until all merging is done.
	 */

	/* Lock the tree: single-thread compaction. */
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = 1;

	/* Clear any merge throttle: compact throws out that calculation. */
	lsm_tree->merge_throttle = 0;
	lsm_tree->merge_aggressiveness = 0;
	progress = lsm_tree->merge_progressing;

	/* If another thread started a compact on this tree, we're done. */
	if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
		goto err;

	/*
	 * Set the switch transaction on the current chunk, if it
	 * hasn't been set before.  This prevents further writes, so it
	 * can be flushed by the checkpoint worker.
	 */
	if (lsm_tree->nchunks > 0 &&
	    (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL) {
		if (chunk->switch_txn == WT_TXN_NONE)
			chunk->switch_txn = __wt_txn_new_id(session);
		/*
		 * If we have a chunk, we want to look for it to be on-disk.
		 * So we need to add a reference to keep it available.
		 */
		(void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
		ref = 1;
	}

	/*
	 * Clear the flag before unlocking so that a failed unlock doesn't
	 * trigger a second unlock attempt in the cleanup code.
	 */
	locked = 0;
	WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));

	if (chunk != NULL) {
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "Compact force flush %s flags 0x%" PRIx32
		    " chunk %u flags 0x%"
		    PRIx32, name, lsm_tree->flags, chunk->id, chunk->flags));
		flushing = 1;
		/*
		 * Make sure the in-memory chunk gets flushed, but do not push
		 * a switch, because we don't want to create a new in-memory
		 * chunk if the tree is being used read-only now.
		 */
		WT_ERR(__wt_lsm_manager_push_entry(session,
		    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
	} else {
		/*
		 * If there is no chunk to flush, go straight to the
		 * compacting state.
		 */
		compacting = 1;
		progress = lsm_tree->merge_progressing;
		F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "COMPACT: Start compacting %s", lsm_tree->name));
	}

	/*
	 * Wait for the work unit queues to drain.  Each pass of the loop
	 * handles the flush phase, then the merge phase, then sleeps for a
	 * second and checks the timeout.
	 */
	while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
		/*
		 * The flush flag is cleared when the chunk has been flushed.
		 * Continue to push forced flushes until the chunk is on disk.
		 * Once it is on disk move to the compacting phase.
		 */
		if (flushing) {
			WT_ASSERT(session, chunk != NULL);
			if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
				WT_ERR(__wt_verbose(session,
				    WT_VERB_LSM,
				    "Compact flush done %s chunk %u.  "
				    "Start compacting progress %" PRIu64,
				    name, chunk->id,
				    lsm_tree->merge_progressing));
				/*
				 * Drop the chunk reference taken above and
				 * clear "ref" so the cleanup code doesn't
				 * drop it a second time.
				 */
				(void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
				flushing = ref = 0;
				compacting = 1;
				F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
				progress = lsm_tree->merge_progressing;
			} else {
				WT_ERR(__wt_verbose(session, WT_VERB_LSM,
				    "Compact flush retry %s chunk %u",
				    name, chunk->id));
				WT_ERR(__wt_lsm_manager_push_entry(session,
				    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE,
				    lsm_tree));
			}
		}

		/*
		 * The compacting flag is cleared when no merges can be done.
		 * Ensure that we push through some aggressive merges before
		 * stopping otherwise we might not do merges that would
		 * span chunks with different generations.
		 */
		if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
			/*
			 * Go around again if aggressiveness hasn't reached 10
			 * yet, merge progress has advanced since it was last
			 * sampled, or a merge is syncing; otherwise merging
			 * is finished.
			 */
			if (lsm_tree->merge_aggressiveness < 10 ||
			    (progress < lsm_tree->merge_progressing) ||
			    lsm_tree->merge_syncing) {
				progress = lsm_tree->merge_progressing;
				F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
				lsm_tree->merge_aggressiveness = 10;
			} else
				break;
		}
		__wt_sleep(1, 0);
		WT_ERR(__wt_seconds(session, &end));
		/* A max_time of zero means there is no time limit. */
		if (session->compact->max_time > 0 &&
		    session->compact->max_time < (uint64_t)(end - begin)) {
			WT_ERR(ETIMEDOUT);
		}
		/*
		 * Push merge operations while they are still getting work
		 * done. If we are pushing merges, make sure they are
		 * aggressive, to avoid duplicating effort.
		 *
		 * Starting from the tree's queue_ref value, push merge work
		 * units until the count reaches COMPACT_PARALLEL_MERGES.
		 */
		if (compacting)
#define	COMPACT_PARALLEL_MERGES	5
			for (i = lsm_tree->queue_ref;
			    i < COMPACT_PARALLEL_MERGES; i++) {
				lsm_tree->merge_aggressiveness = 10;
				WT_ERR(__wt_lsm_manager_push_entry(
				    session, WT_LSM_WORK_MERGE, 0, lsm_tree));
			}
	}
err:
	/* Ensure anything we set is cleared. */
	if (ref)
		(void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
	if (compacting) {
		F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
		lsm_tree->merge_aggressiveness = 0;
	}
	if (locked)
		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));

	WT_TRET(__wt_verbose(session, WT_VERB_LSM,
	    "Compact %s complete, return %d", name, ret));

	/* Release the tree handle obtained from __wt_lsm_tree_get. */
	__wt_lsm_tree_release(session, lsm_tree);
	return (ret);

}
예제 #11
0
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 *
 *	Loops on the WT_REF state until the page is acquired (returns 0) or
 *	the request can't be satisfied: WT_NOTFOUND when the caller's flags
 *	(WT_READ_CACHE, WT_READ_NO_WAIT) rule out reading or waiting,
 *	WT_RESTART when the reference has split, or an error from a helper.
 *	The file and line arguments exist only in diagnostic builds and are
 *	passed through to __wt_hazard_set.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_PAGE *page;
	u_int sleep_cnt, wait_cnt;
	int busy, force_attempts, oldgen;

	/*
	 * A "continue" inside the switch re-examines the page state at once;
	 * a "break" out of the switch falls into the yield/sleep code at the
	 * bottom of the loop before retrying.
	 */
	for (force_attempts = oldgen = 0, wait_cnt = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			/* Cache-only reads never go to disk. */
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, attempt to read it.
			 * Make sure there is space in the cache.
			 */
			WT_RET(__wt_cache_full_check(session));
			WT_RET(__wt_cache_read(session, ref));
			/*
			 * Remember whether the page should be marked for
			 * early eviction once it has been acquired.
			 */
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
			break;
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory: get a hazard pointer, update
			 * the page's LRU and return.  The expected reason we
			 * can't get a hazard pointer is because the page is
			 * being evicted; yield and try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy) {
				WT_STAT_FAST_CONN_INCR(
				    session, page_busy_blocked);
				break;
			}

			page = ref->page;
			WT_ASSERT(session, page != NULL);

			/*
			 * Forcibly evict pages that are too big.  At most 10
			 * attempts are made per call.
			 */
			if (force_attempts < 10 &&
			    __evict_force_check(session, page, flags)) {
				++force_attempts;
				ret = __wt_page_release_evict(session, ref);
				/*
				 * If forced eviction fails, stall: adding
				 * 1000 to the wait count takes the retry code
				 * below straight to the sleeping branch.
				 */
				if (ret == EBUSY) {
					ret = 0;
					wait_cnt += 1000;
					WT_STAT_FAST_CONN_INCR(session,
					    page_forcible_evict_blocked);
					break;
				} else
					WT_RET(ret);

				/*
				 * The result of a successful forced eviction
				 * is a page-state transition (potentially to
				 * an in-memory page we can use, or a restart
				 * return for our caller), continue the outer
				 * page-acquisition loop.
				 */
				continue;
			}

			/*
			 * Check if we need an autocommit transaction.  On
			 * failure, clear the hazard pointer before returning
			 * the error.
			 */
			if ((ret = __wt_txn_autocommit_check(session)) != 0) {
				WT_TRET(__wt_hazard_clear(session, page));
				return (ret);
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen != WT_READGEN_OLDEST &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_set(session);

			return (0);
		WT_ILLEGAL_VALUE(session);
		}

		/*
		 * We failed to get the page -- yield before retrying, and if
		 * we've yielded enough times, start sleeping so we don't burn
		 * CPU to no purpose.
		 *
		 * The sleep argument is the wait count capped at 10000; the
		 * wait count is doubled on every sleep.
		 */
		if (++wait_cnt < 1000)
			__wt_yield();
		else {
			sleep_cnt = WT_MIN(wait_cnt, 10000);
			wait_cnt *= 2;
			WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
			__wt_sleep(0, sleep_cnt);
		}
	}
}
예제 #12
0
/*
 * Regularly create, open a cursor and drop a table.
 * Measure how long each step takes, and flag an error if it exceeds the
 * configured maximum.
 */
static void *
cycle_idle_tables(void *arg)
{
	struct timespec start, stop;
	CONFIG *cfg;
	WT_SESSION *session;
	WT_CURSOR *cursor;
	int cycle_count, ret;
	char uri[512];

	cfg = (CONFIG *)arg;
	cycle_count = 0;

	if ((ret = cfg->conn->open_session(
	    cfg->conn, NULL, cfg->sess_config, &session)) != 0) {
		lprintf(cfg, ret, 0,
		    "Error opening a session on %s", cfg->home);
		return (NULL);
	}

	for (cycle_count = 0; cfg->idle_cycle_run; ++cycle_count) {
		snprintf(uri, 512, "%s_cycle%07d", cfg->uris[0], cycle_count);
		/* Don't busy cycle in this loop. */
		__wt_sleep(1, 0);

		/* Setup a start timer. */
		if ((ret = __wt_epoch(NULL, &start)) != 0) {
			lprintf(cfg, ret, 0,
			     "Get time failed in cycle_idle_tables.");
			cfg->error = ret;
			return (NULL);
		}

		/* Create a table. */
		if ((ret = session->create(
		    session, uri, cfg->table_config)) != 0) {
			if (ret == EBUSY)
				continue;
			lprintf(cfg, ret, 0,
			     "Table create failed in cycle_idle_tables.");
			cfg->error = ret;
			return (NULL);
		}
		if (check_timing(cfg, "create", start, &stop) != 0)
			return (NULL);
		start = stop;

		/* Open and close cursor. */
		if ((ret = session->open_cursor(
		    session, uri, NULL, NULL, &cursor)) != 0) {
			lprintf(cfg, ret, 0,
			     "Cursor open failed in cycle_idle_tables.");
			cfg->error = ret;
			return (NULL);
		}
		if ((ret = cursor->close(cursor)) != 0) {
			lprintf(cfg, ret, 0,
			     "Cursor close failed in cycle_idle_tables.");
			cfg->error = ret;
			return (NULL);
		}
		if (check_timing(cfg, "cursor", start, &stop) != 0)
			return (NULL);
		start = stop;

		/*
		 * Drop the table. Keep retrying on EBUSY failure - it is an
		 * expected return when checkpoints are happening.
		 */
		while ((ret = session->drop(
		    session, uri, "force,checkpoint_wait=false")) == EBUSY)
			__wt_sleep(1, 0);

		if (ret != 0 && ret != EBUSY) {
			lprintf(cfg, ret, 0,
			     "Table drop failed in cycle_idle_tables.");
			cfg->error = ret;
			return (NULL);
		}
		if (check_timing(cfg, "drop", start, &stop) != 0)
			return (NULL);
	}

	return (NULL);
}
예제 #13
0
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 *
 *	Loops on the WT_REF state until the page is acquired or the request
 *	can't be satisfied: WT_NOTFOUND when the caller's flags
 *	(WT_READ_CACHE, WT_READ_NO_WAIT, WT_READ_NO_EMPTY) rule out reading,
 *	waiting or returning a deleted page, WT_RESTART when the reference
 *	has split, or an error from a helper.  On acquisition the return
 *	value is 0 if WT_READ_NO_EVICT is set, otherwise the result of the
 *	autocommit check.  The file and line arguments exist only in
 *	diagnostic builds and are passed through to __wt_hazard_set.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	u_int sleep_cnt, wait_cnt;
	bool busy, cache_work, evict_soon, stalled;
	int force_attempts;

	btree = S2BT(session);

	/*
	 * Ignore reads of pages already known to be in cache, otherwise the
	 * eviction server can dominate these statistics.
	 */
	if (!LF_ISSET(WT_READ_CACHE)) {
		WT_STAT_FAST_CONN_INCR(session, cache_pages_requested);
		WT_STAT_FAST_DATA_INCR(session, cache_pages_requested);
	}

	/*
	 * A "continue" inside the switch re-examines the page state at once;
	 * a "break" out of the switch falls into the yield/sleep code at the
	 * bottom of the loop before retrying.
	 */
	for (evict_soon = stalled = false,
	    force_attempts = 0, sleep_cnt = wait_cnt = 0;;) {
		switch (ref->state) {
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_NO_EMPTY) &&
			    __wt_delete_page_skip(session, ref, false))
				return (WT_NOTFOUND);
			/* FALLTHROUGH */
		case WT_REF_DISK:
			/* Cache-only reads never go to disk. */
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, read it. If this thread is
			 * allowed to do eviction work, check for space in the
			 * cache.
			 */
			if (!LF_ISSET(WT_READ_NO_EVICT))
				WT_RET(__wt_cache_eviction_check(
				    session, 1, NULL));
			WT_RET(__page_read(session, ref));

			/*
			 * If configured to not trash the cache, leave the page
			 * generation unset, we'll set it before returning to
			 * the oldest read generation, so the page is forcibly
			 * evicted as soon as possible. We don't do that set
			 * here because we don't want to evict the page before
			 * we "acquire" it.
			 */
			evict_soon = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on another thread's read, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
			stalled = true;
			break;
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on eviction, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
			stalled = true;
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory.
			 *
			 * Get a hazard pointer if one is required. We cannot
			 * be evicting if no hazard pointer is required, we're
			 * done.
			 */
			if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
				goto skip_evict;

			/*
			 * The expected reason we can't get a hazard pointer is
			 * because the page is being evicted, yield, try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy) {
				WT_STAT_FAST_CONN_INCR(
				    session, page_busy_blocked);
				break;
			}

			/*
			 * If eviction is configured for this file, check to see
			 * if the page qualifies for forced eviction and update
			 * the page's generation number. If eviction isn't being
			 * done on this file, we're done.
			 * In-memory split of large pages is allowed while
			 * no_eviction is set on btree, whereas reconciliation
			 * is not allowed.
			 */
			if (LF_ISSET(WT_READ_NO_EVICT) ||
			    F_ISSET(session, WT_SESSION_NO_EVICTION) ||
			    (F_ISSET(btree, WT_BTREE_NO_EVICTION) &&
			     !F_ISSET(btree, WT_BTREE_NO_RECONCILE)))
				goto skip_evict;

			/*
			 * Forcibly evict pages that are too big.  At most 10
			 * attempts are made per call.
			 */
			if (force_attempts < 10 &&
			    __evict_force_check(session, ref)) {
				++force_attempts;
				ret = __wt_page_release_evict(session, ref);
				/* If forced eviction fails, stall. */
				if (ret == EBUSY) {
					ret = 0;
					WT_STAT_FAST_CONN_INCR(session,
					    page_forcible_evict_blocked);
					stalled = true;
					break;
				}
				WT_RET(ret);

				/*
				 * The result of a successful forced eviction
				 * is a page-state transition (potentially to
				 * an in-memory page we can use, or a restart
				 * return for our caller), continue the outer
				 * page-acquisition loop.
				 */
				continue;
			}

			/*
			 * If we read the page and are configured to not trash
			 * the cache, and no other thread has already used the
			 * page, set the oldest read generation so the page is
			 * forcibly evicted as soon as possible.
			 *
			 * Otherwise, if we read the page, or, if configured to
			 * update the page's read generation and the page isn't
			 * already flagged for forced eviction, update the page
			 * read generation.
			 */
			page = ref->page;
			if (page->read_gen == WT_READGEN_NOTSET) {
				if (evict_soon)
					__wt_page_evict_soon(session, ref);
				else
					__wt_cache_read_gen_new(session, page);
			} else if (!LF_ISSET(WT_READ_NO_GEN))
				__wt_cache_read_gen_bump(session, page);
skip_evict:
			/*
			 * Check if we need an autocommit transaction.
			 * Starting a transaction can trigger eviction, so skip
			 * it if eviction isn't permitted.
			 */
			return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
			    __wt_txn_autocommit_check(session));
		WT_ILLEGAL_VALUE(session);
		}

		/*
		 * We failed to get the page -- yield before retrying, and if
		 * we've yielded enough times, start sleeping so we don't burn
		 * CPU to no purpose.
		 *
		 * Once the stalled flag has been set it is never cleared, so
		 * every later pass through here skips the yield.
		 */
		if (stalled)
			wait_cnt += WT_THOUSAND;
		else if (++wait_cnt < WT_THOUSAND) {
			__wt_yield();
			continue;
		}

		/*
		 * If stalling and this thread is allowed to do eviction work,
		 * check if the cache needs help. If we do work for the cache,
		 * substitute that for a sleep.
		 */
		if (!LF_ISSET(WT_READ_NO_EVICT)) {
			WT_RET(
			    __wt_cache_eviction_check(session, 1, &cache_work));
			if (cache_work)
				continue;
		}
		/*
		 * The sleep argument grows by WT_THOUSAND on each sleep, up
		 * to a cap of 10000.
		 */
		sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000);
		WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
		__wt_sleep(0, sleep_cnt);
	}
}
예제 #14
0
/*
 * __wt_log_slot_join --
 *	Join a consolidated logging slot.
 *
 *	Atomically adds the caller's size (mysize, which must be non-zero) to
 *	the joined portion of the active slot's state, retrying until the
 *	update succeeds.  On return, myslot holds the joined slot and the
 *	caller's start and end offsets; the slot's sync and flush flags are
 *	set according to the WT_LOG_DSYNC, WT_LOG_FSYNC and WT_LOG_FLUSH bits
 *	in flags.
 */
void
__wt_log_slot_join(WT_SESSION_IMPL *session, uint64_t mysize,
    uint32_t flags, WT_MYSLOT *myslot)
{
	WT_CONNECTION_IMPL *conn;
	WT_LOG *log;
	WT_LOGSLOT *slot;
	uint64_t time_start, time_stop, usecs;
	int64_t flag_state, new_state, old_state, released;
	int32_t join_offset, new_join, wait_cnt;
	bool closed, diag_yield, raced, slept, unbuffered, yielded;

	conn = S2C(session);
	log = conn->log;
	time_start = time_stop = 0;

	WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT));
	WT_ASSERT(session, mysize != 0);

	/*
	 * There should almost always be a slot open.
	 */
	unbuffered = yielded = false;
	closed = raced = slept = false;
	wait_cnt = 0;
	/*
	 * Records larger than WT_LOG_SLOT_BUF_MAX are written unbuffered.
	 * Diagnostic builds also force every 1000th call down the unbuffered
	 * path, and enable the diagnostic yield on every 7th call.  The "if"
	 * opened in either preprocessor branch is closed after the #endif.
	 */
#ifdef	HAVE_DIAGNOSTIC
	diag_yield = (++log->write_calls % 7) == 0;
	if ((log->write_calls % WT_THOUSAND) == 0 ||
	    mysize > WT_LOG_SLOT_BUF_MAX) {
#else
	diag_yield = false;
	if (mysize > WT_LOG_SLOT_BUF_MAX) {
#endif
		unbuffered = true;
		F_SET(myslot, WT_MYSLOT_UNBUFFERED);
	}
	/* Retry until our size has been swapped into an open slot's state. */
	for (;;) {
		WT_BARRIER();
		slot = log->active_slot;
		old_state = slot->slot_state;
		if (WT_LOG_SLOT_OPEN(old_state)) {
			/*
			 * Try to join our size into the existing size and
			 * atomically write it back into the state.
			 */
			flag_state = WT_LOG_SLOT_FLAGS(old_state);
			released = WT_LOG_SLOT_RELEASED(old_state);
			join_offset = WT_LOG_SLOT_JOINED(old_state);
			/*
			 * Unbuffered joins add the WT_LOG_SLOT_UNBUFFERED
			 * constant instead of the record size.
			 */
			if (unbuffered)
				new_join = join_offset + WT_LOG_SLOT_UNBUFFERED;
			else
				new_join = join_offset + (int32_t)mysize;
			new_state = (int64_t)WT_LOG_SLOT_JOIN_REL(
			    (int64_t)new_join, (int64_t)released,
			    (int64_t)flag_state);

			/*
			 * Braces used due to potential empty body warning.
			 */
			if (diag_yield) {
				WT_DIAGNOSTIC_YIELD;
			}
			/*
			 * Attempt to swap our size into the state.
			 */
			if (__wt_atomic_casiv64(
			    &slot->slot_state, old_state, new_state))
				break;
			WT_STAT_CONN_INCR(session, log_slot_races);
			raced = true;
		} else {
			/* Only closed slots count towards sleeping below. */
			WT_STAT_CONN_INCR(session, log_slot_active_closed);
			closed = true;
			++wait_cnt;
		}
		/* Start timing at the first failed attempt. */
		if (!yielded)
			time_start = __wt_clock(session);
		yielded = true;
		/*
		 * The slot is no longer open or we lost the race to
		 * update it.  Yield and try again.
		 */
		if (wait_cnt < WT_THOUSAND)
			__wt_yield();
		else {
			__wt_sleep(0, WT_THOUSAND);
			slept = true;
		}
	}
	/*
	 * We joined this slot.  Fill in our information to return to
	 * the caller.
	 */
	if (!yielded)
		WT_STAT_CONN_INCR(session, log_slot_immediate);
	else {
		/* Record how long the join took and why it had to wait. */
		WT_STAT_CONN_INCR(session, log_slot_yield);
		time_stop = __wt_clock(session);
		usecs = WT_CLOCKDIFF_US(time_stop, time_start);
		WT_STAT_CONN_INCRV(session, log_slot_yield_duration, usecs);
		if (closed)
			WT_STAT_CONN_INCR(session, log_slot_yield_close);
		if (raced)
			WT_STAT_CONN_INCR(session, log_slot_yield_race);
		if (slept)
			WT_STAT_CONN_INCR(session, log_slot_yield_sleep);
	}
	/* Pass the caller's sync and flush requests on to the slot. */
	if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC))
		F_SET(slot, WT_SLOT_SYNC_DIR);
	if (LF_ISSET(WT_LOG_FLUSH))
		F_SET(slot, WT_SLOT_FLUSH);
	if (LF_ISSET(WT_LOG_FSYNC))
		F_SET(slot, WT_SLOT_SYNC);
	/* For an unbuffered join the slot records the real record size. */
	if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) {
		WT_ASSERT(session, slot->slot_unbuffered == 0);
		WT_STAT_CONN_INCR(session, log_slot_unbuffered);
		slot->slot_unbuffered = (int64_t)mysize;
	}
	myslot->slot = slot;
	myslot->offset = join_offset;
	myslot->end_offset = (wt_off_t)((uint64_t)join_offset + mysize);
}

/*
 * __wt_log_slot_release --
 *	Called by each thread in a consolidated group once it has finished
 *	copying its piece of the log into the memory buffer: adds the
 *	thread's released size to the slot state and returns the new state.
 */
int64_t
__wt_log_slot_release(WT_MYSLOT *myslot, int64_t size)
{
	WT_LOGSLOT *slot;
	wt_off_t cur_offset, my_start;
	int64_t my_size, rel_size;

	slot = myslot->slot;
	my_start = slot->slot_start_offset + myslot->offset;

	/*
	 * The slot tracks the starting offset of the last record written in
	 * it (as opposed to the offset of the slot's first record).  Move
	 * that offset up to our start if ours is larger; when the swap loses
	 * a race with another thread, look at the new value and try again.
	 */
	for (;;) {
		cur_offset = slot->slot_last_offset;
		if (cur_offset >= my_start)
			break;
		if (__wt_atomic_casiv64(
		    &slot->slot_last_offset, cur_offset, my_start))
			break;
		WT_BARRIER();
	}

	/*
	 * Add our size to the released portion of the state and return the
	 * result.  Unbuffered joins release the WT_LOG_SLOT_UNBUFFERED
	 * constant rather than the size passed in.
	 */
	if (F_ISSET(myslot, WT_MYSLOT_UNBUFFERED))
		rel_size = WT_LOG_SLOT_UNBUFFERED;
	else
		rel_size = size;
	my_size = (int64_t)WT_LOG_SLOT_JOIN_REL((int64_t)0, rel_size, 0);
	return (__wt_atomic_addiv64(&slot->slot_state, my_size));
}
예제 #15
0
/*
 * __wt_page_out --
 *	Throw away an in-memory page along with all of the memory attached to
 *	it.  The caller's pointer is set to NULL before anything else is done.
 */
void
__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
{
	WT_PAGE *page;
	WT_PAGE_HEADER *disk;
	WT_PAGE_MODIFY *mod;

	/* Take the reference away from the caller to help expose races. */
	page = *pagep;
	*pagep = NULL;

	if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
		__wt_page_modify_clear(session, page);

	/*
	 * A page reaching this point must not be dirty, must not be queued
	 * for eviction and must not be locked.
	 */
	WT_ASSERT(session, !__wt_page_is_modified(page));
	WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
	WT_ASSERT(session, !__wt_fair_islocked(session, &page->page_lock));

#ifdef HAVE_DIAGNOSTIC
	{
	WT_HAZARD *hp;
	int attempts;

	/*
	 * No other thread should hold a hazard pointer on a page being
	 * discarded.  Readers publish their hazard pointer before they
	 * re-check the page state, so a pointer seen here may be a harmless
	 * race rather than a bug: poll up to 100 times, sleeping between
	 * polls, before complaining.
	 */
	hp = NULL;
	for (attempts = 0; attempts < 100; ++attempts) {
		hp = __wt_page_hazard_check(session, page);
		if (hp == NULL)
			break;
		__wt_sleep(0, 10000);
	}
	if (hp != NULL)
		__wt_errx(session,
		    "discarded page has hazard pointer: (%p: %s, line %d)",
		    hp->page, hp->file, hp->line);
	WT_ASSERT(session, hp == NULL);
	}
#endif

	/*
	 * An internal page that was a root page when it split may have a
	 * further page linked from its modify structure; discard that one
	 * first (recursively).
	 */
	if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT) {
		mod = page->modify;
		if (mod != NULL && mod->mod_root_split != NULL)
			__wt_page_out(session, &mod->mod_root_split);
	}

	/* Let the cache account for the page going away. */
	__wt_cache_page_evict(session, page);

	/*
	 * When the connection is configured to leak memory, skip the work of
	 * actually freeing anything.
	 */
	if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY))
		return;

	/* Modification information goes first. */
	if (page->modify != NULL)
		__free_page_modify(session, page);

	/* Then whatever is specific to the page type. */
	switch (page->type) {
	case WT_PAGE_ROW_LEAF:
		__free_page_row_leaf(session, page);
		break;
	case WT_PAGE_COL_VAR:
		__free_page_col_var(session, page);
		break;
	case WT_PAGE_ROW_INT:
	case WT_PAGE_COL_INT:
		__free_page_int(session, page);
		break;
	case WT_PAGE_COL_FIX:
		break;
	}

	/* Release the disk image, whether it was allocated or mapped. */
	disk = (WT_PAGE_HEADER *)page->dsk;
	if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
		__wt_overwrite_and_free_len(session, disk, disk->mem_size);
	if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
		(void)__wt_mmap_discard(session, disk, disk->mem_size);

	/* Finally, the page structure itself. */
	__wt_overwrite_and_free(session, page);
}
예제 #16
0
파일: bt_read.c 프로젝트: qihsh/mongo
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 *
 *	The function loops on ref->state until the page can be handed back or
 *	a terminal condition is reached.  The flags argument is tested for
 *	WT_READ_CACHE, WT_READ_NO_EVICT, WT_READ_NO_GEN, WT_READ_NO_WAIT and
 *	WT_READ_WONT_NEED.  In diagnostic builds the additional file and line
 *	arguments are passed through to __wt_hazard_set.
 *
 *	Returns WT_NOTFOUND when the page is not available in memory and the
 *	flags forbid reading it (WT_READ_CACHE) or waiting for it
 *	(WT_READ_NO_WAIT), WT_RESTART when the reference is in the
 *	WT_REF_SPLIT state, any error returned by the helpers called, and
 *	otherwise 0 or the result of __wt_txn_autocommit_check.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	u_int sleep_cnt, wait_cnt;
	int busy, cache_work, force_attempts, oldgen, stalled;

	btree = S2BT(session);
	/*
	 * Initialized once and never cleared inside the loop: after the first
	 * stall, every later retry skips the yield phase at the bottom of the
	 * loop.
	 */
	stalled = 0;

	for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			/* Cache-only reads never go to disk. */
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, read it. If this thread is
			 * allowed to do eviction work, check for space in the
			 * cache.
			 */
			if (!LF_ISSET(WT_READ_NO_EVICT))
				WT_RET(__wt_cache_eviction_check(
				    session, 1, NULL));
			WT_RET(__page_read(session, ref));
			/*
			 * Remember whether this read should not keep the page
			 * in cache; it is acted on in the WT_REF_MEM case.
			 */
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			/* Go around again and re-examine the state. */
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on another thread's read, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
			stalled = 1;
			break;
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on eviction, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
			stalled = 1;
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory.
			 *
			 * Get a hazard pointer if one is required. We cannot
			 * be evicting if no hazard pointer is required, we're
			 * done.
			 */
			if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
				goto skip_evict;

			/*
			 * The expected reason we can't get a hazard pointer is
			 * because the page is being evicted, yield, try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy) {
				/*
				 * Not flagged as a stall here: unless an
				 * earlier iteration stalled, the retry goes
				 * through the yield phase below.
				 */
				WT_STAT_FAST_CONN_INCR(
				    session, page_busy_blocked);
				break;
			}

			/*
			 * If eviction is configured for this file, check to see
			 * if the page qualifies for forced eviction and update
			 * the page's generation number. If eviction isn't being
			 * done on this file, we're done.
			 */
			if (LF_ISSET(WT_READ_NO_EVICT) ||
			    F_ISSET(session, WT_SESSION_NO_EVICTION) ||
			    F_ISSET(btree, WT_BTREE_NO_EVICTION))
				goto skip_evict;

			/*
			 * Forcibly evict pages that are too big.  At most 10
			 * forced-eviction attempts are made per call.
			 */
			page = ref->page;
			if (force_attempts < 10 &&
			    __evict_force_check(session, page)) {
				++force_attempts;
				ret = __wt_page_release_evict(session, ref);
				/* If forced eviction fails, stall. */
				if (ret == EBUSY) {
					ret = 0;
					WT_STAT_FAST_CONN_INCR(session,
					    page_forcible_evict_blocked);
					stalled = 1;
					break;
				}
				WT_RET(ret);

				/*
				 * The result of a successful forced eviction
				 * is a page-state transition (potentially to
				 * an in-memory page we can use, or a restart
				 * return for our caller), continue the outer
				 * page-acquisition loop.
				 */
				continue;
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen != WT_READGEN_OLDEST &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_bump(session);
skip_evict:
			/*
			 * Check if we need an autocommit transaction.
			 * Starting a transaction can trigger eviction, so skip
			 * it if eviction isn't permitted.
			 */
			return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
			    __wt_txn_autocommit_check(session));
		/* Presumably supplies the switch's default case -- confirm. */
		WT_ILLEGAL_VALUE(session);
		}

		/*
		 * We failed to get the page -- yield before retrying, and if
		 * we've yielded enough times, start sleeping so we don't burn
		 * CPU to no purpose.
		 *
		 * wait_cnt is never reset, so once it reaches 1000 (either by
		 * yielding or by a stall adding 1000 at once) no further
		 * yields happen in this call.
		 */
		if (stalled)
			wait_cnt += 1000;
		else if (++wait_cnt < 1000) {
			__wt_yield();
			continue;
		}

		/*
		 * If stalling and this thread is allowed to do eviction work,
		 * check if the cache needs help. If we do work for the cache,
		 * substitute that for a sleep.
		 */
		if (!LF_ISSET(WT_READ_NO_EVICT)) {
			WT_RET(
			    __wt_cache_eviction_check(session, 1, &cache_work));
			if (cache_work)
				continue;
		}
		/*
		 * Back off: each sleep is 1000 longer than the previous one,
		 * capped at 10000 (passed as the second argument of
		 * __wt_sleep, presumably microseconds -- confirm).
		 */
		sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000);
		WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
		__wt_sleep(0, sleep_cnt);
	}
}
예제 #17
0
/*
 * __clsm_open_cursors --
 *	Open cursors for the current set of files.
 *
 *	Brings the LSM cursor's per-chunk cursors and Bloom filters in line
 *	with the tree's current chunk array: cursors already open on matching
 *	chunks are kept, stale ones are closed and the remainder are opened.
 *	On success the cursor's dsk_gen is set to the tree's dsk_gen.
 *
 *	clsm: the LSM cursor being refreshed.
 *	update: non-zero when the cursor is being opened for an update.
 *	start_chunk: offset into the tree's chunk array of the first chunk
 *	    covered by this cursor.
 *	start_id: the chunk ID expected at start_chunk; merge cursors use it
 *	    to re-find their starting offset if the chunk array has shifted.
 *
 *	Returns 0 on success, otherwise an error from one of the helpers
 *	called.  The LSM tree lock is never held on return.
 */
static int
__clsm_open_cursors(
    WT_CURSOR_LSM *clsm, int update, u_int start_chunk, uint32_t start_id)
{
	WT_CURSOR *c, **cp, *primary;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	WT_TXN *txn;
	const char *checkpoint, *ckpt_cfg[3];
	uint64_t saved_gen;
	u_int i, nchunks, ngood, nupdates;
	int locked;

	c = &clsm->iface;
	session = (WT_SESSION_IMPL *)c->session;
	txn = &session->txn;
	lsm_tree = clsm->lsm_tree;
	chunk = NULL;

	/* Configuration used to open a chunk's checkpoint rather than the
	 * live file. */
	ckpt_cfg[0] = WT_CONFIG_BASE(session, session_open_cursor);
	ckpt_cfg[1] = "checkpoint=" WT_CHECKPOINT ",raw";
	ckpt_cfg[2] = NULL;

	/* Copy the key, so we don't lose the cursor position. */
	if (F_ISSET(c, WT_CURSTD_KEY_INT) && !WT_DATA_IN_ITEM(&c->key))
		WT_RET(__wt_buf_set(
		    session, &c->key, c->key.data, c->key.size));

	F_CLR(clsm, WT_CLSM_ITERATE_NEXT | WT_CLSM_ITERATE_PREV);

	if (update) {
		if (txn->isolation == TXN_ISO_SNAPSHOT)
			F_SET(clsm, WT_CLSM_OPEN_SNAPSHOT);
	} else
		F_SET(clsm, WT_CLSM_OPEN_READ);

	/* From here on, errors must go through the err label to unlock. */
	WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
	locked = 1;
	/*
	 * If there is no in-memory chunk in the tree for an update operation,
	 * create one.
	 *
	 * !!!
	 * It is exceeding unlikely that we get here at all, but if we were to
	 * switch chunks in this thread and our transaction roll back, it would
	 * leave the metadata inconsistent.  Signal for the LSM worker thread
	 * to create the chunk instead to avoid the issue.
	 */
	if (update && (lsm_tree->nchunks == 0 ||
	    (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) == NULL ||
	    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))) {
		/* Release our lock because switch will get a write lock. */
		F_SET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
		locked = 0;
		WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
		WT_ERR(__wt_cond_signal(session, lsm_tree->work_cond));

		/*
		 * Give the worker thread a chance to run before locking the
		 * tree again -- we will loop in __clsm_enter until there is an
		 * in-memory chunk in the tree.
		 */
		__wt_sleep(0, 1000);
		WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
		locked = 1;
	}
	F_SET(session, WT_SESSION_NO_CACHE_CHECK);

	/* Merge cursors have already figured out how many chunks they need. */
retry:	if (F_ISSET(clsm, WT_CLSM_MERGE)) {
		nchunks = clsm->nchunks;
		ngood = 0;

		/*
		 * We may have raced with another merge completing.  Check that
		 * we're starting at the right offset in the chunk array.
		 */
		if (start_chunk >= lsm_tree->nchunks ||
		    lsm_tree->chunk[start_chunk]->id != start_id) {
			for (start_chunk = 0;
			    start_chunk < lsm_tree->nchunks;
			    start_chunk++) {
				chunk = lsm_tree->chunk[start_chunk];
				if (chunk->id == start_id)
					break;
			}
			/* We have to find the start chunk: merge locked it. */
			WT_ASSERT(session, start_chunk < lsm_tree->nchunks);
		}

		WT_ASSERT(session, start_chunk + nchunks <= lsm_tree->nchunks);
	} else {
		nchunks = lsm_tree->nchunks;

		/*
		 * If we are only opening the cursor for updates, only open the
		 * primary chunk, plus any other chunks that might be required
		 * to detect snapshot isolation conflicts.
		 */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			WT_ERR(__wt_realloc_def(session,
			    &clsm->txnid_alloc, nchunks,
			    &clsm->txnid_max));
		if (F_ISSET(clsm, WT_CLSM_OPEN_READ))
			ngood = nupdates = 0;
		else if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
			/*
			 * Keep going until all updates in the next
			 * chunk are globally visible.  Copy the maximum
			 * transaction IDs into the cursor as we go.
			 *
			 * The tree can still be empty here (the switch
			 * requested above may not have happened yet).  In
			 * that case nchunks - 1 wraps around, so the walk
			 * must be skipped or it would index far outside both
			 * the chunk and txnid_max arrays.  ngood is left
			 * wrapped, exactly as in the plain update branch
			 * below, and is reset to zero further down because
			 * nupdates is non-zero.
			 */
			for (ngood = nchunks - 1, nupdates = 1;
			    nchunks > 0 && ngood > 0;
			    ngood--, nupdates++) {
				chunk = lsm_tree->chunk[ngood - 1];
				clsm->txnid_max[ngood - 1] =
				    chunk->txnid_max;
				if (__wt_txn_visible_all(
				    session, chunk->txnid_max))
					break;
			}
		} else {
			nupdates = 1;
			ngood = nchunks - 1;
		}

		/* Check how many cursors are already open. */
		for (cp = clsm->cursors + ngood;
		    ngood < clsm->nchunks && ngood < nchunks;
		    cp++, ngood++) {
			chunk = lsm_tree->chunk[ngood];

			/* If the cursor isn't open yet, we're done. */
			if (*cp == NULL)
				break;

			/* Easy case: the URIs don't match. */
			if (strcmp((*cp)->uri, chunk->uri) != 0)
				break;

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			if (checkpoint == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty)
				break;

			/* Make sure the Bloom config matches. */
			if (clsm->blooms[ngood] == NULL &&
			    F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
				break;
		}

		/*
		 * Spurious generation bump?  Every open cursor still matches,
		 * so just record the new generation; the err label is also
		 * the normal exit path and ret is still zero here.
		 */
		if (ngood == clsm->nchunks && clsm->nchunks == nchunks) {
			clsm->dsk_gen = lsm_tree->dsk_gen;
			goto err;
		}

		/*
		 * Close any cursors we no longer need.  If the cursor is a
		 * pure update cursor, close everything -- we usually only need
		 * a single chunk open in that case and we haven't walked all
		 * of the other slots in the loop above.
		 *
		 * Drop the LSM tree lock while we do this: if the cache is
		 * full, we may block while closing a cursor.  Save the
		 * generation number and retry if it has changed under us.
		 */
		if (!F_ISSET(clsm, WT_CLSM_OPEN_READ) && nupdates > 0)
			ngood = 0;
		if (clsm->cursors != NULL && ngood < clsm->nchunks) {
			saved_gen = lsm_tree->dsk_gen;
			locked = 0;
			WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
			WT_ERR(__clsm_close_cursors(
			    clsm, ngood, clsm->nchunks));
			WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 0));
			locked = 1;
			if (lsm_tree->dsk_gen != saved_gen)
				goto retry;
		}

		/* Detach from our old primary. */
		clsm->primary_chunk = NULL;
		clsm->current = NULL;
	}

	/* Make room for one Bloom filter and one cursor per chunk. */
	WT_ERR(__wt_realloc_def(session,
	    &clsm->bloom_alloc, nchunks, &clsm->blooms));
	WT_ERR(__wt_realloc_def(session,
	    &clsm->cursor_alloc, nchunks, &clsm->cursors));

	clsm->nchunks = nchunks;

	/* Open the cursors for chunks that have changed. */
	for (i = ngood, cp = clsm->cursors + i; i != nchunks; i++, cp++) {
		chunk = lsm_tree->chunk[i + start_chunk];
		/* Copy the maximum transaction ID. */
		if (F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT))
			clsm->txnid_max[i] = chunk->txnid_max;

		/*
		 * Read from the checkpoint if the file has been written.
		 * Once all cursors switch, the in-memory tree can be evicted.
		 */
		WT_ASSERT(session, *cp == NULL);
		ret = __wt_open_cursor(session, chunk->uri, c,
		    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !chunk->empty) ?
			ckpt_cfg : NULL, cp);

		/*
		 * XXX kludge: we may have an empty chunk where no checkpoint
		 * was written.  If so, try to open the ordinary handle on that
		 * chunk instead.
		 */
		if (ret == WT_NOTFOUND &&
		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
			ret = __wt_open_cursor(
			    session, chunk->uri, c, NULL, cp);
			if (ret == 0)
				chunk->empty = 1;
		}
		WT_ERR(ret);

		/* Merge cursors do not open Bloom filters. */
		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
		    !F_ISSET(clsm, WT_CLSM_MERGE))
			WT_ERR(__wt_bloom_open(session, chunk->bloom_uri,
			    lsm_tree->bloom_bit_count,
			    lsm_tree->bloom_hash_count,
			    c, &clsm->blooms[i]));

		/* Child cursors always use overwrite and raw mode. */
		F_SET(*cp, WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
	}

	/* The last chunk is our new primary. */
	if (chunk != NULL && !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		clsm->primary_chunk = chunk;
		primary = clsm->cursors[clsm->nchunks - 1];
		WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)(primary))->btree,
		    __wt_btree_evictable(session, 0));
	}

	clsm->dsk_gen = lsm_tree->dsk_gen;
err:	F_CLR(session, WT_SESSION_NO_CACHE_CHECK);
#ifdef HAVE_DIAGNOSTIC
	/* Check that all cursors are open as expected. */
	if (ret == 0 && F_ISSET(clsm, WT_CLSM_OPEN_READ)) {
		for (i = 0, cp = clsm->cursors; i != clsm->nchunks; cp++, i++) {
			chunk = lsm_tree->chunk[i + start_chunk];

			/* Make sure the cursor is open. */
			WT_ASSERT(session, *cp != NULL);

			/* Easy case: the URIs should match. */
			WT_ASSERT(session, strcmp((*cp)->uri, chunk->uri) == 0);

			/* Make sure the checkpoint config matches. */
			checkpoint = ((WT_CURSOR_BTREE *)*cp)->
			    btree->dhandle->checkpoint;
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
			    !chunk->empty) ?
			    checkpoint != NULL : checkpoint == NULL);

			/* Make sure the Bloom config matches. */
			WT_ASSERT(session,
			    (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM) &&
			    !F_ISSET(clsm, WT_CLSM_MERGE)) ?
			    clsm->blooms[i] != NULL : clsm->blooms[i] == NULL);
		}
	}
#endif
	/* Only unlock if we still hold the tree lock on this path. */
	if (locked)
		WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	return (ret);
}