Example no. 1
0
/*
 * __wt_lsm_tree_switch --
 *	Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk, **cp;
	uint32_t in_memory, new_id;

	new_id = WT_ATOMIC_ADD(lsm_tree->last, 1);

	if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) >
	    lsm_tree->chunk_alloc)
		WT_ERR(__wt_realloc(session,
		    &lsm_tree->chunk_alloc,
		    WT_MAX(10 * sizeof(*lsm_tree->chunk),
		    2 * lsm_tree->chunk_alloc),
		    &lsm_tree->chunk));

	/*
	 * In the steady state, we expect that the checkpoint worker thread
	 * will keep up with inserts.  If not, we throttle the insert rate to
	 * avoid filling the cache with in-memory chunks.  Threads sleep every
	 * 100 operations, so take that into account in the calculation.
	 */
	for (in_memory = 1, cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
	    in_memory < lsm_tree->nchunks && !F_ISSET(*cp, WT_LSM_CHUNK_ONDISK);
	    ++in_memory, --cp)
		;
	if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 2)
		lsm_tree->throttle_sleep = 0;
	else if (in_memory == lsm_tree->nchunks ||
	    F_ISSET(*cp, WT_LSM_CHUNK_STABLE)) {
		/*
		 * No checkpoint has completed this run.  Keep slowing down
		 * inserts until one does.
		 */
		lsm_tree->throttle_sleep =
		    WT_MAX(20, 2 * lsm_tree->throttle_sleep);
	} else {
		chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
		lsm_tree->throttle_sleep = (long)((in_memory - 2) *
		    WT_TIMEDIFF(chunk->create_ts, (*cp)->create_ts) /
		    (20 * in_memory * chunk->count));
	}

	WT_VERBOSE_ERR(session, lsm, "Tree switch to: %" PRIu32 ", throttle %d",
	    new_id, (int)lsm_tree->throttle_sleep);

	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = new_id;
	lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	++lsm_tree->dsk_gen;
	F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

err:	/* TODO: mark lsm_tree bad on error(?) */
	return (ret);
}
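
The array of chunk pointers in the function above is grown geometrically rather than one slot at a time: room for at least ten pointers, otherwise double the current allocation. Below is a minimal standalone sketch of that growth policy; the helper is hypothetical, and plain realloc plus an explicit memset stands in for the WiredTiger allocator.

#include <stdlib.h>
#include <string.h>

/*
 * chunk_array_grow --
 *	Illustrative only: grow a pointer array so it can hold one more
 *	entry, allocating room for at least 10 pointers and doubling the
 *	allocation thereafter, mirroring the WT_MAX() expression above.
 */
static int
chunk_array_grow(void ***chunkp, size_t *alloc_sizep, size_t nchunks)
{
	size_t new_size;
	void **p;

	/* Nothing to do if there is already room for one more entry. */
	if ((nchunks + 1) * sizeof(void *) <= *alloc_sizep)
		return (0);

	new_size = 10 * sizeof(void *) > 2 * *alloc_sizep ?
	    10 * sizeof(void *) : 2 * *alloc_sizep;
	if ((p = realloc(*chunkp, new_size)) == NULL)
		return (-1);
	/* Clear the newly allocated tail so new slots read as NULL. */
	memset((char *)p + *alloc_sizep, 0, new_size - *alloc_sizep);

	*chunkp = p;
	*alloc_sizep = new_size;
	return (0);
}
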
Example no. 2
0
/*
 * __session_compact_check_timeout --
 *	Check whether the elapsed time of the compact operation has exceeded
 *	the configured maximum.
 */
static int
__session_compact_check_timeout(WT_SESSION_IMPL *session, struct timespec begin)
{
	struct timespec end;

	if (session->compact->max_time == 0)
		return (0);

	WT_RET(__wt_epoch(session, &end));
	if (session->compact->max_time < WT_TIMEDIFF(end, begin) / WT_BILLION)
		WT_RET(ETIMEDOUT);
	return (0);
}
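
For context, a check like the one above is meant to be polled from inside a long-running compaction loop, with the start time captured once up front and ETIMEDOUT returned when the budget is exhausted. Here is a minimal standalone sketch of that calling pattern, using clock_gettime instead of the __wt_epoch/WT_TIMEDIFF wrappers; run_with_timeout and do_unit_of_work are hypothetical names.

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <time.h>

/*
 * run_with_timeout --
 *	Illustrative only: run units of work until done, or until more than
 *	max_seconds have elapsed since the recorded start time.
 */
static int
run_with_timeout(uint64_t max_seconds, bool (*do_unit_of_work)(void))
{
	struct timespec begin, now;

	if (clock_gettime(CLOCK_MONOTONIC, &begin) != 0)
		return (errno);

	while (do_unit_of_work()) {
		if (max_seconds == 0)		/* 0 means no time limit */
			continue;
		if (clock_gettime(CLOCK_MONOTONIC, &now) != 0)
			return (errno);
		if ((uint64_t)(now.tv_sec - begin.tv_sec) > max_seconds)
			return (ETIMEDOUT);
	}
	return (0);
}
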
Example no. 3
0
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, int syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, leaf_bytes;
	uint64_t internal_pages, leaf_pages;
	uint32_t flags;
	bool evict_reset;

	btree = S2BT(session);

	flags = WT_READ_CACHE | WT_READ_NO_GEN;
	walk = NULL;
	txn = &session->txn;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock; serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write the hottest pages: checkpoint will have
			 * to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    __wt_txn_visible_all(
			    session, page->modify->update_txn)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint; the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock; serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * When internal pages are being reconciled by checkpoint their
		 * child pages cannot disappear from underneath them or be split
		 * into them, nor can underlying blocks be freed until the block
		 * lists for the checkpoint are stable.  Set the checkpointing
		 * flag to block eviction of dirty pages until the checkpoint's
		 * internal page pass is complete, then wait for any existing
		 * eviction to complete.
		 */
		btree->checkpointing = 1;
		WT_FULL_BARRIER();

		WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
		if (evict_reset)
			__wt_evict_file_exclusive_off(session);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			/*
			 * If we have a page, and it was ever modified, track
			 * the highest transaction ID in the tree.  We do this
			 * here because we want the value after reconciling
			 * dirty pages.
			 */
			if (walk != NULL && walk->page != NULL &&
			    (mod = walk->page->modify) != NULL &&
			    WT_TXNID_LT(btree->rec_max_txn, mod->rec_max_txn))
				btree->rec_max_txn = mod->rec_max_txn;

			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			page = walk->page;
			mod = page->modify;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(page))
				continue;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 *
			 * Mark the tree dirty: the checkpoint marked it clean
			 * and we can't skip future checkpoints until this page
			 * is written.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
			    WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn) &&
			    mod->rec_result != WT_PM_REC_REWRITE) {
				__wt_page_modify_set(session, page);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}
			WT_ERR(__wt_reconcile(session, walk, NULL, 0));
		}
		break;
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF(end, start) / WT_MILLION));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	if (txn->isolation == WT_ISO_READ_COMMITTED && session->ncursors == 0)
		__wt_txn_release_snapshot(session);

	if (btree->checkpointing) {
		/*
		 * Update the checkpoint generation for this handle so visible
		 * updates newer than the checkpoint can be evicted.
		 *
		 * This has to be published before eviction is enabled again,
		 * so that eviction knows that the checkpoint has completed.
		 */
		WT_PUBLISH(btree->checkpoint_gen,
		    S2C(session)->txn_global.checkpoint_gen);
		WT_STAT_FAST_DATA_SET(session,
		    btree_checkpoint_generation, btree->checkpoint_gen);

		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = 0;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress.  Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
		WT_RET(btree->bm->sync(btree->bm, session, true));

	return (ret);
}
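
One detail worth calling out in the checkpoint path above is the pairing of the btree->checkpointing flag with WT_FULL_BARRIER()/WT_PUBLISH(): the flag must be visible to concurrent eviction threads before the dirty-page walk starts, and the cleared flag must be published promptly afterwards so stalled eviction can resume. Below is a rough standalone analogue of that pattern using C11 atomics; checkpoint_active and write_dirty_pages are hypothetical stand-ins, not WiredTiger code.

#include <stdatomic.h>
#include <stdbool.h>

static atomic_bool checkpoint_active;

/*
 * checkpoint_pass --
 *	Illustrative only: publish the flag before writing dirty pages and
 *	publish the cleared flag as soon as the pass is done, roughly what
 *	the barriers around btree->checkpointing achieve above.
 */
static void
checkpoint_pass(void (*write_dirty_pages)(void))
{
	/* Roughly: btree->checkpointing = 1; WT_FULL_BARRIER(); */
	atomic_store_explicit(&checkpoint_active, true, memory_order_seq_cst);

	write_dirty_pages();

	/* Roughly: btree->checkpointing = 0; WT_FULL_BARRIER(); */
	atomic_store_explicit(&checkpoint_active, false, memory_order_seq_cst);
}
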
Example no. 4
0
/*
 * __wt_lsm_tree_throttle --
 *	Calculate whether LSM updates need to be throttled. Must be called
 *	with the LSM tree lock held.
 */
void
__wt_lsm_tree_throttle(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only)
{
	WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
	uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
	uint32_t in_memory, gen0_chunks;

	/* Never throttle in small trees. */
	if (lsm_tree->nchunks < 3) {
		lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0;
		return;
	}

	cache_sz = S2C(session)->cache_size;

	/*
	 * In the steady state, we expect that the checkpoint worker thread
	 * will keep up with inserts.  If not, throttle the insert rate to
	 * avoid filling the cache with in-memory chunks.  Threads sleep every
	 * 100 operations, so take that into account in the calculation.
	 *
	 * Also throttle based on whether merge threads are keeping up.  If
	 * there are enough chunks that have never been merged we slow down
	 * inserts so that merges have some chance of keeping up.
	 *
	 * Count the number of in-memory chunks, the number of unmerged chunks
	 * on disk, and find the most recent on-disk chunk (if any).
	 */
	record_count = 1;
	gen0_chunks = in_memory = 0;
	ondisk = NULL;
	for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
	    cp >= lsm_tree->chunk;
	    --cp)
		if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
			record_count += (*cp)->count;
			++in_memory;
		} else {
			/*
			 * Assign ondisk to the last chunk that has been
			 * flushed since the tree was last opened (i.e., it's on
			 * disk and stable is not set).
			 */
			if (ondisk == NULL &&
			    ((*cp)->generation == 0 &&
			    !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
				ondisk = *cp;

			if ((*cp)->generation == 0 &&
			    !F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
				++gen0_chunks;
		}

	last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];

	/* Checkpoint throttling, based on the number of in-memory chunks. */
	if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
		lsm_tree->ckpt_throttle = 0;
	else if (decrease_only)
		; /* Nothing to do */
	else if (ondisk == NULL) {
		/*
		 * No checkpoint has completed this run.  Keep slowing down
		 * inserts until one does.
		 */
		lsm_tree->ckpt_throttle =
		    WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
	} else {
		WT_ASSERT(session,
		    WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0);
		timediff =
		    WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts);
		lsm_tree->ckpt_throttle =
		    (long)((in_memory - 2) * timediff / (20 * record_count));

		/*
		 * Get more aggressive as the number of in memory chunks
		 * consumes a large proportion of the cache. In memory chunks
		 * are allowed to grow up to twice as large as the configured
		 * value when checkpoints aren't keeping up. That worst case
		 * is when this calculation is relevant.
		 * There is nothing particularly special about the chosen
		 * multipliers.
		 */
		cache_used = in_memory * lsm_tree->chunk_size * 2;
		if (cache_used > cache_sz * 0.8)
			lsm_tree->ckpt_throttle *= 5;
	}

	/*
	 * Merge throttling, based on the number of on-disk, level 0 chunks.
	 *
	 * Don't throttle if the tree has less than a single level's number
	 * of chunks.
	 */
	if (lsm_tree->nchunks < lsm_tree->merge_max)
		lsm_tree->merge_throttle = 0;
	else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD)
		WT_LSM_MERGE_THROTTLE_DECREASE(lsm_tree->merge_throttle);
	else if (!decrease_only)
		WT_LSM_MERGE_THROTTLE_INCREASE(lsm_tree->merge_throttle);

	/* Put an upper bound of 1s on both throttle calculations. */
	lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle);
	lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle);

	/*
	 * Update our estimate of how long each in-memory chunk stays active.
	 * Filter out some noise by keeping a weighted history of the
	 * calculated value.  Wait until we have enough chunks that we can
	 * check that the new value is sane: otherwise, after a long idle
	 * period, we can calculate a crazy value.
	 */
	if (in_memory > 1 && ondisk != NULL) {
		prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
		WT_ASSERT(session, prev_chunk->generation == 0);
		WT_ASSERT(session, WT_TIMECMP(
		    last_chunk->create_ts, prev_chunk->create_ts) >= 0);
		timediff =
		    WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts);
		WT_ASSERT(session,
		    WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0);
		oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts);
		if (timediff < 10 * oldtime)
			lsm_tree->chunk_fill_ms =
			    (3 * lsm_tree->chunk_fill_ms +
			    timediff / 1000000) / 4;
	}
}
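
To make the units of the checkpoint throttle concrete, the following self-contained sketch plugs made-up numbers into the same formula: WT_TIMEDIFF yields nanoseconds, and the result is capped at 1000000, which the code above describes as an upper bound of one second, i.e. microseconds of sleep. The values and the program are illustrative only.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t record_count = 1000000;	/* records in in-memory chunks */
	uint64_t timediff = 30ULL * 1000000000;	/* 30s, in ns */
	uint64_t throttle;
	uint32_t in_memory = 4;			/* in-memory chunk count */

	/* Same arithmetic as the ckpt_throttle calculation above. */
	throttle = (in_memory - 2) * timediff / (20 * record_count);
	if (throttle > 1000000)			/* upper bound of 1s */
		throttle = 1000000;
	printf("ckpt_throttle %" PRIu64 " us\n", throttle);	/* prints 3000 */
	return (0);
}
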
Example no. 5
0
/*
 * __wt_lsm_tree_throttle --
 *	Calculate whether LSM updates need to be throttled.
 */
void
__wt_lsm_tree_throttle(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_LSM_CHUNK *chunk, **cp, *prev_chunk;
	uint64_t cache_sz, cache_used, in_memory, record_count;
	uint64_t oldtime, timediff;
	uint32_t i;

	/* Never throttle in small trees. */
	if (lsm_tree->nchunks < 3)
		return;

	cache_sz = S2C(session)->cache_size;

	/*
	 * In the steady state, we expect that the checkpoint worker thread
	 * will keep up with inserts.  If not, throttle the insert rate to
	 * avoid filling the cache with in-memory chunks.  Threads sleep every
	 * 100 operations, so take that into account in the calculation.
	 *
	 * Count the number of in-memory chunks, and find the most recent
	 * on-disk chunk (if any).
	 */
	record_count = 1;
	for (i = in_memory = 0, cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
	    i < lsm_tree->nchunks;
	    ++i, --cp)
		if (!F_ISSET_ATOMIC(*cp, WT_LSM_CHUNK_ONDISK)) {
			record_count += (*cp)->count;
			++in_memory;
		} else if ((*cp)->generation == 0 ||
		    F_ISSET_ATOMIC(*cp, WT_LSM_CHUNK_STABLE))
			break;

	chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];

	if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
		lsm_tree->throttle_sleep = 0;
	else if (i == lsm_tree->nchunks ||
	    F_ISSET_ATOMIC(*cp, WT_LSM_CHUNK_STABLE)) {
		/*
		 * No checkpoint has completed this run.  Keep slowing down
		 * inserts until one does.
		 */
		lsm_tree->throttle_sleep =
		    WT_MAX(20, 2 * lsm_tree->throttle_sleep);
	} else {
		WT_ASSERT(session,
		    WT_TIMECMP(chunk->create_ts, (*cp)->create_ts) >= 0);
		timediff = WT_TIMEDIFF(chunk->create_ts, (*cp)->create_ts);
		lsm_tree->throttle_sleep =
		    (long)((in_memory - 2) * timediff / (20 * record_count));

		/*
		 * Get more aggressive as the number of in memory chunks
		 * consumes a large proportion of the cache. In memory chunks
		 * are allowed to grow up to twice as large as the configured
		 * value when checkpoints aren't keeping up. That worst case
		 * is when this calculation is relevant.
		 * There is nothing particularly special about the chosen
		 * multipliers.
		 */
		cache_used = in_memory * lsm_tree->chunk_size * 2;
		if (cache_used > cache_sz * 0.8)
			lsm_tree->throttle_sleep *= 5;
	}

	/*
	 * Update our estimate of how long each in-memory chunk stays active.
	 * Filter out some noise by keeping a weighted history of the
	 * calculated value.  Wait until we have enough chunks that we can
	 * check that the new value is sane: otherwise, after a long idle
	 * period, we can calculate a crazy value.
	 */
	if (in_memory > 1 &&
	    i != lsm_tree->nchunks &&
	    !F_ISSET_ATOMIC(*cp, WT_LSM_CHUNK_STABLE)) {
		prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
		WT_ASSERT(session, prev_chunk->generation == 0);
		WT_ASSERT(session,
		    WT_TIMECMP(chunk->create_ts, prev_chunk->create_ts) >= 0);
		timediff = WT_TIMEDIFF(chunk->create_ts, prev_chunk->create_ts);
		WT_ASSERT(session,
		    WT_TIMECMP(prev_chunk->create_ts, (*cp)->create_ts) >= 0);
		oldtime = WT_TIMEDIFF(prev_chunk->create_ts, (*cp)->create_ts);
		if (timediff < 10 * oldtime)
			lsm_tree->chunk_fill_ms =
			    (3 * lsm_tree->chunk_fill_ms +
			    timediff / 1000000) / 4;
	}
}
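
Finally, the chunk_fill_ms update shared by both versions above is a simple weighted history: each new fill-time sample is blended into the running estimate with weight 1/4, which damps occasional outliers. A small standalone sketch of just that filter, with made-up sample values:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t samples_ms[] = { 4000, 6000, 5000, 20000 };
	uint64_t chunk_fill_ms = 0;
	size_t i;

	for (i = 0; i < sizeof(samples_ms) / sizeof(samples_ms[0]); ++i) {
		/* Same update as above: new = (3 * old + sample) / 4. */
		chunk_fill_ms = (3 * chunk_fill_ms + samples_ms[i]) / 4;
		printf("sample %" PRIu64 " ms -> estimate %" PRIu64 " ms\n",
		    samples_ms[i], chunk_fill_ms);
	}
	return (0);
}
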