Example #1
/*
 * __stat_page --
 *	Stat any Btree page.
 */
static int
__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
	/*
	 * All internal pages and overflow pages are trivial: all we track is
	 * a count of the page type.
	 */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		WT_STAT_INCR(session, stats, btree_column_fix);
		WT_STAT_INCRV(
		    session, stats, btree_entries, page->pg_fix_entries);
		break;
	case WT_PAGE_COL_INT:
		WT_STAT_INCR(session, stats, btree_column_internal);
		break;
	case WT_PAGE_COL_VAR:
		__stat_page_col_var(session, page, stats);
		break;
	case WT_PAGE_ROW_INT:
		__stat_page_row_int(session, page, stats);
		break;
	case WT_PAGE_ROW_LEAF:
		__stat_page_row_leaf(session, page, stats);
		break;
	WT_ILLEGAL_VALUE(session);
	}
	return (0);
}
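
The switch above handles a single page; a caller still has to drive it across
every in-memory page of the tree. As a rough sketch of such a driver (the
__wt_tree_walk helper and its exact signature are assumptions for illustration,
not taken from the listings in this section):

/*
 * __stat_tree_walk --
 *	Sketch: accumulate statistics for every in-memory page of a Btree.
 *	Assumes a __wt_tree_walk(session, &page, flags) helper that steps to
 *	the next in-memory page and sets page to NULL at the end of the tree.
 */
static int
__stat_tree_walk(WT_SESSION_IMPL *session, WT_DSRC_STATS **stats)
{
	WT_PAGE *page;

	page = NULL;
	for (;;) {
		/* Step to the next in-memory page, stopping at the end. */
		WT_RET(__wt_tree_walk(session, &page, 0));
		if (page == NULL)
			break;

		/* Count this page's entries and type. */
		WT_RET(__stat_page(session, page, stats));
	}
	return (0);
}
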
Example #2
/*
 * __stat_page_col_var --
 *	Stat a WT_PAGE_COL_VAR page.
 */
static int
__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
{
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_UPDATE *upd;
	uint32_t i;
	int orig_deleted;

	unpack = &_unpack;

	WT_STAT_INCR(stats, btree_column_variable);

	/*
	 * Walk the page, counting regular and overflow data items, and checking
	 * to be sure any updates weren't deletions.  If the item was updated,
	 * assume it was updated by an item of the same size (it's expensive to
	 * figure out if it will require the same space or not, especially if
	 * there's Huffman encoding).
	 */
	WT_COL_FOREACH(page, cip, i) {
		if ((cell = WT_COL_PTR(page, cip)) == NULL) {
			orig_deleted = 1;
			WT_STAT_INCR(stats, btree_column_deleted);
		} else {
			orig_deleted = 0;
			__wt_cell_unpack(cell, unpack);
			WT_STAT_INCRV(
			    stats, btree_entries, __wt_cell_rle(unpack));
		}

		/*
		 * Walk the insert list, checking for changes.  For each insert
		 * we find, correct the original count based on its state.
		 */
		WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
			upd = ins->upd;
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				if (orig_deleted)
					continue;
				WT_STAT_INCR(stats, btree_column_deleted);
				WT_STAT_DECR(stats, btree_entries);
			} else {
				if (!orig_deleted)
					continue;
				WT_STAT_DECR(stats, btree_column_deleted);
				WT_STAT_INCR(stats, btree_entries);
			}
		}
	}
	return (0);
}
Example #3
/*
 * __stat_page_col_var --
 *	Stat a WT_PAGE_COL_VAR page.
 */
static void
__stat_page_col_var(
    WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_UPDATE *upd;
	uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt;
	uint32_t i;
	bool orig_deleted;

	unpack = &_unpack;
	deleted_cnt = entry_cnt = ovfl_cnt = rle_cnt = 0;

	WT_STAT_INCR(session, stats, btree_column_variable);

	/*
	 * Walk the page counting regular items, adjusting for items that have
	 * subsequently been deleted.  This is messy because a 10-item RLE cell
	 * might have 3 of its items subsequently deleted.  Overflow items are
	 * harder: we can't know whether an updated item will be an overflow
	 * item or not, so do our best and simply count every overflow item (or
	 * RLE set of items) we see.
	 */
	WT_COL_FOREACH(page, cip, i) {
		if ((cell = WT_COL_PTR(page, cip)) == NULL) {
			orig_deleted = true;
			++deleted_cnt;
		} else {
			orig_deleted = false;
			__wt_cell_unpack(cell, unpack);
			if (unpack->type == WT_CELL_ADDR_DEL)
				orig_deleted = true;
			else {
				entry_cnt += __wt_cell_rle(unpack);
				rle_cnt += __wt_cell_rle(unpack) - 1;
			}
			if (unpack->ovfl)
				++ovfl_cnt;
		}

		/*
		 * Walk the insert list, checking for changes.  For each insert
		 * we find, correct the original count based on its state.
		 */
		WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
			upd = ins->upd;
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				if (!orig_deleted) {
					++deleted_cnt;
					--entry_cnt;
				}
			} else
				if (orig_deleted) {
					--deleted_cnt;
					++entry_cnt;
				}
		}
	}

	/*
	 * Flush the counts accumulated above into the statistics.  (The
	 * btree_column_rle and btree_overflow counter names are assumed
	 * here; the deleted and entries counters appear in the examples
	 * above.)
	 */
	WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt);
	WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt);
	WT_STAT_INCRV(session, stats, btree_entries, entry_cnt);
	WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt);
}
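
The insert-list walk above only ever moves a record between the "entries" and
"deleted" buckets; the total never changes. The following standalone toy (all
names invented for the illustration, no WiredTiger types involved) shows the
same correction logic in isolation:

#include <inttypes.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy per-page counters: live items versus deleted items. */
struct toy_stats {
	uint64_t entries;
	uint64_t deleted;
};

/*
 * toy_apply_update --
 *	Correct the counters for one updated slot: only a change of state
 *	(live -> deleted, or deleted -> live) adjusts the counts.
 */
static void
toy_apply_update(struct toy_stats *stats, bool orig_deleted, bool upd_deleted)
{
	if (upd_deleted && !orig_deleted) {
		++stats->deleted;
		--stats->entries;
	} else if (!upd_deleted && orig_deleted) {
		--stats->deleted;
		++stats->entries;
	}
}

int
main(void)
{
	struct toy_stats stats = { 10, 2 };	/* counts from the page walk */

	toy_apply_update(&stats, false, true);	/* live item later deleted */
	toy_apply_update(&stats, true, false);	/* deleted item re-inserted */

	/* The total (12) is unchanged; only the split between buckets moved. */
	printf("entries=%" PRIu64 " deleted=%" PRIu64 "\n",
	    stats.entries, stats.deleted);
	return (0);
}
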
Example #4
/*
 * __rec_review --
 *	Get exclusive access to the page and review the page and its subtree
 *	for conditions that would block its eviction.
 *
 *	The ref and page arguments may appear to be redundant, because usually
 *	ref->page == page and page->ref == ref.  However, we need both because
 *	(a) there are cases where ref == NULL (e.g., for root page or during
 *	salvage), and (b) we can't safely look at page->ref until we have a
 *	hazard reference.
 */
static int
__rec_review(WT_SESSION_IMPL *session,
    WT_REF *ref, WT_PAGE *page, uint32_t flags, int top)
{
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	WT_TXN *txn;
	uint32_t i;

	txn = &session->txn;

	/*
	 * Get exclusive access to the page if our caller doesn't have the tree
	 * locked down.
	 */
	if (!LF_ISSET(WT_REC_SINGLE))
		WT_RET(__hazard_exclusive(session, ref, top));

	/*
	 * Recurse through the page's subtree: this happens first because we
	 * have to write pages in depth-first order, otherwise we'll dirty
	 * pages after we've written them.
	 */
	if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)
		WT_REF_FOREACH(page, ref, i)
			switch (ref->state) {
			case WT_REF_DISK:		/* On-disk */
			case WT_REF_DELETED:		/* On-disk, deleted */
				break;
			case WT_REF_MEM:		/* In-memory */
				WT_RET(__rec_review(
				    session, ref, ref->page, flags, 0));
				break;
			case WT_REF_EVICT_WALK:		/* Walk point */
			case WT_REF_LOCKED:		/* Being evicted */
			case WT_REF_READING:		/* Being read */
				return (EBUSY);
			}

	/*
	 * Check if this page can be evicted:
	 *
	 * Fail if the top-level page is a page expected to be removed from the
	 * tree as part of eviction (an empty page or a split-merge page).  Note
	 * "split" pages are NOT included in this test, because a split page can
	 * be separately evicted, at which point it's replaced in its parent by
	 * a reference to a split-merge page.  That's a normal part of the leaf
	 * page life-cycle if it grows too large and must be pushed out of the
	 * cache.  There is also an exception for empty pages: the root page may
	 * be empty when evicted, but that only happens when the tree is closed.
	 *
	 * Fail if any page in the top-level page's subtree can't be merged into
	 * its parent.  You can't evict a page that references such in-memory
	 * pages; they must be evicted first.  The test is necessary but should
	 * not fire much: the LRU-based eviction code is biased toward leaf
	 * pages, so an internal page shouldn't be selected for LRU-based
	 * eviction until its children have been evicted.  Empty, split and
	 * split-merge pages are all included in this test; they can all be
	 * merged into a parent.
	 *
	 * We have to write dirty pages to know their final state: a page marked
	 * empty may have had records added since reconciliation, and a page
	 * marked split may have had records deleted and no longer need to
	 * split.  Split-merge pages are the exception: they can never be
	 * changed into anything other than a split-merge page and are merged
	 * regardless of being clean or dirty.
	 *
	 * Writing the page is expensive; do a cheap test first: if it doesn't
	 * appear a subtree page can be merged, quit.  It's possible the page
	 * has been emptied since it was last reconciled, and writing it before
	 * testing might be worthwhile, but it's more probable we're attempting
	 * to evict an internal page with live children, and that's a waste of
	 * time.
	 *
	 * We don't do a cheap test for the top-level page: we're not called
	 * to evict split-merge pages, which means the only interesting case
	 * is an empty page.  If the eviction thread picked an "empty" page
	 * for eviction, it must have had a reason: probably the empty page
	 * got really, really full.
	 */
	mod = page->modify;
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);

	/* If the page is dirty, write it so we know the final state. */
	if (__wt_page_is_modified(page) &&
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) {
		ret = __wt_rec_write(session, page, NULL, flags);

		/* If there are unwritten changes on the page, give up. */
		if (ret == 0 &&
		    !LF_ISSET(WT_REC_SINGLE) && __wt_page_is_modified(page))
			ret = EBUSY;
		if (ret == EBUSY) {
			WT_VERBOSE_RET(session, evict,
			    "page %p written but not clean", page);

			if (F_ISSET(txn, TXN_RUNNING) &&
			    ++txn->eviction_fails >= 100) {
				txn->eviction_fails = 0;
				ret = WT_DEADLOCK;
				WT_STAT_INCR(
				    S2C(session)->stats, txn_fail_cache);
			}

			/*
			 * If there aren't multiple cursors active, there
			 * are no consistency issues: try to bump our snapshot.
			 */
			if (session->ncursors <= 1) {
				__wt_txn_read_last(session);
				__wt_txn_read_first(session);
			}

			switch (page->type) {
			case WT_PAGE_COL_FIX:
			case WT_PAGE_COL_VAR:
				__wt_col_leaf_obsolete(session, page);
				break;
			case WT_PAGE_ROW_LEAF:
				__wt_row_leaf_obsolete(session, page);
				break;
			}
		}
		WT_RET(ret);

		txn->eviction_fails = 0;
	}

	/*
	 * Repeat the eviction tests.
	 *
	 * Fail if the top-level page should be merged into its parent, and it's
	 * not the root page.
	 *
	 * Fail if a page in the top-level page's subtree can't be merged into
	 * its parent.
	 */
	if (top) {
		/*
		 * We never get a top-level split-merge page to evict, they are
		 * ignored by the eviction thread.  Check out of sheer paranoia.
		 */
		if (mod != NULL) {
			if (F_ISSET(mod, WT_PM_REC_SPLIT_MERGE))
				return (EBUSY);
			if (F_ISSET(mod, WT_PM_REC_EMPTY) &&
			    !WT_PAGE_IS_ROOT(page))
				return (EBUSY);
		}
	} else if (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
		return (EBUSY);
	return (0);
}
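
The recursion at the top of __rec_review enforces the depth-first requirement
described in its comment: children are written before their parent so the
parent's child references are final when the parent itself is written. A
minimal standalone sketch of that post-order discipline (types and names are
invented for the illustration):

#include <stdio.h>

/* Toy page: an internal page has children, a leaf page has none. */
struct toy_page {
	const char *name;
	struct toy_page *child[2];	/* NULL when absent */
};

/*
 * toy_write_depth_first --
 *	Post-order traversal: write every child before its parent.
 */
static void
toy_write_depth_first(struct toy_page *page)
{
	int i;

	if (page == NULL)
		return;
	for (i = 0; i < 2; ++i)
		toy_write_depth_first(page->child[i]);
	printf("write %s\n", page->name);	/* the parent is written last */
}

int
main(void)
{
	struct toy_page leaf1 = { "leaf-1", { NULL, NULL } };
	struct toy_page leaf2 = { "leaf-2", { NULL, NULL } };
	struct toy_page root = { "root", { &leaf1, &leaf2 } };

	/* Prints leaf-1, leaf-2, root. */
	toy_write_depth_first(&root);
	return (0);
}
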
Example #5
/*
 * __wt_rec_evict --
 *	Reconciliation plus eviction.
 */
int
__wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	int single;

	conn = S2C(session);

	WT_VERBOSE_RET(session, evict,
	    "page %p (%s)", page, __wt_page_type_string(page->type));

	WT_ASSERT(session, session->excl_next == 0);
	single = LF_ISSET(WT_REC_SINGLE) ? 1 : 0;

	/*
	 * Get exclusive access to the page and review the page and its subtree
	 * for conditions that would block our eviction of the page.  If the
	 * check fails (for example, we find a child page that can't be merged),
	 * we're done.  We have to make this check for clean pages, too: while
	 * unlikely eviction would choose an internal page with children, it's
	 * not disallowed anywhere.
	 *
	 * Note that page->ref may be NULL in some cases (e.g., for root pages
	 * or during salvage).  That's OK if WT_REC_SINGLE is set: we won't
	 * check hazard references in that case.
	 */
	WT_ERR(__rec_review(session, page->ref, page, flags, 1));

	/* Count evictions of internal pages during normal operation. */
	if (!single &&
	    (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT))
		WT_STAT_INCR(conn->stats, cache_evict_internal);

	/* Update the parent and discard the page. */
	if (page->modify == NULL || !F_ISSET(page->modify, WT_PM_REC_MASK)) {
		WT_STAT_INCR(conn->stats, cache_evict_unmodified);
		WT_ASSERT(session, single || page->ref->state == WT_REF_LOCKED);

		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			__rec_page_clean_update(session, page);

		/* Discard the page. */
		__rec_discard_page(session, page, single);
	} else {
		WT_STAT_INCR(conn->stats, cache_evict_modified);

		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			WT_ERR(__rec_page_dirty_update(session, page));

		/* Discard the tree rooted in this page. */
		__rec_discard_tree(session, page, single);
	}
	if (0) {
err:		/*
		 * If unable to evict this page, release exclusive reference(s)
		 * we've acquired.
		 */
		__rec_excl_clear(session);
	}
	session->excl_next = 0;

	return (ret);
}
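
For context on the WT_REC_SINGLE flag tested in __rec_review: a caller that
already has the whole tree locked down can set it so the hazard-based exclusive
access step is skipped, while normal eviction passes no flags. A hypothetical
caller-side sketch (both wrapper functions are invented for illustration):

/*
 * __toy_discard_locked_page --
 *	Hypothetical caller: the tree is exclusively ours (for example, while
 *	discarding the tree), so skip the hazard-reference handshake.
 */
static int
__toy_discard_locked_page(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	return (__wt_rec_evict(session, page, WT_REC_SINGLE));
}

/*
 * __toy_evict_lru_page --
 *	Hypothetical caller: normal LRU eviction, no flags, so __rec_review
 *	acquires exclusive access to the page itself.
 */
static int
__toy_evict_lru_page(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	return (__wt_rec_evict(session, page, 0));
}
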
Example #6
/*
 * __wt_lsm_stat_init --
 *	Initialize an LSM statistics structure.
 */
int
__wt_lsm_stat_init(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_CURSOR_STAT *cst, uint32_t flags)
{
	WT_CURSOR *stat_cursor;
	WT_DECL_ITEM(uribuf);
	WT_DECL_RET;
	WT_DSRC_STATS *stats;
	WT_LSM_CHUNK *chunk;
	const char *cfg[] = API_CONF_DEFAULTS(
	    session, open_cursor, "statistics_fast=on");
	const char *disk_cfg[] = API_CONF_DEFAULTS(session,
	    open_cursor, "checkpoint=WiredTigerCheckpoint,statistics_fast=on");
	const char *desc, *pvalue;
	uint64_t value;
	u_int i;
	int locked, stat_key;

	WT_UNUSED(flags);
	locked = 0;

	WT_ERR(__wt_scr_alloc(session, 0, &uribuf));

	/* Clear the statistics we are about to recalculate. */
	if (cst->stats != NULL)
		stats = (WT_DSRC_STATS *)cst->stats;
	else {
		WT_ERR(__wt_calloc_def(session, 1, &stats));
		__wt_stat_init_dsrc_stats(stats);
		cst->stats_first = cst->stats = (WT_STATS *)stats;
		cst->stats_count = sizeof(*stats) / sizeof(WT_STATS);
	}
	*stats = lsm_tree->stats;

	if (LF_ISSET(WT_STATISTICS_CLEAR))
		__wt_stat_clear_dsrc_stats(&lsm_tree->stats);

	/* Hold the LSM lock so that we can safely walk through the chunks. */
	WT_ERR(__wt_readlock(session, lsm_tree->rwlock));
	locked = 1;

	/* Set the stats for this run. */
	WT_STAT_SET(stats, lsm_chunk_count, lsm_tree->nchunks);
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];
		if (chunk->generation >
		    (uint32_t)WT_STAT(stats, lsm_generation_max))
			WT_STAT_SET(stats,
			    lsm_generation_max, chunk->generation);

		/*
		 * LSM chunk reads happen from a checkpoint, so get the
		 * statistics for a checkpoint if one exists.
		 */
		WT_ERR(__wt_buf_fmt(
		    session, uribuf, "statistics:%s", chunk->uri));
		ret = __wt_curstat_open(session, uribuf->data,
		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? disk_cfg : cfg,
		    &stat_cursor);
		/*
		 * XXX kludge: we may have an empty chunk where no checkpoint
		 * was written.  If so, try to open the ordinary handle on that
		 * chunk instead.
		 */
		if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
			ret = __wt_curstat_open(
			    session, uribuf->data, cfg, &stat_cursor);
		WT_ERR(ret);

		while ((ret = stat_cursor->next(stat_cursor)) == 0) {
			WT_ERR(stat_cursor->get_key(stat_cursor, &stat_key));
			WT_ERR(stat_cursor->get_value(
			    stat_cursor, &desc, &pvalue, &value));
			WT_STAT_INCRKV(stats, stat_key, value);
		}
		WT_ERR_NOTFOUND_OK(ret);
		WT_ERR(stat_cursor->close(stat_cursor));

		if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			continue;

		WT_STAT_INCR(stats, bloom_count);
		WT_STAT_INCRV(stats, bloom_size,
		    (chunk->count * lsm_tree->bloom_bit_count) / 8);

		WT_ERR(__wt_buf_fmt(
		    session, uribuf, "statistics:%s", chunk->bloom_uri));
		WT_ERR(__wt_curstat_open(session, uribuf->data,
		    cfg, &stat_cursor));

		stat_cursor->set_key(
		    stat_cursor, WT_STAT_DSRC_CACHE_EVICTION_CLEAN);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_eviction_clean, value);
		WT_STAT_INCRV(stats, bloom_page_evict, value);

		stat_cursor->set_key(
		    stat_cursor, WT_STAT_DSRC_CACHE_EVICTION_DIRTY);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_eviction_dirty, value);
		WT_STAT_INCRV(stats, bloom_page_evict, value);

		stat_cursor->set_key(
		    stat_cursor, WT_STAT_DSRC_CACHE_EVICTION_FAIL);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_eviction_fail, value);

		stat_cursor->set_key(stat_cursor, WT_STAT_DSRC_CACHE_READ);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_read, value);
		WT_STAT_INCRV(stats, bloom_page_read, value);

		stat_cursor->set_key(stat_cursor, WT_STAT_DSRC_CACHE_WRITE);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_write, value);
		WT_ERR(stat_cursor->close(stat_cursor));
	}

err:	if (locked)
		WT_TRET(__wt_rwunlock(session, lsm_tree->rwlock));
	__wt_scr_free(&uribuf);

	return (ret);
}
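
The key/description/value iteration used on the internal stat_cursor above is
the same pattern an application sees through a public "statistics:" cursor. A
rough application-side sketch (the URI is illustrative, and the
"statistics_fast=on" configuration string is taken from the listing above; the
exact statistics configuration syntax varies between WiredTiger versions):

#include <stdio.h>
#include <wiredtiger.h>

/*
 * print_stats --
 *	Sketch: dump the statistics of one data source by iterating a
 *	statistics cursor, mirroring the internal loop above.
 */
static int
print_stats(WT_SESSION *session, const char *stat_uri)
{
	WT_CURSOR *cursor;
	const char *desc, *pvalue;
	uint64_t value;
	int ret;

	/* For example, stat_uri might be "statistics:lsm:mytable". */
	if ((ret = session->open_cursor(
	    session, stat_uri, NULL, "statistics_fast=on", &cursor)) != 0)
		return (ret);

	while ((ret = cursor->next(cursor)) == 0) {
		if ((ret = cursor->get_value(
		    cursor, &desc, &pvalue, &value)) != 0)
			break;
		printf("%s: %s\n", desc, pvalue);
	}
	if (ret == WT_NOTFOUND)		/* Normal end of the statistics. */
		ret = 0;

	/* Close the cursor whether or not the walk succeeded. */
	(void)cursor->close(cursor);
	return (ret);
}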