예제 #1
0
/*
 * __rec_review --
 *	Get exclusive access to the page and review the page and its subtree
 *	for conditions that would block its eviction.
 *
 *	The ref and page arguments may appear to be redundant, because usually
 *	ref->page == page and page->ref == ref.  However, we need both because
 *	(a) there are cases where ref == NULL (e.g., for root page or during
 *	salvage), and (b) we can't safely look at page->ref until we have a
 *	hazard pointer.
 *
 *	Arguments:
 *	session   - the calling session; session->btree is the tree reviewed.
 *	ref       - the reference for page (may be NULL, see above).  It is
 *		    handed to __hazard_exclusive and then reused as the
 *		    iteration variable when walking the page's children.
 *	page      - the page being reviewed.
 *	exclusive - non-zero if the caller already has the tree locked down,
 *		    in which case no exclusive access is acquired here.
 *	merge     - non-zero if internal pages are being merged: only
 *		    exclusive access is needed, nothing is written.
 *	top       - non-zero for the top-level page of the review; recursive
 *		    calls for child pages pass 0.
 *
 *	Returns 0 if nothing blocks the eviction, EBUSY if the page or a page
 *	in its subtree cannot be evicted now, or whatever error the helpers
 *	called through WT_RET report.
 *
 *	NOTE(review): WT_RET and WT_VERBOSE_RET are defined elsewhere; they
 *	are presumed to return from this function when their expression
 *	fails -- confirm against the macro definitions.
 */
static int
__rec_review(WT_SESSION_IMPL *session,
    WT_REF *ref, WT_PAGE *page, int exclusive, int merge, int top)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	WT_PAGE *t;
	uint32_t i;

	btree = session->btree;

	/*
	 * Get exclusive access to the page if our caller doesn't have the tree
	 * locked down.
	 */
	if (!exclusive)
		WT_RET(__hazard_exclusive(session, ref, top));

	/*
	 * Recurse through the page's subtree: this happens first because we
	 * have to write pages in depth-first order, otherwise we'll dirty
	 * pages after we've written them.
	 *
	 * The walk reuses the ref argument as the loop variable, so inside the
	 * loop ref names a child reference, not this page's reference; ref is
	 * not used again after the loop.  Children that are only on disk are
	 * skipped, in-memory children are reviewed recursively (as non-top
	 * pages), and any child in a transitional state makes the whole review
	 * fail with EBUSY.  There is no default case: a state not listed here
	 * is silently ignored.
	 */
	if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)
		WT_REF_FOREACH(page, ref, i)
			switch (ref->state) {
			case WT_REF_DISK:		/* On-disk */
			case WT_REF_DELETED:		/* On-disk, deleted */
				break;
			case WT_REF_MEM:		/* In-memory */
				WT_RET(__rec_review(session,
				    ref, ref->page, exclusive, merge, 0));
				break;
			case WT_REF_EVICT_WALK:		/* Walk point */
			case WT_REF_EVICT_FORCE:	/* Forced evict */
			case WT_REF_LOCKED:		/* Being evicted */
			case WT_REF_READING:		/* Being read */
				return (EBUSY);
			}

	/*
	 * If the file is being checkpointed, we cannot evict dirty pages,
	 * because that may free a page that appears on an internal page in the
	 * checkpoint.  Don't rely on new updates being skipped by the
	 * transaction used for transaction reads: (1) there are paths that
	 * dirty pages for artificial reasons; (2) internal pages aren't
	 * transactional; and (3) if an update was skipped during the
	 * checkpoint (leaving the page dirty), then rolled back, we could
	 * still successfully overwrite a page and corrupt the checkpoint.
	 *
	 * Further, even for clean pages, the checkpoint's reconciliation of an
	 * internal page might race with us as we evict a child in the page's
	 * subtree.
	 *
	 * One half of that test is in the reconciliation code: the checkpoint
	 * thread waits for eviction-locked pages to settle before determining
	 * their status.  The other half of the test is here: after acquiring
	 * the exclusive eviction lock on a page, confirm no page in the page's
	 * stack of pages from the root is being reconciled in a checkpoint.
	 * This ensures we either see the checkpoint-walk state here, or the
	 * reconciliation of the internal page sees our exclusive lock on the
	 * child page and waits until we're finished evicting the child page
	 * (or give up if eviction isn't possible).
	 *
	 * We must check the full stack (we might be attempting to evict a leaf
	 * page multiple levels beneath the internal page being reconciled as
	 * part of the checkpoint, and all of the intermediate nodes are being
	 * merged into the internal page).
	 *
	 * There's no simple test for knowing if a page in our page stack is
	 * involved in a checkpoint.  The internal page's checkpoint-walk flag
	 * is the best test, but because it's not set anywhere for the root
	 * page, it's not a complete test.
	 *
	 * Quit for any page that's not a simple, in-memory page.  (Almost the
	 * same as checking for the checkpoint-walk flag.  I don't think there
	 * are code paths that change the page's status from checkpoint-walk,
	 * but these races are hard enough I'm not going to proceed if there's
	 * anything other than a vanilla, in-memory tree stack.)  Climb until
	 * we find a page which can't be merged into its parent, failing if
	 * we never find such a page.
	 */

	/*
	 * Dirty page during a checkpoint (and not a merge): count the failure
	 * and give up.  The ckpt label sits inside this block so the ancestor
	 * walk below can jump here and share the statistics updates and the
	 * EBUSY return.
	 */
	if (btree->checkpointing && !merge && __wt_page_is_modified(page)) {
ckpt:		WT_CSTAT_INCR(session, cache_eviction_checkpoint);
		WT_DSTAT_INCR(session, cache_eviction_checkpoint);
		return (EBUSY);
	}

	/*
	 * Only the top-level page walks its ancestors.  The loop has no
	 * termination condition of its own: it ends either by jumping to ckpt
	 * (we reached the root, or found an ancestor that isn't plainly
	 * in-memory) or by breaking at the first ancestor that has no
	 * reconciliation state marking it as merged into its parent.
	 */
	if (btree->checkpointing && top)
		for (t = page->parent;; t = t->parent) {
			if (t == NULL || t->ref == NULL)	/* root */
				goto ckpt;
			if (t->ref->state != WT_REF_MEM)	/* scary */
				goto ckpt;
			if (t->modify == NULL ||		/* not merged */
			    !F_ISSET(t->modify, WT_PM_REC_EMPTY |
			    WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
				break;
		}

	/*
	 * If we are merging internal pages, we just need exclusive access, we
	 * don't need to write everything.
	 */
	if (merge)
		return (0);

	/*
	 * Fail if any page in the top-level page's subtree won't be merged into
	 * its parent, the page that cannot be merged must be evicted first.
	 * The test is necessary but should not fire much: the eviction code is
	 * biased for leaf pages, an internal page shouldn't be selected for
	 * eviction until its children have been evicted.
	 *
	 * We have to write dirty pages to know their final state, a page marked
	 * empty may have had records added since reconciliation, a page marked
	 * split may have had records deleted and no longer need to split.
	 * Split-merge pages are the exception: they can never be changed into
	 * anything other than a split-merge page and are merged regardless of
	 * being clean or dirty.
	 *
	 * Writing the page is expensive, do a cheap test first: if it doesn't
	 * appear a subtree page can be merged, quit.  It's possible the page
	 * has been emptied since it was last reconciled, and writing it before
	 * testing might be worthwhile, but it's more probable we're attempting
	 * to evict an internal page with live children, and that's a waste of
	 * time.
	 */
	mod = page->modify;
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);

	/*
	 * If the page is dirty and can possibly change state, write it so we
	 * know the final state.
	 *
	 * NOTE(review): mod is dereferenced here without a NULL check; this
	 * presumably relies on a modified page always having a modify
	 * structure -- confirm against __wt_page_is_modified.
	 */
	if (__wt_page_is_modified(page) &&
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) {
		ret = __wt_rec_write(session, page,
		    NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT);

		/*
		 * Update the page's modification reference, reconciliation
		 * might have changed it.
		 */
		mod = page->modify;

		/* If there are unwritten changes on the page, give up. */
		if (ret == EBUSY) {
			WT_VERBOSE_RET(session, evict,
			    "eviction failed, reconciled page not clean");

			/*
			 * We may be able to discard any "update" memory the
			 * page no longer needs.  Only leaf page types are
			 * handled, other page types are left alone.
			 */
			switch (page->type) {
			case WT_PAGE_COL_FIX:
			case WT_PAGE_COL_VAR:
				__wt_col_leaf_obsolete(session, page);
				break;
			case WT_PAGE_ROW_LEAF:
				__wt_row_leaf_obsolete(session, page);
				break;
			}
		}
		/* Any reconciliation failure, EBUSY included, ends the review. */
		WT_RET(ret);

		WT_ASSERT(session, __wt_page_is_modified(page) == 0);
	}

	/*
	 * Repeat the test: fail if any page in the top-level page's subtree
	 * won't be merged into its parent.
	 */
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);
	return (0);
}
예제 #2
0
/*
 * __rec_review --
 *	Get exclusive access to the page and review the page and its subtree
 *	for conditions that would block its eviction.
 *
 *	The ref and page arguments may appear to be redundant, because usually
 *	ref->page == page and page->ref == ref.  However, we need both because
 *	(a) there are cases where ref == NULL (e.g., for root page or during
 *	salvage), and (b) we can't safely look at page->ref until we have a
 *	hazard reference.
 *
 *	Arguments:
 *	session - the calling session; its transaction (session->txn) tracks
 *		  repeated eviction failures.
 *	ref     - the reference for page (may be NULL, see above).  It is
 *		  handed to __hazard_exclusive and then reused as the
 *		  iteration variable when walking the page's children.
 *	page    - the page being reviewed.
 *	flags   - reconciliation flags, passed unchanged to __wt_rec_write
 *		  and to recursive calls.  When WT_REC_SINGLE is set no
 *		  exclusive access is acquired here.
 *	top     - non-zero for the top-level page of the review; recursive
 *		  calls for child pages pass 0.
 *
 *	Returns 0 if nothing blocks the eviction, EBUSY if the page or a page
 *	in its subtree cannot be evicted now, WT_DEADLOCK after repeated
 *	failures inside a running transaction, or whatever error the helpers
 *	called through WT_RET report.
 *
 *	NOTE(review): WT_RET, WT_VERBOSE_RET and LF_ISSET are defined
 *	elsewhere; the first two are presumed to return from this function
 *	when their expression fails, and LF_ISSET is presumed to test the
 *	local "flags" variable -- confirm against the macro definitions.
 */
static int
__rec_review(WT_SESSION_IMPL *session,
    WT_REF *ref, WT_PAGE *page, uint32_t flags, int top)
{
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	WT_TXN *txn;
	uint32_t i;

	txn = &session->txn;

	/*
	 * Get exclusive access to the page if our caller doesn't have the tree
	 * locked down.
	 */
	if (!LF_ISSET(WT_REC_SINGLE))
		WT_RET(__hazard_exclusive(session, ref, top));

	/*
	 * Recurse through the page's subtree: this happens first because we
	 * have to write pages in depth-first order, otherwise we'll dirty
	 * pages after we've written them.
	 *
	 * The walk reuses the ref argument as the loop variable, so inside the
	 * loop ref names a child reference, not this page's reference; ref is
	 * not used again after the loop.  Children that are only on disk are
	 * skipped, in-memory children are reviewed recursively (as non-top
	 * pages), and any child in a transitional state makes the whole review
	 * fail with EBUSY.  There is no default case: a state not listed here
	 * is silently ignored.
	 */
	if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)
		WT_REF_FOREACH(page, ref, i)
			switch (ref->state) {
			case WT_REF_DISK:		/* On-disk */
			case WT_REF_DELETED:		/* On-disk, deleted */
				break;
			case WT_REF_MEM:		/* In-memory */
				WT_RET(__rec_review(
				    session, ref, ref->page, flags, 0));
				break;
			case WT_REF_EVICT_WALK:		/* Walk point */
			case WT_REF_LOCKED:		/* Being evicted */
			case WT_REF_READING:		/* Being read */
				return (EBUSY);
			}

	/*
	 * Check if this page can be evicted:
	 *
	 * Fail if the top-level page is a page expected to be removed from the
	 * tree as part of eviction (an empty page or a split-merge page).  Note
	 * "split" pages are NOT included in this test, because a split page can
	 * be separately evicted, at which point it's replaced in its parent by
	 * a reference to a split-merge page.  That's a normal part of the leaf
	 * page life-cycle if it grows too large and must be pushed out of the
	 * cache.  There is also an exception for empty pages, the root page may
	 * be empty when evicted, but that only happens when the tree is closed.
	 *
	 * Fail if any page in the top-level page's subtree can't be merged into
	 * its parent.  You can't evict a page that references such in-memory
	 * pages, they must be evicted first.  The test is necessary but should
	 * not fire much: the LRU-based eviction code is biased for leaf pages,
	 * an internal page shouldn't be selected for LRU-based eviction until
	 * its children have been evicted.  Empty, split and split-merge pages
	 * are all included in this test, they can all be merged into a parent.
	 *
	 * We have to write dirty pages to know their final state, a page marked
	 * empty may have had records added since reconciliation, a page marked
	 * split may have had records deleted and no longer need to split.
	 * Split-merge pages are the exception: they can never be changed into
	 * anything other than a split-merge page and are merged regardless of
	 * being clean or dirty.
	 *
	 * Writing the page is expensive, do a cheap test first: if it doesn't
	 * appear a subtree page can be merged, quit.  It's possible the page
	 * has been emptied since it was last reconciled, and writing it before
	 * testing might be worthwhile, but it's more probable we're attempting
	 * to evict an internal page with live children, and that's a waste of
	 * time.
	 *
	 * We don't do a cheap test for the top-level page: we're not called
	 * to evict split-merge pages, which means the only interesting case
	 * is an empty page.  If the eviction thread picked an "empty" page
	 * for eviction, it must have had reason, probably the empty page got
	 * really, really full.
	 */
	mod = page->modify;
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);

	/*
	 * If the page is dirty, write it so we know the final state.
	 *
	 * NOTE(review): mod is dereferenced here without a NULL check; this
	 * presumably relies on a modified page always having a modify
	 * structure -- confirm against __wt_page_is_modified.  Also note mod
	 * is not re-read from page->modify after the write, so the tests at
	 * the end of the function use the pointer loaded above.
	 */
	if (__wt_page_is_modified(page) &&
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) {
		ret = __wt_rec_write(session, page, NULL, flags);

		/*
		 * If there are unwritten changes on the page, give up.  A
		 * successful write that leaves the page dirty is treated as
		 * EBUSY too, unless WT_REC_SINGLE is set.
		 */
		if (ret == 0 &&
		    !LF_ISSET(WT_REC_SINGLE) && __wt_page_is_modified(page))
			ret = EBUSY;
		if (ret == EBUSY) {
			WT_VERBOSE_RET(session, evict,
			    "page %p written but not clean", page);

			/*
			 * Count consecutive failures while a transaction is
			 * running: on the 100th, reset the counter, bump the
			 * txn_fail_cache statistic and report WT_DEADLOCK
			 * instead of EBUSY.  The cleanup below still runs
			 * before the error is returned.
			 */
			if (F_ISSET(txn, TXN_RUNNING) &&
			    ++txn->eviction_fails >= 100) {
				txn->eviction_fails = 0;
				ret = WT_DEADLOCK;
				WT_STAT_INCR(
				    S2C(session)->stats, txn_fail_cache);
			}

			/*
			 * If there aren't multiple cursors active, there
			 * are no consistency issues: try to bump our snapshot.
			 */
			if (session->ncursors <= 1) {
				__wt_txn_read_last(session);
				__wt_txn_read_first(session);
			}

			/*
			 * Run the "obsolete" helpers on leaf pages; other page
			 * types are left alone.
			 */
			switch (page->type) {
			case WT_PAGE_COL_FIX:
			case WT_PAGE_COL_VAR:
				__wt_col_leaf_obsolete(session, page);
				break;
			case WT_PAGE_ROW_LEAF:
				__wt_row_leaf_obsolete(session, page);
				break;
			}
		}
		/* Any failure (EBUSY, WT_DEADLOCK or other) ends the review. */
		WT_RET(ret);

		/* The page was written cleanly: forget earlier failures. */
		txn->eviction_fails = 0;
	}

	/*
	 * Repeat the eviction tests.
	 *
	 * Fail if the top-level page should be merged into its parent, and it's
	 * not the root page.
	 *
	 * Fail if a page in the top-level page's subtree can't be merged into
	 * its parent.
	 */
	if (top) {
		/*
		 * We never get a top-level split-merge page to evict, they are
		 * ignored by the eviction thread.  Check out of sheer paranoia.
		 */
		if (mod != NULL) {
			if (F_ISSET(mod, WT_PM_REC_SPLIT_MERGE))
				return (EBUSY);
			if (F_ISSET(mod, WT_PM_REC_EMPTY) &&
			    !WT_PAGE_IS_ROOT(page))
				return (EBUSY);
		}
	} else if (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
		return (EBUSY);
	return (0);
}