Exemplo n.º 1
0
/*
 * __tree_walk_internal --
 *	Move to the next/previous page in the tree.
 */
static inline int
__tree_walk_internal(WT_SESSION_IMPL *session,
    WT_REF **refp, uint64_t *walkcntp,
    int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *),
    void *func_cookie, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE_INDEX *pindex;
	WT_REF *couple, *couple_orig, *ref;
	uint32_t slot;
	bool empty_internal, initial_descent, prev, skip;

	btree = S2BT(session);
	pindex = NULL;
	empty_internal = initial_descent = false;

	/*
	 * Tree walks are special: they look inside page structures that splits
	 * may want to free.  Publish that the tree is active during this
	 * window.
	 */
	WT_ENTER_PAGE_INDEX(session);

	/* Walk should never instantiate deleted pages. */
	LF_SET(WT_READ_NO_EMPTY);

	/*
	 * !!!
	 * Fast-truncate currently only works on row-store trees.
	 */
	if (btree->type != BTREE_ROW)
		LF_CLR(WT_READ_TRUNCATE);

	prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;

	/*
	 * There are multiple reasons and approaches to walking the in-memory
	 * tree:
	 *
	 * (1) finding pages to evict (the eviction server);
	 * (2) writing just dirty leaves or internal nodes (checkpoint);
	 * (3) discarding pages (close);
	 * (4) truncating pages in a range (fast truncate);
	 * (5) skipping pages based on outside information (compaction);
	 * (6) cursor scans (applications).
	 *
	 * Except for cursor scans and compaction, the walk is limited to the
	 * cache, no pages are read.  In all cases, hazard pointers protect the
	 * walked pages from eviction.
	 *
	 * Walks use hazard-pointer coupling through the tree and that's OK
	 * (hazard pointers can't deadlock, so there's none of the usual
	 * problems found when logically locking up a btree).  If the eviction
	 * thread tries to evict the active page, it fails because of our
	 * hazard pointer.  If eviction tries to evict our parent, that fails
	 * because the parent has a child page that can't be discarded.  We do
	 * play one game: don't couple up to our parent and then back down to a
	 * new leaf, couple to the next page to which we're descending, it
	 * saves a hazard-pointer swap for each cursor page movement.
	 *
	 * !!!
	 * NOTE: we depend on the fact it's OK to release a page we don't hold,
	 * that is, it's OK to release couple when couple is set to NULL.
	 *
	 * Take a copy of any held page and clear the return value.  Remember
	 * the hazard pointer we're currently holding.
	 *
	 * Clear the returned value, it makes future error handling easier.
	 */
	couple = couple_orig = ref = *refp;
	*refp = NULL;

	/* If no page is active, begin a walk from the start/end of the tree. */
	if (ref == NULL) {
restart:	/*
		 * We can be here with a NULL or root WT_REF; the page release
		 * function handles them internally, don't complicate this code
		 * by calling them out.
		 */
		WT_ERR(__wt_page_release(session, couple, flags));

		/*
		 * We're not supposed to walk trees without root pages. As this
		 * has not always been the case, assert to debug that change.
		 */
		WT_ASSERT(session, btree->root.page != NULL);

		couple = couple_orig = ref = &btree->root;
		initial_descent = true;
		goto descend;
	}

	/*
	 * If the active page was the root, we've reached the walk's end; we
	 * only get here if we've returned the root to our caller, so we're
	 * holding no hazard pointers.
	 */
	if (__wt_ref_is_root(ref))
		goto done;

	/* Figure out the current slot in the WT_REF array. */
	__ref_index_slot(session, ref, &pindex, &slot);

	for (;;) {
		/*
		 * If we're at the last/first slot on the internal page, return
		 * it in post-order traversal. Otherwise move to the next/prev
		 * slot and left/right-most element in that subtree.
		 */
		while ((prev && slot == 0) ||
		    (!prev && slot == pindex->entries - 1)) {
			/* Ascend to the parent. */
			__ref_ascend(session, &ref, &pindex, &slot);

			/*
			 * If at the root and returning internal pages, return
			 * the root page, otherwise we're done. Regardless, no
			 * hazard pointer is required, release the one we hold.
			 */
			if (__wt_ref_is_root(ref)) {
				WT_ERR(__wt_page_release(
				    session, couple, flags));
				if (!LF_ISSET(WT_READ_SKIP_INTL))
					*refp = ref;
				goto done;
			}

			/*
			 * If we got all the way through an internal page and
			 * all of the child pages were deleted, mark it for
			 * eviction.
			 */
			if (empty_internal && pindex->entries > 1) {
				__wt_page_evict_soon(session, ref);
				empty_internal = false;
			}

			/*
			 * Optionally return internal pages. Swap our previous
			 * hazard pointer for the page we'll return. We don't
			 * handle restart or not-found returns, it would require
			 * additional complexity and is not a possible return:
			 * we're moving to the parent of the current child page,
			 * the parent can't have been evicted.
			 */
			if (!LF_ISSET(WT_READ_SKIP_INTL)) {
				WT_ERR(__wt_page_swap(
				    session, couple, ref, flags));
				*refp = ref;
				goto done;
			}
		}

		if (prev)
			--slot;
		else
			++slot;

		if (walkcntp != NULL)
			++*walkcntp;

		for (;;) {
			/*
			 * Move to the next slot, and set the reference hint if
			 * it's wrong (used when we continue the walk). We don't
			 * always update the hints when splitting, it's expected
			 * for them to be incorrect in some workloads.
			 */
			ref = pindex->index[slot];
			if (ref->pindex_hint != slot)
				ref->pindex_hint = slot;

			/*
			 * If we see any child states other than deleted, the
			 * page isn't empty.
			 */
			if (ref->state != WT_REF_DELETED &&
			    !LF_ISSET(WT_READ_TRUNCATE))
				empty_internal = false;

			if (LF_ISSET(WT_READ_CACHE)) {
				/*
				 * Only look at unlocked pages in memory:
				 * fast-path some common cases.
				 */
				if (LF_ISSET(WT_READ_NO_WAIT) &&
				    ref->state != WT_REF_MEM)
					break;

				/* Skip lookaside pages if not requested. */
				if (ref->state == WT_REF_LOOKASIDE &&
				    !LF_ISSET(WT_READ_LOOKASIDE))
					break;
			} else if (LF_ISSET(WT_READ_TRUNCATE)) {
				/*
				 * Avoid pulling a deleted page back in to try
				 * to delete it again.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
				/*
				 * If deleting a range, try to delete the page
				 * without instantiating it.
				 */
				WT_ERR(__wt_delete_page(session, ref, &skip));
				if (skip)
					break;
				empty_internal = false;
			} else if (skip_func != NULL) {
				WT_ERR(skip_func(session,
				    ref, func_cookie, &skip));
				if (skip)
					break;
			} else {
				/*
				 * Try to skip deleted pages visible to us.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
			}

			ret = __wt_page_swap(session, couple, ref,
			    WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);

			/*
			 * Not-found is an expected return when only walking
			 * in-cache pages, or if we see a deleted page.
			 */
			if (ret == WT_NOTFOUND) {
				ret = 0;
				break;
			}

			/*
			 * The page we're moving to might have split, in which
			 * case move to the last position we held.
			 */
			if (ret == WT_RESTART) {
				ret = 0;

				/*
				 * If a cursor is setting up at the end of the
				 * tree, we can't use our parent page's index,
				 * because it may have already split; restart
				 * the walk.
				 */
				if (prev && initial_descent)
					goto restart;

				/*
				 * If a new walk that never coupled from the
				 * root to a new saved position in the tree,
				 * restart the walk.
				 */
				if (couple == &btree->root)
					goto restart;

				/*
				 * If restarting from some original position,
				 * repeat the increment or decrement we made at
				 * that time. Otherwise, couple is an internal
				 * page we've acquired after moving from that
				 * starting position and we can treat it as a
				 * new page. This works because we never acquire
				 * a hazard pointer on a leaf page we're not
				 * going to return to our caller, this will quit
				 * working if that ever changes.
				 */
				WT_ASSERT(session,
				    couple == couple_orig ||
				    WT_PAGE_IS_INTERNAL(couple->page));
				ref = couple;
				__ref_index_slot(session, ref, &pindex, &slot);
				if (couple == couple_orig)
					break;
			}
			WT_ERR(ret);
			couple = ref;

			/*
			 * A new page: configure for traversal of any internal
			 * page's children, else return the leaf page.
			 */
			if (WT_PAGE_IS_INTERNAL(ref->page)) {
descend:			empty_internal = true;

				/*
				 * There's a split race when a cursor is setting
				 * up at the end of the tree or moving backwards
				 * through the tree and descending a level. When
				 * splitting an internal page into its parent,
				 * we move the WT_REF structures and update the
				 * parent's page index before updating the split
				 * page's page index, and it's not an atomic
				 * update. A thread can read the parent page's
				 * replacement page index, then read the split
				 * page's original index, or the parent page's
				 * original and the split page's replacement.
				 *
				 * This isn't a problem for a cursor setting up
				 * at the start of the tree or moving forwards
				 * through the tree because we do right-hand
				 * splits on internal pages and the initial part
				 * of the split page's namespace won't change as
				 * part of a split. A thread reading the parent
				 * page's and split page's indexes will move to
				 * the same slot no matter what order of indexes
				 * are read.
				 *
				 * Handle a cursor setting up at the end of the
				 * tree or moving backwards through the tree.
				 */
				if (!prev) {
					WT_INTL_INDEX_GET(
					    session, ref->page, pindex);
					slot = 0;
				} else if (initial_descent) {
					if (!__ref_initial_descent_prev(
					    session, ref, &pindex))
						goto restart;
					slot = pindex->entries - 1;
				} else {
					__ref_descend_prev(
					    session, ref, &pindex);
					slot = pindex->entries - 1;
				}
				continue;
			}

			/*
			 * The tree-walk restart code knows we return any leaf
			 * page we acquire (never hazard-pointer coupling on
			 * after acquiring a leaf page), and asserts no restart
			 * happens while holding a leaf page. This page must be
			 * returned to our caller.
			 */
			*refp = ref;
			goto done;
		}
	}

done:
err:	WT_LEAVE_PAGE_INDEX(session);
	return (ret);
}
Exemplo n.º 2
0
/*
 * __clsm_enter_update --
 *	Make sure an LSM cursor is ready to perform an update.
 */
static int
__clsm_enter_update(WT_CURSOR_LSM *clsm)
{
	WT_CURSOR *primary;
	WT_LSM_CHUNK *primary_chunk;
	WT_LSM_TREE *lsm_tree;
	WT_SESSION_IMPL *session;
	bool hard_limit, have_primary, ovfl;

	lsm_tree = clsm->lsm_tree;
	ovfl = false;
	session = (WT_SESSION_IMPL *)clsm->iface.session;

	if (clsm->nchunks == 0) {
		primary = NULL;
		have_primary = false;
	} else {
		primary = clsm->cursors[clsm->nchunks - 1];
		primary_chunk = clsm->primary_chunk;
		WT_ASSERT(session, F_ISSET(&session->txn, WT_TXN_HAS_ID));
		have_primary = (primary != NULL && primary_chunk != NULL &&
		    (primary_chunk->switch_txn == WT_TXN_NONE ||
		    WT_TXNID_LT(session->txn.id, primary_chunk->switch_txn)));
	}

	/*
	 * In LSM there are multiple btrees active at one time. The tree
	 * switch code needs to use btree API methods, and it wants to
	 * operate on the btree for the primary chunk. Set that up now.
	 *
	 * If the primary chunk has grown too large, set a flag so the worker
	 * thread will switch when it gets a chance to avoid introducing high
	 * latency into application threads.  Don't do this indefinitely: if a
	 * chunk grows twice as large as the configured size, block until it
	 * can be switched.
	 */
	hard_limit = F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);

	if (have_primary) {
		WT_ENTER_PAGE_INDEX(session);
		WT_WITH_BTREE(session, ((WT_CURSOR_BTREE *)primary)->btree,
		    ovfl = __wt_btree_lsm_over_size(session, hard_limit ?
		    2 * lsm_tree->chunk_size : lsm_tree->chunk_size));
		WT_LEAVE_PAGE_INDEX(session);

		/* If there was no overflow, we're done. */
		if (!ovfl)
			return (0);
	}

	/* Request a switch. */
	WT_RET(__wt_clsm_request_switch(clsm));

	/* If we only overflowed the soft limit, we're done. */
	if (have_primary && !hard_limit)
		return (0);

	WT_RET(__wt_clsm_await_switch(clsm));

	return (0);
}
Exemplo n.º 3
0
/*
 * __wt_tree_walk --
 *	Move to the next/previous page in the tree.
 */
int
__wt_tree_walk(WT_SESSION_IMPL *session,
    WT_REF **refp, uint64_t *walkcntp, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_INDEX *pindex;
	WT_REF *couple, *couple_orig, *ref;
	int prev, skip;
	uint32_t slot;

	btree = S2BT(session);

	/*
	 * Tree walks are special: they look inside page structures that splits
	 * may want to free.  Publish that the tree is active during this
	 * window.
	 */
	WT_ENTER_PAGE_INDEX(session);

	/*
	 * !!!
	 * Fast-truncate currently only works on row-store trees.
	 */
	if (btree->type != BTREE_ROW)
		LF_CLR(WT_READ_TRUNCATE);

	prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;

	/*
	 * There are multiple reasons and approaches to walking the in-memory
	 * tree:
	 *
	 * (1) finding pages to evict (the eviction server);
	 * (2) writing just dirty leaves or internal nodes (checkpoint);
	 * (3) discarding pages (close);
	 * (4) truncating pages in a range (fast truncate);
	 * (5) skipping pages based on outside information (compaction);
	 * (6) cursor scans (applications).
	 *
	 * Except for cursor scans and compaction, the walk is limited to the
	 * cache, no pages are read.  In all cases, hazard pointers protect the
	 * walked pages from eviction.
	 *
	 * Walks use hazard-pointer coupling through the tree and that's OK
	 * (hazard pointers can't deadlock, so there's none of the usual
	 * problems found when logically locking up a btree).  If the eviction
	 * thread tries to evict the active page, it fails because of our
	 * hazard pointer.  If eviction tries to evict our parent, that fails
	 * because the parent has a child page that can't be discarded.  We do
	 * play one game: don't couple up to our parent and then back down to a
	 * new leaf, couple to the next page to which we're descending, it
	 * saves a hazard-pointer swap for each cursor page movement.
	 *
	 * !!!
	 * NOTE: we depend on the fact it's OK to release a page we don't hold,
	 * that is, it's OK to release couple when couple is set to NULL.
	 *
	 * Take a copy of any held page and clear the return value.  Remember
	 * the hazard pointer we're currently holding.
	 *
	 * We may be passed a pointer to btree->evict_page that we are clearing
	 * here.  We check when discarding pages that we're not discarding that
	 * page, so this clear must be done before the page is released.
	 */
	couple = couple_orig = ref = *refp;
	*refp = NULL;

	/* If no page is active, begin a walk from the start of the tree. */
	if (ref == NULL) {
		ref = &btree->root;
		if (ref->page == NULL)
			goto done;
		goto descend;
	}

ascend:	/*
	 * If the active page was the root, we've reached the walk's end.
	 * Release any hazard-pointer we're holding.
	 */
	if (__wt_ref_is_root(ref)) {
		WT_ERR(__wt_page_release(session, couple, flags));
		goto done;
	}

	/* Figure out the current slot in the WT_REF array. */
	__wt_page_refp(session, ref, &pindex, &slot);

	for (;;) {
		/*
		 * If we're at the last/first slot on the page, return this page
		 * in post-order traversal.  Otherwise we move to the next/prev
		 * slot and left/right-most element in its subtree.
		 */
		if ((prev && slot == 0) ||
		    (!prev && slot == pindex->entries - 1)) {
			ref = ref->home->pg_intl_parent_ref;

			/* Optionally skip internal pages. */
			if (LF_ISSET(WT_READ_SKIP_INTL))
				goto ascend;

			/*
			 * We've ascended the tree and are returning an internal
			 * page.  If it's the root, discard our hazard pointer,
			 * otherwise, swap our hazard pointer for the page we'll
			 * return.
			 */
			if (__wt_ref_is_root(ref))
				WT_ERR(__wt_page_release(
				    session, couple, flags));
			else {
				/*
				 * Locate the reference to our parent page then
				 * swap our child hazard pointer for the parent.
				 * We don't handle restart or not-found returns.
				 * It would require additional complexity and is
				 * not a possible return: we're moving to the
				 * parent of the current child page, our parent
				 * reference can't have split or been evicted.
				 */
				__wt_page_refp(session, ref, &pindex, &slot);
				if ((ret = __wt_page_swap(
				    session, couple, ref, flags)) != 0) {
					WT_TRET(__wt_page_release(
					    session, couple, flags));
					WT_ERR(ret);
				}
			}

			*refp = ref;
			goto done;
		}

		if (prev)
			--slot;
		else
			++slot;

		if (walkcntp != NULL)
			++*walkcntp;

		for (;;) {
			ref = pindex->index[slot];

			if (LF_ISSET(WT_READ_CACHE)) {
				/*
				 * Only look at unlocked pages in memory:
				 * fast-path some common cases.
				 */
				if (LF_ISSET(WT_READ_NO_WAIT) &&
				    ref->state != WT_REF_MEM)
					break;
			} else if (LF_ISSET(WT_READ_TRUNCATE)) {
				/*
				 * Avoid pulling a deleted page back in to try
				 * to delete it again.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref))
					break;
				/*
				 * If deleting a range, try to delete the page
				 * without instantiating it.
				 */
				WT_ERR(__wt_delete_page(session, ref, &skip));
				if (skip)
					break;
			} else if (LF_ISSET(WT_READ_COMPACT)) {
				/*
				 * Skip deleted pages, rewriting them doesn't
				 * seem useful.
				 */
				if (ref->state == WT_REF_DELETED)
					break;

				/*
				 * If the page is in-memory, we want to look at
				 * it (it may have been modified and written,
				 * and the current location is the interesting
				 * one in terms of compaction, not the original
				 * location).  If the page isn't in-memory, test
				 * if the page will help with compaction, don't
				 * read it if we don't have to.
				 */
				if (ref->state == WT_REF_DISK) {
					WT_ERR(__wt_compact_page_skip(
					    session, ref, &skip));
					if (skip)
						break;
				}
			} else {
				/*
				 * Try to skip deleted pages visible to us.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref))
					break;
			}

			ret = __wt_page_swap(session, couple, ref, flags);

			/*
			 * Not-found is an expected return when only walking
			 * in-cache pages.
			 */
			if (ret == WT_NOTFOUND) {
				ret = 0;
				break;
			}

			/*
			 * The page we're moving to might have split, in which
			 * case move to the last position we held.
			 */
			if (ret == WT_RESTART) {
				ret = 0;

				/*
				 * If a new walk that never coupled from the
				 * root to a new saved position in the tree,
				 * restart the walk.
				 */
				if (couple == &btree->root) {
					ref = &btree->root;
					if (ref->page == NULL)
						goto done;
					goto descend;
				}

				/*
				 * If restarting from some original position,
				 * repeat the increment or decrement we made at
				 * that time. Otherwise, couple is an internal
				 * page we've acquired after moving from that
				 * starting position and we can treat it as a
				 * new page. This works because we never acquire
				 * a hazard pointer on a leaf page we're not
				 * going to return to our caller, this will quit
				 * working if that ever changes.
				 */
				WT_ASSERT(session,
				    couple == couple_orig ||
				    WT_PAGE_IS_INTERNAL(couple->page));
				ref = couple;
				__wt_page_refp(session, ref, &pindex, &slot);
				if (couple == couple_orig)
					break;
			}
			WT_ERR(ret);

			/*
			 * A new page: configure for traversal of any internal
			 * page's children, else return the leaf page.
			 */
descend:		couple = ref;
			page = ref->page;
			if (page->type == WT_PAGE_ROW_INT ||
			    page->type == WT_PAGE_COL_INT) {
				WT_INTL_INDEX_GET(session, page, pindex);
				slot = prev ? pindex->entries - 1 : 0;
			} else {
				*refp = ref;
				goto done;
			}
		}
	}

done:
err:	WT_LEAVE_PAGE_INDEX(session);
	return (ret);
}
Exemplo n.º 4
0
/*
 * __tree_walk_internal --
 *	Move to the next/previous page in the tree.
 */
static inline int
__tree_walk_internal(WT_SESSION_IMPL *session,
    WT_REF **refp, uint64_t *walkcntp, uint64_t *skipleafcntp, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE_INDEX *pindex;
	WT_REF *couple, *couple_orig, *ref;
	bool empty_internal, prev, skip;
	uint32_t slot;

	btree = S2BT(session);
	empty_internal = false;

	/*
	 * Tree walks are special: they look inside page structures that splits
	 * may want to free.  Publish that the tree is active during this
	 * window.
	 */
	WT_ENTER_PAGE_INDEX(session);

	/* Walk should never instantiate deleted pages. */
	LF_SET(WT_READ_NO_EMPTY);

	/*
	 * !!!
	 * Fast-truncate currently only works on row-store trees.
	 */
	if (btree->type != BTREE_ROW)
		LF_CLR(WT_READ_TRUNCATE);

	prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;

	/*
	 * There are multiple reasons and approaches to walking the in-memory
	 * tree:
	 *
	 * (1) finding pages to evict (the eviction server);
	 * (2) writing just dirty leaves or internal nodes (checkpoint);
	 * (3) discarding pages (close);
	 * (4) truncating pages in a range (fast truncate);
	 * (5) skipping pages based on outside information (compaction);
	 * (6) cursor scans (applications).
	 *
	 * Except for cursor scans and compaction, the walk is limited to the
	 * cache, no pages are read.  In all cases, hazard pointers protect the
	 * walked pages from eviction.
	 *
	 * Walks use hazard-pointer coupling through the tree and that's OK
	 * (hazard pointers can't deadlock, so there's none of the usual
	 * problems found when logically locking up a btree).  If the eviction
	 * thread tries to evict the active page, it fails because of our
	 * hazard pointer.  If eviction tries to evict our parent, that fails
	 * because the parent has a child page that can't be discarded.  We do
	 * play one game: don't couple up to our parent and then back down to a
	 * new leaf, couple to the next page to which we're descending, it
	 * saves a hazard-pointer swap for each cursor page movement.
	 *
	 * !!!
	 * NOTE: we depend on the fact it's OK to release a page we don't hold,
	 * that is, it's OK to release couple when couple is set to NULL.
	 *
	 * Take a copy of any held page and clear the return value.  Remember
	 * the hazard pointer we're currently holding.
	 *
	 * We may be passed a pointer to btree->evict_page that we are clearing
	 * here.  We check when discarding pages that we're not discarding that
	 * page, so this clear must be done before the page is released.
	 */
	couple = couple_orig = ref = *refp;
	*refp = NULL;

	/* If no page is active, begin a walk from the start of the tree. */
	if (ref == NULL) {
		ref = &btree->root;
		if (ref->page == NULL)
			goto done;
		goto descend;
	}

	/*
	 * If the active page was the root, we've reached the walk's end.
	 * Release any hazard-pointer we're holding.
	 */
	if (__wt_ref_is_root(ref)) {
		WT_ERR(__wt_page_release(session, couple, flags));
		goto done;
	}

	/* Figure out the current slot in the WT_REF array. */
	__ref_index_slot(session, ref, &pindex, &slot);

	for (;;) {
		/*
		 * If we're at the last/first slot on the internal page, return
		 * it in post-order traversal. Otherwise move to the next/prev
		 * slot and left/right-most element in that subtree.
		 */
		while ((prev && slot == 0) ||
		    (!prev && slot == pindex->entries - 1)) {
			/* Ascend to the parent. */
			__page_ascend(session, &ref, &pindex, &slot);

			/*
			 * If we got all the way through an internal page and
			 * all of the child pages were deleted, mark it for
			 * eviction.
			 */
			if (empty_internal && pindex->entries > 1) {
				__wt_page_evict_soon(ref->page);
				empty_internal = false;
			}

			/*
			 * If at the root and returning internal pages, return
			 * the root page, otherwise we're done. Regardless, no
			 * hazard pointer is required, release the one we hold.
			 */
			if (__wt_ref_is_root(ref)) {
				WT_ERR(__wt_page_release(
				    session, couple, flags));
				if (!LF_ISSET(WT_READ_SKIP_INTL))
					*refp = ref;
				goto done;
			}

			/*
			 * Optionally return internal pages. Swap our previous
			 * hazard pointer for the page we'll return. We don't
			 * handle restart or not-found returns, it would require
			 * additional complexity and is not a possible return:
			 * we're moving to the parent of the current child page,
			 * the parent can't have been evicted.
			 */
			if (!LF_ISSET(WT_READ_SKIP_INTL)) {
				WT_ERR(__wt_page_swap(
				    session, couple, ref, flags));
				*refp = ref;
				goto done;
			}
		}

		if (prev)
			--slot;
		else
			++slot;

		if (walkcntp != NULL)
			++*walkcntp;

		for (;;) {
			/*
			 * Move to the next slot, and set the reference hint if
			 * it's wrong (used when we continue the walk). We don't
			 * update those hints when splitting, so it's common for
			 * them to be incorrect in some workloads.
			 */
			ref = pindex->index[slot];
			if (ref->pindex_hint != slot)
				ref->pindex_hint = slot;

			/*
			 * If we see any child states other than deleted, the
			 * page isn't empty.
			 */
			if (ref->state != WT_REF_DELETED &&
			    !LF_ISSET(WT_READ_TRUNCATE))
				empty_internal = false;

			if (LF_ISSET(WT_READ_CACHE)) {
				/*
				 * Only look at unlocked pages in memory:
				 * fast-path some common cases.
				 */
				if (LF_ISSET(WT_READ_NO_WAIT) &&
				    ref->state != WT_REF_MEM)
					break;
			} else if (LF_ISSET(WT_READ_TRUNCATE)) {
				/*
				 * Avoid pulling a deleted page back in to try
				 * to delete it again.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
				/*
				 * If deleting a range, try to delete the page
				 * without instantiating it.
				 */
				WT_ERR(__wt_delete_page(session, ref, &skip));
				if (skip)
					break;
				empty_internal = false;
			} else if (LF_ISSET(WT_READ_COMPACT)) {
				/*
				 * Skip deleted pages, rewriting them doesn't
				 * seem useful.
				 */
				if (ref->state == WT_REF_DELETED)
					break;

				/*
				 * If the page is in-memory, we want to look at
				 * it (it may have been modified and written,
				 * and the current location is the interesting
				 * one in terms of compaction, not the original
				 * location).  If the page isn't in-memory, test
				 * if the page will help with compaction, don't
				 * read it if we don't have to.
				 */
				if (ref->state == WT_REF_DISK) {
					WT_ERR(__wt_compact_page_skip(
					    session, ref, &skip));
					if (skip)
						break;
				}
			} else {
				/*
				 * Try to skip deleted pages visible to us.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
			}

			/*
			 * Optionally skip leaf pages: skip all leaf pages if
			 * WT_READ_SKIP_LEAF is set, when the skip-leaf-count
			 * variable is non-zero, skip some count of leaf pages.
			 * If this page is disk-based, crack the cell to figure
			 * out it's a leaf page without reading it.
			 *
			 * If skipping some number of leaf pages, decrement the
			 * count of pages to zero, and then take the next leaf
			 * page we can. Be cautious around the page decrement,
			 * if for some reason don't take this particular page,
			 * we can take the next one, and, there are additional
			 * tests/decrements when we're about to return a leaf
			 * page.
			 */
			if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF))
				if (__ref_is_leaf(ref)) {
					if (LF_ISSET(WT_READ_SKIP_LEAF))
						break;
					if (*skipleafcntp > 0) {
						--*skipleafcntp;
						break;
					}
				}

			ret = __wt_page_swap(session, couple, ref,
			    WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);

			/*
			 * Not-found is an expected return when only walking
			 * in-cache pages, or if we see a deleted page.
			 */
			if (ret == WT_NOTFOUND) {
				ret = 0;
				break;
			}

			/*
			 * The page we're moving to might have split, in which
			 * case move to the last position we held.
			 */
			if (ret == WT_RESTART) {
				ret = 0;

				/*
				 * If a new walk that never coupled from the
				 * root to a new saved position in the tree,
				 * restart the walk.
				 */
				if (couple == &btree->root) {
					ref = &btree->root;
					if (ref->page == NULL)
						goto done;
					goto descend;
				}

				/*
				 * If restarting from some original position,
				 * repeat the increment or decrement we made at
				 * that time. Otherwise, couple is an internal
				 * page we've acquired after moving from that
				 * starting position and we can treat it as a
				 * new page. This works because we never acquire
				 * a hazard pointer on a leaf page we're not
				 * going to return to our caller, this will quit
				 * working if that ever changes.
				 */
				WT_ASSERT(session,
				    couple == couple_orig ||
				    WT_PAGE_IS_INTERNAL(couple->page));
				ref = couple;
				__ref_index_slot(session, ref, &pindex, &slot);
				if (couple == couple_orig)
					break;
			}
			WT_ERR(ret);

			/*
			 * A new page: configure for traversal of any internal
			 * page's children, else return the leaf page.
			 */
			if (WT_PAGE_IS_INTERNAL(ref->page)) {
descend:			couple = ref;
				empty_internal = true;

				__page_descend(
				    session, ref->page, &pindex, &slot, prev);
			} else {
				/*
				 * Optionally skip leaf pages, the second half.
				 * We didn't have an on-page cell to figure out
				 * if it was a leaf page, we had to acquire the
				 * hazard pointer and look at the page.
				 */
				if (skipleafcntp != NULL ||
				    LF_ISSET(WT_READ_SKIP_LEAF)) {
					couple = ref;
					if (LF_ISSET(WT_READ_SKIP_LEAF))
						break;
					if (*skipleafcntp > 0) {
						--*skipleafcntp;
						break;
					}
				}

				*refp = ref;
				goto done;
			}
		}
	}

done:
err:	WT_LEAVE_PAGE_INDEX(session);
	return (ret);
}