예제 #1
0
/*
 * __compact_rewrite --
 *	Decide whether compaction wants a page re-written. On return, *skipp
 * is true if the page should be skipped and false if the block manager
 * reported that rewriting it is worthwhile.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	size_t addr_size;
	const uint8_t *addr;

	/* Skip the page unless the block manager tells us otherwise. */
	*skipp = true;

	page = ref->page;
	mod = page->modify;
	bm = S2BT(session)->bm;

	/*
	 * Two kinds of pages are never tested: the root (it may not have a
	 * replacement address and it is written whenever anything else is),
	 * and currently dirty pages (they will be written regardless).
	 */
	if (__wt_ref_is_root(ref) || __wt_page_is_modified(page))
		return (0);

	/*
	 * The page has a reconciliation result: only a 1-to-1 replacement is
	 * tested, using the replacement address. Any other result (for
	 * example, an empty page that gets merged into its parent) keeps the
	 * default skip.
	 */
	if (mod != NULL && mod->rec_result != 0) {
		if (mod->rec_result != WT_PM_REC_REPLACE)
			return (0);

		/*
		 * Reconciliation can change the modification information
		 * underneath us, hold the page lock across the test.
		 */
		WT_RET(__wt_fair_lock(session, &page->page_lock));
		ret = bm->compact_page_skip(bm, session,
		    mod->mod_replace.addr, mod->mod_replace.size, skipp);
		WT_TRET(__wt_fair_unlock(session, &page->page_lock));
		return (ret);
	}

	/* A clean page: test the original address, if the page has one. */
	WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL)
		return (0);
	return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
예제 #2
0
파일: bt_misc.c 프로젝트: 7segments/mongo-1
/*
 * __wt_page_addr_string --
 *	Figure out a page's "address" and load a buffer with a printable,
 * nul-terminated representation of that address.
 *
 *	The root page is reported as the literal string "[Root]"; for any other
 * page the address is looked up from the reference and formatted into the
 * caller's buffer. The return value is the string to print.
 */
const char *
__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf)
{
	size_t addr_size;
	const uint8_t *addr;

	if (__wt_ref_is_root(ref)) {
		buf->data = "[Root]";
		buf->size = strlen("[Root]");
		return (buf->data);
	}

	/*
	 * This function is used to build diagnostic strings and has no way to
	 * return an error, so the lookup's return value is deliberately
	 * ignored. Start from a well-defined "no address" value so that a
	 * failed lookup can never hand indeterminate values to the formatting
	 * function.
	 *
	 * NOTE(review): this presumes __wt_addr_string copes with a NULL
	 * address -- confirm against its definition.
	 */
	addr = NULL;
	addr_size = 0;
	(void)__wt_ref_info(session, ref, &addr, &addr_size, NULL);
	return (__wt_addr_string(session, addr, addr_size, buf));
}
예제 #3
0
파일: bt_split.c 프로젝트: qihsh/mongo
/*
 * __split_should_deepen --
 *	Return true if the internal page referenced by ref should be split to
 * deepen the tree, false otherwise.
 */
static bool
__split_should_deepen(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_BTREE *btree;
	WT_PAGE *page;
	WT_PAGE_INDEX *pindex;

	page = ref->page;
	btree = S2BT(session);

	/*
	 * The caller holds the parent page locked, single-threading splits, so
	 * the page index can be read without setting a split generation.
	 */
	pindex = WT_INTL_INDEX_GET_SAFE(page);

	/*
	 * Pages below the in-memory page size limit aren't (presumably) putting
	 * eviction pressure on the cache, leave them alone.
	 */
	if (page->memory_footprint < btree->maxmempage)
		return (false);

	/*
	 * With enough entries the split is worth doing and gives a significant
	 * payback (with a set of large keys, splitting won't help).
	 */
	if (pindex->entries > btree->split_deepen_min_child)
		return (true);

	/*
	 * Otherwise, don't let a single page put pressure on cache usage: a
	 * deep tree is better than making no progress at all. As a sanity
	 * check require 100 on-page keys, nothing helps in the case of large
	 * keys and a too-small cache.
	 */
	if (pindex->entries < 100)
		return (false);

	/* The root page cannot be evicted, so let it split. */
	if (__wt_ref_is_root(ref))
		return (true);

	/* Any page holding a quarter of the cache is allowed to split. */
	return (page->memory_footprint >= S2C(session)->cache_size / 4);
}
예제 #4
0
파일: bt_sync.c 프로젝트: ajdavis/mongo
/*
 * __sync_dup_walk --
 *	Duplicate a tree walk point: release whatever *dupp currently
 * references, then make *dupp reference the same page as walk, taking an
 * additional hazard pointer when one is needed.
 */
static inline int
__sync_dup_walk(
    WT_SESSION_IMPL *session, WT_REF *walk, uint32_t flags, WT_REF **dupp)
{
	WT_REF *old;
	bool busy;

	/* Drop the previous duplicate, clearing the slot before the release. */
	old = *dupp;
	if (old != NULL) {
		*dupp = NULL;
		WT_RET(__wt_page_release(session, old, flags));
	}

	/*
	 * A walk that hasn't started (NULL), or that is at the root, can be
	 * duplicated without a hazard pointer. Otherwise acquire a second
	 * hazard pointer on the page.
	 */
	if (walk != NULL && !__wt_ref_is_root(walk)) {
		do {
#ifdef HAVE_DIAGNOSTIC
			WT_RET(__wt_hazard_set(
			    session, walk, &busy, __func__, __LINE__));
#else
			WT_RET(__wt_hazard_set(session, walk, &busy));
#endif
			/*
			 * We already hold one hazard pointer on this page, so
			 * we should be able to get another. Busy returns can
			 * be spurious (e.g., if eviction is attempting to lock
			 * the page): yield and try again.
			 */
			if (busy)
				__wt_yield();
		} while (busy);
	}

	*dupp = walk;
	return (0);
}
예제 #5
0
/*
 * Decide whether compaction needs to rewrite the page referenced by ref.
 * *skipp is set to 1 (skip the page) by default; it is only changed by the
 * block manager's compact_page_skip callback when the page's address is tested.
 */
static int __compact_rewrite(WT_SESSION_IMPL* session, WT_REF* ref, int* skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	size_t addr_size;
	const uint8_t *addr;

	/* Default: skip the page. */
	*skipp = 1;

	page = ref->page;
	mod = page->modify;
	bm = S2BT(session)->bm;

	/* The root page is never compacted and dirty pages are not tested here. */
	if (__wt_ref_is_root(ref) || __wt_page_is_modified(page))
		return 0;

	if (mod != NULL && F_ISSET(mod, WT_PM_REC_MASK) != 0) {
		/* Only a page reconciled into a replacement block is tested. */
		if (F_ISSET(mod, WT_PM_REC_MASK) != WT_PM_REC_REPLACE)
			return 0;

		/* Test the replacement block's address while holding the page lock. */
		WT_PAGE_LOCK(session, page);
		ret = bm->compact_page_skip(bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp);
		WT_PAGE_UNLOCK(session, page);
		return ret;
	}

	/* No reconciliation result: test the page's original address, if it has one. */
	WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL)
		return 0;
	return bm->compact_page_skip(bm, session, addr, addr_size, skipp);
}
예제 #6
0
/*
 * __tree_walk_internal --
 *	Move to the next/previous page in the tree.
 *
 *	refp: on entry, the page the caller currently holds, or NULL to begin a
 * new walk from the start/end of the tree. It is cleared immediately and, on
 * success, set to the next page to return to the caller; it is left NULL when
 * the walk is finished or an error is returned.
 *	walkcntp: if not NULL, incremented each time the walk moves to another
 * slot in an internal page.
 *	skip_func, func_cookie: optional callback, consulted only when neither
 * WT_READ_CACHE nor WT_READ_TRUNCATE is set; the slot is passed over if the
 * callback sets its skip argument.
 *	flags: WT_READ_XXX flags. WT_READ_NO_EMPTY is always added and
 * WT_READ_TRUNCATE is cleared for anything other than row-store trees.
 *
 *	Returns 0 on success, otherwise the error returned by a failed call.
 */
static inline int
__tree_walk_internal(WT_SESSION_IMPL *session,
    WT_REF **refp, uint64_t *walkcntp,
    int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *),
    void *func_cookie, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE_INDEX *pindex;
	WT_REF *couple, *couple_orig, *ref;
	uint32_t slot;
	bool empty_internal, initial_descent, prev, skip;

	btree = S2BT(session);
	pindex = NULL;
	empty_internal = initial_descent = false;

	/*
	 * Tree walks are special: they look inside page structures that splits
	 * may want to free.  Publish that the tree is active during this
	 * window.
	 */
	WT_ENTER_PAGE_INDEX(session);

	/* Walk should never instantiate deleted pages. */
	LF_SET(WT_READ_NO_EMPTY);

	/*
	 * !!!
	 * Fast-truncate currently only works on row-store trees.
	 */
	if (btree->type != BTREE_ROW)
		LF_CLR(WT_READ_TRUNCATE);

	/* Remember the direction of the walk: true when moving backwards. */
	prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;

	/*
	 * There are multiple reasons and approaches to walking the in-memory
	 * tree:
	 *
	 * (1) finding pages to evict (the eviction server);
	 * (2) writing just dirty leaves or internal nodes (checkpoint);
	 * (3) discarding pages (close);
	 * (4) truncating pages in a range (fast truncate);
	 * (5) skipping pages based on outside information (compaction);
	 * (6) cursor scans (applications).
	 *
	 * Except for cursor scans and compaction, the walk is limited to the
	 * cache, no pages are read.  In all cases, hazard pointers protect the
	 * walked pages from eviction.
	 *
	 * Walks use hazard-pointer coupling through the tree and that's OK
	 * (hazard pointers can't deadlock, so there's none of the usual
	 * problems found when logically locking up a btree).  If the eviction
	 * thread tries to evict the active page, it fails because of our
	 * hazard pointer.  If eviction tries to evict our parent, that fails
	 * because the parent has a child page that can't be discarded.  We do
	 * play one game: don't couple up to our parent and then back down to a
	 * new leaf, couple to the next page to which we're descending, it
	 * saves a hazard-pointer swap for each cursor page movement.
	 *
	 * !!!
	 * NOTE: we depend on the fact it's OK to release a page we don't hold,
	 * that is, it's OK to release couple when couple is set to NULL.
	 *
	 * Take a copy of any held page and clear the return value.  Remember
	 * the hazard pointer we're currently holding.
	 *
	 * Clear the returned value, it makes future error handling easier.
	 */
	couple = couple_orig = ref = *refp;
	*refp = NULL;

	/* If no page is active, begin a walk from the start/end of the tree. */
	if (ref == NULL) {
restart:	/*
		 * We can be here with a NULL or root WT_REF; the page release
		 * function handles them internally, don't complicate this code
		 * by calling them out.
		 */
		WT_ERR(__wt_page_release(session, couple, flags));

		/*
		 * We're not supposed to walk trees without root pages. As this
		 * has not always been the case, assert to debug that change.
		 */
		WT_ASSERT(session, btree->root.page != NULL);

		couple = couple_orig = ref = &btree->root;
		initial_descent = true;
		goto descend;
	}

	/*
	 * If the active page was the root, we've reached the walk's end; we
	 * only get here if we've returned the root to our caller, so we're
	 * holding no hazard pointers.
	 */
	if (__wt_ref_is_root(ref))
		goto done;

	/* Figure out the current slot in the WT_REF array. */
	__ref_index_slot(session, ref, &pindex, &slot);

	for (;;) {
		/*
		 * If we're at the last/first slot on the internal page, return
		 * it in post-order traversal. Otherwise move to the next/prev
		 * slot and left/right-most element in that subtree.
		 */
		while ((prev && slot == 0) ||
		    (!prev && slot == pindex->entries - 1)) {
			/* Ascend to the parent. */
			__ref_ascend(session, &ref, &pindex, &slot);

			/*
			 * If at the root and returning internal pages, return
			 * the root page, otherwise we're done. Regardless, no
			 * hazard pointer is required, release the one we hold.
			 */
			if (__wt_ref_is_root(ref)) {
				WT_ERR(__wt_page_release(
				    session, couple, flags));
				if (!LF_ISSET(WT_READ_SKIP_INTL))
					*refp = ref;
				goto done;
			}

			/*
			 * If we got all the way through an internal page and
			 * all of the child pages were deleted, mark it for
			 * eviction.
			 */
			if (empty_internal && pindex->entries > 1) {
				__wt_page_evict_soon(session, ref);
				empty_internal = false;
			}

			/*
			 * Optionally return internal pages. Swap our previous
			 * hazard pointer for the page we'll return. We don't
			 * handle restart or not-found returns, it would require
			 * additional complexity and is not a possible return:
			 * we're moving to the parent of the current child page,
			 * the parent can't have been evicted.
			 */
			if (!LF_ISSET(WT_READ_SKIP_INTL)) {
				WT_ERR(__wt_page_swap(
				    session, couple, ref, flags));
				*refp = ref;
				goto done;
			}
		}

		/* Step to the adjacent slot in the direction of the walk. */
		if (prev)
			--slot;
		else
			++slot;

		if (walkcntp != NULL)
			++*walkcntp;

		/*
		 * Inner loop: breaking out of it skips the current slot and
		 * returns to the outer loop, which moves on to the adjacent
		 * slot or ascends.
		 */
		for (;;) {
			/*
			 * Move to the next slot, and set the reference hint if
			 * it's wrong (used when we continue the walk). We don't
			 * always update the hints when splitting, it's expected
			 * for them to be incorrect in some workloads.
			 */
			ref = pindex->index[slot];
			if (ref->pindex_hint != slot)
				ref->pindex_hint = slot;

			/*
			 * If we see any child states other than deleted, the
			 * page isn't empty.
			 */
			if (ref->state != WT_REF_DELETED &&
			    !LF_ISSET(WT_READ_TRUNCATE))
				empty_internal = false;

			if (LF_ISSET(WT_READ_CACHE)) {
				/*
				 * Only look at unlocked pages in memory:
				 * fast-path some common cases.
				 */
				if (LF_ISSET(WT_READ_NO_WAIT) &&
				    ref->state != WT_REF_MEM)
					break;

				/* Skip lookaside pages if not requested. */
				if (ref->state == WT_REF_LOOKASIDE &&
				    !LF_ISSET(WT_READ_LOOKASIDE))
					break;
			} else if (LF_ISSET(WT_READ_TRUNCATE)) {
				/*
				 * Avoid pulling a deleted page back in to try
				 * to delete it again.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
				/*
				 * If deleting a range, try to delete the page
				 * without instantiating it.
				 */
				WT_ERR(__wt_delete_page(session, ref, &skip));
				if (skip)
					break;
				empty_internal = false;
			} else if (skip_func != NULL) {
				/* Let the caller's callback decide. */
				WT_ERR(skip_func(session,
				    ref, func_cookie, &skip));
				if (skip)
					break;
			} else {
				/*
				 * Try to skip deleted pages visible to us.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
			}

			/*
			 * Couple to the new page; not-found and restart are
			 * requested as returns and handled below.
			 */
			ret = __wt_page_swap(session, couple, ref,
			    WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);

			/*
			 * Not-found is an expected return when only walking
			 * in-cache pages, or if we see a deleted page.
			 */
			if (ret == WT_NOTFOUND) {
				ret = 0;
				break;
			}

			/*
			 * The page we're moving to might have split, in which
			 * case move to the last position we held.
			 */
			if (ret == WT_RESTART) {
				ret = 0;

				/*
				 * If a cursor is setting up at the end of the
				 * tree, we can't use our parent page's index,
				 * because it may have already split; restart
				 * the walk.
				 */
				if (prev && initial_descent)
					goto restart;

				/*
				 * If a new walk that never coupled from the
				 * root to a new saved position in the tree,
				 * restart the walk.
				 */
				if (couple == &btree->root)
					goto restart;

				/*
				 * If restarting from some original position,
				 * repeat the increment or decrement we made at
				 * that time. Otherwise, couple is an internal
				 * page we've acquired after moving from that
				 * starting position and we can treat it as a
				 * new page. This works because we never acquire
				 * a hazard pointer on a leaf page we're not
				 * going to return to our caller, this will quit
				 * working if that ever changes.
				 */
				WT_ASSERT(session,
				    couple == couple_orig ||
				    WT_PAGE_IS_INTERNAL(couple->page));
				ref = couple;
				__ref_index_slot(session, ref, &pindex, &slot);
				if (couple == couple_orig)
					break;
			}
			WT_ERR(ret);
			couple = ref;

			/*
			 * A new page: configure for traversal of any internal
			 * page's children, else return the leaf page.
			 */
			if (WT_PAGE_IS_INTERNAL(ref->page)) {
descend:			empty_internal = true;

				/*
				 * There's a split race when a cursor is setting
				 * up at the end of the tree or moving backwards
				 * through the tree and descending a level. When
				 * splitting an internal page into its parent,
				 * we move the WT_REF structures and update the
				 * parent's page index before updating the split
				 * page's page index, and it's not an atomic
				 * update. A thread can read the parent page's
				 * replacement page index, then read the split
				 * page's original index, or the parent page's
				 * original and the split page's replacement.
				 *
				 * This isn't a problem for a cursor setting up
				 * at the start of the tree or moving forwards
				 * through the tree because we do right-hand
				 * splits on internal pages and the initial part
				 * of the split page's namespace won't change as
				 * part of a split. A thread reading the parent
				 * page's and split page's indexes will move to
				 * the same slot no matter what order of indexes
				 * are read.
				 *
				 * Handle a cursor setting up at the end of the
				 * tree or moving backwards through the tree.
				 */
				if (!prev) {
					WT_INTL_INDEX_GET(
					    session, ref->page, pindex);
					slot = 0;
				} else if (initial_descent) {
					if (!__ref_initial_descent_prev(
					    session, ref, &pindex))
						goto restart;
					slot = pindex->entries - 1;
				} else {
					__ref_descend_prev(
					    session, ref, &pindex);
					slot = pindex->entries - 1;
				}
				continue;
			}

			/*
			 * The tree-walk restart code knows we return any leaf
			 * page we acquire (never hazard-pointer coupling on
			 * after acquiring a leaf page), and asserts no restart
			 * happens while holding a leaf page. This page must be
			 * returned to our caller.
			 */
			*refp = ref;
			goto done;
		}
	}

	/* Success and error paths share the same cleanup. */
done:
err:	WT_LEAVE_PAGE_INDEX(session);
	return (ret);
}
예제 #7
0
/*
 * __ref_ascend --
 *	Ascend the tree one level: replace *refp with the WT_REF of its parent
 * page and, unless that parent is the root, set *pindexp and *slotp to the
 * parent's position in the next higher internal page.
 */
static inline void
__ref_ascend(WT_SESSION_IMPL *session,
    WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp)
{
	WT_REF *parent_ref, *ref;

	/*
	 * The starting WT_REF is the first/last slot of an internal page and
	 * we're moving up to that page's parent. The internal page may be
	 * splitting into its parent at the same time, so look for a stable
	 * configuration in which the page we start from and the page we move
	 * to are connected. The tree eventually settles into one, retry until
	 * we see it.
	 */
	ref = *refp;
	do {
		/*
		 * Locate the parent's slot on the next higher internal page
		 * (the slot from which the caller moves to a next/prev slot),
		 * unless we've reached the root.
		 */
		parent_ref = ref->home->pg_intl_parent_ref;
		if (__wt_ref_is_root(parent_ref))
			break;
		__ref_index_slot(session, parent_ref, pindexp, slotp);

		/*
		 * A cursor moving forwards can race with a split while it
		 * ascends. Splitting an internal page into its parent moves
		 * the WT_REF structures, then updates the parent's page index
		 * and only afterwards the split page's page index; the update
		 * isn't atomic, so a thread can read the split page's original
		 * index followed by the parent's replacement index.
		 *
		 * For example, take an internal page with three children
		 * covering a-f, g-h and i-j, where the first child splits.
		 * The parent's page index goes from
		 *
		 *	| ... | a | g | i | ... |
		 * to
		 *	| ... | a | c | e | g | i | ... |
		 *
		 * while the split page's index starts out as
		 *
		 *	| a | b | c | d | e | f |
		 *
		 * A cursor that has finished 'f' begins ascending to the
		 * parent's 'a' slot, then the split replaces the parent's
		 * index. Searching the replacement index finds 'a', the cursor
		 * steps to the following slot, 'c', and incorrectly traverses
		 * part of the namespace a second time.
		 *
		 * To detect that, compare the page we started from against the
		 * page referenced by the parent slot we found: if they differ
		 * we raced with a split and go around again.
		 */
	} while (ref->home != parent_ref->page);

	*refp = parent_ref;
}
예제 #8
0
파일: bt_walk.c 프로젝트: 7segments/mongo-1
/*
 * __wt_tree_walk --
 *	Move to the next/previous page in the tree.
 *
 *	refp: on entry, the page the caller currently holds, or NULL to begin a
 * new walk from the root. It is cleared immediately and, on success, set to
 * the next page to return to the caller; it is left NULL when the walk is
 * finished, when the tree has no root page, or when an error is returned.
 *	walkcntp: if not NULL, incremented each time the walk moves to another
 * slot in an internal page.
 *	flags: WT_READ_XXX flags; WT_READ_TRUNCATE is cleared for anything other
 * than row-store trees.
 *
 *	Returns 0 on success, otherwise the error returned by a failed call.
 */
int
__wt_tree_walk(WT_SESSION_IMPL *session,
    WT_REF **refp, uint64_t *walkcntp, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_INDEX *pindex;
	WT_REF *couple, *couple_orig, *ref;
	int prev, skip;
	uint32_t slot;

	btree = S2BT(session);

	/*
	 * Tree walks are special: they look inside page structures that splits
	 * may want to free.  Publish that the tree is active during this
	 * window.
	 */
	WT_ENTER_PAGE_INDEX(session);

	/*
	 * !!!
	 * Fast-truncate currently only works on row-store trees.
	 */
	if (btree->type != BTREE_ROW)
		LF_CLR(WT_READ_TRUNCATE);

	/* Remember the direction of the walk: non-zero when moving backwards. */
	prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;

	/*
	 * There are multiple reasons and approaches to walking the in-memory
	 * tree:
	 *
	 * (1) finding pages to evict (the eviction server);
	 * (2) writing just dirty leaves or internal nodes (checkpoint);
	 * (3) discarding pages (close);
	 * (4) truncating pages in a range (fast truncate);
	 * (5) skipping pages based on outside information (compaction);
	 * (6) cursor scans (applications).
	 *
	 * Except for cursor scans and compaction, the walk is limited to the
	 * cache, no pages are read.  In all cases, hazard pointers protect the
	 * walked pages from eviction.
	 *
	 * Walks use hazard-pointer coupling through the tree and that's OK
	 * (hazard pointers can't deadlock, so there's none of the usual
	 * problems found when logically locking up a btree).  If the eviction
	 * thread tries to evict the active page, it fails because of our
	 * hazard pointer.  If eviction tries to evict our parent, that fails
	 * because the parent has a child page that can't be discarded.  We do
	 * play one game: don't couple up to our parent and then back down to a
	 * new leaf, couple to the next page to which we're descending, it
	 * saves a hazard-pointer swap for each cursor page movement.
	 *
	 * !!!
	 * NOTE: we depend on the fact it's OK to release a page we don't hold,
	 * that is, it's OK to release couple when couple is set to NULL.
	 *
	 * Take a copy of any held page and clear the return value.  Remember
	 * the hazard pointer we're currently holding.
	 *
	 * We may be passed a pointer to btree->evict_page that we are clearing
	 * here.  We check when discarding pages that we're not discarding that
	 * page, so this clear must be done before the page is released.
	 */
	couple = couple_orig = ref = *refp;
	*refp = NULL;

	/* If no page is active, begin a walk from the start of the tree. */
	if (ref == NULL) {
		ref = &btree->root;
		/* A tree without a root page has nothing to walk. */
		if (ref->page == NULL)
			goto done;
		goto descend;
	}

ascend:	/*
	 * If the active page was the root, we've reached the walk's end.
	 * Release any hazard-pointer we're holding.
	 */
	if (__wt_ref_is_root(ref)) {
		WT_ERR(__wt_page_release(session, couple, flags));
		goto done;
	}

	/* Figure out the current slot in the WT_REF array. */
	__wt_page_refp(session, ref, &pindex, &slot);

	for (;;) {
		/*
		 * If we're at the last/first slot on the page, return this page
		 * in post-order traversal.  Otherwise we move to the next/prev
		 * slot and left/right-most element in its subtree.
		 */
		if ((prev && slot == 0) ||
		    (!prev && slot == pindex->entries - 1)) {
			ref = ref->home->pg_intl_parent_ref;

			/* Optionally skip internal pages. */
			if (LF_ISSET(WT_READ_SKIP_INTL))
				goto ascend;

			/*
			 * We've ascended the tree and are returning an internal
			 * page.  If it's the root, discard our hazard pointer,
			 * otherwise, swap our hazard pointer for the page we'll
			 * return.
			 */
			if (__wt_ref_is_root(ref))
				WT_ERR(__wt_page_release(
				    session, couple, flags));
			else {
				/*
				 * Locate the reference to our parent page then
				 * swap our child hazard pointer for the parent.
				 * We don't handle restart or not-found returns.
				 * It would require additional complexity and is
				 * not a possible return: we're moving to the
				 * parent of the current child page, our parent
				 * reference can't have split or been evicted.
				 */
				__wt_page_refp(session, ref, &pindex, &slot);
				if ((ret = __wt_page_swap(
				    session, couple, ref, flags)) != 0) {
					WT_TRET(__wt_page_release(
					    session, couple, flags));
					WT_ERR(ret);
				}
			}

			*refp = ref;
			goto done;
		}

		/* Step to the adjacent slot in the direction of the walk. */
		if (prev)
			--slot;
		else
			++slot;

		if (walkcntp != NULL)
			++*walkcntp;

		/*
		 * Inner loop: breaking out of it skips the current slot and
		 * returns to the outer loop, which moves on to the adjacent
		 * slot or ascends.
		 */
		for (;;) {
			ref = pindex->index[slot];

			if (LF_ISSET(WT_READ_CACHE)) {
				/*
				 * Only look at unlocked pages in memory:
				 * fast-path some common cases.
				 */
				if (LF_ISSET(WT_READ_NO_WAIT) &&
				    ref->state != WT_REF_MEM)
					break;
			} else if (LF_ISSET(WT_READ_TRUNCATE)) {
				/*
				 * Avoid pulling a deleted page back in to try
				 * to delete it again.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref))
					break;
				/*
				 * If deleting a range, try to delete the page
				 * without instantiating it.
				 */
				WT_ERR(__wt_delete_page(session, ref, &skip));
				if (skip)
					break;
			} else if (LF_ISSET(WT_READ_COMPACT)) {
				/*
				 * Skip deleted pages, rewriting them doesn't
				 * seem useful.
				 */
				if (ref->state == WT_REF_DELETED)
					break;

				/*
				 * If the page is in-memory, we want to look at
				 * it (it may have been modified and written,
				 * and the current location is the interesting
				 * one in terms of compaction, not the original
				 * location).  If the page isn't in-memory, test
				 * if the page will help with compaction, don't
				 * read it if we don't have to.
				 */
				if (ref->state == WT_REF_DISK) {
					WT_ERR(__wt_compact_page_skip(
					    session, ref, &skip));
					if (skip)
						break;
				}
			} else {
				/*
				 * Try to skip deleted pages visible to us.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref))
					break;
			}

			/* Couple from the page we hold to the new page. */
			ret = __wt_page_swap(session, couple, ref, flags);

			/*
			 * Not-found is an expected return when only walking
			 * in-cache pages.
			 */
			if (ret == WT_NOTFOUND) {
				ret = 0;
				break;
			}

			/*
			 * The page we're moving to might have split, in which
			 * case move to the last position we held.
			 */
			if (ret == WT_RESTART) {
				ret = 0;

				/*
				 * If a new walk that never coupled from the
				 * root to a new saved position in the tree,
				 * restart the walk.
				 */
				if (couple == &btree->root) {
					ref = &btree->root;
					if (ref->page == NULL)
						goto done;
					goto descend;
				}

				/*
				 * If restarting from some original position,
				 * repeat the increment or decrement we made at
				 * that time. Otherwise, couple is an internal
				 * page we've acquired after moving from that
				 * starting position and we can treat it as a
				 * new page. This works because we never acquire
				 * a hazard pointer on a leaf page we're not
				 * going to return to our caller, this will quit
				 * working if that ever changes.
				 */
				WT_ASSERT(session,
				    couple == couple_orig ||
				    WT_PAGE_IS_INTERNAL(couple->page));
				ref = couple;
				__wt_page_refp(session, ref, &pindex, &slot);
				if (couple == couple_orig)
					break;
			}
			WT_ERR(ret);

			/*
			 * A new page: configure for traversal of any internal
			 * page's children, else return the leaf page.
			 */
descend:		couple = ref;
			page = ref->page;
			if (page->type == WT_PAGE_ROW_INT ||
			    page->type == WT_PAGE_COL_INT) {
				WT_INTL_INDEX_GET(session, page, pindex);
				/* Start at the last slot when walking backwards. */
				slot = prev ? pindex->entries - 1 : 0;
			} else {
				*refp = ref;
				goto done;
			}
		}
	}

	/* Success and error paths share the same cleanup. */
done:
err:	WT_LEAVE_PAGE_INDEX(session);
	return (ret);
}
예제 #9
0
파일: bt_vrfy.c 프로젝트: BobbWu/wiredtiger
/*
 * __verify_tree --
 *	Verify a tree, recursively descending through it in depth-first fashion.
 * The page argument was physically verified (so we know it's correctly formed),
 * and the in-memory version built.  Our job is to check logical relationships
 * in the page and in the tree.
 */
static int
__verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
{
	WT_BM *bm;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *child_ref;
	uint64_t recno;
	uint32_t entry, i;
	bool found;

	bm = S2BT(session)->bm;
	page = ref->page;

	unpack = &_unpack;
	WT_CLEAR(*unpack);	/* -Wuninitialized */

	WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s",
	    __wt_page_addr_string(session, ref, vs->tmp1),
	    __wt_page_type_string(page->type)));

	/* Optionally dump the address. */
	if (vs->dump_address)
		WT_RET(__wt_msg(session, "%s %s",
		    __wt_page_addr_string(session, ref, vs->tmp1),
		    __wt_page_type_string(page->type)));

	/* Track the shape of the tree. */
	if (WT_PAGE_IS_INTERNAL(page))
		++vs->depth_internal[
		    WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)];
	else
		++vs->depth_leaf[
		    WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)];

	/*
	 * The page's physical structure was verified when it was read into
	 * memory by the read server thread, and then the in-memory version
	 * of the page was built. Now we make sure the page and tree are
	 * logically consistent.
	 *
	 * !!!
	 * The problem: (1) the read server has to build the in-memory version
	 * of the page because the read server is the thread that flags when
	 * any thread can access the page in the tree; (2) we can't build the
	 * in-memory version of the page until the physical structure is known
	 * to be OK, so the read server has to verify at least the physical
	 * structure of the page; (3) doing complete page verification requires
	 * reading additional pages (for example, overflow keys imply reading
	 * overflow pages in order to test the key's order in the page); (4)
	 * the read server cannot read additional pages because it will hang
	 * waiting on itself.  For this reason, we split page verification
	 * into a physical verification, which allows the in-memory version
	 * of the page to be built, and then a subsequent logical verification
	 * which happens here.
	 *
	 * Report progress occasionally.
	 */
#define	WT_VERIFY_PROGRESS_INTERVAL	100
	if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0)
		WT_RET(__wt_progress(session, NULL, vs->fcnt));

#ifdef HAVE_DIAGNOSTIC
	/* Optionally dump the blocks or page in debugging mode. */
	if (vs->dump_blocks)
		WT_RET(__wt_debug_disk(session, page->dsk, NULL));
	if (vs->dump_pages)
		WT_RET(__wt_debug_page(session, page, NULL));
#endif

	/*
	 * Column-store key order checks: check the page's record number and
	 * then update the total record count.
	 */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		recno = page->pg_fix_recno;
		goto recno_chk;
	case WT_PAGE_COL_INT:
		recno = page->pg_intl_recno;
		goto recno_chk;
	case WT_PAGE_COL_VAR:
		recno = page->pg_var_recno;
recno_chk:	if (recno != vs->record_total + 1)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s has a starting record of %" PRIu64
			    " when the expected starting record is %" PRIu64,
			    __wt_page_addr_string(session, ref, vs->tmp1),
			    recno, vs->record_total + 1);
		break;
	}
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		vs->record_total += page->pg_fix_entries;
		break;
	case WT_PAGE_COL_VAR:
		recno = 0;
		WT_COL_FOREACH(page, cip, i)
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				++recno;
			else {
				__wt_cell_unpack(cell, unpack);
				recno += __wt_cell_rle(unpack);
			}
		vs->record_total += recno;
		break;
	}

	/*
	 * Row-store leaf page key order check: it's a depth-first traversal,
	 * the first key on this page should be larger than any key previously
	 * seen.
	 */
	switch (page->type) {
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_row_leaf_key_order(session, ref, vs));
		break;
	}

	/* If it's not the root page, unpack the parent cell. */
	if (!__wt_ref_is_root(ref)) {
		__wt_cell_unpack(ref->addr, unpack);

		/* Compare the parent cell against the page type. */
		switch (page->type) {
		case WT_PAGE_COL_FIX:
			if (unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_COL_VAR:
			if (unpack->raw != WT_CELL_ADDR_LEAF &&
			    unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_ROW_LEAF:
			if (unpack->raw != WT_CELL_ADDR_DEL &&
			    unpack->raw != WT_CELL_ADDR_LEAF &&
			    unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_COL_INT:
		case WT_PAGE_ROW_INT:
			if (unpack->raw != WT_CELL_ADDR_INT)
celltype_err:			WT_RET_MSG(session, WT_ERROR,
				    "page at %s, of type %s, is referenced in "
				    "its parent by a cell of type %s",
				    __wt_page_addr_string(
					session, ref, vs->tmp1),
				    __wt_page_type_string(page->type),
				    __wt_cell_type_string(unpack->raw));
			break;
		}
	}

	/*
	 * Check overflow pages.  We check overflow cells separately from other
	 * tests that walk the page as it's simpler, and I don't care much how
	 * fast table verify runs.
	 */
	switch (page->type) {
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_overflow_cell(session, ref, &found, vs));
		if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT)
			break;

		/*
		 * Object if a leaf-no-overflow address cell references a page
		 * with overflow keys, but don't object if a leaf address cell
		 * references a page without overflow keys.  Reconciliation
		 * doesn't guarantee every leaf page without overflow items will
		 * be a leaf-no-overflow type.
		 */
		if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s, of type %s and referenced in its "
			    "parent by a cell of type %s, contains overflow "
			    "items",
			    __wt_page_addr_string(session, ref, vs->tmp1),
			    __wt_page_type_string(page->type),
			    __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO));
		break;
	}

	/* Check tree connections and recursively descend the tree. */
	switch (page->type) {
	case WT_PAGE_COL_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * record number should be 1 more than the total records
			 * reviewed to this point.
			 */
			++entry;
			if (child_ref->key.recno != vs->record_total + 1) {
				WT_RET_MSG(session, WT_ERROR,
				    "the starting record number in entry %"
				    PRIu32 " of the column internal page at "
				    "%s is %" PRIu64 " and the expected "
				    "starting record number is %" PRIu64,
				    entry,
				    __wt_page_addr_string(
				    session, child_ref, vs->tmp1),
				    child_ref->key.recno,
				    vs->record_total + 1);
			}

			/* Verify the subtree. */
			++vs->depth;
			WT_RET(__wt_page_in(session, child_ref, 0));
			ret = __verify_tree(session, child_ref, vs);
			WT_TRET(__wt_page_release(session, child_ref, 0));
			--vs->depth;
			WT_RET(ret);

			__wt_cell_unpack(child_ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		} WT_INTL_FOREACH_END;
		break;
	case WT_PAGE_ROW_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * key should be larger than the largest key previously
			 * reviewed.
			 *
			 * The 0th key of any internal page is magic, and we
			 * can't test against it.
			 */
			++entry;
			if (entry != 1)
				WT_RET(__verify_row_int_key_order(
				    session, page, child_ref, entry, vs));

			/* Verify the subtree. */
			++vs->depth;
			WT_RET(__wt_page_in(session, child_ref, 0));
			ret = __verify_tree(session, child_ref, vs);
			WT_TRET(__wt_page_release(session, child_ref, 0));
			--vs->depth;
			WT_RET(ret);

			__wt_cell_unpack(child_ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		} WT_INTL_FOREACH_END;
예제 #10
0
파일: bt_compact.c 프로젝트: Arikes/mongo
/*
 * __compact_rewrite --
 *	Return if a page needs to be re-written.
 *
 *	*skipp defaults to true (skip the page) and is only changed by the
 * block manager's compact_page_skip method. Returns 0, or an error from
 * taking/releasing the page lock or from the block manager.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *bm;
    WT_DECL_RET;
    WT_MULTI *multi;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    size_t addr_size;
    uint32_t i;
    const uint8_t *addr;

    *skipp = true;					/* Default skip. */

    bm = S2BT(session)->bm;
    page = ref->page;
    mod = page->modify;

    /*
     * Ignore the root: it may not have a replacement address, and besides,
     * if anything else gets written, so will it.
     */
    if (__wt_ref_is_root(ref))
        return (0);

    /* Ignore currently dirty pages, they will be written regardless. */
    if (__wt_page_is_modified(page))
        return (0);

    /*
     * If the page is clean, test the original addresses.
     * If the page is a replacement, test the replacement addresses.
     * Ignore empty pages, they get merged into the parent.
     */
    if (mod == NULL || mod->rec_result == 0) {
        __wt_ref_info(ref, &addr, &addr_size, NULL);
        if (addr == NULL)
            return (0);
        return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
    }

    /*
     * Only replacement and multi-block results carry addresses to test;
     * anything else keeps the default and needs no lock.
     */
    if (mod->rec_result != WT_PM_REC_REPLACE &&
        mod->rec_result != WT_PM_REC_MULTIBLOCK)
        return (0);

    /*
     * The page's modification information can change underfoot if the page
     * is being reconciled, serialize with reconciliation.
     *
     * The decision to lock is made exactly once, above: from here on the
     * lock is always held and always released, so a change in the result
     * between the lock and unlock tests can no longer unbalance them. The
     * result is re-read under the lock before any address is touched.
     */
    WT_RET(__wt_fair_lock(session, &page->page_lock));

    if (mod->rec_result == WT_PM_REC_REPLACE)
        ret = bm->compact_page_skip(bm, session,
            mod->mod_replace.addr, mod->mod_replace.size, skipp);
    else if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
        for (multi = mod->mod_multi,
            i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
            /* Entries that still hold a disk image are not tested. */
            if (multi->disk_image != NULL)
                continue;
            if ((ret = bm->compact_page_skip(bm, session,
                multi->addr.addr, multi->addr.size, skipp)) != 0)
                break;
            /* One block worth rewriting is enough, stop looking. */
            if (!*skipp)
                break;
        }

    WT_TRET(__wt_fair_unlock(session, &page->page_lock));

    return (ret);
}
예제 #11
0
/*
 * __wt_evict_file --
 *	Discard pages for a specific file.
 *
 *	The syncop argument picks the discard policy: WT_SYNC_CLOSE reconciles
 * dirty pages and evicts, WT_SYNC_DISCARD drops pages but gives up with EBUSY
 * on a dirty or too-recent page, WT_SYNC_DISCARD_FORCE drops pages whether
 * clean or dirty. Any other value is reported as an illegal value.
 */
int
__wt_evict_file(WT_SESSION_IMPL *session, int syncop)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *next_ref, *ref;
	int eviction_enabled;

	btree = S2BT(session);
	eviction_enabled = F_ISSET(btree, WT_BTREE_NO_EVICTION) ? 0 : 1;

	/*
	 * The file must be ours alone: switch off ordinary eviction and drain
	 * whatever blocks are already queued.
	 */
	if (eviction_enabled)
		WT_RET(__wt_evict_file_exclusive_on(session));

	/* Bring the oldest transaction ID up-to-date. */
	__wt_txn_update_oldest(session);

	/* Start the walk, then discard pages one at a time. */
	next_ref = NULL;
	WT_ERR(__wt_tree_walk(
	    session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
	for (ref = next_ref; ref != NULL; ref = next_ref) {
		page = ref->page;

		/*
		 * A page in the evicted page's subtree can switch state and
		 * make eviction fail. For example, a page marked empty is not
		 * evicted because it is expected to merge into its parent, but
		 * it may no longer be empty once reconciled, and then evicting
		 * the parent would fail. Rather than walking the tree until it
		 * is finally empty, reconcile each page into its final state
		 * before deciding whether it is an eviction target or will be
		 * merged into its parent.
		 *
		 * The test is deliberately not restricted by page type: doing
		 * so tends to introduce bugs when reconciliation of other page
		 * types changes, and buys nothing.
		 *
		 * Eviction can also fail because an update cannot be written.
		 * When sessions have disjoint sets of files open, updates in a
		 * no-longer-referenced file may not be globally visible yet and
		 * the write fails with EBUSY. The caller handles that error by
		 * retrying later.
		 */
		if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
			WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING));

		/*
		 * The page just returned marks our place in the tree and so
		 * can't be evicted yet: step the walk one page past it first.
		 * The reconciliation above has to come before this step, since
		 * a reconciliation that changed the shape of the tree after the
		 * walk had moved on could make the walk miss a page.
		 */
		WT_ERR(__wt_tree_walk(
		    session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));

		switch (syncop) {
		case WT_SYNC_CLOSE:
			/*
			 * Pages expected to be merged into their parents are
			 * left alone. The root is the exception: it can't be
			 * merged, it must be written.
			 */
			if (!__wt_ref_is_root(ref) &&
			    page->modify != NULL &&
			    F_ISSET(page->modify, WT_PM_REC_EMPTY))
				break;

			/* Evict the page. */
			WT_ERR(__wt_evict(session, ref, 1));
			break;
		case WT_SYNC_DISCARD:
			/*
			 * Ordinary discard. Give up on a dirty page (e.g., from
			 * sweep, an update must have happened since the file
			 * was selected for sweeping), and on a page holding an
			 * update too recent to evict. The latter should never
			 * happen during connection close, but in other paths
			 * the caller should be prepared to deal with it.
			 */
			if (__wt_page_is_modified(page) ||
			    (page->modify != NULL &&
			    !__wt_txn_visible_all(session,
			    page->modify->rec_max_txn)))
				WT_ERR(EBUSY);

			__wt_evict_page_clean_update(session, ref);
			break;
		case WT_SYNC_DISCARD_FORCE:
			/*
			 * Forced discard. A dirty page is cleaned first, both
			 * to keep statistics correct and so the page-discard
			 * function can assert no dirty page is ever discarded.
			 */
			if (__wt_page_is_modified(page)) {
				page->modify->write_gen = 0;
				__wt_cache_dirty_decr(session, page);
			}

			F_SET(session, WT_SESSION_DISCARD_FORCE);
			__wt_evict_page_clean_update(session, ref);
			F_CLR(session, WT_SESSION_DISCARD_FORCE);
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}
	}

	/*
	 * The loop only finishes normally once the walk has run out of pages,
	 * leaving no walk position behind: a position that is still set here
	 * is left over from an error and has to be released.
	 */
err:	if (next_ref != NULL)
		WT_TRET(__wt_page_release(
		    session, next_ref, WT_READ_NO_EVICT));

	if (eviction_enabled)
		__wt_evict_file_exclusive_off(session);

	return (ret);
}
예제 #12
0
파일: evict_stat.c 프로젝트: ajdavis/mongo
/*
 * __evict_stat_walk --
 *	Walk all the pages in cache for a dhandle gathering stats information
 *
 *	The walk stops at the first non-zero return from the tree walk or when
 * the walk runs out of pages; the totals gathered up to that point are then
 * published as the cache_state_* data-source statistics. Averages and the
 * minimum written size are reported as 0 when there is nothing to base them
 * on.
 */
static void
__evict_stat_walk(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_CACHE *cache;
	WT_PAGE *page;
	WT_REF *next_walk;
	uint64_t dsk_size, gen_gap, gen_gap_max, gen_gap_sum, max_pagesize;
	uint64_t min_written_size, num_memory, num_not_queueable, num_queued;
	uint64_t num_smaller_allocsz, pages_clean, pages_dirty, pages_internal;
	uint64_t pages_leaf, seen_count, size, visited_count;
	uint64_t visited_age_gap_sum, unvisited_count, unvisited_age_gap_sum;
	uint64_t walk_count, written_size_cnt, written_size_sum;

	btree = S2BT(session);
	cache = S2C(session)->cache;
	next_walk = NULL;
	gen_gap_max = gen_gap_sum = max_pagesize = 0;
	num_memory = num_not_queueable = num_queued = 0;
	num_smaller_allocsz = pages_clean = pages_dirty = pages_internal = 0;
	pages_leaf = seen_count = size = visited_count = 0;
	visited_age_gap_sum = unvisited_count = unvisited_age_gap_sum = 0;
	walk_count = written_size_cnt = written_size_sum = 0;
	/* Sentinel: replaced by the first page that has a disk image. */
	min_written_size = UINT64_MAX;

	while (__wt_tree_walk_count(session, &next_walk, &walk_count,
	    WT_READ_CACHE | WT_READ_NO_EVICT |
	    WT_READ_NO_GEN | WT_READ_NO_WAIT) == 0 &&
	    next_walk != NULL) {
		++seen_count;
		page = next_walk->page;
		size = page->memory_footprint;

		if (__wt_page_is_modified(page))
			++pages_dirty;
		else
			++pages_clean;

		if (!__wt_ref_is_root(next_walk) &&
		    !__wt_page_can_evict(session, next_walk, NULL))
			++num_not_queueable;

		if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
			++num_queued;

		if (size > max_pagesize)
			max_pagesize = size;

		/* Pages without a disk image count as in-memory only. */
		dsk_size = page->dsk != NULL ? page->dsk->mem_size : 0;
		if (dsk_size != 0) {
			if (dsk_size < btree->allocsize)
				++num_smaller_allocsz;
			if (dsk_size < min_written_size)
				min_written_size = dsk_size;
			++written_size_cnt;
			written_size_sum += dsk_size;
		} else
			++num_memory;

		if (WT_PAGE_IS_INTERNAL(page))
			++pages_internal;
		else
			++pages_leaf;

		/* Skip root pages since they are never considered */
		if (__wt_ref_is_root(next_walk))
			continue;

		if (page->evict_pass_gen == 0) {
			unvisited_age_gap_sum +=
			    (cache->evict_pass_gen - page->cache_create_gen);
			++unvisited_count;
		} else {
			visited_age_gap_sum +=
			    (cache->evict_pass_gen - page->cache_create_gen);
			gen_gap = cache->evict_pass_gen - page->evict_pass_gen;
			if (gen_gap > gen_gap_max)
				gen_gap_max = gen_gap;
			gen_gap_sum += gen_gap;
			++visited_count;
		}
	}

	WT_STAT_DATA_SET(session, cache_state_gen_avg_gap,
	    visited_count == 0 ? 0 : gen_gap_sum / visited_count);
	WT_STAT_DATA_SET(session, cache_state_avg_unvisited_age,
	    unvisited_count == 0 ? 0 : unvisited_age_gap_sum / unvisited_count);
	WT_STAT_DATA_SET(session, cache_state_avg_visited_age,
	    visited_count == 0 ? 0 : visited_age_gap_sum / visited_count);
	WT_STAT_DATA_SET(session, cache_state_avg_written_size,
	    written_size_cnt == 0 ? 0 : written_size_sum / written_size_cnt);
	WT_STAT_DATA_SET(session, cache_state_gen_max_gap, gen_gap_max);
	WT_STAT_DATA_SET(session, cache_state_max_pagesize, max_pagesize);
	/*
	 * If no page had a disk image the minimum is still the UINT64_MAX
	 * sentinel: report 0, the same way the average written size does.
	 */
	WT_STAT_DATA_SET(session, cache_state_min_written_size,
	    written_size_cnt == 0 ? 0 : min_written_size);
	WT_STAT_DATA_SET(session, cache_state_memory, num_memory);
	WT_STAT_DATA_SET(session, cache_state_queued, num_queued);
	WT_STAT_DATA_SET(session, cache_state_not_queueable, num_not_queueable);
	WT_STAT_DATA_SET(session, cache_state_pages, walk_count);
	WT_STAT_DATA_SET(session, cache_state_pages_clean, pages_clean);
	WT_STAT_DATA_SET(session, cache_state_pages_dirty, pages_dirty);
	WT_STAT_DATA_SET(session, cache_state_pages_internal, pages_internal);
	WT_STAT_DATA_SET(session, cache_state_pages_leaf, pages_leaf);
	/*
	 * Both counters are unsigned: guard the subtraction so a walk count
	 * smaller than the number of pages seen can't wrap to a huge value.
	 */
	WT_STAT_DATA_SET(session, cache_state_refs_skipped,
	    walk_count > seen_count ? walk_count - seen_count : 0);
	WT_STAT_DATA_SET(session,
	    cache_state_smaller_alloc_size, num_smaller_allocsz);
	WT_STAT_DATA_SET(session,
	    cache_state_unvisited_count, unvisited_count);
}
예제 #13
0
파일: bt_walk.c 프로젝트: Arikes/mongo
/*
 * __tree_walk_internal --
 *	Move to the next/previous page in the tree.
 *
 *	refp is both input and output: on entry it holds the walk's current
 * position (NULL starts a new walk from the root), it is cleared before any
 * page is released, and it is set to the page being returned. It is left NULL
 * when the walk has finished, when the root has no page, or on error.
 *
 *	walkcntp, if not NULL, is incremented each time the walk advances to
 * another slot of an internal page. skipleafcntp, if not NULL, is a count of
 * leaf pages to skip, decremented for each leaf page skipped.
 *
 *	flags are the caller's read flags; WT_READ_NO_EMPTY is always added
 * and WT_READ_TRUNCATE is cleared unless the tree is a row-store.
 *
 *	Returns 0 or an error from a called function. WT_NOTFOUND and
 * WT_RESTART returned by the page swap while descending are handled here and
 * are not passed back.
 */
static inline int
__tree_walk_internal(WT_SESSION_IMPL *session,
    WT_REF **refp, uint64_t *walkcntp, uint64_t *skipleafcntp, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE_INDEX *pindex;
	WT_REF *couple, *couple_orig, *ref;
	bool empty_internal, prev, skip;
	uint32_t slot;

	btree = S2BT(session);
	empty_internal = false;

	/*
	 * Tree walks are special: they look inside page structures that splits
	 * may want to free.  Publish that the tree is active during this
	 * window.
	 */
	WT_ENTER_PAGE_INDEX(session);

	/* Walk should never instantiate deleted pages. */
	LF_SET(WT_READ_NO_EMPTY);

	/*
	 * !!!
	 * Fast-truncate currently only works on row-store trees.
	 */
	if (btree->type != BTREE_ROW)
		LF_CLR(WT_READ_TRUNCATE);

	/* Walk direction: backward if WT_READ_PREV is set, else forward. */
	prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;

	/*
	 * There are multiple reasons and approaches to walking the in-memory
	 * tree:
	 *
	 * (1) finding pages to evict (the eviction server);
	 * (2) writing just dirty leaves or internal nodes (checkpoint);
	 * (3) discarding pages (close);
	 * (4) truncating pages in a range (fast truncate);
	 * (5) skipping pages based on outside information (compaction);
	 * (6) cursor scans (applications).
	 *
	 * Except for cursor scans and compaction, the walk is limited to the
	 * cache, no pages are read.  In all cases, hazard pointers protect the
	 * walked pages from eviction.
	 *
	 * Walks use hazard-pointer coupling through the tree and that's OK
	 * (hazard pointers can't deadlock, so there's none of the usual
	 * problems found when logically locking up a btree).  If the eviction
	 * thread tries to evict the active page, it fails because of our
	 * hazard pointer.  If eviction tries to evict our parent, that fails
	 * because the parent has a child page that can't be discarded.  We do
	 * play one game: don't couple up to our parent and then back down to a
	 * new leaf, couple to the next page to which we're descending, it
	 * saves a hazard-pointer swap for each cursor page movement.
	 *
	 * !!!
	 * NOTE: we depend on the fact it's OK to release a page we don't hold,
	 * that is, it's OK to release couple when couple is set to NULL.
	 *
	 * Take a copy of any held page and clear the return value.  Remember
	 * the hazard pointer we're currently holding.
	 *
	 * We may be passed a pointer to btree->evict_page that we are clearing
	 * here.  We check when discarding pages that we're not discarding that
	 * page, so this clear must be done before the page is released.
	 */
	couple = couple_orig = ref = *refp;
	*refp = NULL;

	/* If no page is active, begin a walk from the start of the tree. */
	if (ref == NULL) {
		ref = &btree->root;
		if (ref->page == NULL)
			goto done;
		goto descend;
	}

	/*
	 * If the active page was the root, we've reached the walk's end.
	 * Release any hazard-pointer we're holding.
	 */
	if (__wt_ref_is_root(ref)) {
		WT_ERR(__wt_page_release(session, couple, flags));
		goto done;
	}

	/* Figure out the current slot in the WT_REF array. */
	__ref_index_slot(session, ref, &pindex, &slot);

	for (;;) {
		/*
		 * If we're at the last/first slot on the internal page, return
		 * it in post-order traversal. Otherwise move to the next/prev
		 * slot and left/right-most element in that subtree.
		 */
		while ((prev && slot == 0) ||
		    (!prev && slot == pindex->entries - 1)) {
			/* Ascend to the parent. */
			__page_ascend(session, &ref, &pindex, &slot);

			/*
			 * If we got all the way through an internal page and
			 * all of the child pages were deleted, mark it for
			 * eviction.
			 */
			if (empty_internal && pindex->entries > 1) {
				__wt_page_evict_soon(ref->page);
				empty_internal = false;
			}

			/*
			 * If at the root and returning internal pages, return
			 * the root page, otherwise we're done. Regardless, no
			 * hazard pointer is required, release the one we hold.
			 */
			if (__wt_ref_is_root(ref)) {
				WT_ERR(__wt_page_release(
				    session, couple, flags));
				if (!LF_ISSET(WT_READ_SKIP_INTL))
					*refp = ref;
				goto done;
			}

			/*
			 * Optionally return internal pages. Swap our previous
			 * hazard pointer for the page we'll return. We don't
			 * handle restart or not-found returns, it would require
			 * additional complexity and is not a possible return:
			 * we're moving to the parent of the current child page,
			 * the parent can't have been evicted.
			 */
			if (!LF_ISSET(WT_READ_SKIP_INTL)) {
				WT_ERR(__wt_page_swap(
				    session, couple, ref, flags));
				*refp = ref;
				goto done;
			}
		}

		if (prev)
			--slot;
		else
			++slot;

		/* Count the slot just moved to, if the caller wants a count. */
		if (walkcntp != NULL)
			++*walkcntp;

		for (;;) {
			/*
			 * Move to the next slot, and set the reference hint if
			 * it's wrong (used when we continue the walk). We don't
			 * update those hints when splitting, so it's common for
			 * them to be incorrect in some workloads.
			 */
			ref = pindex->index[slot];
			if (ref->pindex_hint != slot)
				ref->pindex_hint = slot;

			/*
			 * If we see any child states other than deleted, the
			 * page isn't empty.
			 */
			if (ref->state != WT_REF_DELETED &&
			    !LF_ISSET(WT_READ_TRUNCATE))
				empty_internal = false;

			if (LF_ISSET(WT_READ_CACHE)) {
				/*
				 * Only look at unlocked pages in memory:
				 * fast-path some common cases.
				 */
				if (LF_ISSET(WT_READ_NO_WAIT) &&
				    ref->state != WT_REF_MEM)
					break;
			} else if (LF_ISSET(WT_READ_TRUNCATE)) {
				/*
				 * Avoid pulling a deleted page back in to try
				 * to delete it again.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
				/*
				 * If deleting a range, try to delete the page
				 * without instantiating it.
				 */
				WT_ERR(__wt_delete_page(session, ref, &skip));
				if (skip)
					break;
				empty_internal = false;
			} else if (LF_ISSET(WT_READ_COMPACT)) {
				/*
				 * Skip deleted pages, rewriting them doesn't
				 * seem useful.
				 */
				if (ref->state == WT_REF_DELETED)
					break;

				/*
				 * If the page is in-memory, we want to look at
				 * it (it may have been modified and written,
				 * and the current location is the interesting
				 * one in terms of compaction, not the original
				 * location).  If the page isn't in-memory, test
				 * if the page will help with compaction, don't
				 * read it if we don't have to.
				 */
				if (ref->state == WT_REF_DISK) {
					WT_ERR(__wt_compact_page_skip(
					    session, ref, &skip));
					if (skip)
						break;
				}
			} else {
				/*
				 * Try to skip deleted pages visible to us.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
			}

			/*
			 * Optionally skip leaf pages: skip all leaf pages if
			 * WT_READ_SKIP_LEAF is set, when the skip-leaf-count
			 * variable is non-zero, skip some count of leaf pages.
			 * If this page is disk-based, crack the cell to figure
			 * out it's a leaf page without reading it.
			 *
			 * If skipping some number of leaf pages, decrement the
			 * count of pages to zero, and then take the next leaf
			 * page we can. Be cautious around the page decrement,
			 * if for some reason we don't take this particular
			 * page, we can take the next one, and, there are
			 * additional tests/decrements when we're about to
			 * return a leaf page.
			 */
			if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF))
				if (__ref_is_leaf(ref)) {
					if (LF_ISSET(WT_READ_SKIP_LEAF))
						break;
					if (*skipleafcntp > 0) {
						--*skipleafcntp;
						break;
					}
				}

			ret = __wt_page_swap(session, couple, ref,
			    WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);

			/*
			 * Not-found is an expected return when only walking
			 * in-cache pages, or if we see a deleted page.
			 */
			if (ret == WT_NOTFOUND) {
				ret = 0;
				break;
			}

			/*
			 * The page we're moving to might have split, in which
			 * case move to the last position we held.
			 */
			if (ret == WT_RESTART) {
				ret = 0;

				/*
				 * If a new walk that never coupled from the
				 * root to a new saved position in the tree,
				 * restart the walk.
				 */
				if (couple == &btree->root) {
					ref = &btree->root;
					if (ref->page == NULL)
						goto done;
					goto descend;
				}

				/*
				 * If restarting from some original position,
				 * repeat the increment or decrement we made at
				 * that time. Otherwise, couple is an internal
				 * page we've acquired after moving from that
				 * starting position and we can treat it as a
				 * new page. This works because we never acquire
				 * a hazard pointer on a leaf page we're not
				 * going to return to our caller, this will quit
				 * working if that ever changes.
				 */
				WT_ASSERT(session,
				    couple == couple_orig ||
				    WT_PAGE_IS_INTERNAL(couple->page));
				ref = couple;
				__ref_index_slot(session, ref, &pindex, &slot);
				if (couple == couple_orig)
					break;
			}
			WT_ERR(ret);

			/*
			 * A new page: configure for traversal of any internal
			 * page's children, else return the leaf page.
			 */
			if (WT_PAGE_IS_INTERNAL(ref->page)) {
descend:			couple = ref;
				empty_internal = true;

				__page_descend(
				    session, ref->page, &pindex, &slot, prev);
			} else {
				/*
				 * Optionally skip leaf pages, the second half.
				 * We didn't have an on-page cell to figure out
				 * if it was a leaf page, we had to acquire the
				 * hazard pointer and look at the page.
				 */
				if (skipleafcntp != NULL ||
				    LF_ISSET(WT_READ_SKIP_LEAF)) {
					couple = ref;
					if (LF_ISSET(WT_READ_SKIP_LEAF))
						break;
					if (*skipleafcntp > 0) {
						--*skipleafcntp;
						break;
					}
				}

				*refp = ref;
				goto done;
			}
		}
	}

	/* Success and error share one exit so the page index is always left. */
done:
err:	WT_LEAVE_PAGE_INDEX(session);
	return (ret);
}