/* * __wt_tree_walk -- * Move to the next/previous page in the tree. */ int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; int prev, skip; uint32_t slot; btree = S2BT(session); /* * Tree walks are special: they look inside page structures that splits * may want to free. Publish that the tree is active during this * window. */ WT_ENTER_PAGE_INDEX(session); /* * !!! * Fast-truncate currently only works on row-store trees. */ if (btree->type != BTREE_ROW) LF_CLR(WT_READ_TRUNCATE); prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; /* * There are multiple reasons and approaches to walking the in-memory * tree: * * (1) finding pages to evict (the eviction server); * (2) writing just dirty leaves or internal nodes (checkpoint); * (3) discarding pages (close); * (4) truncating pages in a range (fast truncate); * (5) skipping pages based on outside information (compaction); * (6) cursor scans (applications). * * Except for cursor scans and compaction, the walk is limited to the * cache, no pages are read. In all cases, hazard pointers protect the * walked pages from eviction. * * Walks use hazard-pointer coupling through the tree and that's OK * (hazard pointers can't deadlock, so there's none of the usual * problems found when logically locking up a btree). If the eviction * thread tries to evict the active page, it fails because of our * hazard pointer. If eviction tries to evict our parent, that fails * because the parent has a child page that can't be discarded. We do * play one game: don't couple up to our parent and then back down to a * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * * !!! * NOTE: we depend on the fact it's OK to release a page we don't hold, * that is, it's OK to release couple when couple is set to NULL. * * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * * We may be passed a pointer to btree->evict_page that we are clearing * here. We check when discarding pages that we're not discarding that * page, so this clear must be done before the page is released. */ couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start of the tree. */ if (ref == NULL) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } ascend: /* * If the active page was the root, we've reached the walk's end. * Release any hazard-pointer we're holding. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release(session, couple, flags)); goto done; } /* Figure out the current slot in the WT_REF array. */ __wt_page_refp(session, ref, &pindex, &slot); for (;;) { /* * If we're at the last/first slot on the page, return this page * in post-order traversal. Otherwise we move to the next/prev * slot and left/right-most element in its subtree. */ if ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { ref = ref->home->pg_intl_parent_ref; /* Optionally skip internal pages. */ if (LF_ISSET(WT_READ_SKIP_INTL)) goto ascend; /* * We've ascended the tree and are returning an internal * page. If it's the root, discard our hazard pointer, * otherwise, swap our hazard pointer for the page we'll * return. */ if (__wt_ref_is_root(ref)) WT_ERR(__wt_page_release( session, couple, flags)); else { /* * Locate the reference to our parent page then * swap our child hazard pointer for the parent. * We don't handle restart or not-found returns. * It would require additional complexity and is * not a possible return: we're moving to the * parent of the current child page, our parent * reference can't have split or been evicted. */ __wt_page_refp(session, ref, &pindex, &slot); if ((ret = __wt_page_swap( session, couple, ref, flags)) != 0) { WT_TRET(__wt_page_release( session, couple, flags)); WT_ERR(ret); } } *refp = ref; goto done; } if (prev) --slot; else ++slot; if (walkcntp != NULL) ++*walkcntp; for (;;) { ref = pindex->index[slot]; if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try * to delete it again. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref)) break; /* * If deleting a range, try to delete the page * without instantiating it. */ WT_ERR(__wt_delete_page(session, ref, &skip)); if (skip) break; } else if (LF_ISSET(WT_READ_COMPACT)) { /* * Skip deleted pages, rewriting them doesn't * seem useful. */ if (ref->state == WT_REF_DELETED) break; /* * If the page is in-memory, we want to look at * it (it may have been modified and written, * and the current location is the interesting * one in terms of compaction, not the original * location). If the page isn't in-memory, test * if the page will help with compaction, don't * read it if we don't have to. */ if (ref->state == WT_REF_DISK) { WT_ERR(__wt_compact_page_skip( session, ref, &skip)); if (skip) break; } } else { /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref)) break; } ret = __wt_page_swap(session, couple, ref, flags); /* * Not-found is an expected return when only walking * in-cache pages. */ if (ret == WT_NOTFOUND) { ret = 0; break; } /* * The page we're moving to might have split, in which * case move to the last position we held. */ if (ret == WT_RESTART) { ret = 0; /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ if (couple == &btree->root) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } /* * If restarting from some original position, * repeat the increment or decrement we made at * that time. Otherwise, couple is an internal * page we've acquired after moving from that * starting position and we can treat it as a * new page. This works because we never acquire * a hazard pointer on a leaf page we're not * going to return to our caller, this will quit * working if that ever changes. */ WT_ASSERT(session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; __wt_page_refp(session, ref, &pindex, &slot); if (couple == couple_orig) break; } WT_ERR(ret); /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ descend: couple = ref; page = ref->page; if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) { WT_INTL_INDEX_GET(session, page, pindex); slot = prev ? pindex->entries - 1 : 0; } else { *refp = ref; goto done; } } } done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); }
/* * __tree_walk_internal -- * Move to the next/previous page in the tree. */ static inline int __tree_walk_internal(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *), void *func_cookie, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; uint32_t slot; bool empty_internal, initial_descent, prev, skip; btree = S2BT(session); pindex = NULL; empty_internal = initial_descent = false; /* * Tree walks are special: they look inside page structures that splits * may want to free. Publish that the tree is active during this * window. */ WT_ENTER_PAGE_INDEX(session); /* Walk should never instantiate deleted pages. */ LF_SET(WT_READ_NO_EMPTY); /* * !!! * Fast-truncate currently only works on row-store trees. */ if (btree->type != BTREE_ROW) LF_CLR(WT_READ_TRUNCATE); prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; /* * There are multiple reasons and approaches to walking the in-memory * tree: * * (1) finding pages to evict (the eviction server); * (2) writing just dirty leaves or internal nodes (checkpoint); * (3) discarding pages (close); * (4) truncating pages in a range (fast truncate); * (5) skipping pages based on outside information (compaction); * (6) cursor scans (applications). * * Except for cursor scans and compaction, the walk is limited to the * cache, no pages are read. In all cases, hazard pointers protect the * walked pages from eviction. * * Walks use hazard-pointer coupling through the tree and that's OK * (hazard pointers can't deadlock, so there's none of the usual * problems found when logically locking up a btree). If the eviction * thread tries to evict the active page, it fails because of our * hazard pointer. If eviction tries to evict our parent, that fails * because the parent has a child page that can't be discarded. We do * play one game: don't couple up to our parent and then back down to a * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * * !!! * NOTE: we depend on the fact it's OK to release a page we don't hold, * that is, it's OK to release couple when couple is set to NULL. * * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * * Clear the returned value, it makes future error handling easier. */ couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start/end of the tree. */ if (ref == NULL) { restart: /* * We can be here with a NULL or root WT_REF; the page release * function handles them internally, don't complicate this code * by calling them out. */ WT_ERR(__wt_page_release(session, couple, flags)); /* * We're not supposed to walk trees without root pages. As this * has not always been the case, assert to debug that change. */ WT_ASSERT(session, btree->root.page != NULL); couple = couple_orig = ref = &btree->root; initial_descent = true; goto descend; } /* * If the active page was the root, we've reached the walk's end; we * only get here if we've returned the root to our caller, so we're * holding no hazard pointers. */ if (__wt_ref_is_root(ref)) goto done; /* Figure out the current slot in the WT_REF array. */ __ref_index_slot(session, ref, &pindex, &slot); for (;;) { /* * If we're at the last/first slot on the internal page, return * it in post-order traversal. Otherwise move to the next/prev * slot and left/right-most element in that subtree. */ while ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { /* Ascend to the parent. */ __ref_ascend(session, &ref, &pindex, &slot); /* * If at the root and returning internal pages, return * the root page, otherwise we're done. Regardless, no * hazard pointer is required, release the one we hold. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release( session, couple, flags)); if (!LF_ISSET(WT_READ_SKIP_INTL)) *refp = ref; goto done; } /* * If we got all the way through an internal page and * all of the child pages were deleted, mark it for * eviction. */ if (empty_internal && pindex->entries > 1) { __wt_page_evict_soon(session, ref); empty_internal = false; } /* * Optionally return internal pages. Swap our previous * hazard pointer for the page we'll return. We don't * handle restart or not-found returns, it would require * additional complexity and is not a possible return: * we're moving to the parent of the current child page, * the parent can't have been evicted. */ if (!LF_ISSET(WT_READ_SKIP_INTL)) { WT_ERR(__wt_page_swap( session, couple, ref, flags)); *refp = ref; goto done; } } if (prev) --slot; else ++slot; if (walkcntp != NULL) ++*walkcntp; for (;;) { /* * Move to the next slot, and set the reference hint if * it's wrong (used when we continue the walk). We don't * always update the hints when splitting, it's expected * for them to be incorrect in some workloads. */ ref = pindex->index[slot]; if (ref->pindex_hint != slot) ref->pindex_hint = slot; /* * If we see any child states other than deleted, the * page isn't empty. */ if (ref->state != WT_REF_DELETED && !LF_ISSET(WT_READ_TRUNCATE)) empty_internal = false; if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; /* Skip lookaside pages if not requested. */ if (ref->state == WT_REF_LOOKASIDE && !LF_ISSET(WT_READ_LOOKASIDE)) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try * to delete it again. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; /* * If deleting a range, try to delete the page * without instantiating it. */ WT_ERR(__wt_delete_page(session, ref, &skip)); if (skip) break; empty_internal = false; } else if (skip_func != NULL) { WT_ERR(skip_func(session, ref, func_cookie, &skip)); if (skip) break; } else { /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; } ret = __wt_page_swap(session, couple, ref, WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags); /* * Not-found is an expected return when only walking * in-cache pages, or if we see a deleted page. */ if (ret == WT_NOTFOUND) { ret = 0; break; } /* * The page we're moving to might have split, in which * case move to the last position we held. */ if (ret == WT_RESTART) { ret = 0; /* * If a cursor is setting up at the end of the * tree, we can't use our parent page's index, * because it may have already split; restart * the walk. */ if (prev && initial_descent) goto restart; /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ if (couple == &btree->root) goto restart; /* * If restarting from some original position, * repeat the increment or decrement we made at * that time. Otherwise, couple is an internal * page we've acquired after moving from that * starting position and we can treat it as a * new page. This works because we never acquire * a hazard pointer on a leaf page we're not * going to return to our caller, this will quit * working if that ever changes. */ WT_ASSERT(session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; __ref_index_slot(session, ref, &pindex, &slot); if (couple == couple_orig) break; } WT_ERR(ret); couple = ref; /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ if (WT_PAGE_IS_INTERNAL(ref->page)) { descend: empty_internal = true; /* * There's a split race when a cursor is setting * up at the end of the tree or moving backwards * through the tree and descending a level. When * splitting an internal page into its parent, * we move the WT_REF structures and update the * parent's page index before updating the split * page's page index, and it's not an atomic * update. A thread can read the parent page's * replacement page index, then read the split * page's original index, or the parent page's * original and the split page's replacement. * * This isn't a problem for a cursor setting up * at the start of the tree or moving forwards * through the tree because we do right-hand * splits on internal pages and the initial part * of the split page's namespace won't change as * part of a split. A thread reading the parent * page's and split page's indexes will move to * the same slot no matter what order of indexes * are read. * * Handle a cursor setting up at the end of the * tree or moving backwards through the tree. */ if (!prev) { WT_INTL_INDEX_GET( session, ref->page, pindex); slot = 0; } else if (initial_descent) { if (!__ref_initial_descent_prev( session, ref, &pindex)) goto restart; slot = pindex->entries - 1; } else { __ref_descend_prev( session, ref, &pindex); slot = pindex->entries - 1; } continue; } /* * The tree-walk restart code knows we return any leaf * page we acquire (never hazard-pointer coupling on * after acquiring a leaf page), and asserts no restart * happens while holding a leaf page. This page must be * returned to our caller. */ *refp = ref; goto done; } } done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); }
/* * __tree_walk_internal -- * Move to the next/previous page in the tree. */ static inline int __tree_walk_internal(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint64_t *skipleafcntp, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; bool empty_internal, prev, skip; uint32_t slot; btree = S2BT(session); empty_internal = false; /* * Tree walks are special: they look inside page structures that splits * may want to free. Publish that the tree is active during this * window. */ WT_ENTER_PAGE_INDEX(session); /* Walk should never instantiate deleted pages. */ LF_SET(WT_READ_NO_EMPTY); /* * !!! * Fast-truncate currently only works on row-store trees. */ if (btree->type != BTREE_ROW) LF_CLR(WT_READ_TRUNCATE); prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; /* * There are multiple reasons and approaches to walking the in-memory * tree: * * (1) finding pages to evict (the eviction server); * (2) writing just dirty leaves or internal nodes (checkpoint); * (3) discarding pages (close); * (4) truncating pages in a range (fast truncate); * (5) skipping pages based on outside information (compaction); * (6) cursor scans (applications). * * Except for cursor scans and compaction, the walk is limited to the * cache, no pages are read. In all cases, hazard pointers protect the * walked pages from eviction. * * Walks use hazard-pointer coupling through the tree and that's OK * (hazard pointers can't deadlock, so there's none of the usual * problems found when logically locking up a btree). If the eviction * thread tries to evict the active page, it fails because of our * hazard pointer. If eviction tries to evict our parent, that fails * because the parent has a child page that can't be discarded. We do * play one game: don't couple up to our parent and then back down to a * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * * !!! * NOTE: we depend on the fact it's OK to release a page we don't hold, * that is, it's OK to release couple when couple is set to NULL. * * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * * We may be passed a pointer to btree->evict_page that we are clearing * here. We check when discarding pages that we're not discarding that * page, so this clear must be done before the page is released. */ couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start of the tree. */ if (ref == NULL) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } /* * If the active page was the root, we've reached the walk's end. * Release any hazard-pointer we're holding. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release(session, couple, flags)); goto done; } /* Figure out the current slot in the WT_REF array. */ __ref_index_slot(session, ref, &pindex, &slot); for (;;) { /* * If we're at the last/first slot on the internal page, return * it in post-order traversal. Otherwise move to the next/prev * slot and left/right-most element in that subtree. */ while ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { /* Ascend to the parent. */ __page_ascend(session, &ref, &pindex, &slot); /* * If we got all the way through an internal page and * all of the child pages were deleted, mark it for * eviction. */ if (empty_internal && pindex->entries > 1) { __wt_page_evict_soon(ref->page); empty_internal = false; } /* * If at the root and returning internal pages, return * the root page, otherwise we're done. Regardless, no * hazard pointer is required, release the one we hold. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release( session, couple, flags)); if (!LF_ISSET(WT_READ_SKIP_INTL)) *refp = ref; goto done; } /* * Optionally return internal pages. Swap our previous * hazard pointer for the page we'll return. We don't * handle restart or not-found returns, it would require * additional complexity and is not a possible return: * we're moving to the parent of the current child page, * the parent can't have been evicted. */ if (!LF_ISSET(WT_READ_SKIP_INTL)) { WT_ERR(__wt_page_swap( session, couple, ref, flags)); *refp = ref; goto done; } } if (prev) --slot; else ++slot; if (walkcntp != NULL) ++*walkcntp; for (;;) { /* * Move to the next slot, and set the reference hint if * it's wrong (used when we continue the walk). We don't * update those hints when splitting, so it's common for * them to be incorrect in some workloads. */ ref = pindex->index[slot]; if (ref->pindex_hint != slot) ref->pindex_hint = slot; /* * If we see any child states other than deleted, the * page isn't empty. */ if (ref->state != WT_REF_DELETED && !LF_ISSET(WT_READ_TRUNCATE)) empty_internal = false; if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try * to delete it again. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; /* * If deleting a range, try to delete the page * without instantiating it. */ WT_ERR(__wt_delete_page(session, ref, &skip)); if (skip) break; empty_internal = false; } else if (LF_ISSET(WT_READ_COMPACT)) { /* * Skip deleted pages, rewriting them doesn't * seem useful. */ if (ref->state == WT_REF_DELETED) break; /* * If the page is in-memory, we want to look at * it (it may have been modified and written, * and the current location is the interesting * one in terms of compaction, not the original * location). If the page isn't in-memory, test * if the page will help with compaction, don't * read it if we don't have to. */ if (ref->state == WT_REF_DISK) { WT_ERR(__wt_compact_page_skip( session, ref, &skip)); if (skip) break; } } else { /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; } /* * Optionally skip leaf pages: skip all leaf pages if * WT_READ_SKIP_LEAF is set, when the skip-leaf-count * variable is non-zero, skip some count of leaf pages. * If this page is disk-based, crack the cell to figure * out it's a leaf page without reading it. * * If skipping some number of leaf pages, decrement the * count of pages to zero, and then take the next leaf * page we can. Be cautious around the page decrement, * if for some reason don't take this particular page, * we can take the next one, and, there are additional * tests/decrements when we're about to return a leaf * page. */ if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF)) if (__ref_is_leaf(ref)) { if (LF_ISSET(WT_READ_SKIP_LEAF)) break; if (*skipleafcntp > 0) { --*skipleafcntp; break; } } ret = __wt_page_swap(session, couple, ref, WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags); /* * Not-found is an expected return when only walking * in-cache pages, or if we see a deleted page. */ if (ret == WT_NOTFOUND) { ret = 0; break; } /* * The page we're moving to might have split, in which * case move to the last position we held. */ if (ret == WT_RESTART) { ret = 0; /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ if (couple == &btree->root) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } /* * If restarting from some original position, * repeat the increment or decrement we made at * that time. Otherwise, couple is an internal * page we've acquired after moving from that * starting position and we can treat it as a * new page. This works because we never acquire * a hazard pointer on a leaf page we're not * going to return to our caller, this will quit * working if that ever changes. */ WT_ASSERT(session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; __ref_index_slot(session, ref, &pindex, &slot); if (couple == couple_orig) break; } WT_ERR(ret); /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ if (WT_PAGE_IS_INTERNAL(ref->page)) { descend: couple = ref; empty_internal = true; __page_descend( session, ref->page, &pindex, &slot, prev); } else { /* * Optionally skip leaf pages, the second half. * We didn't have an on-page cell to figure out * if it was a leaf page, we had to acquire the * hazard pointer and look at the page. */ if (skipleafcntp != NULL || LF_ISSET(WT_READ_SKIP_LEAF)) { couple = ref; if (LF_ISSET(WT_READ_SKIP_LEAF)) break; if (*skipleafcntp > 0) { --*skipleafcntp; break; } } *refp = ref; goto done; } } } done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); }