/* * __page_refp -- * Return the page's index and slot for a reference. */ static inline void __page_refp(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp) { WT_PAGE_INDEX *pindex; uint32_t i; /* * Copy the parent page's index value: the page can split at any time, * but the index's value is always valid, even if it's not up-to-date. */ retry: WT_INTL_INDEX_GET(session, ref->home, pindex); /* * Use the page's reference hint: it should be correct unless the page * split before our slot. If the page splits after our slot, the hint * will point earlier in the array than our actual slot, so the first * loop is from the hint to the end of the list, and the second loop * is from the start of the list to the end of the list. (The second * loop overlaps the first, but that only happen in cases where we've * deepened the tree and aren't going to find our slot at all, that's * not worth optimizing.) * * It's not an error for the reference hint to be wrong, it just means * the first retrieval (which sets the hint for subsequent retrievals), * is slower. */ i = ref->pindex_hint; if (i < pindex->entries && pindex->index[i]->page == ref->page) { *pindexp = pindex; *slotp = i; return; } while (++i < pindex->entries) if (pindex->index[i]->page == ref->page) { *pindexp = pindex; *slotp = ref->pindex_hint = i; return; } for (i = 0; i < pindex->entries; ++i) if (pindex->index[i]->page == ref->page) { *pindexp = pindex; *slotp = ref->pindex_hint = i; return; } /* * If we don't find our reference, the page split into a new level and * our home pointer references the wrong page. After internal pages * deepen, their reference structure home value are updated; yield and * wait for that to happen. */ __wt_yield(); goto retry; }
/* * __ref_initial_descent_prev -- * Descend the tree one level, when setting up the initial cursor position * for a previous-cursor walk. */ static inline bool __ref_initial_descent_prev( WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp) { WT_PAGE_INDEX *pindex; /* * We're passed a child page into which we're descending, and on which * we have a hazard pointer. * * Acquire a page index for the child page and then confirm we haven't * raced with a parent split. */ WT_INTL_INDEX_GET(session, ref->page, pindex); if (__wt_split_descent_race(session, ref, *pindexp)) return (false); *pindexp = pindex; return (true); }
/* * __tree_walk_internal -- * Move to the next/previous page in the tree. */ static inline int __tree_walk_internal(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *), void *func_cookie, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; uint32_t slot; bool empty_internal, initial_descent, prev, skip; btree = S2BT(session); pindex = NULL; empty_internal = initial_descent = false; /* * Tree walks are special: they look inside page structures that splits * may want to free. Publish that the tree is active during this * window. */ WT_ENTER_PAGE_INDEX(session); /* Walk should never instantiate deleted pages. */ LF_SET(WT_READ_NO_EMPTY); /* * !!! * Fast-truncate currently only works on row-store trees. */ if (btree->type != BTREE_ROW) LF_CLR(WT_READ_TRUNCATE); prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; /* * There are multiple reasons and approaches to walking the in-memory * tree: * * (1) finding pages to evict (the eviction server); * (2) writing just dirty leaves or internal nodes (checkpoint); * (3) discarding pages (close); * (4) truncating pages in a range (fast truncate); * (5) skipping pages based on outside information (compaction); * (6) cursor scans (applications). * * Except for cursor scans and compaction, the walk is limited to the * cache, no pages are read. In all cases, hazard pointers protect the * walked pages from eviction. * * Walks use hazard-pointer coupling through the tree and that's OK * (hazard pointers can't deadlock, so there's none of the usual * problems found when logically locking up a btree). If the eviction * thread tries to evict the active page, it fails because of our * hazard pointer. If eviction tries to evict our parent, that fails * because the parent has a child page that can't be discarded. We do * play one game: don't couple up to our parent and then back down to a * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * * !!! * NOTE: we depend on the fact it's OK to release a page we don't hold, * that is, it's OK to release couple when couple is set to NULL. * * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * * Clear the returned value, it makes future error handling easier. */ couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start/end of the tree. */ if (ref == NULL) { restart: /* * We can be here with a NULL or root WT_REF; the page release * function handles them internally, don't complicate this code * by calling them out. */ WT_ERR(__wt_page_release(session, couple, flags)); /* * We're not supposed to walk trees without root pages. As this * has not always been the case, assert to debug that change. */ WT_ASSERT(session, btree->root.page != NULL); couple = couple_orig = ref = &btree->root; initial_descent = true; goto descend; } /* * If the active page was the root, we've reached the walk's end; we * only get here if we've returned the root to our caller, so we're * holding no hazard pointers. */ if (__wt_ref_is_root(ref)) goto done; /* Figure out the current slot in the WT_REF array. */ __ref_index_slot(session, ref, &pindex, &slot); for (;;) { /* * If we're at the last/first slot on the internal page, return * it in post-order traversal. Otherwise move to the next/prev * slot and left/right-most element in that subtree. */ while ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { /* Ascend to the parent. */ __ref_ascend(session, &ref, &pindex, &slot); /* * If at the root and returning internal pages, return * the root page, otherwise we're done. Regardless, no * hazard pointer is required, release the one we hold. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release( session, couple, flags)); if (!LF_ISSET(WT_READ_SKIP_INTL)) *refp = ref; goto done; } /* * If we got all the way through an internal page and * all of the child pages were deleted, mark it for * eviction. */ if (empty_internal && pindex->entries > 1) { __wt_page_evict_soon(session, ref); empty_internal = false; } /* * Optionally return internal pages. Swap our previous * hazard pointer for the page we'll return. We don't * handle restart or not-found returns, it would require * additional complexity and is not a possible return: * we're moving to the parent of the current child page, * the parent can't have been evicted. */ if (!LF_ISSET(WT_READ_SKIP_INTL)) { WT_ERR(__wt_page_swap( session, couple, ref, flags)); *refp = ref; goto done; } } if (prev) --slot; else ++slot; if (walkcntp != NULL) ++*walkcntp; for (;;) { /* * Move to the next slot, and set the reference hint if * it's wrong (used when we continue the walk). We don't * always update the hints when splitting, it's expected * for them to be incorrect in some workloads. */ ref = pindex->index[slot]; if (ref->pindex_hint != slot) ref->pindex_hint = slot; /* * If we see any child states other than deleted, the * page isn't empty. */ if (ref->state != WT_REF_DELETED && !LF_ISSET(WT_READ_TRUNCATE)) empty_internal = false; if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; /* Skip lookaside pages if not requested. */ if (ref->state == WT_REF_LOOKASIDE && !LF_ISSET(WT_READ_LOOKASIDE)) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try * to delete it again. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; /* * If deleting a range, try to delete the page * without instantiating it. */ WT_ERR(__wt_delete_page(session, ref, &skip)); if (skip) break; empty_internal = false; } else if (skip_func != NULL) { WT_ERR(skip_func(session, ref, func_cookie, &skip)); if (skip) break; } else { /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; } ret = __wt_page_swap(session, couple, ref, WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags); /* * Not-found is an expected return when only walking * in-cache pages, or if we see a deleted page. */ if (ret == WT_NOTFOUND) { ret = 0; break; } /* * The page we're moving to might have split, in which * case move to the last position we held. */ if (ret == WT_RESTART) { ret = 0; /* * If a cursor is setting up at the end of the * tree, we can't use our parent page's index, * because it may have already split; restart * the walk. */ if (prev && initial_descent) goto restart; /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ if (couple == &btree->root) goto restart; /* * If restarting from some original position, * repeat the increment or decrement we made at * that time. Otherwise, couple is an internal * page we've acquired after moving from that * starting position and we can treat it as a * new page. This works because we never acquire * a hazard pointer on a leaf page we're not * going to return to our caller, this will quit * working if that ever changes. */ WT_ASSERT(session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; __ref_index_slot(session, ref, &pindex, &slot); if (couple == couple_orig) break; } WT_ERR(ret); couple = ref; /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ if (WT_PAGE_IS_INTERNAL(ref->page)) { descend: empty_internal = true; /* * There's a split race when a cursor is setting * up at the end of the tree or moving backwards * through the tree and descending a level. When * splitting an internal page into its parent, * we move the WT_REF structures and update the * parent's page index before updating the split * page's page index, and it's not an atomic * update. A thread can read the parent page's * replacement page index, then read the split * page's original index, or the parent page's * original and the split page's replacement. * * This isn't a problem for a cursor setting up * at the start of the tree or moving forwards * through the tree because we do right-hand * splits on internal pages and the initial part * of the split page's namespace won't change as * part of a split. A thread reading the parent * page's and split page's indexes will move to * the same slot no matter what order of indexes * are read. * * Handle a cursor setting up at the end of the * tree or moving backwards through the tree. */ if (!prev) { WT_INTL_INDEX_GET( session, ref->page, pindex); slot = 0; } else if (initial_descent) { if (!__ref_initial_descent_prev( session, ref, &pindex)) goto restart; slot = pindex->entries - 1; } else { __ref_descend_prev( session, ref, &pindex); slot = pindex->entries - 1; } continue; } /* * The tree-walk restart code knows we return any leaf * page we acquire (never hazard-pointer coupling on * after acquiring a leaf page), and asserts no restart * happens while holding a leaf page. This page must be * returned to our caller. */ *refp = ref; goto done; } } done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); }
/* * __ref_descend_prev -- * Descend the tree one level, during a previous-cursor walk. */ static inline void __ref_descend_prev( WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp) { WT_PAGE_INDEX *pindex; uint64_t yield_count; /* * We're passed a child page into which we're descending, and on which * we have a hazard pointer. */ for (yield_count = 0;; yield_count++, __wt_yield()) { /* * There's a split race when a cursor moving backwards through * the tree descends the tree. If we're splitting an internal * page into its parent, we move the WT_REF structures and * update the parent's page index before updating the split * page's page index, and it's not an atomic update. A thread * can read the parent page's replacement page index and then * read the split page's original index. * * This can create a race for previous-cursor movements. * * For example, imagine an internal page with 3 child pages, * with the namespaces a-f, g-h and i-j; the first child page * splits. The parent starts out with the following page-index: * * | ... | a | g | i | ... | * * The split page starts out with the following page-index: * * | a | b | c | d | e | f | * * The first step is to move the c-f ranges into a new subtree, * so, for example we might have two new internal pages 'c' and * 'e', where the new 'c' page references the c-d namespace and * the new 'e' page references the e-f namespace. The top of the * subtree references the parent page, but until the parent's * page index is updated, any threads in the subtree won't be * able to ascend out of the subtree. However, once the parent * page's page index is updated to this: * * | ... | a | c | e | g | i | ... | * * threads in the subtree can ascend into the parent. Imagine a * cursor in the c-d part of the namespace that ascends to the * parent's 'c' slot. It would then decrement to the slot before * the 'c' slot, the 'a' slot. * * The previous-cursor movement selects the last slot in the 'a' * page; if the split page's page-index hasn't been updated yet, * it will select the 'f' slot, which is incorrect. Once the * split page's page index is updated to this: * * | a | b | * * the previous-cursor movement will select the 'b' slot, which * is correct. * * This function takes an argument which is the internal page * from which we're descending. If the last slot on the page no * longer points to the current page as its "home", the page is * being split and part of its namespace moved. We have the * correct page and we don't have to move, all we have to do is * wait until the split page's page index is updated. */ WT_INTL_INDEX_GET(session, ref->page, pindex); if (pindex->index[pindex->entries - 1]->home == ref->page) break; } *pindexp = pindex; WT_STAT_CONN_INCRV(session, tree_descend_blocked, yield_count); }
/* * __ref_index_slot -- * Return the page's index and slot for a reference. */ static inline void __ref_index_slot(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp, uint32_t *slotp) { WT_PAGE_INDEX *pindex; WT_REF **start, **stop, **p, **t; uint64_t sleep_count, yield_count; uint32_t entries, slot; /* * If we don't find our reference, the page split and our home * pointer references the wrong page. When internal pages * split, their WT_REF structure home values are updated; yield * and wait for that to happen. */ for (sleep_count = yield_count = 0;;) { /* * Copy the parent page's index value: the page can split at * any time, but the index's value is always valid, even if * it's not up-to-date. */ WT_INTL_INDEX_GET(session, ref->home, pindex); entries = pindex->entries; /* * Use the page's reference hint: it should be correct unless * there was a split or delete in the parent before our slot. * If the hint is wrong, it can be either too big or too small, * but often only by a small amount. Search up and down the * index starting from the hint. * * It's not an error for the reference hint to be wrong, it * just means the first retrieval (which sets the hint for * subsequent retrievals), is slower. */ slot = ref->pindex_hint; if (slot >= entries) slot = entries - 1; if (pindex->index[slot] == ref) goto found; for (start = &pindex->index[0], stop = &pindex->index[entries - 1], p = t = &pindex->index[slot]; p > start || t < stop;) { if (p > start && *--p == ref) { slot = (uint32_t)(p - start); goto found; } if (t < stop && *++t == ref) { slot = (uint32_t)(t - start); goto found; } } /* * We failed to get the page index and slot reference, yield * before retrying, and if we've yielded enough times, start * sleeping so we don't burn CPU to no purpose. */ __wt_ref_state_yield_sleep(&yield_count, &sleep_count); WT_STAT_CONN_INCRV(session, page_index_slot_ref_blocked, sleep_count); } found: WT_ASSERT(session, pindex->index[slot] == ref); *pindexp = pindex; *slotp = slot; }
/* * __wt_tree_walk -- * Move to the next/previous page in the tree. */ int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; int prev, skip; uint32_t slot; btree = S2BT(session); /* * Tree walks are special: they look inside page structures that splits * may want to free. Publish that the tree is active during this * window. */ WT_ENTER_PAGE_INDEX(session); /* * !!! * Fast-truncate currently only works on row-store trees. */ if (btree->type != BTREE_ROW) LF_CLR(WT_READ_TRUNCATE); prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; /* * There are multiple reasons and approaches to walking the in-memory * tree: * * (1) finding pages to evict (the eviction server); * (2) writing just dirty leaves or internal nodes (checkpoint); * (3) discarding pages (close); * (4) truncating pages in a range (fast truncate); * (5) skipping pages based on outside information (compaction); * (6) cursor scans (applications). * * Except for cursor scans and compaction, the walk is limited to the * cache, no pages are read. In all cases, hazard pointers protect the * walked pages from eviction. * * Walks use hazard-pointer coupling through the tree and that's OK * (hazard pointers can't deadlock, so there's none of the usual * problems found when logically locking up a btree). If the eviction * thread tries to evict the active page, it fails because of our * hazard pointer. If eviction tries to evict our parent, that fails * because the parent has a child page that can't be discarded. We do * play one game: don't couple up to our parent and then back down to a * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * * !!! * NOTE: we depend on the fact it's OK to release a page we don't hold, * that is, it's OK to release couple when couple is set to NULL. * * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * * We may be passed a pointer to btree->evict_page that we are clearing * here. We check when discarding pages that we're not discarding that * page, so this clear must be done before the page is released. */ couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start of the tree. */ if (ref == NULL) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } ascend: /* * If the active page was the root, we've reached the walk's end. * Release any hazard-pointer we're holding. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release(session, couple, flags)); goto done; } /* Figure out the current slot in the WT_REF array. */ __wt_page_refp(session, ref, &pindex, &slot); for (;;) { /* * If we're at the last/first slot on the page, return this page * in post-order traversal. Otherwise we move to the next/prev * slot and left/right-most element in its subtree. */ if ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { ref = ref->home->pg_intl_parent_ref; /* Optionally skip internal pages. */ if (LF_ISSET(WT_READ_SKIP_INTL)) goto ascend; /* * We've ascended the tree and are returning an internal * page. If it's the root, discard our hazard pointer, * otherwise, swap our hazard pointer for the page we'll * return. */ if (__wt_ref_is_root(ref)) WT_ERR(__wt_page_release( session, couple, flags)); else { /* * Locate the reference to our parent page then * swap our child hazard pointer for the parent. * We don't handle restart or not-found returns. * It would require additional complexity and is * not a possible return: we're moving to the * parent of the current child page, our parent * reference can't have split or been evicted. */ __wt_page_refp(session, ref, &pindex, &slot); if ((ret = __wt_page_swap( session, couple, ref, flags)) != 0) { WT_TRET(__wt_page_release( session, couple, flags)); WT_ERR(ret); } } *refp = ref; goto done; } if (prev) --slot; else ++slot; if (walkcntp != NULL) ++*walkcntp; for (;;) { ref = pindex->index[slot]; if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try * to delete it again. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref)) break; /* * If deleting a range, try to delete the page * without instantiating it. */ WT_ERR(__wt_delete_page(session, ref, &skip)); if (skip) break; } else if (LF_ISSET(WT_READ_COMPACT)) { /* * Skip deleted pages, rewriting them doesn't * seem useful. */ if (ref->state == WT_REF_DELETED) break; /* * If the page is in-memory, we want to look at * it (it may have been modified and written, * and the current location is the interesting * one in terms of compaction, not the original * location). If the page isn't in-memory, test * if the page will help with compaction, don't * read it if we don't have to. */ if (ref->state == WT_REF_DISK) { WT_ERR(__wt_compact_page_skip( session, ref, &skip)); if (skip) break; } } else { /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref)) break; } ret = __wt_page_swap(session, couple, ref, flags); /* * Not-found is an expected return when only walking * in-cache pages. */ if (ret == WT_NOTFOUND) { ret = 0; break; } /* * The page we're moving to might have split, in which * case move to the last position we held. */ if (ret == WT_RESTART) { ret = 0; /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ if (couple == &btree->root) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } /* * If restarting from some original position, * repeat the increment or decrement we made at * that time. Otherwise, couple is an internal * page we've acquired after moving from that * starting position and we can treat it as a * new page. This works because we never acquire * a hazard pointer on a leaf page we're not * going to return to our caller, this will quit * working if that ever changes. */ WT_ASSERT(session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; __wt_page_refp(session, ref, &pindex, &slot); if (couple == couple_orig) break; } WT_ERR(ret); /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ descend: couple = ref; page = ref->page; if (page->type == WT_PAGE_ROW_INT || page->type == WT_PAGE_COL_INT) { WT_INTL_INDEX_GET(session, page, pindex); slot = prev ? pindex->entries - 1 : 0; } else { *refp = ref; goto done; } } } done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); }
/* * __page_descend -- * Descend the tree one level. */ static void __page_descend(WT_SESSION_IMPL *session, WT_PAGE *page, WT_PAGE_INDEX **pindexp, uint32_t *slotp, bool prev) { WT_PAGE_INDEX *pindex; /* * Ref is a child page into which we're descending, and on which we * have a hazard pointer. */ for (;; __wt_yield()) { WT_INTL_INDEX_GET(session, page, pindex); *slotp = prev ? pindex->entries - 1 : 0; /* * There's a split race when a cursor moving backwards through * the tree descends the tree. If we're splitting an internal * page into its parent, we move the WT_REF structures and * update the parent's page index before updating the split * page's page index, and it's not an atomic update. A thread * can read the parent page's replacement page index and then * read the split page's original index. * * This can create a race for previous-cursor movements. * * For example, imagine an internal page with 3 child pages, * with the namespaces a-f, g-h and i-j; the first child page * splits. The parent starts out with the following page-index: * * | ... | a | g | i | ... | * * The split page starts out with the following page-index: * * | a | b | c | d | e | f | * * The first step is to move the c-f ranges into a new subtree, * so, for example we might have two new internal pages 'c' and * 'e', where the new 'c' page references the c-d namespace and * the new 'e' page references the e-f namespace. The top of the * subtree references the parent page, but until the parent's * page index is updated, any threads in the subtree won't be * able to ascend out of the subtree. However, once the parent * page's page index is updated to this: * * | ... | a | c | e | g | i | ... | * * threads in the subtree can ascend into the parent. Imagine a * cursor in the c-d part of the namespace that ascends to the * parent's 'c' slot. It would then decrement to the slot before * the 'c' slot, the 'a' slot. * * The previous-cursor movement selects the last slot in the 'a' * page; if the split page's page-index hasn't been updated yet, * it will select the 'f' slot, which is incorrect. Once the * split page's page index is updated to this: * * | a | b | * * the previous-cursor movement will select the 'b' slot, which * is correct. * * This function takes an argument which is the internal page * from which we're descending. If the last slot on the page no * longer points to the current page as its "home", the page is * being split and part of its namespace moved. We have the * correct page and we don't have to move, all we have to do is * wait until the split page's page index is updated. * * No test is necessary for a next-cursor movement because we * do right-hand splits on internal pages and the initial part * of the page's namespace won't change as part of a split. * Instead of testing the direction boolean, do the test the * previous cursor movement requires in all cases, even though * it will always succeed for a next-cursor movement. */ if (pindex->index[*slotp]->home == page) break; } *pindexp = pindex; }