/* * __evict_force_check -- * Check if a page matches the criteria for forced eviction. */ static int __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; btree = S2BT(session); /* Pages are usually small enough, check that first. */ if (page->memory_footprint < btree->maxmempage) return (0); /* Leaf pages only. */ if (page->type != WT_PAGE_COL_FIX && page->type != WT_PAGE_COL_VAR && page->type != WT_PAGE_ROW_LEAF) return (0); /* Eviction may be turned off, although that's rare. */ if (F_ISSET(btree, WT_BTREE_NO_EVICTION)) return (0); /* * It's hard to imagine a page with a huge memory footprint that has * never been modified, but check to be sure. */ if (page->modify == NULL) return (0); /* Trigger eviction on the next page release. */ __wt_page_evict_soon(page); return (1); }
/* * __evict_force_check -- * Check if a page matches the criteria for forced eviction. */ static int __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BTREE *btree; btree = S2BT(session); /* Pages are usually small enough, check that first. */ if (page->memory_footprint < btree->maxmempage) return (0); /* Leaf pages only. */ if (WT_PAGE_IS_INTERNAL(page)) return (0); /* * It's hard to imagine a page with a huge memory footprint that has * never been modified, but check to be sure. */ if (page->modify == NULL) return (0); /* Trigger eviction on the next page release. */ __wt_page_evict_soon(page); /* Bump the oldest ID, we're about to do some visibility checks. */ __wt_txn_update_oldest(session, 0); /* If eviction cannot succeed, don't try. */ return (__wt_page_can_evict(session, page, 1, NULL)); }
/* * __evict_force_check -- * Check if a page matches the criteria for forced eviction. */ static int __evict_force_check(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags) { WT_BTREE *btree; btree = S2BT(session); /* Pages are usually small enough, check that first. */ if (page->memory_footprint < btree->maxmempage) return (0); /* Leaf pages only. */ if (WT_PAGE_IS_INTERNAL(page)) return (0); /* Eviction may be turned off. */ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(btree, WT_BTREE_NO_EVICTION)) return (0); /* * It's hard to imagine a page with a huge memory footprint that has * never been modified, but check to be sure. */ if (page->modify == NULL) return (0); /* Trigger eviction on the next page release. */ __wt_page_evict_soon(page); /* If eviction cannot succeed, don't try. */ return (__wt_page_can_evict(session, page, 1)); }
/* * __evict_force_check -- * Check if a page matches the criteria for forced eviction. */ static bool __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) { WT_BTREE *btree; WT_PAGE *page; btree = S2BT(session); page = ref->page; /* Leaf pages only. */ if (WT_PAGE_IS_INTERNAL(page)) return (false); /* * It's hard to imagine a page with a huge memory footprint that has * never been modified, but check to be sure. */ if (page->modify == NULL) return (false); /* Pages are usually small enough, check that first. */ if (page->memory_footprint < btree->splitmempage) return (false); /* * If this session has more than one hazard pointer, eviction will fail * and there is no point trying. */ if (__wt_hazard_count(session, page) > 1) return (false); /* * If we have already tried and the transaction state has not moved on, * eviction is highly likely to fail. */ if (page->modify->last_eviction_id == __wt_txn_oldest_id(session)) return (false); if (page->memory_footprint < btree->maxmempage) return (__wt_leaf_page_can_split(session, page)); /* Trigger eviction on the next page release. */ __wt_page_evict_soon(session, ref); /* Bump the oldest ID, we're about to do some visibility checks. */ WT_IGNORE_RET(__wt_txn_update_oldest(session, 0)); /* If eviction cannot succeed, don't try. */ return (__wt_page_can_evict(session, ref, NULL)); }
/* * __wt_btcur_prev -- * Move to the previous record in the tree. */ int __wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating) { WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; bool newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_FAST_CONN_INCR(session, cursor_prev); WT_STAT_FAST_DATA_INCR(session, cursor_prev); flags = WT_READ_PREV | WT_READ_SKIP_INTL; /* Tree walk flags. */ if (truncating) LF_SET(WT_READ_TRUNCATE); WT_RET(__cursor_func_init(cbt, false)); /* * If we aren't already iterating in the right direction, there's * some setup to do. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV)) __wt_btcur_iterate_setup(cbt); /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the previous page, until we reach the start * of the file. */ for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? NULL : cbt->ref->page; WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page)); /* * The last page in a column-store has appended entries. * We handle it separately from the usual cursor code: * it's only that one page and it's in a simple format. */ if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) F_SET(cbt, WT_CBT_ITERATE_APPEND); if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; newpage = true; } if (page != NULL) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_prev(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_prev(cbt, newpage); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_prev(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret != WT_NOTFOUND) break; } /* * If we saw a lot of deleted records on this page, or we went * all the way through a page and only saw deleted records, try * to evict the page when we release it. Otherwise repeatedly * deleting from the beginning of a tree can have quadratic * performance. Take care not to force eviction of pages that * are genuinely empty, in new trees. */ if (page != NULL && (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD || (newpage && cbt->page_deleted_count > 0))) __wt_page_evict_soon(page); cbt->page_deleted_count = 0; WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __wt_btcur_next -- * Move to the next record in the tree. */ int __wt_btcur_next(WT_CURSOR_BTREE *cbt, bool truncating) { WT_CURSOR *cursor; WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; bool newpage; cursor = &cbt->iface; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_CONN_INCR(session, cursor_next); WT_STAT_DATA_INCR(session, cursor_next); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); WT_RET(__cursor_func_init(cbt, false)); /* * If we aren't already iterating in the right direction, there's * some setup to do. */ if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT)) __wt_btcur_iterate_setup(cbt); /* * Walk any page we're holding until the underlying call returns not- * found. Then, move to the next page, until we reach the end of the * file. */ flags = WT_READ_SKIP_INTL; /* tree walk flags */ if (truncating) LF_SET(WT_READ_TRUNCATE); for (newpage = false;; newpage = true) { page = cbt->ref == NULL ? NULL : cbt->ref->page; if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_append_next(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_next(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; } else if (page != NULL) { switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_next(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_next(cbt, newpage); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_next(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret != WT_NOTFOUND) break; /* * Column-store pages may have appended entries. Handle * it separately from the usual cursor code, it's in a * simple format. */ if (page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) { F_SET(cbt, WT_CBT_ITERATE_APPEND); continue; } } /* * If we saw a lot of deleted records on this page, or we went * all the way through a page and only saw deleted records, try * to evict the page when we release it. Otherwise repeatedly * deleting from the beginning of a tree can have quadratic * performance. Take care not to force eviction of pages that * are genuinely empty, in new trees. */ if (page != NULL && (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD || (newpage && cbt->page_deleted_count > 0))) __wt_page_evict_soon(session, cbt->ref); cbt->page_deleted_count = 0; WT_ERR(__wt_tree_walk(session, &cbt->ref, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } #ifdef HAVE_DIAGNOSTIC if (ret == 0) WT_ERR(__wt_cursor_key_order_check(session, cbt, true)); #endif if (ret == 0) F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT); err: if (ret != 0) WT_TRET(__cursor_reset(cbt)); return (ret); }
/* * __tree_walk_internal -- * Move to the next/previous page in the tree. */ static inline int __tree_walk_internal(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *), void *func_cookie, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; uint32_t slot; bool empty_internal, initial_descent, prev, skip; btree = S2BT(session); pindex = NULL; empty_internal = initial_descent = false; /* * Tree walks are special: they look inside page structures that splits * may want to free. Publish that the tree is active during this * window. */ WT_ENTER_PAGE_INDEX(session); /* Walk should never instantiate deleted pages. */ LF_SET(WT_READ_NO_EMPTY); /* * !!! * Fast-truncate currently only works on row-store trees. */ if (btree->type != BTREE_ROW) LF_CLR(WT_READ_TRUNCATE); prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; /* * There are multiple reasons and approaches to walking the in-memory * tree: * * (1) finding pages to evict (the eviction server); * (2) writing just dirty leaves or internal nodes (checkpoint); * (3) discarding pages (close); * (4) truncating pages in a range (fast truncate); * (5) skipping pages based on outside information (compaction); * (6) cursor scans (applications). * * Except for cursor scans and compaction, the walk is limited to the * cache, no pages are read. In all cases, hazard pointers protect the * walked pages from eviction. * * Walks use hazard-pointer coupling through the tree and that's OK * (hazard pointers can't deadlock, so there's none of the usual * problems found when logically locking up a btree). If the eviction * thread tries to evict the active page, it fails because of our * hazard pointer. If eviction tries to evict our parent, that fails * because the parent has a child page that can't be discarded. We do * play one game: don't couple up to our parent and then back down to a * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * * !!! * NOTE: we depend on the fact it's OK to release a page we don't hold, * that is, it's OK to release couple when couple is set to NULL. * * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * * Clear the returned value, it makes future error handling easier. */ couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start/end of the tree. */ if (ref == NULL) { restart: /* * We can be here with a NULL or root WT_REF; the page release * function handles them internally, don't complicate this code * by calling them out. */ WT_ERR(__wt_page_release(session, couple, flags)); /* * We're not supposed to walk trees without root pages. As this * has not always been the case, assert to debug that change. */ WT_ASSERT(session, btree->root.page != NULL); couple = couple_orig = ref = &btree->root; initial_descent = true; goto descend; } /* * If the active page was the root, we've reached the walk's end; we * only get here if we've returned the root to our caller, so we're * holding no hazard pointers. */ if (__wt_ref_is_root(ref)) goto done; /* Figure out the current slot in the WT_REF array. */ __ref_index_slot(session, ref, &pindex, &slot); for (;;) { /* * If we're at the last/first slot on the internal page, return * it in post-order traversal. Otherwise move to the next/prev * slot and left/right-most element in that subtree. */ while ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { /* Ascend to the parent. */ __ref_ascend(session, &ref, &pindex, &slot); /* * If at the root and returning internal pages, return * the root page, otherwise we're done. Regardless, no * hazard pointer is required, release the one we hold. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release( session, couple, flags)); if (!LF_ISSET(WT_READ_SKIP_INTL)) *refp = ref; goto done; } /* * If we got all the way through an internal page and * all of the child pages were deleted, mark it for * eviction. */ if (empty_internal && pindex->entries > 1) { __wt_page_evict_soon(session, ref); empty_internal = false; } /* * Optionally return internal pages. Swap our previous * hazard pointer for the page we'll return. We don't * handle restart or not-found returns, it would require * additional complexity and is not a possible return: * we're moving to the parent of the current child page, * the parent can't have been evicted. */ if (!LF_ISSET(WT_READ_SKIP_INTL)) { WT_ERR(__wt_page_swap( session, couple, ref, flags)); *refp = ref; goto done; } } if (prev) --slot; else ++slot; if (walkcntp != NULL) ++*walkcntp; for (;;) { /* * Move to the next slot, and set the reference hint if * it's wrong (used when we continue the walk). We don't * always update the hints when splitting, it's expected * for them to be incorrect in some workloads. */ ref = pindex->index[slot]; if (ref->pindex_hint != slot) ref->pindex_hint = slot; /* * If we see any child states other than deleted, the * page isn't empty. */ if (ref->state != WT_REF_DELETED && !LF_ISSET(WT_READ_TRUNCATE)) empty_internal = false; if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; /* Skip lookaside pages if not requested. */ if (ref->state == WT_REF_LOOKASIDE && !LF_ISSET(WT_READ_LOOKASIDE)) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try * to delete it again. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; /* * If deleting a range, try to delete the page * without instantiating it. */ WT_ERR(__wt_delete_page(session, ref, &skip)); if (skip) break; empty_internal = false; } else if (skip_func != NULL) { WT_ERR(skip_func(session, ref, func_cookie, &skip)); if (skip) break; } else { /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; } ret = __wt_page_swap(session, couple, ref, WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags); /* * Not-found is an expected return when only walking * in-cache pages, or if we see a deleted page. */ if (ret == WT_NOTFOUND) { ret = 0; break; } /* * The page we're moving to might have split, in which * case move to the last position we held. */ if (ret == WT_RESTART) { ret = 0; /* * If a cursor is setting up at the end of the * tree, we can't use our parent page's index, * because it may have already split; restart * the walk. */ if (prev && initial_descent) goto restart; /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ if (couple == &btree->root) goto restart; /* * If restarting from some original position, * repeat the increment or decrement we made at * that time. Otherwise, couple is an internal * page we've acquired after moving from that * starting position and we can treat it as a * new page. This works because we never acquire * a hazard pointer on a leaf page we're not * going to return to our caller, this will quit * working if that ever changes. */ WT_ASSERT(session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; __ref_index_slot(session, ref, &pindex, &slot); if (couple == couple_orig) break; } WT_ERR(ret); couple = ref; /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ if (WT_PAGE_IS_INTERNAL(ref->page)) { descend: empty_internal = true; /* * There's a split race when a cursor is setting * up at the end of the tree or moving backwards * through the tree and descending a level. When * splitting an internal page into its parent, * we move the WT_REF structures and update the * parent's page index before updating the split * page's page index, and it's not an atomic * update. A thread can read the parent page's * replacement page index, then read the split * page's original index, or the parent page's * original and the split page's replacement. * * This isn't a problem for a cursor setting up * at the start of the tree or moving forwards * through the tree because we do right-hand * splits on internal pages and the initial part * of the split page's namespace won't change as * part of a split. A thread reading the parent * page's and split page's indexes will move to * the same slot no matter what order of indexes * are read. * * Handle a cursor setting up at the end of the * tree or moving backwards through the tree. */ if (!prev) { WT_INTL_INDEX_GET( session, ref->page, pindex); slot = 0; } else if (initial_descent) { if (!__ref_initial_descent_prev( session, ref, &pindex)) goto restart; slot = pindex->entries - 1; } else { __ref_descend_prev( session, ref, &pindex); slot = pindex->entries - 1; } continue; } /* * The tree-walk restart code knows we return any leaf * page we acquire (never hazard-pointer coupling on * after acquiring a leaf page), and asserts no restart * happens while holding a leaf page. This page must be * returned to our caller. */ *refp = ref; goto done; } } done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); }
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; int busy, cache_work, force_attempts, oldgen, stalled; btree = S2BT(session); stalled = 0; for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, read it. If this thread is * allowed to do eviction work, check for space in the * cache. */ if (!LF_ISSET(WT_READ_NO_EVICT)) WT_RET(__wt_cache_eviction_check( session, 1, NULL)); WT_RET(__page_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on another thread's read, stall. */ WT_STAT_FAST_CONN_INCR(session, page_read_blocked); stalled = 1; break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on eviction, stall. */ WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); stalled = 1; break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory. * * Get a hazard pointer if one is required. We cannot * be evicting if no hazard pointer is required, we're * done. */ if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) goto skip_evict; /* * The expected reason we can't get a hazard pointer is * because the page is being evicted, yield, try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } /* * If eviction is configured for this file, check to see * if the page qualifies for forced eviction and update * the page's generation number. If eviction isn't being * done on this file, we're done. */ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || F_ISSET(btree, WT_BTREE_NO_EVICTION)) goto skip_evict; /* * Forcibly evict pages that are too big. */ page = ref->page; if (force_attempts < 10 && __evict_force_check(session, page)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ if (ret == EBUSY) { ret = 0; WT_STAT_FAST_CONN_INCR(session, page_forcible_evict_blocked); stalled = 1; break; } WT_RET(ret); /* * The result of a successful forced eviction * is a page-state transition (potentially to * an in-memory page we can use, or a restart * return for our caller), continue the outer * page-acquisition loop. */ continue; } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_bump(session); skip_evict: /* * Check if we need an autocommit transaction. * Starting a transaction can trigger eviction, so skip * it if eviction isn't permitted. */ return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : __wt_txn_autocommit_check(session)); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (stalled) wait_cnt += 1000; else if (++wait_cnt < 1000) { __wt_yield(); continue; } /* * If stalling and this thread is allowed to do eviction work, * check if the cache needs help. If we do work for the cache, * substitute that for a sleep. */ if (!LF_ISSET(WT_READ_NO_EVICT)) { WT_RET( __wt_cache_eviction_check(session, 1, &cache_work)); if (cache_work) continue; } sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000); WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } }
/*将btree cursor移动到下一个记录*/ int __wt_btcur_next(WT_CURSOR_BTREE *cbt, int truncating) { WT_DECL_RET; WT_PAGE *page; WT_SESSION_IMPL *session; uint32_t flags; int newpage; session = (WT_SESSION_IMPL *)cbt->iface.session; WT_STAT_FAST_CONN_INCR(session, cursor_next); WT_STAT_FAST_DATA_INCR(session, cursor_next); /*btree 扫描标示*/ flags = WT_READ_SKIP_INTL; if (truncating) LF_SET(WT_READ_TRUNCATE); /*激活一个btree cursor*/ WT_RET(__cursor_func_init(cbt, 0)); /*初始化cursor*/ if (!F_ISSET(cbt, WT_CBT_ITERATE_NEXT)) __wt_btcur_iterate_setup(cbt, 1); /*对btree的扫描*/ for (;;){ page = cbt->ref == NULL ? NULL : cbt->ref->page; WT_ASSERT(session, page == NULL || !WT_PAGE_IS_INTERNAL(page)); /*column store append方式,在insert header上做扫描*/ if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)){ switch (page->type){ case WT_PAGE_COL_FIX: ret = __cursor_fix_append_next(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_append_next(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } if (ret == 0) break; /*清除掉column store的标记*/ F_CLR(cbt, WT_CBT_ITERATE_APPEND); if (ret != WT_NOTFOUND) break; } else if (page != NULL){ switch (page->type) { case WT_PAGE_COL_FIX: ret = __cursor_fix_next(cbt, newpage); break; case WT_PAGE_COL_VAR: ret = __cursor_var_next(cbt, newpage); break; case WT_PAGE_ROW_LEAF: ret = __cursor_row_next(cbt, newpage); break; WT_ILLEGAL_VALUE_ERR(session); } /*找到对应的记录了,直接返回*/ if (ret != WT_NOTFOUND) break; /*假如是column store方式,检查是否要扫描insert header list*/ if (page->type != WT_PAGE_ROW_LEAF && (cbt->ins_head = WT_COL_APPEND(page)) != NULL) { F_SET(cbt, WT_CBT_ITERATE_APPEND); continue; } } /*删除的记录太多,对page进行重组,增大page的填充因子*/ if (page != NULL && (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD || (newpage && cbt->page_deleted_count > 0))){ __wt_page_evict_soon(page); } cbt->page_deleted_count = 0; /*btree cursor跳转到下一个page上*/ WT_ERR(__wt_tree_walk(session, &cbt->ref, NULL, flags)); WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND); } err: if (ret != 0) /*失败了,恢复cursor的状态*/ WT_TRET(__cursor_reset(cbt)); return ret; }
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_DECL_RET; WT_PAGE *page; int busy, force_attempts, oldgen; for (force_attempts = oldgen = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, attempt to read it. * Make sure there is space in the cache. */ WT_RET(__wt_cache_full_check(session)); WT_RET(__wt_cache_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* FALLTHROUGH */ case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* The page is busy -- wait. */ break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory: get a hazard pointer, update * the page's LRU and return. The expected reason we * can't get a hazard pointer is because the page is * being evicted; yield and try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) break; page = ref->page; WT_ASSERT(session, page != NULL); /* Forcibly evict pages that are too big. */ if (!LF_ISSET(WT_READ_NO_EVICT) && force_attempts < 10 && __evict_force_check(session, page)) { ++force_attempts; WT_RET(__wt_page_release(session, ref, flags)); break; } /* Check if we need an autocommit transaction. */ if ((ret = __wt_txn_autocommit_check(session)) != 0) { WT_TRET(__wt_hazard_clear(session, page)); return (ret); } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_set(session); return (0); WT_ILLEGAL_VALUE(session); } /* We failed to get the page -- yield before retrying. */ __wt_yield(); } }
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; int busy, force_attempts, oldgen; for (force_attempts = oldgen = 0, wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, attempt to read it. * Make sure there is space in the cache. */ WT_RET(__wt_cache_full_check(session)); WT_RET(__wt_cache_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); WT_STAT_FAST_CONN_INCR(session, page_read_blocked); break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory: get a hazard pointer, update * the page's LRU and return. The expected reason we * can't get a hazard pointer is because the page is * being evicted; yield and try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } page = ref->page; WT_ASSERT(session, page != NULL); /* * Forcibly evict pages that are too big. */ if (force_attempts < 10 && __evict_force_check(session, page, flags)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ if (ret == EBUSY) { ret = 0; wait_cnt += 1000; WT_STAT_FAST_CONN_INCR(session, page_forcible_evict_blocked); break; } else WT_RET(ret); /* * The result of a successful forced eviction * is a page-state transition (potentially to * an in-memory page we can use, or a restart * return for our caller), continue the outer * page-acquisition loop. */ continue; } /* Check if we need an autocommit transaction. */ if ((ret = __wt_txn_autocommit_check(session)) != 0) { WT_TRET(__wt_hazard_clear(session, page)); return (ret); } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_set(session); return (0); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (++wait_cnt < 1000) __wt_yield(); else { sleep_cnt = WT_MIN(wait_cnt, 10000); wait_cnt *= 2; WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } } }
/* * __wt_tree_walk -- * Move to the next/previous page in the tree. */ int __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint64_t *walkcntp, uint32_t flags) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; bool empty_internal, prev, skip; uint32_t slot; btree = S2BT(session); empty_internal = false; /* * Tree walks are special: they look inside page structures that splits * may want to free. Publish that the tree is active during this * window. */ WT_ENTER_PAGE_INDEX(session); /* Walk should never instantiate deleted pages. */ LF_SET(WT_READ_NO_EMPTY); /* * !!! * Fast-truncate currently only works on row-store trees. */ if (btree->type != BTREE_ROW) LF_CLR(WT_READ_TRUNCATE); prev = LF_ISSET(WT_READ_PREV) ? 1 : 0; /* * There are multiple reasons and approaches to walking the in-memory * tree: * * (1) finding pages to evict (the eviction server); * (2) writing just dirty leaves or internal nodes (checkpoint); * (3) discarding pages (close); * (4) truncating pages in a range (fast truncate); * (5) skipping pages based on outside information (compaction); * (6) cursor scans (applications). * * Except for cursor scans and compaction, the walk is limited to the * cache, no pages are read. In all cases, hazard pointers protect the * walked pages from eviction. * * Walks use hazard-pointer coupling through the tree and that's OK * (hazard pointers can't deadlock, so there's none of the usual * problems found when logically locking up a btree). If the eviction * thread tries to evict the active page, it fails because of our * hazard pointer. If eviction tries to evict our parent, that fails * because the parent has a child page that can't be discarded. We do * play one game: don't couple up to our parent and then back down to a * new leaf, couple to the next page to which we're descending, it * saves a hazard-pointer swap for each cursor page movement. * * !!! * NOTE: we depend on the fact it's OK to release a page we don't hold, * that is, it's OK to release couple when couple is set to NULL. * * Take a copy of any held page and clear the return value. Remember * the hazard pointer we're currently holding. * * We may be passed a pointer to btree->evict_page that we are clearing * here. We check when discarding pages that we're not discarding that * page, so this clear must be done before the page is released. */ couple = couple_orig = ref = *refp; *refp = NULL; /* If no page is active, begin a walk from the start of the tree. */ if (ref == NULL) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } ascend: /* * If the active page was the root, we've reached the walk's end. * Release any hazard-pointer we're holding. */ if (__wt_ref_is_root(ref)) { WT_ERR(__wt_page_release(session, couple, flags)); goto done; } /* Figure out the current slot in the WT_REF array. */ __page_refp(session, ref, &pindex, &slot); for (;;) { /* * If we're at the last/first slot on the page, return this page * in post-order traversal. Otherwise we move to the next/prev * slot and left/right-most element in its subtree. */ if ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { ref = ref->home->pg_intl_parent_ref; /* * If we got all the way through an internal page and * all of the child pages were deleted, mark it for * eviction. */ if (empty_internal && pindex->entries > 1) { __wt_page_evict_soon(ref->page); empty_internal = false; } /* Optionally skip internal pages. */ if (LF_ISSET(WT_READ_SKIP_INTL)) goto ascend; /* * We've ascended the tree and are returning an internal * page. If it's the root, discard our hazard pointer, * otherwise, swap our hazard pointer for the page we'll * return. */ if (__wt_ref_is_root(ref)) WT_ERR(__wt_page_release( session, couple, flags)); else { /* * Locate the reference to our parent page then * swap our child hazard pointer for the parent. * We don't handle restart or not-found returns. * It would require additional complexity and is * not a possible return: we're moving to the * parent of the current child page, our parent * reference can't have split or been evicted. */ __page_refp(session, ref, &pindex, &slot); if ((ret = __wt_page_swap( session, couple, ref, flags)) != 0) { WT_TRET(__wt_page_release( session, couple, flags)); WT_ERR(ret); } } *refp = ref; goto done; } if (prev) --slot; else ++slot; if (walkcntp != NULL) ++*walkcntp; for (;;) { /* * Move to the next slot, and set the reference hint if * it's wrong (used when we continue the walk). We don't * update those hints when splitting, so it's common for * them to be incorrect in some workloads. */ ref = pindex->index[slot]; if (ref->pindex_hint != slot) ref->pindex_hint = slot; /* * If we see any child states other than deleted, the * page isn't empty. */ if (ref->state != WT_REF_DELETED && !LF_ISSET(WT_READ_TRUNCATE)) empty_internal = false; if (LF_ISSET(WT_READ_CACHE)) { /* * Only look at unlocked pages in memory: * fast-path some common cases. */ if (LF_ISSET(WT_READ_NO_WAIT) && ref->state != WT_REF_MEM) break; } else if (LF_ISSET(WT_READ_TRUNCATE)) { /* * Avoid pulling a deleted page back in to try * to delete it again. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; /* * If deleting a range, try to delete the page * without instantiating it. */ WT_ERR(__wt_delete_page(session, ref, &skip)); if (skip) break; empty_internal = false; } else if (LF_ISSET(WT_READ_COMPACT)) { /* * Skip deleted pages, rewriting them doesn't * seem useful. */ if (ref->state == WT_REF_DELETED) break; /* * If the page is in-memory, we want to look at * it (it may have been modified and written, * and the current location is the interesting * one in terms of compaction, not the original * location). If the page isn't in-memory, test * if the page will help with compaction, don't * read it if we don't have to. */ if (ref->state == WT_REF_DISK) { WT_ERR(__wt_compact_page_skip( session, ref, &skip)); if (skip) break; } } else { /* * Try to skip deleted pages visible to us. */ if (ref->state == WT_REF_DELETED && __wt_delete_page_skip(session, ref, false)) break; } ret = __wt_page_swap(session, couple, ref, flags); /* * Not-found is an expected return when only walking * in-cache pages, or if we see a deleted page. */ if (ret == WT_NOTFOUND) { ret = 0; break; } /* * The page we're moving to might have split, in which * case move to the last position we held. */ if (ret == WT_RESTART) { ret = 0; /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. */ if (couple == &btree->root) { ref = &btree->root; if (ref->page == NULL) goto done; goto descend; } /* * If restarting from some original position, * repeat the increment or decrement we made at * that time. Otherwise, couple is an internal * page we've acquired after moving from that * starting position and we can treat it as a * new page. This works because we never acquire * a hazard pointer on a leaf page we're not * going to return to our caller, this will quit * working if that ever changes. */ WT_ASSERT(session, couple == couple_orig || WT_PAGE_IS_INTERNAL(couple->page)); ref = couple; __page_refp(session, ref, &pindex, &slot); if (couple == couple_orig) break; } WT_ERR(ret); /* * A new page: configure for traversal of any internal * page's children, else return the leaf page. */ descend: couple = ref; page = ref->page; if (WT_PAGE_IS_INTERNAL(page)) { WT_INTL_INDEX_GET(session, page, pindex); slot = prev ? pindex->entries - 1 : 0; empty_internal = true; } else { *refp = ref; goto done; } } } done: err: WT_LEAVE_PAGE_INDEX(session); return (ret); }
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; bool busy, cache_work, evict_soon, stalled; int force_attempts; btree = S2BT(session); /* * Ignore reads of pages already known to be in cache, otherwise the * eviction server can dominate these statistics. */ if (!LF_ISSET(WT_READ_CACHE)) { WT_STAT_FAST_CONN_INCR(session, cache_pages_requested); WT_STAT_FAST_DATA_INCR(session, cache_pages_requested); } for (evict_soon = stalled = false, force_attempts = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DELETED: if (LF_ISSET(WT_READ_NO_EMPTY) && __wt_delete_page_skip(session, ref, false)) return (WT_NOTFOUND); /* FALLTHROUGH */ case WT_REF_DISK: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, read it. If this thread is * allowed to do eviction work, check for space in the * cache. */ if (!LF_ISSET(WT_READ_NO_EVICT)) WT_RET(__wt_cache_eviction_check( session, 1, NULL)); WT_RET(__page_read(session, ref)); /* * If configured to not trash the cache, leave the page * generation unset, we'll set it before returning to * the oldest read generation, so the page is forcibly * evicted as soon as possible. We don't do that set * here because we don't want to evict the page before * we "acquire" it. */ evict_soon = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on another thread's read, stall. */ WT_STAT_FAST_CONN_INCR(session, page_read_blocked); stalled = true; break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on eviction, stall. */ WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); stalled = true; break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory. * * Get a hazard pointer if one is required. We cannot * be evicting if no hazard pointer is required, we're * done. */ if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) goto skip_evict; /* * The expected reason we can't get a hazard pointer is * because the page is being evicted, yield, try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } /* * If eviction is configured for this file, check to see * if the page qualifies for forced eviction and update * the page's generation number. If eviction isn't being * done on this file, we're done. * In-memory split of large pages is allowed while * no_eviction is set on btree, whereas reconciliation * is not allowed. */ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || (F_ISSET(btree, WT_BTREE_NO_EVICTION) && !F_ISSET(btree, WT_BTREE_NO_RECONCILE))) goto skip_evict; /* * Forcibly evict pages that are too big. */ if (force_attempts < 10 && __evict_force_check(session, ref)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ if (ret == EBUSY) { ret = 0; WT_STAT_FAST_CONN_INCR(session, page_forcible_evict_blocked); stalled = true; break; } WT_RET(ret); /* * The result of a successful forced eviction * is a page-state transition (potentially to * an in-memory page we can use, or a restart * return for our caller), continue the outer * page-acquisition loop. */ continue; } /* * If we read the page and are configured to not trash * the cache, and no other thread has already used the * page, set the oldest read generation so the page is * forcibly evicted as soon as possible. * * Otherwise, if we read the page, or, if configured to * update the page's read generation and the page isn't * already flagged for forced eviction, update the page * read generation. */ page = ref->page; if (page->read_gen == WT_READGEN_NOTSET) { if (evict_soon) __wt_page_evict_soon(session, ref); else __wt_cache_read_gen_new(session, page); } else if (!LF_ISSET(WT_READ_NO_GEN)) __wt_cache_read_gen_bump(session, page); skip_evict: /* * Check if we need an autocommit transaction. * Starting a transaction can trigger eviction, so skip * it if eviction isn't permitted. */ return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : __wt_txn_autocommit_check(session)); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (stalled) wait_cnt += WT_THOUSAND; else if (++wait_cnt < WT_THOUSAND) { __wt_yield(); continue; } /* * If stalling and this thread is allowed to do eviction work, * check if the cache needs help. If we do work for the cache, * substitute that for a sleep. */ if (!LF_ISSET(WT_READ_NO_EVICT)) { WT_RET( __wt_cache_eviction_check(session, 1, &cache_work)); if (cache_work) continue; } sleep_cnt = WT_MIN(sleep_cnt + WT_THOUSAND, 10000); WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } }