/*
 * __rec_discard_page --
 *	Discard the page.
 *
 *	session:   session handle performing the discard.
 *	page:      the page to discard.
 *	exclusive: when zero, the page is first cleared from the eviction
 *	           list; when non-zero that step is skipped.
 */
static void
__rec_discard_page(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
{
	/*
	 * We should never evict the file's current eviction point.
	 *
	 * The btree handle is looked up with S2BT(), the same accessor
	 * __rec_review uses, so both functions reach the btree the same way.
	 */
	WT_ASSERT(session, S2BT(session)->evict_page != page);

	/* Make sure a page is not in the eviction request list. */
	if (!exclusive)
		__wt_evict_list_clr_page(session, page);

	/*
	 * Discard the page. The address passed is that of our by-value copy
	 * of the pointer, so the caller's own pointer is not changed by this
	 * call.
	 */
	__wt_page_out(session, &page);
}
/*
 * __rec_review --
 *	Get exclusive access to the page and review the page and its subtree
 *	for conditions that would block its eviction.
 *
 *	The ref and page arguments may appear to be redundant, because usually
 *	ref->page == page and page->ref == ref. However, we need both because
 *	(a) there are cases where ref == NULL (e.g., for root page or during
 *	salvage), and (b) we can't safely look at page->ref until we have a
 *	hazard pointer.
 *
 *	session:     session handle; the btree is obtained via S2BT(session).
 *	ref:         reference for the page, handed to __hazard_exclusive when
 *	             the caller is not exclusive. The same variable is passed
 *	             to WT_REF_FOREACH for the subtree walk, so do not rely on
 *	             its original value after that loop.
 *	page:        the page being reviewed.
 *	exclusive:   non-zero if the caller already has the tree locked down;
 *	             in that case no per-page exclusive access is acquired and
 *	             the final transaction-visibility check is skipped.
 *	merge:       non-zero when merging internal pages; the function returns
 *	             0 once the locking and checkpoint checks have passed,
 *	             without writing anything.
 *	top:         non-zero for the top-level page of the review; recursive
 *	             calls on children pass 0.
 *	inmem_split: output, set to 1 when reconciliation returns EBUSY for a
 *	             row-store leaf page and __wt_eviction_force_check is true.
 *	             Never cleared here.
 *	istree:      output, set to 1 when an in-memory child is found. Never
 *	             cleared here.
 *
 *	Returns 0 if nothing blocks eviction, EBUSY if something does, or
 *	whatever error the WT_RET/WT_VERBOSE_RET-wrapped calls produce.
 *
 *	NOTE(review): nothing in this function releases exclusive access once
 *	acquired, including on the error returns; presumably the caller is
 *	responsible for that -- confirm against the caller.
 */
static int
__rec_review(WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE *page,
    int exclusive, int merge, int top, int *inmem_split, int *istree)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	WT_PAGE *t;
	uint32_t i;

	btree = S2BT(session);

	/*
	 * Get exclusive access to the page if our caller doesn't have the tree
	 * locked down.
	 */
	if (!exclusive) {
		WT_RET(__hazard_exclusive(session, ref, top));

		/*
		 * Now the page is locked, remove it from the LRU eviction
		 * queue. We have to do this before freeing the page memory or
		 * otherwise touching the reference because eviction paths
		 * assume a non-NULL reference on the queue is pointing at
		 * valid memory.
		 */
		__wt_evict_list_clr_page(session, page);
	}

	/*
	 * Recurse through the page's subtree: this happens first because we
	 * have to write pages in depth-first order, otherwise we'll dirty
	 * pages after we've written them.
	 *
	 * Only internal pages (column- or row-store) are walked. The switch
	 * has no default case: a state not listed below falls out of the
	 * switch and the walk simply moves on to the next child.
	 */
	if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)
		WT_REF_FOREACH(page, ref, i)
			switch (ref->state) {
			case WT_REF_DISK:		/* On-disk */
			case WT_REF_DELETED:		/* On-disk, deleted */
				break;
			case WT_REF_MEM:		/* In-memory */
				/*
				 * Tell our caller if there's a subtree so we
				 * know to do a full walk when discarding the
				 * page.
				 */
				*istree = 1;
				WT_RET(__rec_review(session, ref, ref->page,
				    exclusive, merge, 0, inmem_split, istree));
				break;
			case WT_REF_EVICT_WALK:		/* Walk point */
			case WT_REF_LOCKED:		/* Being evicted */
			case WT_REF_READING:		/* Being read */
				return (EBUSY);
			}

	/*
	 * If the file is being checkpointed, we cannot evict dirty pages,
	 * because that may free a page that appears on an internal page in the
	 * checkpoint. Don't rely on new updates being skipped by the
	 * transaction used for transaction reads: (1) there are paths that
	 * dirty pages for artificial reasons; (2) internal pages aren't
	 * transactional; and (3) if an update was skipped during the
	 * checkpoint (leaving the page dirty), then rolled back, we could
	 * still successfully overwrite a page and corrupt the checkpoint.
	 *
	 * Further, even for clean pages, the checkpoint's reconciliation of an
	 * internal page might race with us as we evict a child in the page's
	 * subtree.
	 *
	 * One half of that test is in the reconciliation code: the checkpoint
	 * thread waits for eviction-locked pages to settle before determining
	 * their status. The other half of the test is here: after acquiring
	 * the exclusive eviction lock on a page, confirm no page in the page's
	 * stack of pages from the root is being reconciled in a checkpoint.
	 * This ensures we either see the checkpoint-walk state here, or the
	 * reconciliation of the internal page sees our exclusive lock on the
	 * child page and waits until we're finished evicting the child page
	 * (or give up if eviction isn't possible).
	 *
	 * We must check the full stack (we might be attempting to evict a leaf
	 * page multiple levels beneath the internal page being reconciled as
	 * part of the checkpoint, and all of the intermediate nodes are being
	 * merged into the internal page).
	 *
	 * There's no simple test for knowing if a page in our page stack is
	 * involved in a checkpoint. The internal page's checkpoint-walk flag
	 * is the best test, but because it's not set anywhere for the root
	 * page, it's not a complete test.
	 *
	 * Quit for any page that's not a simple, in-memory page. (Almost the
	 * same as checking for the checkpoint-walk flag. I don't think there
	 * are code paths that change the page's status from checkpoint-walk,
	 * but these races are hard enough I'm not going to proceed if there's
	 * anything other than a vanilla, in-memory tree stack.) Climb until
	 * we find a page which can't be merged into its parent, and fail if
	 * we never find such a page.
	 */
	if (btree->checkpointing && !merge && __wt_page_is_modified(page)) {
		/*
		 * Shared "blocked by checkpoint" exit: the parent-stack walk
		 * below also jumps to this label, so the statistics are
		 * incremented on both paths before returning EBUSY.
		 */
ckpt:		WT_STAT_FAST_CONN_INCR(session, cache_eviction_checkpoint);
		WT_STAT_FAST_DATA_INCR(session, cache_eviction_checkpoint);
		return (EBUSY);
	}

	/*
	 * The parent-stack walk is done for the top-level page only. The loop
	 * has no termination condition of its own: it ends either with a jump
	 * to the ckpt label or by breaking at the first ancestor that isn't
	 * flagged empty, split or split-merge.
	 */
	if (btree->checkpointing && top)
		for (t = page->parent;; t = t->parent) {
			if (t == NULL || t->ref == NULL)	/* root */
				goto ckpt;
			if (t->ref->state != WT_REF_MEM)	/* scary */
				goto ckpt;
			if (t->modify == NULL ||		/* not merged */
			    !F_ISSET(t->modify, WT_PM_REC_EMPTY |
			    WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
				break;
		}

	/*
	 * If we are merging internal pages, we just need exclusive access, we
	 * don't need to write everything.
	 */
	if (merge)
		return (0);

	/*
	 * Fail if any page in the top-level page's subtree won't be merged into
	 * its parent, the page that cannot be merged must be evicted first.
	 * The test is necessary but should not fire much: the eviction code is
	 * biased for leaf pages, an internal page shouldn't be selected for
	 * eviction until its children have been evicted.
	 *
	 * We have to write dirty pages to know their final state, a page marked
	 * empty may have had records added since reconciliation, a page marked
	 * split may have had records deleted and no longer need to split.
	 * Split-merge pages are the exception: they can never be changed into
	 * anything other than a split-merge page and are merged regardless of
	 * being clean or dirty.
	 *
	 * Writing the page is expensive, do a cheap test first: if it doesn't
	 * appear a subtree page can be merged, quit. It's possible the page
	 * has been emptied since it was last reconciled, and writing it before
	 * testing might be worthwhile, but it's more probable we're attempting
	 * to evict an internal page with live children, and that's a waste of
	 * time.
	 */
	mod = page->modify;
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);

	/*
	 * If the page is dirty and can possibly change state, write it so we
	 * know the final state.
	 *
	 * NOTE(review): F_ISSET(mod, ...) is evaluated here without a NULL
	 * check on mod when top is set; this relies on __wt_page_is_modified()
	 * being false whenever page->modify is NULL -- confirm.
	 */
	if (__wt_page_is_modified(page) &&
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) {
		ret = __wt_rec_write(session, page,
		    NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT);

		/*
		 * Update the page's modification reference, reconciliation
		 * might have changed it.
		 */
		mod = page->modify;

		/*
		 * If reconciliation failed due to active modifications and
		 * the page is a lot larger than the maximum allowed, it is
		 * likely that we are having trouble reconciling it due to
		 * contention, attempt to split the page in memory.
		 *
		 * Note, we won't be here if recursively descending a tree of
		 * pages: dirty row-store leaf pages can't be merged into their
		 * parents, which means if top wasn't true in this test, we'd
		 * have returned busy before attempting reconciliation.
		 */
		if (ret == EBUSY && page->type == WT_PAGE_ROW_LEAF &&
		    __wt_eviction_force_check(session, page)) {
			*inmem_split = 1;
			return (0);
		}
		if (ret == EBUSY) {
			/* Give up if there are unwritten changes */
			WT_VERBOSE_RET(session, evict,
			    "eviction failed, reconciled page"
			    " contained active updates");

			/*
			 * We may be able to discard any "update" memory the
			 * page no longer needs.
			 *
			 * Only leaf page types are handled; any other page
			 * type falls through the switch untouched.
			 */
			switch (page->type) {
			case WT_PAGE_COL_FIX:
			case WT_PAGE_COL_VAR:
				__wt_col_leaf_obsolete(session, page);
				break;
			case WT_PAGE_ROW_LEAF:
				__wt_row_leaf_obsolete(session, page);
				break;
			}
		}
		/* Pass any reconciliation failure, EBUSY included, upward. */
		WT_RET(ret);

		WT_ASSERT(session, !__wt_page_is_modified(page));
	}

	/*
	 * If the page is clean, but was ever modified, make sure all of the
	 * updates on the page are old enough that they can be discarded from
	 * cache.
	 */
	if (!exclusive && mod != NULL &&
	    !__wt_txn_visible_all(session, mod->disk_txn))
		return (EBUSY);

	/*
	 * Repeat the test: fail if any page in the top-level page's subtree
	 * won't be merged into its parent. The test uses the value of mod
	 * refreshed after reconciliation, if reconciliation ran.
	 */
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);

	return (0);
}