/*
 * __wt_ref_out --
 *     Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
{
    /*
     * A version of the page-out function that allows us to make additional
     * diagnostic checks.
     */
    WT_ASSERT(session, S2BT(session)->evict_ref != ref);

    __wt_page_out(session, &ref->page);
}
/*
 * __rec_discard_page --
 *     Discard the page.
 */
static void
__rec_discard_page(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
{
    /* We should never evict the file's current eviction point. */
    WT_ASSERT(session, session->btree->evict_page != page);

    /* Make sure a page is not in the eviction request list. */
    if (!exclusive)
        __wt_evict_list_clr_page(session, page);

    /* Discard the page. */
    __wt_page_out(session, &page);
}
/*
 * __wt_free_ref --
 *     Discard the contents of a WT_REF structure (optionally including the
 *     pages it references).
 */
void
__wt_free_ref(
    WT_SESSION_IMPL *session, WT_REF *ref, int page_type, bool free_pages)
{
    WT_IKEY *ikey;

    if (ref == NULL)
        return;

    /*
     * Optionally free the referenced pages. (The path that frees referenced
     * pages is only used for error cleanup: no instantiated-and-then-discarded
     * page should have WT_REF entries with real pages. The page may have been
     * marked dirty as well; page discard checks for that, so we mark it clean
     * explicitly.)
     */
    if (free_pages && ref->page != NULL) {
        __wt_page_modify_clear(session, ref->page);
        __wt_page_out(session, &ref->page);
    }

    /*
     * Optionally free any row-store WT_REF key allocation. Historic versions
     * of this code looked in a passed-in page argument, but that is dangerous:
     * some of our error-path callers create WT_REF structures without ever
     * setting WT_REF.home or having a parent page to which the WT_REF will be
     * linked. Those WT_REF structures invariably have instantiated keys (they
     * obviously cannot be on-page keys), and we must free that memory.
     */
    switch (page_type) {
    case WT_PAGE_ROW_INT:
    case WT_PAGE_ROW_LEAF:
        if ((ikey = __wt_ref_key_instantiated(ref)) != NULL)
            __wt_free(session, ikey);
        break;
    }

    /*
     * Free any address allocation; if there's no linked WT_REF page, it must
     * have been allocated.
     */
    __wt_ref_addr_free(session, ref);

    /* Free any page-deleted information. */
    if (ref->page_del != NULL) {
        __wt_free(session, ref->page_del->update_list);
        __wt_free(session, ref->page_del);
    }

    __wt_overwrite_and_free(session, ref);
}
/*
 * __wt_free_ref --
 *     Discard the contents of a WT_REF structure (optionally including the
 *     pages it references).
 */
void
__wt_free_ref(
    WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref, bool free_pages)
{
    WT_IKEY *ikey;

    if (ref == NULL)
        return;

    /*
     * Optionally free the referenced pages. (The path that frees referenced
     * pages is only used for error cleanup: no instantiated-and-then-discarded
     * page should have WT_REF entries with real pages. The page may have been
     * marked dirty as well; page discard checks for that, so we mark it clean
     * explicitly.)
     */
    if (free_pages && ref->page != NULL) {
        __wt_page_modify_clear(session, ref->page);
        __wt_page_out(session, &ref->page);
    }

    /* Free any key allocation. */
    switch (page->type) {
    case WT_PAGE_ROW_INT:
    case WT_PAGE_ROW_LEAF:
        if ((ikey = __wt_ref_key_instantiated(ref)) != NULL)
            __wt_free(session, ikey);
        break;
    }

    /* Free any address allocation. */
    if (ref->addr != NULL && __wt_off_page(page, ref->addr)) {
        __wt_free(session, ((WT_ADDR *)ref->addr)->addr);
        __wt_free(session, ref->addr);
    }

    /* Free any page-deleted information. */
    if (ref->page_del != NULL) {
        __wt_free(session, ref->page_del->update_list);
        __wt_free(session, ref->page_del);
    }

    __wt_overwrite_and_free(session, ref);
}
/*
 * __free_page_modify --
 *     Discard the page's associated modification structures.
 */
static void
__free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_INSERT_HEAD *append;
    WT_PAGE_MODIFY *mod;

    mod = page->modify;

    switch (F_ISSET(mod, WT_PM_REC_MASK)) {
    case WT_PM_REC_SPLIT:
        /*
         * If the page split, there may be one or more pages linked from
         * the page; walk the list, discarding pages.
         */
        __wt_page_out(session, &mod->u.split);
        break;
    case WT_PM_REC_REPLACE:
        /*
         * Discard any replacement address: this memory is usually moved
         * into the parent's WT_REF, but at the root that can't happen.
         */
        __wt_free(session, mod->u.replace.addr);
        break;
    default:
        break;
    }

    /* Free the append array. */
    if ((append = WT_COL_APPEND(page)) != NULL) {
        __free_skip_list(session, WT_SKIP_FIRST(append));
        __wt_free(session, append);
        __wt_free(session, mod->append);
    }

    /* Free the insert/update array. */
    if (mod->update != NULL)
        __free_skip_array(session, mod->update,
            page->type == WT_PAGE_COL_FIX ? 1 : page->entries);

    /* Discard any objects the page was tracking plus associated memory. */
    __wt_rec_track_discard(session, page);
    __wt_free(session, mod->track);

    __wt_free(session, page->modify);
}
/*
 * __wt_ref_out --
 *     Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref)
{
    /*
     * A version of the page-out function that allows us to make additional
     * diagnostic checks.
     *
     * The WT_REF cannot be the eviction thread's location.
     */
    WT_ASSERT(session, S2BT(session)->evict_ref != ref);

#ifdef HAVE_DIAGNOSTIC
    {
    WT_HAZARD *hp;
    int i;

    /*
     * Make sure no other thread has a hazard pointer on the page we are
     * about to discard. This is complicated by the fact that readers
     * publish their hazard pointer before re-checking the page state, so
     * our check can race with readers without indicating a real problem.
     * Wait for up to a second for hazard pointers to be cleared.
     */
    for (hp = NULL, i = 0; i < 100; i++) {
        if ((hp = __wt_hazard_check(session, ref)) == NULL)
            break;
        __wt_sleep(0, 10000);
    }
    if (hp != NULL)
        __wt_errx(session,
            "discarded page has hazard pointer: (%p: %s, line %d)",
            (void *)hp->ref, hp->file, hp->line);
    WT_ASSERT(session, hp == NULL);
    }
#endif

    __wt_page_out(session, &ref->page);
}
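/*
 * Illustrative sketch (not from the original source): the HAVE_DIAGNOSTIC
 * block above waits up to roughly a second (100 tries x 10ms) for racing
 * readers to drop their hazard pointers before asserting.  The same bounded
 * retry-with-sleep idiom, shown in isolation with a hypothetical predicate
 * callback; the attempt count and sleep interval are assumptions mirroring
 * the loop above.
 */
#include <stdbool.h>
#include <unistd.h>

static bool
example_wait_for_clear(bool (*still_set)(void *), void *cookie)
{
    int i;

    for (i = 0; i < 100; i++) {
        if (!still_set(cookie))
            return (true);          /* Condition cleared in time. */
        (void)usleep(10000);        /* Give racing readers 10ms to back off. */
    }
    return (false);                 /* Still set: likely a real problem. */
}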
/*
 * __merge_new_page --
 *     Create a new in-memory internal page.
 */
static int
__merge_new_page(WT_SESSION_IMPL *session,
    uint8_t type, uint32_t entries, int merge, WT_PAGE **pagep)
{
    WT_DECL_RET;
    WT_PAGE *newpage;

    /* Allocate a new internal page and fill it in. */
    WT_RET(__wt_page_alloc(session, type, entries, &newpage));
    newpage->read_gen = WT_READ_GEN_NOTSET;
    newpage->entries = entries;

    WT_ERR(__wt_page_modify_init(session, newpage));
    if (merge)
        F_SET(newpage->modify, WT_PM_REC_SPLIT_MERGE);
    else
        __wt_page_modify_set(session, newpage);

    *pagep = newpage;
    return (0);

err:    __wt_page_out(session, &newpage);
    return (ret);
}
/*
 * __rec_discard_tree --
 *     Discard the tree rooted in a page (that is, any pages merged into
 *     it), then the page itself.
 */
static void
__rec_discard_tree(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
{
    WT_REF *ref;
    uint32_t i;

    switch (page->type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        /* For each entry in the page... */
        WT_REF_FOREACH(page, ref, i) {
            if (ref->state == WT_REF_DISK ||
                ref->state == WT_REF_DELETED)
                continue;
            WT_ASSERT(session,
                exclusive || ref->state == WT_REF_LOCKED);
            __rec_discard_tree(session, ref->page, exclusive);
        }
        /* FALLTHROUGH */
    default:
        __wt_page_out(session, &page);
        break;
    }
}
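/*
 * Illustrative sketch (not from the original source): __rec_discard_tree
 * frees a subtree bottom-up, recursing into in-memory children before
 * discarding the page itself.  The same post-order pattern on a hypothetical
 * binary node type:
 */
#include <stdlib.h>

struct example_node {
    struct example_node *left, *right;
};

static void
example_tree_out(struct example_node **nodep)
{
    struct example_node *node;

    if ((node = *nodep) == NULL)
        return;
    *nodep = NULL;                  /* Clear the caller's reference first. */

    example_tree_out(&node->left);  /* Children before the node itself. */
    example_tree_out(&node->right);
    free(node);
}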
/*
 * __wt_merge_tree --
 *     Attempt to collapse a stack of split-merge pages in memory into a
 *     shallow tree.  If enough keys are found, create a real internal node
 *     that can be evicted (and, if necessary, split further).
 *
 *     This code is designed to deal with workloads that otherwise create
 *     arbitrarily deep (and slow) trees in memory.
 */
int
__wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
{
    WT_DECL_RET;
    WT_PAGE *lchild, *newtop, *rchild;
    WT_REF *newref;
    WT_VISIT_STATE visit_state;
    uint32_t refcnt, split;
    int promote;
    u_int levels;
    uint8_t page_type;

    WT_CLEAR(visit_state);
    visit_state.session = session;
    lchild = newtop = rchild = NULL;
    page_type = top->type;

    WT_ASSERT(session, __wt_btree_mergeable(top));
    WT_ASSERT(session, top->ref->state == WT_REF_LOCKED);

    /*
     * Walk the subtree, count the references at the bottom level and
     * calculate the maximum depth.
     */
    WT_RET(__merge_walk(session, top, 1, __merge_count, &visit_state));

    /* If there aren't enough useful levels, give up. */
    if (visit_state.maxdepth < WT_MERGE_STACK_MIN)
        return (EBUSY);

    /*
     * Don't allow split merges to generate arbitrarily large pages.
     * Ideally we would choose a size based on the internal_page_max
     * setting for the btree, but we don't have the correct btree handle
     * available.
     */
    if (visit_state.refcnt > WT_MERGE_MAX_REFS)
        return (EBUSY);

    /*
     * Now we either collapse the internal pages into one split-merge page,
     * or if there are "enough" keys, we split into two equal internal
     * pages, each of which can be evicted independently.
     *
     * We set a flag (WT_PM_REC_SPLIT_MERGE) on the created page if it
     * isn't big enough to justify the cost of evicting it.  If splits
     * continue, it will be merged again until it gets over this limit.
     */
    promote = 0;
    refcnt = (uint32_t)visit_state.refcnt;
    if (refcnt >= WT_MERGE_FULL_PAGE && visit_state.seen_live) {
        /*
         * In the normal case where there are live children spread
         * through the subtree, create two child pages.
         *
         * Handle the case where the only live child is first / last
         * specially: put the live child into the top-level page.
         *
         * Set SPLIT_MERGE on the internal pages if there are any live
         * children: they can't be evicted, so there is no point
         * permanently deepening the tree.
         */
        if (visit_state.first_live == visit_state.last_live &&
            (visit_state.first_live == 0 ||
            visit_state.first_live == refcnt - 1))
            split = (visit_state.first_live == 0) ? 1 : refcnt - 1;
        else
            split = (refcnt + 1) / 2;

        /* Only promote if we can create a real page. */
        if (split == 1 || split == refcnt - 1)
            promote = 1;
        else if (split >= WT_MERGE_FULL_PAGE &&
            visit_state.first_live >= split)
            promote = 1;
        else if (refcnt - split >= WT_MERGE_FULL_PAGE &&
            visit_state.last_live < split)
            promote = 1;
    }

    if (promote) {
        /* Create a new top-level split-merge page with two entries. */
        WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop));

        visit_state.split = split;

        /* Left split. */
        if (split == 1)
            visit_state.first = newtop;
        else {
            WT_ERR(__merge_new_page(session, page_type, split,
                visit_state.first_live < split, &lchild));
            visit_state.first = lchild;
        }

        /* Right split. */
        if (split == refcnt - 1) {
            visit_state.second = newtop;
            visit_state.second_ref = &newtop->u.intl.t[1];
        } else {
            WT_ERR(__merge_new_page(session, page_type,
                refcnt - split, visit_state.last_live >= split,
                &rchild));
            visit_state.second = rchild;
            visit_state.second_ref =
                &visit_state.second->u.intl.t[0];
        }
    } else {
        /*
         * Create a new split-merge page for small merges, or if the
         * page above is a split-merge page.  When we do a big enough
         * merge, we create a real page at the top and don't consider
         * it as a merge candidate again.  Over time with an insert
         * workload the tree will grow deeper, but that's inevitable,
         * and this keeps individual merges small.
         */
        WT_ERR(__merge_new_page(session, page_type, refcnt,
            refcnt < WT_MERGE_FULL_PAGE ||
            __wt_btree_mergeable(top->parent),
            &newtop));

        visit_state.first = newtop;
    }

    /*
     * Copy the references into the new tree, but don't update anything in
     * the locked tree in case there is an error and we need to back out.
     * We do this in a separate pass so that we can figure out the key for
     * the split point: that allocates memory and so it could still fail.
     */
    visit_state.page = visit_state.first;
    visit_state.ref = visit_state.page->u.intl.t;
    visit_state.refcnt = 0;
    WT_ERR(__merge_walk(session, top, 0, __merge_copy_ref, &visit_state));

    if (promote) {
        /* Promote keys into the top-level page. */
        if (lchild != NULL) {
            newref = &newtop->u.intl.t[0];
            WT_LINK_PAGE(newtop, newref, lchild);
            newref->state = WT_REF_MEM;
            WT_ERR(__merge_promote_key(session, newref));
        }
        if (rchild != NULL) {
            newref = &newtop->u.intl.t[1];
            WT_LINK_PAGE(newtop, newref, rchild);
            newref->state = WT_REF_MEM;
            WT_ERR(__merge_promote_key(session, newref));
        }
    }

    /*
     * We have copied everything into place and allocated all of the memory
     * we need.  Now link all pages into the new tree and unlock them.
     *
     * The only way this could fail is if a reference state has been
     * changed by another thread since they were locked.  Panic in that
     * case: that should never happen.
     */
    visit_state.page = visit_state.first;
    visit_state.ref = visit_state.page->u.intl.t;
    visit_state.refcnt = 0;
    ret = __merge_walk(session, top, 0, __merge_switch_page, &visit_state);
    if (ret != 0)
        WT_ERR(__wt_illegal_value(session, "__wt_merge_tree"));

    newtop->u.intl.recno = top->u.intl.recno;
    newtop->parent = top->parent;
    newtop->ref = top->ref;

#ifdef HAVE_DIAGNOSTIC
    /*
     * Before swapping in the new tree, walk the pages we are discarding,
     * check that everything looks right.
     */
    __merge_check_discard(session, top);
#endif

    /*
     * Set up the new top-level page as a split so that it will be swapped
     * into place by our caller.
     */
    top->modify->flags = WT_PM_REC_SPLIT;
    top->modify->u.split = newtop;

    WT_VERBOSE_ERR(session, evict,
        "Successfully %s %" PRIu32
        " split-merge pages containing %" PRIu32 " keys\n",
        promote ? "promoted" : "merged", visit_state.maxdepth, refcnt);

    /* Evict new child pages as soon as possible. */
    if (lchild != NULL && !F_ISSET(lchild->modify, WT_PM_REC_SPLIT_MERGE))
        lchild->read_gen = WT_READ_GEN_OLDEST;
    if (rchild != NULL && !F_ISSET(rchild->modify, WT_PM_REC_SPLIT_MERGE))
        rchild->read_gen = WT_READ_GEN_OLDEST;

    /* Update statistics. */
    WT_CSTAT_INCR(session, cache_eviction_merge);
    WT_DSTAT_INCR(session, cache_eviction_merge);

    /* How many levels did we remove? */
    levels = visit_state.maxdepth - (promote ? 2 : 1);
    WT_CSTAT_INCRV(session, cache_eviction_merge_levels, levels);
    WT_DSTAT_INCRV(session, cache_eviction_merge_levels, levels);

    return (0);

err:    WT_VERBOSE_TRET(session, evict,
        "Failed to merge %" PRIu32
        " split-merge pages containing %" PRIu32 " keys\n",
        visit_state.maxdepth, refcnt);

    WT_CSTAT_INCR(session, cache_eviction_merge_fail);
    WT_DSTAT_INCR(session, cache_eviction_merge_fail);

    if (newtop != NULL)
        __wt_page_out(session, &newtop);
    if (lchild != NULL)
        __wt_page_out(session, &lchild);
    if (rchild != NULL)
        __wt_page_out(session, &rchild);
    return (ret);
}
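/*
 * Illustrative sketch (not from the original source): the split-point choice
 * above, restated as a standalone helper.  EXAMPLE_FULL_PAGE is an assumed
 * stand-in for WT_MERGE_FULL_PAGE, not the real value; the logic mirrors the
 * promote/split code in __wt_merge_tree.  For example, with refcnt == 1000
 * and the only live child at slot 999, split is 999 and the left side (999
 * stable references) is worth promoting to a real, evictable page.
 */
#include <stdint.h>

#define EXAMPLE_FULL_PAGE 100       /* Assumed threshold for illustration. */

static uint32_t
example_choose_split(
    uint32_t refcnt, uint32_t first_live, uint32_t last_live, int *promotep)
{
    uint32_t split;

    /* If the only live child is at either end, peel it off by itself. */
    if (first_live == last_live &&
        (first_live == 0 || first_live == refcnt - 1))
        split = first_live == 0 ? 1 : refcnt - 1;
    else
        split = (refcnt + 1) / 2;   /* Otherwise split roughly in half. */

    /* Promote only if at least one side becomes a real (evictable) page. */
    *promotep = 0;
    if (split == 1 || split == refcnt - 1)
        *promotep = 1;
    else if (split >= EXAMPLE_FULL_PAGE && first_live >= split)
        *promotep = 1;
    else if (refcnt - split >= EXAMPLE_FULL_PAGE && last_live < split)
        *promotep = 1;

    return (split);
}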
/*
 * __wt_page_out --
 *     Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
{
    WT_PAGE *page;
    WT_PAGE_HEADER *dsk;
    WT_PAGE_MODIFY *mod;

    /*
     * Kill our caller's reference, do our best to catch races.
     */
    page = *pagep;
    *pagep = NULL;

    if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
        __wt_page_modify_clear(session, page);

    /*
     * We should never discard:
     * - a dirty page,
     * - a page queued for eviction, or
     * - a locked page.
     */
    WT_ASSERT(session, !__wt_page_is_modified(page));
    WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
    WT_ASSERT(session, !__wt_fair_islocked(session, &page->page_lock));

#ifdef HAVE_DIAGNOSTIC
    {
    WT_HAZARD *hp;
    int i;

    /*
     * Make sure no other thread has a hazard pointer on the page we are
     * about to discard. This is complicated by the fact that readers
     * publish their hazard pointer before re-checking the page state, so
     * our check can race with readers without indicating a real problem.
     * Wait for up to a second for hazard pointers to be cleared.
     */
    for (hp = NULL, i = 0; i < 100; i++) {
        if ((hp = __wt_page_hazard_check(session, page)) == NULL)
            break;
        __wt_sleep(0, 10000);
    }
    if (hp != NULL)
        __wt_errx(session,
            "discarded page has hazard pointer: (%p: %s, line %d)",
            hp->page, hp->file, hp->line);
    WT_ASSERT(session, hp == NULL);
    }
#endif

    /*
     * If a root page split, there may be one or more pages linked from the
     * page; walk the list, discarding pages.
     */
    switch (page->type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        mod = page->modify;
        if (mod != NULL && mod->mod_root_split != NULL)
            __wt_page_out(session, &mod->mod_root_split);
        break;
    }

    /* Update the cache's information. */
    __wt_cache_page_evict(session, page);

    /*
     * If discarding the page as part of process exit, the application may
     * configure to leak the memory rather than do the work.
     */
    if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY))
        return;

    /* Free the page modification information. */
    if (page->modify != NULL)
        __free_page_modify(session, page);

    switch (page->type) {
    case WT_PAGE_COL_FIX:
        break;
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        __free_page_int(session, page);
        break;
    case WT_PAGE_COL_VAR:
        __free_page_col_var(session, page);
        break;
    case WT_PAGE_ROW_LEAF:
        __free_page_row_leaf(session, page);
        break;
    }

    /* Discard any disk image. */
    dsk = (WT_PAGE_HEADER *)page->dsk;
    if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
        __wt_overwrite_and_free_len(session, dsk, dsk->mem_size);
    if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
        (void)__wt_mmap_discard(session, dsk, dsk->mem_size);

    __wt_overwrite_and_free(session, page);
}
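/*
 * Illustrative sketch (not from the original source): __wt_page_out takes a
 * WT_PAGE ** and clears the caller's reference before doing any work, so a
 * racing or mistaken reuse of that pointer dereferences NULL instead of
 * freed memory.  The same take-ownership-then-free idiom on a hypothetical
 * object type:
 */
#include <stdlib.h>

struct example_object {
    int payload;
};

static void
example_object_out(struct example_object **objp)
{
    struct example_object *obj;

    obj = *objp;        /* Take ownership of the reference. */
    *objp = NULL;       /* Kill the caller's copy up front. */

    if (obj == NULL)
        return;
    free(obj);
}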
/*
 * __wt_page_inmem --
 *     Build in-memory page information.
 */
int
__wt_page_inmem(WT_SESSION_IMPL *session,
    WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep)
{
    WT_DECL_RET;
    WT_PAGE *page;
    const WT_PAGE_HEADER *dsk;
    uint32_t alloc_entries;
    size_t size;

    *pagep = NULL;

    dsk = image;
    alloc_entries = 0;

    /*
     * Figure out how many underlying objects the page references so we can
     * allocate them along with the page.
     */
    switch (dsk->type) {
    case WT_PAGE_COL_FIX:
    case WT_PAGE_COL_INT:
    case WT_PAGE_COL_VAR:
        /*
         * Column-store leaf page entries map one-to-one to the number
         * of physical entries on the page (each physical entry is a
         * value item).
         *
         * Column-store internal page entries map one-to-one to the
         * number of physical entries on the page (each entry is a
         * location cookie).
         */
        alloc_entries = dsk->u.entries;
        break;
    case WT_PAGE_ROW_INT:
        /*
         * Row-store internal page entries map one-to-two to the number
         * of physical entries on the page (each entry is a key and
         * location cookie pair).
         */
        alloc_entries = dsk->u.entries / 2;
        break;
    case WT_PAGE_ROW_LEAF:
        /*
         * If the "no empty values" flag is set, row-store leaf page
         * entries map one-to-one to the number of physical entries
         * on the page (each physical entry is a key or value item).
         * If that flag is not set, there are more keys than values,
         * we have to walk the page to figure it out.
         */
        if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
            alloc_entries = dsk->u.entries;
        else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
            alloc_entries = dsk->u.entries / 2;
        else
            WT_RET(__inmem_row_leaf_entries(
                session, dsk, &alloc_entries));
        break;
    WT_ILLEGAL_VALUE(session);
    }

    /* Allocate and initialize a new WT_PAGE. */
    WT_RET(__wt_page_alloc(
        session, dsk->type, dsk->recno, alloc_entries, 1, &page));
    page->dsk = dsk;
    F_SET_ATOMIC(page, flags);

    /*
     * Track the memory allocated to build this page so we can update the
     * cache statistics in a single call.
     */
    size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0;

    switch (page->type) {
    case WT_PAGE_COL_FIX:
        __inmem_col_fix(session, page);
        break;
    case WT_PAGE_COL_INT:
        __inmem_col_int(session, page);
        break;
    case WT_PAGE_COL_VAR:
        WT_ERR(__inmem_col_var(session, page, &size));
        break;
    case WT_PAGE_ROW_INT:
        WT_ERR(__inmem_row_int(session, page, &size));
        break;
    case WT_PAGE_ROW_LEAF:
        WT_ERR(__inmem_row_leaf(session, page));
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

    /* Update the page's in-memory size and the cache statistics. */
    __wt_cache_page_inmem_incr(session, page, size);

    /* Link the new internal page to the parent. */
    if (ref != NULL) {
        switch (page->type) {
        case WT_PAGE_COL_INT:
        case WT_PAGE_ROW_INT:
            page->pg_intl_parent_ref = ref;
            break;
        }
        ref->page = page;
    }

    *pagep = page;
    return (0);

err:    __wt_page_out(session, &page);
    return (ret);
}
/*
 * __wt_page_inmem --
 *     Build in-memory page information.
 */
int
__wt_page_inmem(
    WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *parent_ref,
    WT_PAGE_HEADER *dsk, int disk_not_alloc, WT_PAGE **pagep)
{
    WT_DECL_RET;
    WT_PAGE *page;
    uint32_t alloc_entries;
    size_t size;

    alloc_entries = 0;
    *pagep = NULL;

    /*
     * Figure out how many underlying objects the page references so
     * we can allocate them along with the page.
     */
    switch (dsk->type) {
    case WT_PAGE_COL_FIX:
        break;
    case WT_PAGE_COL_INT:
        /*
         * Column-store internal page entries map one-to-one to the
         * number of physical entries on the page (each physical entry
         * is an offset object).
         */
        alloc_entries = dsk->u.entries;
        break;
    case WT_PAGE_COL_VAR:
        /*
         * Column-store leaf page entries map one-to-one to the number
         * of physical entries on the page (each physical entry is a
         * data item).
         */
        alloc_entries = dsk->u.entries;
        break;
    case WT_PAGE_ROW_INT:
        /*
         * Row-store internal page entries map one-to-two to the number
         * of physical entries on the page (each in-memory entry is a
         * key item and location cookie).
         */
        alloc_entries = dsk->u.entries / 2;
        break;
    case WT_PAGE_ROW_LEAF:
        /*
         * Row-store leaf page entries map in an indeterminate way to
         * the physical entries on the page, we have to walk the page
         * to figure it out.
         */
        WT_RET(__inmem_row_leaf_entries(session, dsk, &alloc_entries));
        break;
    WT_ILLEGAL_VALUE(session);
    }

    /* Allocate and initialize a new WT_PAGE. */
    WT_RET(__wt_page_alloc(session, dsk->type, alloc_entries, &page));
    page->dsk = dsk;
    page->read_gen = WT_READ_GEN_NOTSET;
    if (disk_not_alloc)
        F_SET_ATOMIC(page, WT_PAGE_DISK_NOT_ALLOC);

    /*
     * Track the memory allocated to build this page so we can update the
     * cache statistics in a single call.
     */
    size = disk_not_alloc ? 0 : dsk->mem_size;

    switch (page->type) {
    case WT_PAGE_COL_FIX:
        page->entries = dsk->u.entries;
        page->u.col_fix.recno = dsk->recno;
        __inmem_col_fix(session, page);
        break;
    case WT_PAGE_COL_INT:
        page->entries = dsk->u.entries;
        page->u.intl.recno = dsk->recno;
        __inmem_col_int(session, page);
        break;
    case WT_PAGE_COL_VAR:
        page->entries = dsk->u.entries;
        page->u.col_var.recno = dsk->recno;
        WT_ERR(__inmem_col_var(session, page, &size));
        break;
    case WT_PAGE_ROW_INT:
        page->entries = dsk->u.entries / 2;
        WT_ERR(__inmem_row_int(session, page, &size));
        break;
    case WT_PAGE_ROW_LEAF:
        page->entries = alloc_entries;
        WT_ERR(__inmem_row_leaf(session, page));
        break;
    WT_ILLEGAL_VALUE_ERR(session);
    }

    /* Update the page's in-memory size and the cache statistics. */
    __wt_cache_page_inmem_incr(session, page, size);

    /* Link the new page into the parent. */
    if (parent_ref != NULL)
        WT_LINK_PAGE(parent, parent_ref, page);

    *pagep = page;
    return (0);

err:    __wt_page_out(session, &page);
    return (ret);
}
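/*
 * Illustrative sketch (not from the original source): the entry-count
 * mapping used by the first __wt_page_inmem above, restated as a standalone
 * helper (the older version always walks row-store leaf pages).  The enum
 * and flag names are simplified stand-ins for the WT_PAGE_* types and the
 * WT_PAGE_EMPTY_V_ALL / WT_PAGE_EMPTY_V_NONE disk-image flags.
 */
#include <stdint.h>

enum example_page_type {
    EXAMPLE_COL_FIX, EXAMPLE_COL_INT, EXAMPLE_COL_VAR,
    EXAMPLE_ROW_INT, EXAMPLE_ROW_LEAF
};
#define EXAMPLE_EMPTY_V_ALL  0x01u  /* all values on the page are empty */
#define EXAMPLE_EMPTY_V_NONE 0x02u  /* no value on the page is empty */

/*
 * Return the in-memory slots to allocate for n physical cells, or 0 when the
 * page must be walked (row-store leaf with a mix of empty and real values).
 */
static uint32_t
example_alloc_entries(enum example_page_type type, uint32_t n, uint32_t flags)
{
    switch (type) {
    case EXAMPLE_COL_FIX:
    case EXAMPLE_COL_INT:
    case EXAMPLE_COL_VAR:
        return (n);                 /* One cell per in-memory entry. */
    case EXAMPLE_ROW_INT:
        return (n / 2);             /* Key/location-cookie pairs. */
    case EXAMPLE_ROW_LEAF:
        if (flags & EXAMPLE_EMPTY_V_ALL)
            return (n);             /* Keys only on disk. */
        if (flags & EXAMPLE_EMPTY_V_NONE)
            return (n / 2);         /* Strict key/value pairs. */
        return (0);                 /* Indeterminate: walk the page. */
    }
    return (0);
}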
/*
 * __wt_rec_evict --
 *     Reconciliation plus eviction.
 */
int
__wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
{
    WT_DECL_RET;
    WT_PAGE_MODIFY *mod;
    WT_REF *parent_ref;
    int merge, inmem_split, istree;

    WT_VERBOSE_RET(session, evict,
        "page %p (%s)", page, __wt_page_type_string(page->type));

    WT_ASSERT(session, session->excl_next == 0);
    inmem_split = istree = 0;

    /*
     * If we get a split-merge page during normal eviction, try to collapse
     * it.  During close, it will be merged into its parent.
     */
    mod = page->modify;
    merge = __wt_btree_mergeable(page);
    if (merge && exclusive)
        return (EBUSY);

    WT_ASSERT(session,
        merge || mod == NULL || !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE));

    /*
     * Get exclusive access to the page and review the page and its subtree
     * for conditions that would block our eviction of the page.  If the
     * check fails (for example, we find a child page that can't be
     * merged), we're done.  We have to make this check for clean pages,
     * too: while it's unlikely eviction would choose an internal page with
     * children, it's not disallowed anywhere.
     *
     * Note that page->ref may be NULL in some cases (e.g., for root pages
     * or during salvage).  That's OK if exclusive is set: we won't check
     * hazard pointers in that case.
     */
    parent_ref = page->ref;
    WT_ERR(__rec_review(session, parent_ref, page,
        exclusive, merge, 1, &inmem_split, &istree));

    /* Try to merge internal pages. */
    if (merge)
        WT_ERR(__wt_merge_tree(session, page));

    /*
     * Try to split the page in memory.  If the split succeeds, it swaps
     * the new pages into place: there is no need for further cleanup.
     */
    if (inmem_split) {
        WT_ERR(__wt_split_page_inmem(session, page));
        WT_STAT_FAST_CONN_INCR(session, cache_inmem_split);
        WT_STAT_FAST_DATA_INCR(session, cache_inmem_split);
        goto done;
    }

    /*
     * Update the page's modification reference, reconciliation might have
     * changed it.
     */
    mod = page->modify;

    /* Count evictions of internal pages during normal operation. */
    if (!exclusive && !merge &&
        (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) {
        WT_STAT_FAST_CONN_INCR(session, cache_eviction_internal);
        WT_STAT_FAST_DATA_INCR(session, cache_eviction_internal);
    }

    /*
     * Update the parent and discard the page.
     */
    if (mod == NULL || !F_ISSET(mod, WT_PM_REC_MASK)) {
        WT_ASSERT(session,
            exclusive || parent_ref->state == WT_REF_LOCKED);

        if (WT_PAGE_IS_ROOT(page))
            __rec_root_update(session);
        else
            __rec_page_clean_update(session, parent_ref);

        WT_STAT_FAST_CONN_INCR(session, cache_eviction_clean);
        WT_STAT_FAST_DATA_INCR(session, cache_eviction_clean);
    } else {
        if (WT_PAGE_IS_ROOT(page))
            __rec_root_update(session);
        else
            WT_ERR(__rec_page_dirty_update(
                session, parent_ref, page));

        WT_STAT_FAST_CONN_INCR(session, cache_eviction_dirty);
        WT_STAT_FAST_DATA_INCR(session, cache_eviction_dirty);
    }

    /* Discard the page or the tree rooted in this page. */
    if (istree)
        __rec_discard_tree(session, page, exclusive);
    else
        __wt_page_out(session, &page);

    if (0) {
err:        /*
         * If unable to evict this page, release the exclusive
         * reference(s) we've acquired.
         */
        __rec_excl_clear(session);

        WT_STAT_FAST_CONN_INCR(session, cache_eviction_fail);
        WT_STAT_FAST_DATA_INCR(session, cache_eviction_fail);
    }

done:    session->excl_next = 0;

    return (ret);
}
/*
 * __wt_page_out --
 *     Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
{
    WT_PAGE *page;
    WT_PAGE_HEADER *dsk;
    WT_PAGE_MODIFY *mod;

    /*
     * Kill our caller's reference, do our best to catch races.
     */
    page = *pagep;
    *pagep = NULL;

    if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
        __wt_page_modify_clear(session, page);

    /*
     * We should never discard:
     * - a dirty page,
     * - a page queued for eviction, or
     * - a locked page.
     */
    WT_ASSERT(session, !__wt_page_is_modified(page));
    WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
    WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock));

    /*
     * If a root page split, there may be one or more pages linked from the
     * page; walk the list, discarding pages.
     */
    switch (page->type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        mod = page->modify;
        if (mod != NULL && mod->mod_root_split != NULL)
            __wt_page_out(session, &mod->mod_root_split);
        break;
    }

    /* Update the cache's information. */
    __wt_cache_page_evict(session, page);

    dsk = (WT_PAGE_HEADER *)page->dsk;
    if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
        __wt_cache_page_image_decr(session, dsk->mem_size);

    /* Discard any mapped image. */
    if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
        (void)S2BT(session)->bm->map_discard(
            S2BT(session)->bm, session, dsk, (size_t)dsk->mem_size);

    /*
     * If discarding the page as part of process exit, the application may
     * configure to leak the memory rather than do the work.
     */
    if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY))
        return;

    /* Free the page modification information. */
    if (page->modify != NULL)
        __free_page_modify(session, page);

    switch (page->type) {
    case WT_PAGE_COL_FIX:
        break;
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        __free_page_int(session, page);
        break;
    case WT_PAGE_COL_VAR:
        __free_page_col_var(session, page);
        break;
    case WT_PAGE_ROW_LEAF:
        __free_page_row_leaf(session, page);
        break;
    }

    /* Discard any allocated disk image. */
    if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
        __wt_overwrite_and_free_len(session, dsk, dsk->mem_size);

    __wt_overwrite_and_free(session, page);
}