/*
 * __merge_switch_page --
 *	Switch a page from the locked tree into the new tree.
 *
 *	Called once per reference while walking the bottom level of the old
 *	(locked) tree: each call consumes the next WT_REF slot in the page
 *	being built and clears the old reference so the old tree can be
 *	discarded without touching the children we moved.
 */
static void
__merge_switch_page(WT_PAGE *parent, WT_REF *ref, WT_VISIT_STATE *state)
{
	WT_PAGE *child;
	WT_PAGE_MODIFY *modify;
	WT_REF *newref;

	/*
	 * If the merge is splitting into two pages, switch from filling the
	 * first page to filling the second once we reach the split point.
	 * Note state->refcnt doubles as the running position in the walk.
	 */
	if (state->split != 0 && state->refcnt++ == state->split) {
		state->page = state->second;
		state->ref = state->second_ref;
	}

	/* Claim the next reference slot in the page being filled. */
	newref = state->ref++;

	/*
	 * Transfer the memory footprint of the address cell (and, for
	 * row-store internal pages, the instantiated key) from the old
	 * parent to the new page, so cache accounting follows the data.
	 */
	if (ref->addr != NULL)
		__merge_transfer_footprint(
		    state->session, state->page, parent,
		    (uint32_t)sizeof(WT_ADDR) + ((WT_ADDR *)ref->addr)->size);

	if (parent->type == WT_PAGE_ROW_INT)
		__merge_transfer_footprint(
		    state->session, state->page, parent,
		    (uint32_t)sizeof(WT_IKEY) + ((WT_IKEY *)ref->u.key)->size);

	if (ref->state == WT_REF_LOCKED) {
		child = ref->page;

		/*
		 * If the child has been split, update the split page to point
		 * into the new tree.  That way, if the split-merge page is
		 * later swapped into place, it will point to the new parent.
		 *
		 * The order here is important: the parent page should point to
		 * the original child page, so we link that in last.
		 */
		if ((modify = child->modify) != NULL &&
		    F_ISSET(modify, WT_PM_REC_SPLIT))
			WT_LINK_PAGE(state->page, newref, modify->u.split);

		WT_LINK_PAGE(state->page, newref, child);

		/*
		 * If we have a child that is a live internal page, its subtree
		 * was locked by __rec_review.  We're swapping it into the new
		 * tree, unlock it now.
		 */
		if (child->type == WT_PAGE_ROW_INT ||
		    child->type == WT_PAGE_COL_INT)
			__merge_unlock(child);

		/* The child is now reachable through the new tree. */
		newref->state = WT_REF_MEM;
	}

	/*
	 * Clear the old reference: teardown of the locked tree must not
	 * find (and free) the child we just moved.
	 */
	WT_CLEAR(*ref);
}
/*
 * __wt_merge_tree --
 *	Attempt to collapse a stack of split-merge pages in memory into a
 *	shallow tree.  If enough keys are found, create a real internal node
 *	that can be evicted (and, if necessary, split further).
 *
 *	This code is designed to deal with workloads that otherwise create
 *	arbitrarily deep (and slow) trees in memory.
 *
 *	Returns 0 on success, EBUSY if the subtree isn't worth merging, or
 *	an error from allocation/key-promotion (the locked tree is left
 *	unmodified in the error case).
 */
int
__wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
{
	WT_DECL_RET;
	WT_PAGE *lchild, *newtop, *rchild;
	WT_REF *newref;
	WT_VISIT_STATE visit_state;
	uint32_t refcnt, split;
	int promote;
	u_int levels;
	uint8_t page_type;

	WT_CLEAR(visit_state);
	visit_state.session = session;
	lchild = newtop = rchild = NULL;
	page_type = top->type;

	WT_ASSERT(session, __wt_btree_mergeable(top));
	WT_ASSERT(session, top->ref->state == WT_REF_LOCKED);

	/*
	 * Walk the subtree, count the references at the bottom level and
	 * calculate the maximum depth.
	 */
	WT_RET(__merge_walk(session, top, 1, __merge_count, &visit_state));

	/* If there aren't enough useful levels, give up. */
	if (visit_state.maxdepth < WT_MERGE_STACK_MIN)
		return (EBUSY);

	/*
	 * Don't allow split merges to generate arbitrarily large pages.
	 * Ideally we would choose a size based on the internal_page_max
	 * setting for the btree, but we don't have the correct btree handle
	 * available.
	 */
	if (visit_state.refcnt > WT_MERGE_MAX_REFS)
		return (EBUSY);

	/*
	 * Now we either collapse the internal pages into one split-merge page,
	 * or if there are "enough" keys, we split into two equal internal
	 * pages, each of which can be evicted independently.
	 *
	 * We set a flag (WT_PM_REC_SPLIT_MERGE) on the created page if it
	 * isn't big enough to justify the cost of evicting it.  If splits
	 * continue, it will be merged again until it gets over this limit.
	 */
	promote = 0;
	refcnt = (uint32_t)visit_state.refcnt;
	if (refcnt >= WT_MERGE_FULL_PAGE && visit_state.seen_live) {
		/*
		 * In the normal case where there are live children spread
		 * through the subtree, create two child pages.
		 *
		 * Handle the case where the only live child is first / last
		 * specially: put the live child into the top-level page.
		 *
		 * Set SPLIT_MERGE on the internal pages if there are any live
		 * children: they can't be evicted, so there is no point
		 * permanently deepening the tree.
		 */
		if (visit_state.first_live == visit_state.last_live &&
		    (visit_state.first_live == 0 ||
		    visit_state.first_live == refcnt - 1))
			split = (visit_state.first_live == 0) ?
			    1 : refcnt - 1;
		else
			split = (refcnt + 1) / 2;

		/* Only promote if we can create a real page. */
		if (split == 1 || split == refcnt - 1)
			promote = 1;
		else if (split >= WT_MERGE_FULL_PAGE &&
		    visit_state.first_live >= split)
			promote = 1;
		else if (refcnt - split >= WT_MERGE_FULL_PAGE &&
		    visit_state.last_live < split)
			promote = 1;
	}

	if (promote) {
		/* Create a new top-level split-merge page with two entries. */
		WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop));

		visit_state.split = split;

		/* Left split. */
		if (split == 1)
			visit_state.first = newtop;
		else {
			/*
			 * The left page is a split-merge page only if a live
			 * child lands in it (first_live < split).
			 */
			WT_ERR(__merge_new_page(session, page_type, split,
			    visit_state.first_live < split, &lchild));
			visit_state.first = lchild;
		}

		/* Right split. */
		if (split == refcnt - 1) {
			visit_state.second = newtop;
			visit_state.second_ref = &newtop->u.intl.t[1];
		} else {
			WT_ERR(__merge_new_page(session, page_type,
			    refcnt - split, visit_state.last_live >= split,
			    &rchild));
			visit_state.second = rchild;
			visit_state.second_ref =
			    &visit_state.second->u.intl.t[0];
		}
	} else {
		/*
		 * Create a new split-merge page for small merges, or if the
		 * page above is a split merge page.  When we do a big enough
		 * merge, we create a real page at the top and don't consider
		 * it as a merge candidate again.  Over time with an insert
		 * workload the tree will grow deeper, but that's inevitable,
		 * and this keeps individual merges small.
		 */
		WT_ERR(__merge_new_page(session, page_type, refcnt,
		    refcnt < WT_MERGE_FULL_PAGE ||
		    __wt_btree_mergeable(top->parent), &newtop));

		visit_state.first = newtop;
	}

	/*
	 * Copy the references into the new tree, but don't update anything in
	 * the locked tree in case there is an error and we need to back out.
	 * We do this in a separate pass so that we can figure out the key for
	 * the split point: that allocates memory and so it could still fail.
	 */
	visit_state.page = visit_state.first;
	visit_state.ref = visit_state.page->u.intl.t;
	visit_state.refcnt = 0;
	WT_ERR(__merge_walk(session, top, 0, __merge_copy_ref, &visit_state));

	if (promote) {
		/* Promote keys into the top-level page. */
		if (lchild != NULL) {
			newref = &newtop->u.intl.t[0];
			WT_LINK_PAGE(newtop, newref, lchild);
			newref->state = WT_REF_MEM;
			WT_ERR(__merge_promote_key(session, newref));
		}

		if (rchild != NULL) {
			newref = &newtop->u.intl.t[1];
			WT_LINK_PAGE(newtop, newref, rchild);
			newref->state = WT_REF_MEM;
			WT_ERR(__merge_promote_key(session, newref));
		}
	}

	/*
	 * We have copied everything into place and allocated all of the memory
	 * we need.  Now link all pages into the new tree and unlock them.
	 *
	 * The only way this could fail is if a reference state has been
	 * changed by another thread since they were locked.  Panic in that
	 * case: that should never happen.
	 */
	visit_state.page = visit_state.first;
	visit_state.ref = visit_state.page->u.intl.t;
	visit_state.refcnt = 0;
	ret = __merge_walk(session, top, 0, __merge_switch_page, &visit_state);
	if (ret != 0)
		WT_ERR(__wt_illegal_value(session, "__wt_merge_tree"));

	/* The new top-level page takes over the old page's position. */
	newtop->u.intl.recno = top->u.intl.recno;
	newtop->parent = top->parent;
	newtop->ref = top->ref;

#ifdef HAVE_DIAGNOSTIC
	/*
	 * Before swapping in the new tree, walk the pages we are discarding,
	 * check that everything looks right.
	 */
	__merge_check_discard(session, top);
#endif

	/*
	 * Set up the new top-level page as a split so that it will be swapped
	 * into place by our caller.
	 */
	top->modify->flags = WT_PM_REC_SPLIT;
	top->modify->u.split = newtop;

	WT_VERBOSE_ERR(session, evict,
	    "Successfully %s %" PRIu32
	    " split-merge pages containing %" PRIu32 " keys\n",
	    promote ? "promoted" : "merged", visit_state.maxdepth, refcnt);

	/* Evict new child pages as soon as possible. */
	if (lchild != NULL && !F_ISSET(lchild->modify, WT_PM_REC_SPLIT_MERGE))
		lchild->read_gen = WT_READ_GEN_OLDEST;
	if (rchild != NULL && !F_ISSET(rchild->modify, WT_PM_REC_SPLIT_MERGE))
		rchild->read_gen = WT_READ_GEN_OLDEST;

	/* Update statistics. */
	WT_CSTAT_INCR(session, cache_eviction_merge);
	WT_DSTAT_INCR(session, cache_eviction_merge);

	/* How many levels did we remove? */
	levels = visit_state.maxdepth - (promote ? 2 : 1);
	WT_CSTAT_INCRV(session, cache_eviction_merge_levels, levels);
	WT_DSTAT_INCRV(session, cache_eviction_merge_levels, levels);

	return (0);

	/*
	 * Error path: the locked tree was never modified (the switch pass
	 * either ran to completion or panicked), so discarding the pages we
	 * allocated is sufficient to back out.
	 */
err:	WT_VERBOSE_TRET(session, evict,
	    "Failed to merge %" PRIu32
	    " split-merge pages containing %" PRIu32 " keys\n",
	    visit_state.maxdepth, refcnt);

	WT_CSTAT_INCR(session, cache_eviction_merge_fail);
	WT_DSTAT_INCR(session, cache_eviction_merge_fail);

	if (newtop != NULL)
		__wt_page_out(session, &newtop);
	if (lchild != NULL)
		__wt_page_out(session, &lchild);
	if (rchild != NULL)
		__wt_page_out(session, &rchild);
	return (ret);
}
/*
 * __wt_page_inmem --
 *	Build in-memory page information.
 *
 *	Construct a WT_PAGE from a disk image, sizing the per-entry arrays
 *	from the page type, then dispatching to the type-specific build
 *	routine.  On success the page is optionally linked into its parent
 *	and returned via pagep; on error the page is discarded and pagep is
 *	left NULL.
 */
int
__wt_page_inmem(
    WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *parent_ref,
    WT_PAGE_HEADER *dsk, int disk_not_alloc, WT_PAGE **pagep)
{
	WT_DECL_RET;
	WT_PAGE *newpage;
	uint32_t nalloc;
	size_t incr;

	*pagep = NULL;
	nalloc = 0;

	/*
	 * Work out how many underlying objects the disk image implies so
	 * they can be allocated along with the page itself.
	 */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
		/* Fixed-length column stores need no per-entry objects. */
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		/*
		 * Column-store internal and variable-length leaf pages map
		 * one-to-one to the physical entries on the page (each
		 * physical entry is an offset object or a data item).
		 */
		nalloc = dsk->u.entries;
		break;
	case WT_PAGE_ROW_INT:
		/*
		 * Row-store internal pages pair a key item with a location
		 * cookie: two physical entries per in-memory entry.
		 */
		nalloc = dsk->u.entries / 2;
		break;
	case WT_PAGE_ROW_LEAF:
		/*
		 * Row-store leaf pages have no fixed mapping to physical
		 * entries; walk the page to count them.
		 */
		WT_RET(__inmem_row_leaf_entries(session, dsk, &nalloc));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Allocate and initialize a new WT_PAGE. */
	WT_RET(__wt_page_alloc(session, dsk->type, nalloc, &newpage));
	newpage->dsk = dsk;
	newpage->read_gen = WT_READ_GEN_NOTSET;
	if (disk_not_alloc)
		F_SET_ATOMIC(newpage, WT_PAGE_DISK_NOT_ALLOC);

	/*
	 * Track the memory allocated to build this page so we can update the
	 * cache statistics in a single call.  A disk image we don't own
	 * contributes nothing.
	 */
	if (disk_not_alloc)
		incr = 0;
	else
		incr = dsk->mem_size;

	/* Fill in the type-specific page fields and build the page. */
	switch (newpage->type) {
	case WT_PAGE_COL_FIX:
		newpage->entries = dsk->u.entries;
		newpage->u.col_fix.recno = dsk->recno;
		__inmem_col_fix(session, newpage);
		break;
	case WT_PAGE_COL_INT:
		newpage->entries = dsk->u.entries;
		newpage->u.intl.recno = dsk->recno;
		__inmem_col_int(session, newpage);
		break;
	case WT_PAGE_COL_VAR:
		newpage->entries = dsk->u.entries;
		newpage->u.col_var.recno = dsk->recno;
		WT_ERR(__inmem_col_var(session, newpage, &incr));
		break;
	case WT_PAGE_ROW_INT:
		newpage->entries = dsk->u.entries / 2;
		WT_ERR(__inmem_row_int(session, newpage, &incr));
		break;
	case WT_PAGE_ROW_LEAF:
		newpage->entries = nalloc;
		WT_ERR(__inmem_row_leaf(session, newpage));
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

	/* Update the page's in-memory size and the cache statistics. */
	__wt_cache_page_inmem_incr(session, newpage, incr);

	/* Link the new page into the parent, if there is one. */
	if (parent_ref != NULL)
		WT_LINK_PAGE(parent, parent_ref, newpage);

	*pagep = newpage;
	return (0);

err:	__wt_page_out(session, &newpage);
	return (ret);
}