/*
 * __wt_compact --
 *	Compact a file.
 *
 *	Reads the "trigger" configuration value, asks the block manager whether
 *	compaction is worthwhile, then reviews in-memory pages (via a cache
 *	operation) and walks the tree, marking each returned page and the tree
 *	dirty so the pages are re-written.
 *
 *	Returns 0 on success (including when compaction is skipped), or the
 *	first non-zero error reported by a callee.
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	WT_PAGE *page;
	int trigger, skip;

	bm = S2BT(session)->bm;

	WT_DSTAT_INCR(session, session_compact);

	WT_RET(__wt_config_gets(session, cfg, "trigger", &cval));
	trigger = (int)cval.val;

	/* Check if compaction might be useful. */
	WT_RET(bm->compact_skip(bm, session, trigger, &skip));
	if (skip)
		return (0);

	/*
	 * Walk the cache reviewing in-memory pages to see if they need to be
	 * re-written.  This requires looking at page reconciliation results,
	 * which means the page cannot be reconciled at the same time as it's
	 * being reviewed for compaction.  The underlying functions ensure we
	 * don't collide with page eviction, but we need to make sure we don't
	 * collide with checkpoints either, they are the other operation that
	 * can reconcile a page.
	 *
	 * The result is captured and only checked after the lock has been
	 * released: returning directly from inside the locked region would
	 * leave the metadata lock held on the error path.
	 */
	__wt_spin_lock(session, &S2C(session)->metadata_lock);
	ret = __wt_bt_cache_op(session, NULL, WT_SYNC_COMPACT);
	__wt_spin_unlock(session, &S2C(session)->metadata_lock);
	WT_RET(ret);

	/*
	 * Walk the tree, reviewing on-disk pages to see if they need to be
	 * re-written.
	 */
	for (page = NULL;;) {
		WT_RET(__wt_tree_walk(session, &page, WT_TREE_COMPACT));
		if (page == NULL)
			break;

		/*
		 * The only pages returned by the tree walk function are pages
		 * we want to re-write; mark the page and tree dirty.  If the
		 * modify structure can't be set up, release the page we're
		 * holding before returning the error.
		 */
		if ((ret = __wt_page_modify_init(session, page)) != 0) {
			WT_TRET(__wt_page_release(session, page));
			WT_RET(ret);
		}
		__wt_page_and_tree_modify_set(session, page);

		WT_DSTAT_INCR(session, btree_compact_rewrite);
	}

	return (0);
}
/*
 * __wt_btcur_next_random --
 *	Position the cursor on a randomly chosen record in the tree.
 *
 *	The whole operation is repeated for as long as it finishes with
 *	WT_RESTART.  Any other result is passed through the cursor resolve
 *	function and returned to the caller.
 */
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = cbt->btree;

	WT_DSTAT_INCR(session, cursor_next);

	for (;;) {
		WT_RET(__cursor_func_init(cbt, 1));
		__cursor_position_clear(cbt);

		/*
		 * Row-store only: picking a random value out of a column-store
		 * is trivial for the application, should it ever want to.
		 */
		if (btree->type != BTREE_ROW) {
			WT_ERR(ENOTSUP);
		}
		WT_ERR(__wt_row_random(session, cbt));

		/* Only an exact match yields a key/value pair to return. */
		if (cbt->compare == 0)
			ret = __wt_kv_return(session, cbt);
		else
			ret = WT_NOTFOUND;

err:		if (ret != WT_RESTART)
			break;
	}

	WT_TRET(__cursor_func_resolve(cbt, ret));
	return (ret);
}
/*
 * __hazard_exclusive --
 *	Try to take exclusive access to the page referenced by ref.
 *
 *	Returns 0 when the page is locked and no matching hazard pointer was
 *	found, EBUSY when the page could not be locked or a hazard pointer
 *	matches it, or an error from growing the tracking array.  Once the
 *	reference is locked it is recorded in the session's exclusive list,
 *	even when EBUSY is subsequently returned for a hazard pointer.
 */
static int
__hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
{
	/*
	 * The exclusive list must have room for one more entry before we lock
	 * anything, so the reference can always be tracked for later cleanup.
	 * Grow the list in chunks of 50 slots when it is exactly full.
	 */
	if (session->excl_next * sizeof(WT_REF *) == session->excl_allocated) {
		WT_RET(__wt_realloc(session, &session->excl_allocated,
		    (session->excl_next + 50) * sizeof(WT_REF *),
		    &session->excl));
	}

	/*
	 * Hazard pointers are acquired down the tree, so there is no deadlock.
	 *
	 * The top-level page is expected to be locked already; child pages are
	 * switched from in-memory to locked here.  Losing the swap means some
	 * other thread has the page: give up.
	 */
	if (!top) {
		if (!WT_ATOMIC_CAS(ref->state, WT_REF_MEM, WT_REF_LOCKED))
			return (EBUSY);	/* We couldn't change the state. */
	}
	WT_ASSERT(session, ref->state == WT_REF_LOCKED);

	/* Remember the reference so the lock can be released later. */
	session->excl[session->excl_next] = ref;
	++session->excl_next;

	/* A matching hazard pointer means the page is in use. */
	if (__wt_page_hazard_check(session, ref->page) != NULL) {
		WT_DSTAT_INCR(session, cache_eviction_hazard);
		WT_CSTAT_INCR(session, cache_eviction_hazard);

		WT_VERBOSE_RET(session,
		    evict, "page %p hazard request failed", ref->page);
		return (EBUSY);
	}

	return (0);
}
/*
 * __wt_merge_tree --
 *	Attempt to collapse a stack of split-merge pages in memory into a
 *	shallow tree.  If enough keys are found, create a real internal node
 *	that can be evicted (and, if necessary, split further).
 *
 *	This code is designed to deal with workloads that otherwise create
 *	arbitrarily deep (and slow) trees in memory.
 *
 *	The work is done in passes over the subtree rooted at "top":
 *	  1. count the bottom-level references and the maximum depth;
 *	  2. allocate the replacement page(s) and copy references into them;
 *	  3. switch the pages over to the new tree.
 *	On success, the new top-level page is stored in top->modify->u.split
 *	with top->modify->flags set to WT_PM_REC_SPLIT, and 0 is returned.
 *
 *	Returns EBUSY when the subtree has fewer than WT_MERGE_STACK_MIN levels
 *	or more than WT_MERGE_MAX_REFS references.  On any error after new
 *	pages were allocated, those pages are discarded before returning.
 */
int
__wt_merge_tree(WT_SESSION_IMPL *session, WT_PAGE *top)
{
	WT_DECL_RET;
	WT_PAGE *lchild, *newtop, *rchild;
	WT_REF *newref;
	WT_VISIT_STATE visit_state;
	uint32_t refcnt, split;
	int promote;
	u_int levels;
	uint8_t page_type;

	WT_CLEAR(visit_state);
	visit_state.session = session;
	/* NULL marks "not allocated" for the error path and later checks. */
	lchild = newtop = rchild = NULL;
	page_type = top->type;

	WT_ASSERT(session, __wt_btree_mergeable(top));
	WT_ASSERT(session, top->ref->state == WT_REF_LOCKED);

	/*
	 * Walk the subtree, count the references at the bottom level and
	 * calculate the maximum depth.
	 */
	WT_RET(__merge_walk(session, top, 1, __merge_count, &visit_state));

	/* If there aren't enough useful levels, give up. */
	if (visit_state.maxdepth < WT_MERGE_STACK_MIN)
		return (EBUSY);

	/*
	 * Don't allow split merges to generate arbitrarily large pages.
	 * Ideally we would choose a size based on the internal_page_max
	 * setting for the btree, but we don't have the correct btree handle
	 * available.
	 */
	if (visit_state.refcnt > WT_MERGE_MAX_REFS)
		return (EBUSY);

	/*
	 * Now we either collapse the internal pages into one split-merge page,
	 * or if there are "enough" keys, we split into two equal internal
	 * pages, each of which can be evicted independently.
	 *
	 * We set a flag (WT_PM_REC_SPLIT_MERGE) on the created page if it
	 * isn't big enough to justify the cost of evicting it.  If splits
	 * continue, it will be merged again until it gets over this limit.
	 */
	promote = 0;
	refcnt = (uint32_t)visit_state.refcnt;
	if (refcnt >= WT_MERGE_FULL_PAGE && visit_state.seen_live) {
		/*
		 * In the normal case where there are live children spread
		 * through the subtree, create two child pages.
		 *
		 * Handle the case where the only live child is first / last
		 * specially: put the live child into the top-level page.
		 *
		 * Set SPLIT_MERGE on the internal pages if there are any live
		 * children: they can't be evicted, so there is no point
		 * permanently deepening the tree.
		 */
		if (visit_state.first_live == visit_state.last_live &&
		    (visit_state.first_live == 0 ||
		    visit_state.first_live == refcnt - 1))
			split = (visit_state.first_live == 0) ? 1 : refcnt - 1;
		else
			split = (refcnt + 1) / 2;

		/* Only promote if we can create a real page. */
		if (split == 1 || split == refcnt - 1)
			promote = 1;
		else if (split >= WT_MERGE_FULL_PAGE &&
		    visit_state.first_live >= split)
			promote = 1;
		else if (refcnt - split >= WT_MERGE_FULL_PAGE &&
		    visit_state.last_live < split)
			promote = 1;
	}

	/*
	 * Note: "split" is only assigned inside the block above, and is only
	 * read when promote is set (which also only happens in that block).
	 */
	if (promote) {
		/* Create a new top-level split-merge page with two entries. */
		WT_ERR(__merge_new_page(session, page_type, 2, 1, &newtop));

		visit_state.split = split;

		/*
		 * Left split.  A one-entry left side lives directly in the new
		 * top-level page, no separate child page is allocated.
		 */
		if (split == 1)
			visit_state.first = newtop;
		else {
			WT_ERR(__merge_new_page(session, page_type, split,
			    visit_state.first_live < split, &lchild));
			visit_state.first = lchild;
		}

		/*
		 * Right split.  A one-entry right side lives directly in the
		 * second slot of the new top-level page.
		 */
		if (split == refcnt - 1) {
			visit_state.second = newtop;
			visit_state.second_ref = &newtop->u.intl.t[1];
		} else {
			WT_ERR(__merge_new_page(session, page_type,
			    refcnt - split, visit_state.last_live >= split,
			    &rchild));
			visit_state.second = rchild;
			visit_state.second_ref =
			    &visit_state.second->u.intl.t[0];
		}
	} else {
		/*
		 * Create a new split-merge page for small merges, or if the
		 * page above is a split merge page.  When we do a big enough
		 * merge, we create a real page at the top and don't consider
		 * it as a merge candidate again.  Over time with an insert
		 * workload the tree will grow deeper, but that's inevitable,
		 * and this keeps individual merges small.
		 */
		WT_ERR(__merge_new_page(session, page_type, refcnt,
		    refcnt < WT_MERGE_FULL_PAGE ||
		    __wt_btree_mergeable(top->parent),
		    &newtop));

		visit_state.first = newtop;
	}

	/*
	 * Copy the references into the new tree, but don't update anything in
	 * the locked tree in case there is an error and we need to back out.
	 * We do this in a separate pass so that we can figure out the key for
	 * the split point: that allocates memory and so it could still fail.
	 */
	visit_state.page = visit_state.first;
	visit_state.ref = visit_state.page->u.intl.t;
	visit_state.refcnt = 0;
	WT_ERR(__merge_walk(session, top, 0, __merge_copy_ref, &visit_state));

	if (promote) {
		/* Promote keys into the top-level page. */
		if (lchild != NULL) {
			newref = &newtop->u.intl.t[0];
			WT_LINK_PAGE(newtop, newref, lchild);
			newref->state = WT_REF_MEM;
			WT_ERR(__merge_promote_key(session, newref));
		}

		if (rchild != NULL) {
			newref = &newtop->u.intl.t[1];
			WT_LINK_PAGE(newtop, newref, rchild);
			newref->state = WT_REF_MEM;
			WT_ERR(__merge_promote_key(session, newref));
		}
	}

	/*
	 * We have copied everything into place and allocated all of the memory
	 * we need.  Now link all pages into the new tree and unlock them.
	 *
	 * The only way this could fail is if a reference state has been
	 * changed by another thread since they were locked.  Panic in that
	 * case: that should never happen.
	 */
	visit_state.page = visit_state.first;
	visit_state.ref = visit_state.page->u.intl.t;
	visit_state.refcnt = 0;
	ret = __merge_walk(session, top, 0, __merge_switch_page, &visit_state);

	if (ret != 0)
		WT_ERR(__wt_illegal_value(session, "__wt_merge_tree"));

	/* The new top-level page takes over the old page's place. */
	newtop->u.intl.recno = top->u.intl.recno;
	newtop->parent = top->parent;
	newtop->ref = top->ref;

#ifdef HAVE_DIAGNOSTIC
	/*
	 * Before swapping in the new tree, walk the pages we are discarding,
	 * check that everything looks right.
	 */
	__merge_check_discard(session, top);
#endif

	/*
	 * Set up the new top-level page as a split so that it will be swapped
	 * into place by our caller.
	 */
	top->modify->flags = WT_PM_REC_SPLIT;
	top->modify->u.split = newtop;

	/*
	 * NOTE(review): if this verbose call can fail, the error path below
	 * discards newtop while top->modify->u.split still points at it --
	 * confirm whether that can happen in practice.
	 */
	WT_VERBOSE_ERR(session, evict,
	    "Successfully %s %" PRIu32
	    " split-merge pages containing %" PRIu32 " keys\n",
	    promote ? "promoted" : "merged", visit_state.maxdepth, refcnt);

	/* Evict new child pages as soon as possible. */
	if (lchild != NULL && !F_ISSET(lchild->modify, WT_PM_REC_SPLIT_MERGE))
		lchild->read_gen = WT_READ_GEN_OLDEST;
	if (rchild != NULL && !F_ISSET(rchild->modify, WT_PM_REC_SPLIT_MERGE))
		rchild->read_gen = WT_READ_GEN_OLDEST;

	/* Update statistics. */
	WT_CSTAT_INCR(session, cache_eviction_merge);
	WT_DSTAT_INCR(session, cache_eviction_merge);

	/*
	 * How many levels did we remove?  A promoted tree keeps two levels
	 * (the new top-level page plus its children), a plain merge keeps one.
	 */
	levels = visit_state.maxdepth - (promote ? 2 : 1);
	WT_CSTAT_INCRV(session, cache_eviction_merge_levels, levels);
	WT_DSTAT_INCRV(session, cache_eviction_merge_levels, levels);

	return (0);

	/* Failure: report, count, and discard any pages we allocated. */
err:	WT_VERBOSE_TRET(session, evict,
	    "Failed to merge %" PRIu32
	    " split-merge pages containing %" PRIu32 " keys\n",
	    visit_state.maxdepth, refcnt);

	WT_CSTAT_INCR(session, cache_eviction_merge_fail);
	WT_DSTAT_INCR(session, cache_eviction_merge_fail);

	if (newtop != NULL)
		__wt_page_out(session, &newtop);
	if (lchild != NULL)
		__wt_page_out(session, &lchild);
	if (rchild != NULL)
		__wt_page_out(session, &rchild);
	return (ret);
}
/*
 * __wt_btcur_prev --
 *	Move to the previous record in the tree.
 *
 *	cbt is the btree cursor to move.  A non-zero "discard" adds
 *	WT_TREE_DISCARD to the tree-walk flags and causes the page's modify
 *	structure to be initialized and its write generation saved in the
 *	cursor each time a page is about to be read.
 *
 *	The whole operation restarts from the top if it ends with WT_RESTART.
 *	Otherwise the result (0, WT_NOTFOUND when the walk runs out of pages,
 *	or another error) is passed through the cursor resolve function and
 *	returned.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, int discard)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	uint32_t flags;
	int newpage;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	WT_DSTAT_INCR(session, cursor_prev);

	flags = WT_TREE_SKIP_INTL | WT_TREE_PREV;	/* Tree walk flags. */
	if (discard)
		LF_SET(WT_TREE_DISCARD);

retry:	WT_RET(__cursor_func_init(cbt, 0));
	__cursor_position_clear(cbt);

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt, 0);

	/*
	 * If this is a modification, we're about to read information from the
	 * page, save the write generation.
	 */
	page = cbt->page;
	if (discard && page != NULL) {
		WT_ERR(__wt_page_modify_init(session, page));
		WT_ORDERED_READ(cbt->write_gen, page->modify->write_gen);
	}

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the previous page, until we reach the start
	 * of the file.
	 *
	 * "newpage" is 0 on the first pass (continuing on the page the cursor
	 * already holds) and 1 once we've moved to a different page.
	 */
	for (newpage = 0;; newpage = 1) {
		/*
		 * Appended column-store entries are walked first.
		 * NOTE(review): page is dereferenced here without a NULL
		 * check; presumably the append flag is only ever set while a
		 * page is held -- confirm.
		 */
		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			/*
			 * Done with the append list: a real error ends the
			 * walk, not-found falls through to the page proper,
			 * which is then treated as a new page.
			 */
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
			newpage = 1;
		}
		if (page != NULL) {
			switch (page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(cbt, newpage);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			/* Success or a real error: either way we're done. */
			if (ret != WT_NOTFOUND)
				break;
		}

		/*
		 * The current page is exhausted: step to the previous page.
		 * A NULL page from the walk means there are no more pages.
		 */
		cbt->page = NULL;
		WT_ERR(__wt_tree_walk(session, &page, flags));
		WT_ERR_TEST(page == NULL, WT_NOTFOUND);
		WT_ASSERT(session,
		    page->type != WT_PAGE_COL_INT &&
		    page->type != WT_PAGE_ROW_INT);
		cbt->page = page;

		/* Initialize the page's modification information */
		if (discard) {
			WT_ERR(__wt_page_modify_init(session, page));
			WT_ORDERED_READ(
			    cbt->write_gen, page->modify->write_gen);
		}

		/*
		 * The last page in a column-store has appended entries.
		 * We handle it separately from the usual cursor code:
		 * it's only that one page and it's in a simple format.
		 */
		if (page->type != WT_PAGE_ROW_LEAF &&
		    (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
			F_SET(cbt, WT_CBT_ITERATE_APPEND);
	}

err:	if (ret == WT_RESTART)
		goto retry;
	WT_TRET(__cursor_func_resolve(cbt, ret));
	return (ret);
}
/*
 * __rec_review --
 *	Get exclusive access to the page and review the page and its subtree
 *	for conditions that would block its eviction.
 *
 *	The ref and page arguments may appear to be redundant, because usually
 *	ref->page == page and page->ref == ref.  However, we need both because
 *	(a) there are cases where ref == NULL (e.g., for root page or during
 *	salvage), and (b) we can't safely look at page->ref until we have a
 *	hazard pointer.
 *
 *	exclusive: non-zero if the caller already has the tree locked down, in
 *	    which case no per-page exclusive access is requested.
 *	merge: non-zero if the caller only intends to merge internal pages; the
 *	    function then returns once exclusive access has been obtained.
 *	top: non-zero for the page eviction started from, zero for pages
 *	    reached by recursion into its subtree.
 *
 *	Returns 0 if nothing blocks eviction, EBUSY if something does, or
 *	another error from a callee.
 */
static int
__rec_review(WT_SESSION_IMPL *session,
    WT_REF *ref, WT_PAGE *page, int exclusive, int merge, int top)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	WT_PAGE *t;
	uint32_t i;

	btree = session->btree;

	/*
	 * Get exclusive access to the page if our caller doesn't have the tree
	 * locked down.
	 */
	if (!exclusive)
		WT_RET(__hazard_exclusive(session, ref, top));

	/*
	 * Recurse through the page's subtree: this happens first because we
	 * have to write pages in depth-first order, otherwise we'll dirty
	 * pages after we've written them.
	 *
	 * Note the loop reuses the "ref" parameter as its iterator, the
	 * caller's value is not used again after this point.
	 */
	if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)
		WT_REF_FOREACH(page, ref, i)
			switch (ref->state) {
			case WT_REF_DISK:		/* On-disk */
			case WT_REF_DELETED:		/* On-disk, deleted */
				break;
			case WT_REF_MEM:		/* In-memory */
				WT_RET(__rec_review(session,
				    ref, ref->page, exclusive, merge, 0));
				break;
			case WT_REF_EVICT_WALK:		/* Walk point */
			case WT_REF_EVICT_FORCE:	/* Forced evict */
			case WT_REF_LOCKED:		/* Being evicted */
			case WT_REF_READING:		/* Being read */
				return (EBUSY);
			}

	/*
	 * If the file is being checkpointed, we cannot evict dirty pages,
	 * because that may free a page that appears on an internal page in the
	 * checkpoint.  Don't rely on new updates being skipped by the
	 * transaction used for transaction reads: (1) there are paths that
	 * dirty pages for artificial reasons; (2) internal pages aren't
	 * transactional; and (3) if an update was skipped during the
	 * checkpoint (leaving the page dirty), then rolled back, we could
	 * still successfully overwrite a page and corrupt the checkpoint.
	 *
	 * Further, even for clean pages, the checkpoint's reconciliation of an
	 * internal page might race with us as we evict a child in the page's
	 * subtree.
	 *
	 * One half of that test is in the reconciliation code: the checkpoint
	 * thread waits for eviction-locked pages to settle before determining
	 * their status.  The other half of the test is here: after acquiring
	 * the exclusive eviction lock on a page, confirm no page in the page's
	 * stack of pages from the root is being reconciled in a checkpoint.
	 * This ensures we either see the checkpoint-walk state here, or the
	 * reconciliation of the internal page sees our exclusive lock on the
	 * child page and waits until we're finished evicting the child page
	 * (or give up if eviction isn't possible).
	 *
	 * We must check the full stack (we might be attempting to evict a leaf
	 * page multiple levels beneath the internal page being reconciled as
	 * part of the checkpoint, and all of the intermediate nodes are being
	 * merged into the internal page).
	 *
	 * There's no simple test for knowing if a page in our page stack is
	 * involved in a checkpoint.  The internal page's checkpoint-walk flag
	 * is the best test, but it's not set anywhere for the root page, it's
	 * not a complete test.
	 *
	 * Quit for any page that's not a simple, in-memory page.  (Almost the
	 * same as checking for the checkpoint-walk flag.  I don't think there
	 * are code paths that change the page's status from checkpoint-walk,
	 * but these races are hard enough I'm not going to proceed if there's
	 * anything other than a vanilla, in-memory tree stack.)  Climb until
	 * we find a page which can't be merged into its parent, and failing if
	 * we never find such a page.
	 */
	if (btree->checkpointing && !merge && __wt_page_is_modified(page)) {
		/* Also the target of the "goto ckpt" jumps in the loop below. */
ckpt:		WT_CSTAT_INCR(session, cache_eviction_checkpoint);
		WT_DSTAT_INCR(session, cache_eviction_checkpoint);
		return (EBUSY);
	}

	if (btree->checkpointing && top)
		for (t = page->parent;; t = t->parent) {
			if (t == NULL || t->ref == NULL)	/* root */
				goto ckpt;
			if (t->ref->state != WT_REF_MEM)	/* scary */
				goto ckpt;
			if (t->modify == NULL ||		/* not merged */
			    !F_ISSET(t->modify, WT_PM_REC_EMPTY |
			    WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
				break;
		}

	/*
	 * If we are merging internal pages, we just need exclusive access, we
	 * don't need to write everything.
	 */
	if (merge)
		return (0);

	/*
	 * Fail if any page in the top-level page's subtree won't be merged into
	 * its parent, the page that cannot be merged must be evicted first.
	 * The test is necessary but should not fire much: the eviction code is
	 * biased for leaf pages, an internal page shouldn't be selected for
	 * eviction until its children have been evicted.
	 *
	 * We have to write dirty pages to know their final state, a page marked
	 * empty may have had records added since reconciliation, a page marked
	 * split may have had records deleted and no longer need to split.
	 * Split-merge pages are the exception: they can never be changed into
	 * anything other than a split-merge page and are merged regardless of
	 * being clean or dirty.
	 *
	 * Writing the page is expensive, do a cheap test first: if it doesn't
	 * appear a subtree page can be merged, quit.  It's possible the page
	 * has been emptied since it was last reconciled, and writing it before
	 * testing might be worthwhile, but it's more probable we're attempting
	 * to evict an internal page with live children, and that's a waste of
	 * time.
	 */
	mod = page->modify;
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);

	/*
	 * If the page is dirty and can possibly change state, write it so we
	 * know the final state.
	 */
	if (__wt_page_is_modified(page) &&
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) {
		ret = __wt_rec_write(session, page,
		    NULL, WT_EVICTION_SERVER_LOCKED | WT_SKIP_UPDATE_QUIT);

		/*
		 * Update the page's modification reference, reconciliation
		 * might have changed it.
		 */
		mod = page->modify;

		/* If there are unwritten changes on the page, give up. */
		if (ret == EBUSY) {
			WT_VERBOSE_RET(session, evict,
			    "eviction failed, reconciled page not clean");

			/*
			 * We may be able to discard any "update" memory the
			 * page no longer needs.
			 */
			switch (page->type) {
			case WT_PAGE_COL_FIX:
			case WT_PAGE_COL_VAR:
				__wt_col_leaf_obsolete(session, page);
				break;
			case WT_PAGE_ROW_LEAF:
				__wt_row_leaf_obsolete(session, page);
				break;
			}
		}
		/* Any reconciliation failure, EBUSY included, ends here. */
		WT_RET(ret);

		WT_ASSERT(session, __wt_page_is_modified(page) == 0);
	}

	/*
	 * Repeat the test: fail if any page in the top-level page's subtree
	 * won't be merged into its parent.
	 */
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);

	return (0);
}
/*
 * __wt_rec_evict --
 *	Reconciliation plus eviction.
 *
 *	Reviews the page and its subtree, optionally merges a stack of
 *	split-merge pages, then updates the parent (or root) and discards
 *	either the single page (clean case) or the tree rooted in it (dirty
 *	case).
 *
 *	exclusive: non-zero if the caller has the tree locked down.  A
 *	    mergeable page is never handled in that mode, EBUSY is returned.
 *
 *	Returns 0 on success.  On failure, any exclusive references acquired
 *	during the review are released and the error is returned.  In both
 *	cases the session's exclusive-reference count is reset to 0.
 */
int
__wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
{
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	int merge;

	WT_VERBOSE_RET(session, evict,
	    "page %p (%s)", page, __wt_page_type_string(page->type));

	/* No exclusive references may be left over from an earlier call. */
	WT_ASSERT(session, session->excl_next == 0);

	/*
	 * If we get a split-merge page during normal eviction, try to collapse
	 * it.  During close, it will be merged into its parent.
	 */
	mod = page->modify;
	merge = __wt_btree_mergeable(page);
	if (merge && exclusive)
		return (EBUSY);

	WT_ASSERT(session, merge || mod == NULL ||
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE));

	/*
	 * Get exclusive access to the page and review the page and its subtree
	 * for conditions that would block our eviction of the page.  If the
	 * check fails (for example, we find a child page that can't be merged),
	 * we're done.  We have to make this check for clean pages, too: while
	 * unlikely eviction would choose an internal page with children, it's
	 * not disallowed anywhere.
	 *
	 * Note that page->ref may be NULL in some cases (e.g., for root pages
	 * or during salvage).  That's OK if exclusive is set: we won't check
	 * hazard pointers in that case.
	 */
	WT_ERR(__rec_review(session, page->ref, page, exclusive, merge, 1));

	/* Try to merge internal pages. */
	if (merge)
		WT_ERR(__wt_merge_tree(session, page));

	/*
	 * Update the page's modification reference, reconciliation might have
	 * changed it.
	 */
	mod = page->modify;

	/* Count evictions of internal pages during normal operation. */
	if (!exclusive && !merge &&
	    (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) {
		WT_CSTAT_INCR(session, cache_eviction_internal);
		WT_DSTAT_INCR(session, cache_eviction_internal);
	}

	/*
	 * Update the parent and discard the page.
	 *
	 * A page with no modify structure, or with none of the reconciliation
	 * result flags set, takes the clean path; anything else takes the
	 * dirty path.
	 */
	if (mod == NULL || !F_ISSET(mod, WT_PM_REC_MASK)) {
		WT_ASSERT(session,
		    exclusive || page->ref->state == WT_REF_LOCKED);

		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			__rec_page_clean_update(session, page);

		/* Discard the page. */
		__rec_discard_page(session, page, exclusive);

		WT_CSTAT_INCR(session, cache_eviction_clean);
		WT_DSTAT_INCR(session, cache_eviction_clean);
	} else {
		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			WT_ERR(__rec_page_dirty_update(session, page));

		/* Discard the tree rooted in this page. */
		__rec_discard_tree(session, page, exclusive);

		WT_CSTAT_INCR(session, cache_eviction_dirty);
		WT_DSTAT_INCR(session, cache_eviction_dirty);
	}

	/*
	 * The block below is skipped on the success path and is entered only
	 * by jumping to the err label.
	 */
	if (0) {
err:		/*
		 * If unable to evict this page, release exclusive reference(s)
		 * we've acquired.
		 */
		__rec_excl_clear(session);

		WT_CSTAT_INCR(session, cache_eviction_fail);
		WT_DSTAT_INCR(session, cache_eviction_fail);
	}
	session->excl_next = 0;

	return (ret);
}
/*
 * __wt_bt_read --
 *	Read a cookie referenced block into a buffer.
 *
 *	session: the session; its btree handle supplies the block manager and
 *	    the (optional) compressor.
 *	buf: the caller's buffer, on success it holds the uncompressed page
 *	    image.
 *	addr, addr_size: the block's address cookie and its length.
 *
 *	Returns 0 on success or a non-zero error.  Any scratch buffer allocated
 *	along the way is released before returning.
 */
int
__wt_bt_read(WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t result_len;
	int compressed;

	btree = session->btree;
	bm = btree->bm;

	/*
	 * If anticipating a compressed block, read into a scratch buffer and
	 * decompress into the caller's buffer.  Else, read directly into the
	 * caller's buffer.
	 */
	if (btree->compressor == NULL) {
		WT_RET(bm->read(bm, session, buf, addr, addr_size));
		dsk = buf->mem;
	} else {
		WT_RET(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
		dsk = tmp->mem;
	}

	/*
	 * Latch the compressed flag now.  When a compressor is configured,
	 * dsk points into the scratch buffer, and the verify path below reuses
	 * that scratch buffer for the address string: the header must not be
	 * read through dsk after that point.
	 */
	compressed = F_ISSET(dsk, WT_PAGE_COMPRESSED) ? 1 : 0;

	/*
	 * If the block is compressed, copy the skipped bytes of the original
	 * image into place, then decompress.
	 */
	if (compressed) {
		if (btree->compressor == NULL ||
		    btree->compressor->decompress == NULL)
			WT_ERR_MSG(session, WT_ERROR,
			    "read compressed block where no compression engine "
			    "configured");

		/*
		 * We're allocating the exact number of bytes we're expecting
		 * from decompression.
		 */
		WT_ERR(__wt_buf_init(session, buf, dsk->mem_size));
		buf->size = dsk->mem_size;

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, tmp->mem, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));

		/*
		 * If checksums were turned off because we're depending on the
		 * decompression to fail on any corrupted data, we'll end up
		 * here after corruption happens.  If we're salvaging the file,
		 * it's OK, otherwise it's really, really bad.
		 */
		if (result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
			WT_ERR(
			    F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR) ?
			    WT_ERROR :
			    __wt_illegal_value(session, btree->name));
	} else
		if (btree->compressor == NULL)
			buf->size = dsk->mem_size;
		else
			/*
			 * We guessed wrong: there was a compressor, but this
			 * block was not compressed, and now the page is in the
			 * wrong buffer and the buffer may be of the wrong size.
			 * This should be rare, but happens with small blocks
			 * that aren't worth compressing.
			 */
			WT_ERR(__wt_buf_set(
			    session, buf, tmp->data, dsk->mem_size));

	/*
	 * If the handle is a verify handle, verify the physical page.  The
	 * scratch buffer is (re)used to hold the printable address; after this
	 * the original block image in it, and so dsk, is no longer valid when
	 * a compressor is configured.
	 */
	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
		if (tmp == NULL)
			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
		WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
	}

	WT_CSTAT_INCR(session, cache_read);
	WT_DSTAT_INCR(session, cache_read);
	if (compressed)
		WT_DSTAT_INCR(session, compress_read);
	WT_CSTAT_INCRV(session, cache_bytes_read, addr_size);
	WT_DSTAT_INCRV(session, cache_bytes_read, addr_size);

err:	__wt_scr_free(&tmp);
	return (ret);
}
/*
 * __wt_bt_write --
 *	Write a buffer into a block, returning the block's addr/size and
 *	checksum.
 *
 *	buf: the disk image to write.
 *	addr, addr_size: filled in by the block manager for a normal write;
 *	    both must be NULL for a checkpoint write (asserted below).
 *	checkpoint: 1 to write through the block manager's checkpoint call,
 *	    0 for a normal block write.
 *	compressed: non-zero if buf already holds a compressed image, in which
 *	    case it is not compressed again.
 *
 *	Returns 0 on success or a non-zero error.  Any scratch buffer allocated
 *	along the way is released before returning.
 */
int
__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
    uint8_t *addr, uint32_t *addr_size, int checkpoint, int compressed)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_ITEM *ip;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t len, src_len, dst_len, result_len, size;
	int data_cksum, compression_failed;
	uint8_t *src, *dst;

	btree = session->btree;
	bm = btree->bm;

	/* Checkpoint calls are different than standard calls. */
	WT_ASSERT(session,
	    (checkpoint == 0 && addr != NULL && addr_size != NULL) ||
	    (checkpoint == 1 && addr == NULL && addr_size == NULL));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * We're passed a table's disk image.  Decompress if necessary and
	 * verify the image.  Always check the in-memory length for accuracy.
	 */
	dsk = buf->mem;
	if (compressed) {
		WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp));

		memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(btree->compressor->decompress(
		    btree->compressor, &session->iface,
		    (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
		    buf->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
		    tmp->memsize - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));
		WT_ASSERT(session,
		    dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
		tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
		ip = tmp;
	} else {
		WT_ASSERT(session, dsk->mem_size == buf->size);
		ip = buf;
	}
	WT_ERR(__wt_verify_dsk(session, "[write-check]", ip));
	/* Release the scratch buffer so it can be allocated again below. */
	__wt_scr_free(&tmp);
#endif

	/*
	 * Optionally stream-compress the data, but don't compress blocks that
	 * are already as small as they're going to get.
	 *
	 * Note the "too small" statistic is incremented for every block that
	 * takes this branch, including when no compressor is configured or
	 * the caller's image is already compressed.
	 */
	if (buf->size <= btree->allocsize ||
	    btree->compressor == NULL ||
	    btree->compressor->compress == NULL || compressed) {
		ip = buf;
		WT_DSTAT_INCR(session, compress_write_too_small);
	} else {
		/* Skip the header bytes of the source data. */
		src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
		src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;

		/*
		 * Compute the size needed for the destination buffer.  We only
		 * allocate enough memory for a copy of the original by default,
		 * if any compressed version is bigger than the original, we
		 * won't use it.  However, some compression engines (snappy is
		 * one example), may need more memory because they don't stop
		 * just because there's no more memory into which to compress.
		 */
		if (btree->compressor->pre_size == NULL)
			len = src_len;
		else
			WT_ERR(btree->compressor->pre_size(btree->compressor,
			    &session->iface, src, src_len, &len));

		/* Let the block manager adjust the size before allocating. */
		size = len + WT_BLOCK_COMPRESS_SKIP;
		WT_ERR(bm->write_size(bm, session, &size));
		WT_ERR(__wt_scr_alloc(session, size, &tmp));

		/* Skip the header bytes of the destination data. */
		dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP;
		dst_len = len;

		/*
		 * If compression fails, fallback to the original version.  This
		 * isn't unexpected: if compression doesn't work for some chunk
		 * of bytes for some reason (noting there's likely additional
		 * format/header information which compressed output requires),
		 * it just means the uncompressed version is as good as it gets,
		 * and that's what we use.
		 */
		compression_failed = 0;
		WT_ERR(btree->compressor->compress(btree->compressor,
		    &session->iface,
		    src, src_len,
		    dst, dst_len,
		    &result_len, &compression_failed));
		if (compression_failed) {
			ip = buf;
			WT_DSTAT_INCR(session, compress_write_fail);
		} else {
			compressed = 1;
			WT_DSTAT_INCR(session, compress_write);

			/*
			 * Copy in the skipped header bytes, set the final data
			 * size.
			 */
			memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
			tmp->size =
			    (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
			ip = tmp;
		}
	}
	/* From here on, dsk is the header of the image actually written. */
	dsk = ip->mem;

	/* If the buffer is compressed, set the flag. */
	if (compressed)
		F_SET(dsk, WT_PAGE_COMPRESSED);

	/*
	 * We increment the block's write generation so it's easy to identify
	 * newer versions of blocks during salvage.  (It's common in WiredTiger,
	 * at least for the default block manager, for multiple blocks to be
	 * internally consistent with identical first and last keys, so we need
	 * a way to know the most recent state of the block.  We could check
	 * which leaf is referenced by a valid internal page, but that implies
	 * salvaging internal pages, which I don't want to do, and it's not
	 * as good anyway, because the internal page may not have been written
	 * after the leaf page was updated.  So, write generations it is.)
	 *
	 * Nothing is locked at this point but two versions of a page with the
	 * same generation is pretty unlikely, and if we did, they're going to
	 * be roughly identical for the purposes of salvage, anyway.
	 */
	dsk->write_gen = ++btree->write_gen;

	/*
	 * Checksum the data if the buffer isn't compressed or checksums are
	 * configured.
	 */
	switch (btree->checksum) {
	case CKSUM_ON:
		data_cksum = 1;
		break;
	case CKSUM_OFF:
		data_cksum = 0;
		break;
	case CKSUM_UNCOMPRESSED:
	default:
		data_cksum = !compressed;
		break;
	}

	/* Call the block manager to write the block. */
	WT_ERR(checkpoint ?
	    bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
	    bm->write(bm, session, ip, addr, addr_size, data_cksum));

	WT_CSTAT_INCR(session, cache_write);
	WT_DSTAT_INCR(session, cache_write);
	WT_CSTAT_INCRV(session, cache_bytes_write, ip->size);
	WT_DSTAT_INCRV(session, cache_bytes_write, ip->size);

err:	__wt_scr_free(&tmp);
	return (ret);
}
/*
 * __wt_compact_evict --
 *	Decide whether re-writing this page would help shrink the file and, if
 *	so, mark the page and tree dirty.
 *
 *	Returns 0 whether or not the page was marked, or a non-zero error from
 *	the block manager or from setting up the page's modify structure.
 */
int
__wt_compact_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BM *bm;
	WT_PAGE_MODIFY *mod;
	int skip;
	uint32_t addr_size;
	const uint8_t *addr;

	bm = S2BT(session)->bm;
	mod = page->modify;

	/*
	 * Reconciliation information has to be reviewed because an in-memory
	 * page's original disk addresses might have been fine for compaction
	 * while its replacement addresses are a problem.  Doing so requires
	 * that both eviction and checkpoints are locked out, those being the
	 * other two operations that can write a page.
	 *
	 * Two kinds of page need no review at all:
	 *  - the root, which may not have a replacement address and gets
	 *    written whenever anything else is;
	 *  - a page that is already dirty, which is written in any case.
	 */
	if (WT_PAGE_IS_ROOT(page) || __wt_page_is_modified(page))
		return (0);

	/*
	 * A page that was never modified is treated like one with no
	 * reconciliation result: test the original addresses.
	 * A 1-to-1 replacement has its replacement addresses tested.
	 * Empty and split pages are left alone (a split will be merged into
	 * the parent).
	 */
	switch (mod == NULL ? 0 : F_ISSET(mod, WT_PM_REC_MASK)) {
	case WT_PM_REC_EMPTY:
	case WT_PM_REC_SPLIT:
	case WT_PM_REC_SPLIT_MERGE:
		return (0);
	case WT_PM_REC_REPLACE:
		WT_RET(bm->compact_page_skip(bm, session,
		    mod->u.replace.addr, mod->u.replace.size, &skip));
		if (skip)
			return (0);
		break;
	case 0:
		__wt_get_addr(page->parent, page->ref, &addr, &addr_size);
		if (addr == NULL)
			return (0);
		WT_RET(
		    bm->compact_page_skip(bm, session, addr, addr_size, &skip));
		if (skip)
			return (0);
		break;
	}

	/* We want this page written: mark the page and the tree dirty. */
	WT_RET(__wt_page_modify_init(session, page));
	__wt_page_and_tree_modify_set(session, page);

	WT_DSTAT_INCR(session, btree_compact_rewrite);
	return (0);
}