/*
 * __stat_tree_walk --
 *	Gather btree statistics that require traversing the tree.
 */
static int
__stat_tree_walk(WT_SESSION_IMPL *session)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_DSRC_STATS **stats;
	WT_REF *walk;

	btree = S2BT(session);
	stats = btree->dhandle->stats;

	/*
	 * Zero the counters the per-page pass below recomputes, so the final
	 * totals reflect only this traversal.
	 */
	WT_STAT_SET(session, stats, btree_column_deleted, 0);
	WT_STAT_SET(session, stats, btree_column_fix, 0);
	WT_STAT_SET(session, stats, btree_column_internal, 0);
	WT_STAT_SET(session, stats, btree_column_rle, 0);
	WT_STAT_SET(session, stats, btree_column_variable, 0);
	WT_STAT_SET(session, stats, btree_entries, 0);
	WT_STAT_SET(session, stats, btree_overflow, 0);
	WT_STAT_SET(session, stats, btree_row_internal, 0);
	WT_STAT_SET(session, stats, btree_row_leaf, 0);

	/* Walk the tree, accumulating per-page statistics. */
	for (walk = NULL;;) {
		if ((ret = __wt_tree_walk(session, &walk, 0)) != 0 ||
		    walk == NULL)
			break;
		WT_WITH_PAGE_INDEX(session,
		    ret = __stat_page(session, walk->page, stats));
		WT_RET(ret);
	}

	/* Walking off the end of the tree is a successful completion. */
	return (ret == WT_NOTFOUND ? 0 : ret);
}
/*
 * __wt_btcur_next_random --
 *	Move to a random record in the tree.
 */
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	btree = cbt->btree;
	session = (WT_SESSION_IMPL *)cbt->iface.session;

	/*
	 * Only supports row-store: applications can trivially select a random
	 * value from a column-store, if there were any reason to do so.
	 */
	if (btree->type != BTREE_ROW)
		WT_RET(ENOTSUP);

	WT_STAT_FAST_CONN_INCR(session, cursor_next);
	WT_STAT_FAST_DATA_INCR(session, cursor_next);

	WT_RET(__cursor_func_init(cbt, 1));

	/* Pick a random entry, holding the page index stable meanwhile. */
	WT_WITH_PAGE_INDEX(session, ret = __wt_row_random(session, cbt));
	WT_ERR(ret);

	/*
	 * If the landing spot isn't a visible record, fall back to a nearby
	 * key; otherwise return the key/value pair directly.
	 */
	if (!__cursor_valid(cbt, &upd))
		WT_ERR(__wt_btcur_search_near(cbt, NULL));
	else
		WT_ERR(__wt_kv_return(session, cbt, upd));

err:	if (ret != 0)
		WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/*
 * __cursor_row_search --
 *	Row-store search from an application cursor.
 */
static inline int
__cursor_row_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int insert)
{
	WT_CURSOR *cursor;
	WT_DECL_RET;

	cursor = &cbt->iface;

	/* Descend the tree with the page index held stable. */
	WT_WITH_PAGE_INDEX(session,
	    ret = __wt_row_search(session, &cursor->key, NULL, cbt, insert));
	return (ret);
}
/*
 * __cursor_col_search --
 *	Column-store search from an application cursor.
 */
static inline int
__cursor_col_search(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
	WT_DECL_RET;
	uint64_t recno;

	recno = cbt->iface.recno;

	/* Descend the tree with the page index held stable. */
	WT_WITH_PAGE_INDEX(session,
	    ret = __wt_col_search(session, recno, NULL, cbt));
	return (ret);
}
/*
 * __wt_btree_stat_init --
 *	Initialize the Btree statistics.
 */
int
__wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DSRC_STATS **stats;

	btree = S2BT(session);
	bm = btree->bm;
	stats = btree->dhandle->stats;

	/* Load the underlying block manager's statistics first. */
	WT_RET(bm->stat(bm, session, stats[0]));

	/* Cheap, always-available tree configuration/shape statistics. */
	WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt);
	WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth);
	WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey);
	WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage);
	WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey);
	WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage);
	WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue);

	/* Everything else is really, really expensive. */
	if (!F_ISSET(cst, WT_CONN_STAT_ALL))
		return (0);

	/*
	 * The statistics requiring a full tree walk were duplicated here
	 * verbatim; defer to the common helper instead of repeating it.
	 */
	return (__stat_tree_walk(session));
}
/*
 * __wt_btcur_next_random --
 *	Move to a random record in the tree. There are two algorithms, one
 *	where we select a record at random from the whole tree on each
 *	retrieval and one where we first select a record at random from the
 *	whole tree, and then subsequently sample forward from that location.
 *	The sampling approach allows us to select reasonably uniform random
 *	points from unbalanced trees.
 */
int
__wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	wt_off_t size;
	uint64_t skip;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = cbt->btree;

	/*
	 * Only supports row-store: applications can trivially select a random
	 * value from a column-store, if there were any reason to do so.
	 */
	if (btree->type != BTREE_ROW)
		WT_RET_MSG(session, ENOTSUP,
		    "WT_CURSOR.next_random only supported by row-store tables");

	WT_STAT_CONN_INCR(session, cursor_next);
	WT_STAT_DATA_INCR(session, cursor_next);

	/*
	 * If retrieving random values without sampling, or we don't have a
	 * page reference, pick a roughly random leaf page in the tree.
	 */
	if (cbt->ref == NULL || cbt->next_random_sample_size == 0) {
		/*
		 * Skip past the sample size of the leaf pages in the tree
		 * between each random key return to compensate for unbalanced
		 * trees.
		 *
		 * Use the underlying file size divided by its block allocation
		 * size as our guess of leaf pages in the file (this can be
		 * entirely wrong, as it depends on how many pages are in this
		 * particular checkpoint, how large the leaf and internal pages
		 * really are, and other factors). Then, divide that value by
		 * the configured sample size and increment the final result to
		 * make sure tiny files don't leave us with a skip value of 0.
		 *
		 * !!!
		 * Ideally, the number would be prime to avoid restart issues.
		 */
		if (cbt->next_random_sample_size != 0) {
			WT_ERR(btree->bm->size(btree->bm, session, &size));
			cbt->next_random_leaf_skip = (uint64_t)
			    ((size / btree->allocsize) /
			    cbt->next_random_sample_size) + 1;
		}

		/*
		 * Choose a leaf page from the tree.
		 */
		WT_ERR(__cursor_func_init(cbt, true));
		WT_WITH_PAGE_INDEX(
		    session, ret = __wt_row_random_descent(session, cbt));
		WT_ERR(ret);
	} else {
		/*
		 * Read through the tree, skipping leaf pages. Be cautious about
		 * the skip count: if the last leaf page skipped was also the
		 * last leaf page in the tree, it may be set to zero on return
		 * with the end-of-walk condition.
		 *
		 * Pages read for data sampling aren't "useful"; don't update
		 * the read generation of pages already in memory, and if a page
		 * is read, set its generation to a low value so it is evicted
		 * quickly.
		 */
		for (skip = cbt->next_random_leaf_skip;
		    cbt->ref == NULL || skip > 0;)
			WT_ERR(__wt_tree_walk_skip(session, &cbt->ref, &skip,
			    WT_READ_NO_GEN |
			    WT_READ_SKIP_INTL | WT_READ_WONT_NEED));
	}

	/*
	 * Select a random entry from the leaf page. If it's not valid, move to
	 * the next entry, if that doesn't work, move to the previous entry.
	 */
	WT_ERR(__wt_row_random_leaf(session, cbt));
	if (__cursor_valid(cbt, &upd))
		WT_ERR(__wt_kv_return(session, cbt, upd));
	else {
		/*
		 * NOTE(review): a next that runs off the end of the tree falls
		 * back to a prev — presumably so a non-empty tree always
		 * returns some record; confirm against callers' expectations.
		 */
		if ((ret = __wt_btcur_next(cbt, false)) == WT_NOTFOUND)
			ret = __wt_btcur_prev(cbt, false);
		WT_ERR(ret);
	}
	return (0);

/* Any error resets the cursor; the first error code is preserved. */
err:	WT_TRET(__cursor_reset(cbt));
	return (ret);
}
/* * __wt_verify -- * Verify a file. */ int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; WT_CKPT *ckptbase, *ckpt; WT_DECL_RET; WT_VSTUFF *vs, _vstuff; size_t root_addr_size; uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE]; bool bm_start, quit; btree = S2BT(session); bm = btree->bm; ckptbase = NULL; bm_start = false; WT_CLEAR(_vstuff); vs = &_vstuff; WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key)); WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp3)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp4)); /* Check configuration strings. */ WT_ERR(__verify_config(session, cfg, vs)); /* Optionally dump specific block offsets. */ WT_ERR(__verify_config_offsets(session, cfg, &quit)); if (quit) goto done; /* Get a list of the checkpoints for this file. */ WT_ERR( __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase)); /* Inform the underlying block manager we're verifying. */ WT_ERR(bm->verify_start(bm, session, ckptbase, cfg)); bm_start = true; /* Loop through the file's checkpoints, verifying each one. */ WT_CKPT_FOREACH(ckptbase, ckpt) { WT_ERR(__wt_verbose(session, WT_VERB_VERIFY, "%s: checkpoint %s", btree->dhandle->name, ckpt->name)); /* Fake checkpoints require no work. */ if (F_ISSET(ckpt, WT_CKPT_FAKE)) continue; /* House-keeping between checkpoints. */ __verify_checkpoint_reset(vs); if (WT_VRFY_DUMP(vs)) WT_ERR(__wt_msg(session, "%s: checkpoint %s", btree->dhandle->name, ckpt->name)); /* Load the checkpoint. */ WT_ERR(bm->checkpoint_load(bm, session, ckpt->raw.data, ckpt->raw.size, root_addr, &root_addr_size, true)); /* * Ignore trees with no root page. * Verify, then discard the checkpoint from the cache. 
*/ if (root_addr_size != 0 && (ret = __wt_btree_tree_open( session, root_addr, root_addr_size)) == 0) { if (WT_VRFY_DUMP(vs)) WT_ERR(__wt_msg(session, "Root: %s %s", __wt_addr_string(session, root_addr, root_addr_size, vs->tmp1), __wt_page_type_string( btree->root.page->type))); WT_WITH_PAGE_INDEX(session, ret = __verify_tree(session, &btree->root, vs)); WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD)); } /* Unload the checkpoint. */ WT_TRET(bm->checkpoint_unload(bm, session)); WT_ERR(ret); /* Display the tree shape. */ if (vs->dump_shape) WT_ERR(__verify_tree_shape(session, vs)); }