/*
 * __stat_page --
 *	Gather statistics for a single Btree page of any type.
 */
static int
__stat_page(WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
	/*
	 * Row-store pages and variable-length column-store leaf pages are
	 * handed to type-specific helpers; for the remaining page types a
	 * per-type page count is all that's tracked (plus the entry count
	 * for fixed-length column-store leaf pages, which is available
	 * directly from the page).
	 */
	switch (page->type) {
	case WT_PAGE_ROW_INT:
		__stat_page_row_int(session, page, stats);
		break;
	case WT_PAGE_ROW_LEAF:
		__stat_page_row_leaf(session, page, stats);
		break;
	case WT_PAGE_COL_INT:
		WT_STAT_INCR(session, stats, btree_column_internal);
		break;
	case WT_PAGE_COL_VAR:
		__stat_page_col_var(session, page, stats);
		break;
	case WT_PAGE_COL_FIX:
		WT_STAT_INCR(session, stats, btree_column_fix);
		WT_STAT_INCRV(
		    session, stats, btree_entries, page->pg_fix_entries);
		break;
	WT_ILLEGAL_VALUE(session);
	}
	return (0);
}
/*
 * __stat_page_col_var --
 *	Stat a WT_PAGE_COL_VAR page.
 *
 * NOTE(review): this looks like an older revision of the function (no
 * WT_SESSION_IMPL argument, single-pointer stats) than the sibling version
 * below, and the text visible here is truncated -- there is no trailing
 * "return (0)" or closing brace in this view.  Confirm which revision is
 * intended to remain in the file.
 */
static int
__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
{
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_UPDATE *upd;
	uint32_t i;
	int orig_deleted;		/* On-page entry was a deletion. */

	unpack = &_unpack;

	WT_STAT_INCR(stats, btree_column_variable);

	/*
	 * Walk the page, counting regular and overflow data items, and
	 * checking to be sure any updates weren't deletions.  If the item
	 * was updated, assume it was updated by an item of the same size
	 * (it's expensive to figure out if it will require the same space
	 * or not, especially if there's Huffman encoding).
	 */
	WT_COL_FOREACH(page, cip, i) {
		if ((cell = WT_COL_PTR(page, cip)) == NULL) {
			/* A missing cell is a deleted entry. */
			orig_deleted = 1;
			WT_STAT_INCR(stats, btree_column_deleted);
		} else {
			orig_deleted = 0;
			__wt_cell_unpack(cell, unpack);
			/* An RLE cell counts once per repeat. */
			WT_STAT_INCRV(
			    stats, btree_entries, __wt_cell_rle(unpack));
		}

		/*
		 * Walk the insert list, checking for changes.  For each
		 * insert we find, correct the original count based on its
		 * state.
		 */
		WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
			upd = ins->upd;
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				/* Live entry later deleted. */
				if (orig_deleted)
					continue;
				WT_STAT_INCR(stats, btree_column_deleted);
				WT_STAT_DECR(stats, btree_entries);
			} else {
				/* Deleted entry later re-instantiated. */
				if (!orig_deleted)
					continue;
				WT_STAT_DECR(stats, btree_column_deleted);
				WT_STAT_INCR(stats, btree_entries);
			}
		}
	}
/*
 * __stat_page_col_var --
 *	Stat a WT_PAGE_COL_VAR page.
 *
 * NOTE(review): the text visible here is truncated -- the local counters
 * (deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt) are accumulated but the code
 * that folds them into the statistics, and the function's closing brace,
 * are not present in this view.
 */
static void
__stat_page_col_var(
    WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_UPDATE *upd;
	uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt;
	uint32_t i;
	bool orig_deleted;		/* On-page entry was a deletion. */

	unpack = &_unpack;
	deleted_cnt = entry_cnt = ovfl_cnt = rle_cnt = 0;

	WT_STAT_INCR(session, stats, btree_column_variable);

	/*
	 * Walk the page counting regular items, adjusting if the item has
	 * been subsequently deleted or not.  This is a mess because a
	 * 10-item RLE might have 3 of the items subsequently deleted.
	 * Overflow items are harder, we can't know if an updated item will
	 * be an overflow item or not; do our best, and simply count every
	 * overflow item (or RLE set of items) we see.
	 */
	WT_COL_FOREACH(page, cip, i) {
		if ((cell = WT_COL_PTR(page, cip)) == NULL) {
			/* A missing cell is a deleted entry. */
			orig_deleted = true;
			++deleted_cnt;
		} else {
			orig_deleted = false;
			__wt_cell_unpack(cell, unpack);
			if (unpack->type == WT_CELL_ADDR_DEL)
				orig_deleted = true;
			else {
				/* Count once per RLE repeat. */
				entry_cnt += __wt_cell_rle(unpack);
				rle_cnt += __wt_cell_rle(unpack) - 1;
			}
			if (unpack->ovfl)
				++ovfl_cnt;
		}

		/*
		 * Walk the insert list, checking for changes.  For each
		 * insert we find, correct the original count based on its
		 * state.
		 */
		WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
			upd = ins->upd;
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				/* Live entry later deleted. */
				if (!orig_deleted) {
					++deleted_cnt;
					--entry_cnt;
				}
			} else if (orig_deleted) {
				/* Deleted entry later re-instantiated. */
				--deleted_cnt;
				++entry_cnt;
			}
		}
	}
/*
 * __rec_review --
 *	Get exclusive access to the page and review the page and its subtree
 *	for conditions that would block its eviction.
 *
 *	The ref and page arguments may appear to be redundant, because
 *	usually ref->page == page and page->ref == ref.  However, we need
 *	both because (a) there are cases where ref == NULL (e.g., for root
 *	page or during salvage), and (b) we can't safely look at page->ref
 *	until we have a hazard reference.
 *
 *	Returns 0 if the page can be evicted, EBUSY (or another error) if
 *	not.
 */
static int
__rec_review(WT_SESSION_IMPL *session,
    WT_REF *ref, WT_PAGE *page, uint32_t flags, int top)
{
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	WT_TXN *txn;
	uint32_t i;

	txn = &session->txn;

	/*
	 * Get exclusive access to the page if our caller doesn't have the
	 * tree locked down.
	 */
	if (!LF_ISSET(WT_REC_SINGLE))
		WT_RET(__hazard_exclusive(session, ref, top));

	/*
	 * Recurse through the page's subtree: this happens first because we
	 * have to write pages in depth-first order, otherwise we'll dirty
	 * pages after we've written them.
	 */
	if (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)
		WT_REF_FOREACH(page, ref, i)
			switch (ref->state) {
			case WT_REF_DISK:		/* On-disk */
			case WT_REF_DELETED:		/* On-disk, deleted */
				break;
			case WT_REF_MEM:		/* In-memory */
				WT_RET(__rec_review(
				    session, ref, ref->page, flags, 0));
				break;
			case WT_REF_EVICT_WALK:		/* Walk point */
			case WT_REF_LOCKED:		/* Being evicted */
			case WT_REF_READING:		/* Being read */
				return (EBUSY);
			}

	/*
	 * Check if this page can be evicted:
	 *
	 * Fail if the top-level page is a page expected to be removed from
	 * the tree as part of eviction (an empty page or a split-merge
	 * page).  Note "split" pages are NOT included in this test, because
	 * a split page can be separately evicted, at which point it's
	 * replaced in its parent by a reference to a split-merge page.
	 * That's a normal part of the leaf page life-cycle if it grows too
	 * large and must be pushed out of the cache.  There is also an
	 * exception for empty pages, the root page may be empty when
	 * evicted, but that only happens when the tree is closed.
	 *
	 * Fail if any page in the top-level page's subtree can't be merged
	 * into its parent.  You can't evict a page that references such
	 * in-memory pages, they must be evicted first.  The test is
	 * necessary but should not fire much: the LRU-based eviction code
	 * is biased for leaf pages, an internal page shouldn't be selected
	 * for LRU-based eviction until its children have been evicted.
	 * Empty, split and split-merge pages are all included in this test,
	 * they can all be merged into a parent.
	 *
	 * We have to write dirty pages to know their final state, a page
	 * marked empty may have had records added since reconciliation, a
	 * page marked split may have had records deleted and no longer need
	 * to split.  Split-merge pages are the exception: they can never be
	 * changed into anything other than a split-merge page and are
	 * merged regardless of being clean or dirty.
	 *
	 * Writing the page is expensive, do a cheap test first: if it
	 * doesn't appear a subtree page can be merged, quit.  It's possible
	 * the page has been emptied since it was last reconciled, and
	 * writing it before testing might be worthwhile, but it's more
	 * probable we're attempting to evict an internal page with live
	 * children, and that's a waste of time.
	 *
	 * We don't do a cheap test for the top-level page: we're not called
	 * to evict split-merge pages, which means the only interesting case
	 * is an empty page.  If the eviction thread picked an "empty" page
	 * for eviction, it must have had reason, probably the empty page
	 * got really, really full.
	 */
	mod = page->modify;
	if (!top && (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE)))
		return (EBUSY);

	/* If the page is dirty, write it so we know the final state. */
	if (__wt_page_is_modified(page) &&
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE)) {
		ret = __wt_rec_write(session, page, NULL, flags);

		/* If there are unwritten changes on the page, give up. */
		if (ret == 0 && !LF_ISSET(WT_REC_SINGLE) &&
		    __wt_page_is_modified(page))
			ret = EBUSY;
		if (ret == EBUSY) {
			WT_VERBOSE_RET(session, evict,
			    "page %p written but not clean", page);

			/*
			 * After repeated failures inside a running
			 * transaction, escalate to WT_DEADLOCK so the
			 * application rolls back and frees up the cache.
			 */
			if (F_ISSET(txn, TXN_RUNNING) &&
			    ++txn->eviction_fails >= 100) {
				txn->eviction_fails = 0;
				ret = WT_DEADLOCK;
				WT_STAT_INCR(
				    S2C(session)->stats, txn_fail_cache);
			}

			/*
			 * If there aren't multiple cursors active, there
			 * are no consistency issues: try to bump our
			 * snapshot.
			 */
			if (session->ncursors <= 1) {
				__wt_txn_read_last(session);
				__wt_txn_read_first(session);
			}

			/*
			 * NOTE(review): presumably trims update chains no
			 * reader can see before the next attempt -- confirm
			 * against the *_leaf_obsolete helpers.
			 */
			switch (page->type) {
			case WT_PAGE_COL_FIX:
			case WT_PAGE_COL_VAR:
				__wt_col_leaf_obsolete(session, page);
				break;
			case WT_PAGE_ROW_LEAF:
				__wt_row_leaf_obsolete(session, page);
				break;
			}
		}
		WT_RET(ret);

		/* The write succeeded: reset the failure counter. */
		txn->eviction_fails = 0;
	}

	/*
	 * Repeat the eviction tests.
	 *
	 * Fail if the top-level page should be merged into its parent, and
	 * it's not the root page.
	 *
	 * Fail if a page in the top-level page's subtree can't be merged
	 * into its parent.
	 */
	if (top) {
		/*
		 * We never get a top-level split-merge page to evict, they
		 * are ignored by the eviction thread.  Check out of sheer
		 * paranoia.
		 */
		if (mod != NULL) {
			if (F_ISSET(mod, WT_PM_REC_SPLIT_MERGE))
				return (EBUSY);
			if (F_ISSET(mod, WT_PM_REC_EMPTY) &&
			    !WT_PAGE_IS_ROOT(page))
				return (EBUSY);
		}
	} else if (mod == NULL || !F_ISSET(mod,
	    WT_PM_REC_EMPTY | WT_PM_REC_SPLIT | WT_PM_REC_SPLIT_MERGE))
		return (EBUSY);

	return (0);
}
/*
 * __wt_rec_evict --
 *	Reconciliation plus eviction.
 */
int
__wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	int single;		/* Caller holds the tree single-threaded. */

	conn = S2C(session);

	WT_VERBOSE_RET(session, evict,
	    "page %p (%s)", page, __wt_page_type_string(page->type));

	/* No exclusive references may be outstanding on entry. */
	WT_ASSERT(session, session->excl_next == 0);
	single = LF_ISSET(WT_REC_SINGLE) ? 1 : 0;

	/*
	 * Get exclusive access to the page and review the page and its
	 * subtree for conditions that would block our eviction of the page.
	 * If the check fails (for example, we find a child page that can't
	 * be merged), we're done.  We have to make this check for clean
	 * pages, too: while unlikely eviction would choose an internal page
	 * with children, it's not disallowed anywhere.
	 *
	 * Note that page->ref may be NULL in some cases (e.g., for root
	 * pages or during salvage).  That's OK if WT_REC_SINGLE is set: we
	 * won't check hazard references in that case.
	 */
	WT_ERR(__rec_review(session, page->ref, page, flags, 1));

	/* Count evictions of internal pages during normal operation. */
	if (!single &&
	    (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT))
		WT_STAT_INCR(conn->stats, cache_evict_internal);

	/* Update the parent and discard the page. */
	if (page->modify == NULL || !F_ISSET(page->modify, WT_PM_REC_MASK)) {
		/* Page was never reconciled dirty: evict as unmodified. */
		WT_STAT_INCR(conn->stats, cache_evict_unmodified);

		WT_ASSERT(session,
		    single || page->ref->state == WT_REF_LOCKED);

		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			__rec_page_clean_update(session, page);

		/* Discard the page. */
		__rec_discard_page(session, page, single);
	} else {
		/* Page carries a reconciliation result: evict as modified. */
		WT_STAT_INCR(conn->stats, cache_evict_modified);

		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			WT_ERR(__rec_page_dirty_update(session, page));

		/* Discard the tree rooted in this page. */
		__rec_discard_tree(session, page, single);
	}

	if (0) {
err:		/*
		 * If unable to evict this page, release exclusive
		 * reference(s) we've acquired.
		 */
		__rec_excl_clear(session);
	}
	session->excl_next = 0;

	return (ret);
}
/*
 * __wt_lsm_stat_init --
 *	Initialize a LSM statistics structure: start from the tree's own
 *	statistics, then fold in the statistics of every chunk (and, where
 *	present, each chunk's Bloom filter) by walking statistics cursors.
 */
int
__wt_lsm_stat_init(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_CURSOR_STAT *cst, uint32_t flags)
{
	WT_CURSOR *stat_cursor;
	WT_DECL_ITEM(uribuf);
	WT_DECL_RET;
	WT_DSRC_STATS *stats;
	WT_LSM_CHUNK *chunk;
	const char *cfg[] = API_CONF_DEFAULTS(
	    session, open_cursor, "statistics_fast=on");
	const char *disk_cfg[] = API_CONF_DEFAULTS(session, open_cursor,
	    "checkpoint=WiredTigerCheckpoint,statistics_fast=on");
	const char *desc, *pvalue;
	uint64_t value;
	u_int i;
	int locked, stat_key;

	WT_UNUSED(flags);
	locked = 0;
	WT_ERR(__wt_scr_alloc(session, 0, &uribuf));

	/* Clear the statistics we are about to recalculate. */
	if (cst->stats != NULL)
		stats = (WT_DSRC_STATS *)cst->stats;
	else {
		/* First call: allocate and register the stats array. */
		WT_ERR(__wt_calloc_def(session, 1, &stats));
		__wt_stat_init_dsrc_stats(stats);
		cst->stats_first = cst->stats = (WT_STATS *)stats;
		cst->stats_count = sizeof(*stats) / sizeof(WT_STATS);
	}

	/* Seed the result from the tree-level statistics. */
	*stats = lsm_tree->stats;
	if (LF_ISSET(WT_STATISTICS_CLEAR))
		__wt_stat_clear_dsrc_stats(&lsm_tree->stats);

	/* Hold the LSM lock so that we can safely walk through the chunks. */
	WT_ERR(__wt_readlock(session, lsm_tree->rwlock));
	locked = 1;

	/* Set the stats for this run. */
	WT_STAT_SET(stats, lsm_chunk_count, lsm_tree->nchunks);
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];

		/* Track the oldest (largest) generation seen. */
		if (chunk->generation >
		    (uint32_t)WT_STAT(stats, lsm_generation_max))
			WT_STAT_SET(stats,
			    lsm_generation_max, chunk->generation);

		/*
		 * LSM chunk reads happen from a checkpoint, so get the
		 * statistics for a checkpoint if one exists.
		 */
		WT_ERR(__wt_buf_fmt(
		    session, uribuf, "statistics:%s", chunk->uri));
		ret = __wt_curstat_open(session, uribuf->data,
		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ? disk_cfg : cfg,
		    &stat_cursor);
		/*
		 * XXX kludge: we may have an empty chunk where no checkpoint
		 * was written.  If so, try to open the ordinary handle on
		 * that chunk instead.
		 */
		if (ret == WT_NOTFOUND &&
		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
			ret = __wt_curstat_open(
			    session, uribuf->data, cfg, &stat_cursor);
		WT_ERR(ret);

		/* Fold every statistic from the chunk into the totals. */
		while ((ret = stat_cursor->next(stat_cursor)) == 0) {
			WT_ERR(stat_cursor->get_key(stat_cursor, &stat_key));
			WT_ERR(stat_cursor->get_value(
			    stat_cursor, &desc, &pvalue, &value));
			WT_STAT_INCRKV(stats, stat_key, value);
		}
		WT_ERR_NOTFOUND_OK(ret);
		WT_ERR(stat_cursor->close(stat_cursor));

		if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			continue;

		/*
		 * Estimate the Bloom filter's size from its item count and
		 * bits-per-item configuration (bits / 8 = bytes).
		 */
		WT_STAT_INCR(stats, bloom_count);
		WT_STAT_INCRV(stats, bloom_size,
		    (chunk->count * lsm_tree->bloom_bit_count) / 8);

		/*
		 * Look up a handful of individual statistics for the Bloom
		 * filter file, counting them both under the generic cache
		 * statistics and under the bloom-specific ones.
		 */
		WT_ERR(__wt_buf_fmt(
		    session, uribuf, "statistics:%s", chunk->bloom_uri));
		WT_ERR(__wt_curstat_open(
		    session, uribuf->data, cfg, &stat_cursor));

		stat_cursor->set_key(
		    stat_cursor, WT_STAT_DSRC_CACHE_EVICTION_CLEAN);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_eviction_clean, value);
		WT_STAT_INCRV(stats, bloom_page_evict, value);

		stat_cursor->set_key(
		    stat_cursor, WT_STAT_DSRC_CACHE_EVICTION_DIRTY);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_eviction_dirty, value);
		WT_STAT_INCRV(stats, bloom_page_evict, value);

		stat_cursor->set_key(
		    stat_cursor, WT_STAT_DSRC_CACHE_EVICTION_FAIL);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_eviction_fail, value);

		stat_cursor->set_key(stat_cursor, WT_STAT_DSRC_CACHE_READ);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_read, value);
		WT_STAT_INCRV(stats, bloom_page_read, value);

		stat_cursor->set_key(stat_cursor, WT_STAT_DSRC_CACHE_WRITE);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_write, value);

		WT_ERR(stat_cursor->close(stat_cursor));
	}

err:	/* Release the lock and scratch buffer on all paths. */
	if (locked)
		WT_TRET(__wt_rwunlock(session, lsm_tree->rwlock));
	__wt_scr_free(&uribuf);

	return (ret);
}