Example #1
/*
 * __stat_page_col_var --
 *	Stat a WT_PAGE_COL_VAR page.
 */
static int
__stat_page_col_var(WT_PAGE *page, WT_DSRC_STATS *stats)
{
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_UPDATE *upd;
	uint32_t i;
	int orig_deleted;

	unpack = &_unpack;

	WT_STAT_INCR(stats, btree_column_variable);

	/*
	 * Walk the page, counting regular and overflow data items, and checking
	 * to be sure any updates weren't deletions.  If the item was updated,
	 * assume it was updated by an item of the same size (it's expensive to
	 * figure out if it will require the same space or not, especially if
	 * there's Huffman encoding).
	 */
	WT_COL_FOREACH(page, cip, i) {
		if ((cell = WT_COL_PTR(page, cip)) == NULL) {
			orig_deleted = 1;
			WT_STAT_INCR(stats, btree_column_deleted);
		} else {
			orig_deleted = 0;
			__wt_cell_unpack(cell, unpack);
			WT_STAT_INCRV(
			    stats, btree_entries, __wt_cell_rle(unpack));
		}

		/*
		 * Walk the insert list, checking for changes.  For each insert
		 * we find, correct the original count based on its state.
		 */
		WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
			upd = ins->upd;
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				if (orig_deleted)
					continue;
				WT_STAT_INCR(stats, btree_column_deleted);
				WT_STAT_DECR(stats, btree_entries);
			} else {
				if (!orig_deleted)
					continue;
				WT_STAT_DECR(stats, btree_column_deleted);
				WT_STAT_INCR(stats, btree_entries);
			}
		}
	}

	return (0);
}
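
To make the correction rule above easier to check in isolation, here is a minimal, self-contained sketch in plain C. It is not WiredTiger code: the slot struct and count_slot are hypothetical stand-ins for a WT_COL slot and its skiplist of updates. The original cell contributes either one deletion or rle entries, and each update then flips one record between the two tallies.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical miniature of one variable-length column-store slot. */
struct slot {
	bool	 orig_deleted;		/* original cell missing/deleted */
	uint64_t rle;			/* repeat count of the original cell */
	int	 nupd;			/* number of per-record updates */
	bool	 upd_deleted[4];	/* per-update deleted flags */
};

/*
 * count_slot --
 *	Count entries and deletions for one slot, correcting the original
 * RLE-based count by the per-record updates, as in __stat_page_col_var.
 */
static void
count_slot(const struct slot *s, uint64_t *entries, uint64_t *deleted)
{
	int i;

	if (s->orig_deleted)
		++*deleted;
	else
		*entries += s->rle;

	for (i = 0; i < s->nupd; ++i) {
		if (s->upd_deleted[i]) {
			if (s->orig_deleted)
				continue;	/* deletion already counted */
			++*deleted;
			--*entries;
		} else {
			if (!s->orig_deleted)
				continue;	/* live entry already counted */
			--*deleted;
			++*entries;
		}
	}
}

int
main(void)
{
	/* A 10-record RLE cell with one record subsequently deleted. */
	struct slot s = { false, 10, 1, { true } };
	uint64_t entries = 0, deleted = 0;

	count_slot(&s, &entries, &deleted);
	printf("entries=%llu deleted=%llu\n",	/* entries=9 deleted=1 */
	    (unsigned long long)entries, (unsigned long long)deleted);
	return (0);
}

As in the example, an updated value is assumed to occupy the same space as the original, so only the live/deleted state is corrected, never the sizes.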
Example #2
/*
 * __ovfl_cache_col_visible --
 *	column-store: check for a globally visible update.
 */
static bool
__ovfl_cache_col_visible(
    WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack)
{
	/*
	 * Column-store is harder than row-store: we're here because there's a
	 * reader in the system that might read the original version of an
	 * overflow record, which might match a number of records.  For example,
	 * the original overflow value was for records 100-200, we've replaced
	 * each of those records individually, but there exists a reader that
	 * might read any one of those records, and all of those records have
	 * different update entries with different transaction IDs.  Since it's
	 * infeasible to determine if there's a globally visible update for each
	 * reader for each record, we test the simple case where a single record
	 * has a single, globally visible update.  If that's not the case, cache
	 * the value.
	 */
	if (__wt_cell_rle(unpack) == 1 &&
	    upd != NULL &&		/* Sanity: upd should always be set. */
	    __wt_txn_visible_all(session, upd->txnid))
		return (true);
	return (false);
}
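
As a rough illustration of why that fast path is safe, consider the following toy sketch, assuming a simplified visibility model in which an update is globally visible once its transaction ID precedes the oldest running transaction. All names here are hypothetical, not the WiredTiger API.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct upd {
	uint64_t txnid;		/* transaction that made the update */
};

/* Hypothetical: visible to everyone once it precedes all running txns. */
static bool
txn_visible_all(uint64_t oldest_running_txn, uint64_t txnid)
{
	return (txnid < oldest_running_txn);
}

/*
 * ovfl_value_needed --
 *	Mirror of the test in __ovfl_cache_col_visible: the original overflow
 * value can be discarded only when the cell covers a single record and that
 * record has a globally visible replacement; otherwise, cache it.
 */
static bool
ovfl_value_needed(uint64_t rle, const struct upd *upd, uint64_t oldest)
{
	if (rle == 1 && upd != NULL && txn_visible_all(oldest, upd->txnid))
		return (false);		/* no reader can need the old value */
	return (true);			/* conservatively keep it */
}

int
main(void)
{
	struct upd u = { 5 };

	/* Oldest running transaction is 10: update 5 is globally visible. */
	printf("%d\n", ovfl_value_needed(1, &u, 10));	/* 0: discard */
	printf("%d\n", ovfl_value_needed(3, &u, 10));	/* 1: cache */
	return (0);
}

The conservative default (cache the value) is what makes the simplification safe: testing only the single-record case can produce false "needed" answers, never false "not needed" ones.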
Example #3
/*
 * __cursor_var_prev --
 *	Move to the previous, variable-length column-store item.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_ITEM *val;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t rle_start;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	val = &cbt->iface.value;

	rle_start = 0;			/* -Werror=maybe-uninitialized */

	/* Initialize for each new page. */
	if (newpage) {
		cbt->last_standard_recno = __col_var_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
		goto new_page;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		__cursor_set_recno(cbt, cbt->recno - 1);

new_page:	if (cbt->recno < page->pg_var_recno)
			return (WT_NOTFOUND);

		/* Find the matching WT_COL slot. */
		if ((cip =
		    __col_var_search(page, cbt->recno, &rle_start)) == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Check any insert list for a matching record. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = cbt->ins == NULL ?
		    NULL : __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				if (__wt_txn_visible_all(session, upd->txnid))
					++cbt->page_deleted_count;
				continue;
			}

			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/*
		 * If we're at the same slot as the last reference and there's
		 * no matching insert list item, re-use the return information
		 * (so encoded items with large repeat counts aren't repeatedly
		 * decoded).  Otherwise, unpack the cell and build the return
		 * information.
		 */
		if (cbt->cip_saved != cip) {
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL) {
				if (__wt_cell_rle(&unpack) == 1)
					continue;
				/*
				 * There can be huge gaps in the variable-length
				 * column-store name space appearing as deleted
				 * records. If more than one deleted record, do
				 * the work of finding the next record to return
				 * instead of looping through the records.
				 *
				 * First, find the largest record in the update
				 * list that's smaller than the current record.
				 */
				ins = __col_insert_search_lt(
				    cbt->ins_head, cbt->recno);

				/*
				 * Second, for records with RLEs greater than 1,
				 * the above call to __col_var_search located
				 * this record in the page's list of repeating
				 * records, and returned the starting record.
				 * The starting record - 1 is the record to
				 * which we could skip, if there was no larger
				 * record in the update list.
				 */
				cbt->recno = rle_start - 1;
				if (ins != NULL &&
				    WT_INSERT_RECNO(ins) > cbt->recno)
					cbt->recno = WT_INSERT_RECNO(ins);

				/* Adjust for the outer loop decrement. */
				++cbt->recno;
				continue;
			}
			WT_RET(__wt_page_cell_data_ref(
			    session, page, &unpack, cbt->tmp));

			cbt->cip_saved = cip;
		}
		val->data = cbt->tmp->data;
		val->size = cbt->tmp->size;
		return (0);
	}
	/* NOTREACHED */
}
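
The deleted-gap skip in the middle of that loop is terse; the following self-contained sketch (hypothetical names, no WiredTiger types) isolates the same arithmetic: jump to just below the deleted run's starting record unless an insert-list record lies inside the gap, then add one so the outer loop's decrement lands on the intended record.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/*
 * prev_skip_target --
 *	Given the starting record of a deleted RLE run and, optionally, the
 * largest insert-list record below the cursor, return the record number to
 * store so that the outer loop's "recno - 1" lands on the right record.
 */
static uint64_t
prev_skip_target(uint64_t rle_start, bool have_ins, uint64_t ins_recno)
{
	uint64_t recno;

	recno = rle_start - 1;		/* first record below the run */
	if (have_ins && ins_recno > recno)
		recno = ins_recno;	/* an insert inside the gap wins */
	return (recno + 1);		/* compensate the loop decrement */
}

int
main(void)
{
	/* A deleted run covers records 100-200; the cursor is inside it. */
	printf("%llu\n",		/* 100: the loop decrements to 99 */
	    (unsigned long long)prev_skip_target(100, false, 0));
	printf("%llu\n",		/* 151: the loop decrements to 150 */
	    (unsigned long long)prev_skip_target(100, true, 150));
	return (0);
}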
Example #4
/*
 * __cursor_var_next --
 *	Move to the next, variable-length column-store item.
 */
static inline int
__cursor_var_next(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_ITEM *val;
	WT_INSERT *ins;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t rle, rle_start;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	val = &cbt->iface.value;

	rle_start = 0;			/* -Werror=maybe-uninitialized */

	if (newpage) {
		cbt->last_standard_recno = __col_var_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, page->pg_var_recno);
		goto new_page;
	}

	for (;;) {
		if (cbt->recno >= cbt->last_standard_recno)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->recno + 1);

	new_page:
		/* Find the WT_COL slot matching the record number. */
		if ((cip = __col_var_search(page, cbt->recno, &rle_start)) == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Check the insert list for an update to this record. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				++cbt->page_deleted_count;
				continue;
			}

			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/*
		* If we're at the same slot as the last reference and there's
		* no matching insert list item, re-use the return information
		* (so encoded items with large repeat counts aren't repeatedly
		* decoded).  Otherwise, unpack the cell and build the return
		* information.
		* upd == NULL: the record may have been deleted and moved into
		* the insert list, and the slot may have been reused, so unpack
		* the cell to fetch the value.
		*/

		if (cbt->cip_saved != cip) {
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL) {
				if ((rle = __wt_cell_rle(&unpack)) == 1)
					continue;

				/*
				 * Find the smallest record in the update list
				 * that's larger than the current record.
				 */
				ins = __col_insert_search_gt(cbt->ins_head, cbt->recno);
				cbt->recno = rle_start + rle;
				if (ins != NULL && WT_INSERT_RECNO(ins) < cbt->recno)
					cbt->recno = WT_INSERT_RECNO(ins);

				/* Adjust for the outer loop increment. */
				--cbt->recno;
				continue;
			}

			/* Copy the cell's value into cbt->tmp. */
			WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cbt->tmp));
			cbt->cip_saved = cip;
		}
		val->data = cbt->tmp.data;
		val->size = cbt->tmp.size;
		return (0);
	}
}
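
A detail shared by both cursor functions above is the cip_saved cache: when successive records resolve to the same WT_COL slot and no insert-list entry matches, the previously decoded value is returned without unpacking the cell again. Here is a toy sketch of that pattern, with the expensive decode faked by a counter (all names hypothetical):

#include <stdio.h>

/* Hypothetical one-entry decoded-value cache, keyed by slot. */
struct cursor {
	int slot_saved;		/* last slot decoded, -1 if none */
	int value;		/* decoded value for slot_saved */
	int decode_calls;	/* instrumentation only */
};

/*
 * decode_cell --
 *	Fake "expensive" decode; in WiredTiger this is cell unpacking plus,
 * possibly, Huffman decoding or an overflow-page read.
 */
static int
decode_cell(struct cursor *c, int slot)
{
	++c->decode_calls;
	return (slot * 1000);
}

static int
cursor_value(struct cursor *c, int slot)
{
	if (c->slot_saved != slot) {	/* same idea as cip_saved != cip */
		c->value = decode_cell(c, slot);
		c->slot_saved = slot;
	}
	return (c->value);
}

int
main(void)
{
	struct cursor c = { -1, 0, 0 };
	int recno;

	/* 100 consecutive records that all land in one high-RLE slot. */
	for (recno = 0; recno < 100; ++recno)
		(void)cursor_value(&c, 7);
	printf("decodes: %d\n", c.decode_calls);	/* 1, not 100 */
	return (0);
}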
Example #5
/*
 * __verify_tree --
 *	Verify a tree, recursively descending through it in depth-first fashion.
 * The page argument was physically verified (so we know it's correctly formed),
 * and the in-memory version built.  Our job is to check logical relationships
 * in the page and in the tree.
 */
static int
__verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
{
	WT_BM *bm;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *child_ref;
	uint64_t recno;
	uint32_t entry, i;
	bool found;

	bm = S2BT(session)->bm;
	page = ref->page;

	unpack = &_unpack;
	WT_CLEAR(*unpack);	/* -Wuninitialized */

	WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s",
	    __wt_page_addr_string(session, ref, vs->tmp1),
	    __wt_page_type_string(page->type)));

	/* Optionally dump the address. */
	if (vs->dump_address)
		WT_RET(__wt_msg(session, "%s %s",
		    __wt_page_addr_string(session, ref, vs->tmp1),
		    __wt_page_type_string(page->type)));

	/* Track the shape of the tree. */
	if (WT_PAGE_IS_INTERNAL(page))
		++vs->depth_internal[
		    WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)];
	else
		++vs->depth_leaf[
		    WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)];

	/*
	 * The page's physical structure was verified when it was read into
	 * memory by the read server thread, and then the in-memory version
	 * of the page was built. Now we make sure the page and tree are
	 * logically consistent.
	 *
	 * !!!
	 * The problem: (1) the read server has to build the in-memory version
	 * of the page because the read server is the thread that flags when
	 * any thread can access the page in the tree; (2) we can't build the
	 * in-memory version of the page until the physical structure is known
	 * to be OK, so the read server has to verify at least the physical
	 * structure of the page; (3) doing complete page verification requires
	 * reading additional pages (for example, overflow keys imply reading
	 * overflow pages in order to test the key's order in the page); (4)
	 * the read server cannot read additional pages because it will hang
	 * waiting on itself.  For this reason, we split page verification
	 * into a physical verification, which allows the in-memory version
	 * of the page to be built, and then a subsequent logical verification
	 * which happens here.
	 *
	 * Report progress occasionally.
	 */
#define	WT_VERIFY_PROGRESS_INTERVAL	100
	if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0)
		WT_RET(__wt_progress(session, NULL, vs->fcnt));

#ifdef HAVE_DIAGNOSTIC
	/* Optionally dump the blocks or page in debugging mode. */
	if (vs->dump_blocks)
		WT_RET(__wt_debug_disk(session, page->dsk, NULL));
	if (vs->dump_pages)
		WT_RET(__wt_debug_page(session, page, NULL));
#endif

	/*
	 * Column-store key order checks: check the page's record number and
	 * then update the total record count.
	 */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		recno = page->pg_fix_recno;
		goto recno_chk;
	case WT_PAGE_COL_INT:
		recno = page->pg_intl_recno;
		goto recno_chk;
	case WT_PAGE_COL_VAR:
		recno = page->pg_var_recno;
recno_chk:	if (recno != vs->record_total + 1)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s has a starting record of %" PRIu64
			    " when the expected starting record is %" PRIu64,
			    __wt_page_addr_string(session, ref, vs->tmp1),
			    recno, vs->record_total + 1);
		break;
	}
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		vs->record_total += page->pg_fix_entries;
		break;
	case WT_PAGE_COL_VAR:
		recno = 0;
		WT_COL_FOREACH(page, cip, i)
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				++recno;
			else {
				__wt_cell_unpack(cell, unpack);
				recno += __wt_cell_rle(unpack);
			}
		vs->record_total += recno;
		break;
	}

	/*
	 * Row-store leaf page key order check: it's a depth-first traversal;
	 * the first key on this page should be larger than any key previously
	 * seen.
	 */
	switch (page->type) {
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_row_leaf_key_order(session, ref, vs));
		break;
	}

	/* If it's not the root page, unpack the parent cell. */
	if (!__wt_ref_is_root(ref)) {
		__wt_cell_unpack(ref->addr, unpack);

		/* Compare the parent cell against the page type. */
		switch (page->type) {
		case WT_PAGE_COL_FIX:
			if (unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_COL_VAR:
			if (unpack->raw != WT_CELL_ADDR_LEAF &&
			    unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_ROW_LEAF:
			if (unpack->raw != WT_CELL_ADDR_DEL &&
			    unpack->raw != WT_CELL_ADDR_LEAF &&
			    unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_COL_INT:
		case WT_PAGE_ROW_INT:
			if (unpack->raw != WT_CELL_ADDR_INT)
celltype_err:			WT_RET_MSG(session, WT_ERROR,
				    "page at %s, of type %s, is referenced in "
				    "its parent by a cell of type %s",
				    __wt_page_addr_string(
					session, ref, vs->tmp1),
				    __wt_page_type_string(page->type),
				    __wt_cell_type_string(unpack->raw));
			break;
		}
	}

	/*
	 * Check overflow pages.  We check overflow cells separately from other
	 * tests that walk the page as it's simpler, and I don't care much how
	 * fast table verify runs.
	 */
	switch (page->type) {
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_overflow_cell(session, ref, &found, vs));
		if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT)
			break;

		/*
		 * Object if a leaf-no-overflow address cell references a page
		 * with overflow keys, but don't object if a leaf address cell
		 * references a page without overflow keys.  Reconciliation
		 * doesn't guarantee every leaf page without overflow items will
		 * be a leaf-no-overflow type.
		 */
		if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s, of type %s and referenced in its "
			    "parent by a cell of type %s, contains overflow "
			    "items",
			    __wt_page_addr_string(session, ref, vs->tmp1),
			    __wt_page_type_string(page->type),
			    __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO));
		break;
	}

	/* Check tree connections and recursively descend the tree. */
	switch (page->type) {
	case WT_PAGE_COL_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * record number should be 1 more than the total records
			 * reviewed to this point.
			 */
			++entry;
			if (child_ref->key.recno != vs->record_total + 1) {
				WT_RET_MSG(session, WT_ERROR,
				    "the starting record number in entry %"
				    PRIu32 " of the column internal page at "
				    "%s is %" PRIu64 " and the expected "
				    "starting record number is %" PRIu64,
				    entry,
				    __wt_page_addr_string(
				    session, child_ref, vs->tmp1),
				    child_ref->key.recno,
				    vs->record_total + 1);
			}

			/* Verify the subtree. */
			++vs->depth;
			WT_RET(__wt_page_in(session, child_ref, 0));
			ret = __verify_tree(session, child_ref, vs);
			WT_TRET(__wt_page_release(session, child_ref, 0));
			--vs->depth;
			WT_RET(ret);

			__wt_cell_unpack(child_ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		} WT_INTL_FOREACH_END;
		break;
	case WT_PAGE_ROW_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * key should be larger than the largest key previously
			 * reviewed.
			 *
			 * The 0th key of any internal page is magic, and we
			 * can't test against it.
			 */
			++entry;
			if (entry != 1)
				WT_RET(__verify_row_int_key_order(
				    session, page, child_ref, entry, vs));

			/* Verify the subtree. */
			++vs->depth;
			WT_RET(__wt_page_in(session, child_ref, 0));
			ret = __verify_tree(session, child_ref, vs);
			WT_TRET(__wt_page_release(session, child_ref, 0));
			--vs->depth;
			WT_RET(ret);

			__wt_cell_unpack(child_ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		} WT_INTL_FOREACH_END;
Example #6
/*
 * __stat_page_col_var --
 *	Stat a WT_PAGE_COL_VAR page.
 */
static void
__stat_page_col_var(
    WT_SESSION_IMPL *session, WT_PAGE *page, WT_DSRC_STATS **stats)
{
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_UPDATE *upd;
	uint64_t deleted_cnt, entry_cnt, ovfl_cnt, rle_cnt;
	uint32_t i;
	bool orig_deleted;

	unpack = &_unpack;
	deleted_cnt = entry_cnt = ovfl_cnt = rle_cnt = 0;

	WT_STAT_INCR(session, stats, btree_column_variable);

	/*
	 * Walk the page counting regular items, adjusting if the item has been
	 * subsequently deleted or not. This is a mess because a 10-item RLE
	 * might have 3 of its items subsequently deleted. Overflow items are
	 * harder: we can't know if an updated item will be an overflow item or
	 * not; do
	 * our best, and simply count every overflow item (or RLE set of items)
	 * we see.
	 */
	WT_COL_FOREACH(page, cip, i) {
		if ((cell = WT_COL_PTR(page, cip)) == NULL) {
			orig_deleted = true;
			++deleted_cnt;
		} else {
			orig_deleted = false;
			__wt_cell_unpack(cell, unpack);
			if (unpack->type == WT_CELL_ADDR_DEL)
				orig_deleted = true;
			else {
				entry_cnt += __wt_cell_rle(unpack);
				rle_cnt += __wt_cell_rle(unpack) - 1;
			}
			if (unpack->ovfl)
				++ovfl_cnt;
		}

		/*
		 * Walk the insert list, checking for changes.  For each insert
		 * we find, correct the original count based on its state.
		 */
		WT_SKIP_FOREACH(ins, WT_COL_UPDATE(page, cip)) {
			upd = ins->upd;
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				if (!orig_deleted) {
					++deleted_cnt;
					--entry_cnt;
				}
			} else
				if (orig_deleted) {
					--deleted_cnt;
					++entry_cnt;
				}
		}
	}

	WT_STAT_INCRV(session, stats, btree_column_deleted, deleted_cnt);
	WT_STAT_INCRV(session, stats, btree_column_rle, rle_cnt);
	WT_STAT_INCRV(session, stats, btree_entries, entry_cnt);
	WT_STAT_INCRV(session, stats, btree_overflow, ovfl_cnt);
}
Example #7
/*
 * __verify_tree --
 *	Verify a tree, recursively descending through it in depth-first fashion.
 * The page argument was physically verified (so we know it's correctly formed),
 * and the in-memory version built.  Our job is to check logical relationships
 * in the page and in the tree.
 */
static int
__verify_tree(WT_SESSION_IMPL *session, WT_PAGE *page, WT_VSTUFF *vs)
{
	WT_BM *bm;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_DECL_RET;
	WT_REF *ref;
	uint64_t recno;
	uint32_t entry, i;
	int found, lno;

	bm = S2BT(session)->bm;
	unpack = &_unpack;

	WT_VERBOSE_RET(session, verify, "%s %s",
	    __wt_page_addr_string(session, vs->tmp1, page),
	    __wt_page_type_string(page->type));
#ifdef HAVE_DIAGNOSTIC
	if (vs->dump_address)
		WT_RET(__wt_msg(session, "%s %s",
		    __wt_page_addr_string(session, vs->tmp1, page),
		    __wt_page_type_string(page->type)));
#endif

	/*
	 * The page's physical structure was verified when it was read into
	 * memory by the read server thread, and then the in-memory version
	 * of the page was built. Now we make sure the page and tree are
	 * logically consistent.
	 *
	 * !!!
	 * The problem: (1) the read server has to build the in-memory version
	 * of the page because the read server is the thread that flags when
	 * any thread can access the page in the tree; (2) we can't build the
	 * in-memory version of the page until the physical structure is known
	 * to be OK, so the read server has to verify at least the physical
	 * structure of the page; (3) doing complete page verification requires
	 * reading additional pages (for example, overflow keys imply reading
	 * overflow pages in order to test the key's order in the page); (4)
	 * the read server cannot read additional pages because it will hang
	 * waiting on itself.  For this reason, we split page verification
	 * into a physical verification, which allows the in-memory version
	 * of the page to be built, and then a subsequent logical verification
	 * which happens here.
	 *
	 * Report progress every 10 pages.
	 */
	if (++vs->fcnt % 10 == 0)
		WT_RET(__wt_progress(session, NULL, vs->fcnt));

#ifdef HAVE_DIAGNOSTIC
	/* Optionally dump the page in debugging mode. */
	if (vs->dump_blocks && page->dsk != NULL)
		WT_RET(__wt_debug_disk(session, page->dsk, NULL));
	if (vs->dump_pages)
		WT_RET(__wt_debug_page(session, page, NULL));
#endif

	/*
	 * Column-store key order checks: check the page's record number and
	 * then update the total record count.
	 */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		recno = page->u.col_fix.recno;
		goto recno_chk;
	case WT_PAGE_COL_INT:
		recno = page->u.intl.recno;
		goto recno_chk;
	case WT_PAGE_COL_VAR:
		recno = page->u.col_var.recno;
recno_chk:	if (recno != vs->record_total + 1)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s has a starting record of %" PRIu64
			    " when the expected starting record is %" PRIu64,
			    __wt_page_addr_string(session, vs->tmp1, page),
			    recno, vs->record_total + 1);
		break;
	}
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		vs->record_total += page->entries;
		break;
	case WT_PAGE_COL_VAR:
		recno = 0;
		WT_COL_FOREACH(page, cip, i)
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				++recno;
			else {
				__wt_cell_unpack(cell, unpack);
				recno += __wt_cell_rle(unpack);
			}
		vs->record_total += recno;
		break;
	}

	/*
	 * Row-store leaf page key order check: it's a depth-first traversal;
	 * the first key on this page should be larger than any key previously
	 * seen.
	 */
	switch (page->type) {
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_row_leaf_key_order(session, page, vs));
		break;
	}

	/*
	 * Check overflow pages.  We check overflow cells separately from other
	 * tests that walk the page as it's simpler, and I don't care much how
	 * fast table verify runs.
	 *
	 * Object if a leaf-no-overflow address cell references a page that has
	 * overflow keys, but don't object if a standard address cell references
	 * a page without overflow keys.  The leaf-no-overflow address cell is
	 * an optimization for trees with few, if any, overflow items, and
	 * may not be set by reconciliation in all possible cases.
	 */
	if (WT_PAGE_IS_ROOT(page))
		lno = 0;
	else {
		__wt_cell_unpack(page->ref->addr, unpack);
		lno = unpack->raw == WT_CELL_ADDR_LNO ? 1 : 0;
	}
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		break;
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_overflow_cell(session, page, &found, vs));
		if (found && lno)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s referenced in its parent by a cell of "
			    "type %s illegally contains overflow items",
			    __wt_page_addr_string(session, vs->tmp1, page),
			    __wt_cell_type_string(WT_CELL_ADDR_LNO));
		break;
	default:
		if (lno)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s is of type %s and is illegally "
			    "referenced in its parent by a cell of type %s",
			    __wt_page_addr_string(session, vs->tmp1, page),
			    __wt_page_type_string(page->type),
			    __wt_cell_type_string(WT_CELL_ADDR_LNO));
		break;
	}

	/* Check tree connections and recursively descend the tree. */
	switch (page->type) {
	case WT_PAGE_COL_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_REF_FOREACH(page, ref, i) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * record number should be 1 more than the total records
			 * reviewed to this point.
			 */
			++entry;
			if (ref->u.recno != vs->record_total + 1) {
				__wt_cell_unpack(ref->addr, unpack);
				WT_RET_MSG(session, WT_ERROR,
				    "the starting record number in entry %"
				    PRIu32 " of the column internal page at "
				    "%s is %" PRIu64 " and the expected "
				    "starting record number is %" PRIu64,
				    entry,
				    __wt_page_addr_string(
				    session, vs->tmp1, page),
				    ref->u.recno,
				    vs->record_total + 1);
			}

			/* Verify the subtree. */
			WT_RET(__wt_page_in(session, page, ref));
			ret = __verify_tree(session, ref->page, vs);
			WT_TRET(__wt_page_release(session, ref->page));
			WT_RET(ret);

			__wt_cell_unpack(ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		}
		break;
	case WT_PAGE_ROW_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_REF_FOREACH(page, ref, i) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * key should be larger than the largest key previously
			 * reviewed.
			 *
			 * The 0th key of any internal page is magic, and we
			 * can't test against it.
			 */
			++entry;
			if (entry != 1)
				WT_RET(__verify_row_int_key_order(
				    session, page, ref, entry, vs));

			/* Verify the subtree. */
			WT_RET(__wt_page_in(session, page, ref));
			ret = __verify_tree(session, ref->page, vs);
			WT_TRET(__wt_page_release(session, ref->page));
			WT_RET(ret);

			__wt_cell_unpack(ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		}
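
Stepping back from the WiredTiger specifics, the column-store half of __verify_tree reduces to one invariant: in a depth-first walk, each page's starting record number must equal the count of records seen so far plus one. A compact, self-contained sketch of that check (toy page model, hypothetical names):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Toy leaf page: a starting recno and a record count (RLE already summed). */
struct leaf {
	uint64_t start_recno;
	uint64_t nrecords;
};

/*
 * verify_leaves --
 *	Check the depth-first record-number invariant used by __verify_tree:
 * each page must start at (records seen so far) + 1.
 */
static bool
verify_leaves(const struct leaf *leaves, int n)
{
	uint64_t record_total;
	int i;

	record_total = 0;
	for (i = 0; i < n; ++i) {
		if (leaves[i].start_recno != record_total + 1) {
			fprintf(stderr,
			    "leaf %d starts at %llu, expected %llu\n", i,
			    (unsigned long long)leaves[i].start_recno,
			    (unsigned long long)(record_total + 1));
			return (false);
		}
		record_total += leaves[i].nrecords;
	}
	return (true);
}

int
main(void)
{
	struct leaf ok[] = { { 1, 50 }, { 51, 20 }, { 71, 5 } };
	struct leaf bad[] = { { 1, 50 }, { 52, 20 } };	/* gap at record 51 */

	printf("%d\n", verify_leaves(ok, 3));	/* 1 */
	printf("%d\n", verify_leaves(bad, 2));	/* 0 */
	return (0);
}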