/*
 * __wt_debug_disk --
 *	Dump a disk page in debugging mode.
 *
 * Writes a human-readable description of the page header (type, record
 * number/entry counts, flags, write generation) followed by the page's
 * contents, either to the named file or, if ofile is NULL, wherever
 * __debug_config routes output.  Returns 0 on success or a WiredTiger
 * error code from the underlying output functions.
 */
int
__wt_debug_disk(
    WT_SESSION_IMPL *session, const WT_PAGE_HEADER *dsk, const char *ofile)
{
	WT_DBG *ds, _ds;

	ds = &_ds;
	/* Set up the output stream (file or message handler). */
	WT_RET(__debug_config(session, ds, ofile));

	/* Header line: page type plus the type-specific header union field. */
	WT_RET(ds->f(ds, "%s page", __wt_page_type_string(dsk->type)));
	switch (dsk->type) {
	case WT_PAGE_BLOCK_MANAGER:
		break;
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		/* Column-store pages carry a starting record number. */
		WT_RET(ds->f(ds, ", recno %" PRIu64, dsk->recno));
		/* FALLTHROUGH */
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(ds->f(ds, ", entries %" PRIu32, dsk->u.entries));
		break;
	case WT_PAGE_OVFL:
		WT_RET(ds->f(ds, ", datalen %" PRIu32, dsk->u.datalen));
		break;
	/* NOTE(review): WT_ILLEGAL_VALUE supplies the default: label. */
	WT_ILLEGAL_VALUE(session);
	}

	/* Decode the page-header flag bits. */
	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
		WT_RET(ds->f(ds, ", compressed"));
	if (F_ISSET(dsk, WT_PAGE_ENCRYPTED))
		WT_RET(ds->f(ds, ", encrypted"));
	if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
		WT_RET(ds->f(ds, ", empty-all"));
	if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
		WT_RET(ds->f(ds, ", empty-none"));
	if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE))
		WT_RET(ds->f(ds, ", LAS-update"));

	WT_RET(ds->f(ds, ", generation %" PRIu64 "\n", dsk->write_gen));

	/* Dump the page contents; block-manager pages have no cells to dump. */
	switch (dsk->type) {
	case WT_PAGE_BLOCK_MANAGER:
		break;
	case WT_PAGE_COL_FIX:
		WT_RET(__debug_dsk_col_fix(ds, dsk));
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(__debug_dsk_cell(ds, dsk));
		break;
	default:
		break;
	}

	/* Flush/close the output stream and return its status. */
	return (__dmsg_wrapup(ds));
}
/*
 * __wt_debug_disk --
 *	Dump a disk page in debugging mode.
 *
 * Older variant: output goes through __dmsg, whose errors are not checked
 * here; only __debug_dsk_cell's return feeds the function's result.
 */
int
__wt_debug_disk(
    WT_SESSION_IMPL *session, WT_PAGE_HEADER *dsk, const char *ofile)
{
	WT_DBG *ds, _ds;
	WT_DECL_RET;		/* ret starts as 0 */

	ds = &_ds;
	WT_RET(__debug_config(session, ds, ofile));

	/* Header line: page type plus the type-specific header union field. */
	__dmsg(ds, "%s page", __wt_page_type_string(dsk->type));
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		__dmsg(ds, ", recno %" PRIu64, dsk->recno);
		/* FALLTHROUGH */
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		__dmsg(ds, ", entries %" PRIu32 "\n", dsk->u.entries);
		break;
	case WT_PAGE_OVFL:
		__dmsg(ds, ", datalen %" PRIu32 "\n", dsk->u.datalen);
		break;
	/*
	 * NOTE(review): no WT_PAGE_BLOCK_MANAGER case here, so such a page
	 * would presumably take the WT_ILLEGAL_VALUE path -- confirm intended.
	 */
	WT_ILLEGAL_VALUE(session);
	}

	/* Dump the page contents by type. */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
		__debug_dsk_col_fix(ds, dsk);
		break;
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		ret = __debug_dsk_cell(ds, dsk);
		break;
	default:
		break;
	}

	__dmsg_wrapup(ds);

	return (ret);
}
/*
 * __wt_verify_dsk_image --
 *	Verify a single block as read from disk.
 *
 * Checks the page header (type, record number, flags, unused bytes), the
 * trailing padding, and then dispatches to the type-specific cell/chunk
 * verification routines.  The tag names the block in error messages; size
 * is the full block size (0 means "unknown", skipping the trailing-byte
 * check); empty_page_ok suppresses the no-entries complaint.
 */
int
__wt_verify_dsk_image(WT_SESSION_IMPL *session,
    const char *tag, const WT_PAGE_HEADER *dsk, size_t size, int empty_page_ok)
{
	const uint8_t *p, *end;
	u_int i;
	uint8_t flags;

	/* Check the page type. */
	switch (dsk->type) {
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_OVFL:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		break;
	case WT_PAGE_INVALID:
	default:
		WT_RET_VRFY(session,
		    "page at %s has an invalid type of %" PRIu32,
		    tag, dsk->type);
	}

	/*
	 * Check the page record number: column-store pages must have one,
	 * all other page types must not.  (WT_RET_VRFY returns, so there is
	 * no actual fallthrough between the two case groups.)
	 */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		if (dsk->recno != 0)
			break;
		WT_RET_VRFY(session,
		    "%s page at %s has a record number of zero",
		    __wt_page_type_string(dsk->type), tag);
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		if (dsk->recno == 0)
			break;
		WT_RET_VRFY(session,
		    "%s page at %s has a non-zero record number",
		    __wt_page_type_string(dsk->type), tag);
	}

	/*
	 * Check the page flags: clear each legal bit from a copy, anything
	 * left over is illegal.  The empty-value bits are only legal on
	 * row-store leaf pages, and are mutually exclusive.
	 */
	flags = dsk->flags;
	if (LF_ISSET(WT_PAGE_COMPRESSED))
		LF_CLR(WT_PAGE_COMPRESSED);
	if (LF_ISSET(WT_PAGE_ENCRYPTED))
		LF_CLR(WT_PAGE_ENCRYPTED);
	if (dsk->type == WT_PAGE_ROW_LEAF) {
		if (LF_ISSET(WT_PAGE_EMPTY_V_ALL) &&
		    LF_ISSET(WT_PAGE_EMPTY_V_NONE))
			WT_RET_VRFY(session,
			    "page at %s has invalid flags combination: 0x%"
			    PRIx8,
			    tag, dsk->flags);
		if (LF_ISSET(WT_PAGE_EMPTY_V_ALL))
			LF_CLR(WT_PAGE_EMPTY_V_ALL);
		if (LF_ISSET(WT_PAGE_EMPTY_V_NONE))
			LF_CLR(WT_PAGE_EMPTY_V_NONE);
	}
	if (flags != 0)
		WT_RET_VRFY(session,
		    "page at %s has invalid flags set: 0x%" PRIx8,
		    tag, flags);

	/*
	 * Unused bytes: all must be zero.
	 *
	 * FIX: the original loop tested *p without ever advancing p, so only
	 * the first unused byte was ever checked (sizeof(dsk->unused) times);
	 * advance the pointer with each iteration.
	 */
	for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i)
		if (*p++ != '\0')
			WT_RET_VRFY(session,
			    "page at %s has non-zero unused page header bytes",
			    tag);

	/*
	 * Any bytes after the data chunk should be nul bytes; ignore if the
	 * size is 0, that allows easy checking of disk images where we don't
	 * have the size.
	 */
	if (size != 0) {
		p = (uint8_t *)dsk + dsk->mem_size;
		end = (uint8_t *)dsk + size;
		for (; p < end; ++p)
			if (*p != '\0')
				WT_RET_VRFY(session,
				    "%s page at %s has non-zero trailing bytes",
				    __wt_page_type_string(dsk->type), tag);
	}

	/* Check for empty pages, then verify the items on the page. */
	switch (dsk->type) {
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		if (!empty_page_ok && dsk->u.entries == 0)
			WT_RET_VRFY(session, "%s page at %s has no entries",
			    __wt_page_type_string(dsk->type), tag);
		break;
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
		if (dsk->u.datalen == 0)
			WT_RET_VRFY(session, "%s page at %s has no data",
			    __wt_page_type_string(dsk->type), tag);
		break;
	}

	/* Dispatch to the type-specific verification routine. */
	switch (dsk->type) {
	case WT_PAGE_COL_INT:
		return (__verify_dsk_col_int(session, tag, dsk));
	case WT_PAGE_COL_FIX:
		return (__verify_dsk_col_fix(session, tag, dsk));
	case WT_PAGE_COL_VAR:
		return (__verify_dsk_col_var(session, tag, dsk));
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		return (__verify_dsk_row(session, tag, dsk));
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
		return (__verify_dsk_chunk(session, tag, dsk, dsk->u.datalen));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
/*
 * __wt_verify_dsk --
 *	Verify a single Btree page as read from disk.
 *
 * Checks the page header (type, record number, flags, unused bytes) and
 * trailing padding in the buffer, then dispatches to the type-specific
 * verification routines.  The addr string names the page in errors.
 */
int
__wt_verify_dsk(WT_SESSION_IMPL *session, const char *addr, WT_ITEM *buf)
{
	WT_PAGE_HEADER *dsk;
	uint32_t size;
	uint8_t *p, *end;
	u_int i;

	dsk = buf->mem;
	size = buf->size;

	/* Check the page type. */
	switch (dsk->type) {
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
	case WT_PAGE_OVFL:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		break;
	case WT_PAGE_INVALID:
	default:
		WT_RET_VRFY(session,
		    "page at %s has an invalid type of %" PRIu32,
		    addr, dsk->type);
	}

	/*
	 * Check the page record number: column-store pages must have one,
	 * all other page types must not.  (WT_RET_VRFY returns, so there is
	 * no actual fallthrough between the two case groups.)
	 */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		if (dsk->recno != 0)
			break;
		WT_RET_VRFY(session,
		    "%s page at %s has a record number of zero",
		    __wt_page_type_string(dsk->type), addr);
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		if (dsk->recno == 0)
			break;
		WT_RET_VRFY(session,
		    "%s page at %s has a non-zero record number",
		    __wt_page_type_string(dsk->type), addr);
	}

	/* Check the page flags: only compression is legal in this version. */
	switch (dsk->flags) {
	case 0:
	case WT_PAGE_COMPRESSED:
		break;
	default:
		WT_RET_VRFY(session,
		    "page at %s has an invalid flags value of 0x%" PRIx32,
		    addr, (uint32_t)dsk->flags);
	}

	/*
	 * Unused bytes: all must be zero.
	 *
	 * FIX: the original loop tested *p without ever advancing p, so only
	 * the first unused byte was ever checked (sizeof(dsk->unused) times);
	 * advance the pointer with each iteration.
	 */
	for (p = dsk->unused, i = sizeof(dsk->unused); i > 0; --i)
		if (*p++ != '\0')
			WT_RET_VRFY(session,
			    "page at %s has non-zero unused page header bytes",
			    addr);

	/* Any bytes after the data chunk should be nul bytes. */
	p = (uint8_t *)dsk + dsk->mem_size;
	end = (uint8_t *)dsk + size;
	for (; p < end; ++p)
		if (*p != '\0')
			WT_RET_VRFY(session,
			    "%s page at %s has non-zero trailing bytes",
			    __wt_page_type_string(dsk->type), addr);

	/* Verify the items on the page. */
	switch (dsk->type) {
	case WT_PAGE_COL_INT:
		return (__verify_dsk_col_int(session, addr, dsk));
	case WT_PAGE_COL_FIX:
		return (__verify_dsk_col_fix(session, addr, dsk));
	case WT_PAGE_COL_VAR:
		return (__verify_dsk_col_var(session, addr, dsk));
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		return (__verify_dsk_row(session, addr, dsk));
	case WT_PAGE_BLOCK_MANAGER:
	case WT_PAGE_OVFL:
		return (__verify_dsk_chunk(session, addr, dsk, dsk->u.datalen));
	WT_ILLEGAL_VALUE(session);
	}
	/* NOTREACHED */
}
/*
 * __verify_tree --
 *	Verify a tree, recursively descending through it in depth-first fashion.
 * The page argument was physically verified (so we know it's correctly formed),
 * and the in-memory version built.  Our job is to check logical relationships
 * in the page and in the tree.
 *
 * The vs structure carries cross-page verification state: running record
 * totals, the largest key seen, scratch buffers and tree-shape counters.
 */
static int
__verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
{
	WT_BM *bm;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *child_ref;
	uint64_t recno;
	uint32_t entry, i;
	bool found;

	bm = S2BT(session)->bm;
	page = ref->page;

	unpack = &_unpack;
	WT_CLEAR(*unpack);	/* -Wuninitialized */

	WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s",
	    __wt_page_addr_string(session, ref, vs->tmp1),
	    __wt_page_type_string(page->type)));

	/* Optionally dump the address. */
	if (vs->dump_address)
		WT_RET(__wt_msg(session, "%s %s",
		    __wt_page_addr_string(session, ref, vs->tmp1),
		    __wt_page_type_string(page->type)));

	/*
	 * Track the shape of the tree.
	 * NOTE(review): the leaf branch indexes depth_leaf but clamps with
	 * WT_ELEMENTS(vs->depth_internal) -- presumably both arrays are the
	 * same size; confirm against the WT_VSTUFF declaration.
	 */
	if (WT_PAGE_IS_INTERNAL(page))
		++vs->depth_internal[
		    WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)];
	else
		++vs->depth_leaf[
		    WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)];

	/*
	 * The page's physical structure was verified when it was read into
	 * memory by the read server thread, and then the in-memory version
	 * of the page was built.  Now we make sure the page and tree are
	 * logically consistent.
	 *
	 * !!!
	 * The problem: (1) the read server has to build the in-memory version
	 * of the page because the read server is the thread that flags when
	 * any thread can access the page in the tree; (2) we can't build the
	 * in-memory version of the page until the physical structure is known
	 * to be OK, so the read server has to verify at least the physical
	 * structure of the page; (3) doing complete page verification requires
	 * reading additional pages (for example, overflow keys imply reading
	 * overflow pages in order to test the key's order in the page); (4)
	 * the read server cannot read additional pages because it will hang
	 * waiting on itself.  For this reason, we split page verification
	 * into a physical verification, which allows the in-memory version
	 * of the page to be built, and then a subsequent logical verification
	 * which happens here.
	 *
	 * Report progress occasionally.
	 */
#define	WT_VERIFY_PROGRESS_INTERVAL	100
	if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0)
		WT_RET(__wt_progress(session, NULL, vs->fcnt));

#ifdef HAVE_DIAGNOSTIC
	/* Optionally dump the blocks or page in debugging mode. */
	if (vs->dump_blocks)
		WT_RET(__wt_debug_disk(session, page->dsk, NULL));
	if (vs->dump_pages)
		WT_RET(__wt_debug_page(session, page, NULL));
#endif

	/*
	 * Column-store key order checks: check the page's record number and
	 * then update the total record count.
	 */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		recno = page->pg_fix_recno;
		goto recno_chk;
	case WT_PAGE_COL_INT:
		recno = page->pg_intl_recno;
		goto recno_chk;
	case WT_PAGE_COL_VAR:
		recno = page->pg_var_recno;
recno_chk:	if (recno != vs->record_total + 1)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s has a starting record of %" PRIu64
			    " when the expected starting record is %" PRIu64,
			    __wt_page_addr_string(session, ref, vs->tmp1),
			    recno, vs->record_total + 1);
		break;
	}
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		vs->record_total += page->pg_fix_entries;
		break;
	case WT_PAGE_COL_VAR:
		/* A NULL cell counts as one record, others by their RLE. */
		recno = 0;
		WT_COL_FOREACH(page, cip, i)
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				++recno;
			else {
				__wt_cell_unpack(cell, unpack);
				recno += __wt_cell_rle(unpack);
			}
		vs->record_total += recno;
		break;
	}

	/*
	 * Row-store leaf page key order check: it's a depth-first traversal,
	 * the first key on this page should be larger than any key previously
	 * seen.
	 */
	switch (page->type) {
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_row_leaf_key_order(session, ref, vs));
		break;
	}

	/* If it's not the root page, unpack the parent cell. */
	if (!__wt_ref_is_root(ref)) {
		__wt_cell_unpack(ref->addr, unpack);

		/* Compare the parent cell against the page type. */
		switch (page->type) {
		case WT_PAGE_COL_FIX:
			if (unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_COL_VAR:
			if (unpack->raw != WT_CELL_ADDR_LEAF &&
			    unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_ROW_LEAF:
			if (unpack->raw != WT_CELL_ADDR_DEL &&
			    unpack->raw != WT_CELL_ADDR_LEAF &&
			    unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_COL_INT:
		case WT_PAGE_ROW_INT:
			if (unpack->raw != WT_CELL_ADDR_INT)
celltype_err:			WT_RET_MSG(session, WT_ERROR,
				    "page at %s, of type %s, is referenced in "
				    "its parent by a cell of type %s",
				    __wt_page_addr_string(
					session, ref, vs->tmp1),
				    __wt_page_type_string(page->type),
				    __wt_cell_type_string(unpack->raw));
			break;
		}
	}

	/*
	 * Check overflow pages.  We check overflow cells separately from other
	 * tests that walk the page as it's simpler, and I don't care much how
	 * fast table verify runs.
	 */
	switch (page->type) {
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_overflow_cell(session, ref, &found, vs));
		if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT)
			break;

		/*
		 * Object if a leaf-no-overflow address cell references a page
		 * with overflow keys, but don't object if a leaf address cell
		 * references a page without overflow keys.  Reconciliation
		 * doesn't guarantee every leaf page without overflow items will
		 * be a leaf-no-overflow type.
		 */
		if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s, of type %s and referenced in its "
			    "parent by a cell of type %s, contains overflow "
			    "items",
			    __wt_page_addr_string(session, ref, vs->tmp1),
			    __wt_page_type_string(page->type),
			    __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO));
		break;
	}

	/* Check tree connections and recursively descend the tree. */
	switch (page->type) {
	case WT_PAGE_COL_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * record number should be 1 more than the total records
			 * reviewed to this point.
			 */
			++entry;
			if (child_ref->key.recno != vs->record_total + 1) {
				WT_RET_MSG(session, WT_ERROR,
				    "the starting record number in entry %"
				    PRIu32
				    " of the column internal page at "
				    "%s is %" PRIu64 " and the expected "
				    "starting record number is %" PRIu64,
				    entry,
				    __wt_page_addr_string(
					session, child_ref, vs->tmp1),
				    child_ref->key.recno,
				    vs->record_total + 1);
			}

			/* Verify the subtree. */
			++vs->depth;
			WT_RET(__wt_page_in(session, child_ref, 0));
			ret = __verify_tree(session, child_ref, vs);
			WT_TRET(__wt_page_release(session, child_ref, 0));
			--vs->depth;
			WT_RET(ret);

			/* And the child's address in the block manager. */
			__wt_cell_unpack(child_ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		} WT_INTL_FOREACH_END;
/*
 * __wt_verify --
 *	Verify a file.
 *
 * Allocates scratch state, parses the verify configuration, then walks
 * every (non-fake) checkpoint in the file: loads it through the block
 * manager, verifies the tree rooted there, and unloads it.  Cleanup runs
 * under the done/err labels (outside this view).
 */
int
__wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CKPT *ckptbase, *ckpt;
	WT_DECL_RET;
	WT_VSTUFF *vs, _vstuff;
	size_t root_addr_size;
	uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
	bool bm_start, quit;

	btree = S2BT(session);
	bm = btree->bm;
	ckptbase = NULL;
	bm_start = false;	/* set once bm->verify_start succeeds */

	WT_CLEAR(_vstuff);
	vs = &_vstuff;
	/* Scratch buffers for key comparison, addresses and messages. */
	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp3));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp4));

	/* Check configuration strings. */
	WT_ERR(__verify_config(session, cfg, vs));

	/* Optionally dump specific block offsets. */
	WT_ERR(__verify_config_offsets(session, cfg, &quit));
	if (quit)
		goto done;

	/* Get a list of the checkpoints for this file. */
	WT_ERR(
	    __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase));

	/* Inform the underlying block manager we're verifying. */
	WT_ERR(bm->verify_start(bm, session, ckptbase, cfg));
	bm_start = true;

	/* Loop through the file's checkpoints, verifying each one. */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		WT_ERR(__wt_verbose(session, WT_VERB_VERIFY,
		    "%s: checkpoint %s",
		    btree->dhandle->name, ckpt->name));

		/* Fake checkpoints require no work. */
		if (F_ISSET(ckpt, WT_CKPT_FAKE))
			continue;

		/* House-keeping between checkpoints. */
		__verify_checkpoint_reset(vs);

		if (WT_VRFY_DUMP(vs))
			WT_ERR(__wt_msg(session, "%s: checkpoint %s",
			    btree->dhandle->name, ckpt->name));

		/* Load the checkpoint. */
		WT_ERR(bm->checkpoint_load(bm, session,
		    ckpt->raw.data, ckpt->raw.size,
		    root_addr, &root_addr_size, true));

		/*
		 * Ignore trees with no root page.
		 * Verify, then discard the checkpoint from the cache.
		 */
		if (root_addr_size != 0 &&
		    (ret = __wt_btree_tree_open(
		    session, root_addr, root_addr_size)) == 0) {
			if (WT_VRFY_DUMP(vs))
				WT_ERR(__wt_msg(session, "Root: %s %s",
				    __wt_addr_string(session,
				    root_addr, root_addr_size, vs->tmp1),
				    __wt_page_type_string(
				    btree->root.page->type)));

			WT_WITH_PAGE_INDEX(session,
			    ret = __verify_tree(session, &btree->root, vs));

			WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD));
		}

		/* Unload the checkpoint. */
		WT_TRET(bm->checkpoint_unload(bm, session));
		WT_ERR(ret);

		/* Display the tree shape. */
		if (vs->dump_shape)
			WT_ERR(__verify_tree_shape(session, vs));
	}
/*
 * run --
 *	Execute one numbered salvage test case (1-24), comparing the salvage
 * output against the expected-results file built by print_res()/empty().
 *
 * build(key, value, count) writes records into the LOAD file; copy(lsn, recno)
 * appends a page (with the given LSN/starting recno) into the SLVG file;
 * print_res() appends the expected surviving records to the results file.
 * Exits the process on mismatch or on an unknown case number.
 */
void
run(int r)
{
	char buf[128];

	printf("\t%s: run %d\n", __wt_page_type_string(page_type), r);

	/* Start from a clean slate; CHECK aborts on failure. */
	CHECK(system("rm -f WiredTiger* __slvg.*") == 0);
	CHECK((res_fp = fopen(RSLT, "w")) != NULL);

	/*
	 * Each run builds the LOAD file, and then appends the first page of
	 * the LOAD file into the SLVG file.  The SLVG file is then salvaged,
	 * verified, and dumped into the DUMP file, which is compared to the
	 * results file, which are the expected results.
	 */
	switch (r) {
	case 1:
		/*
		 * Smoke test: empty files.
		 */
		build(0, 0, 0); copy(0, 0);
		break;
	case 2:
		/*
		 * Smoke test:
		 * Sequential pages, all pages should be kept.
		 */
		build(100, 100, 20); copy(6, 1);
		build(200, 200, 20); copy(7, 21);
		build(300, 300, 20); copy(8, 41);
		print_res(100, 100, 20);
		print_res(200, 200, 20);
		print_res(300, 300, 20);
		break;
	case 3:
		/*
		 * Smoke test:
		 * Sequential pages, all pages should be kept.
		 */
		build(100, 100, 20); copy(8, 1);
		build(200, 200, 20); copy(7, 21);
		build(300, 300, 20); copy(6, 41);
		print_res(100, 100, 20);
		print_res(200, 200, 20);
		print_res(300, 300, 20);
		break;
	case 4:
		/*
		 * Case #1:
		 * 3 pages, each with 20 records starting with the same record
		 * and sequential LSNs; salvage should leave the page with the
		 * largest LSN.
		 */
		build(100, 100, 20); copy(6, 1);
		build(100, 200, 20); copy(7, 1);
		build(100, 300, 20); copy(8, 1);
		print_res(100, 300, 20);
		break;
	case 5:
		/*
		 * Case #1:
		 * 3 pages, each with 20 records starting with the same record
		 * and sequential LSNs; salvage should leave the page with the
		 * largest LSN.
		 */
		build(100, 100, 20); copy(6, 1);
		build(100, 200, 20); copy(8, 1);
		build(100, 300, 20); copy(7, 1);
		print_res(100, 200, 20);
		break;
	case 6:
		/*
		 * Case #1:
		 * 3 pages, each with 20 records starting with the same record
		 * and sequential LSNs; salvage should leave the page with the
		 * largest LSN.
		 */
		build(100, 100, 20); copy(8, 1);
		build(100, 200, 20); copy(7, 1);
		build(100, 300, 20); copy(6, 1);
		print_res(100, 100, 20);
		break;
	case 7:
		/*
		 * Case #2:
		 * The second page overlaps the beginning of the first page, and
		 * the first page has a higher LSN.
		 */
		build(110, 100, 20); copy(7, 11);
		build(100, 200, 20); copy(6, 1);
		print_res(100, 200, 10);
		print_res(110, 100, 20);
		break;
	case 8:
		/*
		 * Case #2:
		 * The second page overlaps the beginning of the first page, and
		 * the second page has a higher LSN.
		 */
		build(110, 100, 20); copy(6, 11);
		build(100, 200, 20); copy(7, 1);
		print_res(100, 200, 20);
		print_res(120, 110, 10);
		break;
	case 9:
		/*
		 * Case #3:
		 * The second page overlaps with the end of the first page, and
		 * the first page has a higher LSN.
		 */
		build(100, 100, 20); copy(7, 1);
		build(110, 200, 20); copy(6, 11);
		print_res(100, 100, 20);
		print_res(120, 210, 10);
		break;
	case 10:
		/*
		 * Case #3:
		 * The second page overlaps with the end of the first page, and
		 * the second page has a higher LSN.
		 */
		build(100, 100, 20); copy(6, 1);
		build(110, 200, 20); copy(7, 11);
		print_res(100, 100, 10);
		print_res(110, 200, 20);
		break;
	case 11:
		/*
		 * Case #4:
		 * The second page is a prefix of the first page, and the first
		 * page has a higher LSN.
		 */
		build(100, 100, 20); copy(7, 1);
		build(100, 200, 5); copy(6, 1);
		print_res(100, 100, 20);
		break;
	case 12:
		/*
		 * Case #4:
		 * The second page is a prefix of the first page, and the second
		 * page has a higher LSN.
		 */
		build(100, 100, 20); copy(6, 1);
		build(100, 200, 5); copy(7, 1);
		print_res(100, 200, 5);
		print_res(105, 105, 15);
		break;
	case 13:
		/*
		 * Case #5:
		 * The second page is in the middle of the first page, and the
		 * first page has a higher LSN.
		 */
		build(100, 100, 40); copy(7, 1);
		build(110, 200, 10); copy(6, 11);
		print_res(100, 100, 40);
		break;
	case 14:
		/*
		 * Case #5:
		 * The second page is in the middle of the first page, and the
		 * second page has a higher LSN.
		 */
		build(100, 100, 40); copy(6, 1);
		build(110, 200, 10); copy(7, 11);
		print_res(100, 100, 10);
		print_res(110, 200, 10);
		print_res(120, 120, 20);
		break;
	case 15:
		/*
		 * Case #6:
		 * The second page is a suffix of the first page, and the first
		 * page has a higher LSN.
		 */
		build(100, 100, 40); copy(7, 1);
		build(130, 200, 10); copy(6, 31);
		print_res(100, 100, 40);
		break;
	case 16:
		/*
		 * Case #6:
		 * The second page is a suffix of the first page, and the second
		 * page has a higher LSN.
		 */
		build(100, 100, 40); copy(6, 1);
		build(130, 200, 10); copy(7, 31);
		print_res(100, 100, 30);
		print_res(130, 200, 10);
		break;
	case 17:
		/*
		 * Case #9:
		 * The first page is a prefix of the second page, and the first
		 * page has a higher LSN.
		 */
		build(100, 100, 20); copy(7, 1);
		build(100, 200, 40); copy(6, 1);
		print_res(100, 100, 20);
		print_res(120, 220, 20);
		break;
	case 18:
		/*
		 * Case #9:
		 * The first page is a prefix of the second page, and the second
		 * page has a higher LSN.
		 */
		build(100, 100, 20); copy(6, 1);
		build(100, 200, 40); copy(7, 1);
		print_res(100, 200, 40);
		break;
	case 19:
		/*
		 * Case #10:
		 * The first page is a suffix of the second page, and the first
		 * page has a higher LSN.
		 */
		build(130, 100, 10); copy(7, 31);
		build(100, 200, 40); copy(6, 1);
		print_res(100, 200, 30);
		print_res(130, 100, 10);
		break;
	case 20:
		/*
		 * Case #10:
		 * The first page is a suffix of the second page, and the second
		 * page has a higher LSN.
		 */
		build(130, 100, 10); copy(6, 31);
		build(100, 200, 40); copy(7, 1);
		print_res(100, 200, 40);
		break;
	case 21:
		/*
		 * Case #11:
		 * The first page is in the middle of the second page, and the
		 * first page has a higher LSN.
		 */
		build(110, 100, 10); copy(7, 11);
		build(100, 200, 40); copy(6, 1);
		print_res(100, 200, 10);
		print_res(110, 100, 10);
		print_res(120, 220, 20);
		break;
	case 22:
		/*
		 * Case #11:
		 * The first page is in the middle of the second page, and the
		 * second page has a higher LSN.
		 */
		build(110, 100, 10); copy(6, 11);
		build(100, 200, 40); copy(7, 1);
		print_res(100, 200, 40);
		break;
	case 23:
		/*
		 * Column-store only: missing an initial key range of 99
		 * records.
		 */
		build(100, 100, 10); copy(1, 100);
		empty(99);
		print_res(100, 100, 10);
		break;
	case 24:
		/*
		 * Column-store only: missing a middle key range of 37
		 * records.
		 */
		build(100, 100, 10); copy(1, 1);
		build(138, 138, 10); copy(1, 48);
		print_res(100, 100, 10);
		empty(37);
		print_res(138, 138, 10);
		break;
	default:
		fprintf(stderr, "salvage: %d: no such test\n", r);
		exit(EXIT_FAILURE);
	}

	CHECK(fclose(res_fp) == 0);

	/* Salvage, verify and dump the SLVG file. */
	process();

	/*
	 * Compare actual output against expected results.
	 * NOTE(review): snprintf truncation is not checked; DUMP/RSLT are
	 * presumably short fixed names so 128 bytes suffices -- confirm.
	 */
	snprintf(buf, sizeof(buf), "cmp %s %s > /dev/null", DUMP, RSLT);
	if (system(buf)) {
		fprintf(stderr,
		    "check failed, salvage results were incorrect\n");
		exit(EXIT_FAILURE);
	}
}
/*
 * __wt_rec_evict --
 *	Reconciliation plus eviction.
 *
 * Takes exclusive access to the page (and its subtree), optionally collapses
 * split-merge pages, then either discards a clean page or reconciles and
 * discards a dirty one, updating the parent.  Returns EBUSY when the page
 * can't be evicted right now; the err label releases any exclusive
 * references acquired along the way.
 */
int
__wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, int exclusive)
{
	WT_DECL_RET;
	WT_PAGE_MODIFY *mod;
	int merge;

	WT_VERBOSE_RET(session, evict,
	    "page %p (%s)", page, __wt_page_type_string(page->type));

	WT_ASSERT(session, session->excl_next == 0);

	/*
	 * If we get a split-merge page during normal eviction, try to collapse
	 * it.  During close, it will be merged into its parent.
	 */
	mod = page->modify;
	merge = __wt_btree_mergeable(page);
	if (merge && exclusive)
		return (EBUSY);

	WT_ASSERT(session, merge || mod == NULL ||
	    !F_ISSET(mod, WT_PM_REC_SPLIT_MERGE));

	/*
	 * Get exclusive access to the page and review the page and its subtree
	 * for conditions that would block our eviction of the page.  If the
	 * check fails (for example, we find a child page that can't be merged),
	 * we're done.  We have to make this check for clean pages, too: while
	 * unlikely eviction would choose an internal page with children, it's
	 * not disallowed anywhere.
	 *
	 * Note that page->ref may be NULL in some cases (e.g., for root pages
	 * or during salvage).  That's OK if exclusive is set: we won't check
	 * hazard pointers in that case.
	 */
	WT_ERR(__rec_review(session, page->ref, page, exclusive, merge, 1));

	/* Try to merge internal pages. */
	if (merge)
		WT_ERR(__wt_merge_tree(session, page));

	/*
	 * Update the page's modification reference, reconciliation might have
	 * changed it.
	 */
	mod = page->modify;

	/* Count evictions of internal pages during normal operation. */
	if (!exclusive && !merge &&
	    (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT)) {
		WT_CSTAT_INCR(session, cache_eviction_internal);
		WT_DSTAT_INCR(session, cache_eviction_internal);
	}

	/*
	 * Update the parent and discard the page.
	 */
	if (mod == NULL || !F_ISSET(mod, WT_PM_REC_MASK)) {
		/* Clean page: no reconciliation result to install. */
		WT_ASSERT(session,
		    exclusive || page->ref->state == WT_REF_LOCKED);

		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			__rec_page_clean_update(session, page);

		/* Discard the page. */
		__rec_discard_page(session, page, exclusive);

		WT_CSTAT_INCR(session, cache_eviction_clean);
		WT_DSTAT_INCR(session, cache_eviction_clean);
	} else {
		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			WT_ERR(__rec_page_dirty_update(session, page));

		/* Discard the tree rooted in this page. */
		__rec_discard_tree(session, page, exclusive);

		WT_CSTAT_INCR(session, cache_eviction_dirty);
		WT_DSTAT_INCR(session, cache_eviction_dirty);
	}

	if (0) {
err:		/*
		 * If unable to evict this page, release exclusive reference(s)
		 * we've acquired.
		 */
		__rec_excl_clear(session);

		WT_CSTAT_INCR(session, cache_eviction_fail);
		WT_DSTAT_INCR(session, cache_eviction_fail);
	}
	session->excl_next = 0;

	return (ret);
}
/*
 * __wt_cache_read --
 *	Read a page from the file.
 *
 * Races other threads via compare-and-swap on ref->state: the winner reads
 * the backing block (or re-creates a deleted page's name space), builds the
 * in-memory page and publishes WT_REF_MEM; losers return 0 immediately.
 * On error the previous ref state is restored.
 */
int
__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_DECL_RET;
	WT_ITEM tmp;
	WT_PAGE *page;
	WT_PAGE_STATE previous_state;
	size_t addr_size;
	const uint8_t *addr;

	page = NULL;

	/*
	 * Don't pass an allocated buffer to the underlying block read function,
	 * force allocation of new memory of the appropriate size.
	 */
	WT_CLEAR(tmp);

	/*
	 * Attempt to set the state to WT_REF_READING for normal reads, or
	 * WT_REF_LOCKED, for deleted pages.  If successful, we've won the
	 * race, read the page.
	 */
	if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING))
		previous_state = WT_REF_DISK;
	else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		previous_state = WT_REF_DELETED;
	else
		return (0);

	/*
	 * Get the address: if there is no address, the page was deleted, but a
	 * subsequent search or insert is forcing re-creation of the name space.
	 * Otherwise, there's an address, read the backing disk page and build
	 * an in-memory version of the page.
	 */
	WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL) {
		WT_ASSERT(session, previous_state == WT_REF_DELETED);

		WT_ERR(__wt_btree_new_leaf_page(session, &page));
		ref->page = page;
	} else {
		/* Read the backing disk page. */
		WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));

		/* Build the in-memory version of the page. */
		WT_ERR(__wt_page_inmem(session, ref, tmp.data,
		    WT_DATA_IN_ITEM(&tmp) ?
		    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

		/* If the page was deleted, instantiate that information. */
		if (previous_state == WT_REF_DELETED)
			WT_ERR(__wt_delete_page_instantiate(session, ref));
	}

	WT_ERR(__wt_verbose(session, WT_VERB_READ,
	    "page %p: %s", page, __wt_page_type_string(page->type)));

	/* Make the page visible to other threads. */
	WT_PUBLISH(ref->state, WT_REF_MEM);
	return (0);

err:	/*
	 * If the function building an in-memory version of the page failed,
	 * it discarded the page, but not the disk image.  Discard the page
	 * and separately discard the disk image in all cases.
	 */
	if (ref->page != NULL)
		__wt_ref_out(session, ref);
	WT_PUBLISH(ref->state, previous_state);

	__wt_buf_free(session, &tmp);

	return (ret);
}
/*
 * __wt_rec_evict --
 *	Reconciliation plus eviction.
 *
 * Older variant: single-threaded access is signalled via the WT_REC_SINGLE
 * flag rather than an explicit "exclusive" argument, and statistics are
 * kept on the connection rather than split cstat/dstat.
 */
int
__wt_rec_evict(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	int single;

	conn = S2C(session);

	WT_VERBOSE_RET(session, evict,
	    "page %p (%s)", page, __wt_page_type_string(page->type));

	WT_ASSERT(session, session->excl_next == 0);
	single = LF_ISSET(WT_REC_SINGLE) ? 1 : 0;

	/*
	 * Get exclusive access to the page and review the page and its subtree
	 * for conditions that would block our eviction of the page.  If the
	 * check fails (for example, we find a child page that can't be merged),
	 * we're done.  We have to make this check for clean pages, too: while
	 * unlikely eviction would choose an internal page with children, it's
	 * not disallowed anywhere.
	 *
	 * Note that page->ref may be NULL in some cases (e.g., for root pages
	 * or during salvage).  That's OK if WT_REC_SINGLE is set: we won't
	 * check hazard references in that case.
	 */
	WT_ERR(__rec_review(session, page->ref, page, flags, 1));

	/* Count evictions of internal pages during normal operation. */
	if (!single &&
	    (page->type == WT_PAGE_COL_INT || page->type == WT_PAGE_ROW_INT))
		WT_STAT_INCR(conn->stats, cache_evict_internal);

	/* Update the parent and discard the page. */
	if (page->modify == NULL || !F_ISSET(page->modify, WT_PM_REC_MASK)) {
		/* Clean page: no reconciliation result to install. */
		WT_STAT_INCR(conn->stats, cache_evict_unmodified);

		WT_ASSERT(session,
		    single || page->ref->state == WT_REF_LOCKED);

		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			__rec_page_clean_update(session, page);

		/* Discard the page. */
		__rec_discard_page(session, page, single);
	} else {
		WT_STAT_INCR(conn->stats, cache_evict_modified);

		if (WT_PAGE_IS_ROOT(page))
			__rec_root_update(session);
		else
			WT_ERR(__rec_page_dirty_update(session, page));

		/* Discard the tree rooted in this page. */
		__rec_discard_tree(session, page, single);
	}

	if (0) {
err:		/*
		 * If unable to evict this page, release exclusive reference(s)
		 * we've acquired.
		 */
		__rec_excl_clear(session);
	}
	session->excl_next = 0;

	return (ret);
}
/* * __verify_tree -- * Verify a tree, recursively descending through it in depth-first fashion. * The page argument was physically verified (so we know it's correctly formed), * and the in-memory version built. Our job is to check logical relationships * in the page and in the tree. */ static int __verify_tree(WT_SESSION_IMPL *session, WT_PAGE *page, WT_VSTUFF *vs) { WT_BM *bm; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_DECL_RET; WT_REF *ref; uint64_t recno; uint32_t entry, i; int found, lno; bm = S2BT(session)->bm; unpack = &_unpack; WT_VERBOSE_RET(session, verify, "%s %s", __wt_page_addr_string(session, vs->tmp1, page), __wt_page_type_string(page->type)); #ifdef HAVE_DIAGNOSTIC if (vs->dump_address) WT_RET(__wt_msg(session, "%s %s", __wt_page_addr_string(session, vs->tmp1, page), __wt_page_type_string(page->type))); #endif /* * The page's physical structure was verified when it was read into * memory by the read server thread, and then the in-memory version * of the page was built. Now we make sure the page and tree are * logically consistent. * * !!! * The problem: (1) the read server has to build the in-memory version * of the page because the read server is the thread that flags when * any thread can access the page in the tree; (2) we can't build the * in-memory version of the page until the physical structure is known * to be OK, so the read server has to verify at least the physical * structure of the page; (3) doing complete page verification requires * reading additional pages (for example, overflow keys imply reading * overflow pages in order to test the key's order in the page); (4) * the read server cannot read additional pages because it will hang * waiting on itself. For this reason, we split page verification * into a physical verification, which allows the in-memory version * of the page to be built, and then a subsequent logical verification * which happens here. * * Report progress every 10 pages. 
*/ if (++vs->fcnt % 10 == 0) WT_RET(__wt_progress(session, NULL, vs->fcnt)); #ifdef HAVE_DIAGNOSTIC /* Optionally dump the page in debugging mode. */ if (vs->dump_blocks && page->dsk != NULL) WT_RET(__wt_debug_disk(session, page->dsk, NULL)); if (vs->dump_pages) WT_RET(__wt_debug_page(session, page, NULL)); #endif /* * Column-store key order checks: check the page's record number and * then update the total record count. */ switch (page->type) { case WT_PAGE_COL_FIX: recno = page->u.col_fix.recno; goto recno_chk; case WT_PAGE_COL_INT: recno = page->u.intl.recno; goto recno_chk; case WT_PAGE_COL_VAR: recno = page->u.col_var.recno; recno_chk: if (recno != vs->record_total + 1) WT_RET_MSG(session, WT_ERROR, "page at %s has a starting record of %" PRIu64 " when the expected starting record is %" PRIu64, __wt_page_addr_string(session, vs->tmp1, page), recno, vs->record_total + 1); break; } switch (page->type) { case WT_PAGE_COL_FIX: vs->record_total += page->entries; break; case WT_PAGE_COL_VAR: recno = 0; WT_COL_FOREACH(page, cip, i) if ((cell = WT_COL_PTR(page, cip)) == NULL) ++recno; else { __wt_cell_unpack(cell, unpack); recno += __wt_cell_rle(unpack); } vs->record_total += recno; break; } /* * Row-store leaf page key order check: it's a depth-first traversal, * the first key on this page should be larger than any key previously * seen. */ switch (page->type) { case WT_PAGE_ROW_LEAF: WT_RET(__verify_row_leaf_key_order(session, page, vs)); break; } /* * Check overflow pages. We check overflow cells separately from other * tests that walk the page as it's simpler, and I don't care much how * fast table verify runs. * * Object if a leaf-no-overflow address cell references a page that has * overflow keys, but don't object if a standard address cell references * a page without overflow keys. The leaf-no-overflow address cell is * an optimization for trees without few, if any, overflow items, and * may not be set by reconciliation in all possible cases. 
*/ if (WT_PAGE_IS_ROOT(page)) lno = 0; else { __wt_cell_unpack(page->ref->addr, unpack); lno = unpack->raw == WT_CELL_ADDR_LNO ? 1 : 0; } switch (page->type) { case WT_PAGE_COL_FIX: break; case WT_PAGE_COL_VAR: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: WT_RET(__verify_overflow_cell(session, page, &found, vs)); if (found && lno) WT_RET_MSG(session, WT_ERROR, "page at %s referenced in its parent by a cell of " "type %s illegally contains overflow items", __wt_page_addr_string(session, vs->tmp1, page), __wt_cell_type_string(WT_CELL_ADDR_LNO)); break; default: if (lno) WT_RET_MSG(session, WT_ERROR, "page at %s is of type %s and is illegally " "referenced in its parent by a cell of type %s", __wt_page_addr_string(session, vs->tmp1, page), __wt_page_type_string(page->type), __wt_cell_type_string(WT_CELL_ADDR_LNO)); break; } /* Check tree connections and recursively descend the tree. */ switch (page->type) { case WT_PAGE_COL_INT: /* For each entry in an internal page, verify the subtree. */ entry = 0; WT_REF_FOREACH(page, ref, i) { /* * It's a depth-first traversal: this entry's starting * record number should be 1 more than the total records * reviewed to this point. */ ++entry; if (ref->u.recno != vs->record_total + 1) { __wt_cell_unpack(ref->addr, unpack); WT_RET_MSG(session, WT_ERROR, "the starting record number in entry %" PRIu32 " of the column internal page at " "%s is %" PRIu64 " and the expected " "starting record number is %" PRIu64, entry, __wt_page_addr_string( session, vs->tmp1, page), ref->u.recno, vs->record_total + 1); } /* Verify the subtree. */ WT_RET(__wt_page_in(session, page, ref)); ret = __verify_tree(session, ref->page, vs); WT_TRET(__wt_page_release(session, ref->page)); WT_RET(ret); __wt_cell_unpack(ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); } break; case WT_PAGE_ROW_INT: /* For each entry in an internal page, verify the subtree. 
*/ entry = 0; WT_REF_FOREACH(page, ref, i) { /* * It's a depth-first traversal: this entry's starting * key should be larger than the largest key previously * reviewed. * * The 0th key of any internal page is magic, and we * can't test against it. */ ++entry; if (entry != 1) WT_RET(__verify_row_int_key_order( session, page, ref, entry, vs)); /* Verify the subtree. */ WT_RET(__wt_page_in(session, page, ref)); ret = __verify_tree(session, ref->page, vs); WT_TRET(__wt_page_release(session, ref->page)); WT_RET(ret); __wt_cell_unpack(ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); }