/*
 * __wt_block_ckpt_to_buffer --
 *     Serialize a checkpoint structure into its cookie form.
 *
 * The cookie is written at *pp, and *pp is advanced past every byte written.
 * The layout produced here is: one version byte, four packed address
 * triplets (root page, alloc list, avail list, discard list), then the
 * packed file size and checkpoint size.
 *
 * Returns 0 on success. A version other than WT_BM_CHECKPOINT_VERSION is
 * rejected with WT_ERROR; a failure from any packing helper is passed back
 * to the caller through WT_RET.
 */
int
__wt_block_ckpt_to_buffer(
    WT_SESSION_IMPL *session, WT_BLOCK *block, uint8_t **pp, WT_BLOCK_CKPT *ci)
{
    /* Only the current cookie version can be written. */
    if (ci->version != WT_BM_CHECKPOINT_VERSION) {
        WT_RET_MSG(session, WT_ERROR, "unsupported checkpoint version");
    }

    /* Leading version byte. */
    **pp = ci->version;
    ++*pp;

    /* Root page address, followed by the three extent-list addresses. */
    WT_RET(__wt_block_addr_to_buffer(
        block, pp, ci->root_offset, ci->root_size, ci->root_cksum));
    WT_RET(__wt_block_addr_to_buffer(
        block, pp, ci->alloc.offset, ci->alloc.size, ci->alloc.cksum));
    WT_RET(__wt_block_addr_to_buffer(
        block, pp, ci->avail.offset, ci->avail.size, ci->avail.cksum));
    WT_RET(__wt_block_addr_to_buffer(
        block, pp, ci->discard.offset, ci->discard.size, ci->discard.cksum));

    /* Trailing sizes, each packed as an unsigned 64-bit value. */
    WT_RET(__wt_vpack_uint(pp, 0, (uint64_t)ci->file_size));
    WT_RET(__wt_vpack_uint(pp, 0, (uint64_t)ci->ckpt_size));

    return (0);
}
/*
 * __wt_debug_offset --
 *     Debugging aid: read the disk page named by a file
 *     offset/size/checksum triplet and dump it.
 *
 * The triplet is packed into an address cookie, the page is read through
 * __wt_bt_read into a scratch buffer, and the buffer's memory is handed to
 * __wt_debug_disk together with ofile. Returns 0 on success or the first
 * error encountered; the scratch buffer is released on every path that
 * allocated it.
 */
int
__wt_debug_offset(WT_SESSION_IMPL *session, wt_off_t offset, uint32_t size,
    uint32_t cksum, const char *ofile)
{
    WT_DECL_ITEM(page);
    WT_DECL_RET;
    uint8_t cookie[WT_BTREE_MAX_ADDR_COOKIE], *cookie_end;

    WT_ASSERT(session, S2BT_SAFE(session) != NULL);

    /*
     * This relies on the default block manager's model of a file, in which
     * an address is a file offset, a length and a checksum. It exists for
     * debugging only: another block manager may not view files or
     * addresses this way, which is why this is not a block manager method.
     *
     * Pack the triplet into an address cookie.
     */
    cookie_end = cookie;
    WT_RET(__wt_block_addr_to_buffer(
        S2BT(session)->bm->block, &cookie_end, offset, size, cksum));

    /*
     * Go through the btree I/O layer to read the address, so the block is
     * decompressed as necessary.
     */
    WT_RET(__wt_scr_alloc(session, 0, &page));
    WT_ERR(__wt_bt_read(
        session, page, cookie, WT_PTRDIFF(cookie_end, cookie)));
    ret = __wt_debug_disk(session, page->mem, ofile);

err:
    __wt_scr_free(session, &page);
    return (ret);
}
/*
 * __wt_block_write --
 *     Write the buffer's data to the file backing the block and hand back
 *     the address cookie describing where it was written.
 *
 * On success the cookie is stored at addr, its length in bytes is stored in
 * *addr_sizep, and 0 is returned. An error from the write or from packing
 * the cookie is returned to the caller unchanged.
 */
int
__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
    uint8_t *addr, size_t *addr_sizep, int data_cksum)
{
    wt_off_t off;
    uint32_t len, sum;
    uint8_t *p;

    /* The write reports the offset, size and checksum it used. */
    WT_RET(__wt_block_write_off(
        session, block, buf, &off, &len, &sum, data_cksum, 0));

    /* Pack the block's offset, size and checksum into addr. */
    p = addr;
    WT_RET(__wt_block_addr_to_buffer(block, &p, off, len, sum));

    /* The cookie length is however far the packing cursor moved. */
    *addr_sizep = WT_PTRDIFF(p, addr);

    return (0);
}
/*
 * __wt_block_write --
 *     Write a buffer to the block's file and return the address cookie
 *     that identifies the written block.
 *
 * data_cksum is forwarded unchanged to __wt_block_write_off. On success the
 * cookie is placed at addr, *addr_sizep receives its length in bytes, and 0
 * is returned; any failure is returned as-is.
 */
int
__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
    uint8_t *addr, size_t *addr_sizep, bool data_cksum)
{
    wt_off_t blk_offset;
    uint32_t blk_cksum, blk_size;
    uint8_t *cursor;

    /* Packing starts at the head of the caller's cookie buffer. */
    cursor = addr;

    /* Do the write, collecting the resulting offset/size/checksum. */
    WT_RET(__wt_block_write_off(session, block, buf,
        &blk_offset, &blk_size, &blk_cksum, data_cksum, false));

    /* Encode the triplet; the cursor ends one past the last cookie byte. */
    WT_RET(__wt_block_addr_to_buffer(
        block, &cursor, blk_offset, blk_size, blk_cksum));
    *addr_sizep = WT_PTRDIFF(cursor, addr);

    return (0);
}
/*
 * __wt_block_write --
 *     Write a buffer into a block, returning the block's address cookie.
 *
 * The buffer is written with __wt_block_write_off, which reports the file
 * offset, size and checksum of the written block. That triplet is packed
 * into the caller's addr buffer and the number of cookie bytes produced is
 * stored in *addr_size.
 *
 * Returns 0 on success; an error from the write or from packing the cookie
 * is returned unchanged, in which case *addr_size is not written.
 */
int
__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, uint8_t *addr, uint32_t *addr_size, int data_cksum)
{
    off_t offset;
    uint32_t size, cksum;
    uint8_t *endp;

    /*
     * Note: addr_size is an output parameter and is written below, so it
     * must not be flagged as unused.
     */
    WT_RET(__wt_block_write_off(
        session, block, buf, &offset, &size, &cksum, data_cksum, 0));

    /* Pack the offset/size/checksum triplet into the address cookie. */
    endp = addr;
    WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));

    /* The cookie length is the distance the packing cursor advanced. */
    *addr_size = WT_PTRDIFF32(endp, addr);

    return (0);
}
/*
 * __wt_block_checkpoint_load --
 *     Load a checkpoint.
 *
 * addr/addr_size name the checkpoint cookie to load; a NULL addr or a zero
 * addr_size means there is no checkpoint. If the checkpoint has a root
 * page, its address cookie is written to root_addr and its length to
 * *root_addr_sizep; otherwise *root_addr_sizep is left at 0. When
 * checkpoint is true the checkpoint information is loaded into a
 * function-local structure that is destroyed before returning; when it is
 * false the block's live structure is initialized, the avail list is read
 * and the file is truncated to the checkpoint's file size.
 *
 * Returns 0 on success or the first error encountered.
 */
int
__wt_block_checkpoint_load(WT_SESSION_IMPL *session, WT_BLOCK *block,
    const uint8_t *addr, size_t addr_size, uint8_t *root_addr,
    size_t *root_addr_sizep, bool checkpoint)
{
    WT_BLOCK_CKPT *ci, local_ci;
    WT_DECL_ITEM(ckpt_str);
    WT_DECL_RET;
    uint8_t *root_end;

    /* The error path inspects ci, so it must start out NULL. */
    ci = NULL;

    /*
     * A root page is not always found (no checkpoint was supplied, or the
     * checkpoint was empty). An empty root address is returned in that
     * case, so set that up first.
     */
    *root_addr_sizep = 0;

#ifdef HAVE_VERBOSE
    if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
        /* Only format the cookie if there is one to format. */
        if (addr != NULL) {
            WT_ERR(__wt_scr_alloc(session, 0, &ckpt_str));
            WT_ERR(__ckpt_string(session, block, addr, ckpt_str));
        }
        __wt_verbose(session, WT_VERB_CHECKPOINT,
            "%s: load-checkpoint: %s", block->name,
            addr == NULL ? "[Empty]" : (const char *)ckpt_str->data);
    }
#endif

    /*
     * Exactly one checkpoint in the file is writable; every other one is
     * read-only. Read-only checkpoints go through the same initialization
     * call, but their information does not persist.
     */
    if (!checkpoint) {
        /*
         * Locking is the btree layer's responsibility: opening the live
         * system through two handles, or running salvage, truncate or
         * verify against the live/running file, goes wrong quickly.
         */
#ifdef HAVE_DIAGNOSTIC
        __wt_spin_lock(session, &block->live_lock);
        WT_ASSERT(session, block->live_open == false);
        block->live_open = true;
        __wt_spin_unlock(session, &block->live_lock);
#endif
        ci = &block->live;
        WT_ERR(__wt_block_ckpt_init(session, ci, "live"));
    } else {
        ci = &local_ci;
        WT_ERR(__wt_block_ckpt_init(session, ci, "checkpoint"));
    }

    /*
     * Load the on-disk root page if the checkpoint has one. Without a
     * checkpoint, size the file just past the description information.
     */
    if (addr != NULL && addr_size != 0) {
        /* Crack the checkpoint cookie. */
        WT_ERR(__wt_block_buffer_to_ckpt(session, block, addr, ci));

        /* Verify sets up next. */
        if (block->verify) {
            WT_ERR(__wt_verify_ckpt_load(session, block, ci));
        }

        /* Build the root page's address cookie, if there is a root. */
        if (ci->root_offset != WT_BLOCK_INVALID_OFFSET) {
            root_end = root_addr;
            WT_ERR(__wt_block_addr_to_buffer(block, &root_end,
                ci->root_offset, ci->root_size, ci->root_checksum));
            *root_addr_sizep = WT_PTRDIFF(root_end, root_addr);
        }

        /*
         * Rolling a checkpoint forward needs the avail list, that is,
         * the blocks available for allocation.
         */
        if (!checkpoint) {
            WT_ERR(__wt_block_extlist_read_avail(
                session, block, &ci->avail, ci->file_size));
        }
    } else
        ci->file_size = block->allocsize;

    /*
     * A writable checkpoint means nothing written after it is of interest
     * any longer, so truncate the file. The avail list is not checked for
     * a block at the end of the file: that was done when the checkpoint
     * was first written (re-writing the checkpoint could make it relevant
     * here, but that is unlikely enough not to bother).
     */
    if (!checkpoint) {
        WT_ERR(__wt_block_truncate(session, block, ci->file_size));
    }

    /* The block below is entered only by jumping to the err label. */
    if (0) {
err:
        /*
         * Checkpoint-unload is deliberately not called: it does real
         * work, including file truncation, and would do damage if we
         * failed before the checkpoint information was correct. The
         * only memory allocated was on behalf of verify; release that.
         */
        if (block->verify) {
            WT_TRET(__wt_verify_ckpt_unload(session, block));
        }
    }

    /* Read-only checkpoints have no further use for the information. */
    if (ci != NULL && checkpoint)
        __wt_block_ckpt_destroy(session, ci);

    __wt_scr_free(session, &ckpt_str);
    return (ret);
}
/*
 * __wt_block_salvage_next --
 *     Return the address for the next potential block from the file.
 *
 * Starting at block->slvg_off, the file is scanned one allocation-size
 * unit at a time. For each unit the block header is read; if the size it
 * claims is acceptable and the whole block can be read, the block's
 * address cookie is written to addr, its length to *addr_sizep, and the
 * scan stops. Units that fail either check are freed and skipped.
 *
 * *eofp is set to true when the scan reaches block->size without finding
 * a block, in which case addr and *addr_sizep are not written. Returns 0
 * on success (including end-of-file) or the first error encountered.
 */
int
__wt_block_salvage_next(WT_SESSION_IMPL *session, WT_BLOCK *block,
    uint8_t *addr, size_t *addr_sizep, bool *eofp)
{
    WT_BLOCK_HEADER *blk;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    WT_FH *fh;
    wt_off_t max, offset;
    uint32_t allocsize, checksum, size;
    uint8_t *endp;

    *eofp = false;

    fh = block->fh;
    allocsize = block->allocsize;
    WT_ERR(__wt_scr_alloc(session, allocsize, &tmp));

    /* Read through the file, looking for pages. */
    for (max = block->size;;) {
        offset = block->slvg_off;
        if (offset >= max) { /* Check eof. */
            *eofp = true;
            goto done;
        }

        /*
         * Read the start of a possible page (an allocation-size block),
         * and get a page length and checksum from its header.
         */
        WT_ERR(__wt_read(
            session, fh, offset, (size_t)allocsize, tmp->mem));
        blk = WT_BLOCK_HEADER_REF(tmp->mem);
        __wt_block_header_byteswap(blk);
        size = blk->disk_size;
        checksum = blk->checksum;

        /*
         * Check the block size: if it's not insane, read the block.
         * Reading the block validates any checksum; if reading the
         * block succeeds, return its address as a possible page,
         * otherwise, move past it. A read failure is deliberately not
         * treated as an error here, it only means this is not a block.
         */
        if (!__wt_block_offset_invalid(block, offset, size) &&
            __wt_block_read_off(
            session, block, tmp, offset, size, checksum) == 0)
            break;

        /*
         * Free the allocation-size block and advance the salvage
         * offset past it; the offset only moves when a unit is
         * skipped, not when a block is returned.
         */
        __wt_verbose(session, WT_VERB_SALVAGE,
            "skipping %" PRIu32 "B at file offset %" PRIuMAX,
            allocsize, (uintmax_t)offset);
        WT_ERR(__wt_block_off_free(
            session, block, offset, (wt_off_t)allocsize));
        block->slvg_off += allocsize;
    }

    /* Re-create the address cookie that should reference this block. */
    endp = addr;
    WT_ERR(__wt_block_addr_to_buffer(block, &endp, offset, size, checksum));
    *addr_sizep = WT_PTRDIFF(endp, addr);

done:
err:
    __wt_scr_free(session, &tmp);
    return (ret);
}
/*
 * __wt_block_salvage_next --
 *     Return the next block from the file.
 *
 * The file is scanned from block->slvg_off in allocation-size steps. At
 * each step the block header is read into buf; if the size it claims is
 * plausible and the full block reads successfully, the scan stops with the
 * page image in buf. Anything else is freed, skipped, and block->slvg_off
 * is moved past it.
 *
 * On finding a block: *write_genp receives the block's write generation
 * (and block->live.write_gen is raised to it if it was smaller), addr
 * receives the block's address cookie, *addr_sizep its length, and
 * block->slvg_off is moved past the block. On reaching the end of the file
 * *eofp is set to 1 and the other outputs are not written. Returns 0 in
 * both cases, or the first error encountered.
 */
int
__wt_block_salvage_next(
    WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
    uint8_t *addr, uint32_t *addr_sizep, uint64_t *write_genp, int *eofp)
{
    WT_BLOCK_HEADER *hdr;
    WT_FH *fh;
    off_t file_end, offset;
    uint32_t allocsize, cksum, size;
    uint8_t *cookie_end;

    *eofp = 0;

    offset = block->slvg_off;
    fh = block->fh;
    allocsize = block->allocsize;
    WT_RET(__wt_buf_initsize(session, buf, allocsize));

    /* Walk the file, looking for pages with valid checksums. */
    file_end = fh->file_size;
    for (;;) {
        if (offset >= file_end) { /* Check eof. */
            *eofp = 1;
            return (0);
        }

        /*
         * Read the start of a possible page (one allocation-size
         * block) and take the page length and checksum from it.
         */
        WT_RET(__wt_read(session, fh, offset, allocsize, buf->mem));
        hdr = WT_BLOCK_HEADER_REF(buf->mem);
        size = hdr->disk_size;
        cksum = hdr->cksum;

        /*
         * A page must have a non-zero size that is a multiple of the
         * allocation size, no larger than the maximum page size, and
         * must not run past the end of the file. Only if the size is
         * sane is the entire page read: reading the page validates
         * the checksum and then decompresses the page as needed. A
         * failed read is probably corruption, and the block is
         * ignored like any other bad candidate.
         */
        if (size != 0 && size % allocsize == 0 &&
            size <= WT_BTREE_PAGE_SIZE_MAX &&
            offset + (off_t)size <= file_end &&
            __wt_block_read_off(
            session, block, buf, offset, size, cksum) == 0) {
            /*
             * Valid block, return to our caller.
             *
             * The buffer may have grown: re-derive the header
             * pointer so we read from the full page image.
             */
            hdr = WT_BLOCK_HEADER_REF(buf->mem);
            break;
        }

        WT_VERBOSE_RET(session, salvage,
            "skipping %" PRIu32 "B at file offset %" PRIuMAX,
            allocsize, (uintmax_t)offset);

        /*
         * Free the block and step past it, so it cannot be returned
         * more than once.
         */
        WT_RET(__wt_block_off_free(
            session, block, offset, (off_t)allocsize));
        offset += allocsize;
        block->slvg_off = offset;
    }

    /*
     * Track the largest write-generation seen in the file so that writes
     * done after salvage completes are preferred to these blocks.
     */
    *write_genp = hdr->write_gen;
    if (hdr->write_gen > block->live.write_gen)
        block->live.write_gen = hdr->write_gen;

    /* Re-create the address cookie that should reference this block. */
    cookie_end = addr;
    WT_RET(__wt_block_addr_to_buffer(
        block, &cookie_end, offset, size, cksum));
    *addr_sizep = WT_PTRDIFF32(cookie_end, addr);

    /* The page is being returned successfully, move past it. */
    block->slvg_off = offset + size;

    return (0);
}
/*
 * __wt_block_salvage_next --
 *     Return the address for the next potential block from the file.
 *
 * The file is scanned from block->slvg_off in allocation-size steps, using
 * a scratch buffer for the reads. block->slvg_off is advanced by one
 * allocation unit as soon as a unit's header has been read, whether or not
 * the unit turns out to start a block. When a block with a plausible size
 * reads successfully, its address cookie is written to addr and the cookie
 * length to *addr_sizep. Other units are freed and skipped.
 *
 * *eofp is set to 1 when the end of the file is reached, in which case addr
 * and *addr_sizep are not written. Returns 0 on success (including
 * end-of-file) or the first error encountered; the scratch buffer is
 * released on every path.
 */
int
__wt_block_salvage_next(WT_SESSION_IMPL *session,
    WT_BLOCK *block, uint8_t *addr, uint32_t *addr_sizep, int *eofp)
{
    WT_BLOCK_HEADER *hdr;
    WT_DECL_ITEM(scratch);
    WT_DECL_RET;
    WT_FH *fh;
    off_t file_end, offset;
    uint32_t allocsize, cksum, size;
    uint8_t *cookie_end;

    *eofp = 0;

    fh = block->fh;
    allocsize = block->allocsize;
    WT_ERR(__wt_scr_alloc(session, allocsize, &scratch));

    /* Walk the file, looking for pages. */
    file_end = fh->size;
    for (;;) {
        offset = block->slvg_off;
        if (offset >= file_end) { /* Check eof. */
            *eofp = 1;
            goto done;
        }

        /*
         * Read the start of a possible page (one allocation-size
         * block) and take a page length from it. Step to the next
         * allocation-size boundary right away: this one is never
         * considered again.
         */
        WT_ERR(__wt_read(
            session, fh, offset, allocsize, scratch->mem));
        hdr = WT_BLOCK_HEADER_REF(scratch->mem);
        block->slvg_off += allocsize;

        size = hdr->disk_size;
        cksum = hdr->cksum;

        /*
         * A page must have a non-zero size that is a multiple of the
         * allocation size, no larger than the maximum page size, and
         * must not run past the end of the file. If the size is sane,
         * read the entire block; reading the block validates the
         * checksum. A block that reads successfully is returned as a
         * possible page, one that fails to read is ignored.
         */
        if (size != 0 && size % allocsize == 0 &&
            size <= WT_BTREE_PAGE_SIZE_MAX &&
            offset + (off_t)size <= file_end &&
            __wt_block_read_off(
            session, block, scratch, offset, size, cksum) == 0)
            break;

        WT_VERBOSE_ERR(session, salvage,
            "skipping %" PRIu32 "B at file offset %" PRIuMAX,
            allocsize, (uintmax_t)offset);

        /* Free the allocation-size block. */
        WT_ERR(__wt_block_off_free(
            session, block, offset, (off_t)allocsize));
    }

    /* Re-create the address cookie that should reference this block. */
    cookie_end = addr;
    WT_ERR(__wt_block_addr_to_buffer(
        block, &cookie_end, offset, size, cksum));
    *addr_sizep = WT_PTRDIFF32(cookie_end, addr);

done:
err:
    __wt_scr_free(&scratch);
    return (ret);
}