/* * __wt_btree_stat_init -- * Initialize the Btree statistics. */ int __wt_btree_stat_init(WT_SESSION_IMPL *session, uint32_t flags) { WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; WT_DSRC_STATS *stats; WT_PAGE *page; btree = S2BT(session); bm = btree->bm; stats = &btree->dhandle->stats; WT_RET(bm->stat(bm, session, stats)); WT_STAT_SET(stats, btree_fixed_len, btree->bitcnt); WT_STAT_SET(stats, btree_maximum_depth, btree->maximum_depth); WT_STAT_SET(stats, btree_maxintlitem, btree->maxintlitem); WT_STAT_SET(stats, btree_maxintlpage, btree->maxintlpage); WT_STAT_SET(stats, btree_maxleafitem, btree->maxleafitem); WT_STAT_SET(stats, btree_maxleafpage, btree->maxleafpage); page = NULL; if (LF_ISSET(WT_STATISTICS_FAST)) return (0); while ((ret = __wt_tree_walk(session, &page, 0)) == 0 && page != NULL) WT_RET(__stat_page(session, page, stats)); return (ret == WT_NOTFOUND ? 0 : ret); }
/* * __wt_rec_track_wrapup_err -- * Resolve the page's list of tracked objects after an error occurs. */ int __wt_rec_track_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BM *bm; WT_DECL_RET; WT_PAGE_MODIFY *mod; WT_PAGE_TRACK *track; uint32_t i; bm = session->btree->bm; /* * After a failed reconciliation of a page, discard entries added in the * current reconciliation, their information is incorrect, additionally, * clear the in-use flag in preparation for the next reconciliation. */ mod = page->modify; for (track = mod->track, i = 0; i < mod->track_entries; ++track, ++i) if (F_ISSET(track, WT_TRK_JUST_ADDED)) { /* * The in-use flag is used to avoid discarding backing * blocks: if an object is both just-added and in-use, * we allocated the blocks on this run, and we want to * discard them on error. */ if (F_ISSET(track, WT_TRK_INUSE)) WT_TRET(bm->free(bm, session, track->addr.addr, track->addr.size)); __wt_free(session, track->addr.addr); memset(track, 0, sizeof(*track)); } else F_CLR(track, WT_TRK_INUSE); return (ret); }
/* * __meta_track_apply -- * Apply the changes in a metadata tracking record. */ static int __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk) { WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; switch (trk->op) { case WT_ST_EMPTY: /* Unused slot */ break; case WT_ST_CHECKPOINT: /* Checkpoint, see above */ btree = trk->dhandle->handle; bm = btree->bm; WT_WITH_DHANDLE(session, trk->dhandle, ret = bm->checkpoint_resolve(bm, session)); break; case WT_ST_DROP_COMMIT: if ((ret = __wt_block_manager_drop(session, trk->a, false)) != 0) __wt_err(session, ret, "metadata remove dropped file %s", trk->a); break; case WT_ST_LOCK: WT_WITH_DHANDLE(session, trk->dhandle, ret = __wt_session_release_btree(session)); break; case WT_ST_FILEOP: case WT_ST_REMOVE: case WT_ST_SET: break; } __meta_track_clear(session, trk); return (ret); }
/* * __wt_btree_stat_init -- * Initialize the Btree statistics. */ int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) { WT_BM *bm; WT_BTREE *btree; WT_DSRC_STATS **stats; btree = S2BT(session); bm = btree->bm; stats = btree->dhandle->stats; WT_RET(bm->stat(bm, session, stats[0])); WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt); WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth); WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey); WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage); WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey); WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue); WT_STAT_SET(session, stats, cache_bytes_inuse, __wt_btree_bytes_inuse(session)); if (F_ISSET(cst, WT_STAT_TYPE_CACHE_WALK)) __wt_curstat_cache_walk(session); if (F_ISSET(cst, WT_STAT_TYPE_TREE_WALK)) WT_RET(__stat_tree_walk(session)); return (0); }
/* * __wt_compact_page_skip -- * Return if the block-manager wants us to re-write this page. */ int __wt_compact_page_skip( WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref, int *skipp) { WT_BM *bm; uint32_t addr_size; const uint8_t *addr; bm = S2BT(session)->bm; /* * There's one compaction test we do before we read the page, to see * if the block-manager thinks it useful to rewrite the page. If a * rewrite won't help, we don't want to do I/O for nothing. For that * reason, this check is done in a call from inside the tree-walking * routine. * * Ignore everything but on-disk pages, we've already done a pass over * the in-memory pages. */ if (ref->state != WT_REF_DISK) { *skipp = 1; return (0); } __wt_get_addr(parent, ref, &addr, &addr_size); if (addr == NULL) { *skipp = 1; return (0); } return (bm->compact_page_skip(bm, session, addr, addr_size, skipp)); }
/* * __wt_compact_page_skip -- * Return if compaction requires we read this page. */ int __wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_BM *bm; size_t addr_size; u_int type; const uint8_t *addr; *skipp = false; /* Default to reading. */ type = 0; /* Keep compiler quiet. */ bm = S2BT(session)->bm; /* * We aren't holding a hazard pointer, so we can't look at the page * itself, all we can look at is the WT_REF information. If there's no * address, the page isn't on disk, but we have to read internal pages * to walk the tree regardless; throw up our hands and read it. */ __wt_ref_info(ref, &addr, &addr_size, &type); if (addr == NULL) return (0); /* * Internal pages must be read to walk the tree; ask the block-manager * if it's useful to rewrite leaf pages, don't do the I/O if a rewrite * won't help. */ return (type == WT_CELL_ADDR_INT ? 0 : bm->compact_page_skip(bm, session, addr, addr_size, skipp)); }
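/*
 * The bm->compact_page_skip call above lets the block manager decide whether
 * rewriting a leaf page's blocks could actually shrink the file, so the btree
 * layer avoids pointless I/O.  The function below is an illustrative sketch
 * only -- it is not WiredTiger's block-manager implementation -- assuming a
 * hypothetical layout where rewriting only helps when a block lives in the
 * tail of the file.
 */
static bool
example_rewrite_helps(wt_off_t file_size, wt_off_t block_offset)
{
	/*
	 * A block already in the first 90% of the file can't help the file
	 * shrink if rewritten; only blocks in the trailing 10% are candidates.
	 */
	return (block_offset > file_size - file_size / 10);
}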
/* * __compact_end -- * End object compaction. */ static int __compact_end(WT_SESSION_IMPL *session) { WT_BM *bm; bm = S2BT(session)->bm; return (bm->compact_end(bm, session)); }
/* * __wt_compact -- * Compact a file. */ int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_CONFIG_ITEM cval; WT_DECL_RET; WT_PAGE *page; int trigger, skip; bm = S2BT(session)->bm; WT_DSTAT_INCR(session, session_compact); WT_RET(__wt_config_gets(session, cfg, "trigger", &cval)); trigger = (int)cval.val; /* Check if compaction might be useful. */ WT_RET(bm->compact_skip(bm, session, trigger, &skip)); if (skip) return (0); /* * Walk the cache reviewing in-memory pages to see if they need to be * re-written. This requires looking at page reconciliation results, * which means the page cannot be reconciled at the same time as it's * being reviewed for compaction. The underlying functions ensure we * don't collide with page eviction, but we need to make sure we don't * collide with checkpoints either, they are the other operation that * can reconcile a page. */ __wt_spin_lock(session, &S2C(session)->metadata_lock); WT_RET(__wt_bt_cache_op(session, NULL, WT_SYNC_COMPACT)); __wt_spin_unlock(session, &S2C(session)->metadata_lock); /* * Walk the tree, reviewing on-disk pages to see if they need to be * re-written. */ for (page = NULL;;) { WT_RET(__wt_tree_walk(session, &page, WT_TREE_COMPACT)); if (page == NULL) break; /* * The only pages returned by the tree walk function are pages * we want to re-write; mark the page and tree dirty. */ if ((ret = __wt_page_modify_init(session, page)) != 0) { WT_TRET(__wt_page_release(session, page)); WT_RET(ret); } __wt_page_and_tree_modify_set(session, page); WT_DSTAT_INCR(session, btree_compact_rewrite); } return (0); }
/* * __ovfl_reuse_wrapup_err -- * Resolve the page's overflow reuse list after an error occurs. */ static int __ovfl_reuse_wrapup_err(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BM *bm; WT_DECL_RET; WT_OVFL_REUSE **e, **head, *reuse; size_t decr; int i; bm = S2BT(session)->bm; head = page->modify->ovfl_track->ovfl_reuse; /* * Discard any overflow records that were just added, freeing underlying * blocks. * * First, walk the overflow reuse lists (except for the lowest one), * fixing up skiplist links. */ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) for (e = &head[i]; (reuse = *e) != NULL;) { if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) { e = &reuse->next[i]; continue; } *e = reuse->next[i]; } /* * Second, discard any overflow record with a just-added flag, clear the * flags for the next run. */ decr = 0; for (e = &head[0]; (reuse = *e) != NULL;) { if (!F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)) { F_CLR(reuse, WT_OVFL_REUSE_INUSE); e = &reuse->next[0]; continue; } *e = reuse->next[0]; if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) WT_RET( __ovfl_reuse_verbose(session, page, reuse, "free")); WT_TRET(bm->free( bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size)); decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE); __wt_free(session, reuse); } if (decr != 0) __wt_cache_page_inmem_decr(session, page, decr); return (0); }
/* * __compact_rewrite -- * Return if a page needs to be re-written. */ static int __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_BM *bm; WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; size_t addr_size; const uint8_t *addr; *skipp = true; /* Default skip. */ bm = S2BT(session)->bm; page = ref->page; mod = page->modify; /* * Ignore the root: it may not have a replacement address, and besides, * if anything else gets written, so will it. */ if (__wt_ref_is_root(ref)) return (0); /* Ignore currently dirty pages, they will be written regardless. */ if (__wt_page_is_modified(page)) return (0); /* * If the page is clean, test the original addresses. * If the page is a 1-to-1 replacement, test the replacement addresses. * Ignore empty pages, they get merged into the parent. */ if (mod == NULL || mod->rec_result == 0) { WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL)); if (addr == NULL) return (0); WT_RET( bm->compact_page_skip(bm, session, addr, addr_size, skipp)); } else if (mod->rec_result == WT_PM_REC_REPLACE) { /* * The page's modification information can change underfoot if * the page is being reconciled, serialize with reconciliation. */ WT_RET(__wt_fair_lock(session, &page->page_lock)); ret = bm->compact_page_skip(bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp); WT_TRET(__wt_fair_unlock(session, &page->page_lock)); WT_RET(ret); } return (0); }
/* * __wt_btree_stat_init -- * Initialize the Btree statistics. */ int __wt_btree_stat_init(WT_SESSION_IMPL *session, WT_CURSOR_STAT *cst) { WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; WT_DSRC_STATS **stats; WT_REF *next_walk; btree = S2BT(session); bm = btree->bm; stats = btree->dhandle->stats; WT_RET(bm->stat(bm, session, stats[0])); WT_STAT_SET(session, stats, btree_fixed_len, btree->bitcnt); WT_STAT_SET(session, stats, btree_maximum_depth, btree->maximum_depth); WT_STAT_SET(session, stats, btree_maxintlkey, btree->maxintlkey); WT_STAT_SET(session, stats, btree_maxintlpage, btree->maxintlpage); WT_STAT_SET(session, stats, btree_maxleafkey, btree->maxleafkey); WT_STAT_SET(session, stats, btree_maxleafpage, btree->maxleafpage); WT_STAT_SET(session, stats, btree_maxleafvalue, btree->maxleafvalue); /* Everything else is really, really expensive. */ if (!F_ISSET(cst, WT_CONN_STAT_ALL)) return (0); /* * Clear the statistics we're about to count. */ WT_STAT_SET(session, stats, btree_column_deleted, 0); WT_STAT_SET(session, stats, btree_column_fix, 0); WT_STAT_SET(session, stats, btree_column_internal, 0); WT_STAT_SET(session, stats, btree_column_rle, 0); WT_STAT_SET(session, stats, btree_column_variable, 0); WT_STAT_SET(session, stats, btree_entries, 0); WT_STAT_SET(session, stats, btree_overflow, 0); WT_STAT_SET(session, stats, btree_row_internal, 0); WT_STAT_SET(session, stats, btree_row_leaf, 0); next_walk = NULL; while ((ret = __wt_tree_walk( session, &next_walk, 0)) == 0 && next_walk != NULL) { WT_WITH_PAGE_INDEX(session, ret = __stat_page(session, next_walk->page, stats)); WT_RET(ret); } return (ret == WT_NOTFOUND ? 0 : ret); }
/* * __wt_addr_string -- * Load a buffer with a printable, nul-terminated representation of an * address. */ const char * __wt_addr_string( WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, uint32_t size) { WT_BM *bm; bm = S2BT(session)->bm; if (addr == NULL) { buf->data = "[NoAddr]"; buf->size = WT_STORE_SIZE(strlen("[NoAddr]")); } else if (bm->addr_string(bm, session, buf, addr, size) != 0) { buf->data = "[Error]"; buf->size = WT_STORE_SIZE(strlen("[Error]")); } return (buf->data); }
/* * __wt_ovfl_discard -- * Discard an on-page overflow value, and reset the page's cell. */ int __wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell) { WT_BM *bm; WT_BTREE *btree; WT_CELL_UNPACK *unpack, _unpack; WT_DECL_RET; btree = S2BT(session); bm = btree->bm; unpack = &_unpack; __wt_cell_unpack(cell, unpack); /* * Finally remove overflow key/value objects, called when reconciliation * finishes after successfully writing a page. * * Keys must have already been instantiated and value objects must have * already been cached (if they might potentially still be read by any * running transaction). * * Acquire the overflow lock to avoid racing with a thread reading the * backing overflow blocks. */ WT_RET(__wt_writelock(session, btree->ovfl_lock)); switch (unpack->raw) { case WT_CELL_KEY_OVFL: __wt_cell_type_reset(session, unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM); break; case WT_CELL_VALUE_OVFL: __wt_cell_type_reset(session, unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM); break; WT_ILLEGAL_VALUE(session); } WT_TRET(__wt_writeunlock(session, btree->ovfl_lock)); /* Free the backing disk blocks. */ WT_TRET(bm->free(bm, session, unpack->data, unpack->size)); return (ret); }
/* * __wt_debug_addr -- * Read and dump a disk page in debugging mode, using an addr/size pair. */ int __wt_debug_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, const char *ofile) { WT_BM *bm; WT_DECL_ITEM(buf); WT_DECL_RET; bm = S2BT(session)->bm; WT_RET(__wt_scr_alloc(session, 1024, &buf)); WT_ERR(bm->read(bm, session, buf, addr, addr_size)); ret = __wt_debug_disk(session, buf->mem, ofile); err: __wt_scr_free(session, &buf); return (ret); }
/* * __wt_addr_string -- * Load a buffer with a printable, nul-terminated representation of an * address. */ const char * __wt_addr_string(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *buf) { WT_BM *bm; WT_BTREE *btree; btree = S2BT_SAFE(session); if (addr == NULL) { buf->data = "[NoAddr]"; buf->size = strlen("[NoAddr]"); } else if (btree == NULL || (bm = btree->bm) == NULL || bm->addr_string(bm, session, buf, addr, addr_size) != 0) { buf->data = "[Error]"; buf->size = strlen("[Error]"); } return (buf->data); }
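/*
 * A typical use of __wt_addr_string: allocate a scratch buffer and embed the
 * printable address in a message, as the verify code does with its vs->tmpN
 * buffers.  A minimal sketch (the message text is illustrative):
 */
static int
example_msg_addr(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;

	WT_RET(__wt_scr_alloc(session, 0, &tmp));
	ret = __wt_msg(session, "block address: %s",
	    __wt_addr_string(session, addr, addr_size, tmp));
	__wt_scr_free(session, &tmp);
	return (ret);
}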
/* * __cursor_size_chk -- * Return if an inserted item is too large. */ static inline int __cursor_size_chk(WT_SESSION_IMPL *session, WT_ITEM *kv) { WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; size_t size; btree = S2BT(session); bm = btree->bm; if (btree->type == BTREE_COL_FIX) { /* Fixed-size column-stores take a single byte. */ if (kv->size != 1) WT_RET_MSG(session, EINVAL, "item size of %" WT_SIZET_FMT " does not match " "fixed-length file requirement of 1 byte", kv->size); return (0); } /* Don't waste effort, 1GB is always cool. */ if (kv->size <= WT_GIGABYTE) return (0); /* * There are two checks: what we are willing to store in the tree, and * what the block manager can actually write. */ if (kv->size > WT_BTREE_MAX_OBJECT_SIZE) ret = EINVAL; else { size = kv->size; ret = bm->write_size(bm, session, &size); } if (ret != 0) WT_RET_MSG(session, ret, "item size of %" WT_SIZET_FMT " exceeds the maximum " "supported size", kv->size); return (0); }
/* * __wt_block_manager_open -- * Open a file. */ int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], int forced_salvage, WT_BM **bmp) { WT_BM *bm; WT_DECL_RET; *bmp = NULL; WT_RET(__wt_calloc_def(session, 1, &bm)); __bm_method_set(bm, 0); WT_ERR(__wt_block_open( session, filename, cfg, forced_salvage, &bm->block)); *bmp = bm; return (0); err: WT_TRET(bm->close(bm, session)); return (ret); }
/* * __wt_block_manager_open -- * Open a file. */ int __wt_block_manager_open(WT_SESSION_IMPL *session, const char *filename, const char *cfg[], bool forced_salvage, bool readonly, uint32_t allocsize, WT_BM **bmp) { WT_BM *bm; WT_DECL_RET; *bmp = NULL; WT_RET(__wt_calloc_one(session, &bm)); __bm_method_set(bm, false); WT_ERR(__wt_block_open(session, filename, cfg, forced_salvage, readonly, allocsize, &bm->block)); *bmp = bm; return (0); err: WT_TRET(bm->close(bm, session)); return (ret); }
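/*
 * For reference, the WT_BM handle returned by __wt_block_manager_open is a
 * vector of block-manager methods plus the underlying WT_BLOCK handle; the
 * btree layer only ever calls through the vector (bm->read, bm->write,
 * bm->free, and so on).  The sketch below is abbreviated and illustrative --
 * most fields are elided and the exact signatures should be taken from
 * block.h -- but the method shapes match the calls shown in this section.
 */
struct example_bm {
	/* Method vector, wired up once by __bm_method_set(). */
	int (*addr_string)(WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
	int (*close)(WT_BM *, WT_SESSION_IMPL *);
	int (*compact_end)(WT_BM *, WT_SESSION_IMPL *);
	int (*compact_page_skip)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, bool *);
	int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
	int (*read)(WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
	int (*write)(WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, bool);
	int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *);

	WT_BLOCK *block;		/* Underlying block/file handle */
};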
/*
 * __compact_rewrite --
 *	Return if a page needs to be re-written; set skip when the page's
 * blocks would not benefit from compaction.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	size_t addr_size;
	const uint8_t *addr;

	*skipp = 1;				/* Default to skipping. */

	bm = S2BT(session)->bm;
	page = ref->page;
	mod = page->modify;

	/* The root page is never compacted. */
	if (__wt_ref_is_root(ref))
		return (0);

	/* Dirty pages will be written regardless, don't consider them. */
	if (__wt_page_is_modified(page))
		return (0);

	/*
	 * If the page is clean, ask the block manager whether its original
	 * blocks are worth rewriting.
	 */
	if (mod == NULL || F_ISSET(mod, WT_PM_REC_MASK) == 0) {
		WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
		if (addr == NULL)
			return (0);
		WT_RET(bm->compact_page_skip(
		    bm, session, addr, addr_size, skipp));
	} else if (F_ISSET(mod, WT_PM_REC_MASK) == WT_PM_REC_REPLACE) {
		/*
		 * If the page was replaced by a single block, test the
		 * replacement blocks instead; lock the page to serialize with
		 * reconciliation while looking at its modify information.
		 */
		WT_PAGE_LOCK(session, page);
		ret = bm->compact_page_skip(bm, session,
		    mod->mod_replace.addr, mod->mod_replace.size, skipp);
		WT_PAGE_UNLOCK(session, page);
		WT_RET(ret);
	}
	return (0);
}
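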
/*
 * __wt_compact_page_skip --
 *	Return if compaction requires we read this page.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
{
	WT_BM *bm;
	size_t addr_size;
	u_int type;
	const uint8_t *addr;

	*skipp = 0;			/* Default to reading. */
	type = 0;			/* Keep the compiler quiet. */

	bm = S2BT(session)->bm;

	/*
	 * We aren't holding a hazard pointer, so we can't look at the page
	 * itself, all we can look at is the WT_REF information.  If there's no
	 * address, the page isn't on disk, but we have to read internal pages
	 * to walk the tree regardless; throw up our hands and read it.
	 */
	WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, &type));
	if (addr == NULL)
		return (0);

	/*
	 * Internal pages must be read to walk the tree; ask the block manager
	 * if it's useful to rewrite leaf pages, don't do the I/O if a rewrite
	 * won't help.
	 */
	return (type == WT_CELL_ADDR_INT ? 0 :
	    bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
/* * __wt_bt_write -- * Write a buffer into a block, returning the block's addr/size and * checksum. */ int __wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, bool checkpoint, bool compressed) { WT_BM *bm; WT_BTREE *btree; WT_ITEM *ip; WT_DECL_ITEM(tmp); WT_DECL_RET; WT_PAGE_HEADER *dsk; size_t dst_len, len, result_len, size, src_len; int compression_failed; /* Extension API, so not a bool. */ uint8_t *dst, *src; bool data_cksum; btree = S2BT(session); bm = btree->bm; /* Checkpoint calls are different than standard calls. */ WT_ASSERT(session, (!checkpoint && addr != NULL && addr_sizep != NULL) || (checkpoint && addr == NULL && addr_sizep == NULL)); #ifdef HAVE_DIAGNOSTIC /* * We're passed a table's disk image. Decompress if necessary and * verify the image. Always check the in-memory length for accuracy. */ dsk = buf->mem; if (compressed) { WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp)); memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP); WT_ERR(btree->compressor->decompress( btree->compressor, &session->iface, (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP, buf->size - WT_BLOCK_COMPRESS_SKIP, (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP, tmp->memsize - WT_BLOCK_COMPRESS_SKIP, &result_len)); WT_ASSERT(session, dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP); tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP; ip = tmp; } else { WT_ASSERT(session, dsk->mem_size == buf->size); ip = buf; } WT_ERR(__wt_verify_dsk(session, "[write-check]", ip)); __wt_scr_free(session, &tmp); #endif /* * Optionally stream-compress the data, but don't compress blocks that * are already as small as they're going to get. */ if (btree->compressor == NULL || btree->compressor->compress == NULL || compressed) ip = buf; else if (buf->size <= btree->allocsize) { ip = buf; WT_STAT_FAST_DATA_INCR(session, compress_write_too_small); } else { /* Skip the header bytes of the source data. */ src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP; src_len = buf->size - WT_BLOCK_COMPRESS_SKIP; /* * Compute the size needed for the destination buffer. We only * allocate enough memory for a copy of the original by default, * if any compressed version is bigger than the original, we * won't use it. However, some compression engines (snappy is * one example), may need more memory because they don't stop * just because there's no more memory into which to compress. */ if (btree->compressor->pre_size == NULL) len = src_len; else WT_ERR(btree->compressor->pre_size(btree->compressor, &session->iface, src, src_len, &len)); size = len + WT_BLOCK_COMPRESS_SKIP; WT_ERR(bm->write_size(bm, session, &size)); WT_ERR(__wt_scr_alloc(session, size, &tmp)); /* Skip the header bytes of the destination data. */ dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP; dst_len = len; compression_failed = 0; WT_ERR(btree->compressor->compress(btree->compressor, &session->iface, src, src_len, dst, dst_len, &result_len, &compression_failed)); result_len += WT_BLOCK_COMPRESS_SKIP; /* * If compression fails, or doesn't gain us at least one unit of * allocation, fallback to the original version. This isn't * unexpected: if compression doesn't work for some chunk of * data for some reason (noting likely additional format/header * information which compressed output requires), it just means * the uncompressed version is as good as it gets, and that's * what we use. 
*/ if (compression_failed || buf->size / btree->allocsize <= result_len / btree->allocsize) { ip = buf; WT_STAT_FAST_DATA_INCR(session, compress_write_fail); } else { compressed = true; WT_STAT_FAST_DATA_INCR(session, compress_write); /* * Copy in the skipped header bytes, set the final data * size. */ memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP); tmp->size = result_len; ip = tmp; } } dsk = ip->mem; /* If the buffer is compressed, set the flag. */ if (compressed) F_SET(dsk, WT_PAGE_COMPRESSED); /* * We increment the block's write generation so it's easy to identify * newer versions of blocks during salvage. (It's common in WiredTiger, * at least for the default block manager, for multiple blocks to be * internally consistent with identical first and last keys, so we need * a way to know the most recent state of the block. We could check * which leaf is referenced by a valid internal page, but that implies * salvaging internal pages, which I don't want to do, and it's not * as good anyway, because the internal page may not have been written * after the leaf page was updated. So, write generations it is. * * Nothing is locked at this point but two versions of a page with the * same generation is pretty unlikely, and if we did, they're going to * be roughly identical for the purposes of salvage, anyway. */ dsk->write_gen = ++btree->write_gen; /* * Checksum the data if the buffer isn't compressed or checksums are * configured. */ switch (btree->checksum) { case CKSUM_ON: data_cksum = true; break; case CKSUM_OFF: data_cksum = false; break; case CKSUM_UNCOMPRESSED: default: data_cksum = !compressed; break; } /* Call the block manager to write the block. */ WT_ERR(checkpoint ? bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) : bm->write(bm, session, ip, addr, addr_sizep, data_cksum)); WT_STAT_FAST_CONN_INCR(session, cache_write); WT_STAT_FAST_DATA_INCR(session, cache_write); WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, dsk->mem_size); WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, dsk->mem_size); err: __wt_scr_free(session, &tmp); return (ret); }
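/*
 * The compression fallback above compares sizes in allocation-size units
 * using truncating division: the compressed image is used only if it saves at
 * least one full unit.  A worked example with illustrative numbers: with a
 * 4KB allocation size, a 10000-byte page truncates to 2 units; a compressed
 * result of 8500 bytes also truncates to 2 units, so the uncompressed image
 * is written; a result of 8000 bytes truncates to 1 unit and the compressed
 * image wins.  A self-contained restatement of the test:
 */
static bool
example_compression_useful(size_t orig_size, size_t result_len, size_t allocsize)
{
	/* True only if the compressed image saves at least one unit. */
	return (orig_size / allocsize > result_len / allocsize);
}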
/* * __wt_bt_read -- * Read a cookie referenced block into a buffer. */ int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) { WT_BM *bm; WT_BTREE *btree; WT_DECL_ITEM(etmp); WT_DECL_ITEM(tmp); WT_DECL_RET; WT_ENCRYPTOR *encryptor; WT_ITEM *ip; const WT_PAGE_HEADER *dsk; const char *fail_msg; size_t result_len; btree = S2BT(session); bm = btree->bm; fail_msg = NULL; /* -Wuninitialized */ /* * If anticipating a compressed or encrypted block, read into a scratch * buffer and decompress into the caller's buffer. Else, read directly * into the caller's buffer. */ if (btree->compressor == NULL && btree->kencryptor == NULL) { WT_RET(bm->read(bm, session, buf, addr, addr_size)); dsk = buf->data; ip = NULL; } else { WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->read(bm, session, tmp, addr, addr_size)); dsk = tmp->data; ip = tmp; } /* * If the block is encrypted, copy the skipped bytes of the original * image into place, then decrypt. */ if (F_ISSET(dsk, WT_PAGE_ENCRYPTED)) { if (btree->kencryptor == NULL || (encryptor = btree->kencryptor->encryptor) == NULL || encryptor->decrypt == NULL) { fail_msg = "encrypted block in file for which no encryption " "configured"; goto corrupt; } WT_ERR(__wt_scr_alloc(session, 0, &etmp)); if ((ret = __wt_decrypt(session, encryptor, WT_BLOCK_ENCRYPT_SKIP, ip, etmp)) != 0) { fail_msg = "block decryption failed"; goto corrupt; } ip = etmp; dsk = ip->data; } else if (btree->kencryptor != NULL) { fail_msg = "unencrypted block in file for which encryption configured"; goto corrupt; } if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) { if (btree->compressor == NULL || btree->compressor->decompress == NULL) { fail_msg = "compressed block in file for which no compression " "configured"; goto corrupt; } /* * Size the buffer based on the in-memory bytes we're expecting * from decompression. */ WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size)); /* * Note the source length is NOT the number of compressed bytes, * it's the length of the block we just read (minus the skipped * bytes). We don't store the number of compressed bytes: some * compression engines need that length stored externally, they * don't have markers in the stream to signal the end of the * compressed bytes. Those engines must store the compressed * byte length somehow, see the snappy compression extension for * an example. */ memcpy(buf->mem, ip->data, WT_BLOCK_COMPRESS_SKIP); ret = btree->compressor->decompress( btree->compressor, &session->iface, (uint8_t *)ip->data + WT_BLOCK_COMPRESS_SKIP, tmp->size - WT_BLOCK_COMPRESS_SKIP, (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP, dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len); /* * If checksums were turned off because we're depending on the * decompression to fail on any corrupted data, we'll end up * here after corruption happens. If we're salvaging the file, * it's OK, otherwise it's really, really bad. */ if (ret != 0 || result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) { fail_msg = "block decryption failed"; goto corrupt; } } else /* * If we uncompressed above, the page is in the correct buffer. * If we get here the data may be in the wrong buffer and the * buffer may be the wrong size. If needed, get the page * into the destination buffer. */ if (ip != NULL) WT_ERR(__wt_buf_set( session, buf, ip->data, dsk->mem_size)); /* If the handle is a verify handle, verify the physical page. 
*/ if (F_ISSET(btree, WT_BTREE_VERIFY)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size)); WT_ERR(__wt_verify_dsk(session, tmp->data, buf)); } WT_STAT_FAST_CONN_INCR(session, cache_read); WT_STAT_FAST_DATA_INCR(session, cache_read); if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) WT_STAT_FAST_DATA_INCR(session, compress_read); WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size); WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size); if (0) { corrupt: if (ret == 0) ret = WT_ERROR; if (!F_ISSET(btree, WT_BTREE_VERIFY) && !F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE)) { __wt_err(session, ret, "%s", fail_msg); ret = __wt_illegal_value(session, btree->dhandle->name); } } err: __wt_scr_free(session, &tmp); __wt_scr_free(session, &etmp); return (ret); }
/* * __verify_tree -- * Verify a tree, recursively descending through it in depth-first fashion. * The page argument was physically verified (so we know it's correctly formed), * and the in-memory version built. Our job is to check logical relationships * in the page and in the tree. */ static int __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs) { WT_BM *bm; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_DECL_RET; WT_PAGE *page; WT_REF *child_ref; uint64_t recno; uint32_t entry, i; bool found; bm = S2BT(session)->bm; page = ref->page; unpack = &_unpack; WT_CLEAR(*unpack); /* -Wuninitialized */ WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s", __wt_page_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type))); /* Optionally dump the address. */ if (vs->dump_address) WT_RET(__wt_msg(session, "%s %s", __wt_page_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type))); /* Track the shape of the tree. */ if (WT_PAGE_IS_INTERNAL(page)) ++vs->depth_internal[ WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)]; else ++vs->depth_leaf[ WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)]; /* * The page's physical structure was verified when it was read into * memory by the read server thread, and then the in-memory version * of the page was built. Now we make sure the page and tree are * logically consistent. * * !!! * The problem: (1) the read server has to build the in-memory version * of the page because the read server is the thread that flags when * any thread can access the page in the tree; (2) we can't build the * in-memory version of the page until the physical structure is known * to be OK, so the read server has to verify at least the physical * structure of the page; (3) doing complete page verification requires * reading additional pages (for example, overflow keys imply reading * overflow pages in order to test the key's order in the page); (4) * the read server cannot read additional pages because it will hang * waiting on itself. For this reason, we split page verification * into a physical verification, which allows the in-memory version * of the page to be built, and then a subsequent logical verification * which happens here. * * Report progress occasionally. */ #define WT_VERIFY_PROGRESS_INTERVAL 100 if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0) WT_RET(__wt_progress(session, NULL, vs->fcnt)); #ifdef HAVE_DIAGNOSTIC /* Optionally dump the blocks or page in debugging mode. */ if (vs->dump_blocks) WT_RET(__wt_debug_disk(session, page->dsk, NULL)); if (vs->dump_pages) WT_RET(__wt_debug_page(session, page, NULL)); #endif /* * Column-store key order checks: check the page's record number and * then update the total record count. 
*/ switch (page->type) { case WT_PAGE_COL_FIX: recno = page->pg_fix_recno; goto recno_chk; case WT_PAGE_COL_INT: recno = page->pg_intl_recno; goto recno_chk; case WT_PAGE_COL_VAR: recno = page->pg_var_recno; recno_chk: if (recno != vs->record_total + 1) WT_RET_MSG(session, WT_ERROR, "page at %s has a starting record of %" PRIu64 " when the expected starting record is %" PRIu64, __wt_page_addr_string(session, ref, vs->tmp1), recno, vs->record_total + 1); break; } switch (page->type) { case WT_PAGE_COL_FIX: vs->record_total += page->pg_fix_entries; break; case WT_PAGE_COL_VAR: recno = 0; WT_COL_FOREACH(page, cip, i) if ((cell = WT_COL_PTR(page, cip)) == NULL) ++recno; else { __wt_cell_unpack(cell, unpack); recno += __wt_cell_rle(unpack); } vs->record_total += recno; break; } /* * Row-store leaf page key order check: it's a depth-first traversal, * the first key on this page should be larger than any key previously * seen. */ switch (page->type) { case WT_PAGE_ROW_LEAF: WT_RET(__verify_row_leaf_key_order(session, ref, vs)); break; } /* If it's not the root page, unpack the parent cell. */ if (!__wt_ref_is_root(ref)) { __wt_cell_unpack(ref->addr, unpack); /* Compare the parent cell against the page type. */ switch (page->type) { case WT_PAGE_COL_FIX: if (unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; case WT_PAGE_COL_VAR: if (unpack->raw != WT_CELL_ADDR_LEAF && unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; case WT_PAGE_ROW_LEAF: if (unpack->raw != WT_CELL_ADDR_DEL && unpack->raw != WT_CELL_ADDR_LEAF && unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: if (unpack->raw != WT_CELL_ADDR_INT) celltype_err: WT_RET_MSG(session, WT_ERROR, "page at %s, of type %s, is referenced in " "its parent by a cell of type %s", __wt_page_addr_string( session, ref, vs->tmp1), __wt_page_type_string(page->type), __wt_cell_type_string(unpack->raw)); break; } } /* * Check overflow pages. We check overflow cells separately from other * tests that walk the page as it's simpler, and I don't care much how * fast table verify runs. */ switch (page->type) { case WT_PAGE_COL_VAR: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: WT_RET(__verify_overflow_cell(session, ref, &found, vs)); if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT) break; /* * Object if a leaf-no-overflow address cell references a page * with overflow keys, but don't object if a leaf address cell * references a page without overflow keys. Reconciliation * doesn't guarantee every leaf page without overflow items will * be a leaf-no-overflow type. */ if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO) WT_RET_MSG(session, WT_ERROR, "page at %s, of type %s and referenced in its " "parent by a cell of type %s, contains overflow " "items", __wt_page_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type), __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO)); break; } /* Check tree connections and recursively descend the tree. */ switch (page->type) { case WT_PAGE_COL_INT: /* For each entry in an internal page, verify the subtree. */ entry = 0; WT_INTL_FOREACH_BEGIN(session, page, child_ref) { /* * It's a depth-first traversal: this entry's starting * record number should be 1 more than the total records * reviewed to this point. 
*/ ++entry; if (child_ref->key.recno != vs->record_total + 1) { WT_RET_MSG(session, WT_ERROR, "the starting record number in entry %" PRIu32 " of the column internal page at " "%s is %" PRIu64 " and the expected " "starting record number is %" PRIu64, entry, __wt_page_addr_string( session, child_ref, vs->tmp1), child_ref->key.recno, vs->record_total + 1); } /* Verify the subtree. */ ++vs->depth; WT_RET(__wt_page_in(session, child_ref, 0)); ret = __verify_tree(session, child_ref, vs); WT_TRET(__wt_page_release(session, child_ref, 0)); --vs->depth; WT_RET(ret); __wt_cell_unpack(child_ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); } WT_INTL_FOREACH_END; break; case WT_PAGE_ROW_INT: /* For each entry in an internal page, verify the subtree. */ entry = 0; WT_INTL_FOREACH_BEGIN(session, page, child_ref) { /* * It's a depth-first traversal: this entry's starting * key should be larger than the largest key previously * reviewed. * * The 0th key of any internal page is magic, and we * can't test against it. */ ++entry; if (entry != 1) WT_RET(__verify_row_int_key_order( session, page, child_ref, entry, vs)); /* Verify the subtree. */ ++vs->depth; WT_RET(__wt_page_in(session, child_ref, 0)); ret = __verify_tree(session, child_ref, vs); WT_TRET(__wt_page_release(session, child_ref, 0)); --vs->depth; WT_RET(ret); __wt_cell_unpack(child_ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); } WT_INTL_FOREACH_END;
/* * __wt_verify -- * Verify a file. */ int __wt_verify(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; WT_CKPT *ckptbase, *ckpt; WT_DECL_RET; WT_VSTUFF *vs, _vstuff; size_t root_addr_size; uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE]; bool bm_start, quit; btree = S2BT(session); bm = btree->bm; ckptbase = NULL; bm_start = false; WT_CLEAR(_vstuff); vs = &_vstuff; WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key)); WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp3)); WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp4)); /* Check configuration strings. */ WT_ERR(__verify_config(session, cfg, vs)); /* Optionally dump specific block offsets. */ WT_ERR(__verify_config_offsets(session, cfg, &quit)); if (quit) goto done; /* Get a list of the checkpoints for this file. */ WT_ERR( __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase)); /* Inform the underlying block manager we're verifying. */ WT_ERR(bm->verify_start(bm, session, ckptbase, cfg)); bm_start = true; /* Loop through the file's checkpoints, verifying each one. */ WT_CKPT_FOREACH(ckptbase, ckpt) { WT_ERR(__wt_verbose(session, WT_VERB_VERIFY, "%s: checkpoint %s", btree->dhandle->name, ckpt->name)); /* Fake checkpoints require no work. */ if (F_ISSET(ckpt, WT_CKPT_FAKE)) continue; /* House-keeping between checkpoints. */ __verify_checkpoint_reset(vs); if (WT_VRFY_DUMP(vs)) WT_ERR(__wt_msg(session, "%s: checkpoint %s", btree->dhandle->name, ckpt->name)); /* Load the checkpoint. */ WT_ERR(bm->checkpoint_load(bm, session, ckpt->raw.data, ckpt->raw.size, root_addr, &root_addr_size, true)); /* * Ignore trees with no root page. * Verify, then discard the checkpoint from the cache. */ if (root_addr_size != 0 && (ret = __wt_btree_tree_open( session, root_addr, root_addr_size)) == 0) { if (WT_VRFY_DUMP(vs)) WT_ERR(__wt_msg(session, "Root: %s %s", __wt_addr_string(session, root_addr, root_addr_size, vs->tmp1), __wt_page_type_string( btree->root.page->type))); WT_WITH_PAGE_INDEX(session, ret = __verify_tree(session, &btree->root, vs)); WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD)); } /* Unload the checkpoint. */ WT_TRET(bm->checkpoint_unload(bm, session)); WT_ERR(ret); /* Display the tree shape. */ if (vs->dump_shape) WT_ERR(__verify_tree_shape(session, vs)); }
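/*
 * At the API level, the checkpoint-by-checkpoint verification above is driven
 * through WT_SESSION::verify.  A minimal application-side sketch (the URI is
 * illustrative and error handling is omitted; requires wiredtiger.h):
 */
static void
example_verify(WT_SESSION *session)
{
	/* A NULL configuration string uses the default verification. */
	(void)session->verify(session, "file:example.wt", NULL);
}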
/* * __ovfl_reuse_wrapup -- * Resolve the page's overflow reuse list after a page is written. */ static int __ovfl_reuse_wrapup(WT_SESSION_IMPL *session, WT_PAGE *page) { WT_BM *bm; WT_OVFL_REUSE **e, **head, *reuse; size_t decr; int i; bm = S2BT(session)->bm; head = page->modify->ovfl_track->ovfl_reuse; /* * Discard any overflow records that aren't in-use, freeing underlying * blocks. * * First, walk the overflow reuse lists (except for the lowest one), * fixing up skiplist links. */ for (i = WT_SKIP_MAXDEPTH - 1; i > 0; --i) for (e = &head[i]; (reuse = *e) != NULL;) { if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) { e = &reuse->next[i]; continue; } *e = reuse->next[i]; } /* * Second, discard any overflow record without an in-use flag, clear * the flags for the next run. * * As part of the pass through the lowest level, figure out how much * space we added/subtracted from the page, and update its footprint. * We don't get it exactly correct because we don't know the depth of * the skiplist here, but it's close enough, and figuring out the * memory footprint change in the reconciliation wrapup code means * fewer atomic updates and less code overall. */ decr = 0; for (e = &head[0]; (reuse = *e) != NULL;) { if (F_ISSET(reuse, WT_OVFL_REUSE_INUSE)) { F_CLR(reuse, WT_OVFL_REUSE_INUSE | WT_OVFL_REUSE_JUST_ADDED); e = &reuse->next[0]; continue; } *e = reuse->next[0]; WT_ASSERT(session, !F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED)); if (WT_VERBOSE_ISSET(session, WT_VERB_OVERFLOW)) WT_RET( __ovfl_reuse_verbose(session, page, reuse, "free")); WT_RET(bm->free( bm, session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size)); decr += WT_OVFL_SIZE(reuse, WT_OVFL_REUSE); __wt_free(session, reuse); } if (decr != 0) __wt_cache_page_inmem_decr(session, page, decr); return (0); }
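/*
 * The removal loops above use the pointer-to-pointer idiom: "e" always points
 * at the link referencing the current node, so unlinking is a single
 * assignment and no separate "previous node" bookkeeping is needed.  A
 * self-contained sketch of the same idiom on a plain singly-linked list:
 */
struct example_node {
	struct example_node *next;
	bool keep;
};

static void
example_filter(struct example_node **headp)
{
	struct example_node **e, *node;

	for (e = headp; (node = *e) != NULL;)
		if (node->keep)
			e = &node->next;	/* Step past kept nodes. */
		else
			*e = node->next;	/* Unlink; caller owns memory. */
}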
/* * __wt_compact -- * Compact a file. */ int __wt_compact(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; WT_REF *ref; bool block_manager_begin, skip; WT_UNUSED(cfg); btree = S2BT(session); bm = btree->bm; ref = NULL; block_manager_begin = false; WT_STAT_FAST_DATA_INCR(session, session_compact); /* * Check if compaction might be useful -- the API layer will quit trying * to compact the data source if we make no progress, set a flag if the * block layer thinks compaction is possible. */ WT_RET(bm->compact_skip(bm, session, &skip)); if (skip) return (0); /* * Reviewing in-memory pages requires looking at page reconciliation * results, because we care about where the page is stored now, not * where the page was stored when we first read it into the cache. * We need to ensure we don't race with page reconciliation as it's * writing the page modify information. * * There are three ways we call reconciliation: checkpoints, threads * writing leaf pages (usually in preparation for a checkpoint or if * closing a file), and eviction. * * We're holding the schema lock which serializes with checkpoints. */ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); /* * Get the tree handle's flush lock which blocks threads writing leaf * pages. */ __wt_spin_lock(session, &btree->flush_lock); /* Start compaction. */ WT_ERR(bm->compact_start(bm, session)); block_manager_begin = true; /* Walk the tree reviewing pages to see if they should be re-written. */ for (;;) { /* * Pages read for compaction aren't "useful"; don't update the * read generation of pages already in memory, and if a page is * read, set its generation to a low value so it is evicted * quickly. */ WT_ERR(__wt_tree_walk(session, &ref, WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED)); if (ref == NULL) break; WT_ERR(__compact_rewrite(session, ref, &skip)); if (skip) continue; session->compact_state = WT_COMPACT_SUCCESS; /* Rewrite the page: mark the page and tree dirty. */ WT_ERR(__wt_page_modify_init(session, ref->page)); __wt_page_modify_set(session, ref->page); WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite); } err: if (ref != NULL) WT_TRET(__wt_page_release(session, ref, 0)); if (block_manager_begin) WT_TRET(bm->compact_end(bm, session)); /* Unblock threads writing leaf pages. */ __wt_spin_unlock(session, &btree->flush_lock); return (ret); }
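/*
 * The tree-level pass above is reached through WT_SESSION::compact.  A
 * minimal application-side sketch (the URI is illustrative and error handling
 * is omitted):
 */
static void
example_compact(WT_SESSION *session)
{
	/* A NULL configuration string uses the default compaction settings. */
	(void)session->compact(session, "table:example", NULL);
}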
/* * __wt_bt_read -- * Read a cookie referenced block into a buffer. */ int __wt_bt_read(WT_SESSION_IMPL *session, WT_ITEM *buf, const uint8_t *addr, size_t addr_size) { WT_BM *bm; WT_BTREE *btree; WT_DECL_ITEM(tmp); WT_DECL_RET; const WT_PAGE_HEADER *dsk; size_t result_len; btree = S2BT(session); bm = btree->bm; /* * If anticipating a compressed block, read into a scratch buffer and * decompress into the caller's buffer. Else, read directly into the * caller's buffer. */ if (btree->compressor == NULL) { WT_RET(bm->read(bm, session, buf, addr, addr_size)); dsk = buf->data; } else { WT_RET(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->read(bm, session, tmp, addr, addr_size)); dsk = tmp->data; } /* * If the block is compressed, copy the skipped bytes of the original * image into place, then decompress. */ if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) { if (btree->compressor == NULL || btree->compressor->decompress == NULL) WT_ERR_MSG(session, WT_ERROR, "read compressed block where no compression engine " "configured"); /* * We're allocating the exact number of bytes we're expecting * from decompression. */ WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size)); /* * Note the source length is NOT the number of compressed bytes, * it's the length of the block we just read (minus the skipped * bytes). We don't store the number of compressed bytes: some * compression engines need that length stored externally, they * don't have markers in the stream to signal the end of the * compressed bytes. Those engines must store the compressed * byte length somehow, see the snappy compression extension for * an example. */ memcpy(buf->mem, tmp->data, WT_BLOCK_COMPRESS_SKIP); ret = btree->compressor->decompress( btree->compressor, &session->iface, (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP, tmp->size - WT_BLOCK_COMPRESS_SKIP, (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP, dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len); /* * If checksums were turned off because we're depending on the * decompression to fail on any corrupted data, we'll end up * here after corruption happens. If we're salvaging the file, * it's OK, otherwise it's really, really bad. */ if (ret != 0 || result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP) WT_ERR( F_ISSET(btree, WT_BTREE_VERIFY) || F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ? WT_ERROR : __wt_illegal_value(session, btree->dhandle->name)); } else if (btree->compressor == NULL) buf->size = dsk->mem_size; else /* * We guessed wrong: there was a compressor, but this * block was not compressed, and now the page is in the * wrong buffer and the buffer may be of the wrong size. * This should be rare, but happens with small blocks * that aren't worth compressing. */ WT_ERR(__wt_buf_set( session, buf, tmp->data, dsk->mem_size)); /* If the handle is a verify handle, verify the physical page. */ if (F_ISSET(btree, WT_BTREE_VERIFY)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size)); WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf)); } WT_STAT_FAST_CONN_INCR(session, cache_read); WT_STAT_FAST_DATA_INCR(session, cache_read); if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) WT_STAT_FAST_DATA_INCR(session, compress_read); WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size); WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size); err: __wt_scr_free(session, &tmp); return (ret); }
/* * __compact_rewrite -- * Return if a page needs to be re-written. */ static int __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) { WT_BM *bm; WT_DECL_RET; WT_MULTI *multi; WT_PAGE *page; WT_PAGE_MODIFY *mod; size_t addr_size; uint32_t i; const uint8_t *addr; *skipp = true; /* Default skip. */ bm = S2BT(session)->bm; page = ref->page; mod = page->modify; /* * Ignore the root: it may not have a replacement address, and besides, * if anything else gets written, so will it. */ if (__wt_ref_is_root(ref)) return (0); /* Ignore currently dirty pages, they will be written regardless. */ if (__wt_page_is_modified(page)) return (0); /* * If the page is clean, test the original addresses. * If the page is a replacement, test the replacement addresses. * Ignore empty pages, they get merged into the parent. */ if (mod == NULL || mod->rec_result == 0) { __wt_ref_info(ref, &addr, &addr_size, NULL); if (addr == NULL) return (0); return ( bm->compact_page_skip(bm, session, addr, addr_size, skipp)); } /* * The page's modification information can change underfoot if the page * is being reconciled, serialize with reconciliation. */ if (mod->rec_result == WT_PM_REC_REPLACE || mod->rec_result == WT_PM_REC_MULTIBLOCK) WT_RET(__wt_fair_lock(session, &page->page_lock)); if (mod->rec_result == WT_PM_REC_REPLACE) ret = bm->compact_page_skip(bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp); if (mod->rec_result == WT_PM_REC_MULTIBLOCK) for (multi = mod->mod_multi, i = 0; i < mod->mod_multi_entries; ++multi, ++i) { if (multi->disk_image != NULL) continue; if ((ret = bm->compact_page_skip(bm, session, multi->addr.addr, multi->addr.size, skipp)) != 0) break; if (!*skipp) break; } if (mod->rec_result == WT_PM_REC_REPLACE || mod->rec_result == WT_PM_REC_MULTIBLOCK) WT_TRET(__wt_fair_unlock(session, &page->page_lock)); return (ret); }
/* * __verify_dsk_row -- * Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it. */ static int __verify_dsk_row( WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk) { WT_BM *bm; WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_DECL_ITEM(current); WT_DECL_ITEM(last_ovfl); WT_DECL_ITEM(last_pfx); WT_DECL_RET; WT_ITEM *last; enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type; void *huffman; uint32_t cell_num, cell_type, i, key_cnt, prefix; uint8_t *end; int cmp; btree = S2BT(session); bm = btree->bm; unpack = &_unpack; huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key; WT_ERR(__wt_scr_alloc(session, 0, ¤t)); WT_ERR(__wt_scr_alloc(session, 0, &last_pfx)); WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl)); last = last_ovfl; end = (uint8_t *)dsk + dsk->mem_size; last_cell_type = FIRST; cell_num = 0; key_cnt = 0; WT_CELL_FOREACH(btree, dsk, cell, unpack, i) { ++cell_num; /* Carefully unpack the cell. */ if (__wt_cell_unpack_safe(cell, unpack, end) != 0) { ret = __err_cell_corrupted(session, cell_num, tag); goto err; } /* Check the raw and collapsed cell types. */ WT_ERR(__err_cell_type( session, cell_num, tag, unpack->raw, dsk->type)); WT_ERR(__err_cell_type( session, cell_num, tag, unpack->type, dsk->type)); cell_type = unpack->type; /* * Check ordering relationships between the WT_CELL entries. * For row-store internal pages, check for: * two values in a row, * two keys in a row, * a value as the first cell on a page. * For row-store leaf pages, check for: * two values in a row, * a value as the first cell on a page. */ switch (cell_type) { case WT_CELL_KEY: case WT_CELL_KEY_OVFL: ++key_cnt; switch (last_cell_type) { case FIRST: case WAS_VALUE: break; case WAS_KEY: if (dsk->type == WT_PAGE_ROW_LEAF) break; WT_ERR_VRFY(session, "cell %" PRIu32 " on page at %s is the " "first of two adjacent keys", cell_num - 1, tag); } last_cell_type = WAS_KEY; break; case WT_CELL_ADDR_DEL: case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: case WT_CELL_VALUE: case WT_CELL_VALUE_OVFL: switch (last_cell_type) { case FIRST: WT_ERR_VRFY(session, "page at %s begins with a value", tag); case WAS_KEY: break; case WAS_VALUE: WT_ERR_VRFY(session, "cell %" PRIu32 " on page at %s is the " "first of two adjacent values", cell_num - 1, tag); } last_cell_type = WAS_VALUE; break; } /* Check if any referenced item has a valid address. */ switch (cell_type) { case WT_CELL_ADDR_DEL: case WT_CELL_ADDR_INT: case WT_CELL_ADDR_LEAF: case WT_CELL_ADDR_LEAF_NO: case WT_CELL_KEY_OVFL: case WT_CELL_VALUE_OVFL: if (!bm->addr_valid(bm, session, unpack->data, unpack->size)) goto eof; break; } /* * Remaining checks are for key order and prefix compression. * If this cell isn't a key, we're done, move to the next cell. * If this cell is an overflow item, instantiate the key and * compare it with the last key. Otherwise, we have to deal with * prefix compression. */ switch (cell_type) { case WT_CELL_KEY: break; case WT_CELL_KEY_OVFL: WT_ERR(__wt_dsk_cell_data_ref( session, dsk->type, unpack, current)); goto key_compare; default: /* Not a key -- continue with the next cell. */ continue; } /* * Prefix compression checks. * * Confirm the first non-overflow key on a page has a zero * prefix compression count. 
*/ prefix = unpack->prefix; if (last_pfx->size == 0 && prefix != 0) WT_ERR_VRFY(session, "the %" PRIu32 " key on page at %s is the first " "non-overflow key on the page and has a non-zero " "prefix compression value", cell_num, tag); /* Confirm the prefix compression count is possible. */ if (cell_num > 1 && prefix > last->size) WT_ERR_VRFY(session, "key %" PRIu32 " on page at %s has a prefix " "compression count of %" PRIu32 ", larger than " "the length of the previous key, %" WT_SIZET_FMT, cell_num, tag, prefix, last->size); /* * If Huffman decoding required, unpack the cell to build the * key, then resolve the prefix. Else, we can do it faster * internally because we don't have to shuffle memory around as * much. */ if (huffman != NULL) { WT_ERR(__wt_dsk_cell_data_ref( session, dsk->type, unpack, current)); /* * If there's a prefix, make sure there's enough buffer * space, then shift the decoded data past the prefix * and copy the prefix into place. Take care with the * pointers: current->data may be pointing inside the * buffer. */ if (prefix != 0) { WT_ERR(__wt_buf_grow( session, current, prefix + current->size)); memmove((uint8_t *)current->mem + prefix, current->data, current->size); memcpy(current->mem, last->data, prefix); current->data = current->mem; current->size += prefix; } } else { /* * Get the cell's data/length and make sure we have * enough buffer space. */ WT_ERR(__wt_buf_init( session, current, prefix + unpack->size)); /* Copy the prefix then the data into place. */ if (prefix != 0) memcpy(current->mem, last->data, prefix); memcpy((uint8_t *)current->mem + prefix, unpack->data, unpack->size); current->size = prefix + unpack->size; } key_compare: /* * Compare the current key against the last key. * * Be careful about the 0th key on internal pages: we only store * the first byte and custom collators may not be able to handle * truncated keys. */ if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) || (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) { WT_ERR(__wt_compare( session, btree->collator, last, current, &cmp)); if (cmp >= 0) WT_ERR_VRFY(session, "the %" PRIu32 " and %" PRIu32 " keys on " "page at %s are incorrectly sorted", cell_num - 2, cell_num, tag); } /* * Swap the buffers: last always references the last key entry, * last_pfx and last_ovfl reference the last prefix-compressed * and last overflow key entries. Current gets pointed to the * buffer we're not using this time around, which is where the * next key goes. */ last = current; if (cell_type == WT_CELL_KEY) { current = last_pfx; last_pfx = last; } else { current = last_ovfl; last_ovfl = last; } WT_ASSERT(session, last != current); }
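/*
 * A worked example of the prefix-compression rebuild above: with a previous
 * key of "application" and a cell carrying a prefix count of 5 and the suffix
 * "es", the reconstructed key is "appli" + "es" = "applies".  A self-contained
 * sketch of the copy (the verification code above does the same thing with
 * WT_ITEM buffers):
 */
static size_t
example_prefix_rebuild(uint8_t *dest, const uint8_t *last_key, size_t prefix,
    const uint8_t *suffix, size_t suffix_len)
{
	/* Copy the shared prefix from the previous key, then append the suffix. */
	memcpy(dest, last_key, prefix);
	memcpy(dest + prefix, suffix, suffix_len);
	return (prefix + suffix_len);
}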
/* * __meta_track_apply -- * Apply the changes in a metadata tracking record. */ static int __meta_track_apply(WT_SESSION_IMPL *session, WT_META_TRACK *trk, int unroll) { WT_BM *bm; WT_BTREE *btree; WT_DECL_RET; int tret; /* * Unlock handles and complete checkpoints regardless of whether we are * unrolling. */ if (!unroll && trk->op != WT_ST_CHECKPOINT && trk->op != WT_ST_LOCK) goto free; switch (trk->op) { case WT_ST_EMPTY: /* Unused slot */ break; case WT_ST_CHECKPOINT: /* Checkpoint, see above */ if (!unroll) { btree = trk->dhandle->handle; bm = btree->bm; WT_WITH_DHANDLE(session, trk->dhandle, WT_TRET(bm->checkpoint_resolve(bm, session))); } break; case WT_ST_LOCK: /* Handle lock, see above */ if (unroll && trk->created) F_SET(trk->dhandle, WT_DHANDLE_DISCARD); WT_WITH_DHANDLE(session, trk->dhandle, WT_TRET(__wt_session_release_btree(session))); break; case WT_ST_FILEOP: /* File operation */ /* * For renames, both a and b are set. * For creates, a is NULL. * For removes, b is NULL. */ if (trk->a != NULL && trk->b != NULL && (tret = __wt_rename(session, trk->b + strlen("file:"), trk->a + strlen("file:"))) != 0) { __wt_err(session, tret, "metadata unroll rename %s to %s", trk->b, trk->a); WT_TRET(tret); } else if (trk->a == NULL) { if ((tret = __wt_remove(session, trk->b + strlen("file:"))) != 0) { __wt_err(session, tret, "metadata unroll create %s", trk->b); WT_TRET(tret); } } /* * We can't undo removes yet: that would imply * some kind of temporary rename and remove in * roll forward. */ break; case WT_ST_REMOVE: /* Remove trk.a */ if ((tret = __wt_metadata_remove(session, trk->a)) != 0) { __wt_err(session, tret, "metadata unroll remove: %s", trk->a); WT_TRET(tret); } break; case WT_ST_SET: /* Set trk.a to trk.b */ if ((tret = __wt_metadata_update( session, trk->a, trk->b)) != 0) { __wt_err(session, tret, "metadata unroll update %s to %s", trk->a, trk->b); WT_TRET(tret); } break; WT_ILLEGAL_VALUE(session); } free: trk->op = WT_ST_EMPTY; __wt_free(session, trk->a); __wt_free(session, trk->b); trk->dhandle = NULL; return (ret); }