/*
 * __compact_rewrite --
 *	Ask the block manager whether an in-memory page is worth re-writing.
 *
 * On return *skipp is true (skip the page) unless the block manager's
 * compact_page_skip method changed it.  Returns 0 or an error from the
 * address lookup, the page lock, or the block manager.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	size_t addr_size;
	const uint8_t *addr;

	/* Unless told otherwise, the page is skipped. */
	*skipp = true;

	bm = S2BT(session)->bm;
	page = ref->page;
	mod = page->modify;

	/*
	 * Two cases need no further work: the root (it may have no replacement
	 * address and is written whenever anything below it is), and pages that
	 * are already dirty (they are going to be written anyway).
	 */
	if (__wt_ref_is_root(ref) || __wt_page_is_modified(page))
		return (0);

	if (mod != NULL && mod->rec_result != 0) {
		/*
		 * The page has been reconciled.  Only a 1-to-1 replacement has
		 * an address worth testing; any other result is left at the
		 * default (skip).
		 */
		if (mod->rec_result != WT_PM_REC_REPLACE)
			return (0);

		/*
		 * Reconciliation can change the modification information
		 * underneath us: hold the page lock while the replacement
		 * address is examined.
		 */
		WT_RET(__wt_fair_lock(session, &page->page_lock));
		ret = bm->compact_page_skip(bm, session,
		    mod->mod_replace.addr, mod->mod_replace.size, skipp);
		WT_TRET(__wt_fair_unlock(session, &page->page_lock));
		return (ret);
	}

	/* Clean, never-reconciled page: test its original address. */
	WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL)
		return (0);
	return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
/*
 * __wt_compact_page_skip --
 *	Decide whether compaction needs the page behind a WT_REF to be read.
 *
 * *skipp is false (read the page) unless the block manager's
 * compact_page_skip method sets it otherwise.  Returns 0 or the block
 * manager's error.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *bm;
	size_t addr_size;
	u_int type;
	const uint8_t *addr;

	*skipp = false;				/* Read unless told not to. */
	type = 0;				/* Silence the compiler. */

	bm = S2BT(session)->bm;

	/*
	 * No hazard pointer is held, so the page itself is off limits and only
	 * the WT_REF information can be inspected.
	 */
	__wt_ref_info(ref, &addr, &addr_size, &type);

	/*
	 * Without an address the page isn't on disk, and internal pages have
	 * to be read to walk the tree: in both cases keep the default.
	 */
	if (addr == NULL || type == WT_CELL_ADDR_INT)
		return (0);

	/*
	 * A leaf page with an address: let the block manager say whether a
	 * rewrite would help, so the I/O can be avoided when it wouldn't.
	 */
	return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
/*
 * __wt_compact_page_skip --
 *	Report whether the block manager wants this page re-written.
 *
 * This is the one compaction test made before a page is read, called from
 * inside the tree-walking routine, so no I/O is done for a page whose
 * rewrite would not help.  *skipp is set to 1 for anything that is not an
 * on-disk page with an address; otherwise the block manager sets it.
 */
int
__wt_compact_page_skip(
    WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref, int *skipp)
{
	WT_BM *bm;
	uint32_t addr_size;
	const uint8_t *addr;

	bm = S2BT(session)->bm;

	/*
	 * Only on-disk pages are interesting here, a pass has already been made
	 * over the in-memory pages.
	 */
	if (ref->state == WT_REF_DISK) {
		__wt_get_addr(parent, ref, &addr, &addr_size);
		if (addr != NULL)
			return (bm->compact_page_skip(
			    bm, session, addr, addr_size, skipp));
	}

	/* Not on disk, or no address to test: skip the page. */
	*skipp = 1;
	return (0);
}
/*
 * __compact_rewrite --
 *	Ask the block manager whether an in-memory page is worth re-writing.
 *
 * On return *skipp is 1 (skip the page) unless the block manager's
 * compact_page_skip method changed it.  Returns 0 or an error from the
 * address lookup or the block manager.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	size_t addr_size;
	const uint8_t *addr;

	/* Skip by default. */
	*skipp = 1;

	bm = S2BT(session)->bm;
	page = ref->page;
	mod = page->modify;

	/* The root page and pages that are already dirty are not tested. */
	if (__wt_ref_is_root(ref) || __wt_page_is_modified(page))
		return (0);

	if (mod != NULL && F_ISSET(mod, WT_PM_REC_MASK) != 0) {
		/*
		 * The page has a reconciliation result.  Only a replacement
		 * has an address to test; anything else keeps the default.
		 */
		if (F_ISSET(mod, WT_PM_REC_MASK) != WT_PM_REC_REPLACE)
			return (0);

		/* Query the replacement block with the page lock held. */
		WT_PAGE_LOCK(session, page);
		ret = bm->compact_page_skip(bm, session,
		    mod->mod_replace.addr, mod->mod_replace.size, skipp);
		WT_PAGE_UNLOCK(session, page);
		return (ret);
	}

	/* No reconciliation result: test the address held by the WT_REF. */
	WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL)
		return (0);
	return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
/*
 * __wt_compact_page_skip --
 *	Decide whether compaction needs the page behind a WT_REF to be read.
 *
 * *skipp is 0 (read the page) unless the block manager's compact_page_skip
 * method sets it otherwise.  Returns 0 or an error from the address lookup
 * or the block manager.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
{
	WT_BM *bm;
	size_t addr_size;
	u_int type;
	const uint8_t *addr;

	*skipp = 0;				/* Read unless told not to. */
	type = 0;

	bm = S2BT(session)->bm;

	/*
	 * No hazard pointer is held, so the page itself is off limits and only
	 * the WT_REF information can be inspected.
	 */
	WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, &type));

	/*
	 * Without an address the page isn't on disk, and internal pages have
	 * to be read to walk the tree: in both cases keep the default.
	 */
	if (addr == NULL || type == WT_CELL_ADDR_INT)
		return (0);

	/* Leaf page with an address: defer to the block manager. */
	return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
}
/*
 * __compact_rewrite --
 *	Return if a page needs to be re-written.
 *
 * On return *skipp is true (skip the page) unless the block manager's
 * compact_page_skip method changed it.  Returns 0 or an error from the
 * page lock or the block manager.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	WT_MULTI *multi;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	size_t addr_size;
	uint32_t i;
	const uint8_t *addr;

	*skipp = true;					/* Default skip. */

	bm = S2BT(session)->bm;
	page = ref->page;
	mod = page->modify;

	/*
	 * Ignore the root: it may not have a replacement address, and besides,
	 * if anything else gets written, so will it.
	 */
	if (__wt_ref_is_root(ref))
		return (0);

	/* Ignore currently dirty pages, they will be written regardless. */
	if (__wt_page_is_modified(page))
		return (0);

	/*
	 * If the page is clean, test the original addresses.
	 * If the page is a replacement, test the replacement addresses.
	 * Ignore empty pages, they get merged into the parent.
	 */
	if (mod == NULL || mod->rec_result == 0) {
		__wt_ref_info(ref, &addr, &addr_size, NULL);
		if (addr == NULL)
			return (0);
		return (
		    bm->compact_page_skip(bm, session, addr, addr_size, skipp));
	}

	/*
	 * Only replacement and multi-block results have addresses to test.
	 * Make that decision exactly once, before locking: the result can
	 * change underfoot, and re-testing it separately to decide whether to
	 * lock and whether to unlock could leave the two out of step.
	 */
	if (mod->rec_result != WT_PM_REC_REPLACE &&
	    mod->rec_result != WT_PM_REC_MULTIBLOCK)
		return (0);

	/*
	 * The page's modification information can change underfoot if the page
	 * is being reconciled, serialize with reconciliation.  From here on
	 * the lock is held unconditionally until the matching unlock below.
	 */
	WT_RET(__wt_fair_lock(session, &page->page_lock));

	if (mod->rec_result == WT_PM_REC_REPLACE)
		ret = bm->compact_page_skip(bm, session,
		    mod->mod_replace.addr, mod->mod_replace.size, skipp);
	else if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
		for (multi = mod->mod_multi,
		    i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
			/* Entries carrying a disk image are not tested. */
			if (multi->disk_image != NULL)
				continue;
			if ((ret = bm->compact_page_skip(bm, session,
			    multi->addr.addr, multi->addr.size, skipp)) != 0)
				break;
			/* One block worth rewriting is enough. */
			if (!*skipp)
				break;
		}

	WT_TRET(__wt_fair_unlock(session, &page->page_lock));

	return (ret);
}
/*
 * __wt_compact_evict --
 *	Decide whether re-writing this page would help the file's size and,
 * if so, mark the page and tree dirty so the page gets written.
 *
 * Returns 0 when nothing needs doing or after marking the page, otherwise an
 * error from the block manager or from page-modify initialization.
 */
int
__wt_compact_evict(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BM *bm;
	WT_PAGE_MODIFY *mod;
	int skip;
	uint32_t addr_size, rec_state;
	const uint8_t *addr;

	bm = S2BT(session)->bm;
	mod = page->modify;

	/*
	 * Reconciliation information has to be reviewed because an in-memory
	 * page's original disk addresses might have been fine for compaction
	 * while its replacement addresses are a problem.
	 *
	 * The root is ignored (it may not have a replacement address, and it
	 * gets written if anything else does), as are pages that are already
	 * dirty (they will be written in any case).
	 */
	if (WT_PAGE_IS_ROOT(page) || __wt_page_is_modified(page))
		return (0);

	/*
	 * A page without a modify structure is handled exactly like a page
	 * with no reconciliation result: both test the original address.
	 */
	rec_state = mod == NULL ? 0 : F_ISSET(mod, WT_PM_REC_MASK);

	switch (rec_state) {
	case WT_PM_REC_EMPTY:
	case WT_PM_REC_SPLIT:
	case WT_PM_REC_SPLIT_MERGE:
		/* Empty and split pages are not rewritten. */
		return (0);
	case WT_PM_REC_REPLACE:
		/* 1-to-1 replacement: test the replacement address. */
		WT_RET(bm->compact_page_skip(bm, session,
		    mod->u.replace.addr, mod->u.replace.size, &skip));
		if (skip)
			return (0);
		break;
	case 0:
		/* Clean page: test the original address. */
		__wt_get_addr(page->parent, page->ref, &addr, &addr_size);
		if (addr == NULL)
			return (0);
		WT_RET(
		    bm->compact_page_skip(bm, session, addr, addr_size, &skip));
		if (skip)
			return (0);
		break;
	}

	/* The page should be written: mark it and the tree dirty. */
	WT_RET(__wt_page_modify_init(session, page));
	__wt_page_and_tree_modify_set(session, page);

	WT_DSTAT_INCR(session, btree_compact_rewrite);
	return (0);
}
/*
 * __wt_compact_page_skip --
 *	Decide whether compaction needs the page behind a WT_REF to be read.
 *
 * *skipp is true for deleted pages, false by default for everything else,
 * and otherwise whatever the block manager's compact_page_skip method
 * reports for an on-disk, non-internal page.  Returns 0 or the block
 * manager's error.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
	WT_BM *bm;
	WT_DECL_RET;
	size_t addr_size;
	u_int type;
	const uint8_t *addr;

	/*
	 * Deleted pages are skipped, rewriting them doesn't seem useful (a
	 * better answer would be writing the parent to delete the page).
	 */
	if (ref->state == WT_REF_DELETED) {
		*skipp = true;
		return (0);
	}

	*skipp = false;				/* Read unless told not to. */

	/*
	 * Two reasons to stop here and keep the default:
	 *
	 * The page isn't on disk.  In-memory pages should be looked at, since
	 * a page may have been modified and written, and it's the current
	 * location that matters for compaction, not the original one.  This is
	 * the cheap test, so it goes first.
	 *
	 * The WT_REF can't be locked.  Nothing stops the WT_REF state, and with
	 * it the address, from changing underfoot (the address might reference
	 * an on-page cell whose memory eviction can free), so the address is
	 * only examined after switching the state from on-disk to locked.
	 */
	if (ref->state != WT_REF_DISK ||
	    !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * An on-disk page is expected to have an address: assert it, but still
	 * test at run-time rather than dump core.
	 *
	 * Internal pages must be read to walk the tree, so only other pages
	 * are passed to the block manager, which says whether a rewrite would
	 * help and the I/O is worth doing.
	 */
	__wt_ref_info(ref, &addr, &addr_size, &type);
	WT_ASSERT(session, addr != NULL);
	if (addr != NULL && type != WT_CELL_ADDR_INT) {
		bm = S2BT(session)->bm;
		ret = bm->compact_page_skip(
		    bm, session, addr, addr_size, skipp);
	}

	/*
	 * Put the WT_REF back on-disk and publish the change.  The full barrier
	 * isn't strictly required, but it keeps pages in circulation.
	 */
	ref->state = WT_REF_DISK;
	WT_FULL_BARRIER();

	return (ret);
}