/*
 * __wt_compact_page_skip --
 *	Decide, from the WT_REF alone, whether compaction can avoid reading
 *	the referenced page.
 *
 *	On return *skipp is false unless the block manager was consulted and
 *	set it otherwise. The return value is 0 or the block manager's status.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *block_mgr;
    const uint8_t *page_addr;
    size_t page_addr_size;
    u_int cell_type;

    *skipp = false; /* Unless told otherwise, the page gets read. */
    cell_type = 0;  /* Quiet compilers that can't see it is always set. */
    block_mgr = S2BT(session)->bm;

    /*
     * No hazard pointer is held, so the page itself is off limits; only the
     * WT_REF information may be examined.
     */
    __wt_ref_info(ref, &page_addr, &page_addr_size, &cell_type);

    /*
     * Two cases never reach the block manager:
     *  - no address: the page isn't on disk, give up and read it;
     *  - internal page: it has to be read to walk the tree regardless.
     */
    if (page_addr == NULL || cell_type == WT_CELL_ADDR_INT)
        return (0);

    /* Leaf page with an address: skip the I/O if a rewrite won't help. */
    return (block_mgr->compact_page_skip(
      block_mgr, session, page_addr, page_addr_size, skipp));
}
/*
 * __compact_rewrite --
 *	Report whether an in-memory page should be rewritten by compaction.
 *
 *	*skipp starts out true (nothing to do) and is only changed by the
 *	block manager's compact_page_skip method. Returns 0 or the first error
 *	encountered.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *bm;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    const uint8_t *addr;
    size_t addr_size;

    *skipp = true;

    bm = S2BT(session)->bm;
    page = ref->page;
    mod = page->modify;

    /*
     * Nothing to do for:
     *  - the root, which may not have a replacement address and is written
     *    anyway if anything else is;
     *  - dirty pages, which will be written regardless.
     */
    if (__wt_ref_is_root(ref))
        return (0);
    if (__wt_page_is_modified(page))
        return (0);

    /* Clean page (no reconciliation result): test the original address. */
    if (mod == NULL || mod->rec_result == 0) {
        WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
        if (addr == NULL)
            return (0);
        return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
    }

    /*
     * Only 1-to-1 replacements are examined beyond this point; any other
     * reconciliation result (for example an empty page that gets merged
     * into its parent) is left alone.
     */
    if (mod->rec_result != WT_PM_REC_REPLACE)
        return (0);

    /*
     * Reconciliation can change the modification information underfoot, so
     * hold the page lock while the replacement address is inspected.
     */
    WT_RET(__wt_fair_lock(session, &page->page_lock));
    ret = bm->compact_page_skip(
      bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp);
    WT_TRET(__wt_fair_unlock(session, &page->page_lock));

    return (ret);
}
/*
 * __wt_page_addr_string --
 *	Figure out a page's "address" and load a buffer with a printable,
 *	nul-terminated representation of that address.
 *
 *	session: the calling session.
 *	ref: the reference whose address is wanted.
 *	buf: item that receives the printable form.
 *
 *	Returns the printable string: the literal "[Root]" for the root
 *	reference, otherwise whatever __wt_addr_string produces.
 */
const char *
__wt_page_addr_string(WT_SESSION_IMPL *session, WT_REF *ref, WT_ITEM *buf)
{
    size_t addr_size;
    const uint8_t *addr;

    if (__wt_ref_is_root(ref)) {
        buf->data = "[Root]";
        buf->size = strlen("[Root]");
        return (buf->data);
    }

    /*
     * This is a best-effort formatting helper, so the lookup status is
     * deliberately ignored. Start from "no address" so that a lookup that
     * fails before filling in its outputs hands __wt_addr_string a
     * well-defined NULL/0 pair instead of indeterminate stack contents.
     */
    addr = NULL;
    addr_size = 0;
    (void)__wt_ref_info(session, ref, &addr, &addr_size, NULL);
    return (__wt_addr_string(session, addr, addr_size, buf));
}
/*
 * __ref_is_leaf --
 *	Return true when the reference is known to point at a leaf page.
 */
static inline bool
__ref_is_leaf(WT_REF *ref)
{
    const uint8_t *page_addr;
    size_t page_addr_size;
    u_int cell_type;

    /*
     * A disk address can be cracked to learn the page type. Without an
     * address the page isn't on disk and its type is unknown, so the answer
     * is "not a leaf".
     */
    __wt_ref_info(ref, &page_addr, &page_addr_size, &cell_type);
    if (page_addr == NULL)
        return (false);

    return (cell_type == WT_CELL_ADDR_LEAF ||
      cell_type == WT_CELL_ADDR_LEAF_NO);
}
/*
 * __wt_page_addr_string --
 *	Figure out a page's "address" and load a buffer with a printable,
 *	nul-terminated representation of that address.
 *
 *	session: the calling session.
 *	buf: item that receives the printable form.
 *	page: the page whose address is wanted.
 *
 *	Returns the printable string: the literal "[Root]" for a root page,
 *	otherwise whatever __wt_addr_string produces.
 */
const char *
__wt_page_addr_string(WT_SESSION_IMPL *session, WT_ITEM *buf, WT_PAGE *page)
{
    uint32_t size;
    const uint8_t *addr;

    if (WT_PAGE_IS_ROOT(page)) {
        buf->data = "[Root]";
        buf->size = WT_STORE_SIZE(strlen("[Root]"));
        return (buf->data);
    }

    /*
     * The lookup status is deliberately ignored (best-effort formatting).
     * Give the outputs defined values first so that a lookup that fails
     * before filling them in cannot leak indeterminate stack contents into
     * the formatter.
     * NOTE(review): assumes __wt_addr_string tolerates a NULL address with
     * zero size -- confirm against its definition.
     */
    addr = NULL;
    size = 0;
    (void)__wt_ref_info(
      session, page->parent, page->ref, &addr, &size, NULL);
    return (__wt_addr_string(session, buf, addr, size));
}
/*
 * __compact_rewrite --
 *	Ask the block manager whether the blocks backing an in-memory page are
 *	worth rewriting for compaction.
 *
 *	*skipp is set to 1 up front and is only changed by the block manager's
 *	compact_page_skip method. Returns 0 or an error code.
 */
static int
__compact_rewrite(WT_SESSION_IMPL* session, WT_REF* ref, int* skipp)
{
    WT_BM *bm;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    const uint8_t *addr;
    size_t addr_size;

    *skipp = 1;

    bm = S2BT(session)->bm;
    page = ref->page;
    mod = page->modify;

    /* The root page is never considered for compaction. */
    if (__wt_ref_is_root(ref))
        return (0);

    /* A dirty page is not considered either. */
    if (__wt_page_is_modified(page))
        return (0);

    /*
     * No modify structure, or no reconciliation result flags set: test the
     * block the WT_REF currently points at.
     */
    if (mod == NULL || F_ISSET(mod, WT_PM_REC_MASK) == 0) {
        WT_RET(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
        if (addr == NULL)
            return (0);
        return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
    }

    /* Any result other than a replacement block leaves the default alone. */
    if (F_ISSET(mod, WT_PM_REC_MASK) != WT_PM_REC_REPLACE)
        return (0);

    /*
     * The page was reconciled into a replacement block: test that block's
     * address instead, holding the page lock while it is examined.
     */
    WT_PAGE_LOCK(session, page);
    ret = bm->compact_page_skip(
      bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp);
    WT_PAGE_UNLOCK(session, page);

    return (ret);
}
/*
 * __wt_compact_page_skip --
 *	Called before reading the page a WT_REF refers to: decide whether
 *	compaction needs that page read at all.
 *
 *	*skipp is 0 on return unless the block manager was consulted and set it
 *	otherwise. Returns 0 or an error code.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL* session, WT_REF* ref, int* skipp)
{
    WT_BM *block_mgr;
    const uint8_t *page_addr;
    size_t page_addr_size;
    u_int cell_type;

    *skipp = 0;
    cell_type = 0;
    block_mgr = S2BT(session)->bm;

    /*
     * No hazard pointer is held, so the page itself can't be examined; the
     * WT_REF information is all that is available.
     */
    WT_RET(__wt_ref_info(
      session, ref, &page_addr, &page_addr_size, &cell_type));

    /*
     * Without an address the page isn't on disk, and internal pages have to
     * be read to walk the tree regardless: in both cases just read it.
     */
    if (page_addr == NULL || cell_type == WT_CELL_ADDR_INT)
        return (0);

    /* Otherwise the block manager decides. */
    return (block_mgr->compact_page_skip(
      block_mgr, session, page_addr, page_addr_size, skipp));
}
/*
 * __page_read --
 *	Read a page from the file.
 *
 *	The caller's WT_REF is moved out of WT_REF_DISK or WT_REF_DELETED with
 *	a compare-and-swap; if neither swap succeeds the function returns 0
 *	without touching the reference. On success the reference is published
 *	as WT_REF_MEM. On failure any page attached to the reference is
 *	discarded, the reference is restored to the state it was found in, the
 *	read buffer is freed and the error is returned.
 */
static int
__page_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
	const WT_PAGE_HEADER *dsk;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_ITEM tmp;
	WT_PAGE *page;
	size_t addr_size;
	uint32_t previous_state;
	const uint8_t *addr;

	btree = S2BT(session);
	page = NULL;

	/*
	 * Don't pass an allocated buffer to the underlying block read function,
	 * force allocation of new memory of the appropriate size.
	 */
	WT_CLEAR(tmp);

	/*
	 * Attempt to set the state to WT_REF_READING for normal reads, or
	 * WT_REF_LOCKED, for deleted pages. If successful, we've won the
	 * race, read the page. Remember which state we started from so the
	 * error path can put it back.
	 */
	if (__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_READING))
		previous_state = WT_REF_DISK;
	else if (__wt_atomic_casv32(&ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		previous_state = WT_REF_DELETED;
	else
		return (0);

	/*
	 * Get the address: if there is no address, the page was deleted, but a
	 * subsequent search or insert is forcing re-creation of the name space.
	 * In that case a new, empty leaf page stands in for it.
	 */
	WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
	if (addr == NULL) {
		WT_ASSERT(session, previous_state == WT_REF_DELETED);

		WT_ERR(__wt_btree_new_leaf_page(session, &page));
		ref->page = page;
		goto done;
	}

	/*
	 * There's an address, read or map the backing disk page and build an
	 * in-memory version of the page.
	 */
	WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));
	WT_ERR(__wt_page_inmem(session, ref, tmp.data, tmp.memsize,
	    WT_DATA_IN_ITEM(&tmp) ?
	    WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED, &page));

	/*
	 * Clear the local reference to an allocated copy of the disk image on
	 * return; the page steals it, errors in this code should not free it.
	 * (tmp.data is left alone and is still used below to reach the header.)
	 */
	tmp.mem = NULL;

	/*
	 * If reading for a checkpoint, there's no additional work to do, the
	 * page on disk is correct as written.
	 */
	if (session->dhandle->checkpoint != NULL)
		goto done;

	/* If the page was deleted, instantiate that information. */
	if (previous_state == WT_REF_DELETED)
		WT_ERR(__wt_delete_page_instantiate(session, ref));

	/*
	 * Instantiate updates from the database's lookaside table. The page
	 * flag was set when the page was written, potentially a long time ago.
	 * We only care if the lookaside table is currently active, check that
	 * before doing any work.
	 */
	dsk = tmp.data;
	if (F_ISSET(dsk, WT_PAGE_LAS_UPDATE) && __wt_las_is_written(session)) {
		WT_STAT_FAST_CONN_INCR(session, cache_read_lookaside);
		WT_STAT_FAST_DATA_INCR(session, cache_read_lookaside);

		WT_ERR(__las_page_instantiate(
		    session, ref, btree->id, addr, addr_size));
	}

	/* Success: make the in-memory page visible through the reference. */
done:	WT_PUBLISH(ref->state, WT_REF_MEM);
	return (0);

err:	/*
	 * If the function building an in-memory version of the page failed,
	 * it discarded the page, but not the disk image. Discard the page
	 * and separately discard the disk image in all cases.
	 */
	if (ref->page != NULL)
		__wt_ref_out(session, ref);
	/* Put the reference back in the state it was found in. */
	WT_PUBLISH(ref->state, previous_state);

	__wt_buf_free(session, &tmp);

	return (ret);
}
/*
 * __compact_rewrite --
 *	Return if a page needs to be re-written.
 *
 *	session: the calling session.
 *	ref: reference to the in-memory page being considered.
 *	skipp: set to true up front (skip the page); only changed by the block
 *	    manager's compact_page_skip method.
 *
 *	Returns 0 or the first error encountered.
 */
static int
__compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *bm;
    WT_DECL_RET;
    WT_MULTI *multi;
    WT_PAGE *page;
    WT_PAGE_MODIFY *mod;
    size_t addr_size;
    uint32_t i;
    const uint8_t *addr;

    *skipp = true; /* Default skip. */

    bm = S2BT(session)->bm;
    page = ref->page;
    mod = page->modify;

    /*
     * Ignore the root: it may not have a replacement address, and besides,
     * if anything else gets written, so will it.
     */
    if (__wt_ref_is_root(ref))
        return (0);

    /* Ignore currently dirty pages, they will be written regardless. */
    if (__wt_page_is_modified(page))
        return (0);

    /*
     * If the page is clean, test the original addresses.
     * If the page is a replacement, test the replacement addresses.
     * Ignore empty pages, they get merged into the parent.
     */
    if (mod == NULL || mod->rec_result == 0) {
        __wt_ref_info(ref, &addr, &addr_size, NULL);
        if (addr == NULL)
            return (0);
        return (bm->compact_page_skip(bm, session, addr, addr_size, skipp));
    }

    /*
     * Only replacement and multi-block results carry addresses worth
     * testing; anything else leaves the default in place.
     */
    if (mod->rec_result != WT_PM_REC_REPLACE &&
      mod->rec_result != WT_PM_REC_MULTIBLOCK)
        return (ret);

    /*
     * The page's modification information can change underfoot if the page
     * is being reconciled, serialize with reconciliation.
     *
     * The decision to lock is made exactly once, and everything from here
     * to the unlock runs with the lock held. Re-testing rec_result to decide
     * whether to unlock would risk releasing a lock that was never taken,
     * or examining replacement addresses without the lock, if the result
     * changed between the tests.
     */
    WT_RET(__wt_fair_lock(session, &page->page_lock));

    /* Read the result under the lock, it may differ from what we saw. */
    if (mod->rec_result == WT_PM_REC_REPLACE)
        ret = bm->compact_page_skip(
          bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp);
    else if (mod->rec_result == WT_PM_REC_MULTIBLOCK)
        for (multi = mod->mod_multi, i = 0;
          i < mod->mod_multi_entries; ++multi, ++i) {
            /* Entries that still carry a disk image are not tested. */
            if (multi->disk_image != NULL)
                continue;
            if ((ret = bm->compact_page_skip(bm, session,
              multi->addr.addr, multi->addr.size, skipp)) != 0)
                break;
            /* One block worth rewriting is enough to rewrite the page. */
            if (!*skipp)
                break;
        }

    WT_TRET(__wt_fair_unlock(session, &page->page_lock));

    return (ret);
}
/*
 * __wt_cache_read --
 *	Read a page from the file.
 *
 *	session: the calling session.
 *	ref: the reference to bring into memory.
 *
 *	The reference is moved out of WT_REF_DISK or WT_REF_DELETED with a
 *	compare-and-swap; if neither swap succeeds the function returns 0
 *	without touching the reference. On success the reference is published
 *	as WT_REF_MEM. On failure any page attached to the reference is
 *	discarded, the reference is restored to the state it was found in, the
 *	read buffer is freed and the error is returned.
 */
int
__wt_cache_read(WT_SESSION_IMPL *session, WT_REF *ref)
{
    WT_DECL_RET;
    WT_ITEM tmp;
    WT_PAGE *page;
    WT_PAGE_STATE previous_state;
    size_t addr_size;
    const uint8_t *addr;

    page = NULL;

    /*
     * Don't pass an allocated buffer to the underlying block read function,
     * force allocation of new memory of the appropriate size.
     */
    WT_CLEAR(tmp);

    /*
     * Attempt to set the state to WT_REF_READING for normal reads, or
     * WT_REF_LOCKED, for deleted pages. If successful, we've won the
     * race, read the page. Remember the starting state for the error path.
     */
    if (WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_READING))
        previous_state = WT_REF_DISK;
    else if (WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
        previous_state = WT_REF_DELETED;
    else
        return (0);

    /*
     * Get the address: if there is no address, the page was deleted, but a
     * subsequent search or insert is forcing re-creation of the name space.
     * Otherwise, there's an address, read the backing disk page and build
     * an in-memory version of the page.
     */
    WT_ERR(__wt_ref_info(session, ref, &addr, &addr_size, NULL));
    if (addr == NULL) {
        WT_ASSERT(session, previous_state == WT_REF_DELETED);

        WT_ERR(__wt_btree_new_leaf_page(session, &page));
        ref->page = page;
    } else {
        /* Read the backing disk page. */
        WT_ERR(__wt_bt_read(session, &tmp, addr, addr_size));

        /* Build the in-memory version of the page. */
        WT_ERR(__wt_page_inmem(session, ref, tmp.data,
          WT_DATA_IN_ITEM(&tmp) ? WT_PAGE_DISK_ALLOC : WT_PAGE_DISK_MAPPED,
          &page));

        /* If the page was deleted, instantiate that information. */
        if (previous_state == WT_REF_DELETED)
            WT_ERR(__wt_delete_page_instantiate(session, ref));
    }

    /*
     * The %p conversion requires a pointer to void; passing any other
     * pointer type through a variable argument list is not portable, so
     * cast explicitly.
     */
    WT_ERR(__wt_verbose(session, WT_VERB_READ, "page %p: %s",
      (void *)page, __wt_page_type_string(page->type)));

    /* Success: make the in-memory page visible through the reference. */
    WT_PUBLISH(ref->state, WT_REF_MEM);
    return (0);

err:
    /*
     * If the function building an in-memory version of the page failed,
     * it discarded the page, but not the disk image. Discard the page
     * and separately discard the disk image in all cases.
     */
    if (ref->page != NULL)
        __wt_ref_out(session, ref);
    /* Put the reference back in the state it was found in. */
    WT_PUBLISH(ref->state, previous_state);

    __wt_buf_free(session, &tmp);

    return (ret);
}
/*
 * __wt_compact_page_skip --
 *	Decide whether compaction needs the referenced page read.
 *
 *	*skipp is true for deleted pages, false by default, and otherwise
 *	whatever the block manager reports for an on-disk leaf page. Returns 0
 *	or the block manager's status.
 */
int
__wt_compact_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_BM *block_mgr;
    WT_DECL_RET;
    const uint8_t *page_addr;
    size_t page_addr_size;
    u_int cell_type;

    /*
     * Deleted pages are skipped: rewriting them doesn't seem useful (in a
     * better world the parent would be written to delete the page). For
     * everything else the default is to read.
     */
    *skipp = ref->state == WT_REF_DELETED;
    if (*skipp)
        return (0);

    /*
     * Pages that aren't on disk are in memory and need to be looked at:
     * they may have been modified and written, and it's the current
     * location, not the original one, that matters to compaction. That
     * cheap test comes first.
     *
     * Nothing stops the WT_REF state, and with it the address, from changing
     * underfoot (the address might reference an on-page cell that eviction
     * can free), so lock the WT_REF before looking at the address. Losing
     * the race for the lock is handled like the in-memory case.
     */
    if (ref->state != WT_REF_DISK ||
      !__wt_atomic_casv32(&ref->state, WT_REF_DISK, WT_REF_LOCKED))
        return (0);

    /*
     * An on-disk page had better have an address: assert it, but also test
     * at run-time to avoid the core dump.
     *
     * Internal pages must be read to walk the tree, so only leaf pages are
     * handed to the block manager, which says whether a rewrite would help.
     */
    __wt_ref_info(ref, &page_addr, &page_addr_size, &cell_type);
    WT_ASSERT(session, page_addr != NULL);
    if (page_addr != NULL && cell_type != WT_CELL_ADDR_INT) {
        block_mgr = S2BT(session)->bm;
        ret = block_mgr->compact_page_skip(
          block_mgr, session, page_addr, page_addr_size, skipp);
    }

    /*
     * Release the WT_REF and push the change out. The full barrier isn't
     * strictly needed, but it's better to keep pages in circulation.
     */
    ref->state = WT_REF_DISK;
    WT_FULL_BARRIER();

    return (ret);
}