/*
 * __wt_compact --
 *	Compact a file.
 *
 * Reads the compaction trigger from the configuration, asks the block
 * manager whether compaction could be useful, then reviews in-memory and
 * on-disk pages, dirtying any page the tree walk reports as worth
 * re-writing so a later reconciliation relocates it.
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_CONFIG_ITEM cval;
	WT_DECL_RET;
	WT_PAGE *page;
	int trigger, skip;

	bm = S2BT(session)->bm;

	WT_DSTAT_INCR(session, session_compact);

	WT_RET(__wt_config_gets(session, cfg, "trigger", &cval));
	trigger = (int)cval.val;

	/* Check if compaction might be useful. */
	WT_RET(bm->compact_skip(bm, session, trigger, &skip));
	if (skip)
		return (0);

	/*
	 * Walk the cache reviewing in-memory pages to see if they need to be
	 * re-written. This requires looking at page reconciliation results,
	 * which means the page cannot be reconciled at the same time as it's
	 * being reviewed for compaction. The underlying functions ensure we
	 * don't collide with page eviction, but we need to make sure we don't
	 * collide with checkpoints either, they are the other operation that
	 * can reconcile a page.
	 *
	 * Bug fix: the previous code used WT_RET here, which returned with
	 * the metadata lock still held on error; capture the return value
	 * and always release the lock before checking it.
	 */
	__wt_spin_lock(session, &S2C(session)->metadata_lock);
	ret = __wt_bt_cache_op(session, NULL, WT_SYNC_COMPACT);
	__wt_spin_unlock(session, &S2C(session)->metadata_lock);
	WT_RET(ret);

	/*
	 * Walk the tree, reviewing on-disk pages to see if they need to be
	 * re-written.
	 */
	for (page = NULL;;) {
		WT_RET(__wt_tree_walk(session, &page, WT_TREE_COMPACT));
		if (page == NULL)
			break;

		/*
		 * The only pages returned by the tree walk function are pages
		 * we want to re-write; mark the page and tree dirty.
		 */
		if ((ret = __wt_page_modify_init(session, page)) != 0) {
			/* Don't leak the hazard pointer on error. */
			WT_TRET(__wt_page_release(session, page));
			WT_RET(ret);
		}
		__wt_page_and_tree_modify_set(session, page);

		WT_DSTAT_INCR(session, btree_compact_rewrite);
	}

	return (0);
}
/*
 * __sync_dup_walk --
 *	Duplicate a tree walk point.
 *
 * Releases any page previously held through dupp, then takes a second
 * hazard pointer on the walk's current page so the duplicate position is
 * protected independently of the original.
 */
static inline int
__sync_dup_walk(
    WT_SESSION_IMPL *session, WT_REF *walk, uint32_t flags, WT_REF **dupp)
{
	WT_REF *prev;
	bool hazard_busy;

	/* Drop whatever the duplicate slot currently references. */
	prev = *dupp;
	if (prev != NULL) {
		*dupp = NULL;
		WT_RET(__wt_page_release(session, prev, flags));
	}

	/*
	 * A walk that hasn't started, or that sits on the root, needs no
	 * hazard pointer: it is okay to duplicate it directly.
	 */
	if (walk == NULL || __wt_ref_is_root(walk)) {
		*dupp = walk;
		return (0);
	}

	/*
	 * Acquire a duplicate hazard pointer. We already hold one hazard
	 * pointer on this page, so spurious busy returns (e.g., eviction
	 * attempting to lock the page) are transient; keep retrying until
	 * the second pointer is set.
	 */
	do {
#ifdef HAVE_DIAGNOSTIC
		WT_RET(__wt_hazard_set(
		    session, walk, &hazard_busy, __func__, __LINE__));
#else
		WT_RET(__wt_hazard_set(session, walk, &hazard_busy));
#endif
		if (hazard_busy)
			__wt_yield();
	} while (hazard_busy);

	*dupp = walk;
	return (0);
}
/*
 * __wt_tree_walk --
 *	Move to the next/previous page in the tree.
 *
 * On entry *refp is the current position (NULL to start a new walk); on
 * return it is the next page (with a hazard pointer held unless it's the
 * root) or NULL when the walk is complete. If walkcntp is non-NULL it is
 * incremented once per slot visited.
 */
int
__wt_tree_walk(WT_SESSION_IMPL *session,
    WT_REF **refp, uint64_t *walkcntp, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_INDEX *pindex;
	WT_REF *couple, *couple_orig, *ref;
	int prev, skip;
	uint32_t slot;

	btree = S2BT(session);

	/*
	 * Tree walks are special: they look inside page structures that splits
	 * may want to free. Publish that the tree is active during this
	 * window.
	 */
	WT_ENTER_PAGE_INDEX(session);

	/*
	 * !!!
	 * Fast-truncate currently only works on row-store trees.
	 */
	if (btree->type != BTREE_ROW)
		LF_CLR(WT_READ_TRUNCATE);

	prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;

	/*
	 * There are multiple reasons and approaches to walking the in-memory
	 * tree:
	 *
	 * (1) finding pages to evict (the eviction server);
	 * (2) writing just dirty leaves or internal nodes (checkpoint);
	 * (3) discarding pages (close);
	 * (4) truncating pages in a range (fast truncate);
	 * (5) skipping pages based on outside information (compaction);
	 * (6) cursor scans (applications).
	 *
	 * Except for cursor scans and compaction, the walk is limited to the
	 * cache, no pages are read. In all cases, hazard pointers protect the
	 * walked pages from eviction.
	 *
	 * Walks use hazard-pointer coupling through the tree and that's OK
	 * (hazard pointers can't deadlock, so there's none of the usual
	 * problems found when logically locking up a btree). If the eviction
	 * thread tries to evict the active page, it fails because of our
	 * hazard pointer. If eviction tries to evict our parent, that fails
	 * because the parent has a child page that can't be discarded. We do
	 * play one game: don't couple up to our parent and then back down to a
	 * new leaf, couple to the next page to which we're descending, it
	 * saves a hazard-pointer swap for each cursor page movement.
	 *
	 * !!!
	 * NOTE: we depend on the fact it's OK to release a page we don't hold,
	 * that is, it's OK to release couple when couple is set to NULL.
	 *
	 * Take a copy of any held page and clear the return value. Remember
	 * the hazard pointer we're currently holding.
	 *
	 * We may be passed a pointer to btree->evict_page that we are clearing
	 * here. We check when discarding pages that we're not discarding that
	 * page, so this clear must be done before the page is released.
	 */
	couple = couple_orig = ref = *refp;
	*refp = NULL;

	/* If no page is active, begin a walk from the start of the tree. */
	if (ref == NULL) {
		ref = &btree->root;
		if (ref->page == NULL)
			goto done;
		goto descend;
	}

ascend:	/*
	 * If the active page was the root, we've reached the walk's end.
	 * Release any hazard-pointer we're holding.
	 */
	if (__wt_ref_is_root(ref)) {
		WT_ERR(__wt_page_release(session, couple, flags));
		goto done;
	}

	/* Figure out the current slot in the WT_REF array. */
	__wt_page_refp(session, ref, &pindex, &slot);

	for (;;) {
		/*
		 * If we're at the last/first slot on the page, return this page
		 * in post-order traversal. Otherwise we move to the next/prev
		 * slot and left/right-most element in its subtree.
		 */
		if ((prev && slot == 0) ||
		    (!prev && slot == pindex->entries - 1)) {
			ref = ref->home->pg_intl_parent_ref;

			/* Optionally skip internal pages. */
			if (LF_ISSET(WT_READ_SKIP_INTL))
				goto ascend;

			/*
			 * We've ascended the tree and are returning an internal
			 * page. If it's the root, discard our hazard pointer,
			 * otherwise, swap our hazard pointer for the page we'll
			 * return.
			 */
			if (__wt_ref_is_root(ref))
				WT_ERR(__wt_page_release(
				    session, couple, flags));
			else {
				/*
				 * Locate the reference to our parent page then
				 * swap our child hazard pointer for the parent.
				 * We don't handle restart or not-found returns.
				 * It would require additional complexity and is
				 * not a possible return: we're moving to the
				 * parent of the current child page, our parent
				 * reference can't have split or been evicted.
				 */
				__wt_page_refp(session, ref, &pindex, &slot);
				if ((ret = __wt_page_swap(
				    session, couple, ref, flags)) != 0) {
					/* Don't leak the hazard pointer. */
					WT_TRET(__wt_page_release(
					    session, couple, flags));
					WT_ERR(ret);
				}
			}

			*refp = ref;
			goto done;
		}

		if (prev)
			--slot;
		else
			++slot;
		if (walkcntp != NULL)
			++*walkcntp;

		for (;;) {
			ref = pindex->index[slot];

			if (LF_ISSET(WT_READ_CACHE)) {
				/*
				 * Only look at unlocked pages in memory:
				 * fast-path some common cases.
				 */
				if (LF_ISSET(WT_READ_NO_WAIT) &&
				    ref->state != WT_REF_MEM)
					break;
			} else if (LF_ISSET(WT_READ_TRUNCATE)) {
				/*
				 * Avoid pulling a deleted page back in to try
				 * to delete it again.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref))
					break;
				/*
				 * If deleting a range, try to delete the page
				 * without instantiating it.
				 */
				WT_ERR(__wt_delete_page(session, ref, &skip));
				if (skip)
					break;
			} else if (LF_ISSET(WT_READ_COMPACT)) {
				/*
				 * Skip deleted pages, rewriting them doesn't
				 * seem useful.
				 */
				if (ref->state == WT_REF_DELETED)
					break;

				/*
				 * If the page is in-memory, we want to look at
				 * it (it may have been modified and written,
				 * and the current location is the interesting
				 * one in terms of compaction, not the original
				 * location). If the page isn't in-memory, test
				 * if the page will help with compaction, don't
				 * read it if we don't have to.
				 */
				if (ref->state == WT_REF_DISK) {
					WT_ERR(__wt_compact_page_skip(
					    session, ref, &skip));
					if (skip)
						break;
				}
			} else {
				/*
				 * Try to skip deleted pages visible to us.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref))
					break;
			}

			ret = __wt_page_swap(session, couple, ref, flags);

			/*
			 * Not-found is an expected return when only walking
			 * in-cache pages.
			 */
			if (ret == WT_NOTFOUND) {
				ret = 0;
				break;
			}

			/*
			 * The page we're moving to might have split, in which
			 * case move to the last position we held.
			 */
			if (ret == WT_RESTART) {
				ret = 0;

				/*
				 * If a new walk that never coupled from the
				 * root to a new saved position in the tree,
				 * restart the walk.
				 */
				if (couple == &btree->root) {
					ref = &btree->root;
					if (ref->page == NULL)
						goto done;
					goto descend;
				}

				/*
				 * If restarting from some original position,
				 * repeat the increment or decrement we made at
				 * that time. Otherwise, couple is an internal
				 * page we've acquired after moving from that
				 * starting position and we can treat it as a
				 * new page. This works because we never acquire
				 * a hazard pointer on a leaf page we're not
				 * going to return to our caller, this will quit
				 * working if that ever changes.
				 */
				WT_ASSERT(session, couple == couple_orig ||
				    WT_PAGE_IS_INTERNAL(couple->page));
				ref = couple;
				__wt_page_refp(session, ref, &pindex, &slot);
				if (couple == couple_orig)
					break;
			}
			WT_ERR(ret);

			/*
			 * A new page: configure for traversal of any internal
			 * page's children, else return the leaf page.
			 */
descend:		couple = ref;
			page = ref->page;
			if (page->type == WT_PAGE_ROW_INT ||
			    page->type == WT_PAGE_COL_INT) {
				WT_INTL_INDEX_GET(session, page, pindex);
				slot = prev ? pindex->entries - 1 : 0;
			} else {
				*refp = ref;
				goto done;
			}
		}
	}

done:
err:	WT_LEAVE_PAGE_INDEX(session);
	return (ret);
}
/*
 * __sync_file --
 *	Flush pages for a specific file.
 *
 * Supports two operations: WT_SYNC_WRITE_LEAVES writes immediately
 * available dirty leaf pages; WT_SYNC_CHECKPOINT writes all dirty in-cache
 * pages. Both serialize on the tree's flush lock. WT_SYNC_CLOSE and
 * WT_SYNC_DISCARD are illegal here (handled elsewhere).
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_snap_min;
	uint32_t flags;

	conn = S2C(session);
	btree = S2BT(session);
	walk = NULL;
	txn = &session->txn;
	/* Remember whether a snapshot was already active on entry. */
	saved_snap_min = WT_SESSION_TXN_STATE(session)->snap_min;
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		/* Re-check under the lock: another thread may have flushed. */
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up. We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it. Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write phase leaves started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because (a)
		 * the metadata shouldn't be that big, and (b) if we do ever
		 * -- NOTE(review): the original comment is truncated here;
		 * the remainder of the rationale is unverified.
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time. We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable. Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait for
		 * any problematic eviction or page splits to complete.
		 */
		WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE);

		/*
		 * Acquiring and immediately releasing exclusive access drains
		 * in-flight eviction of this file before the flag takes hold.
		 */
		WT_ERR(__wt_evict_file_exclusive_on(session));
		__wt_evict_file_exclusive_off(session);

		WT_PUBLISH(btree->checkpointing, WT_CKPT_RUNNING);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(walk->page))
				continue;

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty. It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;
			mod = page->modify;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 *
			 * Mark the tree dirty: the checkpoint marked it clean
			 * and we can't skip future checkpoints until this page
			 * is written.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
			    WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) {
				__wt_page_modify_set(session, page);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}
			WT_ERR(__wt_reconcile(session, walk, NULL, 0));
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
	WT_ILLEGAL_VALUE_ERR(session);
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF_MS(end, start)));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_snap_min == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	if (btree->checkpointing != WT_CKPT_OFF) {
		/*
		 * Update the checkpoint generation for this handle so visible
		 * updates newer than the checkpoint can be evicted.
		 *
		 * This has to be published before eviction is enabled again,
		 * so that eviction knows that the checkpoint has completed.
		 */
		WT_PUBLISH(btree->checkpoint_gen,
		    conn->txn_global.checkpoint_gen);
		WT_STAT_FAST_DATA_SET(session,
		    btree_checkpoint_generation, btree->checkpoint_gen);

		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = WT_CKPT_OFF;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress. Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file). Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES &&
	    F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, true));

	return (ret);
}
/* * __verify_tree -- * Verify a tree, recursively descending through it in depth-first fashion. * The page argument was physically verified (so we know it's correctly formed), * and the in-memory version built. Our job is to check logical relationships * in the page and in the tree. */ static int __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs) { WT_BM *bm; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_DECL_RET; WT_PAGE *page; WT_REF *child_ref; uint64_t recno; uint32_t entry, i; bool found; bm = S2BT(session)->bm; page = ref->page; unpack = &_unpack; WT_CLEAR(*unpack); /* -Wuninitialized */ WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s", __wt_page_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type))); /* Optionally dump the address. */ if (vs->dump_address) WT_RET(__wt_msg(session, "%s %s", __wt_page_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type))); /* Track the shape of the tree. */ if (WT_PAGE_IS_INTERNAL(page)) ++vs->depth_internal[ WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)]; else ++vs->depth_leaf[ WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)]; /* * The page's physical structure was verified when it was read into * memory by the read server thread, and then the in-memory version * of the page was built. Now we make sure the page and tree are * logically consistent. * * !!! 
* The problem: (1) the read server has to build the in-memory version * of the page because the read server is the thread that flags when * any thread can access the page in the tree; (2) we can't build the * in-memory version of the page until the physical structure is known * to be OK, so the read server has to verify at least the physical * structure of the page; (3) doing complete page verification requires * reading additional pages (for example, overflow keys imply reading * overflow pages in order to test the key's order in the page); (4) * the read server cannot read additional pages because it will hang * waiting on itself. For this reason, we split page verification * into a physical verification, which allows the in-memory version * of the page to be built, and then a subsequent logical verification * which happens here. * * Report progress occasionally. */ #define WT_VERIFY_PROGRESS_INTERVAL 100 if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0) WT_RET(__wt_progress(session, NULL, vs->fcnt)); #ifdef HAVE_DIAGNOSTIC /* Optionally dump the blocks or page in debugging mode. */ if (vs->dump_blocks) WT_RET(__wt_debug_disk(session, page->dsk, NULL)); if (vs->dump_pages) WT_RET(__wt_debug_page(session, page, NULL)); #endif /* * Column-store key order checks: check the page's record number and * then update the total record count. 
*/ switch (page->type) { case WT_PAGE_COL_FIX: recno = page->pg_fix_recno; goto recno_chk; case WT_PAGE_COL_INT: recno = page->pg_intl_recno; goto recno_chk; case WT_PAGE_COL_VAR: recno = page->pg_var_recno; recno_chk: if (recno != vs->record_total + 1) WT_RET_MSG(session, WT_ERROR, "page at %s has a starting record of %" PRIu64 " when the expected starting record is %" PRIu64, __wt_page_addr_string(session, ref, vs->tmp1), recno, vs->record_total + 1); break; } switch (page->type) { case WT_PAGE_COL_FIX: vs->record_total += page->pg_fix_entries; break; case WT_PAGE_COL_VAR: recno = 0; WT_COL_FOREACH(page, cip, i) if ((cell = WT_COL_PTR(page, cip)) == NULL) ++recno; else { __wt_cell_unpack(cell, unpack); recno += __wt_cell_rle(unpack); } vs->record_total += recno; break; } /* * Row-store leaf page key order check: it's a depth-first traversal, * the first key on this page should be larger than any key previously * seen. */ switch (page->type) { case WT_PAGE_ROW_LEAF: WT_RET(__verify_row_leaf_key_order(session, ref, vs)); break; } /* If it's not the root page, unpack the parent cell. */ if (!__wt_ref_is_root(ref)) { __wt_cell_unpack(ref->addr, unpack); /* Compare the parent cell against the page type. */ switch (page->type) { case WT_PAGE_COL_FIX: if (unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; case WT_PAGE_COL_VAR: if (unpack->raw != WT_CELL_ADDR_LEAF && unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; case WT_PAGE_ROW_LEAF: if (unpack->raw != WT_CELL_ADDR_DEL && unpack->raw != WT_CELL_ADDR_LEAF && unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: if (unpack->raw != WT_CELL_ADDR_INT) celltype_err: WT_RET_MSG(session, WT_ERROR, "page at %s, of type %s, is referenced in " "its parent by a cell of type %s", __wt_page_addr_string( session, ref, vs->tmp1), __wt_page_type_string(page->type), __wt_cell_type_string(unpack->raw)); break; } } /* * Check overflow pages. 
We check overflow cells separately from other * tests that walk the page as it's simpler, and I don't care much how * fast table verify runs. */ switch (page->type) { case WT_PAGE_COL_VAR: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: WT_RET(__verify_overflow_cell(session, ref, &found, vs)); if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT) break; /* * Object if a leaf-no-overflow address cell references a page * with overflow keys, but don't object if a leaf address cell * references a page without overflow keys. Reconciliation * doesn't guarantee every leaf page without overflow items will * be a leaf-no-overflow type. */ if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO) WT_RET_MSG(session, WT_ERROR, "page at %s, of type %s and referenced in its " "parent by a cell of type %s, contains overflow " "items", __wt_page_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type), __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO)); break; } /* Check tree connections and recursively descend the tree. */ switch (page->type) { case WT_PAGE_COL_INT: /* For each entry in an internal page, verify the subtree. */ entry = 0; WT_INTL_FOREACH_BEGIN(session, page, child_ref) { /* * It's a depth-first traversal: this entry's starting * record number should be 1 more than the total records * reviewed to this point. */ ++entry; if (child_ref->key.recno != vs->record_total + 1) { WT_RET_MSG(session, WT_ERROR, "the starting record number in entry %" PRIu32 " of the column internal page at " "%s is %" PRIu64 " and the expected " "starting record number is %" PRIu64, entry, __wt_page_addr_string( session, child_ref, vs->tmp1), child_ref->key.recno, vs->record_total + 1); } /* Verify the subtree. 
*/ ++vs->depth; WT_RET(__wt_page_in(session, child_ref, 0)); ret = __verify_tree(session, child_ref, vs); WT_TRET(__wt_page_release(session, child_ref, 0)); --vs->depth; WT_RET(ret); __wt_cell_unpack(child_ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); } WT_INTL_FOREACH_END; break; case WT_PAGE_ROW_INT: /* For each entry in an internal page, verify the subtree. */ entry = 0; WT_INTL_FOREACH_BEGIN(session, page, child_ref) { /* * It's a depth-first traversal: this entry's starting * key should be larger than the largest key previously * reviewed. * * The 0th key of any internal page is magic, and we * can't test against it. */ ++entry; if (entry != 1) WT_RET(__verify_row_int_key_order( session, page, child_ref, entry, vs)); /* Verify the subtree. */ ++vs->depth; WT_RET(__wt_page_in(session, child_ref, 0)); ret = __verify_tree(session, child_ref, vs); WT_TRET(__wt_page_release(session, child_ref, 0)); --vs->depth; WT_RET(ret); __wt_cell_unpack(child_ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); } WT_INTL_FOREACH_END;
/*
 * __wt_evict_file --
 *	Discard pages for a specific file.
 *
 * syncop selects the behavior: WT_SYNC_CLOSE reconciles dirty pages and
 * evicts them; WT_SYNC_DISCARD drops pages directly (cleaning dirty pages
 * first for dead handles). Requires, and takes, exclusive eviction access
 * to the file for the duration.
 */
int
__wt_evict_file(WT_SESSION_IMPL *session, int syncop)
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *next_ref, *ref;
	bool evict_reset;

	/*
	 * We need exclusive access to the file -- disable ordinary eviction
	 * and drain any blocks already queued.
	 */
	WT_RET(__wt_evict_file_exclusive_on(session, &evict_reset));

	/* Make sure the oldest transaction ID is up-to-date. */
	__wt_txn_update_oldest(session, true);

	/* Walk the tree, discarding pages. */
	next_ref = NULL;
	WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
	    WT_READ_CACHE | WT_READ_NO_EVICT));
	while ((ref = next_ref) != NULL) {
		page = ref->page;

		/*
		 * Eviction can fail when a page in the evicted page's subtree
		 * switches state. For example, if we don't evict a page marked
		 * empty, because we expect it to be merged into its parent, it
		 * might no longer be empty after it's reconciled, in which case
		 * eviction of its parent would fail. We can either walk the
		 * tree multiple times (until it's finally empty), or reconcile
		 * each page to get it to its final state before considering if
		 * it's an eviction target or will be merged into its parent.
		 *
		 * Don't limit this test to any particular page type, that tends
		 * to introduce bugs when the reconciliation of other page types
		 * changes, and there's no advantage to doing so.
		 *
		 * Eviction can also fail because an update cannot be written.
		 * If sessions have disjoint sets of files open, updates in a
		 * no-longer-referenced file may not yet be globally visible,
		 * and the write will fail with EBUSY. Our caller handles that
		 * error, retrying later.
		 */
		if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
			WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING));

		/*
		 * We can't evict the page just returned to us (it marks our
		 * place in the tree), so move the walk to one page ahead of
		 * the page being evicted. Note, we reconciled the returned
		 * page first: if reconciliation of that page were to change
		 * the shape of the tree, and we did the next walk call before
		 * the reconciliation, the next walk call could miss a page in
		 * the tree.
		 */
		WT_ERR(__wt_tree_walk(session, &next_ref, NULL,
		    WT_READ_CACHE | WT_READ_NO_EVICT));

		switch (syncop) {
		case WT_SYNC_CLOSE:
			/*
			 * Evict the page.
			 */
			WT_ERR(__wt_evict(session, ref, 1));
			break;
		case WT_SYNC_DISCARD:
			/*
			 * Dead handles may reference dirty pages; clean the
			 * page, both to keep statistics correct, and to let
			 * the page-discard function assert no dirty page is
			 * ever discarded.
			 */
			if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
				__wt_page_modify_clear(session, page);

			WT_ASSERT(session,
			    F_ISSET(session->dhandle, WT_DHANDLE_DEAD) ||
			    __wt_page_can_evict(session, ref, false, NULL));
			__wt_evict_page_clean_update(session, ref, 1);
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}
	}

	if (0) {
err:		/* On error, clear any left-over tree walk. */
		if (next_ref != NULL)
			WT_TRET(__wt_page_release(
			    session, next_ref, WT_READ_NO_EVICT));
	}

	if (evict_reset)
		__wt_evict_file_exclusive_off(session);

	return (ret);
}
/*
 * __wt_compact --
 *	Compact a file.
 *
 * Asks the block manager whether compaction is worthwhile, then, holding
 * the tree's flush lock, walks the tree and dirties any page that
 * __compact_rewrite says should be rewritten so later reconciliation moves
 * it. Caller must hold the schema lock (asserted below).
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_REF *ref;
	bool block_manager_begin, skip;

	WT_UNUSED(cfg);

	btree = S2BT(session);
	bm = btree->bm;
	ref = NULL;
	block_manager_begin = false;

	WT_STAT_FAST_DATA_INCR(session, session_compact);

	/*
	 * Check if compaction might be useful -- the API layer will quit trying
	 * to compact the data source if we make no progress, set a flag if the
	 * block layer thinks compaction is possible.
	 */
	WT_RET(bm->compact_skip(bm, session, &skip));
	if (skip)
		return (0);

	/*
	 * Reviewing in-memory pages requires looking at page reconciliation
	 * results, because we care about where the page is stored now, not
	 * where the page was stored when we first read it into the cache.
	 * We need to ensure we don't race with page reconciliation as it's
	 * writing the page modify information.
	 *
	 * There are three ways we call reconciliation: checkpoints, threads
	 * writing leaf pages (usually in preparation for a checkpoint or if
	 * closing a file), and eviction.
	 *
	 * We're holding the schema lock which serializes with checkpoints.
	 */
	WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA));

	/*
	 * Get the tree handle's flush lock which blocks threads writing leaf
	 * pages.
	 */
	__wt_spin_lock(session, &btree->flush_lock);

	/* Start compaction. */
	WT_ERR(bm->compact_start(bm, session));
	block_manager_begin = true;

	/* Walk the tree reviewing pages to see if they should be re-written. */
	for (;;) {
		/*
		 * Pages read for compaction aren't "useful"; don't update the
		 * read generation of pages already in memory, and if a page is
		 * read, set its generation to a low value so it is evicted
		 * quickly.
		 */
		WT_ERR(__wt_tree_walk(session, &ref,
		    WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
		if (ref == NULL)
			break;

		WT_ERR(__compact_rewrite(session, ref, &skip));
		if (skip)
			continue;

		/* Record that compaction did (or will do) something. */
		session->compact_state = WT_COMPACT_SUCCESS;

		/* Rewrite the page: mark the page and tree dirty. */
		WT_ERR(__wt_page_modify_init(session, ref->page));
		__wt_page_modify_set(session, ref->page);

		WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite);
	}

err:	/* Release any page held by the interrupted walk. */
	if (ref != NULL)
		WT_TRET(__wt_page_release(session, ref, 0));

	if (block_manager_begin)
		WT_TRET(bm->compact_end(bm, session));

	/* Unblock threads writing leaf pages. */
	__wt_spin_unlock(session, &btree->flush_lock);

	return (ret);
}
/*
 * __tree_walk_internal --
 *	Move to the next/previous page in the tree.
 */
static inline int
__tree_walk_internal(WT_SESSION_IMPL *session,
    WT_REF **refp, uint64_t *walkcntp, uint64_t *skipleafcntp, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE_INDEX *pindex;
	WT_REF *couple, *couple_orig, *ref;
	bool empty_internal, prev, skip;
	uint32_t slot;

	btree = S2BT(session);
	empty_internal = false;

	/*
	 * Tree walks are special: they look inside page structures that splits
	 * may want to free. Publish that the tree is active during this
	 * window.
	 */
	WT_ENTER_PAGE_INDEX(session);

	/* Walk should never instantiate deleted pages. */
	LF_SET(WT_READ_NO_EMPTY);

	/*
	 * !!!
	 * Fast-truncate currently only works on row-store trees.
	 */
	if (btree->type != BTREE_ROW)
		LF_CLR(WT_READ_TRUNCATE);

	prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;

	/*
	 * There are multiple reasons and approaches to walking the in-memory
	 * tree:
	 *
	 * (1) finding pages to evict (the eviction server);
	 * (2) writing just dirty leaves or internal nodes (checkpoint);
	 * (3) discarding pages (close);
	 * (4) truncating pages in a range (fast truncate);
	 * (5) skipping pages based on outside information (compaction);
	 * (6) cursor scans (applications).
	 *
	 * Except for cursor scans and compaction, the walk is limited to the
	 * cache, no pages are read.  In all cases, hazard pointers protect the
	 * walked pages from eviction.
	 *
	 * Walks use hazard-pointer coupling through the tree and that's OK
	 * (hazard pointers can't deadlock, so there's none of the usual
	 * problems found when logically locking up a btree).  If the eviction
	 * thread tries to evict the active page, it fails because of our
	 * hazard pointer.  If eviction tries to evict our parent, that fails
	 * because the parent has a child page that can't be discarded.  We do
	 * play one game: don't couple up to our parent and then back down to a
	 * new leaf, couple to the next page to which we're descending, it
	 * saves a hazard-pointer swap for each cursor page movement.
	 *
	 * !!!
	 * NOTE: we depend on the fact it's OK to release a page we don't hold,
	 * that is, it's OK to release couple when couple is set to NULL.
	 *
	 * Take a copy of any held page and clear the return value.  Remember
	 * the hazard pointer we're currently holding.
	 *
	 * We may be passed a pointer to btree->evict_page that we are clearing
	 * here.  We check when discarding pages that we're not discarding that
	 * page, so this clear must be done before the page is released.
	 */
	couple = couple_orig = ref = *refp;
	*refp = NULL;

	/* If no page is active, begin a walk from the start of the tree. */
	if (ref == NULL) {
		ref = &btree->root;
		if (ref->page == NULL)
			goto done;
		goto descend;
	}

	/*
	 * If the active page was the root, we've reached the walk's end.
	 * Release any hazard-pointer we're holding.
	 */
	if (__wt_ref_is_root(ref)) {
		WT_ERR(__wt_page_release(session, couple, flags));
		goto done;
	}

	/* Figure out the current slot in the WT_REF array. */
	__ref_index_slot(session, ref, &pindex, &slot);

	for (;;) {
		/*
		 * If we're at the last/first slot on the internal page, return
		 * it in post-order traversal.  Otherwise move to the next/prev
		 * slot and left/right-most element in that subtree.
		 */
		while ((prev && slot == 0) ||
		    (!prev && slot == pindex->entries - 1)) {
			/* Ascend to the parent. */
			__page_ascend(session, &ref, &pindex, &slot);

			/*
			 * If we got all the way through an internal page and
			 * all of the child pages were deleted, mark it for
			 * eviction.
			 */
			if (empty_internal && pindex->entries > 1) {
				__wt_page_evict_soon(ref->page);
				empty_internal = false;
			}

			/*
			 * If at the root and returning internal pages, return
			 * the root page, otherwise we're done.  Regardless, no
			 * hazard pointer is required, release the one we hold.
			 */
			if (__wt_ref_is_root(ref)) {
				WT_ERR(__wt_page_release(
				    session, couple, flags));
				if (!LF_ISSET(WT_READ_SKIP_INTL))
					*refp = ref;
				goto done;
			}

			/*
			 * Optionally return internal pages.  Swap our previous
			 * hazard pointer for the page we'll return.  We don't
			 * handle restart or not-found returns, it would require
			 * additional complexity and is not a possible return:
			 * we're moving to the parent of the current child page,
			 * the parent can't have been evicted.
			 */
			if (!LF_ISSET(WT_READ_SKIP_INTL)) {
				WT_ERR(__wt_page_swap(
				    session, couple, ref, flags));
				*refp = ref;
				goto done;
			}
		}
		if (prev)
			--slot;
		else
			++slot;

		if (walkcntp != NULL)
			++*walkcntp;

		for (;;) {
			/*
			 * Move to the next slot, and set the reference hint if
			 * it's wrong (used when we continue the walk). We don't
			 * update those hints when splitting, so it's common for
			 * them to be incorrect in some workloads.
			 */
			ref = pindex->index[slot];
			if (ref->pindex_hint != slot)
				ref->pindex_hint = slot;

			/*
			 * If we see any child states other than deleted, the
			 * page isn't empty.
			 */
			if (ref->state != WT_REF_DELETED &&
			    !LF_ISSET(WT_READ_TRUNCATE))
				empty_internal = false;

			if (LF_ISSET(WT_READ_CACHE)) {
				/*
				 * Only look at unlocked pages in memory:
				 * fast-path some common cases.
				 */
				if (LF_ISSET(WT_READ_NO_WAIT) &&
				    ref->state != WT_REF_MEM)
					break;
			} else if (LF_ISSET(WT_READ_TRUNCATE)) {
				/*
				 * Avoid pulling a deleted page back in to try
				 * to delete it again.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
				/*
				 * If deleting a range, try to delete the page
				 * without instantiating it.
				 */
				WT_ERR(__wt_delete_page(session, ref, &skip));
				if (skip)
					break;
				empty_internal = false;
			} else if (LF_ISSET(WT_READ_COMPACT)) {
				/*
				 * Skip deleted pages, rewriting them doesn't
				 * seem useful.
				 */
				if (ref->state == WT_REF_DELETED)
					break;

				/*
				 * If the page is in-memory, we want to look at
				 * it (it may have been modified and written,
				 * and the current location is the interesting
				 * one in terms of compaction, not the original
				 * location).  If the page isn't in-memory, test
				 * if the page will help with compaction, don't
				 * read it if we don't have to.
				 */
				if (ref->state == WT_REF_DISK) {
					WT_ERR(__wt_compact_page_skip(
					    session, ref, &skip));
					if (skip)
						break;
				}
			} else {
				/*
				 * Try to skip deleted pages visible to us.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
			}

			/*
			 * Optionally skip leaf pages: skip all leaf pages if
			 * WT_READ_SKIP_LEAF is set, when the skip-leaf-count
			 * variable is non-zero, skip some count of leaf pages.
			 * If this page is disk-based, crack the cell to figure
			 * out it's a leaf page without reading it.
			 *
			 * If skipping some number of leaf pages, decrement the
			 * count of pages to zero, and then take the next leaf
			 * page we can.  Be cautious around the page decrement,
			 * if for some reason don't take this particular page,
			 * we can take the next one, and, there are additional
			 * tests/decrements when we're about to return a leaf
			 * page.
			 */
			if (skipleafcntp != NULL ||
			    LF_ISSET(WT_READ_SKIP_LEAF))
				if (__ref_is_leaf(ref)) {
					if (LF_ISSET(WT_READ_SKIP_LEAF))
						break;
					/* NULL-safe: only reached when
					 * skipleafcntp != NULL (the SKIP_LEAF
					 * case breaks above). */
					if (*skipleafcntp > 0) {
						--*skipleafcntp;
						break;
					}
				}

			ret = __wt_page_swap(session, couple, ref,
			    WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);

			/*
			 * Not-found is an expected return when only walking
			 * in-cache pages, or if we see a deleted page.
			 */
			if (ret == WT_NOTFOUND) {
				ret = 0;
				break;
			}

			/*
			 * The page we're moving to might have split, in which
			 * case move to the last position we held.
			 */
			if (ret == WT_RESTART) {
				ret = 0;

				/*
				 * If a new walk that never coupled from the
				 * root to a new saved position in the tree,
				 * restart the walk.
				 */
				if (couple == &btree->root) {
					ref = &btree->root;
					if (ref->page == NULL)
						goto done;
					goto descend;
				}

				/*
				 * If restarting from some original position,
				 * repeat the increment or decrement we made at
				 * that time.  Otherwise, couple is an internal
				 * page we've acquired after moving from that
				 * starting position and we can treat it as a
				 * new page.  This works because we never
				 * acquire a hazard pointer on a leaf page
				 * we're not going to return to our caller,
				 * this will quit working if that ever changes.
				 */
				WT_ASSERT(session, couple == couple_orig ||
				    WT_PAGE_IS_INTERNAL(couple->page));
				ref = couple;
				__ref_index_slot(session, ref, &pindex, &slot);
				if (couple == couple_orig)
					break;
			}
			WT_ERR(ret);

			/*
			 * A new page: configure for traversal of any internal
			 * page's children, else return the leaf page.
			 */
			if (WT_PAGE_IS_INTERNAL(ref->page)) {
descend:			couple = ref;
				empty_internal = true;
				__page_descend(
				    session, ref->page, &pindex, &slot, prev);
			} else {
				/*
				 * Optionally skip leaf pages, the second half.
				 * We didn't have an on-page cell to figure out
				 * if it was a leaf page, we had to acquire the
				 * hazard pointer and look at the page.
				 */
				if (skipleafcntp != NULL ||
				    LF_ISSET(WT_READ_SKIP_LEAF)) {
					couple = ref;
					if (LF_ISSET(WT_READ_SKIP_LEAF))
						break;
					if (*skipleafcntp > 0) {
						--*skipleafcntp;
						break;
					}
				}

				*refp = ref;
				goto done;
			}
		}
	}

done:
err:	WT_LEAVE_PAGE_INDEX(session);
	return (ret);
}
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop)
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *prev, *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, internal_pages, leaf_bytes, leaf_pages;
	uint64_t oldest_id, saved_pinned_id, time_start, time_stop;
	uint32_t flags;
	bool timer, tried_eviction;

	conn = S2C(session);
	btree = S2BT(session);
	prev = walk = NULL;
	txn = &session->txn;
	tried_eviction = false;
	time_start = time_stop = 0;

	/* Only visit pages in cache and don't bump page read generations. */
	flags = WT_READ_CACHE | WT_READ_NO_GEN;

	/*
	 * Skip all deleted pages.  For a page to be marked deleted, it must
	 * have been evicted from cache and marked clean.  Checkpoint should
	 * never instantiate deleted pages: if a truncate is not visible to the
	 * checkpoint, the on-disk version is correct.  If the truncate is
	 * visible, we skip over the child page when writing its parent.  We
	 * check whether a truncate is visible in the checkpoint as part of
	 * reconciling internal pages (specifically in __rec_child_modify).
	 */
	LF_SET(WT_READ_DELETED_SKIP);

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	/* Remembered so we can tell at the end whether we created the
	 * snapshot ourselves and must release it. */
	saved_pinned_id = WT_SESSION_TXN_STATE(session)->pinned_id;
	timer = WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT);
	if (timer)
		time_start = __wt_clock(session);

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		/* Re-check under the lock: another thread may have flushed. */
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		/*
		 * Save the oldest transaction ID we need to keep around.
		 * Otherwise, in a busy system, we could be updating pages so
		 * fast that write leaves never catches up.  We deliberately
		 * have no transaction running at this point that would keep
		 * the oldest ID from moving forwards as we walk the tree.
		 */
		oldest_id = __wt_txn_oldest_id(session);

		LF_SET(WT_READ_NO_WAIT | WT_READ_SKIP_INTL);
		for (;;) {
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write hot pages (defined as pages that have
			 * been updated since the write phase leaves started):
			 * checkpoint will have to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    WT_TXNID_LT(page->modify->update_txn, oldest_id)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session,
				    walk, NULL, WT_REC_CHECKPOINT, NULL));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * If we are flushing a file at read-committed isolation, which
		 * is of particular interest for flushing the metadata to make
		 * a schema-changing operation durable, get a transactional
		 * snapshot now.
		 *
		 * All changes committed up to this point should be included.
		 * We don't update the snapshot in between pages because the
		 * metadata shouldn't have many pages.  Instead, read-committed
		 * isolation ensures that all metadata updates completed before
		 * the checkpoint are included.
		 */
		if (txn->isolation == WT_ISO_READ_COMMITTED)
			__wt_txn_get_snapshot(session);

		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * In the final checkpoint pass, child pages cannot be evicted
		 * from underneath internal pages nor can underlying blocks be
		 * freed until the checkpoint's block lists are stable.  Also,
		 * we cannot split child pages into parents unless we know the
		 * final pass will write a consistent view of that namespace.
		 * Set the checkpointing flag to block such actions and wait
		 * for any problematic eviction or page splits to complete.
		 */
		WT_ASSERT(session, btree->syncing == WT_BTREE_SYNC_OFF &&
		    btree->sync_session == NULL);
		btree->sync_session = session;
		btree->syncing = WT_BTREE_SYNC_WAIT;
		/* Drain the eviction generation so in-flight eviction passes
		 * observe the WAIT state before we start running. */
		(void)__wt_gen_next_drain(session, WT_GEN_EVICT);
		btree->syncing = WT_BTREE_SYNC_RUNNING;

		/* Write all dirty in-cache pages. */
		LF_SET(WT_READ_NO_EVICT);

		/* Read pages with lookaside entries and evict them asap. */
		LF_SET(WT_READ_LOOKASIDE | WT_READ_WONT_NEED);

		for (;;) {
			/* Duplicate the walk point before advancing, so we
			 * can back up to it if we evict the current page. */
			WT_ERR(__sync_dup_walk(session, walk, flags, &prev));
			WT_ERR(__wt_tree_walk(session, &walk, flags));
			if (walk == NULL)
				break;

			/*
			 * Skip clean pages, but need to make sure maximum
			 * transaction ID is always updated.
			 */
			if (!__wt_page_is_modified(walk->page)) {
				if (((mod = walk->page->modify) != NULL) &&
				    mod->rec_max_txn > btree->rec_max_txn)
					btree->rec_max_txn = mod->rec_max_txn;
				if (mod != NULL && btree->rec_max_timestamp <
				    mod->rec_max_timestamp)
					btree->rec_max_timestamp =
					    mod->rec_max_timestamp;
				continue;
			}

			/*
			 * Take a local reference to the page modify structure
			 * now that we know the page is dirty.  It needs to be
			 * done in this order otherwise the page modify
			 * structure could have been created between taking the
			 * reference and checking modified.
			 */
			page = walk->page;

			/*
			 * Write dirty pages, if we can't skip them.  If we skip
			 * a page, mark the tree dirty.  The checkpoint marked
			 * it clean and we can't skip future checkpoints until
			 * this page is written.
			 */
			if (__sync_checkpoint_can_skip(session, page)) {
				__wt_tree_modify_set(session);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}

			/*
			 * If the page was pulled into cache by our read, try
			 * to evict it now.
			 *
			 * For eviction to have a chance, we first need to move
			 * the walk point to the next page checkpoint will
			 * visit.  We want to avoid this code being too special
			 * purpose, so try to reuse the ordinary eviction path.
			 *
			 * Regardless of whether eviction succeeds or fails,
			 * the walk continues from the previous location.  We
			 * remember whether we tried eviction, and don't try
			 * again.  Even if eviction fails (the page may stay in
			 * cache clean but with history that cannot be
			 * discarded), that is not wasted effort because
			 * checkpoint doesn't need to write the page again.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    page->read_gen == WT_READGEN_WONT_NEED &&
			    !tried_eviction) {
				WT_ERR_BUSY_OK(
				    __wt_page_release_evict(session, walk));
				walk = prev;
				prev = NULL;
				tried_eviction = true;
				continue;
			}
			tried_eviction = false;

			WT_ERR(__wt_reconcile(
			    session, walk, NULL, WT_REC_CHECKPOINT, NULL));

			/*
			 * Update checkpoint IO tracking data if configured
			 * to log verbose progress messages.
			 */
			if (conn->ckpt_timer_start.tv_sec > 0) {
				conn->ckpt_write_bytes +=
				    page->memory_footprint;
				++conn->ckpt_write_pages;

				/* Periodically log checkpoint progress. */
				if (conn->ckpt_write_pages % 5000 == 0)
					__wt_checkpoint_progress(
					    session, false);
			}
		}
		break;
	case WT_SYNC_CLOSE:
	case WT_SYNC_DISCARD:
		/* Close/discard are handled elsewhere; reject them here. */
		WT_ERR(__wt_illegal_value(session, syncop));
		break;
	}

	if (timer) {
		time_stop = __wt_clock(session);
		__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote: %" PRIu64
		    " leaf pages (%" PRIu64 "B), %" PRIu64
		    " internal pages (%" PRIu64 "B), and took %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_pages, leaf_bytes, internal_pages, internal_bytes,
		    WT_CLOCKDIFF_MS(time_stop, time_start));
	}

err:	/* On error, clear any left-over tree walk. */
	WT_TRET(__wt_page_release(session, walk, flags));
	WT_TRET(__wt_page_release(session, prev, flags));

	/*
	 * If we got a snapshot in order to write pages, and there was no
	 * snapshot active when we started, release it.
	 */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    saved_pinned_id == WT_TXN_NONE)
		__wt_txn_release_snapshot(session);

	/* Clear the checkpoint flag. */
	btree->syncing = WT_BTREE_SYNC_OFF;
	btree->sync_session = NULL;

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES &&
	    F_ISSET(conn, WT_CONN_CKPT_SYNC))
		WT_RET(btree->bm->sync(btree->bm, session, false));

	return (ret);
}
/*
 * __wt_compact --
 *	Compact a file.
 */
int
__wt_compact(WT_SESSION_IMPL* session, const char* cfg[])
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_REF *ref;
	int block_manager_begin, evict_reset, skip;

	WT_UNUSED(cfg);

	conn = S2C(session);
	btree = S2BT(session);
	bm = btree->bm;
	ref = NULL;
	block_manager_begin = 0;

	WT_STAT_FAST_DATA_INCR(session, session_compact);

	/*
	 * Ask the block manager whether compacting the underlying blocks
	 * could help; if not, return immediately.
	 */
	WT_RET(bm->compact_skip(bm, session, &skip));
	if (skip)
		return 0;

	/*
	 * Reviewing in-memory pages requires looking at page reconciliation
	 * results, because we care about where the page is stored now, not
	 * where the page was stored when we first read it into the cache.
	 * We need to ensure we don't race with page reconciliation as it's
	 * writing the page modify information.
	 *
	 * There are three ways we call reconciliation: checkpoints, threads
	 * writing leaf pages (usually in preparation for a checkpoint or if
	 * closing a file), and eviction.
	 *
	 * We're holding the schema lock which serializes with checkpoints.
	 */
	WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));

	/*
	 * Take the btree flush lock so other threads can't flush pages of
	 * this file while it's being compacted.
	 */
	__wt_spin_lock(session, &btree->flush_lock);

	conn->compact_in_memory_pass = 1;

	/*
	 * Briefly get exclusive access to the file so any in-progress
	 * eviction of its pages completes before we start.
	 */
	WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
	if (evict_reset)
		__wt_evict_file_exclusive_off(session);

	/* Start the block-manager compaction pass. */
	WT_ERR(bm->compact_start(bm, session));
	block_manager_begin = 1;

	session->compaction = 1;
	for (;;) {
		WT_ERR(__wt_tree_walk(session, &ref, NULL,
		    WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
		if (ref == NULL)
			break;

		/* Decide whether this page is worth rewriting. */
		WT_ERR(__compact_rewrite(session, ref, &skip));
		if (skip)
			continue;

		/*
		 * Mark pages that need compacting dirty; eviction or
		 * reconciliation will write them back to a new location.
		 */
		WT_ERR(__wt_page_modify_init(session, ref->page));
		__wt_page_modify_set(session, ref->page);

		WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite);
	}

err:	if (ref != NULL)
		WT_TRET(__wt_page_release(session, ref, 0));

	/* End the block-manager compaction pass. */
	if (block_manager_begin)
		WT_TRET(bm->compact_end(bm, session));

	/*
	 * Unlock will be a release barrier, use it to update the compaction
	 * status for reconciliation.
	 */
	conn->compact_in_memory_pass = 0;
	__wt_spin_unlock(session, &btree->flush_lock);

	return ret;
}
/*
 * __wt_page_in --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(
    WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_PAGE *page;
	WT_TXN *txn;
	int busy, oldgen;

	txn = &session->txn;

	/* Retry until we return with a hazard pointer or fail hard. */
	for (oldgen = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			/*
			 * The page isn't in memory, attempt to read it.
			 *
			 * First make sure there is space in the cache.
			 */
			WT_RET(__wt_cache_full_check(session, 0));
			WT_RET(__wt_cache_read(session, parent, ref));
			/* Remember whether this session wants reads evicted
			 * quickly, applied once the page is in memory. */
			oldgen = F_ISSET(session, WT_SESSION_NO_CACHE) ? 1 : 0;
			continue;
		case WT_REF_LOCKED:
		case WT_REF_READING:
			/*
			 * The page is being read or considered for eviction --
			 * wait for that to be resolved.
			 */
			break;
		case WT_REF_EVICT_WALK:
		case WT_REF_MEM:
			/*
			 * The page is in memory: get a hazard pointer, update
			 * the page's LRU and return.  The expected reason we
			 * can't get a hazard pointer is because the page is
			 * being evicted; yield and try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy)
				break;

			page = ref->page;
			WT_ASSERT(session,
			    page != NULL && !WT_PAGE_IS_ROOT(page));

			/*
			 * Make sure the page isn't too big.  Only do this
			 * check once per transaction: it is not a common case,
			 * and we don't want to get stuck if it isn't possible
			 * to evict the page.
			 */
			if (!F_ISSET(txn, TXN_FORCE_EVICT) &&
			    __wt_eviction_page_force(session, page)) {
				F_SET(txn, TXN_FORCE_EVICT);
				page->read_gen = WT_READ_GEN_OLDEST;
				WT_RET(__wt_page_release(session, page));
				break;
			}

			/* Check if we need an autocommit transaction. */
			if ((ret = __wt_txn_autocommit_check(session)) != 0) {
				/* Drop the hazard pointer before failing. */
				WT_TRET(__wt_hazard_clear(session, page));
				return (ret);
			}

			/*
			 * If this page has ever been considered for eviction,
			 * and its generation is aging, update it.
			 */
			if (page->read_gen != WT_READ_GEN_NOTSET &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_set(session);

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 */
			if (oldgen && page->read_gen == WT_READ_GEN_NOTSET)
				page->read_gen = WT_READ_GEN_OLDEST;

			return (0);
		WT_ILLEGAL_VALUE(session);
		}

		/* We failed to get the page -- yield before retrying. */
		__wt_yield();
	}
}
/*
 * __wt_page_in --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(
    WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *ref
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_PAGE *page;
	int busy, oldgen;

	/* Retry until we return with a hazard pointer or fail hard. */
	for (oldgen = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			/*
			 * The page isn't in memory, attempt to read it.
			 *
			 * First make sure there is space in the cache.
			 */
			WT_RET(__wt_cache_full_check(session, 0));
			WT_RET(__wt_cache_read(session, parent, ref));
			/* Remember whether this session wants reads evicted
			 * quickly, applied once the page is in memory. */
			oldgen = F_ISSET(session, WT_SESSION_NO_CACHE) ? 1 : 0;
			continue;
		case WT_REF_EVICT_FORCE:
		case WT_REF_LOCKED:
		case WT_REF_READING:
			/*
			 * The page is being read or considered for eviction --
			 * wait for that to be resolved.
			 */
			break;
		case WT_REF_EVICT_WALK:
		case WT_REF_MEM:
			/*
			 * The page is in memory: get a hazard pointer, update
			 * the page's LRU and return.  The expected reason we
			 * can't get a hazard pointer is because the page is
			 * being evicted; yield and try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy)
				break;

			page = ref->page;
			WT_ASSERT(session,
			    page != NULL && !WT_PAGE_IS_ROOT(page));

			/*
			 * Ensure the page doesn't have ancient updates on it.
			 * If it did, reading the page could ignore committed
			 * updates.  This should be extremely unlikely in real
			 * applications, wait for eviction of the page to avoid
			 * the issue.
			 */
			if (page->modify != NULL &&
			    __wt_txn_ancient(session,
			    page->modify->first_id)) {
				page->read_gen = WT_READ_GEN_OLDEST;
				WT_RET(__wt_page_release(session, page));
				break;
			}

			/* Check if we need an autocommit transaction. */
			if ((ret = __wt_txn_autocommit_check(session)) != 0) {
				/* Drop the hazard pointer before failing. */
				WT_TRET(__wt_hazard_clear(session, page));
				return (ret);
			}

			/*
			 * If this page has ever been considered for eviction,
			 * and its generation is aging, update it.
			 */
			if (page->read_gen != WT_READ_GEN_NOTSET &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_set(session);

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 */
			if (oldgen && page->read_gen == WT_READ_GEN_NOTSET)
				page->read_gen = WT_READ_GEN_OLDEST;

			return (0);
		WT_ILLEGAL_VALUE(session);
		}

		/* We failed to get the page -- yield before retrying. */
		__wt_yield();
	}
}
/*
 * __wt_compact --
 *	Compact a file.
 */
int
__wt_compact(WT_SESSION_IMPL *session)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_REF *walk;
	u_int countdown;
	bool skip;

	btree = S2BT(session);
	bm = btree->bm;
	walk = NULL;

	WT_STAT_DATA_INCR(session, session_compact);

	/*
	 * Ask the block manager first: if it doesn't believe compaction can
	 * recover space, there's nothing for us to do.  The API layer stops
	 * trying to compact the data source once we stop making progress.
	 */
	WT_RET(bm->compact_skip(bm, session, &skip));
	if (skip)
		return (0);

	/*
	 * Compaction reads page reconciliation results -- we care about where
	 * a page is stored now, not where it was stored when first read into
	 * cache -- so we must not race with threads writing page modify
	 * information.  Reconciliation is driven by checkpoints and eviction;
	 * holding the tree's flush lock blocks threads writing pages for
	 * checkpoints.
	 */
	__wt_spin_lock(session, &btree->flush_lock);

	/* Review every tree page, rewriting the ones worth moving. */
	for (countdown = 100;;) {
		/* Every 100 pages, check whether we've run out of time. */
		if (countdown-- == 0) {
			WT_ERR(__wt_session_compact_check_timeout(session));
			countdown = 100;
		}

		/*
		 * Pages read for compaction aren't "useful": leave read
		 * generations of in-memory pages alone and give newly read
		 * pages a low generation so they're evicted quickly.
		 */
		WT_ERR(__wt_tree_walk(session, &walk,
		    WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
		if (walk == NULL)
			break;

		WT_ERR(__compact_rewrite(session, walk, &skip));
		if (skip)
			continue;

		session->compact_state = WT_COMPACT_SUCCESS;

		/* Rewrite the page: mark the page and tree dirty. */
		WT_ERR(__wt_page_modify_init(session, walk->page));
		__wt_page_modify_set(session, walk->page);

		WT_STAT_DATA_INCR(session, btree_compact_rewrite);
	}

err:	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, 0));

	/* Unblock threads writing leaf pages. */
	__wt_spin_unlock(session, &btree->flush_lock);

	return (ret);
}
/*
 * __verify_tree --
 *	Verify a tree, recursively descending through it in depth-first fashion.
 * The page argument was physically verified (so we know it's correctly formed),
 * and the in-memory version built.  Our job is to check logical relationships
 * in the page and in the tree.
 *
 * NOTE(review): this function appears to be truncated in this file -- the
 * closing of the final switch statement and of the function body are missing
 * before the next definition begins.  The visible text is preserved as-is.
 */
static int
__verify_tree(WT_SESSION_IMPL *session, WT_PAGE *page, WT_VSTUFF *vs)
{
	WT_BM *bm;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_DECL_RET;
	WT_REF *ref;
	uint64_t recno;
	uint32_t entry, i;
	int found, lno;

	bm = S2BT(session)->bm;
	unpack = &_unpack;

	WT_VERBOSE_RET(session, verify, "%s %s",
	    __wt_page_addr_string(session, vs->tmp1, page),
	    __wt_page_type_string(page->type));
#ifdef HAVE_DIAGNOSTIC
	if (vs->dump_address)
		WT_RET(__wt_msg(session, "%s %s",
		    __wt_page_addr_string(session, vs->tmp1, page),
		    __wt_page_type_string(page->type)));
#endif

	/*
	 * The page's physical structure was verified when it was read into
	 * memory by the read server thread, and then the in-memory version
	 * of the page was built.  Now we make sure the page and tree are
	 * logically consistent.
	 *
	 * !!!
	 * The problem: (1) the read server has to build the in-memory version
	 * of the page because the read server is the thread that flags when
	 * any thread can access the page in the tree; (2) we can't build the
	 * in-memory version of the page until the physical structure is known
	 * to be OK, so the read server has to verify at least the physical
	 * structure of the page; (3) doing complete page verification requires
	 * reading additional pages (for example, overflow keys imply reading
	 * overflow pages in order to test the key's order in the page); (4)
	 * the read server cannot read additional pages because it will hang
	 * waiting on itself.  For this reason, we split page verification
	 * into a physical verification, which allows the in-memory version
	 * of the page to be built, and then a subsequent logical verification
	 * which happens here.
	 *
	 * Report progress every 10 pages.
	 */
	if (++vs->fcnt % 10 == 0)
		WT_RET(__wt_progress(session, NULL, vs->fcnt));

#ifdef HAVE_DIAGNOSTIC
	/* Optionally dump the page in debugging mode. */
	if (vs->dump_blocks && page->dsk != NULL)
		WT_RET(__wt_debug_disk(session, page->dsk, NULL));
	if (vs->dump_pages)
		WT_RET(__wt_debug_page(session, page, NULL));
#endif

	/*
	 * Column-store key order checks: check the page's record number and
	 * then update the total record count.
	 */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		recno = page->u.col_fix.recno;
		goto recno_chk;
	case WT_PAGE_COL_INT:
		recno = page->u.intl.recno;
		goto recno_chk;
	case WT_PAGE_COL_VAR:
		recno = page->u.col_var.recno;
recno_chk:	if (recno != vs->record_total + 1)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s has a starting record of %" PRIu64
			    " when the expected starting record is %" PRIu64,
			    __wt_page_addr_string(session, vs->tmp1, page),
			    recno, vs->record_total + 1);
		break;
	}
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		vs->record_total += page->entries;
		break;
	case WT_PAGE_COL_VAR:
		/* Count records: a NULL cell is a single deleted record,
		 * otherwise use the cell's run-length encoding count. */
		recno = 0;
		WT_COL_FOREACH(page, cip, i)
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				++recno;
			else {
				__wt_cell_unpack(cell, unpack);
				recno += __wt_cell_rle(unpack);
			}
		vs->record_total += recno;
		break;
	}

	/*
	 * Row-store leaf page key order check: it's a depth-first traversal,
	 * the first key on this page should be larger than any key previously
	 * seen.
	 */
	switch (page->type) {
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_row_leaf_key_order(session, page, vs));
		break;
	}

	/*
	 * Check overflow pages.  We check overflow cells separately from other
	 * tests that walk the page as it's simpler, and I don't care much how
	 * fast table verify runs.
	 *
	 * Object if a leaf-no-overflow address cell references a page that has
	 * overflow keys, but don't object if a standard address cell references
	 * a page without overflow keys.  The leaf-no-overflow address cell is
	 * an optimization for trees without few, if any, overflow items, and
	 * may not be set by reconciliation in all possible cases.
	 */
	if (WT_PAGE_IS_ROOT(page))
		lno = 0;
	else {
		__wt_cell_unpack(page->ref->addr, unpack);
		lno = unpack->raw == WT_CELL_ADDR_LNO ? 1 : 0;
	}
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		break;
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_overflow_cell(session, page, &found, vs));
		if (found && lno)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s referenced in its parent by a cell of "
			    "type %s illegally contains overflow items",
			    __wt_page_addr_string(session, vs->tmp1, page),
			    __wt_cell_type_string(WT_CELL_ADDR_LNO));
		break;
	default:
		if (lno)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s is of type %s and is illegally "
			    "referenced in its parent by a cell of type %s",
			    __wt_page_addr_string(session, vs->tmp1, page),
			    __wt_page_type_string(page->type),
			    __wt_cell_type_string(WT_CELL_ADDR_LNO));
		break;
	}

	/* Check tree connections and recursively descend the tree. */
	switch (page->type) {
	case WT_PAGE_COL_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_REF_FOREACH(page, ref, i) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * record number should be 1 more than the total records
			 * reviewed to this point.
			 */
			++entry;
			if (ref->u.recno != vs->record_total + 1) {
				__wt_cell_unpack(ref->addr, unpack);
				WT_RET_MSG(session, WT_ERROR,
				    "the starting record number in entry %"
				    PRIu32
				    " of the column internal page at "
				    "%s is %" PRIu64 " and the expected "
				    "starting record number is %" PRIu64,
				    entry,
				    __wt_page_addr_string(
				    session, vs->tmp1, page),
				    ref->u.recno,
				    vs->record_total + 1);
			}

			/* Verify the subtree. */
			WT_RET(__wt_page_in(session, page, ref));
			ret = __verify_tree(session, ref->page, vs);
			WT_TRET(__wt_page_release(session, ref->page));
			WT_RET(ret);

			__wt_cell_unpack(ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		}
		break;
	case WT_PAGE_ROW_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_REF_FOREACH(page, ref, i) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * key should be larger than the largest key previously
			 * reviewed.
			 *
			 * The 0th key of any internal page is magic, and we
			 * can't test against it.
			 */
			++entry;
			if (entry != 1)
				WT_RET(__verify_row_int_key_order(
				    session, page, ref, entry, vs));

			/* Verify the subtree. */
			WT_RET(__wt_page_in(session, page, ref));
			ret = __verify_tree(session, ref->page, vs);
			WT_TRET(__wt_page_release(session, ref->page));
			WT_RET(ret);

			__wt_cell_unpack(ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		}
/*
 * __sync_file --
 *	Flush pages for a specific file.
 */
static int
__sync_file(WT_SESSION_IMPL *session, int syncop)
{
	struct timespec end, start;
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_MODIFY *mod;
	WT_REF *walk;
	WT_TXN *txn;
	uint64_t internal_bytes, leaf_bytes;
	uint64_t internal_pages, leaf_pages;
	uint32_t flags;
	bool evict_reset;

	btree = S2BT(session);

	/* Only visit pages in cache, don't bump page read generations. */
	flags = WT_READ_CACHE | WT_READ_NO_GEN;
	walk = NULL;
	txn = &session->txn;

	internal_bytes = leaf_bytes = 0;
	internal_pages = leaf_pages = 0;
	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		WT_RET(__wt_epoch(session, &start));

	switch (syncop) {
	case WT_SYNC_WRITE_LEAVES:
		/*
		 * Write all immediately available, dirty in-cache leaf pages.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.
		 */
		if (!btree->modified)
			return (0);
		__wt_spin_lock(session, &btree->flush_lock);
		/* Re-check under the lock: another thread may have flushed. */
		if (!btree->modified) {
			__wt_spin_unlock(session, &btree->flush_lock);
			return (0);
		}

		flags |= WT_READ_NO_WAIT | WT_READ_SKIP_INTL;
		for (walk = NULL;;) {
			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			/*
			 * Write dirty pages if nobody beat us to it.  Don't
			 * try to write the hottest pages: checkpoint will have
			 * to visit them anyway.
			 */
			page = walk->page;
			if (__wt_page_is_modified(page) &&
			    __wt_txn_visible_all(
			    session, page->modify->update_txn)) {
				if (txn->isolation == WT_ISO_READ_COMMITTED)
					__wt_txn_get_snapshot(session);
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
				WT_ERR(__wt_reconcile(session, walk, NULL, 0));
			}
		}
		break;
	case WT_SYNC_CHECKPOINT:
		/*
		 * We cannot check the tree modified flag in the case of a
		 * checkpoint, the checkpoint code has already cleared it.
		 *
		 * Writing the leaf pages is done without acquiring a high-level
		 * lock, serialize so multiple threads don't walk the tree at
		 * the same time.  We're holding the schema lock, but need the
		 * lower-level lock as well.
		 */
		__wt_spin_lock(session, &btree->flush_lock);

		/*
		 * When internal pages are being reconciled by checkpoint their
		 * child pages cannot disappear from underneath them or be split
		 * into them, nor can underlying blocks be freed until the block
		 * lists for the checkpoint are stable.  Set the checkpointing
		 * flag to block eviction of dirty pages until the checkpoint's
		 * internal page pass is complete, then wait for any existing
		 * eviction to complete.
		 */
		btree->checkpointing = 1;
		WT_FULL_BARRIER();

		/* Briefly take exclusive access so in-flight eviction of this
		 * file's pages completes before we start writing. */
		WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
		if (evict_reset)
			__wt_evict_file_exclusive_off(session);

		/* Write all dirty in-cache pages. */
		flags |= WT_READ_NO_EVICT;
		for (walk = NULL;;) {
			/*
			 * If we have a page, and it was ever modified, track
			 * the highest transaction ID in the tree.  We do this
			 * here because we want the value after reconciling
			 * dirty pages.
			 */
			if (walk != NULL && walk->page != NULL &&
			    (mod = walk->page->modify) != NULL &&
			    WT_TXNID_LT(btree->rec_max_txn, mod->rec_max_txn))
				btree->rec_max_txn = mod->rec_max_txn;

			WT_ERR(__wt_tree_walk(session, &walk, NULL, flags));
			if (walk == NULL)
				break;

			page = walk->page;
			mod = page->modify;

			/* Skip clean pages. */
			if (!__wt_page_is_modified(page))
				continue;

			/*
			 * Write dirty pages, unless we can be sure they only
			 * became dirty after the checkpoint started.
			 *
			 * We can skip dirty pages if:
			 * (1) they are leaf pages;
			 * (2) there is a snapshot transaction active (which
			 *     is the case in ordinary application checkpoints
			 *     but not all internal cases); and
			 * (3) the first dirty update on the page is
			 *     sufficiently recent that the checkpoint
			 *     transaction would skip them.
			 *
			 * Mark the tree dirty: the checkpoint marked it clean
			 * and we can't skip future checkpoints until this page
			 * is written.
			 */
			if (!WT_PAGE_IS_INTERNAL(page) &&
			    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) &&
			    WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn) &&
			    mod->rec_result != WT_PM_REC_REWRITE) {
				__wt_page_modify_set(session, page);
				continue;
			}

			if (WT_PAGE_IS_INTERNAL(page)) {
				internal_bytes += page->memory_footprint;
				++internal_pages;
			} else {
				leaf_bytes += page->memory_footprint;
				++leaf_pages;
			}
			WT_ERR(__wt_reconcile(session, walk, NULL, 0));
		}
		break;
	}

	if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
		WT_ERR(__wt_epoch(session, &end));
		WT_ERR(__wt_verbose(session, WT_VERB_CHECKPOINT,
		    "__sync_file WT_SYNC_%s wrote:\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of leaves\n\t %" PRIu64
		    " bytes, %" PRIu64 " pages of internal\n\t"
		    "Took: %" PRIu64 "ms",
		    syncop == WT_SYNC_WRITE_LEAVES ?
		    "WRITE_LEAVES" : "CHECKPOINT",
		    leaf_bytes, leaf_pages, internal_bytes, internal_pages,
		    WT_TIMEDIFF(end, start) / WT_MILLION));
	}

err:	/* On error, clear any left-over tree walk. */
	if (walk != NULL)
		WT_TRET(__wt_page_release(session, walk, flags));

	/* Release any snapshot taken for read-committed page writes. */
	if (txn->isolation == WT_ISO_READ_COMMITTED &&
	    session->ncursors == 0)
		__wt_txn_release_snapshot(session);

	if (btree->checkpointing) {
		/*
		 * Update the checkpoint generation for this handle so visible
		 * updates newer than the checkpoint can be evicted.
		 *
		 * This has to be published before eviction is enabled again,
		 * so that eviction knows that the checkpoint has completed.
		 */
		WT_PUBLISH(btree->checkpoint_gen,
		    S2C(session)->txn_global.checkpoint_gen);
		WT_STAT_FAST_DATA_SET(session,
		    btree_checkpoint_generation, btree->checkpoint_gen);

		/*
		 * Clear the checkpoint flag and push the change; not required,
		 * but publishing the change means stalled eviction gets moving
		 * as soon as possible.
		 */
		btree->checkpointing = 0;
		WT_FULL_BARRIER();

		/*
		 * If this tree was being skipped by the eviction server during
		 * the checkpoint, clear the wait.
		 */
		btree->evict_walk_period = 0;

		/*
		 * Wake the eviction server, in case application threads have
		 * stalled while the eviction server decided it couldn't make
		 * progress.  Without this, application threads will be stalled
		 * until the eviction server next wakes.
		 */
		WT_TRET(__wt_evict_server_wake(session));
	}

	__wt_spin_unlock(session, &btree->flush_lock);

	/*
	 * Leaves are written before a checkpoint (or as part of a file close,
	 * before checkpointing the file).  Start a flush to stable storage,
	 * but don't wait for it.
	 */
	if (ret == 0 && syncop == WT_SYNC_WRITE_LEAVES)
		WT_RET(btree->bm->sync(btree->bm, session, true));

	return (ret);
}
/*
 * __tree_walk_internal --
 *	Move to the next/previous page in the tree.
 *
 * On entry *refp is the walk's current position (NULL to start a new walk);
 * on return it is the next page (hazard pointer held), or NULL at the walk's
 * end. The optional skip_func/func_cookie pair lets callers skip subtrees.
 */
static inline int
__tree_walk_internal(WT_SESSION_IMPL *session,
    WT_REF **refp, uint64_t *walkcntp,
    int (*skip_func)(WT_SESSION_IMPL *, WT_REF *, void *, bool *),
    void *func_cookie, uint32_t flags)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE_INDEX *pindex;
	WT_REF *couple, *couple_orig, *ref;
	uint32_t slot;
	bool empty_internal, initial_descent, prev, skip;

	btree = S2BT(session);
	pindex = NULL;
	empty_internal = initial_descent = false;

	/*
	 * Tree walks are special: they look inside page structures that splits
	 * may want to free. Publish that the tree is active during this
	 * window.
	 */
	WT_ENTER_PAGE_INDEX(session);

	/* Walk should never instantiate deleted pages. */
	LF_SET(WT_READ_NO_EMPTY);

	/*
	 * !!!
	 * Fast-truncate currently only works on row-store trees.
	 */
	if (btree->type != BTREE_ROW)
		LF_CLR(WT_READ_TRUNCATE);

	prev = LF_ISSET(WT_READ_PREV) ? 1 : 0;

	/*
	 * There are multiple reasons and approaches to walking the in-memory
	 * tree:
	 *
	 * (1) finding pages to evict (the eviction server);
	 * (2) writing just dirty leaves or internal nodes (checkpoint);
	 * (3) discarding pages (close);
	 * (4) truncating pages in a range (fast truncate);
	 * (5) skipping pages based on outside information (compaction);
	 * (6) cursor scans (applications).
	 *
	 * Except for cursor scans and compaction, the walk is limited to the
	 * cache, no pages are read. In all cases, hazard pointers protect the
	 * walked pages from eviction.
	 *
	 * Walks use hazard-pointer coupling through the tree and that's OK
	 * (hazard pointers can't deadlock, so there's none of the usual
	 * problems found when logically locking up a btree). If the eviction
	 * thread tries to evict the active page, it fails because of our
	 * hazard pointer. If eviction tries to evict our parent, that fails
	 * because the parent has a child page that can't be discarded. We do
	 * play one game: don't couple up to our parent and then back down to a
	 * new leaf, couple to the next page to which we're descending, it
	 * saves a hazard-pointer swap for each cursor page movement.
	 *
	 * !!!
	 * NOTE: we depend on the fact it's OK to release a page we don't hold,
	 * that is, it's OK to release couple when couple is set to NULL.
	 *
	 * Take a copy of any held page and clear the return value. Remember
	 * the hazard pointer we're currently holding.
	 *
	 * Clear the returned value, it makes future error handling easier.
	 */
	couple = couple_orig = ref = *refp;
	*refp = NULL;

	/* If no page is active, begin a walk from the start/end of the tree. */
	if (ref == NULL) {
restart:	/*
		 * We can be here with a NULL or root WT_REF; the page release
		 * function handles them internally, don't complicate this code
		 * by calling them out.
		 */
		WT_ERR(__wt_page_release(session, couple, flags));

		/*
		 * We're not supposed to walk trees without root pages. As this
		 * has not always been the case, assert to debug that change.
		 */
		WT_ASSERT(session, btree->root.page != NULL);

		couple = couple_orig = ref = &btree->root;
		initial_descent = true;
		goto descend;
	}

	/*
	 * If the active page was the root, we've reached the walk's end; we
	 * only get here if we've returned the root to our caller, so we're
	 * holding no hazard pointers.
	 */
	if (__wt_ref_is_root(ref))
		goto done;

	/* Figure out the current slot in the WT_REF array. */
	__ref_index_slot(session, ref, &pindex, &slot);

	for (;;) {
		/*
		 * If we're at the last/first slot on the internal page, return
		 * it in post-order traversal. Otherwise move to the next/prev
		 * slot and left/right-most element in that subtree.
		 */
		while ((prev && slot == 0) ||
		    (!prev && slot == pindex->entries - 1)) {
			/* Ascend to the parent. */
			__ref_ascend(session, &ref, &pindex, &slot);

			/*
			 * If at the root and returning internal pages, return
			 * the root page, otherwise we're done. Regardless, no
			 * hazard pointer is required, release the one we hold.
			 */
			if (__wt_ref_is_root(ref)) {
				WT_ERR(__wt_page_release(
				    session, couple, flags));
				if (!LF_ISSET(WT_READ_SKIP_INTL))
					*refp = ref;
				goto done;
			}

			/*
			 * If we got all the way through an internal page and
			 * all of the child pages were deleted, mark it for
			 * eviction.
			 */
			if (empty_internal && pindex->entries > 1) {
				__wt_page_evict_soon(session, ref);
				empty_internal = false;
			}

			/*
			 * Optionally return internal pages. Swap our previous
			 * hazard pointer for the page we'll return. We don't
			 * handle restart or not-found returns, it would require
			 * additional complexity and is not a possible return:
			 * we're moving to the parent of the current child page,
			 * the parent can't have been evicted.
			 */
			if (!LF_ISSET(WT_READ_SKIP_INTL)) {
				WT_ERR(__wt_page_swap(
				    session, couple, ref, flags));
				*refp = ref;
				goto done;
			}
		}

		if (prev)
			--slot;
		else
			++slot;
		if (walkcntp != NULL)
			++*walkcntp;

		for (;;) {
			/*
			 * Move to the next slot, and set the reference hint if
			 * it's wrong (used when we continue the walk). We don't
			 * always update the hints when splitting, it's expected
			 * for them to be incorrect in some workloads.
			 */
			ref = pindex->index[slot];
			if (ref->pindex_hint != slot)
				ref->pindex_hint = slot;

			/*
			 * If we see any child states other than deleted, the
			 * page isn't empty.
			 */
			if (ref->state != WT_REF_DELETED &&
			    !LF_ISSET(WT_READ_TRUNCATE))
				empty_internal = false;

			if (LF_ISSET(WT_READ_CACHE)) {
				/*
				 * Only look at unlocked pages in memory:
				 * fast-path some common cases.
				 */
				if (LF_ISSET(WT_READ_NO_WAIT) &&
				    ref->state != WT_REF_MEM)
					break;

				/* Skip lookaside pages if not requested. */
				if (ref->state == WT_REF_LOOKASIDE &&
				    !LF_ISSET(WT_READ_LOOKASIDE))
					break;
			} else if (LF_ISSET(WT_READ_TRUNCATE)) {
				/*
				 * Avoid pulling a deleted page back in to try
				 * to delete it again.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;

				/*
				 * If deleting a range, try to delete the page
				 * without instantiating it.
				 */
				WT_ERR(__wt_delete_page(session, ref, &skip));
				if (skip)
					break;
				empty_internal = false;
			} else if (skip_func != NULL) {
				WT_ERR(skip_func(session,
				    ref, func_cookie, &skip));
				if (skip)
					break;
			} else {
				/*
				 * Try to skip deleted pages visible to us.
				 */
				if (ref->state == WT_REF_DELETED &&
				    __wt_delete_page_skip(session, ref, false))
					break;
			}

			ret = __wt_page_swap(session, couple, ref,
			    WT_READ_NOTFOUND_OK | WT_READ_RESTART_OK | flags);

			/*
			 * Not-found is an expected return when only walking
			 * in-cache pages, or if we see a deleted page.
			 */
			if (ret == WT_NOTFOUND) {
				ret = 0;
				break;
			}

			/*
			 * The page we're moving to might have split, in which
			 * case move to the last position we held.
			 */
			if (ret == WT_RESTART) {
				ret = 0;

				/*
				 * If a cursor is setting up at the end of the
				 * tree, we can't use our parent page's index,
				 * because it may have already split; restart
				 * the walk.
				 */
				if (prev && initial_descent)
					goto restart;

				/*
				 * If a new walk that never coupled from the
				 * root to a new saved position in the tree,
				 * restart the walk.
				 */
				if (couple == &btree->root)
					goto restart;

				/*
				 * If restarting from some original position,
				 * repeat the increment or decrement we made at
				 * that time. Otherwise, couple is an internal
				 * page we've acquired after moving from that
				 * starting position and we can treat it as a
				 * new page. This works because we never acquire
				 * a hazard pointer on a leaf page we're not
				 * going to return to our caller, this will quit
				 * working if that ever changes.
				 */
				WT_ASSERT(session,
				    couple == couple_orig ||
				    WT_PAGE_IS_INTERNAL(couple->page));
				ref = couple;
				__ref_index_slot(session, ref, &pindex, &slot);
				if (couple == couple_orig)
					break;
			}
			WT_ERR(ret);
			couple = ref;

			/*
			 * A new page: configure for traversal of any internal
			 * page's children, else return the leaf page.
			 */
			if (WT_PAGE_IS_INTERNAL(ref->page)) {
descend:			empty_internal = true;

				/*
				 * There's a split race when a cursor is setting
				 * up at the end of the tree or moving backwards
				 * through the tree and descending a level. When
				 * splitting an internal page into its parent,
				 * we move the WT_REF structures and update the
				 * parent's page index before updating the split
				 * page's page index, and it's not an atomic
				 * update. A thread can read the parent page's
				 * replacement page index, then read the split
				 * page's original index, or the parent page's
				 * original and the split page's replacement.
				 *
				 * This isn't a problem for a cursor setting up
				 * at the start of the tree or moving forwards
				 * through the tree because we do right-hand
				 * splits on internal pages and the initial part
				 * of the split page's namespace won't change as
				 * part of a split. A thread reading the parent
				 * page's and split page's indexes will move to
				 * the same slot no matter what order of indexes
				 * are read.
				 *
				 * Handle a cursor setting up at the end of the
				 * tree or moving backwards through the tree.
				 */
				if (!prev) {
					WT_INTL_INDEX_GET(
					    session, ref->page, pindex);
					slot = 0;
				} else if (initial_descent) {
					if (!__ref_initial_descent_prev(
					    session, ref, &pindex))
						goto restart;
					slot = pindex->entries - 1;
				} else {
					__ref_descend_prev(
					    session, ref, &pindex);
					slot = pindex->entries - 1;
				}
				continue;
			}

			/*
			 * The tree-walk restart code knows we return any leaf
			 * page we acquire (never hazard-pointer coupling on
			 * after acquiring a leaf page), and asserts no restart
			 * happens while holding a leaf page. This page must be
			 * returned to our caller.
			 */
			*refp = ref;
			goto done;
		}
	}

done:
err:	WT_LEAVE_PAGE_INDEX(session);
	return (ret);
}
/*
 * __wt_evict_file --
 *	Discard pages for a specific file.
 *
 * The syncop argument selects how pages are discarded: WT_SYNC_CLOSE evicts
 * (writing dirty pages first), WT_SYNC_DISCARD discards only clean pages and
 * fails with EBUSY otherwise, WT_SYNC_DISCARD_FORCE discards unconditionally.
 */
int
__wt_evict_file(WT_SESSION_IMPL *session, int syncop)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *next_ref, *ref;
	int eviction_enabled;

	btree = S2BT(session);
	eviction_enabled = !F_ISSET(btree, WT_BTREE_NO_EVICTION);

	/*
	 * We need exclusive access to the file -- disable ordinary eviction
	 * and drain any blocks already queued.
	 */
	if (eviction_enabled)
		WT_RET(__wt_evict_file_exclusive_on(session));

	/* Make sure the oldest transaction ID is up-to-date. */
	__wt_txn_update_oldest(session);

	/* Walk the tree, discarding pages. */
	next_ref = NULL;
	WT_ERR(__wt_tree_walk(
	    session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));
	while ((ref = next_ref) != NULL) {
		page = ref->page;

		/*
		 * Eviction can fail when a page in the evicted page's subtree
		 * switches state. For example, if we don't evict a page marked
		 * empty, because we expect it to be merged into its parent, it
		 * might no longer be empty after it's reconciled, in which case
		 * eviction of its parent would fail. We can either walk the
		 * tree multiple times (until it's finally empty), or reconcile
		 * each page to get it to its final state before considering if
		 * it's an eviction target or will be merged into its parent.
		 *
		 * Don't limit this test to any particular page type, that tends
		 * to introduce bugs when the reconciliation of other page types
		 * changes, and there's no advantage to doing so.
		 *
		 * Eviction can also fail because an update cannot be written.
		 * If sessions have disjoint sets of files open, updates in a
		 * no-longer-referenced file may not yet be globally visible,
		 * and the write will fail with EBUSY. Our caller handles that
		 * error, retrying later.
		 */
		if (syncop == WT_SYNC_CLOSE && __wt_page_is_modified(page))
			WT_ERR(__wt_reconcile(session, ref, NULL, WT_EVICTING));

		/*
		 * We can't evict the page just returned to us (it marks our
		 * place in the tree), so move the walk to one page ahead of
		 * the page being evicted. Note, we reconciled the returned
		 * page first: if reconciliation of that page were to change
		 * the shape of the tree, and we did the next walk call before
		 * the reconciliation, the next walk call could miss a page in
		 * the tree.
		 */
		WT_ERR(__wt_tree_walk(
		    session, &next_ref, WT_READ_CACHE | WT_READ_NO_EVICT));

		switch (syncop) {
		case WT_SYNC_CLOSE:
			/*
			 * Evict the page.
			 * Do not attempt to evict pages expected to be merged
			 * into their parents, with the exception that the root
			 * page can't be merged, it must be written.
			 */
			if (__wt_ref_is_root(ref) ||
			    page->modify == NULL ||
			    !F_ISSET(page->modify, WT_PM_REC_EMPTY))
				WT_ERR(__wt_evict(session, ref, 1));
			break;
		case WT_SYNC_DISCARD:
			/*
			 * Ordinary discard of the page, whether clean or dirty.
			 * If we see a dirty page in an ordinary discard (e.g.,
			 * from sweep), give up: an update must have happened
			 * since the file was selected for sweeping.
			 */
			if (__wt_page_is_modified(page))
				WT_ERR(EBUSY);

			/*
			 * If the page contains an update that is too recent to
			 * evict, stop. This should never happen during
			 * connection close, but in other paths our caller
			 * should be prepared to deal with this case.
			 */
			if (page->modify != NULL &&
			    !__wt_txn_visible_all(session,
			    page->modify->rec_max_txn))
				WT_ERR(EBUSY);

			__wt_evict_page_clean_update(session, ref);
			break;
		case WT_SYNC_DISCARD_FORCE:
			/*
			 * Forced discard of the page, whether clean or dirty.
			 * If we see a dirty page in a forced discard, clean
			 * the page, both to keep statistics correct, and to
			 * let the page-discard function assert no dirty page
			 * is ever discarded.
			 */
			if (__wt_page_is_modified(page)) {
				page->modify->write_gen = 0;
				__wt_cache_dirty_decr(session, page);
			}

			F_SET(session, WT_SESSION_DISCARD_FORCE);
			__wt_evict_page_clean_update(session, ref);
			F_CLR(session, WT_SESSION_DISCARD_FORCE);
			break;
		WT_ILLEGAL_VALUE_ERR(session);
		}
	}

	if (0) {
err:		/* On error, clear any left-over tree walk. */
		if (next_ref != NULL)
			WT_TRET(__wt_page_release(
			    session, next_ref, WT_READ_NO_EVICT));
	}

	/* Re-enable ordinary eviction if we disabled it on the way in. */
	if (eviction_enabled)
		__wt_evict_file_exclusive_off(session);

	return (ret);
}
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 * read it from the disk and build an in-memory version.
 *
 * Loops over the WT_REF state machine until the page is pinned, the caller's
 * flags say not to wait, or the walk must restart (page split).
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_PAGE *page;
	int busy, force_attempts, oldgen;

	for (force_attempts = oldgen = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, attempt to read it.
			 * Make sure there is space in the cache.
			 */
			WT_RET(__wt_cache_full_check(session));
			WT_RET(__wt_cache_read(session, ref));
			/* Remember whether this read should not pollute the
			 * cache; applied once the page is pinned below. */
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			/* FALLTHROUGH */
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			/* The page is busy -- wait. */
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory: get a hazard pointer, update
			 * the page's LRU and return. The expected reason we
			 * can't get a hazard pointer is because the page is
			 * being evicted; yield and try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy)
				break;

			page = ref->page;
			WT_ASSERT(session, page != NULL);

			/* Forcibly evict pages that are too big. */
			if (!LF_ISSET(WT_READ_NO_EVICT) &&
			    force_attempts < 10 &&
			    __evict_force_check(session, page)) {
				++force_attempts;
				WT_RET(__wt_page_release(session, ref, flags));
				break;
			}

			/* Check if we need an autocommit transaction. */
			if ((ret = __wt_txn_autocommit_check(session)) != 0) {
				/* Drop the hazard pointer before failing. */
				WT_TRET(__wt_hazard_clear(session, page));
				return (ret);
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_set(session);
			return (0);
		WT_ILLEGAL_VALUE(session);
		}

		/* We failed to get the page -- yield before retrying. */
		__wt_yield();
	}
}
/*
 * __wt_compact --
 *	Compact a file.
 *
 * Walks the tree asking the block manager whether each page would move to a
 * better position if rewritten; pages that would are marked dirty so the
 * next checkpoint rewrites them.
 */
int
__wt_compact(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_REF *ref;
	int block_manager_begin, evict_reset, skip;

	WT_UNUSED(cfg);

	conn = S2C(session);
	btree = S2BT(session);
	bm = btree->bm;
	ref = NULL;
	block_manager_begin = 0;

	WT_STAT_FAST_DATA_INCR(session, session_compact);

	/*
	 * Check if compaction might be useful -- the API layer will quit trying
	 * to compact the data source if we make no progress, set a flag if the
	 * block layer thinks compaction is possible.
	 */
	WT_RET(bm->compact_skip(bm, session, &skip));
	if (skip)
		return (0);

	/*
	 * Reviewing in-memory pages requires looking at page reconciliation
	 * results, because we care about where the page is stored now, not
	 * where the page was stored when we first read it into the cache.
	 * We need to ensure we don't race with page reconciliation as it's
	 * writing the page modify information.
	 *
	 * There are three ways we call reconciliation: checkpoints, threads
	 * writing leaf pages (usually in preparation for a checkpoint or if
	 * closing a file), and eviction.
	 *
	 * We're holding the schema lock which serializes with checkpoints.
	 */
	WT_ASSERT(session, F_ISSET(session, WT_SESSION_SCHEMA_LOCKED));

	/*
	 * Get the tree handle's flush lock which blocks threads writing leaf
	 * pages.
	 */
	__wt_spin_lock(session, &btree->flush_lock);

	/*
	 * That leaves eviction, we don't want to block eviction. Set a flag
	 * so reconciliation knows compaction is running. If reconciliation
	 * sees the flag it locks the page it's writing, we acquire the same
	 * lock when reading the page's modify information, serializing access.
	 * The same page lock blocks work on the page, but compaction is an
	 * uncommon, heavy-weight operation. If it's ever a problem, there's
	 * no reason we couldn't use an entirely separate lock than the page
	 * lock.
	 *
	 * We also need to ensure we don't race with an on-going reconciliation.
	 * After we set the flag, wait for eviction of this file to drain, and
	 * then let eviction continue;
	 */
	conn->compact_in_memory_pass = 1;
	WT_ERR(__wt_evict_file_exclusive_on(session, &evict_reset));
	if (evict_reset)
		__wt_evict_file_exclusive_off(session);

	/* Start compaction. */
	WT_ERR(bm->compact_start(bm, session));
	block_manager_begin = 1;

	/* Walk the tree reviewing pages to see if they should be re-written. */
	session->compaction = 1;
	for (;;) {
		/*
		 * Pages read for compaction aren't "useful"; don't update the
		 * read generation of pages already in memory, and if a page is
		 * read, set its generation to a low value so it is evicted
		 * quickly.
		 */
		WT_ERR(__wt_tree_walk(session, &ref, NULL,
		    WT_READ_COMPACT | WT_READ_NO_GEN | WT_READ_WONT_NEED));
		if (ref == NULL)
			break;

		WT_ERR(__compact_rewrite(session, ref, &skip));
		if (skip)
			continue;

		/* Rewrite the page: mark the page and tree dirty. */
		WT_ERR(__wt_page_modify_init(session, ref->page));
		__wt_page_modify_set(session, ref->page);

		WT_STAT_FAST_DATA_INCR(session, btree_compact_rewrite);
	}

err:	/* On error, clear any left-over tree walk. */
	if (ref != NULL)
		WT_TRET(__wt_page_release(session, ref, 0));

	if (block_manager_begin)
		WT_TRET(bm->compact_end(bm, session));

	/*
	 * Unlock will be a release barrier, use it to update the compaction
	 * status for reconciliation.
	 */
	conn->compact_in_memory_pass = 0;
	__wt_spin_unlock(session, &btree->flush_lock);

	return (ret);
}