/*
 * __wt_lsm_checkpoint_chunk --
 *	Flush a single LSM chunk to disk.
 */
int
__wt_lsm_checkpoint_chunk(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
	WT_DECL_RET;
	WT_TXN_ISOLATION saved_isolation;
	bool flush_set;

	flush_set = false;

	/*
	 * If the chunk is already checkpointed, make sure it is also evicted.
	 * Either way, there is no point trying to checkpoint it again.
	 */
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) {
		WT_WITH_HANDLE_LIST_LOCK(session,
		    ret = __lsm_discard_handle(session, chunk->uri, NULL));
		if (ret == 0)
			chunk->evicted = 1;
		else if (ret == EBUSY)
			ret = 0;
		else
			WT_RET_MSG(session, ret, "discard handle");
	}
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s already on disk", chunk->uri));
		return (0);
	}

	/* Stop if a running transaction needs the chunk. */
	__wt_txn_update_oldest(session, true);
	if (chunk->switch_txn == WT_TXN_NONE ||
	    !__wt_txn_visible_all(session, chunk->switch_txn)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s: running transaction, return",
		    chunk->uri));
		return (0);
	}

	if (!__wt_atomic_cas8(&chunk->flushing, 0, 1))
		return (0);
	flush_set = true;

	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker flushing %s", chunk->uri));

	/*
	 * Flush the file before checkpointing: this is the expensive part in
	 * terms of I/O.
	 *
	 * !!!
	 * We can wait here for checkpoints and fsyncs to complete, which can
	 * take a long time.
	 */
	if ((ret = __wt_session_get_btree(
	    session, chunk->uri, NULL, NULL, 0)) == 0) {
		/*
		 * Set read-uncommitted: we have already checked that all of
		 * the updates in this chunk are globally visible, use the
		 * cheapest possible check in reconciliation.
		 */
		saved_isolation = session->txn.isolation;
		session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
		ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
		session->txn.isolation = saved_isolation;
		WT_TRET(__wt_session_release_btree(session));
	}
	WT_ERR(ret);

	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker checkpointing %s", chunk->uri));

	/*
	 * Turn on metadata tracking to ensure the checkpoint gets the
	 * necessary handle locks.
	 *
	 * Ensure that we don't race with a running checkpoint: the checkpoint
	 * lock protects against us racing with an application checkpoint in
	 * this chunk.  Don't wait for it, though: checkpoints can take a long
	 * time, and our checkpoint operation should be very quick.
	 */
	WT_ERR(__wt_meta_track_on(session));
	WT_WITH_CHECKPOINT_LOCK(session, ret,
	    WT_WITH_SCHEMA_LOCK(session, ret,
		ret = __wt_schema_worker(
		session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)));
	WT_TRET(__wt_meta_track_off(session, false, ret != 0));
	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM checkpoint");

	/* Now the file is written, get the chunk size. */
	WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));

	/* Update the flush timestamp to help track ongoing progress. */
	WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts));
	++lsm_tree->chunks_flushed;

	/* Lock the tree, mark the chunk as on disk and update the metadata. */
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree, true);
	WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM metadata write");

	WT_PUBLISH(chunk->flushing, 0);
	flush_set = false;

	/*
	 * Clear the no-eviction flag so the primary can be evicted and
	 * eventually closed.  Only do this once the checkpoint has succeeded:
	 * otherwise, accessing the leaf page during the checkpoint can trigger
	 * forced eviction.
	 */
	WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
	__wt_btree_evictable(session, true);
	WT_ERR(__wt_session_release_btree(session));

	/* Make sure we aren't pinning a transaction ID. */
	__wt_txn_release_snapshot(session);

	WT_ERR(__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker checkpointed %s", chunk->uri));

	/* Schedule a bloom filter create for our newly flushed chunk. */
	if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
	else
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_MERGE, 0, lsm_tree));

err:	if (flush_set)
		WT_PUBLISH(chunk->flushing, 0);

	return (ret);
}
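/*
 * Illustrative only: one way a flush worker could drive the entry point
 * above is to walk the tree's in-memory chunk array and flush anything not
 * yet on disk.  This is a sketch, not the actual worker control flow: the
 * helper name is hypothetical, the chunk/nchunks fields are assumed to be
 * the WT_LSM_TREE chunk array, and real workers coordinate through the LSM
 * manager's work queue rather than scanning the array unlocked.
 */
static int
__lsm_flush_all_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_LSM_CHUNK *chunk;
	u_int i;

	for (i = 0; i < lsm_tree->nchunks; i++) {
		/* Skip empty slots and chunks that are already on disk. */
		if ((chunk = lsm_tree->chunk[i]) == NULL ||
		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
			continue;
		/* Flush, checkpoint and publish this chunk. */
		WT_RET(__wt_lsm_checkpoint_chunk(session, lsm_tree, chunk));
	}
	return (0);
}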
/*
 * __sweep --
 *	Close unused dhandles on the connection dhandle list.
 */
static int
__sweep(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DATA_HANDLE *dhandle, *dhandle_next;
	WT_DECL_RET;
	time_t now;

	conn = S2C(session);

	/* Don't discard handles that have been open recently. */
	WT_RET(__wt_seconds(session, &now));

	dhandle = SLIST_FIRST(&conn->dhlh);
	for (; dhandle != NULL; dhandle = dhandle_next) {
		dhandle_next = SLIST_NEXT(dhandle, l);
		if (dhandle->session_ref != 0 ||
		    now - dhandle->timeofdeath <= WT_DHANDLE_SWEEP_WAIT)
			continue;

		/*
		 * We have a candidate for closing; if it's open, flush dirty
		 * leaf pages, then acquire an exclusive lock on the handle
		 * and close it.  We might be blocking opens for a long time
		 * (over disk I/O), but the handle has been quiescent for a
		 * while.
		 *
		 * The close can fail if an update cannot be written (updates
		 * in a no-longer-referenced file might not yet be globally
		 * visible if sessions have disjoint sets of files open).  If
		 * the handle is busy, skip it: we'll retry the close the next
		 * time, after the transaction state has progressed.
		 */
		if (F_ISSET(dhandle, WT_DHANDLE_OPEN)) {
			WT_WITH_DHANDLE(session, dhandle,
			    ret = __wt_cache_op(
			    session, NULL, WT_SYNC_WRITE_LEAVES));
			WT_RET(ret);

			/* Re-check that this looks like a good candidate. */
			if (dhandle->session_ref != 0 ||
			    now - dhandle->timeofdeath <=
			    WT_DHANDLE_SWEEP_WAIT)
				continue;

			/*
			 * We don't set WT_DHANDLE_EXCLUSIVE deliberately, we
			 * want opens to block on us rather than returning an
			 * EBUSY error to the application.
			 */
			ret = __wt_try_writelock(session, dhandle->rwlock);
			if (ret == EBUSY) {
				ret = 0;
				continue;
			}
			WT_RET(ret);

			WT_WITH_DHANDLE(session, dhandle,
			    ret = __wt_conn_btree_sync_and_close(session, 0));
			if (ret == EBUSY)
				ret = 0;

			WT_TRET(__wt_writeunlock(session, dhandle->rwlock));
			WT_RET(ret);
		}

		/*
		 * Attempt to discard the handle (the called function checks
		 * the handle-open flag after acquiring appropriate locks,
		 * which is why we don't do any special handling of EBUSY
		 * returns above: that path never cleared the handle-open
		 * flag).
		 */
		WT_WITH_DHANDLE(session, dhandle,
		    ret = __wt_conn_dhandle_discard_single(session, 0));
		if (ret == EBUSY)
			ret = 0;
		WT_RET(ret);
	}
	return (0);
}
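/*
 * For context, __sweep is meant to be driven periodically from a server
 * thread.  A minimal sketch of such a loop follows; the thread function,
 * its name, and reusing WT_DHANDLE_SWEEP_WAIT as the wake-up interval are
 * assumptions for illustration, and the real server's startup, shutdown
 * and condition-variable wiring are omitted.
 */
static WT_THREAD_RET
__sweep_server_sketch(void *arg)
{
	WT_SESSION_IMPL *session;

	session = arg;
	for (;;) {
		/* Wake once per sweep interval, then close idle handles. */
		__wt_sleep(WT_DHANDLE_SWEEP_WAIT, 0);
		if (__sweep(session) != 0)
			break;
	}
	return (WT_THREAD_RET_VALUE);
}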
/*
 * __wt_verify --
 *	Verify a file.
 */
int
__wt_verify(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CKPT *ckptbase, *ckpt;
	WT_DECL_RET;
	WT_VSTUFF *vs, _vstuff;
	size_t root_addr_size;
	uint8_t root_addr[WT_BTREE_MAX_ADDR_COOKIE];
	bool bm_start, quit;

	btree = S2BT(session);
	bm = btree->bm;
	ckptbase = NULL;
	bm_start = false;

	WT_CLEAR(_vstuff);
	vs = &_vstuff;
	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_key));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->max_addr));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp1));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp2));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp3));
	WT_ERR(__wt_scr_alloc(session, 0, &vs->tmp4));

	/* Check configuration strings. */
	WT_ERR(__verify_config(session, cfg, vs));

	/* Optionally dump specific block offsets. */
	WT_ERR(__verify_config_offsets(session, cfg, &quit));
	if (quit)
		goto done;

	/* Get a list of the checkpoints for this file. */
	WT_ERR(
	    __wt_meta_ckptlist_get(session, btree->dhandle->name, &ckptbase));

	/* Inform the underlying block manager we're verifying. */
	WT_ERR(bm->verify_start(bm, session, ckptbase, cfg));
	bm_start = true;

	/* Loop through the file's checkpoints, verifying each one. */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		WT_ERR(__wt_verbose(session, WT_VERB_VERIFY,
		    "%s: checkpoint %s", btree->dhandle->name, ckpt->name));

		/* Fake checkpoints require no work. */
		if (F_ISSET(ckpt, WT_CKPT_FAKE))
			continue;

		/* House-keeping between checkpoints. */
		__verify_checkpoint_reset(vs);

		if (WT_VRFY_DUMP(vs))
			WT_ERR(__wt_msg(session, "%s: checkpoint %s",
			    btree->dhandle->name, ckpt->name));

		/* Load the checkpoint. */
		WT_ERR(bm->checkpoint_load(bm, session,
		    ckpt->raw.data, ckpt->raw.size,
		    root_addr, &root_addr_size, true));

		/*
		 * Ignore trees with no root page.
		 * Verify, then discard the checkpoint from the cache.
		 */
		if (root_addr_size != 0 &&
		    (ret = __wt_btree_tree_open(
		    session, root_addr, root_addr_size)) == 0) {
			if (WT_VRFY_DUMP(vs))
				WT_ERR(__wt_msg(session, "Root: %s %s",
				    __wt_addr_string(session,
				    root_addr, root_addr_size, vs->tmp1),
				    __wt_page_type_string(
				    btree->root.page->type)));

			WT_WITH_PAGE_INDEX(session,
			    ret = __verify_tree(session, &btree->root, vs));

			WT_TRET(__wt_cache_op(session, WT_SYNC_DISCARD));
		}

		/* Unload the checkpoint. */
		WT_TRET(bm->checkpoint_unload(bm, session));
		WT_ERR(ret);

		/* Display the tree shape. */
		if (vs->dump_shape)
			WT_ERR(__verify_tree_shape(session, vs));
	}
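	/*
	 * The snippet breaks off after the checkpoint loop.  A plausible
	 * epilogue, inferred from the "goto done" above, the bm_start flag
	 * and the six scratch buffers allocated at the top, is sketched
	 * here; the exact cleanup calls and their signatures are
	 * assumptions, not quoted source.
	 */
done:
err:	/* Inform the underlying block manager we're done. */
	if (bm_start)
		WT_TRET(bm->verify_end(bm, session));

	/* Discard the list of checkpoints. */
	if (ckptbase != NULL)
		__wt_meta_ckptlist_free(session, ckptbase);

	/* Free allocated memory. */
	__wt_scr_free(session, &vs->max_key);
	__wt_scr_free(session, &vs->max_addr);
	__wt_scr_free(session, &vs->tmp1);
	__wt_scr_free(session, &vs->tmp2);
	__wt_scr_free(session, &vs->tmp3);
	__wt_scr_free(session, &vs->tmp4);

	return (ret);
}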
/*
 * __wt_lsm_checkpoint_chunk --
 *	Flush a single LSM chunk to disk.
 */
int
__wt_lsm_checkpoint_chunk(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
	WT_DECL_RET;
	WT_TXN_ISOLATION saved_isolation;

	/*
	 * If the chunk is already checkpointed, make sure it is also evicted.
	 * Either way, there is no point trying to checkpoint it again.
	 */
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
	    !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) {
		if ((ret = __lsm_discard_handle(
		    session, chunk->uri, NULL)) == 0)
			chunk->evicted = 1;
		else if (ret == EBUSY)
			ret = 0;
		else
			WT_RET_MSG(session, ret, "discard handle");
	}
	if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s already on disk", chunk->uri));
		return (0);
	}

	/* Stop if a running transaction needs the chunk. */
	__wt_txn_update_oldest(session);
	if (chunk->switch_txn == WT_TXN_NONE ||
	    !__wt_txn_visible_all(session, chunk->switch_txn)) {
		WT_RET(__wt_verbose(session, WT_VERB_LSM,
		    "LSM worker %s: running transaction, return",
		    chunk->uri));
		return (0);
	}

	WT_RET(__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker flushing %s", chunk->uri));

	/*
	 * Flush the file before checkpointing: this is the expensive part in
	 * terms of I/O.
	 *
	 * Use the special eviction isolation level to avoid interfering with
	 * an application checkpoint: we have already checked that all of the
	 * updates in this chunk are globally visible.
	 *
	 * !!!
	 * We can wait here for checkpoints and fsyncs to complete, which can
	 * take a long time.
	 */
	if ((ret = __wt_session_get_btree(
	    session, chunk->uri, NULL, NULL, 0)) == 0) {
		saved_isolation = session->txn.isolation;
		session->txn.isolation = TXN_ISO_EVICTION;
		ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
		session->txn.isolation = saved_isolation;
		WT_TRET(__wt_session_release_btree(session));
	}
	WT_RET(ret);

	WT_RET(__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker checkpointing %s", chunk->uri));

	WT_WITH_SCHEMA_LOCK(session,
	    ret = __wt_schema_worker(
	    session, chunk->uri, __wt_checkpoint, NULL, NULL, 0));

	if (ret != 0)
		WT_RET_MSG(session, ret, "LSM checkpoint");

	/* Now the file is written, get the chunk size. */
	WT_RET(__wt_lsm_tree_set_chunk_size(session, chunk));

	/* Update the flush timestamp to help track ongoing progress. */
	WT_RET(__wt_epoch(session, &lsm_tree->last_flush_ts));

	/* Lock the tree, mark the chunk as on disk and update the metadata. */
	WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
	F_SET(chunk, WT_LSM_CHUNK_ONDISK);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;

	/* Update the throttle time. */
	__wt_lsm_tree_throttle(session, lsm_tree, 1);
	WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	if (ret != 0)
		WT_RET_MSG(session, ret, "LSM metadata write");

	/*
	 * Clear the no-eviction flag so the primary can be evicted and
	 * eventually closed.  Only do this once the checkpoint has succeeded:
	 * otherwise, accessing the leaf page during the checkpoint can trigger
	 * forced eviction.
	 */
	WT_RET(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
	__wt_btree_evictable(session, 1);
	WT_RET(__wt_session_release_btree(session));

	/* Make sure we aren't pinning a transaction ID. */
	__wt_txn_release_snapshot(session);

	WT_RET(__wt_verbose(session, WT_VERB_LSM,
	    "LSM worker checkpointed %s", chunk->uri));

	/* Schedule a bloom filter create for our newly flushed chunk. */
	if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
		WT_RET(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
	else
		WT_RET(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_MERGE, 0, lsm_tree));

	return (0);
}