/*
 * __wt_lsm_work_switch --
 *     Do a switch if the LSM tree needs one.
 */
int
__wt_lsm_work_switch(
    WT_SESSION_IMPL *session, WT_LSM_WORK_UNIT **entryp, bool *ran)
{
    WT_DECL_RET;
    WT_LSM_WORK_UNIT *entry;

    /* We've become responsible for freeing the work unit. */
    entry = *entryp;
    *ran = false;
    *entryp = NULL;

    if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
        WT_WITH_SCHEMA_LOCK(session, ret,
            ret = __wt_lsm_tree_switch(session, entry->lsm_tree));
        /* Failing to complete the switch is fine. */
        if (ret == EBUSY) {
            if (F_ISSET(entry->lsm_tree, WT_LSM_TREE_NEED_SWITCH))
                WT_ERR(__wt_lsm_manager_push_entry(session,
                    WT_LSM_WORK_SWITCH, 0, entry->lsm_tree));
            ret = 0;
        } else
            *ran = true;
    }

err:    __wt_lsm_manager_free_work_unit(session, entry);
    return (ret);
}
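/*
 * Usage pattern (a minimal sketch, not taken from the WiredTiger tree):
 * WT_WITH_SCHEMA_LOCK wraps a single expression and cannot itself return
 * an error, so callers capture the operation's return code inside the
 * macro and test it afterwards. The snippets collected below show both
 * macro signatures that appear across WiredTiger versions: an older
 * two-argument form and a newer form that names "ret" explicitly.
 */
WT_WITH_SCHEMA_LOCK(session,
    ret = __wt_schema_create(session, uri, config));
WT_ERR(ret);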
/*
 * __wt_session_compact --
 *     WT_SESSION->compact method.
 */
int
__wt_session_compact(
    WT_SESSION *wt_session, const char *uri, const char *config)
{
    WT_COMPACT compact;
    WT_CONFIG_ITEM cval;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL(session, compact, config, cfg);

    /* Set up the compaction structure in the session handle. */
    memset(&compact, 0, sizeof(WT_COMPACT));
    session->compact = &compact;

    /* Compaction can be time-limited. */
    WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval));
    session->compact->max_time = (uint64_t)cval.val;

    /* Find the types of data sources being compacted. */
    WT_WITH_SCHEMA_LOCK(session,
        ret = __wt_schema_worker(
            session, uri, NULL, __wt_compact_uri_analyze, cfg, 0));
    WT_ERR(ret);

    if (session->compact->lsm_count != 0)
        WT_ERR(__wt_schema_worker(
            session, uri, NULL, __wt_lsm_compact, cfg, 0));
    if (session->compact->file_count != 0)
        WT_ERR(__compact_file(session, uri, cfg));

err:    session->compact = NULL;
    API_END_RET_NOTFOUND_MAP(session, ret);
}
/*
 * __lsm_drop_file --
 *     Helper function to drop part of an LSM tree.
 */
static int
__lsm_drop_file(WT_SESSION_IMPL *session, const char *uri)
{
    WT_DECL_RET;
    const char *drop_cfg[] = {
        WT_CONFIG_BASE(session, WT_SESSION_drop), "remove_files=false",
        NULL
    };

    /*
     * We need to grab the schema lock to drop the file, so first try to
     * minimize the work required to free space in the cache. Only bother
     * trying to discard the checkpoint handle: the in-memory handle
     * should have been closed already.
     *
     * This will fail with EBUSY if the file is still in use.
     */
    WT_RET(__lsm_discard_handle(session, uri, WT_CHECKPOINT));

    /*
     * Take the schema lock for the drop operation, since __wt_schema_drop
     * results in the hot backup lock being taken when it updates the
     * metadata (which would be too late to prevent our drop).
     */
    WT_WITH_SCHEMA_LOCK(session,
        ret = __wt_schema_drop(session, uri, drop_cfg));

    if (ret == 0)
        ret = __wt_remove(session, uri + strlen("file:"));
    WT_RET(__wt_verbose(session, WT_VERB_LSM, "Dropped %s", uri));

    if (ret == EBUSY || ret == ENOENT)
        WT_RET(__wt_verbose(session, WT_VERB_LSM,
            "LSM worker drop of %s failed with %d", uri, ret));

    return (ret);
}
/*
 * __wt_curbackup_open --
 *     WT_SESSION->open_cursor method for the backup cursor type.
 */
int
__wt_curbackup_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
    WT_CURSOR_STATIC_INIT(iface,
        __wt_cursor_get_key,            /* get-key */
        __wt_cursor_get_value_notsup,   /* get-value */
        __wt_cursor_set_key_notsup,     /* set-key */
        __wt_cursor_set_value_notsup,   /* set-value */
        __wt_cursor_compare_notsup,     /* compare */
        __wt_cursor_equals_notsup,      /* equals */
        __curbackup_next,               /* next */
        __wt_cursor_notsup,             /* prev */
        __curbackup_reset,              /* reset */
        __wt_cursor_notsup,             /* search */
        __wt_cursor_search_near_notsup, /* search-near */
        __wt_cursor_notsup,             /* insert */
        __wt_cursor_modify_notsup,      /* modify */
        __wt_cursor_notsup,             /* update */
        __wt_cursor_notsup,             /* remove */
        __wt_cursor_notsup,             /* reserve */
        __wt_cursor_reconfigure_notsup, /* reconfigure */
        __curbackup_close);             /* close */
    WT_CURSOR *cursor;
    WT_CURSOR_BACKUP *cb;
    WT_DECL_RET;

    WT_STATIC_ASSERT(offsetof(WT_CURSOR_BACKUP, iface) == 0);

    cb = NULL;
    WT_RET(__wt_calloc_one(session, &cb));
    cursor = &cb->iface;
    *cursor = iface;
    cursor->session = &session->iface;
    session->bkp_cursor = cb;

    cursor->key_format = "S";   /* Return the file names as the key. */
    cursor->value_format = "";  /* No value. */

    /*
     * Start the backup and fill in the cursor's list. Acquire the schema
     * lock, we need a consistent view when creating a copy.
     */
    WT_WITH_CHECKPOINT_LOCK(session,
        WT_WITH_SCHEMA_LOCK(session,
            ret = __backup_start(session, cb, cfg)));
    WT_ERR(ret);

    WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

    if (0) {
err:        WT_TRET(__curbackup_close(cursor));
        *cursorp = NULL;
    }

    return (ret);
}
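/*
 * Application-side usage (a minimal sketch using only the public API):
 * open a backup cursor, iterate the file names it returns, and copy each
 * one to the backup directory. The copy_file() helper is hypothetical;
 * the backup cursor itself only hands back names.
 */
WT_CURSOR *backup;
const char *filename;
int ret;

ret = session->open_cursor(session, "backup:", NULL, NULL, &backup);
while (ret == 0 && (ret = backup->next(backup)) == 0) {
    (void)backup->get_key(backup, &filename);
    copy_file(filename, "/backup/dir");     /* hypothetical helper */
}
if (ret == WT_NOTFOUND)                     /* end of list, not an error */
    ret = 0;
(void)backup->close(backup);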
/*
 * __wt_clsm_open_bulk --
 *     WT_SESSION->open_cursor method for LSM bulk cursors.
 */
int
__wt_clsm_open_bulk(WT_CURSOR_LSM *clsm, const char *cfg[])
{
    WT_CURSOR *cursor, *bulk_cursor;
    WT_DECL_RET;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;

    bulk_cursor = NULL;
    cursor = &clsm->iface;
    lsm_tree = clsm->lsm_tree;
    session = (WT_SESSION_IMPL *)clsm->iface.session;

    F_SET(clsm, WT_CLSM_BULK);

    /* Bulk cursors are limited to insert and close. */
    __wt_cursor_set_notsup(cursor);
    cursor->insert = __clsm_insert_bulk;
    cursor->close = __clsm_close_bulk;

    /*
     * Set up the first chunk in the tree. This is the only time we switch
     * without using the LSM worker threads; it's safe to do here since we
     * have an exclusive lock on the LSM tree. We need to do this switch
     * inline, since switch needs a schema lock and online index creation
     * opens a bulk cursor while holding the schema lock.
     */
    WT_WITH_SCHEMA_LOCK(session, ret,
        ret = __wt_lsm_tree_switch(session, lsm_tree));
    WT_RET(ret);

    /*
     * Open a bulk cursor on the first chunk: it's not a regular LSM chunk
     * cursor, but it uses the standard storage locations. Allocate the
     * space for a bloom filter - it makes cleanup simpler. Cleaned up by
     * cursor close on error.
     */
    WT_RET(__wt_calloc_one(session, &clsm->blooms));
    clsm->bloom_alloc = 1;
    WT_RET(__wt_calloc_one(session, &clsm->cursors));
    clsm->cursor_alloc = 1;
    clsm->nchunks = 1;

    /*
     * Open a bulk cursor on the first chunk in the tree - take a read
     * lock on the LSM tree while we are opening the chunk, to ensure
     * that the first chunk has been fully created before we succeed.
     * Pass through the application config to ensure the tree is open
     * for bulk access.
     */
    WT_RET(__wt_open_cursor(session,
        lsm_tree->chunk[0]->uri, &clsm->iface, cfg, &bulk_cursor));
    clsm->cursors[0] = bulk_cursor;

    /* LSM cursors are always raw. */
    F_SET(bulk_cursor, WT_CURSTD_RAW);

    return (0);
}
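/*
 * Application-side usage (a minimal sketch): a bulk cursor is requested
 * through the ordinary open_cursor call with the "bulk" configuration;
 * it is only valid on an empty tree and only supports sorted inserts
 * followed by close.
 */
WT_CURSOR *bulk;

ret = session->open_cursor(session, "lsm:example", NULL, "bulk", &bulk);
if (ret == 0) {
    bulk->set_key(bulk, "key");
    bulk->set_value(bulk, "value");
    ret = bulk->insert(bulk);
    (void)bulk->close(bulk);
}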
/*
 * __compact_file --
 *     Compact a btree file (the "file:" portion of a compact operation).
 */
static int
__compact_file(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
{
    WT_DECL_RET;
    WT_DECL_ITEM(t);
    WT_SESSION *wt_session;
    WT_TXN *txn;
    struct timespec start_time;
    int i;

    txn = &session->txn;
    wt_session = &session->iface;

    /*
     * File compaction requires checkpoints, which will fail in a
     * transactional context. Check now so the error message isn't
     * confusing.
     */
    if (session->compact->file_count != 0 && F_ISSET(txn, TXN_RUNNING))
        WT_ERR_MSG(session, EINVAL,
            "file compaction not permitted in a transaction");

    /*
     * Force the checkpoint: we don't want to skip it because the work we
     * need to have done is done in the underlying block manager.
     */
    WT_ERR(__wt_scr_alloc(session, 128, &t));
    WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri));

    WT_ERR(__wt_epoch(session, &start_time));

    /*
     * We compact 10% of the file on each pass (but the overall size of
     * the file is decreasing each time, so we're not compacting 10% of
     * the original file each time). Try 100 times (which is clearly more
     * than we need); quit if we make no progress and check for a timeout
     * each time through the loop.
     */
    for (i = 0; i < 100; ++i) {
        WT_ERR(wt_session->checkpoint(wt_session, t->data));

        session->compaction = 0;
        WT_WITH_SCHEMA_LOCK(session,
            ret = __wt_schema_worker(
                session, uri, __wt_compact, NULL, cfg, 0));
        WT_ERR(ret);
        if (!session->compaction)
            break;

        WT_ERR(wt_session->checkpoint(wt_session, t->data));
        WT_ERR(wt_session->checkpoint(wt_session, t->data));
        WT_ERR(__session_compact_check_timeout(session, start_time));
    }

err:    __wt_scr_free(session, &t);
    return (ret);
}
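/*
 * Application-side usage (a minimal sketch): compaction is driven through
 * the public WT_SESSION->compact method; the "timeout" configuration (in
 * seconds) bounds how long the checkpoint-and-compact loop above may run.
 */
ret = session->compact(session, "table:example", "timeout=120");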
/*
 * __wt_curbackup_open --
 *     WT_SESSION->open_cursor method for the backup cursor type.
 */
int
__wt_curbackup_open(WT_SESSION_IMPL *session,
    const char *uri, const char *cfg[], WT_CURSOR **cursorp)
{
    WT_CURSOR_STATIC_INIT(iface,
        NULL,                   /* get-key */
        __wt_cursor_notsup,     /* get-value */
        __wt_cursor_notsup,     /* set-key */
        __wt_cursor_notsup,     /* set-value */
        NULL,                   /* compare */
        __curbackup_next,       /* next */
        __wt_cursor_notsup,     /* prev */
        __curbackup_reset,      /* reset */
        __wt_cursor_notsup,     /* search */
        __wt_cursor_notsup,     /* search-near */
        __wt_cursor_notsup,     /* insert */
        __wt_cursor_notsup,     /* update */
        __wt_cursor_notsup,     /* remove */
        __curbackup_close);     /* close */
    WT_CURSOR *cursor;
    WT_CURSOR_BACKUP *cb;
    WT_DECL_RET;

    STATIC_ASSERT(offsetof(WT_CURSOR_BACKUP, iface) == 0);

    cb = NULL;
    WT_RET(__wt_calloc_def(session, 1, &cb));
    cursor = &cb->iface;
    *cursor = iface;
    cursor->session = &session->iface;
    session->bkp_cursor = cb;

    cursor->key_format = "S";   /* Return the file names as the key. */
    cursor->value_format = "";  /* No value. */

    /*
     * Start the backup and fill in the cursor's list. Acquire the schema
     * lock, we need a consistent view when creating a copy.
     */
    WT_WITH_SCHEMA_LOCK(session, ret = __backup_start(session, cb, cfg));
    WT_ERR(ret);

    /* __wt_cursor_init is last so we don't have to clean up on error. */
    WT_ERR(__wt_cursor_init(cursor, uri, NULL, cfg, cursorp));

    if (0) {
err:        __wt_free(session, cb);
    }

    return (ret);
}
/*
 * __metadata_init --
 *     Create the metadata file.
 */
static int
__metadata_init(WT_SESSION_IMPL *session)
{
    WT_DECL_RET;

    /*
     * We're single-threaded, but acquire the schema lock regardless: the
     * lower level code checks that it is appropriately synchronized.
     */
    WT_WITH_SCHEMA_LOCK(session,
        ret = __wt_schema_create(session, WT_METAFILE_URI, NULL));

    return (ret);
}
/*
 * __session_verify --
 *     WT_SESSION->verify method.
 */
static int
__session_verify(WT_SESSION *wt_session, const char *uri, const char *config)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL(session, verify, config, cfg);

    WT_WITH_SCHEMA_LOCK(session,
        ret = __wt_schema_worker(session, uri, __wt_verify,
            cfg, WT_BTREE_EXCLUSIVE | WT_BTREE_VERIFY));

err:    API_END_NOTFOUND_MAP(session, ret);
}
/*
 * __session_drop --
 *     WT_SESSION->drop method.
 */
static int
__session_drop(WT_SESSION *wt_session, const char *uri, const char *config)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL(session, drop, config, cfg);

    WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_drop(session, uri, cfg));

err:    /* Note: drop operations cannot be unrolled (yet?). */
    API_END_NOTFOUND_MAP(session, ret);
}
/*
 * __session_compact_worker --
 *     Worker function to do the actual compaction call.
 */
static int
__session_compact_worker(
    WT_SESSION *wt_session, const char *uri, const char *config)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL(session, compact, config, cfg);

    WT_WITH_SCHEMA_LOCK(session,
        ret = __wt_schema_worker(session, uri, __wt_compact, cfg, 0));

err:    API_END_NOTFOUND_MAP(session, ret);
}
/*
 * __session_rename --
 *     WT_SESSION->rename method.
 */
static int
__session_rename(WT_SESSION *wt_session,
    const char *uri, const char *newname, const char *config)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL(session, rename, config, cfg);

    WT_WITH_SCHEMA_LOCK(session,
        ret = __wt_schema_rename(session, uri, newname, cfg));

err:    API_END_NOTFOUND_MAP(session, ret);
}
/*
 * __session_create --
 *     WT_SESSION->create method.
 */
static int
__session_create(WT_SESSION *wt_session, const char *uri, const char *config)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL(session, create, config, cfg);
    WT_UNUSED(cfg);

    /* Disallow objects in the WiredTiger name space. */
    WT_ERR(__wt_schema_name_check(session, uri));

    WT_WITH_SCHEMA_LOCK(session,
        ret = __wt_schema_create(session, uri, config));

err:    API_END_NOTFOUND_MAP(session, ret);
}
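/*
 * Application-side usage (a minimal sketch): the schema operations above
 * - create, drop, rename and verify - all serialize on the same schema
 * lock, so a long-running verify or drop delays concurrent schema
 * changes from other sessions.
 */
ret = session->create(session,
    "table:example", "key_format=S,value_format=S");
if (ret == 0)
    ret = session->verify(session, "table:example", NULL);
if (ret == 0)
    ret = session->rename(session, "table:example", "table:example2", NULL);
if (ret == 0)
    ret = session->drop(session, "table:example2", "remove_files=false");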
/*
 * __session_checkpoint --
 *     WT_SESSION->checkpoint method.
 */
static int
__session_checkpoint(WT_SESSION *wt_session, const char *config)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    WT_TXN *txn;

    session = (WT_SESSION_IMPL *)wt_session;
    txn = &session->txn;

    WT_CSTAT_INCR(session, checkpoint);
    SESSION_API_CALL(session, checkpoint, config, cfg);

    /*
     * Checkpoints require a snapshot to write a transactionally consistent
     * snapshot of the data.
     *
     * We can't use an application's transaction: if it has uncommitted
     * changes, they will be written in the checkpoint and may appear after
     * a crash.
     *
     * Use a real snapshot transaction: we don't want any chance of the
     * snapshot being updated during the checkpoint. Eviction is prevented
     * from evicting anything newer than this because we track the oldest
     * transaction ID in the system that is not visible to all readers.
     */
    if (F_ISSET(txn, TXN_RUNNING))
        WT_ERR_MSG(session, EINVAL,
            "Checkpoint not permitted in a transaction");

    /*
     * Reset open cursors.
     *
     * We do this here explicitly even though it will happen implicitly in
     * the call to begin_transaction for the checkpoint, in case some
     * implementation of WT_CURSOR::reset needs the schema lock.
     */
    WT_ERR(__session_reset_cursors(session));

    WT_WITH_SCHEMA_LOCK(session, ret = __wt_txn_checkpoint(session, cfg));

err:    API_END_NOTFOUND_MAP(session, ret);
}
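/*
 * Application-side usage (a minimal sketch): a checkpoint can be global
 * or targeted at specific objects, and "force" makes it run even when
 * nothing has changed - the same configuration string __compact_file
 * builds above.
 */
ret = session->checkpoint(session, "target=(\"table:example\"),force=true");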
/*
 * __wt_las_create --
 *     Initialize the database's lookaside store.
 */
int
__wt_las_create(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    uint32_t session_flags;
    const char *drop_cfg[] = {
        WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL
    };

    conn = S2C(session);

    /* Read-only and in-memory configurations don't need the LAS table. */
    if (F_ISSET(conn, WT_CONN_IN_MEMORY | WT_CONN_READONLY))
        return (0);

    /*
     * Done at startup: we cannot do it on demand because we require the
     * schema lock to create and drop the table, and it may not always be
     * available.
     *
     * Discard any previous incarnation of the table.
     */
    WT_WITH_SCHEMA_LOCK(session,
        ret = __wt_schema_drop(session, WT_LAS_URI, drop_cfg));
    WT_RET(ret);

    /* Re-create the table. */
    WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT));

    /*
     * Open a shared internal session used to access the lookaside table.
     * This session should never be tapped for eviction.
     */
    session_flags = WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION;
    WT_RET(__wt_open_internal_session(
        conn, "lookaside table", true, session_flags, &conn->las_session));

    /* Flag that the lookaside table has been created. */
    F_SET(conn, WT_CONN_LAS_OPEN);

    return (0);
}
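/*
 * Usage note (a minimal sketch): the "force=true" drop configuration used
 * above is also available to applications; it turns dropping an object
 * that does not exist into a no-op success instead of WT_NOTFOUND.
 */
ret = session->drop(session, "table:may_not_exist", "force=true");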
/*
 * __wt_clsm_init_merge --
 *     Initialize an LSM cursor for a merge.
 */
int
__wt_clsm_init_merge(
    WT_CURSOR *cursor, u_int start_chunk, uint32_t start_id, u_int nchunks)
{
    WT_CURSOR_LSM *clsm;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    clsm = (WT_CURSOR_LSM *)cursor;
    session = (WT_SESSION_IMPL *)cursor->session;

    F_SET(clsm, WT_CLSM_MERGE);
    if (start_chunk != 0)
        F_SET(clsm, WT_CLSM_MINOR_MERGE);
    clsm->nchunks = nchunks;

    WT_WITH_SCHEMA_LOCK(session,
        ret = __clsm_open_cursors(clsm, 0, start_chunk, start_id));

    return (ret);
}
/*
 * __curbackup_close --
 *     WT_CURSOR->close method for the backup cursor type.
 */
static int
__curbackup_close(WT_CURSOR *cursor)
{
    WT_CURSOR_BACKUP *cb;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    int tret;

    cb = (WT_CURSOR_BACKUP *)cursor;
    CURSOR_API_CALL(cursor, session, close, NULL);

    WT_TRET(__backup_cleanup_handles(session, cb));
    WT_TRET(__wt_cursor_close(cursor));
    session->bkp_cursor = NULL;

    /* Stop the backup. */
    WT_WITH_SCHEMA_LOCK(session, tret = __backup_stop(session));
    WT_TRET(tret);

err:    API_END_RET(session, ret);
}
/*
 * __wt_lsm_checkpoint_chunk --
 *     Flush a single LSM chunk to disk.
 */
int
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
    WT_DECL_RET;
    WT_TXN_ISOLATION saved_isolation;
    bool flush_set;

    flush_set = false;

    /*
     * If the chunk is already checkpointed, make sure it is also evicted.
     * Either way, there is no point trying to checkpoint it again.
     */
    if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
        !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
        !chunk->evicted) {
        WT_WITH_HANDLE_LIST_LOCK(session,
            ret = __lsm_discard_handle(session, chunk->uri, NULL));
        if (ret == 0)
            chunk->evicted = 1;
        else if (ret == EBUSY)
            ret = 0;
        else
            WT_RET_MSG(session, ret, "discard handle");
    }
    if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
        WT_RET(__wt_verbose(session, WT_VERB_LSM,
            "LSM worker %s already on disk", chunk->uri));
        return (0);
    }

    /* Stop if a running transaction needs the chunk. */
    __wt_txn_update_oldest(session, true);
    if (chunk->switch_txn == WT_TXN_NONE ||
        !__wt_txn_visible_all(session, chunk->switch_txn)) {
        WT_RET(__wt_verbose(session, WT_VERB_LSM,
            "LSM worker %s: running transaction, return", chunk->uri));
        return (0);
    }

    if (!__wt_atomic_cas8(&chunk->flushing, 0, 1))
        return (0);
    flush_set = true;

    WT_ERR(__wt_verbose(session, WT_VERB_LSM,
        "LSM worker flushing %s", chunk->uri));

    /*
     * Flush the file before checkpointing: this is the expensive part in
     * terms of I/O.
     *
     * !!!
     * We can wait here for checkpoints and fsyncs to complete, which can
     * take a long time.
     */
    if ((ret = __wt_session_get_btree(
        session, chunk->uri, NULL, NULL, 0)) == 0) {
        /*
         * Set read-uncommitted: we have already checked that all of the
         * updates in this chunk are globally visible, use the cheapest
         * possible check in reconciliation.
         */
        saved_isolation = session->txn.isolation;
        session->txn.isolation = WT_ISO_READ_UNCOMMITTED;
        ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
        session->txn.isolation = saved_isolation;
        WT_TRET(__wt_session_release_btree(session));
    }
    WT_ERR(ret);

    WT_ERR(__wt_verbose(session, WT_VERB_LSM,
        "LSM worker checkpointing %s", chunk->uri));

    /*
     * Turn on metadata tracking to ensure the checkpoint gets the
     * necessary handle locks.
     *
     * Ensure that we don't race with a running checkpoint: the checkpoint
     * lock protects against us racing with an application checkpoint in
     * this chunk. Don't wait for it, though: checkpoints can take a long
     * time, and our checkpoint operation should be very quick.
     */
    WT_ERR(__wt_meta_track_on(session));
    WT_WITH_CHECKPOINT_LOCK(session, ret,
        WT_WITH_SCHEMA_LOCK(session, ret,
            ret = __wt_schema_worker(
                session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)));
    WT_TRET(__wt_meta_track_off(session, false, ret != 0));
    if (ret != 0)
        WT_ERR_MSG(session, ret, "LSM checkpoint");

    /* Now the file is written, get the chunk size. */
    WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));

    /* Update the flush timestamp to help track ongoing progress. */
    WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts));
    ++lsm_tree->chunks_flushed;

    /* Lock the tree, mark the chunk as on disk and update the metadata. */
    WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
    F_SET(chunk, WT_LSM_CHUNK_ONDISK);
    ret = __wt_lsm_meta_write(session, lsm_tree);
    ++lsm_tree->dsk_gen;

    /* Update the throttle time. */
    __wt_lsm_tree_throttle(session, lsm_tree, true);
    WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
    if (ret != 0)
        WT_ERR_MSG(session, ret, "LSM metadata write");

    WT_PUBLISH(chunk->flushing, 0);
    flush_set = false;

    /*
     * Clear the no-eviction flag so the primary can be evicted and
     * eventually closed. Only do this once the checkpoint has succeeded:
     * otherwise, accessing the leaf page during the checkpoint can trigger
     * forced eviction.
     */
    WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
    __wt_btree_evictable(session, true);
    WT_ERR(__wt_session_release_btree(session));

    /* Make sure we aren't pinning a transaction ID. */
    __wt_txn_release_snapshot(session);

    WT_ERR(__wt_verbose(session, WT_VERB_LSM,
        "LSM worker checkpointed %s", chunk->uri));

    /* Schedule a bloom filter create for our newly flushed chunk. */
    if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
        WT_ERR(__wt_lsm_manager_push_entry(
            session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
    else
        WT_ERR(__wt_lsm_manager_push_entry(
            session, WT_LSM_WORK_MERGE, 0, lsm_tree));

err:    if (flush_set)
        WT_PUBLISH(chunk->flushing, 0);
    return (ret);
}
/*
 * __wt_lsm_checkpoint_chunk --
 *     Flush a single LSM chunk to disk.
 */
int
__wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
    WT_DECL_RET;
    WT_TXN_ISOLATION saved_isolation;

    /*
     * If the chunk is already checkpointed, make sure it is also evicted.
     * Either way, there is no point trying to checkpoint it again.
     */
    if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) &&
        !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) &&
        !chunk->evicted) {
        if ((ret = __lsm_discard_handle(
            session, chunk->uri, NULL)) == 0)
            chunk->evicted = 1;
        else if (ret == EBUSY)
            ret = 0;
        else
            WT_RET_MSG(session, ret, "discard handle");
    }
    if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
        WT_RET(__wt_verbose(session, WT_VERB_LSM,
            "LSM worker %s already on disk", chunk->uri));
        return (0);
    }

    /* Stop if a running transaction needs the chunk. */
    __wt_txn_update_oldest(session);
    if (chunk->switch_txn == WT_TXN_NONE ||
        !__wt_txn_visible_all(session, chunk->switch_txn)) {
        WT_RET(__wt_verbose(session, WT_VERB_LSM,
            "LSM worker %s: running transaction, return", chunk->uri));
        return (0);
    }

    WT_RET(__wt_verbose(session, WT_VERB_LSM,
        "LSM worker flushing %s", chunk->uri));

    /*
     * Flush the file before checkpointing: this is the expensive part in
     * terms of I/O.
     *
     * Use the special eviction isolation level to avoid interfering with
     * an application checkpoint: we have already checked that all of the
     * updates in this chunk are globally visible.
     *
     * !!! We can wait here for checkpoints and fsyncs to complete, which
     * can be a long time.
     */
    if ((ret = __wt_session_get_btree(
        session, chunk->uri, NULL, NULL, 0)) == 0) {
        saved_isolation = session->txn.isolation;
        session->txn.isolation = TXN_ISO_EVICTION;
        ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES);
        session->txn.isolation = saved_isolation;
        WT_TRET(__wt_session_release_btree(session));
    }
    WT_RET(ret);

    WT_RET(__wt_verbose(session, WT_VERB_LSM,
        "LSM worker checkpointing %s", chunk->uri));

    WT_WITH_SCHEMA_LOCK(session,
        ret = __wt_schema_worker(
            session, chunk->uri, __wt_checkpoint, NULL, NULL, 0));
    if (ret != 0)
        WT_RET_MSG(session, ret, "LSM checkpoint");

    /* Now the file is written, get the chunk size. */
    WT_RET(__wt_lsm_tree_set_chunk_size(session, chunk));

    /* Update the flush timestamp to help track ongoing progress. */
    WT_RET(__wt_epoch(session, &lsm_tree->last_flush_ts));

    /* Lock the tree, mark the chunk as on disk and update the metadata. */
    WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
    F_SET(chunk, WT_LSM_CHUNK_ONDISK);
    ret = __wt_lsm_meta_write(session, lsm_tree);
    ++lsm_tree->dsk_gen;

    /* Update the throttle time. */
    __wt_lsm_tree_throttle(session, lsm_tree, 1);
    WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
    if (ret != 0)
        WT_RET_MSG(session, ret, "LSM metadata write");

    /*
     * Clear the no-eviction flag so the primary can be evicted and
     * eventually closed. Only do this once the checkpoint has succeeded:
     * otherwise, accessing the leaf page during the checkpoint can trigger
     * forced eviction.
     */
    WT_RET(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0));
    __wt_btree_evictable(session, 1);
    WT_RET(__wt_session_release_btree(session));

    /* Make sure we aren't pinning a transaction ID. */
    __wt_txn_release_snapshot(session);

    WT_RET(__wt_verbose(session, WT_VERB_LSM,
        "LSM worker checkpointed %s", chunk->uri));

    /* Schedule a bloom filter create for our newly flushed chunk. */
    if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF))
        WT_RET(__wt_lsm_manager_push_entry(
            session, WT_LSM_WORK_BLOOM, 0, lsm_tree));
    else
        WT_RET(__wt_lsm_manager_push_entry(
            session, WT_LSM_WORK_MERGE, 0, lsm_tree));

    return (0);
}
/*
 * wiredtiger_open --
 *     Main library entry point: open a new connection to a WiredTiger
 *     database.
 */
int
wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler,
    const char *config, WT_CONNECTION **wt_connp)
{
    static WT_CONNECTION stdc = {
        __conn_close,
        __conn_reconfigure,
        __conn_get_home,
        __conn_is_new,
        __conn_open_session,
        __conn_load_extension,
        __conn_add_data_source,
        __conn_add_collator,
        __conn_add_compressor,
        __conn_add_extractor
    };
    static struct {
        const char *name;
        uint32_t flag;
    } *ft, directio_types[] = {
        { "data", WT_DIRECTIO_DATA },
        { "log", WT_DIRECTIO_LOG },
        { NULL, 0 }
    };
    WT_CONFIG subconfig;
    WT_CONFIG_ITEM cval, skey, sval;
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_ITEM *cbuf, expath, exconfig;
    WT_SESSION_IMPL *session;
    const char *cfg[] =
        { __wt_confdfl_wiredtiger_open, config, NULL, NULL, NULL };
    int exist;

    *wt_connp = NULL;
    session = NULL;
    cbuf = NULL;
    WT_CLEAR(expath);
    WT_CLEAR(exconfig);

    WT_RET(__wt_library_init());

    WT_RET(__wt_calloc_def(NULL, 1, &conn));
    conn->iface = stdc;

    /*
     * Immediately link the structure into the connection structure list:
     * the only thing ever looked at on that list is the database name,
     * and a NULL value is fine.
     */
    __wt_spin_lock(NULL, &__wt_process.spinlock);
    TAILQ_INSERT_TAIL(&__wt_process.connqh, conn, q);
    __wt_spin_unlock(NULL, &__wt_process.spinlock);

    session = conn->default_session = &conn->dummy_session;
    session->iface.connection = &conn->iface;
    session->name = "wiredtiger_open";
    __wt_event_handler_set(session, event_handler);

    /* Remaining basic initialization of the connection structure. */
    WT_ERR(__wt_connection_init(conn));

    /* Check the configuration strings. */
    WT_ERR(__wt_config_check(
        session, __wt_confchk_wiredtiger_open, config, 0));

    /* Get the database home. */
    WT_ERR(__conn_home(session, home, cfg));

    /* Make sure no other thread of control already owns this database. */
    WT_ERR(__conn_single(session, cfg));

    /* Read the database-home configuration file. */
    WT_ERR(__conn_config_file(session, cfg, &cbuf));

    /* Read the environment variable configuration. */
    WT_ERR(__conn_config_env(session, cfg));

    WT_ERR(__wt_config_gets(session, cfg, "hazard_max", &cval));
    conn->hazard_max = (uint32_t)cval.val;

    WT_ERR(__wt_config_gets(session, cfg, "session_max", &cval));
    conn->session_size = (uint32_t)cval.val + WT_NUM_INTERNAL_SESSIONS;

    WT_ERR(__wt_config_gets(session, cfg, "lsm_merge", &cval));
    if (cval.val)
        F_SET(conn, WT_CONN_LSM_MERGE);

    WT_ERR(__wt_config_gets(session, cfg, "sync", &cval));
    if (cval.val)
        F_SET(conn, WT_CONN_SYNC);

    WT_ERR(__wt_config_gets(session, cfg, "transactional", &cval));
    if (cval.val)
        F_SET(conn, WT_CONN_TRANSACTIONAL);

    /* Configure verbose flags. */
    WT_ERR(__conn_verbose_config(session, cfg));

    WT_ERR(__wt_conn_cache_pool_config(session, cfg));

    WT_ERR(__wt_config_gets(session, cfg, "logging", &cval));
    if (cval.val != 0)
        WT_ERR(__wt_open(
            session, WT_LOG_FILENAME, 1, 0, 0, &conn->log_fh));

    /* Configure direct I/O and buffer alignment. */
    WT_ERR(__wt_config_gets(session, cfg, "buffer_alignment", &cval));
    if (cval.val == -1)
        conn->buffer_alignment = WT_BUFFER_ALIGNMENT_DEFAULT;
    else
        conn->buffer_alignment = (size_t)cval.val;
#ifndef HAVE_POSIX_MEMALIGN
    if (conn->buffer_alignment != 0)
        WT_ERR_MSG(session, EINVAL,
            "buffer_alignment requires posix_memalign");
#endif

    /*
     * Configuration: direct_io, mmap, statistics.
     */
    WT_ERR(__wt_config_gets(session, cfg, "direct_io", &cval));
    for (ft = directio_types; ft->name != NULL; ft++) {
        ret = __wt_config_subgets(session, &cval, ft->name, &sval);
        if (ret == 0) {
            if (sval.val)
                FLD_SET(conn->direct_io, ft->flag);
        } else if (ret != WT_NOTFOUND)
            goto err;
    }
    WT_ERR(__wt_config_gets(session, cfg, "mmap", &cval));
    conn->mmap = cval.val == 0 ? 0 : 1;
    WT_ERR(__wt_config_gets(session, cfg, "statistics", &cval));
    conn->statistics = cval.val == 0 ? 0 : 1;

    /* Load any extensions referenced in the config. */
    WT_ERR(__wt_config_gets(session, cfg, "extensions", &cval));
    WT_ERR(__wt_config_subinit(session, &subconfig, &cval));
    while ((ret = __wt_config_next(&subconfig, &skey, &sval)) == 0) {
        WT_ERR(__wt_buf_fmt(
            session, &expath, "%.*s", (int)skey.len, skey.str));
        if (sval.len > 0)
            WT_ERR(__wt_buf_fmt(session, &exconfig,
                "entry=%.*s\n", (int)sval.len, sval.str));
        WT_ERR(conn->iface.load_extension(&conn->iface,
            expath.data, (sval.len > 0) ? exconfig.data : NULL));
    }
    WT_ERR_NOTFOUND_OK(ret);

    /*
     * Open the connection; if that fails, the connection handle has been
     * destroyed by the time the open function returns.
     */
    if ((ret = __wt_connection_open(conn, cfg)) != 0) {
        conn = NULL;
        WT_ERR(ret);
    }

    /* Open the default session. */
    WT_ERR(__wt_open_session(conn, 1, NULL, NULL, &conn->default_session));
    session = conn->default_session;

    /*
     * Check on the turtle and metadata files, creating them if necessary
     * (which avoids application threads racing to create the metadata file
     * later).
     */
    WT_ERR(__wt_meta_turtle_init(session, &exist));
    if (!exist) {
        /*
         * We're single-threaded, but acquire the schema lock
         * regardless: the lower level code checks that it is
         * appropriately synchronized.
         */
        WT_WITH_SCHEMA_LOCK(session,
            ret = __wt_schema_create(session, WT_METADATA_URI, NULL));
        WT_ERR(ret);
    }
    WT_ERR(__wt_metadata_open(session));

    /* If there's a hot-backup file, load it. */
    WT_ERR(__wt_metadata_load_backup(session));

    /*
     * XXX LSM initialization.
     * This is structured so that it could be moved to an extension.
     */
    WT_ERR(__wt_lsm_init(&conn->iface, NULL));

    STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0);
    *wt_connp = &conn->iface;

    /*
     * Destroying the connection on error will destroy our session handle,
     * cleanup using the session handle first, then discard the connection.
     */
err:    if (cbuf != NULL)
        __wt_buf_free(session, cbuf);
    __wt_buf_free(session, &expath);
    __wt_buf_free(session, &exconfig);

    if (ret != 0 && conn != NULL)
        WT_TRET(__wt_connection_destroy(conn));

    /* Let the server threads proceed. */
    if (ret == 0)
        conn->connection_initialized = 1;

    return (ret);
}
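/*
 * Application-side usage (a minimal sketch): open a connection, run the
 * schema operations shown elsewhere in this collection, and close. The
 * database path and configuration values are illustrative only.
 */
WT_CONNECTION *conn;
WT_SESSION *session;

if (wiredtiger_open("/path/to/db", NULL, "create,cache_size=100MB", &conn) != 0)
    return (EXIT_FAILURE);
if (conn->open_session(conn, NULL, NULL, &session) == 0)
    (void)session->create(session,
        "table:example", "key_format=S,value_format=S");
(void)conn->close(conn, NULL);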
/*
 * __wt_lsm_merge --
 *     Merge a set of chunks of an LSM tree.
 */
int
__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
{
    WT_BLOOM *bloom;
    WT_CURSOR *dest, *src;
    WT_DECL_RET;
    WT_ITEM key, value;
    WT_LSM_CHUNK *chunk;
    uint32_t generation;
    uint64_t insert_count, record_count;
    u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb;
    int tret;
    bool created_chunk, create_bloom, locked, in_sync;
    const char *cfg[3];
    const char *drop_cfg[] = {
        WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL
    };

    bloom = NULL;
    chunk = NULL;
    dest = src = NULL;
    start_id = 0;
    created_chunk = create_bloom = locked = in_sync = false;

    /* Fast path if it's obvious no merges could be done. */
    if (lsm_tree->nchunks < lsm_tree->merge_min &&
        lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD)
        return (WT_NOTFOUND);

    /*
     * Use the lsm_tree lock to read the chunks (so no switches occur), but
     * avoid holding it while the merge is in progress: that may take a
     * long time.
     */
    WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
    locked = true;

    WT_ERR(__lsm_merge_span(session,
        lsm_tree, id, &start_chunk, &end_chunk, &record_count));
    nchunks = (end_chunk + 1) - start_chunk;

    WT_ASSERT(session, nchunks > 0);
    start_id = lsm_tree->chunk[start_chunk]->id;

    /* Find the merge generation. */
    for (generation = 0, i = 0; i < nchunks; i++)
        generation = WT_MAX(generation,
            lsm_tree->chunk[start_chunk + i]->generation + 1);

    WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
    locked = false;

    /* Allocate an ID for the merge. */
    dest_id = __wt_atomic_add32(&lsm_tree->last, 1);

    /*
     * We only want to do the chunk loop if we're running with verbose,
     * so we wrap these statements in the conditional. Avoid the loop
     * in the normal path.
     */
    if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
        WT_ERR(__wt_verbose(session, WT_VERB_LSM,
            "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
            ", generation %" PRIu32,
            lsm_tree->name,
            start_chunk, end_chunk, dest_id, record_count, generation));
        for (verb = start_chunk; verb <= end_chunk; verb++)
            WT_ERR(__wt_verbose(session, WT_VERB_LSM,
                "Merging %s: Chunk[%u] id %u, gen: %" PRIu32
                ", size: %" PRIu64 ", records: %" PRIu64,
                lsm_tree->name, verb, lsm_tree->chunk[verb]->id,
                lsm_tree->chunk[verb]->generation,
                lsm_tree->chunk[verb]->size,
                lsm_tree->chunk[verb]->count));
    }

    WT_ERR(__wt_calloc_one(session, &chunk));
    created_chunk = true;
    chunk->id = dest_id;

    if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
        (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
        start_chunk > 0) && record_count > 0)
        create_bloom = true;

    /*
     * Special setup for the merge cursor:
     * first, reset to open the dependent cursors;
     * then restrict the cursor to a specific number of chunks;
     * then set MERGE so the cursor doesn't track updates to the tree.
     */
    WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
    F_SET(src, WT_CURSTD_RAW);
    WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));

    WT_WITH_SCHEMA_LOCK(session,
        ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
    WT_ERR(ret);
    if (create_bloom) {
        WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

        WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
            lsm_tree->bloom_config, record_count,
            lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count,
            &bloom));
    }

    /* Discard pages we read as soon as we're done with them. */
    F_SET(session, WT_SESSION_NO_CACHE);

    cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
    cfg[1] = "bulk,raw,skip_sort_check";
    cfg[2] = NULL;
    WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));

#define LSM_MERGE_CHECK_INTERVAL    WT_THOUSAND
    for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
        if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
            if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
                WT_ERR(EINTR);

            WT_STAT_FAST_CONN_INCRV(session,
                lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
            ++lsm_tree->merge_progressing;
        }

        WT_ERR(src->get_key(src, &key));
        dest->set_key(dest, &key);
        WT_ERR(src->get_value(src, &value));
        dest->set_value(dest, &value);
        WT_ERR(dest->insert(dest));
        if (create_bloom)
            WT_ERR(__wt_bloom_insert(bloom, &key));
    }
    WT_ERR_NOTFOUND_OK(ret);

    WT_STAT_FAST_CONN_INCRV(session,
        lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
    ++lsm_tree->merge_progressing;
    WT_ERR(__wt_verbose(session, WT_VERB_LSM,
        "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
        record_count, insert_count));

    /*
     * Closing and syncing the files can take a while. Set the
     * merge_syncing field so that compact knows it is still in
     * progress.
     */
    (void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1);
    in_sync = true;

    /*
     * We've successfully created the new chunk. Now install it. We need
     * to ensure that the NO_CACHE flag is cleared and the bloom filter
     * is closed (even if a step fails), so track errors but don't return
     * until we've cleaned up.
     */
    WT_TRET(src->close(src));
    WT_TRET(dest->close(dest));
    src = dest = NULL;

    F_CLR(session, WT_SESSION_NO_CACHE);

    /*
     * We're doing advisory reads to fault the new trees into cache.
     * Don't block if the cache is full: our next unit of work may be to
     * discard some trees to free space.
     */
    F_SET(session, WT_SESSION_NO_EVICTION);

    if (create_bloom) {
        if (ret == 0)
            WT_TRET(__wt_bloom_finalize(bloom));

        /*
         * Read in a key to make sure the Bloom filter's btree handle is
         * open before it becomes visible to application threads.
         * Otherwise application threads will stall while it is opened
         * and internal pages are read into cache.
         */
        if (ret == 0) {
            WT_CLEAR(key);
            WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
        }

        WT_TRET(__wt_bloom_close(bloom));
        bloom = NULL;
    }
    WT_ERR(ret);

    /*
     * Open a handle on the new chunk before application threads attempt
     * to access it, opening it pre-loads internal pages into the file
     * system cache.
     */
    cfg[1] = "checkpoint=" WT_CHECKPOINT;
    WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
    WT_TRET(dest->close(dest));
    dest = NULL;
    ++lsm_tree->merge_progressing;
    (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
    in_sync = false;
    WT_ERR_NOTFOUND_OK(ret);

    WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
    WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
    locked = true;

    /*
     * Check whether we raced with another merge, and adjust the chunk
     * array offset as necessary.
     */
    if (start_chunk >= lsm_tree->nchunks ||
        lsm_tree->chunk[start_chunk]->id != start_id)
        for (start_chunk = 0;
            start_chunk < lsm_tree->nchunks;
            start_chunk++)
            if (lsm_tree->chunk[start_chunk]->id == start_id)
                break;

    /*
     * It is safe to error out here - since the update can only fail
     * prior to making updates to the tree.
     */
    WT_ERR(__wt_lsm_merge_update_tree(
        session, lsm_tree, start_chunk, nchunks, chunk));

    if (create_bloom)
        F_SET(chunk, WT_LSM_CHUNK_BLOOM);
    chunk->count = insert_count;
    chunk->generation = generation;
    F_SET(chunk, WT_LSM_CHUNK_ONDISK);

    /*
     * We have no current way of continuing if the metadata update fails,
     * so we will panic in that case. Put some effort into cleaning up
     * after ourselves here - so things have a chance of shutting down.
     *
     * Any errors that happened after the tree was locked are
     * fatal - we can't guarantee the state of the tree.
     */
    if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
        WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");

    lsm_tree->dsk_gen++;

    /* Update the throttling while holding the tree lock. */
    __wt_lsm_tree_throttle(session, lsm_tree, true);

    /* Schedule a pass to discard old chunks. */
    WT_ERR(__wt_lsm_manager_push_entry(
        session, WT_LSM_WORK_DROP, 0, lsm_tree));

err:    if (locked)
        WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
    if (in_sync)
        (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
    if (src != NULL)
        WT_TRET(src->close(src));
    if (dest != NULL)
        WT_TRET(dest->close(dest));
    if (bloom != NULL)
        WT_TRET(__wt_bloom_close(bloom));
    if (ret != 0 && created_chunk) {
        /* Drop the newly-created files on error. */
        if (chunk->uri != NULL) {
            WT_WITH_SCHEMA_LOCK(session, tret =
                __wt_schema_drop(session, chunk->uri, drop_cfg));
            WT_TRET(tret);
        }
        if (create_bloom && chunk->bloom_uri != NULL) {
            WT_WITH_SCHEMA_LOCK(session, tret = __wt_schema_drop(
                session, chunk->bloom_uri, drop_cfg));
            WT_TRET(tret);
        }
        __wt_free(session, chunk->bloom_uri);
        __wt_free(session, chunk->uri);
        __wt_free(session, chunk);

        if (ret == EINTR)
            WT_TRET(__wt_verbose(session, WT_VERB_LSM,
                "Merge aborted due to close"));
        else
            WT_TRET(__wt_verbose(session, WT_VERB_LSM,
                "Merge failed with %s",
                __wt_strerror(session, ret, NULL, 0)));
    }
    F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
    return (ret);
}
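/*
 * Application-side usage (a minimal sketch): LSM trees are created with
 * the "lsm:" URI prefix; the merge machinery above then runs inside the
 * internal worker threads with no further application involvement. The
 * chunk_size value is illustrative only.
 */
ret = session->create(session,
    "lsm:example", "key_format=S,value_format=S,lsm=(chunk_size=10MB)");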
/*
 * __clsm_enter --
 *     Start an operation on an LSM cursor, update if the tree has changed.
 */
static inline int
__clsm_enter(WT_CURSOR_LSM *clsm, bool reset, bool update)
{
    WT_DECL_RET;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;
    WT_TXN *txn;
    uint64_t *switch_txnp;
    uint64_t snap_min;

    lsm_tree = clsm->lsm_tree;
    session = (WT_SESSION_IMPL *)clsm->iface.session;
    txn = &session->txn;

    /* Merge cursors never update. */
    if (F_ISSET(clsm, WT_CLSM_MERGE))
        return (0);

    if (reset) {
        WT_ASSERT(session, !F_ISSET(&clsm->iface,
            WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT));
        WT_RET(__clsm_reset_cursors(clsm, NULL));
    }

    for (;;) {
        /*
         * If the cursor looks up-to-date, check if the cache is full.
         * In case this call blocks, the check will be repeated before
         * proceeding.
         */
        if (clsm->dsk_gen != lsm_tree->dsk_gen &&
            lsm_tree->nchunks != 0)
            goto open;

        if (clsm->dsk_gen != lsm_tree->dsk_gen &&
            lsm_tree->nchunks != 0)
            goto open;

        /* Update the maximum transaction ID in the primary chunk. */
        if (update) {
            /* Ensure that there is a transaction snapshot active. */
            WT_RET(__wt_txn_autocommit_check(session));
            WT_RET(__wt_txn_id_check(session));

            WT_RET(__clsm_enter_update(clsm));
            if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
                goto open;

            if (txn->isolation == WT_ISO_SNAPSHOT)
                __wt_txn_cursor_op(session);

            /*
             * Figure out how many updates are required for
             * snapshot isolation.
             *
             * This is not a normal visibility check on the maximum
             * transaction ID in each chunk: any transaction ID
             * that overlaps with our snapshot is a potential
             * conflict.
             */
            clsm->nupdates = 1;
            if (txn->isolation == WT_ISO_SNAPSHOT &&
                F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
                WT_ASSERT(session,
                    F_ISSET(txn, WT_TXN_HAS_SNAPSHOT));
                snap_min = txn->snap_min;
                for (switch_txnp =
                    &clsm->switch_txn[clsm->nchunks - 2];
                    clsm->nupdates < clsm->nchunks;
                    clsm->nupdates++, switch_txnp--) {
                    if (WT_TXNID_LT(*switch_txnp, snap_min))
                        break;
                    WT_ASSERT(session,
                        !__wt_txn_visible_all(
                        session, *switch_txnp));
                }
            }
        }

        /*
         * Stop when we are up-to-date, as long as this is:
         *   - a snapshot isolation update and the cursor is set up for
         *     that;
         *   - an update operation with a primary chunk, or
         *   - a read operation and the cursor is open for reading.
         */
        if ((!update ||
            txn->isolation != WT_ISO_SNAPSHOT ||
            F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
            ((update && clsm->primary_chunk != NULL) ||
            (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
            break;

open:       WT_WITH_SCHEMA_LOCK(session,
            ret = __clsm_open_cursors(clsm, update, 0, 0));
        WT_RET(ret);
    }

    if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
        WT_RET(__cursor_enter(session));
        F_SET(clsm, WT_CLSM_ACTIVE);
    }

    return (0);
}
/*
 * __wt_session_compact --
 *     WT_SESSION.compact method.
 */
int
__wt_session_compact(
    WT_SESSION *wt_session, const char *uri, const char *config)
{
    WT_COMPACT_STATE compact;
    WT_CONFIG_ITEM cval;
    WT_DATA_SOURCE *dsrc;
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    u_int i;
    bool ignore_cache_size_set;

    ignore_cache_size_set = false;

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_API_CALL(session, compact, config, cfg);

    /*
     * The compaction thread should not block when the cache is full: it is
     * holding locks blocking checkpoints and once the cache is full, it
     * can spend a long time doing eviction.
     */
    if (!F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE)) {
        ignore_cache_size_set = true;
        F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE);
    }

    /* In-memory databases ignore compaction operations. */
    if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY))
        goto err;

    /*
     * Non-LSM object compaction requires checkpoints, which are impossible
     * in transactional contexts. Disallow in all contexts (there's no
     * reason for LSM to allow this, possible or not), and check now so the
     * error message isn't confusing.
     */
    WT_ERR(__wt_txn_context_check(session, false));

    /* Disallow objects in the WiredTiger name space. */
    WT_ERR(__wt_str_name_check(session, uri));

    if (!WT_PREFIX_MATCH(uri, "colgroup:") &&
        !WT_PREFIX_MATCH(uri, "file:") &&
        !WT_PREFIX_MATCH(uri, "index:") &&
        !WT_PREFIX_MATCH(uri, "lsm:") &&
        !WT_PREFIX_MATCH(uri, "table:")) {
        if ((dsrc = __wt_schema_get_source(session, uri)) != NULL)
            ret = dsrc->compact == NULL ?
                __wt_object_unsupported(session, uri) :
                dsrc->compact(
                    dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg);
        else
            ret = __wt_bad_object_type(session, uri);
        goto err;
    }

    /* Set up the session handle's compaction state structure. */
    memset(&compact, 0, sizeof(WT_COMPACT_STATE));
    session->compact = &compact;

    /* Compaction can be time-limited. */
    WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval));
    session->compact->max_time = (uint64_t)cval.val;

    __wt_epoch(session, &session->compact->begin);

    /* Find the types of data sources being compacted. */
    WT_WITH_SCHEMA_LOCK(session,
        ret = __wt_schema_worker(session, uri,
            __compact_handle_append, __compact_uri_analyze, cfg, 0));
    WT_ERR(ret);

    if (session->compact->lsm_count != 0)
        WT_ERR(__wt_schema_worker(
            session, uri, NULL, __wt_lsm_compact, cfg, 0));
    if (session->compact->file_count != 0)
        WT_ERR(__compact_worker(session));

err:    session->compact = NULL;

    for (i = 0; i < session->op_handle_next; ++i) {
        WT_WITH_DHANDLE(session, session->op_handle[i],
            WT_TRET(__compact_end(session)));
        WT_WITH_DHANDLE(session, session->op_handle[i],
            WT_TRET(__wt_session_release_dhandle(session)));
    }
    __wt_free(session, session->op_handle);
    session->op_handle_allocated = session->op_handle_next = 0;

    /*
     * Release common session resources (for example, checkpoint may
     * acquire significant reconciliation structures/memory).
     */
    WT_TRET(__wt_session_release_resources(session));

    if (ignore_cache_size_set)
        F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE);

    if (ret != 0)
        WT_STAT_CONN_INCR(session, session_table_compact_fail);
    else
        WT_STAT_CONN_INCR(session, session_table_compact_success);
    API_END_RET_NOTFOUND_MAP(session, ret);
}
/*
 * __wt_lsm_checkpoint_worker --
 *     A worker thread for an LSM tree, responsible for flushing new chunks
 *     to disk.
 */
void *
__wt_lsm_checkpoint_worker(void *arg)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    WT_LSM_WORKER_COOKIE cookie;
    WT_SESSION_IMPL *session;
    WT_TXN_ISOLATION saved_isolation;
    u_int i, j;
    int locked;

    lsm_tree = arg;
    session = lsm_tree->ckpt_session;

    WT_CLEAR(cookie);

    while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
        if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
            WT_WITH_SCHEMA_LOCK(session,
                ret = __wt_lsm_tree_switch(session, lsm_tree));
            WT_ERR(ret);
        }

        WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

        /* Write checkpoints in all completed files. */
        for (i = 0, j = 0; i < cookie.nchunks - 1; i++) {
            if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
                goto err;

            if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
                break;

            chunk = cookie.chunk_array[i];

            /* Stop if a running transaction needs the chunk. */
            __wt_txn_update_oldest(session);
            if (!__wt_txn_visible_all(session, chunk->txnid_max))
                break;

            /*
             * If the chunk is already checkpointed, make sure it
             * is also evicted. Either way, there is no point
             * trying to checkpoint it again.
             */
            if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) {
                if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_EVICTED))
                    continue;

                if ((ret = __lsm_discard_handle(
                    session, chunk->uri, NULL)) == 0)
                    F_SET_ATOMIC(
                        chunk, WT_LSM_CHUNK_EVICTED);
                else if (ret == EBUSY)
                    ret = 0;
                else
                    WT_ERR_MSG(session, ret, "discard handle");
                continue;
            }

            WT_VERBOSE_ERR(session, lsm,
                "LSM worker flushing %u", i);

            /*
             * Flush the file before checkpointing: this is the
             * expensive part in terms of I/O: do it without
             * holding the schema lock.
             *
             * Use the special eviction isolation level to avoid
             * interfering with an application checkpoint: we have
             * already checked that all of the updates in this
             * chunk are globally visible.
             *
             * !!! We can wait here for checkpoints and fsyncs to
             * complete, which can be a long time.
             *
             * Don't keep waiting for the lock if application
             * threads are waiting for a switch. Don't skip
             * flushing the leaves either: that just means we'll
             * hold the schema lock for (much) longer, which blocks
             * the world.
             */
            WT_ERR(__wt_session_get_btree(
                session, chunk->uri, NULL, NULL, 0));
            for (locked = 0; !locked && ret == 0 &&
                !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) {
                if ((ret = __wt_spin_trylock(session,
                    &S2C(session)->checkpoint_lock)) == 0)
                    locked = 1;
                else if (ret == EBUSY) {
                    __wt_yield();
                    ret = 0;
                }
            }
            if (locked) {
                saved_isolation = session->txn.isolation;
                session->txn.isolation = TXN_ISO_EVICTION;
                ret = __wt_bt_cache_op(
                    session, NULL, WT_SYNC_WRITE_LEAVES);
                session->txn.isolation = saved_isolation;
                __wt_spin_unlock(
                    session, &S2C(session)->checkpoint_lock);
            }
            WT_TRET(__wt_session_release_btree(session));
            WT_ERR(ret);

            if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
                break;

            WT_VERBOSE_ERR(session, lsm,
                "LSM worker checkpointing %u", i);

            WT_WITH_SCHEMA_LOCK(session,
                ret = __wt_schema_worker(session, chunk->uri,
                    __wt_checkpoint, NULL, NULL, 0));

            if (ret != 0) {
                __wt_err(session, ret, "LSM checkpoint");
                break;
            }

            WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));

            /*
             * Clear the "cache resident" flag so the primary can
             * be evicted and eventually closed. Only do this once
             * the checkpoint has succeeded: otherwise, accessing
             * the leaf page during the checkpoint can trigger
             * forced eviction.
             */
            WT_ERR(__wt_session_get_btree(
                session, chunk->uri, NULL, NULL, 0));
            __wt_btree_evictable(session, 1);
            WT_ERR(__wt_session_release_btree(session));

            ++j;
            WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
            F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK);
            ret = __wt_lsm_meta_write(session, lsm_tree);
            ++lsm_tree->dsk_gen;

            /* Update the throttle time. */
            __wt_lsm_tree_throttle(session, lsm_tree);
            WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

            /* Make sure we aren't pinning a transaction ID. */
            __wt_txn_release_snapshot(session);

            if (ret != 0) {
                __wt_err(session, ret,
                    "LSM checkpoint metadata write");
                break;
            }

            WT_VERBOSE_ERR(session, lsm,
                "LSM worker checkpointed %u", i);
        }
        __lsm_unpin_chunks(session, &cookie);
        if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
            !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
            WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
                session, lsm_tree->work_cond, 100000));
    }

err:    __lsm_unpin_chunks(session, &cookie);
    __wt_free(session, cookie.chunk_array);

    /*
     * The thread will only exit with failure if we run out of memory or
     * there is some other system driven failure. We can't keep going
     * after such a failure - ensure WiredTiger shuts down.
     */
    if (ret != 0 && ret != WT_NOTFOUND)
        WT_PANIC_ERR(session, ret,
            "Shutting down LSM checkpoint utility thread");
    return (NULL);
}
/*
 * __wt_lsm_merge_worker --
 *     The merge worker thread for an LSM tree, responsible for merging
 *     on-disk trees.
 */
void *
__wt_lsm_merge_worker(void *vargs)
{
    WT_DECL_RET;
    WT_LSM_WORKER_ARGS *args;
    WT_LSM_TREE *lsm_tree;
    WT_SESSION_IMPL *session;
    u_int aggressive, chunk_wait, id, old_aggressive, stallms;
    int progress;

    args = vargs;
    lsm_tree = args->lsm_tree;
    id = args->id;
    session = lsm_tree->worker_sessions[id];
    __wt_free(session, args);

    aggressive = stallms = 0;

    while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
        /*
         * Help out with switching chunks in case the checkpoint worker
         * is busy.
         */
        if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
            WT_WITH_SCHEMA_LOCK(session,
                ret = __wt_lsm_tree_switch(session, lsm_tree));
            WT_ERR(ret);
        }

        progress = 0;

        /* Clear any state from previous worker thread iterations. */
        session->dhandle = NULL;

        /* Try to create a Bloom filter. */
        if (__lsm_bloom_work(session, lsm_tree) == 0)
            progress = 1;

        /* If we didn't create a Bloom filter, try to merge. */
        if (progress == 0 &&
            __wt_lsm_merge(session, lsm_tree, id, aggressive) == 0)
            progress = 1;

        /* Clear any state from previous worker thread iterations. */
        WT_CLEAR_BTREE_IN_SESSION(session);

        /*
         * Only have one thread freeing old chunks, and only if there
         * are chunks to free.
         */
        if (id == 0 && lsm_tree->nold_chunks > 0 &&
            __lsm_free_chunks(session, lsm_tree) == 0)
            progress = 1;

        if (progress)
            stallms = 0;
        else if (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
            !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
            /* Poll 10 times per second. */
            WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
                session, lsm_tree->work_cond, 100000));
            stallms += 100;

            /*
             * Get aggressive if more than enough chunks for a
             * merge should have been created while we waited.
             * Use 10 seconds as a default if we don't have an
             * estimate.
             */
            chunk_wait = stallms / (lsm_tree->chunk_fill_ms == 0 ?
                10000 : lsm_tree->chunk_fill_ms);
            old_aggressive = aggressive;
            aggressive = chunk_wait / lsm_tree->merge_min;

            if (aggressive > old_aggressive)
                WT_VERBOSE_ERR(session, lsm,
                    "LSM merge got aggressive (%u), "
                    "%u / %" PRIu64,
                    aggressive, stallms,
                    lsm_tree->chunk_fill_ms);
        }
    }

    if (0) {
err:        __wt_err(session, ret, "LSM merge worker failed");
    }

    return (NULL);
}
/*
 * __session_truncate --
 *     WT_SESSION->truncate method.
 */
static int
__session_truncate(WT_SESSION *wt_session,
    const char *uri, WT_CURSOR *start, WT_CURSOR *stop, const char *config)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;
    WT_CURSOR *cursor;
    int cmp;

    session = (WT_SESSION_IMPL *)wt_session;
    SESSION_TXN_API_CALL(session, truncate, config, cfg);

    /*
     * If the URI is specified, we don't need a start/stop, if start/stop
     * is specified, we don't need a URI.
     *
     * If no URI is specified, and both cursors are specified, start/stop
     * must reference the same object.
     *
     * Any specified cursor must have been initialized.
     */
    if ((uri == NULL && start == NULL && stop == NULL) ||
        (uri != NULL && (start != NULL || stop != NULL)))
        WT_ERR_MSG(session, EINVAL,
            "the truncate method should be passed either a URI or "
            "start/stop cursors, but not both");

    if (uri != NULL) {
        WT_WITH_SCHEMA_LOCK(session,
            ret = __wt_schema_truncate(session, uri, cfg));
        goto done;
    }

    /* Truncate is only supported for file and table objects. */
    cursor = start == NULL ? stop : start;
    if (!WT_PREFIX_MATCH(cursor->uri, "file:") &&
        !WT_PREFIX_MATCH(cursor->uri, "table:"))
        WT_ERR(__wt_bad_object_type(session, cursor->uri));

    /*
     * If both cursors set, check they're correctly ordered with respect to
     * each other. We have to test this before any search, the search can
     * change the initial cursor position.
     *
     * Rather happily, the compare routine will also confirm the cursors
     * reference the same object and the keys are set.
     */
    if (start != NULL && stop != NULL) {
        WT_ERR(start->compare(start, stop, &cmp));
        if (cmp > 0)
            WT_ERR_MSG(session, EINVAL,
                "the start cursor position is after the stop "
                "cursor position");
    }

    /*
     * Truncate does not require keys actually exist so that applications
     * can discard parts of the object's name space without knowing exactly
     * what records currently appear in the object. For this reason, do a
     * search-near, rather than a search. Additionally, we have to correct
     * after calling search-near, to position the start/stop cursors on the
     * next record greater than/less than the original key. If the cursors
     * hit the beginning/end of the object, or the start/stop keys cross,
     * we're done, the range must be empty.
     */
    if (start != NULL) {
        WT_ERR(start->search_near(start, &cmp));
        if (cmp < 0 && (ret = start->next(start)) != 0) {
            WT_ERR_NOTFOUND_OK(ret);
            goto done;
        }
    }
    if (stop != NULL) {
        WT_ERR(stop->search_near(stop, &cmp));
        if (cmp > 0 && (ret = stop->prev(stop)) != 0) {
            WT_ERR_NOTFOUND_OK(ret);
            goto done;
        }

        if (start != NULL) {
            WT_ERR(start->compare(start, stop, &cmp));
            if (cmp > 0)
                goto done;
        }
    }

    if (WT_PREFIX_MATCH(cursor->uri, "file:"))
        WT_ERR(__wt_curfile_truncate(session, start, stop));
    else
        WT_ERR(__wt_curtable_truncate(session, start, stop));

done:
err:    TXN_API_END_NOTFOUND_MAP(session, ret);
}
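/*
 * Application-side usage (a minimal sketch): a whole-object truncate goes
 * through the schema-lock path above, while a cursor-bounded truncate
 * needs no URI at all. The key values are illustrative only.
 */
WT_CURSOR *start, *stop;

ret = session->truncate(session, "table:example", NULL, NULL, NULL);

/* Or: truncate a key range bounded by two positioned cursors. */
ret = session->open_cursor(session, "table:example", NULL, NULL, &start);
if (ret == 0)
    ret = session->open_cursor(session, "table:example", NULL, NULL, &stop);
if (ret == 0) {
    start->set_key(start, "k0100");
    stop->set_key(stop, "k0199");
    ret = session->truncate(session, NULL, start, stop, NULL);
}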
/*
 * __clsm_enter --
 *     Start an operation on an LSM cursor, update if the tree has changed.
 */
static inline int
__clsm_enter(WT_CURSOR_LSM *clsm, int reset, int update)
{
    WT_CURSOR *c;
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_SESSION_IMPL *session;
    uint64_t *txnid_maxp;
    uint64_t id, myid, snap_min;

    session = (WT_SESSION_IMPL *)clsm->iface.session;

    /* Merge cursors never update. */
    if (F_ISSET(clsm, WT_CLSM_MERGE))
        return (0);

    if (reset) {
        c = &clsm->iface;
        /* Copy out data before resetting chunk cursors. */
        if (F_ISSET(c, WT_CURSTD_KEY_INT) &&
            !WT_DATA_IN_ITEM(&c->key))
            WT_RET(__wt_buf_set(
                session, &c->key, c->key.data, c->key.size));
        if (F_ISSET(c, WT_CURSTD_VALUE_INT) &&
            !WT_DATA_IN_ITEM(&c->value))
            WT_RET(__wt_buf_set(
                session, &c->value, c->value.data, c->value.size));
        WT_RET(__clsm_reset_cursors(clsm, NULL));
    }

    for (;;) {
        /*
         * If the cursor looks up-to-date, check if the cache is full.
         * In case this call blocks, the check will be repeated before
         * proceeding.
         */
        if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
            goto open;

        WT_RET(__wt_cache_full_check(session));

        if (clsm->dsk_gen != clsm->lsm_tree->dsk_gen)
            goto open;

        /* Update the maximum transaction ID in the primary chunk. */
        if (update && (chunk = clsm->primary_chunk) != NULL) {
            WT_RET(__wt_txn_autocommit_check(session));
            for (id = chunk->txnid_max, myid = session->txn.id;
                !TXNID_LE(myid, id);
                id = chunk->txnid_max) {
                WT_ASSERT(session, myid != WT_TXN_NONE);
                (void)WT_ATOMIC_CAS(
                    chunk->txnid_max, id, myid);
            }
        }

        /*
         * Figure out how many updates are required for snapshot
         * isolation.
         *
         * This is not a normal visibility check on the maximum
         * transaction ID in each chunk: any transaction ID that
         * overlaps with our snapshot is a potential conflict.
         */
        clsm->nupdates = 1;
        if (session->txn.isolation == TXN_ISO_SNAPSHOT &&
            F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) {
            snap_min = session->txn.snap_min;
            for (txnid_maxp = &clsm->txnid_max[clsm->nchunks - 2];
                clsm->nupdates < clsm->nchunks;
                clsm->nupdates++, txnid_maxp--)
                if (TXNID_LT(*txnid_maxp, snap_min))
                    break;
        }

        /*
         * Stop when we are up-to-date, as long as this is:
         *   - a snapshot isolation update and the cursor is set up for
         *     that;
         *   - an update operation with a primary chunk, or
         *   - a read operation and the cursor is open for reading.
         */
        if ((!update ||
            session->txn.isolation != TXN_ISO_SNAPSHOT ||
            F_ISSET(clsm, WT_CLSM_OPEN_SNAPSHOT)) &&
            ((update && clsm->primary_chunk != NULL) ||
            (!update && F_ISSET(clsm, WT_CLSM_OPEN_READ))))
            break;

open:       WT_WITH_SCHEMA_LOCK(session,
            ret = __clsm_open_cursors(clsm, update, 0, 0));
        WT_RET(ret);
    }

    if (!F_ISSET(clsm, WT_CLSM_ACTIVE)) {
        WT_RET(__cursor_enter(session));
        F_SET(clsm, WT_CLSM_ACTIVE);
    }

    return (0);
}