/*__wt_session_compact*/ int __wt_session_compact(WT_SESSION *wt_session, const char *uri, const char *config) { WT_COMPACT compact; WT_CONFIG_ITEM cval; WT_DECL_RET; WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, compact, config, cfg); /* Setup the structure in the session handle */ memset(&compact, 0, sizeof(WT_COMPACT)); session->compact = &compact; WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval)); session->compact->max_time = (uint64_t)cval.val; /* Find the types of data sources are being compacted. */ WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, uri, NULL, __wt_compact_uri_analyze, cfg, 0)); WT_ERR(ret); if (session->compact->lsm_count != 0) WT_ERR(__wt_schema_worker(session, uri, NULL, __wt_lsm_compact, cfg, 0)); if (session->compact->file_count != 0) WT_ERR(__compact_file(session, uri, cfg)); err: session->compact = NULL; API_END_RET_NOTFOUND_MAP(session, ret); }
/* * __wt_lsm_tree_worker -- * Run a schema worker operation on each level of a LSM tree. */ int __wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, int *), const char *cfg[], uint32_t open_flags) { WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; u_int i; int exclusive, locked; locked = 0; exclusive = FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE) ? 1 : 0; WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree)); /* * We mark that we're busy using the tree to coordinate * with merges so that merging doesn't change the chunk * array out from underneath us. */ WT_ERR(exclusive ? __wt_lsm_tree_writelock(session, lsm_tree) : __wt_lsm_tree_readlock(session, lsm_tree)); locked = 1; for (i = 0; i < lsm_tree->nchunks; i++) { chunk = lsm_tree->chunk[i]; if (file_func == __wt_checkpoint && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) continue; WT_ERR(__wt_schema_worker(session, chunk->uri, file_func, name_func, cfg, open_flags)); if (name_func == __wt_backup_list_uri_append && F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) WT_ERR(__wt_schema_worker(session, chunk->bloom_uri, file_func, name_func, cfg, open_flags)); } err: if (locked) WT_TRET(exclusive ? __wt_lsm_tree_writeunlock(session, lsm_tree) : __wt_lsm_tree_readunlock(session, lsm_tree)); __wt_lsm_tree_release(session, lsm_tree); return (ret); }
/* * __backup_uri -- * Backup a list of objects. */ static int __backup_uri(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp, int *log_only) { WT_CONFIG targetconf; WT_CONFIG_ITEM cval, k, v; WT_DECL_ITEM(tmp); WT_DECL_RET; int target_list; const char *uri; *foundp = 0; *log_only = 0; /* * If we find a non-empty target configuration string, we have a job, * otherwise it's not our problem. */ WT_RET(__wt_config_gets(session, cfg, "target", &cval)); WT_RET(__wt_config_subinit(session, &targetconf, &cval)); for (cb->list_next = 0, target_list = 0; (ret = __wt_config_next(&targetconf, &k, &v)) == 0; ++target_list) { /* If it is our first time through, allocate. */ if (target_list == 0) { *foundp = 1; WT_ERR(__wt_scr_alloc(session, 512, &tmp)); } WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str)); uri = tmp->data; if (v.len != 0) WT_ERR_MSG(session, EINVAL, "%s: invalid backup target: URIs may need quoting", uri); /* * Handle log targets. We do not need to go through the * schema worker, just call the function to append them. * Set log_only only if it is our only URI target. */ if (WT_PREFIX_MATCH(uri, "log:")) { if (target_list == 0) *log_only = 1; else *log_only = 0; WT_ERR(__wt_backup_list_uri_append( session, uri, NULL)); } else WT_ERR(__wt_schema_worker(session, uri, NULL, __wt_backup_list_uri_append, cfg, 0)); } WT_ERR_NOTFOUND_OK(ret); err: __wt_scr_free(session, &tmp); return (ret); }
/*btree file的compact操作*/ static int __compact_file(WT_SESSION_IMPL* session, const char* uri, const char* cfg[]) { WT_DECL_RET; WT_DECL_ITEM(t); WT_SESSION *wt_session; WT_TXN *txn; int i; struct timespec start_time; txn = &session->txn; wt_session = &session->iface; /* * File compaction requires checkpoints, which will fail in a * transactional context. Check now so the error message isn't * confusing. */ if(session->compact->file_count != 0 && F_ISSET(txn, TXN_RUNNING)) WT_ERR_MSG(session, EINVAL, " File compaction not permitted in a transaction"); /* * Force the checkpoint: we don't want to skip it because the work we * need to have done is done in the underlying block manager. */ WT_ERR(__wt_scr_alloc(session, 128, &t)); WT_ERR(__wt_buf_fmt(session, t, "target=(\"%s\"),force=1", uri)); WT_ERR(__wt_epoch(session, &start_time)); /* * We compact 10% of the file on each pass (but the overall size of the * file is decreasing each time, so we're not compacting 10% of the * original file each time). Try 100 times (which is clearly more than * we need); quit if we make no progress and check for a timeout each * time through the loop. */ for (i = 0; i < 100; ++i) { WT_ERR(wt_session->checkpoint(wt_session, t->data)); session->compaction = 0; WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, uri, __wt_compact, NULL, cfg, 0)); WT_ERR(ret); if (!session->compaction) break; WT_ERR(wt_session->checkpoint(wt_session, t->data)); WT_ERR(wt_session->checkpoint(wt_session, t->data)); WT_ERR(__session_compact_check_timeout(session, start_time)); } err: __wt_scr_free(session, &t); }
/* * __session_sync -- * WT_SESSION->sync method. */ static int __session_sync(WT_SESSION *wt_session, const char *uri, const char *config) { WT_SESSION_IMPL *session; int ret; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, sync, config, cfg); ret = __wt_schema_worker(session, uri, cfg, __wt_btree_sync, 0); err: API_END_NOTFOUND_MAP(session, ret); }
/* * __session_dumpfile -- * WT_SESSION->dumpfile method. */ static int __session_dumpfile(WT_SESSION *wt_session, const char *uri, const char *config) { WT_SESSION_IMPL *session; int ret; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, dumpfile, config, cfg); ret = __wt_schema_worker(session, uri, cfg, __wt_dumpfile, WT_BTREE_EXCLUSIVE | WT_BTREE_VERIFY); err: API_END_NOTFOUND_MAP(session, ret); }
/* * __session_verify -- * WT_SESSION->verify method. */ static int __session_verify(WT_SESSION *wt_session, const char *uri, const char *config) { WT_DECL_RET; WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, verify, config, cfg); WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, uri, __wt_verify, cfg, WT_BTREE_EXCLUSIVE | WT_BTREE_VERIFY)); err: API_END_NOTFOUND_MAP(session, ret); }
/* * __session_compact_worker -- * Worker function to do the actual compaction call. */ static int __session_compact_worker( WT_SESSION *wt_session, const char *uri, const char *config) { WT_DECL_RET; WT_SESSION_IMPL *session; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, compact, config, cfg); WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, uri, __wt_compact, cfg, 0)); err: API_END_NOTFOUND_MAP(session, ret); }
/* * __backup_uri -- * Backup a list of objects. */ static int __backup_uri(WT_SESSION_IMPL *session, WT_CURSOR_BACKUP *cb, const char *cfg[], int *foundp) { WT_CONFIG targetconf; WT_CONFIG_ITEM cval, k, v; WT_DECL_ITEM(tmp); WT_DECL_RET; int target_list; const char *uri; *foundp = target_list = 0; /* * If we find a non-empty target configuration string, we have a job, * otherwise it's not our problem. */ WT_RET(__wt_config_gets(session, cfg, "target", &cval)); WT_RET(__wt_config_subinit(session, &targetconf, &cval)); for (cb->list_next = 0; (ret = __wt_config_next(&targetconf, &k, &v)) == 0;) { if (!target_list) { target_list = *foundp = 1; WT_ERR(__wt_scr_alloc(session, 512, &tmp)); } WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str)); uri = tmp->data; if (v.len != 0) WT_ERR_MSG(session, EINVAL, "%s: invalid backup target: URIs may need quoting", uri); WT_ERR(__wt_schema_worker( session, uri, NULL, __wt_backup_list_uri_append, cfg, 0)); } WT_ERR_NOTFOUND_OK(ret); err: __wt_scr_free(&tmp); return (ret); }
/* * __wt_lsm_tree_worker -- * Run a schema worker operation on each level of a LSM tree. */ int __wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[], uint32_t open_flags) { WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; u_int i; WT_RET(__wt_lsm_tree_get(session, uri, FLD_ISSET(open_flags, WT_BTREE_EXCLUSIVE) ? 1 : 0, &lsm_tree)); for (i = 0; i < lsm_tree->nchunks; i++) { chunk = lsm_tree->chunk[i]; if (func == __wt_checkpoint && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) continue; WT_ERR(__wt_schema_worker( session, chunk->uri, func, cfg, open_flags)); } err: __wt_lsm_tree_release(session, lsm_tree); return (ret); }
/* * __wt_session_compact -- * WT_SESSION.compact method. */ int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config) { WT_COMPACT_STATE compact; WT_CONFIG_ITEM cval; WT_DATA_SOURCE *dsrc; WT_DECL_RET; WT_SESSION_IMPL *session; u_int i; bool ignore_cache_size_set; ignore_cache_size_set = false; session = (WT_SESSION_IMPL *)wt_session; SESSION_API_CALL(session, compact, config, cfg); /* * The compaction thread should not block when the cache is full: it is * holding locks blocking checkpoints and once the cache is full, it can * spend a long time doing eviction. */ if (!F_ISSET(session, WT_SESSION_IGNORE_CACHE_SIZE)) { ignore_cache_size_set = true; F_SET(session, WT_SESSION_IGNORE_CACHE_SIZE); } /* In-memory ignores compaction operations. */ if (F_ISSET(S2C(session), WT_CONN_IN_MEMORY)) goto err; /* * Non-LSM object compaction requires checkpoints, which are impossible * in transactional contexts. Disallow in all contexts (there's no * reason for LSM to allow this, possible or not), and check now so the * error message isn't confusing. */ WT_ERR(__wt_txn_context_check(session, false)); /* Disallow objects in the WiredTiger name space. */ WT_ERR(__wt_str_name_check(session, uri)); if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, "lsm:") && !WT_PREFIX_MATCH(uri, "table:")) { if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) ret = dsrc->compact == NULL ? __wt_object_unsupported(session, uri) : dsrc->compact( dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg); else ret = __wt_bad_object_type(session, uri); goto err; } /* Setup the session handle's compaction state structure. */ memset(&compact, 0, sizeof(WT_COMPACT_STATE)); session->compact = &compact; /* Compaction can be time-limited. */ WT_ERR(__wt_config_gets(session, cfg, "timeout", &cval)); session->compact->max_time = (uint64_t)cval.val; __wt_epoch(session, &session->compact->begin); /* Find the types of data sources being compacted. */ WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, uri, __compact_handle_append, __compact_uri_analyze, cfg, 0)); WT_ERR(ret); if (session->compact->lsm_count != 0) WT_ERR(__wt_schema_worker( session, uri, NULL, __wt_lsm_compact, cfg, 0)); if (session->compact->file_count != 0) WT_ERR(__compact_worker(session)); err: session->compact = NULL; for (i = 0; i < session->op_handle_next; ++i) { WT_WITH_DHANDLE(session, session->op_handle[i], WT_TRET(__compact_end(session))); WT_WITH_DHANDLE(session, session->op_handle[i], WT_TRET(__wt_session_release_dhandle(session))); } __wt_free(session, session->op_handle); session->op_handle_allocated = session->op_handle_next = 0; /* * Release common session resources (for example, checkpoint may acquire * significant reconciliation structures/memory). */ WT_TRET(__wt_session_release_resources(session)); if (ignore_cache_size_set) F_CLR(session, WT_SESSION_IGNORE_CACHE_SIZE); if (ret != 0) WT_STAT_CONN_INCR(session, session_table_compact_fail); else WT_STAT_CONN_INCR(session, session_table_compact_success); API_END_RET_NOTFOUND_MAP(session, ret); }
/* * __wt_schema_worker -- * Get Btree handles for the object and cycle through calls to an * underlying worker function with each handle. */ int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *, bool *), const char *cfg[], uint32_t open_flags) { WT_COLGROUP *colgroup; WT_DATA_SOURCE *dsrc; WT_DECL_RET; WT_INDEX *idx; WT_SESSION *wt_session; WT_TABLE *table; u_int i; bool skip; table = NULL; skip = false; if (name_func != NULL) WT_ERR(name_func(session, uri, &skip)); /* If the callback said to skip this object, we're done. */ if (skip) return (0); /* Get the btree handle(s) and call the underlying function. */ if (WT_PREFIX_MATCH(uri, "file:")) { if (file_func != NULL) WT_ERR(__wt_exclusive_handle_operation(session, uri, file_func, cfg, open_flags)); } else if (WT_PREFIX_MATCH(uri, "colgroup:")) { WT_ERR(__wt_schema_get_colgroup( session, uri, false, NULL, &colgroup)); WT_ERR(__wt_schema_worker(session, colgroup->source, file_func, name_func, cfg, open_flags)); } else if (WT_PREFIX_MATCH(uri, "index:")) { idx = NULL; WT_ERR(__wt_schema_get_index(session, uri, false, false, &idx)); WT_ERR(__wt_schema_worker(session, idx->source, file_func, name_func, cfg, open_flags)); } else if (WT_PREFIX_MATCH(uri, "lsm:")) { WT_ERR(__wt_lsm_tree_worker(session, uri, file_func, name_func, cfg, open_flags)); } else if (WT_PREFIX_MATCH(uri, "table:")) { /* * Note: we would like to use open_flags here (e.g., to lock * the table exclusive during schema-changing operations), but * that is currently problematic because we get the table again * in order to discover column groups and indexes. */ WT_ERR(__wt_schema_get_table_uri( session, uri, false, 0, &table)); /* * We could make a recursive call for each colgroup or index * URI, but since we have already opened the table, we can take * a short cut and skip straight to the sources. If we have a * name function, it needs to know about the intermediate URIs. */ for (i = 0; i < WT_COLGROUPS(table); i++) { colgroup = table->cgroups[i]; skip = false; if (name_func != NULL) WT_ERR(name_func( session, colgroup->name, &skip)); if (!skip) WT_ERR(__wt_schema_worker( session, colgroup->source, file_func, name_func, cfg, open_flags)); } WT_ERR(__wt_schema_open_indices(session, table)); for (i = 0; i < table->nindices; i++) { idx = table->indices[i]; skip = false; if (name_func != NULL) WT_ERR(name_func(session, idx->name, &skip)); if (!skip) WT_ERR(__wt_schema_worker(session, idx->source, file_func, name_func, cfg, open_flags)); } } else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) { wt_session = (WT_SESSION *)session; if (file_func == __wt_salvage && dsrc->salvage != NULL) WT_ERR(dsrc->salvage( dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); else if (file_func == __wt_verify && dsrc->verify != NULL) WT_ERR(dsrc->verify( dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); else if (file_func == __wt_checkpoint) ; else if (file_func == __wt_checkpoint_get_handles) ; else if (file_func == __wt_checkpoint_sync) ; else WT_ERR(__wt_object_unsupported(session, uri)); } else WT_ERR(__wt_bad_object_type(session, uri)); err: if (table != NULL) WT_TRET(__wt_schema_release_table(session, table)); return (ret); }
/* * __backup_uri -- * Backup a list of objects. */ static int __backup_uri(WT_SESSION_IMPL *session, const char *cfg[], bool *foundp, bool *log_only) { WT_CONFIG targetconf; WT_CONFIG_ITEM cval, k, v; WT_DECL_ITEM(tmp); WT_DECL_RET; bool target_list; const char *uri; *foundp = *log_only = false; /* * If we find a non-empty target configuration string, we have a job, * otherwise it's not our problem. */ WT_RET(__wt_config_gets(session, cfg, "target", &cval)); __wt_config_subinit(session, &targetconf, &cval); for (target_list = false; (ret = __wt_config_next(&targetconf, &k, &v)) == 0; target_list = true) { /* If it is our first time through, allocate. */ if (!target_list) { *foundp = true; WT_ERR(__wt_scr_alloc(session, 512, &tmp)); } WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str)); uri = tmp->data; if (v.len != 0) WT_ERR_MSG(session, EINVAL, "%s: invalid backup target: URIs may need quoting", uri); /* * Handle log targets. We do not need to go through the schema * worker, just call the function to append them. Set log_only * only if it is our only URI target. */ if (WT_PREFIX_MATCH(uri, "log:")) { /* * Log archive cannot mix with incremental backup, don't * let that happen. */ if (FLD_ISSET( S2C(session)->log_flags, WT_CONN_LOG_ARCHIVE)) WT_ERR_MSG(session, EINVAL, "incremental backup not possible when " "automatic log archival configured"); *log_only = !target_list; WT_ERR(__backup_log_append( session, session->bkp_cursor, false)); } else { *log_only = false; WT_ERR(__wt_schema_worker(session, uri, NULL, __backup_list_uri_append, cfg, 0)); } } WT_ERR_NOTFOUND_OK(ret); err: __wt_scr_free(session, &tmp); return (ret); }
/* * __wt_lsm_checkpoint_worker -- * A worker thread for an LSM tree, responsible for flushing new chunks to * disk. */ void * __wt_lsm_checkpoint_worker(void *arg) { WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; WT_LSM_WORKER_COOKIE cookie; WT_SESSION_IMPL *session; WT_TXN_ISOLATION saved_isolation; u_int i, j; int locked; lsm_tree = arg; session = lsm_tree->ckpt_session; WT_CLEAR(cookie); while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) { if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) { WT_WITH_SCHEMA_LOCK(session, ret = __wt_lsm_tree_switch(session, lsm_tree)); WT_ERR(ret); } WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0)); /* Write checkpoints in all completed files. */ for (i = 0, j = 0; i < cookie.nchunks - 1; i++) { if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) goto err; if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) break; chunk = cookie.chunk_array[i]; /* Stop if a running transaction needs the chunk. */ __wt_txn_update_oldest(session); if (!__wt_txn_visible_all(session, chunk->txnid_max)) break; /* * If the chunk is already checkpointed, make sure it * is also evicted. Either way, there is no point * trying to checkpoint it again. */ if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) { if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_EVICTED)) continue; if ((ret = __lsm_discard_handle( session, chunk->uri, NULL)) == 0) F_SET_ATOMIC( chunk, WT_LSM_CHUNK_EVICTED); else if (ret == EBUSY) ret = 0; else WT_ERR_MSG(session, ret, "discard handle"); continue; } WT_VERBOSE_ERR(session, lsm, "LSM worker flushing %u", i); /* * Flush the file before checkpointing: this is the * expensive part in terms of I/O: do it without * holding the schema lock. * * Use the special eviction isolation level to avoid * interfering with an application checkpoint: we have * already checked that all of the updates in this * chunk are globally visible. * * !!! We can wait here for checkpoints and fsyncs to * complete, which can be a long time. * * Don't keep waiting for the lock if application * threads are waiting for a switch. Don't skip * flushing the leaves either: that just means we'll * hold the schema lock for (much) longer, which blocks * the world. */ WT_ERR(__wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)); for (locked = 0; !locked && ret == 0 && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) { if ((ret = __wt_spin_trylock(session, &S2C(session)->checkpoint_lock)) == 0) locked = 1; else if (ret == EBUSY) { __wt_yield(); ret = 0; } } if (locked) { saved_isolation = session->txn.isolation; session->txn.isolation = TXN_ISO_EVICTION; ret = __wt_bt_cache_op( session, NULL, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; __wt_spin_unlock( session, &S2C(session)->checkpoint_lock); } WT_TRET(__wt_session_release_btree(session)); WT_ERR(ret); if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) break; WT_VERBOSE_ERR(session, lsm, "LSM worker checkpointing %u", i); WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)); if (ret != 0) { __wt_err(session, ret, "LSM checkpoint"); break; } WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); /* * Clear the "cache resident" flag so the primary can * be evicted and eventually closed. Only do this once * the checkpoint has succeeded: otherwise, accessing * the leaf page during the checkpoint can trigger * forced eviction. */ WT_ERR(__wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)); __wt_btree_evictable(session, 1); WT_ERR(__wt_session_release_btree(session)); ++j; WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1)); F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree); WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); /* Make sure we aren't pinning a transaction ID. */ __wt_txn_release_snapshot(session); if (ret != 0) { __wt_err(session, ret, "LSM checkpoint metadata write"); break; } WT_VERBOSE_ERR(session, lsm, "LSM worker checkpointed %u", i); } __lsm_unpin_chunks(session, &cookie); if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) WT_ERR_TIMEDOUT_OK(__wt_cond_wait( session, lsm_tree->work_cond, 100000)); } err: __lsm_unpin_chunks(session, &cookie); __wt_free(session, cookie.chunk_array); /* * The thread will only exit with failure if we run out of memory or * there is some other system driven failure. We can't keep going * after such a failure - ensure WiredTiger shuts down. */ if (ret != 0 && ret != WT_NOTFOUND) WT_PANIC_ERR(session, ret, "Shutting down LSM checkpoint utility thread"); return (NULL); }
/* * __wt_lsm_checkpoint_chunk -- * Flush a single LSM chunk to disk. */ int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { WT_DECL_RET; WT_TXN_ISOLATION saved_isolation; /* * If the chunk is already checkpointed, make sure it is also evicted. * Either way, there is no point trying to checkpoint it again. */ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { if ((ret = __lsm_discard_handle( session, chunk->uri, NULL)) == 0) chunk->evicted = 1; else if (ret == EBUSY) ret = 0; else WT_RET_MSG(session, ret, "discard handle"); } if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker %s already on disk", chunk->uri)); return (0); } /* Stop if a running transaction needs the chunk. */ __wt_txn_update_oldest(session); if (chunk->switch_txn == WT_TXN_NONE || !__wt_txn_visible_all(session, chunk->switch_txn)) { WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker %s: running transaction, return", chunk->uri)); return (0); } WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s", chunk->uri)); /* * Flush the file before checkpointing: this is the expensive part in * terms of I/O. * * Use the special eviction isolation level to avoid interfering with * an application checkpoint: we have already checked that all of the * updates in this chunk are globally visible. * * !!! We can wait here for checkpoints and fsyncs to complete, which * can be a long time. */ if ((ret = __wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)) == 0) { saved_isolation = session->txn.isolation; session->txn.isolation = TXN_ISO_EVICTION; ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); } WT_RET(ret); WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", chunk->uri)); WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)); if (ret != 0) WT_RET_MSG(session, ret, "LSM checkpoint"); /* Now the file is written, get the chunk size. */ WT_RET(__wt_lsm_tree_set_chunk_size(session, chunk)); /* Update the flush timestamp to help track ongoing progress. */ WT_RET(__wt_epoch(session, &lsm_tree->last_flush_ts)); /* Lock the tree, mark the chunk as on disk and update the metadata. */ WT_RET(__wt_lsm_tree_writelock(session, lsm_tree)); F_SET(chunk, WT_LSM_CHUNK_ONDISK); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree, 1); WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); if (ret != 0) WT_RET_MSG(session, ret, "LSM metadata write"); /* * Clear the no-eviction flag so the primary can be evicted and * eventually closed. Only do this once the checkpoint has succeeded: * otherwise, accessing the leaf page during the checkpoint can trigger * forced eviction. */ WT_RET(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); __wt_btree_evictable(session, 1); WT_RET(__wt_session_release_btree(session)); /* Make sure we aren't pinning a transaction ID. */ __wt_txn_release_snapshot(session); WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s", chunk->uri)); /* Schedule a bloom filter create for our newly flushed chunk. */ if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF)) WT_RET(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); else WT_RET(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_MERGE, 0, lsm_tree)); return (0); }
/* * __wt_lsm_checkpoint_chunk -- * Flush a single LSM chunk to disk. */ int __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk) { WT_DECL_RET; WT_TXN_ISOLATION saved_isolation; bool flush_set; flush_set = false; /* * If the chunk is already checkpointed, make sure it is also evicted. * Either way, there is no point trying to checkpoint it again. */ if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) && !F_ISSET(chunk, WT_LSM_CHUNK_STABLE) && !chunk->evicted) { WT_WITH_HANDLE_LIST_LOCK(session, ret = __lsm_discard_handle(session, chunk->uri, NULL)); if (ret == 0) chunk->evicted = 1; else if (ret == EBUSY) ret = 0; else WT_RET_MSG(session, ret, "discard handle"); } if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) { WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker %s already on disk", chunk->uri)); return (0); } /* Stop if a running transaction needs the chunk. */ __wt_txn_update_oldest(session, true); if (chunk->switch_txn == WT_TXN_NONE || !__wt_txn_visible_all(session, chunk->switch_txn)) { WT_RET(__wt_verbose(session, WT_VERB_LSM, "LSM worker %s: running transaction, return", chunk->uri)); return (0); } if (!__wt_atomic_cas8(&chunk->flushing, 0, 1)) return (0); flush_set = true; WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker flushing %s", chunk->uri)); /* * Flush the file before checkpointing: this is the expensive part in * terms of I/O. * * !!! * We can wait here for checkpoints and fsyncs to complete, which can * take a long time. */ if ((ret = __wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)) == 0) { /* * Set read-uncommitted: we have already checked that all of the * updates in this chunk are globally visible, use the cheapest * possible check in reconciliation. */ saved_isolation = session->txn.isolation; session->txn.isolation = WT_ISO_READ_UNCOMMITTED; ret = __wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; WT_TRET(__wt_session_release_btree(session)); } WT_ERR(ret); WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointing %s", chunk->uri)); /* * Turn on metadata tracking to ensure the checkpoint gets the * necessary handle locks. * * Ensure that we don't race with a running checkpoint: the checkpoint * lock protects against us racing with an application checkpoint in * this chunk. Don't wait for it, though: checkpoints can take a long * time, and our checkpoint operation should be very quick. */ WT_ERR(__wt_meta_track_on(session)); WT_WITH_CHECKPOINT_LOCK(session, ret, WT_WITH_SCHEMA_LOCK(session, ret, ret = __wt_schema_worker( session, chunk->uri, __wt_checkpoint, NULL, NULL, 0))); WT_TRET(__wt_meta_track_off(session, false, ret != 0)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM checkpoint"); /* Now the file is written, get the chunk size. */ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); /* Update the flush timestamp to help track ongoing progress. */ WT_ERR(__wt_epoch(session, &lsm_tree->last_flush_ts)); ++lsm_tree->chunks_flushed; /* Lock the tree, mark the chunk as on disk and update the metadata. */ WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree)); F_SET(chunk, WT_LSM_CHUNK_ONDISK); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree, true); WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree)); if (ret != 0) WT_ERR_MSG(session, ret, "LSM metadata write"); WT_PUBLISH(chunk->flushing, 0); flush_set = false; /* * Clear the no-eviction flag so the primary can be evicted and * eventually closed. Only do this once the checkpoint has succeeded: * otherwise, accessing the leaf page during the checkpoint can trigger * forced eviction. */ WT_ERR(__wt_session_get_btree(session, chunk->uri, NULL, NULL, 0)); __wt_btree_evictable(session, true); WT_ERR(__wt_session_release_btree(session)); /* Make sure we aren't pinning a transaction ID. */ __wt_txn_release_snapshot(session); WT_ERR(__wt_verbose(session, WT_VERB_LSM, "LSM worker checkpointed %s", chunk->uri)); /* Schedule a bloom filter create for our newly flushed chunk. */ if (!FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF)) WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_BLOOM, 0, lsm_tree)); else WT_ERR(__wt_lsm_manager_push_entry( session, WT_LSM_WORK_MERGE, 0, lsm_tree)); err: if (flush_set) WT_PUBLISH(chunk->flushing, 0); return (ret); }
/* * __wt_schema_worker -- * Get Btree handles for the object and cycle through calls to an * underlying worker function with each handle. */ int __wt_schema_worker(WT_SESSION_IMPL *session, const char *uri, int (*file_func)(WT_SESSION_IMPL *, const char *[]), int (*name_func)(WT_SESSION_IMPL *, const char *), const char *cfg[], uint32_t open_flags) { WT_COLGROUP *colgroup; WT_DATA_SOURCE *dsrc; WT_DECL_RET; WT_INDEX *idx; WT_SESSION *wt_session; WT_TABLE *table; const char *tablename; u_int i; table = NULL; tablename = uri; if (name_func != NULL) WT_ERR(name_func(session, uri)); /* Get the btree handle(s) and call the underlying function. */ if (WT_PREFIX_MATCH(uri, "file:")) { if (file_func != NULL) { WT_ERR(__wt_session_get_btree_ckpt( session, uri, cfg, open_flags)); ret = file_func(session, cfg); WT_TRET(__wt_session_release_btree(session)); } } else if (WT_PREFIX_MATCH(uri, "colgroup:")) { WT_ERR(__wt_schema_get_colgroup(session, uri, NULL, &colgroup)); WT_ERR(__wt_schema_worker(session, colgroup->source, file_func, name_func, cfg, open_flags)); } else if (WT_PREFIX_SKIP(tablename, "index:")) { idx = NULL; WT_ERR(__wt_schema_get_index(session, uri, NULL, &idx)); WT_ERR(__wt_schema_worker(session, idx->source, file_func, name_func, cfg, open_flags)); } else if (WT_PREFIX_MATCH(uri, "lsm:")) { WT_ERR(__wt_lsm_tree_worker( session, uri, file_func, name_func, cfg, open_flags)); } else if (WT_PREFIX_SKIP(tablename, "table:")) { WT_ERR(__wt_schema_get_table(session, tablename, strlen(tablename), 0, &table)); WT_ASSERT(session, session->dhandle == NULL); /* * We could make a recursive call for each colgroup or index * URI, but since we have already opened the table, we can take * a short cut and skip straight to the sources. If we have a * name function, it needs to know about the intermediate URIs. */ for (i = 0; i < WT_COLGROUPS(table); i++) { colgroup = table->cgroups[i]; if (name_func != NULL) WT_ERR(name_func(session, colgroup->name)); WT_ERR(__wt_schema_worker(session, colgroup->source, file_func, name_func, cfg, open_flags)); } WT_ERR(__wt_schema_open_indices(session, table)); for (i = 0; i < table->nindices; i++) { idx = table->indices[i]; if (name_func != NULL) WT_ERR(name_func(session, idx->name)); WT_ERR(__wt_schema_worker(session, idx->source, file_func, name_func, cfg, open_flags)); } } else if ((dsrc = __wt_schema_get_source(session, uri)) != NULL) { wt_session = (WT_SESSION *)session; if (file_func == __wt_compact && dsrc->compact != NULL) WT_ERR(dsrc->compact( dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); else if (file_func == __wt_salvage && dsrc->salvage != NULL) WT_ERR(dsrc->salvage( dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); else if (file_func == __wt_verify && dsrc->verify != NULL) WT_ERR(dsrc->verify( dsrc, wt_session, uri, (WT_CONFIG_ARG *)cfg)); else WT_ERR(__wt_object_unsupported(session, uri)); } else WT_ERR(__wt_bad_object_type(session, uri)); err: if (table != NULL) __wt_schema_release_table(session, table); return (ret); }