/*
 * __wt_lsm_tree_truncate --
 *	Truncate an LSM tree.
 *
 *	Gets the named tree exclusively, closes it, takes the tree lock,
 *	creates one new chunk, hands the full range of existing chunks plus
 *	the new chunk to __wt_lsm_merge_update_tree, writes the LSM metadata
 *	and restarts the worker.  The cfg argument is unused.
 *
 *	Returns 0 on success or an error code.  On any failure after the
 *	handle has been obtained the handle is discarded.
 */
int
__wt_lsm_tree_truncate(
    WT_SESSION_IMPL *session, const char *name, const char *cfg[])
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	int locked;

	WT_UNUSED(cfg);
	chunk = NULL;
	locked = 0;

	/* Get the LSM tree. */
	WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));

	/*
	 * Once we hold the handle, every failure must go through the error
	 * path so the handle is discarded instead of being left behind.
	 */

	/* Shut down the LSM worker. */
	WT_ERR(__lsm_tree_close(session, lsm_tree));

	/* Prevent any new opens. */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
	locked = 1;

	/* Create the new chunk. */
	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = WT_ATOMIC_ADD(lsm_tree->last, 1);
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	/* Mark all chunks old. */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, 0, lsm_tree->nchunks, chunk));

	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	WT_ERR(__lsm_tree_start_worker(session, lsm_tree));

	/* Clear the flag first so the error path doesn't unlock twice. */
	locked = 0;
	WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
	__wt_lsm_tree_release(session, lsm_tree);

err:	if (locked)
		WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	if (ret != 0) {
		/*
		 * NOTE(review): if the failure happened after
		 * __wt_lsm_merge_update_tree succeeded, the tree may already
		 * reference this chunk -- confirm the discard below cannot
		 * free it a second time.
		 */
		if (chunk != NULL) {
			(void)__wt_schema_drop(session, chunk->uri, NULL);
			__wt_free(session, chunk);
		}
		/*
		 * Discard the LSM tree structure on error. This will force the
		 * LSM tree to be re-opened the next time it is accessed and
		 * the last good version of the metadata will be used, resulting
		 * in a valid (not truncated) tree.
		 */
		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	return (ret);
}
/*
 * __wt_lsm_tree_drop --
 *	Drop an LSM tree: drop the object behind every chunk (and its bloom
 *	filter, when the chunk has one) on both the active and the obsolete
 *	lists, then remove the tree's own metadata entry.  The in-memory
 *	tree structure is discarded whether or not the drop succeeded.
 */
int
__wt_lsm_tree_drop(
    WT_SESSION_IMPL *session, const char *name, const char *cfg[])
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	u_int slot;
	int have_lock;

	have_lock = 0;

	/* The drop needs the tree exclusively. */
	WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));

	/* Stop the LSM worker before touching any chunk. */
	WT_ERR(__lsm_tree_close(session, lsm_tree));

	/* Keep anyone else from opening the tree while we work. */
	WT_ERR(__wt_try_writelock(session, lsm_tree->rwlock));
	have_lock = 1;

	/* Active chunks. */
	for (slot = 0; slot < lsm_tree->nchunks; ++slot) {
		chunk = lsm_tree->chunk[slot];
		WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
		if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			continue;
		WT_ERR(__wt_schema_drop(session, chunk->bloom_uri, cfg));
	}

	/* Obsolete chunks: the list may contain empty slots. */
	for (slot = 0; slot < lsm_tree->nold_chunks; ++slot) {
		chunk = lsm_tree->old_chunks[slot];
		if (chunk == NULL)
			continue;
		WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
		if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			continue;
		WT_ERR(__wt_schema_drop(session, chunk->bloom_uri, cfg));
	}

	/*
	 * Release the lock; only if that worked is the tree's metadata
	 * removed.
	 */
	have_lock = 0;
	WT_ERR(__wt_rwunlock(session, lsm_tree->rwlock));
	ret = __wt_metadata_remove(session, name);

err:	if (have_lock)
		WT_TRET(__wt_rwunlock(session, lsm_tree->rwlock));
	WT_TRET(__lsm_tree_discard(session, lsm_tree));
	return (ret);
}
/*
 * __wt_lsm_tree_truncate --
 *	Truncate an LSM tree.
 *
 *	Gets the named tree exclusively, closes it, takes the tree's write
 *	lock, creates one new chunk, hands the full range of existing chunks
 *	plus the new chunk to __wt_lsm_merge_update_tree, writes the LSM
 *	metadata and restarts the worker.  The cfg argument is unused.
 *
 *	Returns 0 on success, in which case the handle is released and stays
 *	valid.  On any failure after the handle has been obtained the handle
 *	is discarded.
 */
int
__wt_lsm_tree_truncate(
    WT_SESSION_IMPL *session, const char *name, const char *cfg[])
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	int locked;

	WT_UNUSED(cfg);
	locked = 0;

	/* Get the LSM tree. */
	WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));

	/*
	 * Once we hold the handle, every failure must go through the error
	 * path so the handle is discarded instead of being left behind.
	 */

	/* Shut down the LSM worker. */
	WT_ERR(__lsm_tree_close(session, lsm_tree));

	/* Prevent any new opens. */
	WT_ERR(__wt_try_writelock(session, lsm_tree->rwlock));
	locked = 1;

	/*
	 * Create the new chunk.
	 *
	 * NOTE(review): nothing on the error path frees this allocation if
	 * a later step fails before the tree takes it over -- confirm
	 * whether __lsm_tree_discard covers that case.
	 */
	WT_ERR(__wt_calloc_def(session, 1, &chunk));
	chunk->id = WT_ATOMIC_ADD(lsm_tree->last, 1);
	WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

	/* Mark all chunks old. */
	WT_ERR(__wt_lsm_merge_update_tree(
	    session, lsm_tree, 0, lsm_tree->nchunks, chunk));

	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	WT_ERR(__lsm_tree_start_worker(session, lsm_tree));

	ret = __wt_rwunlock(session, lsm_tree->rwlock);
	locked = 0;
	if (ret == 0)
		__wt_lsm_tree_release(session, lsm_tree);

err:	if (locked)
		WT_TRET(__wt_rwunlock(session, lsm_tree->rwlock));
	/*
	 * Don't discard the LSM tree structure unless there has been an
	 * error.  The handle remains valid for future operations.
	 */
	if (ret != 0)
		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	return (ret);
}
/*
 * __wt_lsm_compact --
 *	Compact an LSM tree called via __wt_schema_worker.
 *
 *	Names that do not start with "lsm:" are ignored (returns 0 without
 *	touching *skip).  For an LSM tree, *skip is set to 1, the tree's
 *	compacting flag is set, the work condition is signalled, and the
 *	function polls until merge_progressing stops changing or at most one
 *	chunk remains.
 *
 *	Returns 0 on success, EINVAL if merges are not enabled on the
 *	connection or the tree has no merge threads, ETIMEDOUT if the
 *	session's compact max_time is non-zero and is exceeded, or any error
 *	from the calls made along the way.
 */
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
{
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	uint64_t last_merge_progressing;
	time_t begin, end;
	int compacting;

	compacting = 0;

	/*
	 * This function is applied to all matching sources: ignore anything
	 * that is not an LSM tree.
	 */
	if (!WT_PREFIX_MATCH(name, "lsm:"))
		return (0);

	/* Tell __wt_schema_worker not to look inside the LSM tree. */
	*skip = 1;

	WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));

	/*
	 * We now hold a handle on the tree: leave through the error label
	 * from here on so the handle is always released.
	 */
	if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE) ||
	    lsm_tree->merge_threads == 0)
		WT_ERR_MSG(session, EINVAL,
		    "LSM compaction requires active merge threads");

	WT_ERR(__wt_seconds(session, &begin));

	/*
	 * Remember that we set the flag, so the cleanup below only clears it
	 * on paths that got this far.
	 */
	F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
	compacting = 1;

	/* Wake up the merge threads. */
	WT_ERR(__wt_cond_signal(session, lsm_tree->work_cond));

	/* Now wait for merge activity to stop. */
	do {
		last_merge_progressing = lsm_tree->merge_progressing;
		__wt_sleep(1, 0);
		WT_ERR(__wt_seconds(session, &end));
		if (session->compact->max_time > 0 &&
		    session->compact->max_time < (uint64_t)(end - begin))
			WT_ERR(ETIMEDOUT);
	} while (lsm_tree->merge_progressing != last_merge_progressing &&
	    lsm_tree->nchunks > 1);

err:	if (compacting)
		F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
	__wt_lsm_tree_release(session, lsm_tree);
	return (ret);
}
/*
 * __wt_lsm_tree_worker --
 *	Run a schema worker operation on each level of a LSM tree.
 *
 *	The tree is read-locked, or write-locked when open_flags asks for an
 *	exclusive handle, while __wt_schema_worker is applied to each chunk.
 *	Chunks already on disk are skipped when the file function is
 *	__wt_checkpoint.  Bloom filters are visited only when the name
 *	function is __wt_backup_list_uri_append.
 */
int
__wt_lsm_tree_worker(WT_SESSION_IMPL *session,
    const char *uri,
    int (*file_func)(WT_SESSION_IMPL *, const char *[]),
    int (*name_func)(WT_SESSION_IMPL *, const char *, int *),
    const char *cfg[], uint32_t open_flags)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	u_int slot;
	int exclusive, have_lock;

	have_lock = 0;
	exclusive = 0;
	if (FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE))
		exclusive = 1;

	WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));

	/*
	 * Holding the tree lock marks the tree busy, so a merge cannot
	 * change the chunk array while we walk it.
	 */
	if (exclusive) {
		WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	} else {
		WT_ERR(__wt_lsm_tree_readlock(session, lsm_tree));
	}
	have_lock = 1;

	for (slot = 0; slot < lsm_tree->nchunks; ++slot) {
		chunk = lsm_tree->chunk[slot];

		/* A checkpoint has nothing to do for an on-disk chunk. */
		if (file_func == __wt_checkpoint &&
		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
			continue;

		WT_ERR(__wt_schema_worker(session, chunk->uri,
		    file_func, name_func, cfg, open_flags));

		/* Only the backup name function also visits bloom filters. */
		if (name_func != __wt_backup_list_uri_append ||
		    !F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			continue;

		WT_ERR(__wt_schema_worker(session, chunk->bloom_uri,
		    file_func, name_func, cfg, open_flags));
	}

err:	if (have_lock) {
		if (exclusive) {
			WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
		} else {
			WT_TRET(__wt_lsm_tree_readunlock(session, lsm_tree));
		}
	}
	__wt_lsm_tree_release(session, lsm_tree);
	return (ret);
}
/*
 * __wt_lsm_tree_worker --
 *	Run a schema worker operation on each level of a LSM tree.
 *
 *	Applies __wt_schema_worker with func to every chunk of the tree,
 *	except that chunks already on disk are skipped when func is
 *	__wt_checkpoint.  The tree handle is released before returning.
 */
int
__wt_lsm_tree_worker(WT_SESSION_IMPL *session,
    const char *uri,
    int (*func)(WT_SESSION_IMPL *, const char *[]),
    const char *cfg[], uint32_t open_flags)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	u_int slot;
	int exclusive;

	/* Ask for the tree exclusively only if the caller's flags say so. */
	exclusive = FLD_ISSET(open_flags, WT_BTREE_EXCLUSIVE) ? 1 : 0;
	WT_RET(__wt_lsm_tree_get(session, uri, exclusive, &lsm_tree));

	for (slot = 0; slot < lsm_tree->nchunks; ++slot) {
		chunk = lsm_tree->chunk[slot];

		/* Run the operation unless this is a checkpoint of an on-disk chunk. */
		if (func != __wt_checkpoint ||
		    !F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
			WT_ERR(__wt_schema_worker(
			    session, chunk->uri, func, cfg, open_flags));
	}

err:	__wt_lsm_tree_release(session, lsm_tree);
	return (ret);
}
/*
 * __wt_lsm_tree_rename --
 *	Rename an LSM tree.
 *
 *	Gets the tree under its old name exclusively, closes it, takes the
 *	write lock, sets the new name, and renames the object behind every
 *	chunk (and its bloom filter, when the chunk has one) to the name
 *	generated for the new tree.  New metadata is then written and the
 *	old metadata entry removed.  The in-memory tree structure is always
 *	discarded before returning.
 *
 *	Returns 0 on success or an error code.
 */
int
__wt_lsm_tree_rename(WT_SESSION_IMPL *session,
    const char *oldname, const char *newname, const char *cfg[])
{
	WT_DECL_RET;
	WT_ITEM buf;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	const char *old;
	u_int i;
	int locked;

	old = NULL;
	WT_CLEAR(buf);
	locked = 0;

	/* Get the LSM tree. */
	WT_RET(__wt_lsm_tree_get(session, oldname, 1, &lsm_tree));

	/* Shut down the LSM worker. */
	WT_ERR(__lsm_tree_close(session, lsm_tree));

	/* Prevent any new opens. */
	WT_ERR(__wt_try_writelock(session, lsm_tree->rwlock));
	locked = 1;

	/* Set the new name. */
	WT_ERR(__lsm_tree_set_name(session, lsm_tree, newname));

	/* Rename the chunks. */
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];

		/*
		 * Keep the old URI in a local until the rename is done: if
		 * anything fails in between, the error path frees it.
		 */
		old = chunk->uri;
		chunk->uri = NULL;

		WT_ERR(__wt_lsm_tree_chunk_name(
		    session, lsm_tree, chunk->id, &buf));
		chunk->uri = __wt_buf_steal(session, &buf, NULL);
		WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg));
		__wt_free(session, old);

		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
			old = chunk->bloom_uri;
			chunk->bloom_uri = NULL;
			WT_ERR(__wt_lsm_tree_bloom_name(
			    session, lsm_tree, chunk->id, &buf));
			chunk->bloom_uri =
			    __wt_buf_steal(session, &buf, NULL);

			/*
			 * The bloom filter is renamed to the new bloom URI,
			 * not to the chunk's URI, which now names the chunk
			 * renamed just above.
			 */
			WT_ERR(__wt_schema_rename(
			    session, old, chunk->bloom_uri, cfg));
			__wt_free(session, old);
		}
	}

	ret = __wt_rwunlock(session, lsm_tree->rwlock);
	locked = 0;
	if (ret == 0)
		ret = __wt_lsm_meta_write(session, lsm_tree);
	if (ret == 0)
		ret = __wt_metadata_remove(session, oldname);

err:	if (locked)
		WT_TRET(__wt_rwunlock(session, lsm_tree->rwlock));
	if (old != NULL)
		__wt_free(session, old);
	/*
	 * Discard this LSM tree structure. The first operation on the renamed
	 * tree will create a new one.
	 */
	WT_TRET(__lsm_tree_discard(session, lsm_tree));
	return (ret);
}
/*
 * __wt_lsm_tree_create --
 *	Create an LSM tree structure for the given name.
 *
 *	If the tree is already open or already has metadata, returns EEXIST
 *	when exclusive is set and 0 otherwise.  If not, a temporary tree
 *	structure is filled in from the configuration, its metadata is
 *	written, the temporary structure is discarded and the tree is opened
 *	through __lsm_tree_open.
 *
 *	Returns 0 on success, EINVAL for a column-store key format or an
 *	inconsistent bloom configuration, or any error from the calls made
 *	along the way.
 */
int
__wt_lsm_tree_create(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	const char *cfg[] = API_CONF_DEFAULTS(session, create, config);
	const char *tmpconfig;

	/* If the tree is open, it already exists. */
	if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
		__wt_lsm_tree_release(session, lsm_tree);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If the tree has metadata, it already exists.
	 *
	 * !!!
	 * Use a local variable: we don't care what the existing configuration
	 * is, but we don't want to overwrite the real config.
	 *
	 * Keep the result in ret so the check below sees this call's error
	 * rather than the one left over from the lookup above.
	 */
	if ((ret = __wt_metadata_read(session, uri, &tmpconfig)) == 0) {
		__wt_free(session, tmpconfig);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
	if (WT_STRING_MATCH("r", cval.str, cval.len))
		WT_RET_MSG(session, EINVAL,
		    "LSM trees cannot be configured as column stores");

	WT_RET(__wt_calloc_def(session, 1, &lsm_tree));

	/* The tree is allocated: fail through err from here so it is freed. */
	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

	WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->key_format));
	WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->value_format));

	WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom", &cval));
	FLD_SET(lsm_tree->bloom,
	    (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
	WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_newest", &cval));
	if (cval.val != 0)
		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_NEWEST);
	WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_oldest", &cval));
	if (cval.val != 0)
		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
	    (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_NEWEST) ||
	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST)))
		WT_ERR_MSG(session, EINVAL,
		    "Bloom filters can only be created on newest and oldest "
		    "chunks if bloom filters are enabled");

	WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_config", &cval));
	/* For a structured value, drop its first and last characters. */
	if (cval.type == ITEM_STRUCT) {
		cval.str++;
		cval.len -= 2;
	}
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->bloom_config));

	WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_bit_count", &cval));
	lsm_tree->bloom_bit_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_hash_count", &cval));
	lsm_tree->bloom_hash_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm_chunk_size", &cval));
	lsm_tree->chunk_size = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm_merge_max", &cval));
	lsm_tree->merge_max = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm_merge_threads", &cval));
	lsm_tree->merge_threads = (uint32_t)cval.val;
	/* Sanity check that api_data.py is in sync with lsm.h */
	WT_ASSERT(session, lsm_tree->merge_threads <= WT_LSM_MAX_WORKERS);

	/* Build the chunk configuration from the caller's configuration. */
	WT_ERR(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf,
	    "%s,key_format=u,value_format=u", config));
	lsm_tree->file_config = __wt_buf_steal(session, buf, NULL);

	/* Create the first chunk and flush the metadata. */
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	/* Discard our partially populated handle. */
	ret = __lsm_tree_discard(session, lsm_tree);
	lsm_tree = NULL;

	/*
	 * Open our new tree and add it to the handle cache. Don't discard on
	 * error: the returned handle is NULL on error, and the metadata
	 * tracking macros handle cleaning up on failure.
	 */
	if (ret == 0)
		ret = __lsm_tree_open(session, uri, &lsm_tree);
	if (ret == 0)
		__wt_lsm_tree_release(session, lsm_tree);

	if (0) {
err:		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	__wt_scr_free(&buf);
	return (ret);
}
/*
 * __wt_lsm_tree_create --
 *	Create an LSM tree structure for the given name.
 *
 *	If the tree is already open or already has metadata, returns EEXIST
 *	when exclusive is set and 0 otherwise.  If not, a temporary tree
 *	structure is filled in from the configuration, its metadata is
 *	written, the temporary structure is discarded and the tree is opened
 *	through __lsm_tree_open.
 *
 *	Returns 0 on success, EINVAL for a column-store key format or an
 *	inconsistent bloom, chunk-size or merge configuration, or any error
 *	from the calls made along the way.
 */
int
__wt_lsm_tree_create(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	const char *cfg[] =
	    { WT_CONFIG_BASE(session, session_create), config, NULL };
	char *tmpconfig;

	/* If the tree is open, it already exists. */
	if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
		__wt_lsm_tree_release(session, lsm_tree);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If the tree has metadata, it already exists.
	 *
	 * !!!
	 * Use a local variable: we don't care what the existing configuration
	 * is, but we don't want to overwrite the real config.
	 *
	 * Keep the result in ret so the check below sees this call's error
	 * rather than the one left over from the lookup above.
	 */
	if ((ret = __wt_metadata_search(session, uri, &tmpconfig)) == 0) {
		__wt_free(session, tmpconfig);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
	if (WT_STRING_MATCH("r", cval.str, cval.len))
		WT_RET_MSG(session, EINVAL,
		    "LSM trees cannot be configured as column stores");

	WT_RET(__wt_calloc_def(session, 1, &lsm_tree));

	/* The tree is allocated: fail through err from here so it is freed. */
	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

	WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->key_format));
	WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->value_format));

	WT_ERR(__wt_config_gets(session, cfg, "collator", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->collator_name));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval));
	if (cval.val)
		F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
	else
		F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval));
	FLD_SET(lsm_tree->bloom,
	    (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval));
	if (cval.val != 0)
		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
		WT_ERR_MSG(session, EINVAL,
		    "Bloom filters can only be created on newest and oldest "
		    "chunks if bloom filters are enabled");

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval));
	/* For a structured value, drop its first and last characters. */
	if (cval.type == WT_CONFIG_ITEM_STRUCT) {
		cval.str++;
		cval.len -= 2;
	}
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->bloom_config));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval));
	lsm_tree->bloom_bit_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval));
	lsm_tree->bloom_hash_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval));
	lsm_tree->chunk_max = (uint64_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval));
	lsm_tree->chunk_size = (uint64_t)cval.val;
	if (lsm_tree->chunk_size > lsm_tree->chunk_max)
		WT_ERR_MSG(session, EINVAL,
		    "Chunk size (chunk_size) must be smaller than or equal to "
		    "the maximum chunk size (chunk_max)");
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval));
	lsm_tree->merge_max = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_min", &cval));
	lsm_tree->merge_min = (uint32_t)cval.val;
	if (lsm_tree->merge_min > lsm_tree->merge_max)
		WT_ERR_MSG(session, EINVAL,
		    "LSM merge_min must be less than or equal to merge_max");

	/*
	 * Set up the config for each chunk.
	 *
	 * Make the memory_page_max double the chunk size, so application
	 * threads don't immediately try to force evict the chunk when the
	 * worker thread clears the NO_EVICTION flag.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf,
	    "%s,key_format=u,value_format=u,memory_page_max=%" PRIu64,
	    config, 2 * lsm_tree->chunk_max));
	WT_ERR(__wt_strndup(
	    session, buf->data, buf->size, &lsm_tree->file_config));

	/* Create the first chunk and flush the metadata. */
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	/* Discard our partially populated handle. */
	ret = __lsm_tree_discard(session, lsm_tree);
	lsm_tree = NULL;

	/*
	 * Open our new tree and add it to the handle cache. Don't discard on
	 * error: the returned handle is NULL on error, and the metadata
	 * tracking macros handle cleaning up on failure.
	 */
	if (ret == 0)
		ret = __lsm_tree_open(session, uri, &lsm_tree);
	if (ret == 0)
		__wt_lsm_tree_release(session, lsm_tree);

	if (0) {
err:		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	__wt_scr_free(&buf);
	return (ret);
}
/*
 * __wt_lsm_compact --
 *	Compact an LSM tree called via __wt_schema_worker.
 *
 *	Names that do not start with "lsm:" are ignored (returns 0 without
 *	touching *skip).  For an LSM tree, *skip is set to 1 and the function
 *	drives a forced flush of the last chunk followed by a merge phase,
 *	polling until the tree stops being active, merging is finished, or
 *	the session's compact max_time (when non-zero) is exceeded, in which
 *	case ETIMEDOUT is returned.  Returns EINVAL if merges are not enabled
 *	on the connection.  The tree handle is released before returning on
 *	every path past the initial lookup.
 */
int
__wt_lsm_compact(WT_SESSION_IMPL *session, const char *name, int *skip)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	time_t begin, end;
	uint64_t progress;
	int i, compacting, flushing, locked, ref;

	/*
	 * These flags record what this call has set up, so the cleanup code
	 * at the end only undoes what was actually done.
	 */
	compacting = flushing = locked = ref = 0;
	chunk = NULL;
	/*
	 * This function is applied to all matching sources: ignore anything
	 * that is not an LSM tree.
	 */
	if (!WT_PREFIX_MATCH(name, "lsm:"))
		return (0);

	/* Tell __wt_schema_worker not to look inside the LSM tree. */
	*skip = 1;

	WT_RET(__wt_lsm_tree_get(session, name, 0, &lsm_tree));

	if (!F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
		WT_ERR_MSG(session, EINVAL,
		    "LSM compaction requires active merge threads");

	WT_ERR(__wt_seconds(session, &begin));

	/*
	 * Compacting has two distinct phases.
	 * 1.  All in-memory chunks up to and including the current
	 * chunk must be flushed.  Normally, the flush code
	 * does not flush the last, in-use chunk, so we set a force
	 * flag to include that last chunk.  We monitor the state of the
	 * last chunk and periodically push another forced flush work
	 * unit until it is complete.
	 * 2.  After all flushing is done, we move onto the merging
	 * phase for compaction.  Again, we monitor the state and
	 * continue to push merge work units until all merging is done.
	 */

	/* Lock the tree: single-thread compaction. */
	WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
	locked = 1;

	/* Clear any merge throttle: compact throws out that calculation. */
	lsm_tree->merge_throttle = 0;
	lsm_tree->merge_aggressiveness = 0;
	progress = lsm_tree->merge_progressing;

	/*
	 * If another thread started a compact on this tree, we're done.
	 * Nothing has been set up yet, so the cleanup code only unlocks and
	 * releases the tree, and ret is still 0.
	 */
	if (F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING))
		goto err;

	/*
	 * Set the switch transaction on the current chunk, if it
	 * hasn't been set before.  This prevents further writes, so it
	 * can be flushed by the checkpoint worker.
	 */
	if (lsm_tree->nchunks > 0 &&
	    (chunk = lsm_tree->chunk[lsm_tree->nchunks - 1]) != NULL) {
		if (chunk->switch_txn == WT_TXN_NONE)
			chunk->switch_txn = __wt_txn_new_id(session);
		/*
		 * If we have a chunk, we want to look for it to be on-disk.
		 * So we need to add a reference to keep it available.
		 */
		(void)WT_ATOMIC_ADD4(chunk->refcnt, 1);
		ref = 1;
	}

	/* Clear the flag first so the cleanup code doesn't unlock twice. */
	locked = 0;
	WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));

	if (chunk != NULL) {
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "Compact force flush %s flags 0x%" PRIx32
		    " chunk %u flags 0x%" PRIx32,
		    name, lsm_tree->flags, chunk->id, chunk->flags));
		flushing = 1;
		/*
		 * Make sure the in-memory chunk gets flushed; do not push a
		 * switch, because we don't want to create a new in-memory
		 * chunk if the tree is being used read-only now.
		 */
		WT_ERR(__wt_lsm_manager_push_entry(session,
		    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE, lsm_tree));
	} else {
		/*
		 * If there is no chunk to flush, go straight to the
		 * compacting state.
		 */
		compacting = 1;
		progress = lsm_tree->merge_progressing;
		F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
		WT_ERR(__wt_verbose(session, WT_VERB_LSM,
		    "COMPACT: Start compacting %s", lsm_tree->name));
	}

	/* Wait for the work unit queues to drain. */
	while (F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE)) {
		/*
		 * The flush flag is cleared when the chunk has been flushed.
		 * Continue to push forced flushes until the chunk is on disk.
		 * Once it is on disk move to the compacting phase.
		 */
		if (flushing) {
			WT_ASSERT(session, chunk != NULL);
			if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK)) {
				WT_ERR(__wt_verbose(session,
				    WT_VERB_LSM,
				    "Compact flush done %s chunk %u. "
				    "Start compacting progress %" PRIu64,
				    name, chunk->id,
				    lsm_tree->merge_progressing));
				/*
				 * Drop the chunk reference taken above and
				 * clear ref so the cleanup code doesn't drop
				 * it again.
				 */
				(void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
				flushing = ref = 0;
				compacting = 1;
				F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
				progress = lsm_tree->merge_progressing;
			} else {
				WT_ERR(__wt_verbose(session, WT_VERB_LSM,
				    "Compact flush retry %s chunk %u",
				    name, chunk->id));
				WT_ERR(__wt_lsm_manager_push_entry(session,
				    WT_LSM_WORK_FLUSH, WT_LSM_WORK_FORCE,
				    lsm_tree));
			}
		}

		/*
		 * The compacting flag is cleared when no merges can be done.
		 * Ensure that we push through some aggressive merges before
		 * stopping otherwise we might not do merges that would
		 * span chunks with different generations.
		 */
		if (compacting && !F_ISSET(lsm_tree, WT_LSM_TREE_COMPACTING)) {
			if (lsm_tree->merge_aggressiveness < 10 ||
			    (progress < lsm_tree->merge_progressing) ||
			    lsm_tree->merge_syncing) {
				progress = lsm_tree->merge_progressing;
				F_SET(lsm_tree, WT_LSM_TREE_COMPACTING);
				lsm_tree->merge_aggressiveness = 10;
			} else
				break;
		}
		__wt_sleep(1, 0);
		WT_ERR(__wt_seconds(session, &end));
		/* A max_time of zero means there is no time limit. */
		if (session->compact->max_time > 0 &&
		    session->compact->max_time < (uint64_t)(end - begin)) {
			WT_ERR(ETIMEDOUT);
		}
		/*
		 * Push merge operations while they are still getting work
		 * done.  If we are pushing merges, make sure they are
		 * aggressive, to avoid duplicating effort.
		 */
		if (compacting)
#define	COMPACT_PARALLEL_MERGES	5
			for (i = lsm_tree->queue_ref;
			    i < COMPACT_PARALLEL_MERGES; i++) {
				lsm_tree->merge_aggressiveness = 10;
				WT_ERR(__wt_lsm_manager_push_entry(
				    session, WT_LSM_WORK_MERGE, 0, lsm_tree));
			}
	}
err:
	/* Ensure anything we set is cleared. */
	if (ref)
		(void)WT_ATOMIC_SUB4(chunk->refcnt, 1);
	if (compacting) {
		F_CLR(lsm_tree, WT_LSM_TREE_COMPACTING);
		lsm_tree->merge_aggressiveness = 0;
	}
	if (locked)
		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));

	WT_TRET(__wt_verbose(session, WT_VERB_LSM,
	    "Compact %s complete, return %d", name, ret));

	__wt_lsm_tree_release(session, lsm_tree);
	return (ret);
}
/*
 * __wt_lsm_tree_create --
 *	Create an LSM tree structure for the given name.
 *
 *	If the tree is already open or already has metadata, returns EEXIST
 *	when exclusive is set and 0 otherwise.  If not, a temporary tree
 *	structure is filled in from the configuration, its metadata is
 *	written, the temporary structure is discarded and the tree is opened
 *	through __lsm_tree_open.
 *
 *	Returns 0 on success, EINVAL for a column-store key format or an
 *	inconsistent bloom or chunk-size configuration, or any error from the
 *	calls made along the way.
 */
int
__wt_lsm_tree_create(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	const char *cfg[] =
	    { WT_CONFIG_BASE(session, session_create), config, NULL };
	const char *tmpconfig;

	/* If the tree is open, it already exists. */
	if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
		__wt_lsm_tree_release(session, lsm_tree);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If the tree has metadata, it already exists.
	 *
	 * !!!
	 * Use a local variable: we don't care what the existing configuration
	 * is, but we don't want to overwrite the real config.
	 *
	 * Keep the result in ret so the check below sees this call's error
	 * rather than the one left over from the lookup above.
	 */
	if ((ret = __wt_metadata_search(session, uri, &tmpconfig)) == 0) {
		__wt_free(session, tmpconfig);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
	if (WT_STRING_MATCH("r", cval.str, cval.len))
		WT_RET_MSG(session, EINVAL,
		    "LSM trees cannot be configured as column stores");

	WT_RET(__wt_calloc_def(session, 1, &lsm_tree));

	/* The tree is allocated: fail through err from here so it is freed. */
	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

	WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->key_format));
	WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->value_format));

	WT_ERR(__wt_config_gets(session, cfg, "collator", &cval));
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->collator_name));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.auto_throttle", &cval));
	if (cval.val)
		F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
	else
		F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom", &cval));
	FLD_SET(lsm_tree->bloom,
	    (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_oldest", &cval));
	if (cval.val != 0)
		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST))
		WT_ERR_MSG(session, EINVAL,
		    "Bloom filters can only be created on newest and oldest "
		    "chunks if bloom filters are enabled");

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_config", &cval));
	/* For a structured value, drop its first and last characters. */
	if (cval.type == WT_CONFIG_ITEM_STRUCT) {
		cval.str++;
		cval.len -= 2;
	}
	WT_ERR(__wt_strndup(session, cval.str, cval.len,
	    &lsm_tree->bloom_config));

	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_bit_count", &cval));
	lsm_tree->bloom_bit_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.bloom_hash_count", &cval));
	lsm_tree->bloom_hash_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_max", &cval));
	lsm_tree->chunk_max = (uint64_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.chunk_size", &cval));
	lsm_tree->chunk_size = (uint64_t)cval.val;
	if (lsm_tree->chunk_size > lsm_tree->chunk_max)
		WT_ERR_MSG(session, EINVAL,
		    "Chunk size (chunk_size) must be smaller than or equal to "
		    "the maximum chunk size (chunk_max)");
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_max", &cval));
	lsm_tree->merge_max = (uint32_t)cval.val;
	/* merge_min is derived here: half of merge_max. */
	lsm_tree->merge_min = lsm_tree->merge_max / 2;
	WT_ERR(__wt_config_gets(session, cfg, "lsm.merge_threads", &cval));
	lsm_tree->merge_threads = (uint32_t)cval.val;
	/* Sanity check that api_data.py is in sync with lsm.h */
	WT_ASSERT(session, lsm_tree->merge_threads <= WT_LSM_MAX_WORKERS);

	/*
	 * Set up the config for each chunk.  If possible, avoid high latencies
	 * from fsync by flushing the cache every 8MB (will be overridden by
	 * any application setting).
	 */
	tmpconfig = "";
#ifdef HAVE_SYNC_FILE_RANGE
	if (!S2C(session)->direct_io)
		tmpconfig = "os_cache_dirty_max=8MB,";
#endif
	WT_ERR(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf,
	    "%s%s,key_format=u,value_format=u", tmpconfig, config));
	lsm_tree->file_config = __wt_buf_steal(session, buf, NULL);

	/* Create the first chunk and flush the metadata. */
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	/* Discard our partially populated handle. */
	ret = __lsm_tree_discard(session, lsm_tree);
	lsm_tree = NULL;

	/*
	 * Open our new tree and add it to the handle cache. Don't discard on
	 * error: the returned handle is NULL on error, and the metadata
	 * tracking macros handle cleaning up on failure.
	 */
	if (ret == 0)
		ret = __lsm_tree_open(session, uri, &lsm_tree);
	if (ret == 0)
		__wt_lsm_tree_release(session, lsm_tree);

	if (0) {
err:		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	__wt_scr_free(&buf);
	return (ret);
}