/*
 * __wt_lsm_tree_setup_chunk --
 *	Initialize a chunk of an LSM tree.
 */
int
__wt_lsm_tree_setup_chunk(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk)
{
	WT_ITEM namebuf;
	const char *cfg[] = API_CONF_DEFAULTS(session, drop, "force");

	/* Build the chunk's URI and hand ownership of it to the chunk. */
	WT_CLEAR(namebuf);
	WT_RET(
	    __wt_lsm_tree_chunk_name(session, lsm_tree, chunk->id, &namebuf));
	chunk->uri = __wt_buf_steal(session, &namebuf, NULL);

	/*
	 * Remove any stale object with this name: an aborted merge or
	 * checkpoint may have left content behind.
	 *
	 * Skip the drop for the very first chunk: we are running inside
	 * WT_SESSION::create, where a nested drop does interesting things
	 * with handle locks and metadata tracking, and chunk 1 can never be
	 * the result of an interrupted merge anyway.
	 */
	if (chunk->id > 1)
		WT_RET(__wt_schema_drop(session, chunk->uri, cfg));

	return (
	    __wt_schema_create(session, chunk->uri, lsm_tree->file_config));
}
/*
 * __wt_meta_track_off --
 *	Turn off metadata operation tracking, unrolling on error.
 */
int
__wt_meta_track_off(WT_SESSION_IMPL *session, int unroll)
{
	WT_BTREE *prev_btree;
	WT_DECL_RET;
	WT_META_TRACK *op, *op_base;
	const char *ckpt_cfg[] = API_CONF_DEFAULTS(session, checkpoint, NULL);

	WT_ASSERT(session,
	    WT_META_TRACKING(session) && session->meta_track_nest > 0);

	/* Nested enables only unwind at the outermost disable. */
	if (--session->meta_track_nest != 0)
		return (0);

	op_base = session->meta_track;
	op = session->meta_track_next;

	/* Turn off tracking for unroll. */
	session->meta_track_next = session->meta_track_sub = NULL;

	/* Apply (or unroll) the tracked operations, newest first. */
	for (; op > op_base;) {
		--op;
		WT_TRET(__meta_track_apply(session, op, unroll));
	}

	/* If the operation succeeded, checkpoint the metadata. */
	if (!unroll && ret == 0 && session->metafile != NULL) {
		prev_btree = session->btree;
		session->btree = session->metafile;
		ret = __wt_checkpoint(session, ckpt_cfg);
		session->btree = prev_btree;
	}

	return (ret);
}
/*
 * __wt_meta_turtle_init --
 *	Check the turtle file and create if necessary.
 */
int
__wt_meta_turtle_init(WT_SESSION_IMPL *session, int *existp)
{
	WT_DECL_RET;
	WT_ITEM *tmp;
	int found;
	const char *metaconf;
	const char *cfg[] = API_CONF_DEFAULTS(file, meta, NULL);

	tmp = NULL;
	metaconf = NULL;
	*existp = 0;

	/* Discard any turtle setup file left-over from previous runs. */
	WT_RET(__wt_exist(session, WT_METADATA_TURTLE_SET, &found));
	if (found)
		WT_RET(__wt_remove(session, WT_METADATA_TURTLE_SET));

	/* If there's already a turtle file, we're done. */
	WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &found));
	if (found) {
		*existp = 1;
		return (0);
	}

	/*
	 * No turtle file: build the default configuration, collapse it with
	 * the file defaults, and write a fresh turtle file.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &tmp));
	WT_ERR(__wt_buf_fmt(session, tmp,
	    "key_format=S,value_format=S,version=(major=%d,minor=%d)",
	    WT_BTREE_MAJOR_VERSION, WT_BTREE_MINOR_VERSION));
	cfg[1] = tmp->data;
	WT_ERR(__wt_config_collapse(session, cfg, &metaconf));
	WT_ERR(__wt_meta_turtle_update(session, WT_METADATA_URI, metaconf));

err:	__wt_free(session, metaconf);
	__wt_scr_free(&tmp);
	return (ret);
}
/*
 * __lsm_tree_open_check --
 *	Validate the configuration of an LSM tree: verify the cache is big
 *	enough to hold the working set implied by the tree's settings.
 */
static int
__lsm_tree_open_check(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_CONFIG_ITEM cval;
	const char *cfg[] = API_CONF_DEFAULTS(
	    session, create, lsm_tree->file_config);
	uint64_t required;
	uint32_t maxleafpage;

	WT_RET(__wt_config_gets(
	    session, cfg, "leaf_page_max", &cval));
	maxleafpage = (uint32_t)cval.val;

	/*
	 * Three chunks, plus one page for each participant in a merge.
	 *
	 * Do the arithmetic in 64 bits: the configuration values are 32-bit
	 * and the intermediate products could wrap around in uint32_t before
	 * being widened into "required".
	 */
	required = 3 * (uint64_t)lsm_tree->chunk_size +
	    (uint64_t)lsm_tree->merge_threads *
	    ((uint64_t)lsm_tree->merge_max * maxleafpage);
	if (S2C(session)->cache_size < required)
		WT_RET_MSG(session, EINVAL,
		    "The LSM configuration requires a cache size of at least %"
		    PRIu64 ". Configured size is %" PRIu64,
		    required, S2C(session)->cache_size);
	return (0);
}
/*
 * __wt_lsm_tree_create --
 *	Create an LSM tree structure for the given name.  Returns 0 if the
 *	tree was created (or already exists and "exclusive" is not set),
 *	EEXIST if it exists and "exclusive" is set, EINVAL on bad
 *	configuration, or another WiredTiger error code.
 */
int
__wt_lsm_tree_create(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, const char *config)
{
	WT_CONFIG_ITEM cval;
	WT_DECL_ITEM(buf);
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;
	const char *cfg[] = API_CONF_DEFAULTS(session, create, config);
	const char *tmpconfig;

	/* If the tree is open, it already exists. */
	if ((ret = __wt_lsm_tree_get(session, uri, 0, &lsm_tree)) == 0) {
		__wt_lsm_tree_release(session, lsm_tree);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/*
	 * If the tree has metadata, it already exists.
	 *
	 * !!!
	 * Use a local variable: we don't care what the existing configuration
	 * is, but we don't want to overwrite the real config.
	 */
	if (__wt_metadata_read(session, uri, &tmpconfig) == 0) {
		__wt_free(session, tmpconfig);
		return (exclusive ? EEXIST : 0);
	}
	WT_RET_NOTFOUND_OK(ret);

	/* Record-number keys (column stores) are rejected up front. */
	WT_RET(__wt_config_gets(session, cfg, "key_format", &cval));
	if (WT_STRING_MATCH("r", cval.str, cval.len))
		WT_RET_MSG(session, EINVAL,
		    "LSM trees cannot be configured as column stores");

	WT_RET(__wt_calloc_def(session, 1, &lsm_tree));
	/*
	 * From here on, failures must branch to the error label so the
	 * partially-initialized handle is discarded: using WT_RET here
	 * (as the code previously did for __lsm_tree_set_name) leaked
	 * lsm_tree on error.
	 */
	WT_ERR(__lsm_tree_set_name(session, lsm_tree, uri));

	WT_ERR(__wt_config_gets(session, cfg, "key_format", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->key_format));
	WT_ERR(__wt_config_gets(session, cfg, "value_format", &cval));
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->value_format));

	/* Bloom filter configuration: which chunks get filters, and how. */
	WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom", &cval));
	FLD_SET(lsm_tree->bloom,
	    (cval.val == 0 ? WT_LSM_BLOOM_OFF : WT_LSM_BLOOM_MERGED));
	WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_newest", &cval));
	if (cval.val != 0)
		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_NEWEST);
	WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_oldest", &cval));
	if (cval.val != 0)
		FLD_SET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST);

	if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OFF) &&
	    (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_NEWEST) ||
	    FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST)))
		WT_ERR_MSG(session, EINVAL,
		    "Bloom filters can only be created on newest and oldest "
		    "chunks if bloom filters are enabled");

	WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_config", &cval));
	if (cval.type == ITEM_STRUCT) {
		/* Strip the enclosing brackets from a structured value. */
		cval.str++;
		cval.len -= 2;
	}
	WT_ERR(__wt_strndup(
	    session, cval.str, cval.len, &lsm_tree->bloom_config));

	WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_bit_count", &cval));
	lsm_tree->bloom_bit_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm_bloom_hash_count", &cval));
	lsm_tree->bloom_hash_count = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm_chunk_size", &cval));
	lsm_tree->chunk_size = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm_merge_max", &cval));
	lsm_tree->merge_max = (uint32_t)cval.val;
	WT_ERR(__wt_config_gets(session, cfg, "lsm_merge_threads", &cval));
	lsm_tree->merge_threads = (uint32_t)cval.val;
	/* Sanity check that api_data.py is in sync with lsm.h */
	WT_ASSERT(session, lsm_tree->merge_threads <= WT_LSM_MAX_WORKERS);

	/* Chunks are raw key/value stores regardless of the tree formats. */
	WT_ERR(__wt_scr_alloc(session, 0, &buf));
	WT_ERR(__wt_buf_fmt(session, buf,
	    "%s,key_format=u,value_format=u", config));
	lsm_tree->file_config = __wt_buf_steal(session, buf, NULL);

	/* Create the first chunk and flush the metadata. */
	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

	/* Discard our partially populated handle. */
	ret = __lsm_tree_discard(session, lsm_tree);
	lsm_tree = NULL;

	/*
	 * Open our new tree and add it to the handle cache. Don't discard on
	 * error: the returned handle is NULL on error, and the metadata
	 * tracking macros handle cleaning up on failure.
	 */
	if (ret == 0)
		ret = __lsm_tree_open(session, uri, &lsm_tree);
	if (ret == 0)
		__wt_lsm_tree_release(session, lsm_tree);

	if (0) {
err:		WT_TRET(__lsm_tree_discard(session, lsm_tree));
	}
	__wt_scr_free(&buf);
	return (ret);
}
/*
 * __wt_lsm_stat_init --
 *	Initialize a LSM statistics structure: seed it from the tree-level
 *	counters, then aggregate per-chunk and per-bloom-filter statistics
 *	by walking the chunk array under the tree's read lock.
 */
int
__wt_lsm_stat_init(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_CURSOR_STAT *cst, uint32_t flags)
{
	WT_CURSOR *stat_cursor;
	WT_DECL_ITEM(uribuf);
	WT_DECL_RET;
	WT_DSRC_STATS *stats;
	WT_LSM_CHUNK *chunk;
	const char *cfg[] = API_CONF_DEFAULTS(
	    session, open_cursor, "statistics_fast=on");
	const char *disk_cfg[] = API_CONF_DEFAULTS(session, open_cursor,
	    "checkpoint=WiredTigerCheckpoint,statistics_fast=on");
	const char *desc, *pvalue;
	uint64_t value;
	u_int i;
	int locked, stat_key;

	/*
	 * NOTE(review): flags is marked unused here, but LF_ISSET below
	 * appears to read it -- presumably the WT_UNUSED is stale; confirm
	 * against the LF_ISSET macro definition.
	 */
	WT_UNUSED(flags);
	locked = 0;
	/* Scratch buffer used to format "statistics:" URIs per chunk. */
	WT_ERR(__wt_scr_alloc(session, 0, &uribuf));

	/* Clear the statistics we are about to recalculate. */
	if (cst->stats != NULL)
		stats = (WT_DSRC_STATS *)cst->stats;
	else {
		/* First call: allocate and register the statistics array. */
		WT_ERR(__wt_calloc_def(session, 1, &stats));
		__wt_stat_init_dsrc_stats(stats);
		cst->stats_first = cst->stats = (WT_STATS *)stats;
		cst->stats_count = sizeof(*stats) / sizeof(WT_STATS);
	}

	/* Seed from the tree-level counters, optionally resetting them. */
	*stats = lsm_tree->stats;
	if (LF_ISSET(WT_STATISTICS_CLEAR))
		__wt_stat_clear_dsrc_stats(&lsm_tree->stats);

	/* Hold the LSM lock so that we can safely walk through the chunks. */
	WT_ERR(__wt_readlock(session, lsm_tree->rwlock));
	locked = 1;

	/* Set the stats for this run. */
	WT_STAT_SET(stats, lsm_chunk_count, lsm_tree->nchunks);
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];

		/* Track the maximum merge generation seen so far. */
		if (chunk->generation >
		    (uint32_t)WT_STAT(stats, lsm_generation_max))
			WT_STAT_SET(stats,
			    lsm_generation_max, chunk->generation);

		/*
		 * LSM chunk reads happen from a checkpoint, so get the
		 * statistics for a checkpoint if one exists.
		 */
		WT_ERR(__wt_buf_fmt(
		    session, uribuf, "statistics:%s", chunk->uri));
		ret = __wt_curstat_open(session, uribuf->data,
		    F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ?
		    disk_cfg : cfg, &stat_cursor);
		/*
		 * XXX kludge: we may have an empty chunk where no checkpoint
		 * was written.  If so, try to open the ordinary handle on that
		 * chunk instead.
		 */
		if (ret == WT_NOTFOUND && F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
			ret = __wt_curstat_open(
			    session, uribuf->data, cfg, &stat_cursor);
		WT_ERR(ret);

		/* Fold every statistic from this chunk into our copy. */
		while ((ret = stat_cursor->next(stat_cursor)) == 0) {
			WT_ERR(stat_cursor->get_key(stat_cursor, &stat_key));
			WT_ERR(stat_cursor->get_value(
			    stat_cursor, &desc, &pvalue, &value));
			WT_STAT_INCRKV(stats, stat_key, value);
		}
		/* Cursor exhaustion (WT_NOTFOUND) ends the scan normally. */
		WT_ERR_NOTFOUND_OK(ret);
		WT_ERR(stat_cursor->close(stat_cursor));

		/* The rest of the loop body is bloom-filter accounting. */
		if (!F_ISSET(chunk, WT_LSM_CHUNK_BLOOM))
			continue;

		WT_STAT_INCR(stats, bloom_count);
		WT_STAT_INCRV(stats, bloom_size,
		    (chunk->count * lsm_tree->bloom_bit_count) / 8);

		/* Pull selected cache statistics from the bloom file. */
		WT_ERR(__wt_buf_fmt(
		    session, uribuf, "statistics:%s", chunk->bloom_uri));
		WT_ERR(__wt_curstat_open(
		    session, uribuf->data, cfg, &stat_cursor));

		/* Clean evictions also count as bloom page evictions. */
		stat_cursor->set_key(
		    stat_cursor, WT_STAT_DSRC_CACHE_EVICTION_CLEAN);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_eviction_clean, value);
		WT_STAT_INCRV(stats, bloom_page_evict, value);

		/* As do dirty evictions. */
		stat_cursor->set_key(
		    stat_cursor, WT_STAT_DSRC_CACHE_EVICTION_DIRTY);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_eviction_dirty, value);
		WT_STAT_INCRV(stats, bloom_page_evict, value);

		stat_cursor->set_key(
		    stat_cursor, WT_STAT_DSRC_CACHE_EVICTION_FAIL);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_eviction_fail, value);

		/* Cache reads of the bloom file are bloom page reads. */
		stat_cursor->set_key(stat_cursor, WT_STAT_DSRC_CACHE_READ);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_read, value);
		WT_STAT_INCRV(stats, bloom_page_read, value);

		stat_cursor->set_key(stat_cursor, WT_STAT_DSRC_CACHE_WRITE);
		WT_ERR(stat_cursor->search(stat_cursor));
		WT_ERR(stat_cursor->get_value(
		    stat_cursor, &desc, &pvalue, &value));
		WT_STAT_INCRV(stats, cache_write, value);

		WT_ERR(stat_cursor->close(stat_cursor));
	}

	/* Release the read lock if taken, free scratch memory. */
err:	if (locked)
		WT_TRET(__wt_rwunlock(session, lsm_tree->rwlock));
	__wt_scr_free(&uribuf);
	return (ret);
}