/*
 * __wt_lsm_tree_get --
 *	Get an LSM tree structure for the given name.  Optionally get
 *	exclusive access to the handle.  Exclusive access works separately
 *	from the LSM tree lock - since operations that need exclusive access
 *	may also need to take the LSM tree lock, for example outstanding work
 *	unit operations.
 */
int
__wt_lsm_tree_get(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, WT_LSM_TREE **treep)
{
	WT_LSM_TREE *lsm_tree;

	/* See if the tree is already open. */
	TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
		if (strcmp(uri, lsm_tree->name) == 0) {
			/*
			 * Short circuit if the handle is already held
			 * exclusively or exclusive access is requested and
			 * there are references held.
			 */
			if ((exclusive && lsm_tree->refcnt > 0) ||
			    F_ISSET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE))
				return (EBUSY);

			if (exclusive) {
				F_SET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE);
				if (!WT_ATOMIC_CAS4(lsm_tree->refcnt, 0, 1)) {
					F_CLR_ATOMIC(
					    lsm_tree, WT_LSM_TREE_EXCLUSIVE);
					return (EBUSY);
				}
			} else
				(void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);

			/*
			 * If we got a reference, but an exclusive reference
			 * beat us to it, give our reference up.
			 */
			if (!exclusive && F_ISSET_ATOMIC(
			    lsm_tree, WT_LSM_TREE_EXCLUSIVE)) {
				(void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1);
				return (EBUSY);
			}

			*treep = lsm_tree;
			return (0);
		}

	/* Open a new tree. */
	return (__lsm_tree_open(session, uri, treep));
}
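/*
 * Illustrative sketch (not part of the original source): how a caller
 * typically pairs __wt_lsm_tree_get with a matching release of the
 * reference once it is done with the handle.  The release helper name
 * (__wt_lsm_tree_release) and the work done while holding the reference
 * are assumptions for illustration only.
 */
static int
__lsm_tree_get_usage_example(WT_SESSION_IMPL *session, const char *uri)
{
	WT_DECL_RET;
	WT_LSM_TREE *lsm_tree;

	/* Acquire a shared (non-exclusive) reference to the tree handle. */
	WT_RET(__wt_lsm_tree_get(session, uri, 0, &lsm_tree));

	/* ... read lsm_tree state while the reference pins the handle ... */

	/*
	 * Drop the reference: an exclusive opener (for example, rename or
	 * drop) cannot proceed while shared references remain.
	 */
	__wt_lsm_tree_release(session, lsm_tree);
	return (ret);
}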
/*
 * __wt_page_inmem --
 *	Build in-memory page information.
 */
int
__wt_page_inmem(WT_SESSION_IMPL *session,
    WT_REF *ref, const void *image, uint32_t flags, WT_PAGE **pagep)
{
	WT_DECL_RET;
	WT_PAGE *page;
	const WT_PAGE_HEADER *dsk;
	uint32_t alloc_entries;
	size_t size;

	*pagep = NULL;

	dsk = image;
	alloc_entries = 0;

	/*
	 * Figure out how many underlying objects the page references so we
	 * can allocate them along with the page.
	 */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
	case WT_PAGE_COL_INT:
	case WT_PAGE_COL_VAR:
		/*
		 * Column-store leaf page entries map one-to-one to the number
		 * of physical entries on the page (each physical entry is a
		 * value item).
		 *
		 * Column-store internal page entries map one-to-one to the
		 * number of physical entries on the page (each entry is a
		 * location cookie).
		 */
		alloc_entries = dsk->u.entries;
		break;
	case WT_PAGE_ROW_INT:
		/*
		 * Row-store internal page entries map one-to-two to the
		 * number of physical entries on the page (each entry is a
		 * key and location cookie pair).
		 */
		alloc_entries = dsk->u.entries / 2;
		break;
	case WT_PAGE_ROW_LEAF:
		/*
		 * If the "no empty values" flag is set, row-store leaf page
		 * entries map one-to-one to the number of physical entries
		 * on the page (each physical entry is a key or value item).
		 * If that flag is not set, there are more keys than values,
		 * we have to walk the page to figure it out.
		 */
		if (F_ISSET(dsk, WT_PAGE_EMPTY_V_ALL))
			alloc_entries = dsk->u.entries;
		else if (F_ISSET(dsk, WT_PAGE_EMPTY_V_NONE))
			alloc_entries = dsk->u.entries / 2;
		else
			WT_RET(__inmem_row_leaf_entries(
			    session, dsk, &alloc_entries));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Allocate and initialize a new WT_PAGE. */
	WT_RET(__wt_page_alloc(
	    session, dsk->type, dsk->recno, alloc_entries, 1, &page));
	page->dsk = dsk;
	F_SET_ATOMIC(page, flags);

	/*
	 * Track the memory allocated to build this page so we can update the
	 * cache statistics in a single call.
	 */
	size = LF_ISSET(WT_PAGE_DISK_ALLOC) ? dsk->mem_size : 0;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		__inmem_col_fix(session, page);
		break;
	case WT_PAGE_COL_INT:
		__inmem_col_int(session, page);
		break;
	case WT_PAGE_COL_VAR:
		WT_ERR(__inmem_col_var(session, page, &size));
		break;
	case WT_PAGE_ROW_INT:
		WT_ERR(__inmem_row_int(session, page, &size));
		break;
	case WT_PAGE_ROW_LEAF:
		WT_ERR(__inmem_row_leaf(session, page));
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

	/* Update the page's in-memory size and the cache statistics. */
	__wt_cache_page_inmem_incr(session, page, size);

	/* Link the new internal page to the parent. */
	if (ref != NULL) {
		switch (page->type) {
		case WT_PAGE_COL_INT:
		case WT_PAGE_ROW_INT:
			page->pg_intl_parent_ref = ref;
			break;
		}
		ref->page = page;
	}

	*pagep = page;
	return (0);

err:	__wt_page_out(session, &page);
	return (ret);
}
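/*
 * Illustrative sketch (not part of the original source): one way the
 * ref/image/flags variant of __wt_page_inmem above might be called with a
 * disk image the caller has allocated.  Passing WT_PAGE_DISK_ALLOC charges
 * the image (dsk->mem_size) to the cache and hands ownership of the buffer
 * to the page; the wrapper function name here is hypothetical.
 */
static int
__page_inmem_usage_example(
    WT_SESSION_IMPL *session, WT_REF *ref, const void *disk_image)
{
	WT_PAGE *page;

	/*
	 * Build the in-memory structures; on success the new page is linked
	 * to its parent through ref and ref->page points at the page.
	 */
	WT_RET(__wt_page_inmem(
	    session, ref, disk_image, WT_PAGE_DISK_ALLOC, &page));
	return (0);
}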
/*
 * __wt_page_inmem --
 *	Build in-memory page information.
 */
int
__wt_page_inmem(
    WT_SESSION_IMPL *session, WT_PAGE *parent, WT_REF *parent_ref,
    WT_PAGE_HEADER *dsk, int disk_not_alloc, WT_PAGE **pagep)
{
	WT_DECL_RET;
	WT_PAGE *page;
	uint32_t alloc_entries;
	size_t size;

	alloc_entries = 0;
	*pagep = NULL;

	/*
	 * Figure out how many underlying objects the page references so
	 * we can allocate them along with the page.
	 */
	switch (dsk->type) {
	case WT_PAGE_COL_FIX:
		break;
	case WT_PAGE_COL_INT:
		/*
		 * Column-store internal page entries map one-to-one to the
		 * number of physical entries on the page (each physical
		 * entry is an offset object).
		 */
		alloc_entries = dsk->u.entries;
		break;
	case WT_PAGE_COL_VAR:
		/*
		 * Column-store leaf page entries map one-to-one to the number
		 * of physical entries on the page (each physical entry is a
		 * data item).
		 */
		alloc_entries = dsk->u.entries;
		break;
	case WT_PAGE_ROW_INT:
		/*
		 * Row-store internal page entries map one-to-two to the
		 * number of physical entries on the page (each in-memory
		 * entry is a key item and location cookie).
		 */
		alloc_entries = dsk->u.entries / 2;
		break;
	case WT_PAGE_ROW_LEAF:
		/*
		 * Row-store leaf page entries map in an indeterminate way to
		 * the physical entries on the page, we have to walk the page
		 * to figure it out.
		 */
		WT_RET(__inmem_row_leaf_entries(session, dsk, &alloc_entries));
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* Allocate and initialize a new WT_PAGE. */
	WT_RET(__wt_page_alloc(session, dsk->type, alloc_entries, &page));
	page->dsk = dsk;
	page->read_gen = WT_READ_GEN_NOTSET;
	if (disk_not_alloc)
		F_SET_ATOMIC(page, WT_PAGE_DISK_NOT_ALLOC);

	/*
	 * Track the memory allocated to build this page so we can update the
	 * cache statistics in a single call.
	 */
	size = disk_not_alloc ? 0 : dsk->mem_size;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		page->entries = dsk->u.entries;
		page->u.col_fix.recno = dsk->recno;
		__inmem_col_fix(session, page);
		break;
	case WT_PAGE_COL_INT:
		page->entries = dsk->u.entries;
		page->u.intl.recno = dsk->recno;
		__inmem_col_int(session, page);
		break;
	case WT_PAGE_COL_VAR:
		page->entries = dsk->u.entries;
		page->u.col_var.recno = dsk->recno;
		WT_ERR(__inmem_col_var(session, page, &size));
		break;
	case WT_PAGE_ROW_INT:
		page->entries = dsk->u.entries / 2;
		WT_ERR(__inmem_row_int(session, page, &size));
		break;
	case WT_PAGE_ROW_LEAF:
		page->entries = alloc_entries;
		WT_ERR(__inmem_row_leaf(session, page));
		break;
	WT_ILLEGAL_VALUE_ERR(session);
	}

	/* Update the page's in-memory size and the cache statistics. */
	__wt_cache_page_inmem_incr(session, page, size);

	/* Link the new page into the parent. */
	if (parent_ref != NULL)
		WT_LINK_PAGE(parent, parent_ref, page);

	*pagep = page;
	return (0);

err:	__wt_page_out(session, &page);
	return (ret);
}
/*
 * __wt_lsm_tree_rename --
 *	Rename an LSM tree.
 */
int
__wt_lsm_tree_rename(WT_SESSION_IMPL *session,
    const char *olduri, const char *newuri, const char *cfg[])
{
	WT_DECL_RET;
	WT_ITEM buf;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	const char *old;
	u_int i;
	int locked;

	old = NULL;
	WT_CLEAR(buf);
	locked = 0;

	/* Get the LSM tree. */
	WT_RET(__wt_lsm_tree_get(session, olduri, 1, &lsm_tree));

	/* Shut down the LSM worker. */
	WT_ERR(__lsm_tree_close(session, lsm_tree));

	/* Prevent any new opens. */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
	locked = 1;

	/* Set the new name. */
	WT_ERR(__lsm_tree_set_name(session, lsm_tree, newuri));

	/* Rename the chunks. */
	for (i = 0; i < lsm_tree->nchunks; i++) {
		chunk = lsm_tree->chunk[i];
		old = chunk->uri;
		chunk->uri = NULL;

		WT_ERR(__wt_lsm_tree_chunk_name(
		    session, lsm_tree, chunk->id, &buf));
		chunk->uri = __wt_buf_steal(session, &buf, NULL);
		WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg));
		__wt_free(session, old);

		if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM)) {
			old = chunk->bloom_uri;
			chunk->bloom_uri = NULL;
			WT_ERR(__wt_lsm_tree_bloom_name(
			    session, lsm_tree, chunk->id, &buf));
			chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL);
			F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
			WT_ERR(__wt_schema_rename(
			    session, old, chunk->bloom_uri, cfg));
			__wt_free(session, old);
		}
	}

	WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
	locked = 0;
	WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
	WT_ERR(__wt_metadata_remove(session, olduri));

err:	if (locked)
		WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
	if (old != NULL)
		__wt_free(session, old);
	/*
	 * Discard this LSM tree structure. The first operation on the renamed
	 * tree will create a new one.
	 */
	WT_TRET(__lsm_tree_discard(session, lsm_tree));
	return (ret);
}
/*
 * __lsm_bloom_create --
 *	Create a bloom filter for a chunk of the LSM tree that has been
 *	checkpointed but not yet merged.
 */
static int
__lsm_bloom_create(WT_SESSION_IMPL *session,
    WT_LSM_TREE *lsm_tree, WT_LSM_CHUNK *chunk, u_int chunk_off)
{
	WT_BLOOM *bloom;
	WT_CURSOR *src;
	WT_DECL_RET;
	WT_ITEM buf, key;
	WT_SESSION *wt_session;
	uint64_t insert_count;
	int exist;

	/*
	 * Normally, the Bloom URI is populated when the chunk struct is
	 * allocated.  After an open, however, it may not have been.
	 * Deal with that here.
	 */
	if (chunk->bloom_uri == NULL) {
		WT_CLEAR(buf);
		WT_RET(__wt_lsm_tree_bloom_name(
		    session, lsm_tree, chunk->id, &buf));
		chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL);
	}

	/*
	 * Drop the bloom filter first - there may be some content hanging
	 * over from an aborted merge or checkpoint.
	 */
	wt_session = &session->iface;
	WT_RET(__wt_exist(
	    session, chunk->bloom_uri + strlen("file:"), &exist));
	if (exist)
		WT_RET(wt_session->drop(wt_session, chunk->bloom_uri, "force"));

	bloom = NULL;
	/*
	 * This is merge-like activity, and we don't want compacts to give up
	 * because we are creating a bunch of bloom filters before merging.
	 */
	++lsm_tree->merge_progressing;
	WT_RET(__wt_bloom_create(session, chunk->bloom_uri,
	    lsm_tree->bloom_config, chunk->count,
	    lsm_tree->bloom_bit_count, lsm_tree->bloom_hash_count, &bloom));

	/* Open a special merge cursor just on this chunk. */
	WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
	F_SET(src, WT_CURSTD_RAW);
	WT_ERR(__wt_clsm_init_merge(src, chunk_off, chunk->id, 1));

	F_SET(session, WT_SESSION_NO_CACHE);
	for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
		WT_ERR(src->get_key(src, &key));
		WT_ERR(__wt_bloom_insert(bloom, &key));
	}
	WT_ERR_NOTFOUND_OK(ret);
	WT_TRET(src->close(src));

	WT_TRET(__wt_bloom_finalize(bloom));
	WT_ERR(ret);

	F_CLR(session, WT_SESSION_NO_CACHE);

	/* Load the new Bloom filter into cache. */
	WT_CLEAR(key);
	WT_ERR_NOTFOUND_OK(__wt_bloom_get(bloom, &key));

	WT_VERBOSE_ERR(session, lsm,
	    "LSM worker created bloom filter %s. "
	    "Expected %" PRIu64 " items, got %" PRIu64,
	    chunk->bloom_uri, chunk->count, insert_count);

	/* Ensure the bloom filter is in the metadata. */
	WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
	F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
	ret = __wt_lsm_meta_write(session, lsm_tree);
	++lsm_tree->dsk_gen;
	WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

	if (ret != 0)
		WT_ERR_MSG(session, ret, "LSM bloom worker metadata write");

err:	if (bloom != NULL)
		WT_TRET(__wt_bloom_close(bloom));
	F_CLR(session, WT_SESSION_NO_CACHE);
	return (ret);
}
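/*
 * Sizing note (illustrative, not part of the original source): with b bits
 * allocated per item and k hash functions, a Bloom filter's expected
 * false-positive rate after chunk->count insertions is roughly
 * (1 - e^(-k/b))^k.  For example, assuming a configuration of
 * bloom_bit_count=16 and bloom_hash_count=8, that is (1 - e^(-0.5))^8, or
 * about 0.06%: roughly 6 in 10,000 lookups for absent keys would still
 * search the chunk needlessly.
 */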
/*
 * __wt_lsm_checkpoint_worker --
 *	A worker thread for an LSM tree, responsible for flushing new chunks
 *	to disk.
 */
void *
__wt_lsm_checkpoint_worker(void *arg)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_TREE *lsm_tree;
	WT_LSM_WORKER_COOKIE cookie;
	WT_SESSION_IMPL *session;
	WT_TXN_ISOLATION saved_isolation;
	u_int i, j;
	int locked;

	lsm_tree = arg;
	session = lsm_tree->ckpt_session;

	WT_CLEAR(cookie);

	while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
		if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
			WT_WITH_SCHEMA_LOCK(session, ret =
			    __wt_lsm_tree_switch(session, lsm_tree));
			WT_ERR(ret);
		}

		WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

		/* Write checkpoints in all completed files. */
		for (i = 0, j = 0; i < cookie.nchunks - 1; i++) {
			if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
				goto err;

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			chunk = cookie.chunk_array[i];

			/* Stop if a running transaction needs the chunk. */
			__wt_txn_update_oldest(session);
			if (!__wt_txn_visible_all(session, chunk->txnid_max))
				break;

			/*
			 * If the chunk is already checkpointed, make sure it
			 * is also evicted.  Either way, there is no point
			 * trying to checkpoint it again.
			 */
			if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) {
				if (F_ISSET_ATOMIC(
				    chunk, WT_LSM_CHUNK_EVICTED))
					continue;

				if ((ret = __lsm_discard_handle(
				    session, chunk->uri, NULL)) == 0)
					F_SET_ATOMIC(
					    chunk, WT_LSM_CHUNK_EVICTED);
				else if (ret == EBUSY)
					ret = 0;
				else
					WT_ERR_MSG(session, ret,
					    "discard handle");
				continue;
			}

			WT_VERBOSE_ERR(session, lsm,
			    "LSM worker flushing %u", i);

			/*
			 * Flush the file before checkpointing: this is the
			 * expensive part in terms of I/O: do it without
			 * holding the schema lock.
			 *
			 * Use the special eviction isolation level to avoid
			 * interfering with an application checkpoint: we have
			 * already checked that all of the updates in this
			 * chunk are globally visible.
			 *
			 * !!! We can wait here for checkpoints and fsyncs to
			 * complete, which can be a long time.
			 *
			 * Don't keep waiting for the lock if application
			 * threads are waiting for a switch.  Don't skip
			 * flushing the leaves either: that just means we'll
			 * hold the schema lock for (much) longer, which
			 * blocks the world.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			for (locked = 0; !locked && ret == 0 &&
			    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) {
				if ((ret = __wt_spin_trylock(session,
				    &S2C(session)->checkpoint_lock)) == 0)
					locked = 1;
				else if (ret == EBUSY) {
					__wt_yield();
					ret = 0;
				}
			}
			if (locked) {
				saved_isolation = session->txn.isolation;
				session->txn.isolation = TXN_ISO_EVICTION;
				ret = __wt_bt_cache_op(
				    session, NULL, WT_SYNC_WRITE_LEAVES);
				session->txn.isolation = saved_isolation;
				__wt_spin_unlock(
				    session, &S2C(session)->checkpoint_lock);
			}
			WT_TRET(__wt_session_release_btree(session));
			WT_ERR(ret);

			if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
				break;

			WT_VERBOSE_ERR(session, lsm,
			    "LSM worker checkpointing %u", i);

			WT_WITH_SCHEMA_LOCK(session,
			    ret = __wt_schema_worker(session, chunk->uri,
			    __wt_checkpoint, NULL, NULL, 0));

			if (ret != 0) {
				__wt_err(session, ret, "LSM checkpoint");
				break;
			}

			WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
			/*
			 * Clear the "cache resident" flag so the primary can
			 * be evicted and eventually closed.  Only do this
			 * once the checkpoint has succeeded: otherwise,
			 * accessing the leaf page during the checkpoint can
			 * trigger forced eviction.
			 */
			WT_ERR(__wt_session_get_btree(
			    session, chunk->uri, NULL, NULL, 0));
			__wt_btree_evictable(session, 1);
			WT_ERR(__wt_session_release_btree(session));

			++j;
			WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
			F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK);
			ret = __wt_lsm_meta_write(session, lsm_tree);
			++lsm_tree->dsk_gen;

			/* Update the throttle time. */
			__wt_lsm_tree_throttle(session, lsm_tree);
			WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

			/* Make sure we aren't pinning a transaction ID. */
			__wt_txn_release_snapshot(session);

			if (ret != 0) {
				__wt_err(session, ret,
				    "LSM checkpoint metadata write");
				break;
			}

			WT_VERBOSE_ERR(session, lsm,
			    "LSM worker checkpointed %u", i);
		}
		__lsm_unpin_chunks(session, &cookie);
		if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
		    !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
			WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
			    session, lsm_tree->work_cond, 100000));
	}

err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	/*
	 * The thread will only exit with failure if we run out of memory or
	 * there is some other system driven failure.  We can't keep going
	 * after such a failure - ensure WiredTiger shuts down.
	 */
	if (ret != 0 && ret != WT_NOTFOUND)
		WT_PANIC_ERR(session, ret,
		    "Shutting down LSM checkpoint utility thread");
	return (NULL);
}