/*
 * __wt_lsm_work_bloom --
 *	Look for an on-disk chunk that has no Bloom filter yet and try to
 *	build one.  At most one chunk is attempted per call: the scan stops at
 *	the first chunk whose "busy" flag this thread manages to claim.  When
 *	a filter is built successfully, a merge work unit is queued.
 *
 *	Returns zero on success (including when nothing was eligible), or the
 *	error from copying the chunk array, building the filter or queuing the
 *	merge work unit.
 */
int
__wt_lsm_work_bloom(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *candidate;
	WT_LSM_WORKER_COOKIE cookie;
	u_int created, slot;

	WT_CLEAR(cookie);
	WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

	created = 0;
	for (slot = 0; slot < cookie.nchunks; slot++) {
		candidate = cookie.chunk_array[slot];

		/* Only chunks that have reached disk are candidates. */
		if (!F_ISSET(candidate, WT_LSM_CHUNK_ONDISK))
			continue;

		/* Nothing to do if it has a filter or is being merged. */
		if (F_ISSET(candidate,
		    WT_LSM_CHUNK_BLOOM | WT_LSM_CHUNK_MERGING))
			continue;

		/* Skip merged (generation > 0) and empty chunks. */
		if (candidate->generation > 0 || candidate->count == 0)
			continue;

		/* Another thread owns the "busy" flag: try the next chunk. */
		if (!WT_ATOMIC_CAS4(candidate->bloom_busy, 0, 1))
			continue;

		/*
		 * We own the flag.  Check again that a filter is still
		 * needed, since the first test was made before the flag was
		 * claimed.
		 */
		if (!F_ISSET(candidate, WT_LSM_CHUNK_BLOOM)) {
			ret = __lsm_bloom_create(
			    session, lsm_tree, candidate, slot);

			/* Remember success so a merge can be queued below. */
			if (ret == 0)
				created = 1;
		}
		candidate->bloom_busy = 0;
		break;
	}

	/* A new Bloom filter was built: queue a merge work unit. */
	if (created) {
		WT_ERR(__wt_lsm_manager_push_entry(
		    session, WT_LSM_WORK_MERGE, 0, lsm_tree));
	}

err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	return (ret);
}
/*
 * __lsm_bloom_work --
 *	Scan a copy of the tree's chunk array for an on-disk chunk without a
 *	Bloom filter and try to build one.  The scan stops at the first chunk
 *	whose "busy" flag this thread manages to claim.
 *
 *	Returns WT_NOTFOUND when no chunk was claimed (no work was done),
 *	otherwise the result of the Bloom filter creation.  A failure to copy
 *	the chunk array is returned directly.
 */
static int
__lsm_bloom_work(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *candidate;
	WT_LSM_WORKER_COOKIE cookie;
	u_int slot;

	WT_CLEAR(cookie);

	/* Assume nothing gets done until a chunk is actually claimed. */
	ret = WT_NOTFOUND;

	WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

	for (slot = 0; slot < cookie.nchunks; slot++) {
		candidate = cookie.chunk_array[slot];

		/* Only chunks that have reached disk are candidates. */
		if (!F_ISSET_ATOMIC(candidate, WT_LSM_CHUNK_ONDISK))
			continue;

		/* Nothing to do if it has a filter or is being merged. */
		if (F_ISSET_ATOMIC(candidate,
		    WT_LSM_CHUNK_BLOOM | WT_LSM_CHUNK_MERGING))
			continue;

		/* Skip merged (generation > 0) and empty chunks. */
		if (candidate->generation > 0 || candidate->count == 0)
			continue;

		/* Another thread owns the "busy" flag: try the next chunk. */
		if (!WT_ATOMIC_CAS(candidate->bloom_busy, 0, 1))
			continue;

		/* We won the race: build the filter and stop scanning. */
		ret = __lsm_bloom_create(session, lsm_tree, candidate, slot);
		candidate->bloom_busy = 0;
		break;
	}

	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);
	return (ret);
}
/*
 * __wt_lsm_free_chunks --
 *	Try to drop chunks from the tree that are no longer required.
 *
 *	Returns zero immediately if there are no old chunks or if another
 *	thread is already freeing them.  Otherwise returns zero when at least
 *	one file was dropped (or found missing) and the metadata was
 *	rewritten, WT_NOTFOUND when there was no work to do, or an error.
 */
int
__wt_lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_WORKER_COOKIE cookie;
	u_int i, skipped;
	int drop_ret;
	bool flush_metadata;

	flush_metadata = false;

	if (lsm_tree->nold_chunks == 0)
		return (0);

	/*
	 * Make sure only a single thread is freeing the old chunk array
	 * at any time.
	 */
	if (!__wt_atomic_cas32(&lsm_tree->freeing_old_chunks, 0, 1))
		return (0);

	/*
	 * Take a copy of the current state of the LSM tree and look for chunks
	 * to drop.  We do it this way to avoid holding the LSM tree lock while
	 * doing I/O or waiting on the schema lock.
	 *
	 * This is safe because only one thread will be in this function at a
	 * time.  Merges may complete concurrently, and the old_chunks array
	 * may be extended, but we shuffle down the pointers each time we free
	 * one to keep the non-NULL slots at the beginning of the array.
	 *
	 * From here on every failure must go through the error label: the
	 * "freeing_old_chunks" flag was claimed above and is only released
	 * there.  Returning directly would leave the flag set and stop any
	 * thread from freeing old chunks again.
	 */
	WT_CLEAR(cookie);
	WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, true));
	for (i = skipped = 0; i < cookie.nchunks; i++) {
		chunk = cookie.chunk_array[i];
		WT_ASSERT(session, chunk != NULL);
		/* Skip the chunk if another worker is using it. */
		if (chunk->refcnt > 1) {
			++skipped;
			continue;
		}

		/*
		 * Don't remove files if a hot backup is in progress.
		 *
		 * The schema lock protects the set of live files, this check
		 * prevents us from removing a file that hot backup already
		 * knows about.
		 */
		if (S2C(session)->hot_backup)
			break;

		/*
		 * Drop any bloom filters and chunks we can.  Don't try to drop
		 * a chunk if the bloom filter drop fails.
		 *  An EBUSY return indicates that a cursor is still open in
		 *       the tree - move to the next chunk in that case.
		 *  An ENOENT return indicates that the LSM tree metadata was
		 *       out of sync with the on disk state.  Update the
		 *       metadata to match in that case.
		 */
		if (F_ISSET(chunk, WT_LSM_CHUNK_BLOOM)) {
			drop_ret = __lsm_drop_file(session, chunk->bloom_uri);
			if (drop_ret == EBUSY) {
				++skipped;
				continue;
			} else if (drop_ret != ENOENT)
				WT_ERR(drop_ret);

			flush_metadata = true;
			F_CLR(chunk, WT_LSM_CHUNK_BLOOM);
		}
		if (chunk->uri != NULL) {
			drop_ret = __lsm_drop_file(session, chunk->uri);
			if (drop_ret == EBUSY) {
				++skipped;
				continue;
			} else if (drop_ret != ENOENT)
				WT_ERR(drop_ret);
			flush_metadata = true;
		}

		/* Lock the tree to clear out the old chunk information. */
		WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));

		/*
		 * The chunk we are looking at should be the first one in the
		 * tree that we haven't already skipped over.
		 */
		WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk);
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, lsm_tree->old_chunks[skipped]);

		/* Shuffle down to keep all occupied slots at the beginning. */
		if (--lsm_tree->nold_chunks > skipped) {
			memmove(lsm_tree->old_chunks + skipped,
			    lsm_tree->old_chunks + skipped + 1,
			    (lsm_tree->nold_chunks - skipped) *
			    sizeof(WT_LSM_CHUNK *));
			lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL;
		}

		/*
		 * Clear the chunk in the cookie so we don't attempt to
		 * decrement the reference count.  Do this before the unlock:
		 * the chunk has just been freed, and if the unlock fails the
		 * error path unpins whatever is still in the cookie.
		 */
		cookie.chunk_array[i] = NULL;

		WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
	}

err:	/* Flush the metadata unless the system is in panic */
	if (flush_metadata && ret != WT_PANIC) {
		WT_TRET(__wt_lsm_tree_writelock(session, lsm_tree));
		WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
		WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
	}
	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);

	/* Let other threads free old chunks again. */
	lsm_tree->freeing_old_chunks = 0;

	/* Returning non-zero means there is no work to do. */
	if (!flush_metadata)
		WT_TRET(WT_NOTFOUND);

	return (ret);
}
/*
 * __lsm_free_chunks --
 *	Try to drop chunks from the tree that are no longer required.
 *
 *	Returns zero when at least one chunk was freed and no error occurred,
 *	WT_NOTFOUND when there was no work to do, or an error.  A chunk whose
 *	file is busy is skipped and does not by itself make the call fail.
 */
static int
__lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	WT_LSM_WORKER_COOKIE cookie;
	u_int i, skipped;
	int progress;

	/*
	 * Take a copy of the current state of the LSM tree and look for chunks
	 * to drop.  We do it this way to avoid holding the LSM tree lock while
	 * doing I/O or waiting on the schema lock.
	 *
	 * This is safe because only one thread will be in this function at a
	 * time (the first merge thread).  Merges may complete concurrently,
	 * and the old_chunks array may be extended, but we shuffle down the
	 * pointers each time we free one to keep the non-NULL slots at the
	 * beginning of the array.
	 */
	WT_CLEAR(cookie);
	WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 1));
	for (i = skipped = 0, progress = 0; i < cookie.nchunks; i++) {
		chunk = cookie.chunk_array[i];
		WT_ASSERT(session, chunk != NULL);
		/* Skip the chunk if another worker is using it. */
		if (chunk->refcnt > 1) {
			++skipped;
			continue;
		}

		if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM)) {
			/*
			 * An EBUSY return is acceptable - a cursor may still
			 * be positioned on this old chunk.  Reset the return
			 * value so a skipped chunk cannot leak EBUSY out of
			 * the function.
			 */
			ret = __lsm_drop_file(session, chunk->bloom_uri);
			if (ret == EBUSY) {
				ret = 0;
				WT_VERBOSE_ERR(session, lsm,
				    "LSM worker bloom drop busy: %s.",
				    chunk->bloom_uri);
				++skipped;
				continue;
			}
			WT_ERR(ret);

			F_CLR_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
		}
		if (chunk->uri != NULL) {
			/*
			 * An EBUSY return is acceptable - a cursor may still
			 * be positioned on this old chunk.  As above, don't
			 * let the skipped chunk's EBUSY linger.
			 */
			ret = __lsm_drop_file(session, chunk->uri);
			if (ret == EBUSY) {
				ret = 0;
				WT_VERBOSE_ERR(session, lsm,
				    "LSM worker drop busy: %s.",
				    chunk->uri);
				++skipped;
				continue;
			}
			WT_ERR(ret);
		}

		progress = 1;

		/* Lock the tree to clear out the old chunk information. */
		WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));

		/*
		 * The chunk we are looking at should be the first one in the
		 * tree that we haven't already skipped over.
		 */
		WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk);
		__wt_free(session, chunk->bloom_uri);
		__wt_free(session, chunk->uri);
		__wt_free(session, lsm_tree->old_chunks[skipped]);

		/* Shuffle down to keep all occupied slots at the beginning. */
		if (--lsm_tree->nold_chunks > skipped) {
			memmove(lsm_tree->old_chunks + skipped,
			    lsm_tree->old_chunks + skipped + 1,
			    (lsm_tree->nold_chunks - skipped) *
			    sizeof(WT_LSM_CHUNK *));
			lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL;
		}
		/*
		 * Clear the chunk in the cookie so we don't attempt to
		 * decrement the reference count.
		 */
		cookie.chunk_array[i] = NULL;

		/*
		 * Update the metadata.  We used to try to optimize by only
		 * updating the metadata once at the end, but the error
		 * handling is not straightforward.
		 *
		 * Always release the tree lock, but keep the first failure:
		 * the unlock result must not replace a metadata write error,
		 * so accumulate both and test the combined result afterwards.
		 */
		WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
		WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
		WT_ERR(ret);
	}

err:	__lsm_unpin_chunks(session, &cookie);
	__wt_free(session, cookie.chunk_array);

	/* Returning non-zero means there is no work to do. */
	if (!progress)
		WT_TRET(WT_NOTFOUND);

	return (ret);
}
/* * __wt_lsm_checkpoint_worker -- * A worker thread for an LSM tree, responsible for flushing new chunks to * disk. */ void * __wt_lsm_checkpoint_worker(void *arg) { WT_DECL_RET; WT_LSM_CHUNK *chunk; WT_LSM_TREE *lsm_tree; WT_LSM_WORKER_COOKIE cookie; WT_SESSION_IMPL *session; WT_TXN_ISOLATION saved_isolation; u_int i, j; int locked; lsm_tree = arg; session = lsm_tree->ckpt_session; WT_CLEAR(cookie); while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) { if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) { WT_WITH_SCHEMA_LOCK(session, ret = __wt_lsm_tree_switch(session, lsm_tree)); WT_ERR(ret); } WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0)); /* Write checkpoints in all completed files. */ for (i = 0, j = 0; i < cookie.nchunks - 1; i++) { if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) goto err; if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) break; chunk = cookie.chunk_array[i]; /* Stop if a running transaction needs the chunk. */ __wt_txn_update_oldest(session); if (!__wt_txn_visible_all(session, chunk->txnid_max)) break; /* * If the chunk is already checkpointed, make sure it * is also evicted. Either way, there is no point * trying to checkpoint it again. */ if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) { if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_EVICTED)) continue; if ((ret = __lsm_discard_handle( session, chunk->uri, NULL)) == 0) F_SET_ATOMIC( chunk, WT_LSM_CHUNK_EVICTED); else if (ret == EBUSY) ret = 0; else WT_ERR_MSG(session, ret, "discard handle"); continue; } WT_VERBOSE_ERR(session, lsm, "LSM worker flushing %u", i); /* * Flush the file before checkpointing: this is the * expensive part in terms of I/O: do it without * holding the schema lock. * * Use the special eviction isolation level to avoid * interfering with an application checkpoint: we have * already checked that all of the updates in this * chunk are globally visible. * * !!! We can wait here for checkpoints and fsyncs to * complete, which can be a long time. 
* * Don't keep waiting for the lock if application * threads are waiting for a switch. Don't skip * flushing the leaves either: that just means we'll * hold the schema lock for (much) longer, which blocks * the world. */ WT_ERR(__wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)); for (locked = 0; !locked && ret == 0 && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) { if ((ret = __wt_spin_trylock(session, &S2C(session)->checkpoint_lock)) == 0) locked = 1; else if (ret == EBUSY) { __wt_yield(); ret = 0; } } if (locked) { saved_isolation = session->txn.isolation; session->txn.isolation = TXN_ISO_EVICTION; ret = __wt_bt_cache_op( session, NULL, WT_SYNC_WRITE_LEAVES); session->txn.isolation = saved_isolation; __wt_spin_unlock( session, &S2C(session)->checkpoint_lock); } WT_TRET(__wt_session_release_btree(session)); WT_ERR(ret); if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) break; WT_VERBOSE_ERR(session, lsm, "LSM worker checkpointing %u", i); WT_WITH_SCHEMA_LOCK(session, ret = __wt_schema_worker(session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)); if (ret != 0) { __wt_err(session, ret, "LSM checkpoint"); break; } WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); /* * Clear the "cache resident" flag so the primary can * be evicted and eventually closed. Only do this once * the checkpoint has succeeded: otherwise, accessing * the leaf page during the checkpoint can trigger * forced eviction. */ WT_ERR(__wt_session_get_btree( session, chunk->uri, NULL, NULL, 0)); __wt_btree_evictable(session, 1); WT_ERR(__wt_session_release_btree(session)); ++j; WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1)); F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK); ret = __wt_lsm_meta_write(session, lsm_tree); ++lsm_tree->dsk_gen; /* Update the throttle time. */ __wt_lsm_tree_throttle(session, lsm_tree); WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree)); /* Make sure we aren't pinning a transaction ID. 
*/ __wt_txn_release_snapshot(session); if (ret != 0) { __wt_err(session, ret, "LSM checkpoint metadata write"); break; } WT_VERBOSE_ERR(session, lsm, "LSM worker checkpointed %u", i); } __lsm_unpin_chunks(session, &cookie); if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) && !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) WT_ERR_TIMEDOUT_OK(__wt_cond_wait( session, lsm_tree->work_cond, 100000)); } err: __lsm_unpin_chunks(session, &cookie); __wt_free(session, cookie.chunk_array); /* * The thread will only exit with failure if we run out of memory or * there is some other system driven failure. We can't keep going * after such a failure - ensure WiredTiger shuts down. */ if (ret != 0 && ret != WT_NOTFOUND) WT_PANIC_ERR(session, ret, "Shutting down LSM checkpoint utility thread"); return (NULL); }