/*
 * __wt_page_out --
 *     Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
{
    WT_PAGE *page;

    /*
     * When a page is discarded, it's been disconnected from its parent and
     * its parent's WT_REF structure may now point to a different page.
     * Make sure we don't accidentally use the page itself or any other
     * information.
     */
    page = *pagep;
    *pagep = NULL;
    page->parent = NULL;
    page->ref = NULL;

    WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));

#ifdef HAVE_DIAGNOSTIC
    {
    WT_HAZARD *hp;

    if ((hp = __wt_page_hazard_check(session, page)) != NULL)
        __wt_errx(session,
            "discarded page has hazard pointer: (%p: %s, line %d)",
            hp->page, hp->file, hp->line);
    }
#endif

    /* Update the cache's information. */
    __wt_cache_page_evict(session, page);

    /* Free the page modification information. */
    if (page->modify != NULL)
        __free_page_modify(session, page);

    switch (page->type) {
    case WT_PAGE_COL_FIX:
        break;
    case WT_PAGE_COL_INT:
        __free_page_col_int(session, page);
        break;
    case WT_PAGE_COL_VAR:
        __free_page_col_var(session, page);
        break;
    case WT_PAGE_ROW_INT:
        __free_page_row_int(session, page);
        break;
    case WT_PAGE_ROW_LEAF:
        __free_page_row_leaf(session, page);
        break;
    }

    /* Free any allocated disk image. */
    if (!F_ISSET_ATOMIC(page, WT_PAGE_DISK_NOT_ALLOC))
        __wt_free(session, page->dsk);

    __wt_overwrite_and_free(session, page);
}
/*
 * __wt_lsm_tree_drop --
 *     Drop an LSM tree.
 */
int
__wt_lsm_tree_drop(
    WT_SESSION_IMPL *session, const char *name, const char *cfg[])
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    u_int i;
    int locked;

    locked = 0;

    /* Get the LSM tree. */
    WT_RET(__wt_lsm_tree_get(session, name, 1, &lsm_tree));

    /* Shut down the LSM worker. */
    WT_ERR(__lsm_tree_close(session, lsm_tree));

    /* Prevent any new opens. */
    WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
    locked = 1;

    /* Drop the chunks. */
    for (i = 0; i < lsm_tree->nchunks; i++) {
        chunk = lsm_tree->chunk[i];
        WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
        if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM))
            WT_ERR(
                __wt_schema_drop(session, chunk->bloom_uri, cfg));
    }

    /* Drop any chunks on the obsolete list. */
    for (i = 0; i < lsm_tree->nold_chunks; i++) {
        if ((chunk = lsm_tree->old_chunks[i]) == NULL)
            continue;
        WT_ERR(__wt_schema_drop(session, chunk->uri, cfg));
        if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM))
            WT_ERR(
                __wt_schema_drop(session, chunk->bloom_uri, cfg));
    }

    locked = 0;
    WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
    ret = __wt_metadata_remove(session, name);

err:    if (locked)
        WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
    WT_TRET(__lsm_tree_discard(session, lsm_tree));
    return (ret);
}
/*
 * __free_page_row_leaf --
 *     Discard a WT_PAGE_ROW_LEAF page.
 */
static void
__free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_IKEY *ikey;
    WT_ROW *rip;
    uint32_t i;
    void *copy;
    bool update_ignore;

    /* In some failed-split cases, we can't discard updates. */
    update_ignore = F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE);

    /*
     * Free the in-memory index array.
     *
     * For each entry, see if the key was an allocation (that is, if it
     * points somewhere other than the original page), and if so, free
     * the memory.
     */
    WT_ROW_FOREACH(page, rip, i) {
        copy = WT_ROW_KEY_COPY(rip);
        (void)__wt_row_leaf_key_info(
            page, copy, &ikey, NULL, NULL, NULL);
        if (ikey != NULL)
            __wt_free(session, ikey);
    }

    /*
     * Free the insert array (it has one extra slot to hold keys that sort
     * before any key found on the original page), and the update array.
     */
    if (page->pg_row_ins != NULL)
        __free_skip_array(session,
            page->pg_row_ins, page->pg_row_entries + 1, update_ignore);
    if (page->pg_row_upd != NULL)
        __free_update(session,
            page->pg_row_upd, page->pg_row_entries, update_ignore);
}
/*
 * __wt_lsm_tree_get --
 *     Get an LSM tree structure for the given name.  Optionally get
 *     exclusive access to the handle.  Exclusive access works separately
 *     from the LSM tree lock, since operations that need exclusive access
 *     may also need to take the LSM tree lock, for example outstanding
 *     work unit operations.
 */
int
__wt_lsm_tree_get(WT_SESSION_IMPL *session,
    const char *uri, int exclusive, WT_LSM_TREE **treep)
{
    WT_LSM_TREE *lsm_tree;

    /* See if the tree is already open. */
    TAILQ_FOREACH(lsm_tree, &S2C(session)->lsmqh, q)
        if (strcmp(uri, lsm_tree->name) == 0) {
            /*
             * Short circuit if the handle is already held
             * exclusively or exclusive access is requested and
             * there are references held.
             */
            if ((exclusive && lsm_tree->refcnt > 0) ||
                F_ISSET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE))
                return (EBUSY);

            if (exclusive) {
                F_SET_ATOMIC(lsm_tree, WT_LSM_TREE_EXCLUSIVE);
                if (!WT_ATOMIC_CAS4(lsm_tree->refcnt, 0, 1)) {
                    F_CLR_ATOMIC(
                        lsm_tree, WT_LSM_TREE_EXCLUSIVE);
                    return (EBUSY);
                }
            } else
                (void)WT_ATOMIC_ADD4(lsm_tree->refcnt, 1);

            /*
             * If we got a reference, but an exclusive reference
             * beat us to it, give our reference up.
             */
            if (!exclusive && F_ISSET_ATOMIC(
                lsm_tree, WT_LSM_TREE_EXCLUSIVE)) {
                (void)WT_ATOMIC_SUB4(lsm_tree->refcnt, 1);
                return (EBUSY);
            }

            *treep = lsm_tree;
            return (0);
        }

    /* Open a new tree. */
    return (__lsm_tree_open(session, uri, treep));
}
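/*
 * Standalone sketch (not WiredTiger code) of the handshake __wt_lsm_tree_get
 * relies on above: an exclusive opener publishes its flag first and then uses
 * compare-and-swap to claim the only reference, while a shared opener bumps
 * the count and backs off if the exclusive flag appeared concurrently.  The
 * handle_t type and function names below are illustrative only, written with
 * C11 atomics rather than the WT_ATOMIC_* macros.
 */
#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>

typedef struct {
    atomic_uint refcnt;      /* Shared references. */
    atomic_bool exclusive;   /* Exclusive owner present. */
} handle_t;

/* Try to become the single exclusive owner of the handle. */
static int
acquire_exclusive(handle_t *h)
{
    unsigned int expected = 0;

    if (atomic_load(&h->refcnt) > 0 || atomic_load(&h->exclusive))
        return (EBUSY);
    /* Publish the intent first, so new readers back off. */
    atomic_store(&h->exclusive, true);
    /* Claim the only reference; fail if a reader slipped in. */
    if (!atomic_compare_exchange_strong(&h->refcnt, &expected, 1)) {
        atomic_store(&h->exclusive, false);
        return (EBUSY);
    }
    return (0);
}

/* Take a shared reference, giving way to a concurrent exclusive opener. */
static int
acquire_shared(handle_t *h)
{
    if (atomic_load(&h->exclusive))
        return (EBUSY);
    atomic_fetch_add(&h->refcnt, 1);
    /* An exclusive opener may have published its flag after our check. */
    if (atomic_load(&h->exclusive)) {
        atomic_fetch_sub(&h->refcnt, 1);
        return (EBUSY);
    }
    return (0);
}

int
main(void)
{
    handle_t h = { 0, false };

    /* A shared reference blocks a later exclusive open. */
    return (acquire_shared(&h) == 0 &&
        acquire_exclusive(&h) == EBUSY ? 0 : 1);
}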
/*
 * __lsm_bloom_work --
 *     Try to create a Bloom filter for the newest on-disk chunk.
 */
static int
__lsm_bloom_work(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_WORKER_COOKIE cookie;
    u_int i;

    WT_CLEAR(cookie);
    /* If no work is done, tell our caller by returning WT_NOTFOUND. */
    ret = WT_NOTFOUND;

    WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

    /* Create Bloom filters in all checkpointed chunks. */
    for (i = 0; i < cookie.nchunks; i++) {
        chunk = cookie.chunk_array[i];

        /*
         * Skip if a thread is still active in the chunk or it
         * isn't suitable.
         */
        if (!F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK) ||
            F_ISSET_ATOMIC(chunk,
                WT_LSM_CHUNK_BLOOM | WT_LSM_CHUNK_MERGING) ||
            chunk->generation > 0 ||
            chunk->count == 0)
            continue;

        /* See if we win the race to switch on the "busy" flag. */
        if (WT_ATOMIC_CAS(chunk->bloom_busy, 0, 1)) {
            ret = __lsm_bloom_create(
                session, lsm_tree, chunk, (u_int)i);
            chunk->bloom_busy = 0;
            break;
        }
    }

    __lsm_unpin_chunks(session, &cookie);
    __wt_free(session, cookie.chunk_array);

    return (ret);
}
/*
 * __wt_lsm_tree_worker --
 *     Run a schema worker operation on each level of an LSM tree.
 */
int
__wt_lsm_tree_worker(WT_SESSION_IMPL *session, const char *uri,
    int (*file_func)(WT_SESSION_IMPL *, const char *[]),
    int (*name_func)(WT_SESSION_IMPL *, const char *, int *),
    const char *cfg[], uint32_t open_flags)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    u_int i;

    WT_RET(__wt_lsm_tree_get(session, uri,
        FLD_ISSET(open_flags, WT_DHANDLE_EXCLUSIVE) ? 1 : 0, &lsm_tree));

    /*
     * We mark that we're busy using the tree to coordinate with merges so
     * that merging doesn't change the chunk array out from underneath us.
     */
    WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 0));
    for (i = 0; i < lsm_tree->nchunks; i++) {
        chunk = lsm_tree->chunk[i];
        if (file_func == __wt_checkpoint &&
            F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK))
            continue;
        WT_ERR(__wt_schema_worker(session, chunk->uri,
            file_func, name_func, cfg, open_flags));
        if (name_func == __wt_backup_list_uri_append &&
            F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM))
            WT_ERR(__wt_schema_worker(session, chunk->bloom_uri,
                file_func, name_func, cfg, open_flags));
    }

err:    WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
    __wt_lsm_tree_release(session, lsm_tree);
    return (ret);
}
/*
 * __wt_lsm_tree_switch --
 *     Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    uint32_t nchunks, new_id;

    WT_RET(__wt_lsm_tree_lock(session, lsm_tree, 1));

    /*
     * Check if a switch is still needed: we may have raced while waiting
     * for a lock.
     */
    if ((nchunks = lsm_tree->nchunks) != 0 &&
        (chunk = lsm_tree->chunk[nchunks - 1]) != NULL &&
        !F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK) &&
        !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
        goto err;

    /* Update the throttle time. */
    __wt_lsm_tree_throttle(session, lsm_tree);

    new_id = WT_ATOMIC_ADD(lsm_tree->last, 1);

    WT_ERR(__wt_realloc_def(session, &lsm_tree->chunk_alloc,
        nchunks + 1, &lsm_tree->chunk));

    WT_VERBOSE_ERR(session, lsm,
        "Tree switch to: %" PRIu32 ", throttle %ld",
        new_id, lsm_tree->throttle_sleep);

    WT_ERR(__wt_calloc_def(session, 1, &chunk));
    chunk->id = new_id;
    chunk->txnid_max = WT_TXN_NONE;
    lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
    WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

    WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
    F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
    ++lsm_tree->dsk_gen;

    lsm_tree->modified = 1;

err:    /* TODO: mark lsm_tree bad on error(?) */
    WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
    return (ret);
}
/*
 * __split_ovfl_key_cleanup --
 *     Handle cleanup for on-page row-store overflow keys.
 */
static int
__split_ovfl_key_cleanup(WT_SESSION_IMPL *session, WT_PAGE *page, WT_REF *ref)
{
    WT_CELL *cell;
    WT_CELL_UNPACK kpack;
    WT_IKEY *ikey;
    uint32_t cell_offset;

    /* There's a per-page flag if there are any overflow keys at all. */
    if (!F_ISSET_ATOMIC(page, WT_PAGE_OVERFLOW_KEYS))
        return (0);

    /*
     * A key being discarded (page split) or moved to a different page (page
     * deepening) may be an on-page overflow key.  Clear any reference to an
     * underlying disk image, and, if the key hasn't been deleted, delete it
     * along with any backing blocks.
     */
    if ((ikey = __wt_ref_key_instantiated(ref)) == NULL)
        return (0);
    if ((cell_offset = ikey->cell_offset) == 0)
        return (0);

    /* Leak blocks rather than try this twice. */
    ikey->cell_offset = 0;

    cell = WT_PAGE_REF_OFFSET(page, cell_offset);
    __wt_cell_unpack(cell, &kpack);
    if (kpack.ovfl && kpack.raw != WT_CELL_KEY_OVFL_RM) {
        /*
         * Eviction cannot free overflow items once a checkpoint is
         * running in a tree: that can corrupt the checkpoint's block
         * management.  Assert that checkpoints aren't running to make
         * sure we're catching all paths and to avoid regressions.
         */
        WT_ASSERT(session,
            S2BT(session)->checkpointing != WT_CKPT_RUNNING);

        WT_RET(__wt_ovfl_discard(session, cell));
    }

    return (0);
}
/*
 * __wt_page_out --
 *     Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
{
    WT_PAGE *page;
    WT_PAGE_HEADER *dsk;
    WT_PAGE_MODIFY *mod;

    /*
     * Kill our caller's reference, do our best to catch races.
     */
    page = *pagep;
    *pagep = NULL;

    if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
        __wt_page_modify_clear(session, page);

    /*
     * We should never discard:
     * - a dirty page,
     * - a page queued for eviction, or
     * - a locked page.
     */
    WT_ASSERT(session, !__wt_page_is_modified(page));
    WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
    WT_ASSERT(session, !__wt_fair_islocked(session, &page->page_lock));

#ifdef HAVE_DIAGNOSTIC
    {
    WT_HAZARD *hp;
    int i;

    /*
     * Make sure no other thread has a hazard pointer on the page we are
     * about to discard.  This is complicated by the fact that readers
     * publish their hazard pointer before re-checking the page state, so
     * our check can race with readers without indicating a real problem.
     * Wait for up to a second for hazard pointers to be cleared.
     */
    for (hp = NULL, i = 0; i < 100; i++) {
        if ((hp = __wt_page_hazard_check(session, page)) == NULL)
            break;
        __wt_sleep(0, 10000);
    }
    if (hp != NULL)
        __wt_errx(session,
            "discarded page has hazard pointer: (%p: %s, line %d)",
            hp->page, hp->file, hp->line);
    WT_ASSERT(session, hp == NULL);
    }
#endif

    /*
     * If a root page split, there may be one or more pages linked from the
     * page; walk the list, discarding pages.
     */
    switch (page->type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        mod = page->modify;
        if (mod != NULL && mod->mod_root_split != NULL)
            __wt_page_out(session, &mod->mod_root_split);
        break;
    }

    /* Update the cache's information. */
    __wt_cache_page_evict(session, page);

    /*
     * If discarding the page as part of process exit, the application may
     * configure to leak the memory rather than do the work.
     */
    if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY))
        return;

    /* Free the page modification information. */
    if (page->modify != NULL)
        __free_page_modify(session, page);

    switch (page->type) {
    case WT_PAGE_COL_FIX:
        break;
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        __free_page_int(session, page);
        break;
    case WT_PAGE_COL_VAR:
        __free_page_col_var(session, page);
        break;
    case WT_PAGE_ROW_LEAF:
        __free_page_row_leaf(session, page);
        break;
    }

    /* Discard any disk image. */
    dsk = (WT_PAGE_HEADER *)page->dsk;
    if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
        __wt_overwrite_and_free_len(session, dsk, dsk->mem_size);
    if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
        (void)__wt_mmap_discard(session, dsk, dsk->mem_size);

    __wt_overwrite_and_free(session, page);
}
/*
 * __free_page_modify --
 *     Discard the page's associated modification structures.
 */
static void
__free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_INSERT_HEAD *append;
    WT_MULTI *multi;
    WT_PAGE_MODIFY *mod;
    uint32_t i;
    bool update_ignore;

    mod = page->modify;

    /* In some failed-split cases, we can't discard updates. */
    update_ignore = F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE);

    switch (mod->rec_result) {
    case WT_PM_REC_MULTIBLOCK:
        /* Free list of replacement blocks. */
        for (multi = mod->mod_multi,
            i = 0; i < mod->mod_multi_entries; ++multi, ++i) {
            switch (page->type) {
            case WT_PAGE_ROW_INT:
            case WT_PAGE_ROW_LEAF:
                __wt_free(session, multi->key.ikey);
                break;
            }
            __wt_free(session, multi->supd);
            __wt_free(session, multi->disk_image);
            __wt_free(session, multi->addr.addr);
        }
        __wt_free(session, mod->mod_multi);
        break;
    case WT_PM_REC_REPLACE:
        /*
         * Discard any replacement address: this memory is usually moved
         * into the parent's WT_REF, but at the root that can't happen.
         */
        __wt_free(session, mod->mod_replace.addr);
        break;
    }

    switch (page->type) {
    case WT_PAGE_COL_FIX:
    case WT_PAGE_COL_VAR:
        /* Free the append array. */
        if ((append = WT_COL_APPEND(page)) != NULL) {
            __free_skip_list(
                session, WT_SKIP_FIRST(append), update_ignore);
            __wt_free(session, append);
            __wt_free(session, mod->mod_append);
        }

        /* Free the insert/update array. */
        if (mod->mod_update != NULL)
            __free_skip_array(session, mod->mod_update,
                page->type == WT_PAGE_COL_FIX ?
                1 : page->pg_var_entries, update_ignore);
        break;
    }

    /* Free the overflow on-page, reuse and transaction-cache skiplists. */
    __wt_ovfl_reuse_free(session, page);
    __wt_ovfl_txnc_free(session, page);
    __wt_ovfl_discard_free(session, page);

    __wt_free(session, page->modify->ovfl_track);
    __wt_free(session, page->modify);
}
/*
 * __evict_stat_walk --
 *     Walk all the pages in cache for a dhandle, gathering stats information.
 */
static void
__evict_stat_walk(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;
    WT_CACHE *cache;
    WT_PAGE *page;
    WT_REF *next_walk;
    uint64_t dsk_size, gen_gap, gen_gap_max, gen_gap_sum, max_pagesize;
    uint64_t min_written_size, num_memory, num_not_queueable, num_queued;
    uint64_t num_smaller_allocsz, pages_clean, pages_dirty, pages_internal;
    uint64_t pages_leaf, seen_count, size, visited_count;
    uint64_t visited_age_gap_sum, unvisited_count, unvisited_age_gap_sum;
    uint64_t walk_count, written_size_cnt, written_size_sum;

    btree = S2BT(session);
    cache = S2C(session)->cache;
    next_walk = NULL;

    gen_gap_max = gen_gap_sum = max_pagesize = 0;
    num_memory = num_not_queueable = num_queued = 0;
    num_smaller_allocsz = pages_clean = pages_dirty = pages_internal = 0;
    pages_leaf = seen_count = size = visited_count = 0;
    visited_age_gap_sum = unvisited_count = unvisited_age_gap_sum = 0;
    walk_count = written_size_cnt = written_size_sum = 0;
    min_written_size = UINT64_MAX;

    while (__wt_tree_walk_count(session, &next_walk, &walk_count,
        WT_READ_CACHE | WT_READ_NO_EVICT |
        WT_READ_NO_GEN | WT_READ_NO_WAIT) == 0 &&
        next_walk != NULL) {
        ++seen_count;
        page = next_walk->page;
        size = page->memory_footprint;

        if (__wt_page_is_modified(page))
            ++pages_dirty;
        else
            ++pages_clean;

        if (!__wt_ref_is_root(next_walk) &&
            !__wt_page_can_evict(session, next_walk, NULL))
            ++num_not_queueable;

        if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU))
            ++num_queued;

        if (size > max_pagesize)
            max_pagesize = size;

        dsk_size = page->dsk != NULL ? page->dsk->mem_size : 0;
        if (dsk_size != 0) {
            if (dsk_size < btree->allocsize)
                ++num_smaller_allocsz;
            if (dsk_size < min_written_size)
                min_written_size = dsk_size;
            ++written_size_cnt;
            written_size_sum += dsk_size;
        } else
            ++num_memory;

        if (WT_PAGE_IS_INTERNAL(page))
            ++pages_internal;
        else
            ++pages_leaf;

        /* Skip root pages since they are never considered. */
        if (__wt_ref_is_root(next_walk))
            continue;

        if (page->evict_pass_gen == 0) {
            unvisited_age_gap_sum +=
                (cache->evict_pass_gen - page->cache_create_gen);
            ++unvisited_count;
        } else {
            visited_age_gap_sum +=
                (cache->evict_pass_gen - page->cache_create_gen);
            gen_gap = cache->evict_pass_gen - page->evict_pass_gen;
            if (gen_gap > gen_gap_max)
                gen_gap_max = gen_gap;
            gen_gap_sum += gen_gap;
            ++visited_count;
        }
    }

    WT_STAT_DATA_SET(session, cache_state_gen_avg_gap,
        visited_count == 0 ? 0 : gen_gap_sum / visited_count);
    WT_STAT_DATA_SET(session, cache_state_avg_unvisited_age,
        unvisited_count == 0 ? 0 : unvisited_age_gap_sum / unvisited_count);
    WT_STAT_DATA_SET(session, cache_state_avg_visited_age,
        visited_count == 0 ? 0 : visited_age_gap_sum / visited_count);
    WT_STAT_DATA_SET(session, cache_state_avg_written_size,
        written_size_cnt == 0 ? 0 : written_size_sum / written_size_cnt);
    WT_STAT_DATA_SET(session, cache_state_gen_max_gap, gen_gap_max);
    WT_STAT_DATA_SET(session, cache_state_max_pagesize, max_pagesize);
    WT_STAT_DATA_SET(session,
        cache_state_min_written_size, min_written_size);
    WT_STAT_DATA_SET(session, cache_state_memory, num_memory);
    WT_STAT_DATA_SET(session, cache_state_queued, num_queued);
    WT_STAT_DATA_SET(session, cache_state_not_queueable, num_not_queueable);
    WT_STAT_DATA_SET(session, cache_state_pages, walk_count);
    WT_STAT_DATA_SET(session, cache_state_pages_clean, pages_clean);
    WT_STAT_DATA_SET(session, cache_state_pages_dirty, pages_dirty);
    WT_STAT_DATA_SET(session, cache_state_pages_internal, pages_internal);
    WT_STAT_DATA_SET(session, cache_state_pages_leaf, pages_leaf);
    WT_STAT_DATA_SET(session,
        cache_state_refs_skipped, walk_count - seen_count);
    WT_STAT_DATA_SET(session,
        cache_state_smaller_alloc_size, num_smaller_allocsz);
    WT_STAT_DATA_SET(session,
        cache_state_unvisited_count, unvisited_count);
}
/*
 * __wt_lsm_tree_throttle --
 *     Calculate whether LSM updates need to be throttled.
 */
void
__wt_lsm_tree_throttle(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_LSM_CHUNK *chunk, **cp, *prev_chunk;
    uint64_t cache_sz, cache_used, in_memory, record_count;
    uint64_t oldtime, timediff;
    uint32_t i;

    /* Never throttle in small trees. */
    if (lsm_tree->nchunks < 3)
        return;

    cache_sz = S2C(session)->cache_size;

    /*
     * In the steady state, we expect that the checkpoint worker thread
     * will keep up with inserts.  If not, throttle the insert rate to
     * avoid filling the cache with in-memory chunks.  Threads sleep every
     * 100 operations, so take that into account in the calculation.
     *
     * Count the number of in-memory chunks, and find the most recent
     * on-disk chunk (if any).
     */
    record_count = 1;
    for (i = in_memory = 0, cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
        i < lsm_tree->nchunks;
        ++i, --cp)
        if (!F_ISSET_ATOMIC(*cp, WT_LSM_CHUNK_ONDISK)) {
            record_count += (*cp)->count;
            ++in_memory;
        } else if ((*cp)->generation == 0 ||
            F_ISSET_ATOMIC(*cp, WT_LSM_CHUNK_STABLE))
            break;

    chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];

    if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
        lsm_tree->throttle_sleep = 0;
    else if (i == lsm_tree->nchunks ||
        F_ISSET_ATOMIC(*cp, WT_LSM_CHUNK_STABLE)) {
        /*
         * No checkpoint has completed this run.  Keep slowing down
         * inserts until one does.
         */
        lsm_tree->throttle_sleep =
            WT_MAX(20, 2 * lsm_tree->throttle_sleep);
    } else {
        WT_ASSERT(session,
            WT_TIMECMP(chunk->create_ts, (*cp)->create_ts) >= 0);
        timediff = WT_TIMEDIFF(chunk->create_ts, (*cp)->create_ts);
        lsm_tree->throttle_sleep =
            (long)((in_memory - 2) * timediff / (20 * record_count));

        /*
         * Get more aggressive as the number of in memory chunks
         * consumes a large proportion of the cache.  In memory chunks
         * are allowed to grow up to twice as large as the configured
         * value when checkpoints aren't keeping up.  That worst case
         * is when this calculation is relevant.  There is nothing
         * particularly special about the chosen multipliers.
         */
        cache_used = in_memory * lsm_tree->chunk_size * 2;
        if (cache_used > cache_sz * 0.8)
            lsm_tree->throttle_sleep *= 5;
    }

    /*
     * Update our estimate of how long each in-memory chunk stays active.
     * Filter out some noise by keeping a weighted history of the
     * calculated value.  Wait until we have enough chunks that we can
     * check that the new value is sane: otherwise, after a long idle
     * period, we can calculate a crazy value.
     */
    if (in_memory > 1 && i != lsm_tree->nchunks &&
        !F_ISSET_ATOMIC(*cp, WT_LSM_CHUNK_STABLE)) {
        prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
        WT_ASSERT(session, prev_chunk->generation == 0);
        WT_ASSERT(session, WT_TIMECMP(
            chunk->create_ts, prev_chunk->create_ts) >= 0);
        timediff = WT_TIMEDIFF(chunk->create_ts, prev_chunk->create_ts);
        WT_ASSERT(session, WT_TIMECMP(
            prev_chunk->create_ts, (*cp)->create_ts) >= 0);
        oldtime = WT_TIMEDIFF(prev_chunk->create_ts, (*cp)->create_ts);
        if (timediff < 10 * oldtime)
            lsm_tree->chunk_fill_ms =
                (3 * lsm_tree->chunk_fill_ms +
                timediff / 1000000) / 4;
    }
}
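/*
 * Standalone sketch (not WiredTiger code) of the steady-state throttle
 * formula used above: sleep = (in_memory - 2) * timediff / (20 * record_count),
 * where timediff is how long the newest chunk took to fill and record_count
 * is the number of records buffered in in-memory chunks.  The result is in
 * the same time units as timediff; the function and variable names are
 * illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

static long
throttle_sleep(uint64_t in_memory, uint64_t timediff, uint64_t record_count)
{
    /* The caller above only throttles with more than 3 in-memory chunks. */
    if (in_memory <= 3 || record_count == 0)
        return (0);
    return ((long)((in_memory - 2) * timediff / (20 * record_count)));
}

int
main(void)
{
    /* Example: 5 in-memory chunks, 30s (in ns) fill time, 1M records. */
    printf("throttle sleep: %ld\n",
        throttle_sleep(5, 30000000000ULL, 1000000));
    return (0);
}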
/*
 * __lsm_free_chunks --
 *     Try to drop chunks from the tree that are no longer required.
 */
static int
__lsm_free_chunks(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_WORKER_COOKIE cookie;
    u_int i, skipped;
    int progress;

    /*
     * Take a copy of the current state of the LSM tree and look for chunks
     * to drop.  We do it this way to avoid holding the LSM tree lock while
     * doing I/O or waiting on the schema lock.
     *
     * This is safe because only one thread will be in this function at a
     * time (the first merge thread).  Merges may complete concurrently,
     * and the old_chunks array may be extended, but we shuffle down the
     * pointers each time we free one to keep the non-NULL slots at the
     * beginning of the array.
     */
    WT_CLEAR(cookie);
    WT_RET(__lsm_copy_chunks(session, lsm_tree, &cookie, 1));
    for (i = skipped = 0, progress = 0; i < cookie.nchunks; i++) {
        chunk = cookie.chunk_array[i];
        WT_ASSERT(session, chunk != NULL);
        /* Skip the chunk if another worker is using it. */
        if (chunk->refcnt > 1) {
            ++skipped;
            continue;
        }

        if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM)) {
            /*
             * An EBUSY return is acceptable - a cursor may still
             * be positioned on this old chunk.
             */
            if ((ret = __lsm_drop_file(
                session, chunk->bloom_uri)) == EBUSY) {
                WT_VERBOSE_ERR(session, lsm,
                    "LSM worker bloom drop busy: %s.",
                    chunk->bloom_uri);
                ++skipped;
                continue;
            } else
                WT_ERR(ret);

            F_CLR_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
        }
        if (chunk->uri != NULL) {
            /*
             * An EBUSY return is acceptable - a cursor may still
             * be positioned on this old chunk.
             */
            if ((ret = __lsm_drop_file(
                session, chunk->uri)) == EBUSY) {
                WT_VERBOSE_ERR(session, lsm,
                    "LSM worker drop busy: %s.", chunk->uri);
                ++skipped;
                continue;
            } else
                WT_ERR(ret);
        }

        progress = 1;

        /* Lock the tree to clear out the old chunk information. */
        WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));

        /*
         * The chunk we are looking at should be the first one in the
         * tree that we haven't already skipped over.
         */
        WT_ASSERT(session, lsm_tree->old_chunks[skipped] == chunk);
        __wt_free(session, chunk->bloom_uri);
        __wt_free(session, chunk->uri);
        __wt_free(session, lsm_tree->old_chunks[skipped]);

        /* Shuffle down to keep all occupied slots at the beginning. */
        if (--lsm_tree->nold_chunks > skipped) {
            memmove(lsm_tree->old_chunks + skipped,
                lsm_tree->old_chunks + skipped + 1,
                (lsm_tree->nold_chunks - skipped) *
                sizeof(WT_LSM_CHUNK *));
            lsm_tree->old_chunks[lsm_tree->nold_chunks] = NULL;
        }

        /*
         * Clear the chunk in the cookie so we don't attempt to
         * decrement the reference count.
         */
        cookie.chunk_array[i] = NULL;

        /*
         * Update the metadata.  We used to try to optimize by only
         * updating the metadata once at the end, but the error
         * handling is not straightforward.
         */
        WT_TRET(__wt_lsm_meta_write(session, lsm_tree));
        WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
    }

err:    __lsm_unpin_chunks(session, &cookie);
    __wt_free(session, cookie.chunk_array);

    /* Returning non-zero means there is no work to do. */
    if (!progress)
        WT_TRET(WT_NOTFOUND);

    return (ret);
}
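/*
 * Standalone illustration (not WiredTiger code) of the "shuffle down" step
 * used by __lsm_free_chunks() above: remove one slot from a pointer array
 * with memmove() while keeping the occupied slots contiguous at the front
 * and NULL-ing the now-unused tail slot.
 */
#include <stdio.h>
#include <string.h>

static void
remove_slot(const char **array, size_t *countp, size_t slot)
{
    if (--*countp > slot)
        memmove(array + slot, array + slot + 1,
            (*countp - slot) * sizeof(array[0]));
    array[*countp] = NULL;
}

int
main(void)
{
    const char *chunks[] = { "chunk-1", "chunk-2", "chunk-3", NULL };
    size_t i, nchunks = 3;

    remove_slot(chunks, &nchunks, 1);       /* Drop "chunk-2". */
    for (i = 0; i < nchunks; ++i)
        printf("%s\n", chunks[i]);
    return (0);
}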
/*
 * __wt_lsm_checkpoint_worker --
 *     A worker thread for an LSM tree, responsible for flushing new chunks
 *     to disk.
 */
void *
__wt_lsm_checkpoint_worker(void *arg)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    WT_LSM_WORKER_COOKIE cookie;
    WT_SESSION_IMPL *session;
    WT_TXN_ISOLATION saved_isolation;
    u_int i, j;
    int locked;

    lsm_tree = arg;
    session = lsm_tree->ckpt_session;

    WT_CLEAR(cookie);

    while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
        if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH)) {
            WT_WITH_SCHEMA_LOCK(session,
                ret = __wt_lsm_tree_switch(session, lsm_tree));
            WT_ERR(ret);
        }

        WT_ERR(__lsm_copy_chunks(session, lsm_tree, &cookie, 0));

        /* Write checkpoints in all completed files. */
        for (i = 0, j = 0; i < cookie.nchunks - 1; i++) {
            if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING))
                goto err;

            if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
                break;

            chunk = cookie.chunk_array[i];

            /* Stop if a running transaction needs the chunk. */
            __wt_txn_update_oldest(session);
            if (!__wt_txn_visible_all(session, chunk->txnid_max))
                break;

            /*
             * If the chunk is already checkpointed, make sure it
             * is also evicted.  Either way, there is no point
             * trying to checkpoint it again.
             */
            if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK)) {
                if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_EVICTED))
                    continue;

                if ((ret = __lsm_discard_handle(
                    session, chunk->uri, NULL)) == 0)
                    F_SET_ATOMIC(
                        chunk, WT_LSM_CHUNK_EVICTED);
                else if (ret == EBUSY)
                    ret = 0;
                else
                    WT_ERR_MSG(session, ret, "discard handle");
                continue;
            }

            WT_VERBOSE_ERR(session, lsm,
                "LSM worker flushing %u", i);

            /*
             * Flush the file before checkpointing: this is the
             * expensive part in terms of I/O: do it without
             * holding the schema lock.
             *
             * Use the special eviction isolation level to avoid
             * interfering with an application checkpoint: we have
             * already checked that all of the updates in this
             * chunk are globally visible.
             *
             * !!! We can wait here for checkpoints and fsyncs to
             * complete, which can be a long time.
             *
             * Don't keep waiting for the lock if application
             * threads are waiting for a switch.  Don't skip
             * flushing the leaves either: that just means we'll
             * hold the schema lock for (much) longer, which blocks
             * the world.
             */
            WT_ERR(__wt_session_get_btree(
                session, chunk->uri, NULL, NULL, 0));
            for (locked = 0; !locked && ret == 0 &&
                !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH);) {
                if ((ret = __wt_spin_trylock(session,
                    &S2C(session)->checkpoint_lock)) == 0)
                    locked = 1;
                else if (ret == EBUSY) {
                    __wt_yield();
                    ret = 0;
                }
            }
            if (locked) {
                saved_isolation = session->txn.isolation;
                session->txn.isolation = TXN_ISO_EVICTION;
                ret = __wt_bt_cache_op(
                    session, NULL, WT_SYNC_WRITE_LEAVES);
                session->txn.isolation = saved_isolation;
                __wt_spin_unlock(
                    session, &S2C(session)->checkpoint_lock);
            }
            WT_TRET(__wt_session_release_btree(session));
            WT_ERR(ret);

            if (F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
                break;

            WT_VERBOSE_ERR(session, lsm,
                "LSM worker checkpointing %u", i);

            WT_WITH_SCHEMA_LOCK(session,
                ret = __wt_schema_worker(session, chunk->uri,
                __wt_checkpoint, NULL, NULL, 0));

            if (ret != 0) {
                __wt_err(session, ret, "LSM checkpoint");
                break;
            }

            WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));

            /*
             * Clear the "cache resident" flag so the primary can
             * be evicted and eventually closed.  Only do this once
             * the checkpoint has succeeded: otherwise, accessing
             * the leaf page during the checkpoint can trigger
             * forced eviction.
             */
            WT_ERR(__wt_session_get_btree(
                session, chunk->uri, NULL, NULL, 0));
            __wt_btree_evictable(session, 1);
            WT_ERR(__wt_session_release_btree(session));

            ++j;
            WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
            F_SET_ATOMIC(chunk, WT_LSM_CHUNK_ONDISK);
            ret = __wt_lsm_meta_write(session, lsm_tree);
            ++lsm_tree->dsk_gen;

            /* Update the throttle time. */
            __wt_lsm_tree_throttle(session, lsm_tree);
            WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));

            /* Make sure we aren't pinning a transaction ID. */
            __wt_txn_release_snapshot(session);

            if (ret != 0) {
                __wt_err(session, ret,
                    "LSM checkpoint metadata write");
                break;
            }

            WT_VERBOSE_ERR(session, lsm,
                "LSM worker checkpointed %u", i);
        }
        __lsm_unpin_chunks(session, &cookie);
        if (j == 0 && F_ISSET(lsm_tree, WT_LSM_TREE_WORKING) &&
            !F_ISSET(lsm_tree, WT_LSM_TREE_NEED_SWITCH))
            WT_ERR_TIMEDOUT_OK(__wt_cond_wait(
                session, lsm_tree->work_cond, 100000));
    }

err:    __lsm_unpin_chunks(session, &cookie);
    __wt_free(session, cookie.chunk_array);
    /*
     * The thread will only exit with failure if we run out of memory or
     * there is some other system driven failure.  We can't keep going
     * after such a failure - ensure WiredTiger shuts down.
     */
    if (ret != 0 && ret != WT_NOTFOUND)
        WT_PANIC_ERR(session, ret,
            "Shutting down LSM checkpoint utility thread");
    return (NULL);
}
/*
 * __wt_lsm_tree_rename --
 *     Rename an LSM tree.
 */
int
__wt_lsm_tree_rename(WT_SESSION_IMPL *session,
    const char *olduri, const char *newuri, const char *cfg[])
{
    WT_DECL_RET;
    WT_ITEM buf;
    WT_LSM_CHUNK *chunk;
    WT_LSM_TREE *lsm_tree;
    const char *old;
    u_int i;
    int locked;

    old = NULL;
    WT_CLEAR(buf);
    locked = 0;

    /* Get the LSM tree. */
    WT_RET(__wt_lsm_tree_get(session, olduri, 1, &lsm_tree));

    /* Shut down the LSM worker. */
    WT_ERR(__lsm_tree_close(session, lsm_tree));

    /* Prevent any new opens. */
    WT_ERR(__wt_lsm_tree_lock(session, lsm_tree, 1));
    locked = 1;

    /* Set the new name. */
    WT_ERR(__lsm_tree_set_name(session, lsm_tree, newuri));

    /* Rename the chunks. */
    for (i = 0; i < lsm_tree->nchunks; i++) {
        chunk = lsm_tree->chunk[i];
        old = chunk->uri;
        chunk->uri = NULL;

        WT_ERR(__wt_lsm_tree_chunk_name(
            session, lsm_tree, chunk->id, &buf));
        chunk->uri = __wt_buf_steal(session, &buf, NULL);
        WT_ERR(__wt_schema_rename(session, old, chunk->uri, cfg));
        __wt_free(session, old);

        if (F_ISSET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM)) {
            old = chunk->bloom_uri;
            chunk->bloom_uri = NULL;

            WT_ERR(__wt_lsm_tree_bloom_name(
                session, lsm_tree, chunk->id, &buf));
            chunk->bloom_uri = __wt_buf_steal(session, &buf, NULL);
            F_SET_ATOMIC(chunk, WT_LSM_CHUNK_BLOOM);
            WT_ERR(__wt_schema_rename(
                session, old, chunk->bloom_uri, cfg));
            __wt_free(session, old);
        }
    }

    WT_ERR(__wt_lsm_meta_write(session, lsm_tree));
    locked = 0;
    WT_ERR(__wt_lsm_tree_unlock(session, lsm_tree));
    WT_ERR(__wt_metadata_remove(session, olduri));

err:    if (locked)
        WT_TRET(__wt_lsm_tree_unlock(session, lsm_tree));
    if (old != NULL)
        __wt_free(session, old);
    /*
     * Discard this LSM tree structure.  The first operation on the renamed
     * tree will create a new one.
     */
    WT_TRET(__lsm_tree_discard(session, lsm_tree));
    return (ret);
}
/*
 * __cursor_row_prev --
 *     Move to the previous row-store item.
 */
static inline int
__cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
    WT_INSERT *ins;
    WT_ITEM *key, *val;
    WT_PAGE *page;
    WT_ROW *rip;
    WT_SESSION_IMPL *session;
    WT_UPDATE *upd;

    session = (WT_SESSION_IMPL *)cbt->iface.session;
    page = cbt->ref->page;
    key = &cbt->iface.key;
    val = &cbt->iface.value;

    /*
     * For row-store pages, we need a single item that tells us the part
     * of the page we're walking (otherwise switching from next to prev
     * and vice-versa is just too complicated), so we map the WT_ROW and
     * WT_INSERT_HEAD insert array slots into a single name space: slot 1
     * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
     * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
     * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
     *
     * New page configuration.
     */
    if (newpage) {
        /*
         * If we haven't instantiated keys on this page, do so, else it
         * is a very, very slow traversal.
         */
        if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
            WT_RET(__wt_row_leaf_keys(session, page));

        if (page->pg_row_entries == 0)
            cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
        else
            cbt->ins_head =
                WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
        cbt->ins = WT_SKIP_LAST(cbt->ins_head);
        cbt->row_iteration_slot = page->pg_row_entries * 2 + 1;
        goto new_insert;
    }

    /* Move to the previous entry and return the item. */
    for (;;) {
        /*
         * Continue traversing any insert list.  Maintain the reference
         * to the current insert element in case we switch to a cursor
         * next movement.
         */
        if (cbt->ins != NULL)
            WT_RET(__cursor_skip_prev(cbt));

new_insert: if ((ins = cbt->ins) != NULL) {
            if ((upd = __wt_txn_read(session, ins->upd)) == NULL)
                continue;
            if (WT_UPDATE_DELETED_ISSET(upd)) {
                if (__wt_txn_visible_all(session, upd->txnid))
                    ++cbt->page_deleted_count;
                continue;
            }
            key->data = WT_INSERT_KEY(ins);
            key->size = WT_INSERT_KEY_SIZE(ins);
            val->data = WT_UPDATE_DATA(upd);
            val->size = upd->size;
            return (0);
        }

        /* Check for the beginning of the page. */
        if (cbt->row_iteration_slot == 1)
            return (WT_NOTFOUND);
        --cbt->row_iteration_slot;

        /*
         * Odd-numbered slots configure as WT_INSERT_HEAD entries,
         * even-numbered slots configure as WT_ROW entries.
         */
        if (cbt->row_iteration_slot & 0x01) {
            cbt->ins_head = cbt->row_iteration_slot == 1 ?
                WT_ROW_INSERT_SMALLEST(page) :
                WT_ROW_INSERT_SLOT(
                    page, cbt->row_iteration_slot / 2 - 1);
            cbt->ins = WT_SKIP_LAST(cbt->ins_head);
            goto new_insert;
        }
        cbt->ins_head = NULL;
        cbt->ins = NULL;

        cbt->slot = cbt->row_iteration_slot / 2 - 1;
        rip = &page->pg_row_d[cbt->slot];
        upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
        if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) {
            if (__wt_txn_visible_all(session, upd->txnid))
                ++cbt->page_deleted_count;
            continue;
        }

        return (__cursor_row_slot_return(cbt, rip, upd));
    }
    /* NOTREACHED */
}
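/*
 * Standalone sketch (not WiredTiger code) of the single-name-space slot
 * mapping described in the comment in __cursor_row_prev() above: slot 1 is
 * the "smallest key" insert list, even slots address WT_ROW entries and odd
 * slots address WT_INSERT_HEAD lists.  A page with N WT_ROW entries iterates
 * slots 1..(2 * N + 1); the function name below is illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

static void
decode_slot(uint32_t slot)
{
    if (slot & 0x01) {
        if (slot == 1)
            printf("slot %u -> smallest-key insert list\n", slot);
        else
            printf("slot %u -> WT_INSERT_HEAD[%u]\n",
                slot, slot / 2 - 1);
    } else
        printf("slot %u -> WT_ROW[%u]\n", slot, slot / 2 - 1);
}

int
main(void)
{
    uint32_t slot;

    /* A page with 3 WT_ROW entries iterates slots 1..7. */
    for (slot = 1; slot <= 7; ++slot)
        decode_slot(slot);
    return (0);
}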
/*
 * __wt_page_out --
 *     Discard an in-memory page, freeing all memory associated with it.
 */
void
__wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep)
{
    WT_PAGE *page;
    WT_PAGE_HEADER *dsk;
    WT_PAGE_MODIFY *mod;

    /*
     * Kill our caller's reference, do our best to catch races.
     */
    page = *pagep;
    *pagep = NULL;

    if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD))
        __wt_page_modify_clear(session, page);

    /*
     * We should never discard:
     * - a dirty page,
     * - a page queued for eviction, or
     * - a locked page.
     */
    WT_ASSERT(session, !__wt_page_is_modified(page));
    WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU));
    WT_ASSERT(session, !__wt_rwlock_islocked(session, &page->page_lock));

    /*
     * If a root page split, there may be one or more pages linked from the
     * page; walk the list, discarding pages.
     */
    switch (page->type) {
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        mod = page->modify;
        if (mod != NULL && mod->mod_root_split != NULL)
            __wt_page_out(session, &mod->mod_root_split);
        break;
    }

    /* Update the cache's information. */
    __wt_cache_page_evict(session, page);

    dsk = (WT_PAGE_HEADER *)page->dsk;
    if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
        __wt_cache_page_image_decr(session, dsk->mem_size);

    /* Discard any mapped image. */
    if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED))
        (void)S2BT(session)->bm->map_discard(
            S2BT(session)->bm, session, dsk, (size_t)dsk->mem_size);

    /*
     * If discarding the page as part of process exit, the application may
     * configure to leak the memory rather than do the work.
     */
    if (F_ISSET(S2C(session), WT_CONN_LEAK_MEMORY))
        return;

    /* Free the page modification information. */
    if (page->modify != NULL)
        __free_page_modify(session, page);

    switch (page->type) {
    case WT_PAGE_COL_FIX:
        break;
    case WT_PAGE_COL_INT:
    case WT_PAGE_ROW_INT:
        __free_page_int(session, page);
        break;
    case WT_PAGE_COL_VAR:
        __free_page_col_var(session, page);
        break;
    case WT_PAGE_ROW_LEAF:
        __free_page_row_leaf(session, page);
        break;
    }

    /* Discard any allocated disk image. */
    if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC))
        __wt_overwrite_and_free_len(session, dsk, dsk->mem_size);

    __wt_overwrite_and_free(session, page);
}