/*
 * __wt_lsm_tree_switch --
 *     Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk, **cp;
    uint32_t in_memory, new_id;

    new_id = WT_ATOMIC_ADD(lsm_tree->last, 1);

    if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) >
        lsm_tree->chunk_alloc)
        WT_ERR(__wt_realloc(session,
            &lsm_tree->chunk_alloc,
            WT_MAX(10 * sizeof(*lsm_tree->chunk),
            2 * lsm_tree->chunk_alloc),
            &lsm_tree->chunk));

    /*
     * In the steady state, we expect that the checkpoint worker thread
     * will keep up with inserts.  If not, we throttle the insert rate to
     * avoid filling the cache with in-memory chunks.  Threads sleep every
     * 100 operations, so take that into account in the calculation.
     */
    for (in_memory = 1, cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
        in_memory < lsm_tree->nchunks &&
        !F_ISSET(*cp, WT_LSM_CHUNK_ONDISK);
        ++in_memory, --cp)
        ;
    if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 2)
        lsm_tree->throttle_sleep = 0;
    else if (in_memory == lsm_tree->nchunks ||
        F_ISSET(*cp, WT_LSM_CHUNK_STABLE)) {
        /*
         * No checkpoint has completed this run.  Keep slowing down
         * inserts until one does.
         */
        lsm_tree->throttle_sleep =
            WT_MAX(20, 2 * lsm_tree->throttle_sleep);
    } else {
        chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];
        lsm_tree->throttle_sleep = (long)((in_memory - 2) *
            WT_TIMEDIFF(chunk->create_ts, (*cp)->create_ts) /
            (20 * in_memory * chunk->count));
    }

    WT_VERBOSE_ERR(session, lsm, "Tree switch to: %d, throttle %d",
        new_id, (int)lsm_tree->throttle_sleep);

    WT_ERR(__wt_calloc_def(session, 1, &chunk));
    chunk->id = new_id;
    lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
    WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

    ++lsm_tree->dsk_gen;
    F_CLR(lsm_tree, WT_LSM_TREE_NEED_SWITCH);
    WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

err:    /* TODO: mark lsm_tree bad on error(?) */
    return (ret);
}
/*
 * __meta_track_next --
 *     Extend the list of operations we're tracking, as necessary, and
 *     optionally return the next slot.
 */
static int
__meta_track_next(WT_SESSION_IMPL *session, WT_META_TRACK **trkp)
{
    size_t offset, sub_off;

    if (session->meta_track_next == NULL)
        session->meta_track_next = session->meta_track;

    offset = WT_PTRDIFF(session->meta_track_next, session->meta_track);
    sub_off = WT_PTRDIFF(session->meta_track_sub, session->meta_track);
    if (offset == session->meta_track_alloc) {
        WT_RET(__wt_realloc(session, &session->meta_track_alloc,
            WT_MAX(2 * session->meta_track_alloc,
            20 * sizeof(WT_META_TRACK)), &session->meta_track));

        /* Maintain positions in the new chunk of memory. */
        session->meta_track_next =
            (uint8_t *)session->meta_track + offset;
        if (session->meta_track_sub != NULL)
            session->meta_track_sub =
                (uint8_t *)session->meta_track + sub_off;
    }

    WT_ASSERT(session, session->meta_track_next != NULL);

    if (trkp != NULL) {
        *trkp = session->meta_track_next;
        session->meta_track_next = *trkp + 1;
    }

    return (0);
}
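/*
 * A minimal, self-contained sketch of the pattern used in
 * __meta_track_next above: grow a tracking array by doubling (with a
 * floor for the first allocation), then repair any saved positions by
 * re-basing byte offsets against the new buffer, since realloc may move
 * the block.  The names here (track_grow, struct track) are hypothetical,
 * not WiredTiger API.
 */
#include <stdint.h>
#include <stdlib.h>

struct track {
    int op;
};

static int
track_grow(struct track **basep, size_t *allocp /* bytes */,
    struct track **nextp, struct track **subp)
{
    size_t next_off, sub_off;
    struct track *tmp;

    /* Save positions as byte offsets before realloc moves the block. */
    next_off = (size_t)((uint8_t *)*nextp - (uint8_t *)*basep);
    sub_off = *subp == NULL ?
        0 : (size_t)((uint8_t *)*subp - (uint8_t *)*basep);

    /* Double the allocation, with a floor of 20 slots. */
    *allocp = *allocp < 20 * sizeof(struct track) ?
        20 * sizeof(struct track) : 2 * *allocp;
    if ((tmp = realloc(*basep, *allocp)) == NULL)
        return (-1);
    *basep = tmp;

    /* Re-base the saved positions against the new buffer. */
    *nextp = (struct track *)((uint8_t *)*basep + next_off);
    if (*subp != NULL)
        *subp = (struct track *)((uint8_t *)*basep + sub_off);
    return (0);
}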
/*
 * __wt_lsm_tree_switch --
 *     Switch to a new in-memory tree.
 */
int
__wt_lsm_tree_switch(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_DECL_RET;
    WT_LSM_CHUNK *chunk;
    uint32_t new_id;

    new_id = WT_ATOMIC_ADD(lsm_tree->last, 1);

    WT_VERBOSE_RET(session, lsm, "Tree switch to: %d", new_id);

    if ((lsm_tree->nchunks + 1) * sizeof(*lsm_tree->chunk) >
        lsm_tree->chunk_alloc)
        WT_ERR(__wt_realloc(session,
            &lsm_tree->chunk_alloc,
            WT_MAX(10 * sizeof(*lsm_tree->chunk),
            2 * lsm_tree->chunk_alloc),
            &lsm_tree->chunk));

    WT_ERR(__wt_calloc_def(session, 1, &chunk));
    chunk->id = new_id;
    lsm_tree->chunk[lsm_tree->nchunks++] = chunk;
    WT_ERR(__wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));

    ++lsm_tree->dsk_gen;
    WT_ERR(__wt_lsm_meta_write(session, lsm_tree));

err:    /* TODO: mark lsm_tree bad on error(?) */
    return (ret);
}
/*
 * __wt_block_read_off --
 *     Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset, uint32_t size,
    uint32_t cksum)
{
    WT_BLOCK_HEADER *blk;
    size_t bufsize;
    uint32_t page_cksum;

    WT_RET(__wt_verbose(session, WT_VERB_READ,
        "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
        (uintmax_t)offset, size, cksum));

    WT_STAT_FAST_CONN_INCR(session, block_read);
    WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size);

    /*
     * Grow the buffer as necessary and read the block.  Buffers should be
     * aligned for reading, but there are lots of buffers (for example, file
     * cursors have two buffers each, key and value), and it's difficult to
     * be sure we've found all of them.  If the buffer isn't aligned, it's
     * an easy fix: set the flag and guarantee we reallocate it.  (Most of
     * the time on reads, the buffer memory has not yet been allocated, so
     * we're not adding any additional processing time.)
     */
    if (F_ISSET(buf, WT_ITEM_ALIGNED))
        bufsize = size;
    else {
        F_SET(buf, WT_ITEM_ALIGNED);
        bufsize = WT_MAX(size, buf->memsize + 10);
    }
    WT_RET(__wt_buf_init(session, buf, bufsize));
    WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
    buf->size = size;

    blk = WT_BLOCK_HEADER_REF(buf->mem);
    page_cksum = blk->cksum;
    if (page_cksum == cksum) {
        blk->cksum = 0;
        page_cksum = __wt_cksum(buf->mem,
            F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ?
            size : WT_BLOCK_COMPRESS_SKIP);
        if (page_cksum == cksum)
            return (0);
    }

    if (!F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK))
        __wt_errx(session,
            "read checksum error [%" PRIu32 "B @ %" PRIuMAX ", %"
            PRIu32 " != %" PRIu32 "]",
            size, (uintmax_t)offset, cksum, page_cksum);

    /* Panic if a checksum fails during an ordinary read. */
    return (block->verify ||
        F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
        WT_ERROR : __wt_illegal_value(session, block->name));
}
/*
 * __wt_logmgr_create --
 *     Initialize the log subsystem (before running recovery).
 */
int
__wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    bool run;

    conn = S2C(session);

    /* Handle configuration. */
    WT_RET(__logmgr_config(session, cfg, &run, false));

    /* If logging is not configured, we're done. */
    if (!run)
        return (0);

    FLD_SET(conn->log_flags, WT_CONN_LOG_ENABLED);

    /*
     * Logging is on, allocate the WT_LOG structure and open the log file.
     */
    WT_RET(__wt_calloc_one(session, &conn->log));
    log = conn->log;
    WT_RET(__wt_spin_init(session, &log->log_lock, "log"));
    WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot"));
    WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync"));
    WT_RET(__wt_spin_init(session,
        &log->log_writelsn_lock, "log write LSN"));
    WT_RET(__wt_rwlock_alloc(session,
        &log->log_archive_lock, "log archive lock"));
    if (FLD_ISSET(conn->direct_io, WT_FILE_TYPE_LOG))
        log->allocsize =
            WT_MAX((uint32_t)conn->buffer_alignment, WT_LOG_ALIGN);
    else
        log->allocsize = WT_LOG_ALIGN;
    WT_INIT_LSN(&log->alloc_lsn);
    WT_INIT_LSN(&log->ckpt_lsn);
    WT_INIT_LSN(&log->first_lsn);
    WT_INIT_LSN(&log->sync_lsn);
    /*
     * We only use file numbers for directory sync, so this needs to be
     * initialized to zero.
     */
    WT_ZERO_LSN(&log->sync_dir_lsn);
    WT_INIT_LSN(&log->trunc_lsn);
    WT_INIT_LSN(&log->write_lsn);
    WT_INIT_LSN(&log->write_start_lsn);
    log->fileid = 0;
    WT_RET(__wt_cond_alloc(
        session, "log sync", false, &log->log_sync_cond));
    WT_RET(__wt_cond_alloc(
        session, "log write", false, &log->log_write_cond));
    WT_RET(__wt_log_open(session));
    WT_RET(__wt_log_slot_init(session));

    return (0);
}
/*
 * __wt_log_slot_grow_buffers --
 *     Increase the buffer size of all available slots in the buffer pool.
 *     Go to some lengths to include active (but unused) slots to handle
 *     the case where all log write record sizes exceed the size of the
 *     active buffer.
 */
int
__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_LOGSLOT *slot;
    int64_t orig_state;
    uint64_t old_size, total_growth;
    int i;

    conn = S2C(session);
    log = conn->log;
    total_growth = 0;
    WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
    /*
     * Take the log slot lock to prevent other threads growing buffers
     * at the same time.  Could tighten the scope of this lock, or have
     * a separate lock if there is contention.
     */
    __wt_spin_lock(session, &log->log_slot_lock);
    for (i = 0; i < SLOT_POOL; i++) {
        slot = &log->slot_pool[i];
        /* Avoid atomic operations if they won't succeed. */
        if (slot->slot_state != WT_LOG_SLOT_FREE &&
            slot->slot_state != WT_LOG_SLOT_READY)
            continue;
        /* Don't keep growing unrelated buffers. */
        if (slot->slot_buf.memsize > (10 * newsize) &&
            !F_ISSET(slot, SLOT_BUF_GROW))
            continue;
        orig_state = WT_ATOMIC_CAS_VAL8(
            slot->slot_state, WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING);
        if (orig_state != WT_LOG_SLOT_FREE) {
            orig_state = WT_ATOMIC_CAS_VAL8(slot->slot_state,
                WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING);
            if (orig_state != WT_LOG_SLOT_READY)
                continue;
        }

        /* We have a slot - now go ahead and grow the buffer. */
        old_size = slot->slot_buf.memsize;
        F_CLR(slot, SLOT_BUF_GROW);
        WT_ERR(__wt_buf_grow(session, &slot->slot_buf,
            WT_MAX(slot->slot_buf.memsize * 2, newsize)));
        slot->slot_state = orig_state;
        total_growth += slot->slot_buf.memsize - old_size;
    }
err:    __wt_spin_unlock(session, &log->log_slot_lock);
    WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth);
    return (ret);
}
/*
 * __ckpt_server_config --
 *     Parse and setup the checkpoint server options.
 */
static int
__ckpt_server_config(WT_SESSION_IMPL *session, const char **cfg, bool *startp)
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;

    *startp = false;

    conn = S2C(session);

    WT_RET(__wt_config_gets(session, cfg, "checkpoint.wait", &cval));
    conn->ckpt_usecs = (uint64_t)cval.val * WT_MILLION;

    WT_RET(__wt_config_gets(session, cfg, "checkpoint.log_size", &cval));
    conn->ckpt_logsize = (wt_off_t)cval.val;

    /*
     * The checkpoint configuration requires a wait time and/or a log
     * size; if neither is set, we're not running at all.  Checkpoints
     * based on log size also require logging be enabled.
     */
    if (conn->ckpt_usecs != 0 ||
        (conn->ckpt_logsize != 0 &&
        FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))) {
        /*
         * If checkpointing based on log data, use a minimum of the
         * log file size.  The logging subsystem has already been
         * initialized.
         */
        if (conn->ckpt_logsize != 0 &&
            FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
            conn->ckpt_logsize = WT_MAX(
                conn->ckpt_logsize, conn->log_file_max);

        /* Checkpoints are incompatible with in-memory configuration. */
        WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval));
        if (cval.val != 0)
            WT_RET_MSG(session, EINVAL,
                "checkpoint configuration incompatible with "
                "in-memory configuration");

        __wt_log_written_reset(session);

        *startp = true;
    }

    return (0);
}
/*
 * __wt_log_open --
 *     Open the appropriate log file for the connection.  The purpose is
 *     to find the last log file that exists, open it and set our initial
 *     LSNs to the end of that file.  If none exist, call __wt_log_newfile
 *     to create it.
 */
int
__wt_log_open(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    uint32_t firstlog, lastlog, lognum;
    u_int i, logcount;
    char **logfiles;

    conn = S2C(session);
    log = conn->log;
    lastlog = 0;
    firstlog = UINT32_MAX;

    WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
    for (i = 0; i < logcount; i++) {
        WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
        lastlog = WT_MAX(lastlog, lognum);
        firstlog = WT_MIN(firstlog, lognum);
    }
    log->fileid = lastlog;
    WT_ERR(__wt_verbose(session, WT_VERB_LOG,
        "log_open: first log %d last log %d", firstlog, lastlog));
    log->first_lsn.file = firstlog;
    log->first_lsn.offset = 0;

    /*
     * Start logging at the beginning of the next log file, no matter
     * where the previous log file ends.
     */
    WT_ERR(__wt_log_newfile(session, 1));

    /*
     * If there were log files, run recovery.
     * XXX belongs at a higher level than this.
     */
    if (logcount > 0) {
        log->trunc_lsn = log->alloc_lsn;
        WT_ERR(__wt_txn_recover(session));
    }

err:    __wt_log_files_free(session, logfiles, logcount);
    return (ret);
}
/*
 * __wt_mmap_preload --
 *     Cause a section of a memory map to be faulted in.
 */
int
__wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
{
#ifdef HAVE_POSIX_MADVISE
    /* Linux requires the address be aligned to a 4KB boundary. */
    WT_BM *bm = S2BT(session)->bm;
    WT_DECL_RET;
    void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
    size += WT_PTRDIFF(p, blk);

    /* XXX proxy for "am I doing a scan?" -- manual read-ahead */
    if (F_ISSET(session, WT_SESSION_NO_CACHE)) {
        /* Read in 2MB blocks every 1MB of data. */
        if (((uintptr_t)((uint8_t *)blk + size) &
            (uintptr_t)((1<<20) - 1)) < (uintptr_t)blk)
            return (0);
        size = WT_MIN(WT_MAX(20 * size, 2 << 20),
            WT_PTRDIFF((uint8_t *)bm->map + bm->maplen, blk));
    }

    /*
     * Manual pages aren't clear on whether alignment is required for the
     * size, so we will be conservative.
     */
    size &= ~(size_t)(WT_VM_PAGESIZE - 1);

    if (size > WT_VM_PAGESIZE &&
        (ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0)
        WT_RET_MSG(session, ret, "posix_madvise will need");
#else
    WT_UNUSED(session);
    WT_UNUSED(p);
    WT_UNUSED(size);
#endif
    return (0);
}
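/*
 * A minimal sketch of the alignment dance above, outside WiredTiger:
 * round the start address down to a page boundary, extend the length so
 * the original range is still covered, truncate the length to whole
 * pages, and only then call posix_madvise().  The function name
 * (preload_range) is made up for illustration.
 */
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

static int
preload_range(const void *p, size_t len)
{
    uintptr_t pagesize, start;
    size_t adjusted;

    pagesize = (uintptr_t)sysconf(_SC_PAGESIZE);
    start = (uintptr_t)p & ~(pagesize - 1);         /* round down */
    adjusted = len + ((uintptr_t)p - start);        /* cover the tail */
    adjusted &= ~(size_t)(pagesize - 1);            /* whole pages only */

    if (adjusted < pagesize)
        return (0);                                 /* nothing to advise */
    return (posix_madvise(
        (void *)start, adjusted, POSIX_MADV_WILLNEED));
}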
/*
 * __schema_open_index --
 *     Open one or more indices for a table (internal version).
 */
static int
__schema_open_index(WT_SESSION_IMPL *session,
    WT_TABLE *table, const char *idxname, size_t len, WT_INDEX **indexp)
{
    WT_CURSOR *cursor;
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    WT_INDEX *idx;
    u_int i;
    int cmp;
    bool match;
    const char *idxconf, *name, *tablename, *uri;

    /* Check if we've already done the work. */
    if (idxname == NULL && table->idx_complete)
        return (0);

    cursor = NULL;
    idx = NULL;
    match = false;

    /* Build a search key. */
    tablename = table->name;
    (void)WT_PREFIX_SKIP(tablename, "table:");
    WT_ERR(__wt_scr_alloc(session, 512, &tmp));
    WT_ERR(__wt_buf_fmt(session, tmp, "index:%s:", tablename));

    /* Find matching indices. */
    WT_ERR(__wt_metadata_cursor(session, &cursor));
    cursor->set_key(cursor, tmp->data);
    if ((ret = cursor->search_near(cursor, &cmp)) == 0 && cmp < 0)
        ret = cursor->next(cursor);
    for (i = 0; ret == 0; i++, ret = cursor->next(cursor)) {
        WT_ERR(cursor->get_key(cursor, &uri));
        name = uri;

        if (!WT_PREFIX_SKIP(name, tmp->data))
            break;

        /* Is this the index we are looking for? */
        match = idxname == NULL || WT_STRING_MATCH(name, idxname, len);

        /*
         * Ensure there is space, including if we have to make room for
         * a new entry in the middle of the list.
         */
        WT_ERR(__wt_realloc_def(session, &table->idx_alloc,
            WT_MAX(i, table->nindices) + 1, &table->indices));

        /* Keep the in-memory list in sync with the metadata. */
        cmp = 0;
        while (table->indices[i] != NULL &&
            (cmp = strcmp(uri, table->indices[i]->name)) > 0) {
            /* Index no longer exists, remove it. */
            __wt_free(session, table->indices[i]);
            memmove(&table->indices[i], &table->indices[i + 1],
                (table->nindices - i) * sizeof(WT_INDEX *));
            table->indices[--table->nindices] = NULL;
        }
        if (cmp < 0) {
            /* Make room for a new index. */
            memmove(&table->indices[i + 1], &table->indices[i],
                (table->nindices - i) * sizeof(WT_INDEX *));
            table->indices[i] = NULL;
            ++table->nindices;
        }

        if (!match)
            continue;

        if (table->indices[i] == NULL) {
            WT_ERR(cursor->get_value(cursor, &idxconf));
            WT_ERR(__wt_calloc_one(session, &idx));
            WT_ERR(__wt_strdup(session, uri, &idx->name));
            WT_ERR(__wt_strdup(session, idxconf, &idx->config));
            WT_ERR(__open_index(session, table, idx));

            /*
             * If we're checking the creation of an index before a
             * table is fully created, don't save the index: it
             * will need to be reopened once the table is complete.
             */
            if (!table->cg_complete) {
                WT_ERR(
                    __wt_schema_destroy_index(session, &idx));
                if (idxname != NULL)
                    break;
                continue;
            }

            table->indices[i] = idx;
            idx = NULL;

            /*
             * If the slot is bigger than anything else we've seen,
             * bump the number of indices.
             */
            if (i >= table->nindices)
                table->nindices = i + 1;
        }

        /* If we were looking for a single index, we're done. */
        if (indexp != NULL)
            *indexp = table->indices[i];
        if (idxname != NULL)
            break;
    }
    WT_ERR_NOTFOUND_OK(ret);
    if (idxname != NULL && !match)
        ret = WT_NOTFOUND;

    /* If we did a full pass, we won't need to do it again. */
    if (idxname == NULL) {
        table->nindices = i;
        table->idx_complete = true;
    }

err:    WT_TRET(__wt_metadata_cursor_release(session, &cursor));
    WT_TRET(__wt_schema_destroy_index(session, &idx));
    __wt_scr_free(session, &tmp);
    return (ret);
}
/*
 * __wt_block_read_off --
 *     Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, wt_off_t offset, uint32_t size,
    uint32_t cksum)
{
    WT_BLOCK_HEADER *blk, swap;
    size_t bufsize;
    uint32_t page_cksum;

    __wt_verbose(session, WT_VERB_READ,
        "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
        (uintmax_t)offset, size, cksum);

    WT_STAT_FAST_CONN_INCR(session, block_read);
    WT_STAT_FAST_CONN_INCRV(session, block_byte_read, size);

    /*
     * Grow the buffer as necessary and read the block.  Buffers should be
     * aligned for reading, but there are lots of buffers (for example, file
     * cursors have two buffers each, key and value), and it's difficult to
     * be sure we've found all of them.  If the buffer isn't aligned, it's
     * an easy fix: set the flag and guarantee we reallocate it.  (Most of
     * the time on reads, the buffer memory has not yet been allocated, so
     * we're not adding any additional processing time.)
     */
    if (F_ISSET(buf, WT_ITEM_ALIGNED))
        bufsize = size;
    else {
        F_SET(buf, WT_ITEM_ALIGNED);
        bufsize = WT_MAX(size, buf->memsize + 10);
    }
    WT_RET(__wt_buf_init(session, buf, bufsize));
    WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
    buf->size = size;

    /*
     * We incrementally read through the structure before doing a checksum,
     * do little- to big-endian handling early on, and then select from the
     * original or swapped structure as needed.
     */
    blk = WT_BLOCK_HEADER_REF(buf->mem);
    __wt_block_header_byteswap_copy(blk, &swap);
    if (swap.cksum == cksum) {
        blk->cksum = 0;
        page_cksum = __wt_cksum(buf->mem,
            F_ISSET(&swap, WT_BLOCK_DATA_CKSUM) ?
            size : WT_BLOCK_COMPRESS_SKIP);
        if (page_cksum == cksum) {
            /*
             * Swap the page-header as needed; this doesn't belong
             * here, but it's the best place to catch all callers.
             */
            __wt_page_header_byteswap(buf->mem);
            return (0);
        }
        if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
            __wt_errx(session,
                "read checksum error for %" PRIu32 "B block at "
                "offset %" PRIuMAX ": calculated block checksum "
                "of %" PRIu32 " doesn't match expected checksum "
                "of %" PRIu32,
                size, (uintmax_t)offset, page_cksum, cksum);
    } else if (!F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE))
        __wt_errx(session,
            "read checksum error for %" PRIu32 "B block at "
            "offset %" PRIuMAX ": block header checksum "
            "of %" PRIu32 " doesn't match expected checksum "
            "of %" PRIu32,
            size, (uintmax_t)offset, swap.cksum, cksum);

    /* Panic if a checksum fails during an ordinary read. */
    return (block->verify ||
        F_ISSET(session, WT_SESSION_QUIET_CORRUPT_FILE) ?
        WT_ERROR : __wt_illegal_value(session, block->name));
}
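/*
 * A minimal sketch of the verification pattern used above, with a toy
 * additive checksum standing in for __wt_cksum(): the stored checksum
 * was computed with the header's checksum field zeroed, so verification
 * must zero that field before recomputing.  All names in this sketch
 * (blk_header, toy_cksum, blk_verify) are hypothetical.
 */
#include <stddef.h>
#include <stdint.h>

struct blk_header {
    uint32_t cksum;     /* stored with this field zeroed */
};

static uint32_t
toy_cksum(const void *p, size_t len)
{
    const uint8_t *b;
    uint32_t sum;

    for (sum = 0, b = p; len > 0; --len)
        sum = sum * 31 + *b++;
    return (sum);
}

static int
blk_verify(void *mem, size_t size)
{
    struct blk_header *blk;
    uint32_t expect;

    blk = mem;
    expect = blk->cksum;

    blk->cksum = 0;     /* match how the block was written */
    return (toy_cksum(mem, size) == expect ? 0 : -1);
}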
/*
 * __wt_curjoin_join --
 *     Add a new join to a join cursor.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
    WT_CURSOR_JOIN_ENTRY *entry;
    WT_DECL_RET;
    WT_CURSOR_JOIN_ENDPOINT *end, *newend;
    bool hasins, needbloom, range_eq;
    u_int i, ins, nonbloom;
    const char *raw_cfg[] = { WT_CONFIG_BASE(
        session, WT_SESSION_open_cursor), "raw", NULL };
    char *main_uri;
    size_t namesize, newsize;

    entry = NULL;
    hasins = needbloom = false;
    ins = 0;        /* -Wuninitialized */
    main_uri = NULL;
    nonbloom = 0;   /* -Wuninitialized */
    namesize = strlen(cjoin->table->name);

    for (i = 0; i < cjoin->entries_next; i++) {
        if (cjoin->entries[i].index == idx) {
            entry = &cjoin->entries[i];
            break;
        }
        if (!needbloom && i > 0 &&
            !F_ISSET(&cjoin->entries[i], WT_CURJOIN_ENTRY_BLOOM)) {
            needbloom = true;
            nonbloom = i;
        }
    }
    if (entry == NULL) {
        WT_ERR(__wt_realloc_def(session, &cjoin->entries_allocated,
            cjoin->entries_next + 1, &cjoin->entries));
        if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
            /*
             * Reorder the list so that after the first entry,
             * the Bloom filtered entries come next, followed by
             * the non-Bloom entries.  Once the Bloom filters
             * are built, determining membership via Bloom is
             * faster than without Bloom, so we can answer
             * membership questions more quickly, and with less
             * I/O, with the Bloom entries first.
             */
            entry = &cjoin->entries[nonbloom];
            memmove(entry + 1, entry,
                (cjoin->entries_next - nonbloom) *
                sizeof(WT_CURSOR_JOIN_ENTRY));
            memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
        } else
            entry = &cjoin->entries[cjoin->entries_next];
        entry->index = idx;
        entry->flags = flags;
        entry->count = count;
        entry->bloom_bit_count = bloom_bit_count;
        entry->bloom_hash_count = bloom_hash_count;
        ++cjoin->entries_next;
    } else {
        /* Merge the join into an existing entry for this index */
        if (count != 0 && entry->count != 0 && entry->count != count)
            WT_ERR_MSG(session, EINVAL,
                "count=%" PRIu64 " does not match "
                "previous count=%" PRIu64 " for this index",
                count, entry->count);
        if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
            F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
            WT_ERR_MSG(session, EINVAL,
                "join has incompatible strategy "
                "values for the same index");
        /*
         * Check against other comparisons (we call them endpoints)
         * already set up for this index.
         * We allow either:
         *   - one or more "eq" (with disjunction)
         *   - exactly one "eq" (with conjunction)
         *   - exactly one of "gt" or "ge" (conjunction or disjunction)
         *   - exactly one of "lt" or "le" (conjunction or disjunction)
         *   - one of "gt"/"ge" along with one of "lt"/"le"
         *     (currently restricted to conjunction).
         *
         * Some other combinations, although expressible either do
         * not make sense (X == 3 AND X == 5) or are reducible (X <
         * 7 AND X < 9).  Other specific cases of (X < 7 OR X > 15)
         * or (X == 4 OR X > 15) make sense but we don't handle yet.
         */
        for (i = 0; i < entry->ends_next; i++) {
            end = &entry->ends[i];
            range_eq = (range == WT_CURJOIN_END_EQ);
            if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
                ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
                (F_ISSET(end, WT_CURJOIN_END_LT) &&
                ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
                (end->flags == WT_CURJOIN_END_EQ &&
                (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
                != 0))
                WT_ERR_MSG(session, EINVAL,
                    "join has overlapping ranges");
            if (range == WT_CURJOIN_END_EQ &&
                end->flags == WT_CURJOIN_END_EQ &&
                !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
                WT_ERR_MSG(session, EINVAL,
                    "compare=eq can only be combined "
                    "using operation=or");

            /*
             * Sort "gt"/"ge" to the front, followed by any number
             * of "eq", and finally "lt"/"le".
             */
            if (!hasins &&
                ((range & WT_CURJOIN_END_GT) != 0 ||
                (range == WT_CURJOIN_END_EQ &&
                !F_ISSET(end, WT_CURJOIN_END_GT)))) {
                ins = i;
                hasins = true;
            }
        }
        /* All checks completed, merge any new configuration now */
        entry->count = count;
        entry->bloom_bit_count =
            WT_MAX(entry->bloom_bit_count, bloom_bit_count);
        entry->bloom_hash_count =
            WT_MAX(entry->bloom_hash_count, bloom_hash_count);
    }
    WT_ERR(__wt_realloc_def(session, &entry->ends_allocated,
        entry->ends_next + 1, &entry->ends));
    if (!hasins)
        ins = entry->ends_next;
    newend = &entry->ends[ins];
    memmove(newend + 1, newend,
        (entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT));
    memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
    entry->ends_next++;
    newend->cursor = ref_cursor;
    F_SET(newend, range);

    /* Open the main file with a projection of the indexed columns. */
    if (entry->main == NULL && entry->index != NULL) {
        namesize = strlen(cjoin->table->name);
        newsize = namesize + entry->index->colconf.len + 1;
        WT_ERR(__wt_calloc(session, 1, newsize, &main_uri));
        snprintf(main_uri, newsize, "%s%.*s",
            cjoin->table->name, (int)entry->index->colconf.len,
            entry->index->colconf.str);
        WT_ERR(__wt_open_cursor(session, main_uri,
            (WT_CURSOR *)cjoin, raw_cfg, &entry->main));
    }

err:    if (main_uri != NULL)
        __wt_free(session, main_uri);
    return (ret);
}
/*
 * __curjoin_init_iter --
 *     Initialize before any iteration.
 */
static int
__curjoin_init_iter(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin)
{
    WT_BLOOM *bloom;
    WT_DECL_RET;
    WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
    WT_CURSOR_JOIN_ENDPOINT *end;
    uint32_t f, k;

    if (cjoin->entries_next == 0)
        WT_RET_MSG(session, EINVAL,
            "join cursor has not yet been joined with any other "
            "cursors");

    je = &cjoin->entries[0];
    WT_RET(__curjoin_entry_iter_init(session, cjoin, je, &cjoin->iter));

    jeend = &cjoin->entries[cjoin->entries_next];
    for (je = cjoin->entries; je < jeend; je++) {
        __wt_stat_join_init_single(&je->stats);
        for (end = &je->ends[0]; end < &je->ends[je->ends_next]; end++)
            WT_RET(__curjoin_endpoint_init_key(session, je, end));

        /*
         * The first entry is iterated as the 'outermost' cursor.
         * For the common GE case, we don't have to test against
         * the left reference key, we know it will be true since
         * the btree is ordered.
         */
        if (je == cjoin->entries && je->ends[0].flags ==
            (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ))
            F_SET(cjoin, WT_CURJOIN_SKIP_FIRST_LEFT);

        if (F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
            if (je->bloom == NULL) {
                /*
                 * Look for compatible filters to be shared,
                 * pick compatible numbers for bit counts
                 * and number of hashes.
                 */
                f = je->bloom_bit_count;
                k = je->bloom_hash_count;
                for (je2 = je + 1; je2 < jeend; je2++)
                    if (F_ISSET(je2,
                        WT_CURJOIN_ENTRY_BLOOM) &&
                        je2->count == je->count) {
                        f = WT_MAX(
                            je2->bloom_bit_count, f);
                        k = WT_MAX(
                            je2->bloom_hash_count, k);
                    }
                je->bloom_bit_count = f;
                je->bloom_hash_count = k;
                WT_RET(__wt_bloom_create(session, NULL,
                    NULL, je->count, f, k, &je->bloom));
                F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
                WT_RET(__curjoin_init_bloom(
                    session, cjoin, je, je->bloom));
                /*
                 * Share the Bloom filter, making all
                 * config info consistent.
                 */
                for (je2 = je + 1; je2 < jeend; je2++)
                    if (F_ISSET(je2,
                        WT_CURJOIN_ENTRY_BLOOM) &&
                        je2->count == je->count) {
                        WT_ASSERT(session,
                            je2->bloom == NULL);
                        je2->bloom = je->bloom;
                        je2->bloom_bit_count = f;
                        je2->bloom_hash_count = k;
                    }
            } else {
                /*
                 * Create a temporary filter that we'll
                 * merge into the shared one.  The Bloom
                 * parameters of the two filters must match.
                 */
                WT_RET(__wt_bloom_create(session, NULL,
                    NULL, je->count, je->bloom_bit_count,
                    je->bloom_hash_count, &bloom));
                WT_RET(__curjoin_init_bloom(
                    session, cjoin, je, bloom));
                WT_RET(__wt_bloom_intersection(
                    je->bloom, bloom));
                WT_RET(__wt_bloom_close(bloom));
            }
        }
    }

    F_SET(cjoin, WT_CURJOIN_INITIALIZED);
    return (ret);
}
/*
 * __wt_lsm_merge --
 *     Merge a set of chunks of an LSM tree.
 */
int
__wt_lsm_merge(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, u_int id)
{
    WT_BLOOM *bloom;
    WT_CURSOR *dest, *src;
    WT_DECL_RET;
    WT_ITEM key, value;
    WT_LSM_CHUNK *chunk;
    uint32_t generation;
    uint64_t insert_count, record_count;
    u_int dest_id, end_chunk, i, nchunks, start_chunk, start_id, verb;
    int tret;
    bool created_chunk, create_bloom, locked, in_sync;
    const char *cfg[3];
    const char *drop_cfg[] =
        { WT_CONFIG_BASE(session, WT_SESSION_drop), "force", NULL };

    bloom = NULL;
    chunk = NULL;
    dest = src = NULL;
    start_id = 0;
    created_chunk = create_bloom = locked = in_sync = false;

    /* Fast path if it's obvious no merges could be done. */
    if (lsm_tree->nchunks < lsm_tree->merge_min &&
        lsm_tree->merge_aggressiveness < WT_LSM_AGGRESSIVE_THRESHOLD)
        return (WT_NOTFOUND);

    /*
     * Use the lsm_tree lock to read the chunks (so no switches occur),
     * but avoid holding it while the merge is in progress: that may take
     * a long time.
     */
    WT_RET(__wt_lsm_tree_writelock(session, lsm_tree));
    locked = true;

    WT_ERR(__lsm_merge_span(session,
        lsm_tree, id, &start_chunk, &end_chunk, &record_count));
    nchunks = (end_chunk + 1) - start_chunk;

    WT_ASSERT(session, nchunks > 0);
    start_id = lsm_tree->chunk[start_chunk]->id;

    /* Find the merge generation. */
    for (generation = 0, i = 0; i < nchunks; i++)
        generation = WT_MAX(generation,
            lsm_tree->chunk[start_chunk + i]->generation + 1);

    WT_ERR(__wt_lsm_tree_writeunlock(session, lsm_tree));
    locked = false;

    /* Allocate an ID for the merge. */
    dest_id = __wt_atomic_add32(&lsm_tree->last, 1);

    /*
     * We only want to do the chunk loop if we're running with verbose,
     * so we wrap these statements in the conditional.  Avoid the loop
     * in the normal path.
     */
    if (WT_VERBOSE_ISSET(session, WT_VERB_LSM)) {
        WT_ERR(__wt_verbose(session, WT_VERB_LSM,
            "Merging %s chunks %u-%u into %u (%" PRIu64 " records)"
            ", generation %" PRIu32,
            lsm_tree->name,
            start_chunk, end_chunk, dest_id, record_count,
            generation));
        for (verb = start_chunk; verb <= end_chunk; verb++)
            WT_ERR(__wt_verbose(session, WT_VERB_LSM,
                "Merging %s: Chunk[%u] id %u, gen: %" PRIu32
                ", size: %" PRIu64 ", records: %" PRIu64,
                lsm_tree->name, verb, lsm_tree->chunk[verb]->id,
                lsm_tree->chunk[verb]->generation,
                lsm_tree->chunk[verb]->size,
                lsm_tree->chunk[verb]->count));
    }

    WT_ERR(__wt_calloc_one(session, &chunk));
    created_chunk = true;
    chunk->id = dest_id;

    if (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_MERGED) &&
        (FLD_ISSET(lsm_tree->bloom, WT_LSM_BLOOM_OLDEST) ||
        start_chunk > 0) && record_count > 0)
        create_bloom = true;

    /*
     * Special setup for the merge cursor:
     * first, reset to open the dependent cursors;
     * then restrict the cursor to a specific number of chunks;
     * then set MERGE so the cursor doesn't track updates to the tree.
     */
    WT_ERR(__wt_open_cursor(session, lsm_tree->name, NULL, NULL, &src));
    F_SET(src, WT_CURSTD_RAW);
    WT_ERR(__wt_clsm_init_merge(src, start_chunk, start_id, nchunks));

    WT_WITH_SCHEMA_LOCK(session,
        ret = __wt_lsm_tree_setup_chunk(session, lsm_tree, chunk));
    WT_ERR(ret);
    if (create_bloom) {
        WT_ERR(__wt_lsm_tree_setup_bloom(session, lsm_tree, chunk));

        WT_ERR(__wt_bloom_create(session, chunk->bloom_uri,
            lsm_tree->bloom_config, record_count,
            lsm_tree->bloom_bit_count,
            lsm_tree->bloom_hash_count, &bloom));
    }

    /* Discard pages we read as soon as we're done with them. */
    F_SET(session, WT_SESSION_NO_CACHE);

    cfg[0] = WT_CONFIG_BASE(session, WT_SESSION_open_cursor);
    cfg[1] = "bulk,raw,skip_sort_check";
    cfg[2] = NULL;
    WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));

#define LSM_MERGE_CHECK_INTERVAL    WT_THOUSAND
    for (insert_count = 0; (ret = src->next(src)) == 0; insert_count++) {
        if (insert_count % LSM_MERGE_CHECK_INTERVAL == 0) {
            if (!F_ISSET(lsm_tree, WT_LSM_TREE_ACTIVE))
                WT_ERR(EINTR);

            WT_STAT_FAST_CONN_INCRV(session,
                lsm_rows_merged, LSM_MERGE_CHECK_INTERVAL);
            ++lsm_tree->merge_progressing;
        }

        WT_ERR(src->get_key(src, &key));
        dest->set_key(dest, &key);
        WT_ERR(src->get_value(src, &value));
        dest->set_value(dest, &value);
        WT_ERR(dest->insert(dest));
        if (create_bloom)
            WT_ERR(__wt_bloom_insert(bloom, &key));
    }
    WT_ERR_NOTFOUND_OK(ret);

    WT_STAT_FAST_CONN_INCRV(session,
        lsm_rows_merged, insert_count % LSM_MERGE_CHECK_INTERVAL);
    ++lsm_tree->merge_progressing;
    WT_ERR(__wt_verbose(session, WT_VERB_LSM,
        "Bloom size for %" PRIu64 " has %" PRIu64 " items inserted.",
        record_count, insert_count));

    /*
     * Closing and syncing the files can take a while.  Set the
     * merge_syncing field so that compact knows it is still in
     * progress.
     */
    (void)__wt_atomic_add32(&lsm_tree->merge_syncing, 1);
    in_sync = true;
    /*
     * We've successfully created the new chunk.  Now install it.  We need
     * to ensure that the NO_CACHE flag is cleared and the bloom filter
     * is closed (even if a step fails), so track errors but don't return
     * until we've cleaned up.
     */
    WT_TRET(src->close(src));
    WT_TRET(dest->close(dest));
    src = dest = NULL;

    F_CLR(session, WT_SESSION_NO_CACHE);

    /*
     * We're doing advisory reads to fault the new trees into cache.
     * Don't block if the cache is full: our next unit of work may be to
     * discard some trees to free space.
     */
    F_SET(session, WT_SESSION_NO_EVICTION);

    if (create_bloom) {
        if (ret == 0)
            WT_TRET(__wt_bloom_finalize(bloom));

        /*
         * Read in a key to make sure the Bloom filters btree handle is
         * open before it becomes visible to application threads.
         * Otherwise application threads will stall while it is opened
         * and internal pages are read into cache.
         */
        if (ret == 0) {
            WT_CLEAR(key);
            WT_TRET_NOTFOUND_OK(__wt_bloom_get(bloom, &key));
        }

        WT_TRET(__wt_bloom_close(bloom));
        bloom = NULL;
    }
    WT_ERR(ret);

    /*
     * Open a handle on the new chunk before application threads attempt
     * to access it, opening it pre-loads internal pages into the file
     * system cache.
     */
    cfg[1] = "checkpoint=" WT_CHECKPOINT;
    WT_ERR(__wt_open_cursor(session, chunk->uri, NULL, cfg, &dest));
    WT_TRET(dest->close(dest));
    dest = NULL;
    ++lsm_tree->merge_progressing;
    (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
    in_sync = false;
    WT_ERR_NOTFOUND_OK(ret);

    WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk));
    WT_ERR(__wt_lsm_tree_writelock(session, lsm_tree));
    locked = true;

    /*
     * Check whether we raced with another merge, and adjust the chunk
     * array offset as necessary.
     */
    if (start_chunk >= lsm_tree->nchunks ||
        lsm_tree->chunk[start_chunk]->id != start_id)
        for (start_chunk = 0;
            start_chunk < lsm_tree->nchunks;
            start_chunk++)
            if (lsm_tree->chunk[start_chunk]->id == start_id)
                break;

    /*
     * It is safe to error out here - since the update can only fail
     * prior to making updates to the tree.
     */
    WT_ERR(__wt_lsm_merge_update_tree(
        session, lsm_tree, start_chunk, nchunks, chunk));

    if (create_bloom)
        F_SET(chunk, WT_LSM_CHUNK_BLOOM);
    chunk->count = insert_count;
    chunk->generation = generation;
    F_SET(chunk, WT_LSM_CHUNK_ONDISK);

    /*
     * We have no current way of continuing if the metadata update fails,
     * so we will panic in that case.  Put some effort into cleaning up
     * after ourselves here - so things have a chance of shutting down.
     *
     * Any errors that happened after the tree was locked are
     * fatal - we can't guarantee the state of the tree.
     */
    if ((ret = __wt_lsm_meta_write(session, lsm_tree)) != 0)
        WT_PANIC_ERR(session, ret, "Failed finalizing LSM merge");

    lsm_tree->dsk_gen++;

    /* Update the throttling while holding the tree lock. */
    __wt_lsm_tree_throttle(session, lsm_tree, true);

    /* Schedule a pass to discard old chunks */
    WT_ERR(__wt_lsm_manager_push_entry(
        session, WT_LSM_WORK_DROP, 0, lsm_tree));

err:    if (locked)
        WT_TRET(__wt_lsm_tree_writeunlock(session, lsm_tree));
    if (in_sync)
        (void)__wt_atomic_sub32(&lsm_tree->merge_syncing, 1);
    if (src != NULL)
        WT_TRET(src->close(src));
    if (dest != NULL)
        WT_TRET(dest->close(dest));
    if (bloom != NULL)
        WT_TRET(__wt_bloom_close(bloom));
    if (ret != 0 && created_chunk) {
        /* Drop the newly-created files on error. */
        if (chunk->uri != NULL) {
            WT_WITH_SCHEMA_LOCK(session, tret =
                __wt_schema_drop(session, chunk->uri, drop_cfg));
            WT_TRET(tret);
        }
        if (create_bloom && chunk->bloom_uri != NULL) {
            WT_WITH_SCHEMA_LOCK(session, tret = __wt_schema_drop(
                session, chunk->bloom_uri, drop_cfg));
            WT_TRET(tret);
        }
        __wt_free(session, chunk->bloom_uri);
        __wt_free(session, chunk->uri);
        __wt_free(session, chunk);

        if (ret == EINTR)
            WT_TRET(__wt_verbose(session, WT_VERB_LSM,
                "Merge aborted due to close"));
        else
            WT_TRET(__wt_verbose(session, WT_VERB_LSM,
                "Merge failed with %s",
                __wt_strerror(session, ret, NULL, 0)));
    }
    F_CLR(session, WT_SESSION_NO_CACHE | WT_SESSION_NO_EVICTION);
    return (ret);
}
/*
 * __wt_block_read_off --
 *     Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, off_t offset, uint32_t size,
    uint32_t cksum)
{
    WT_BLOCK_HEADER *blk;
    uint32_t alloc_size, page_cksum;

    WT_VERBOSE_RET(session, read,
        "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
        (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
    /*
     * In diagnostic mode, verify the block we're about to read isn't on
     * either the available or discard lists.
     *
     * Don't check during salvage, it's possible we're reading an already
     * freed overflow page.
     */
    if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
        WT_RET(
            __wt_block_misplaced(session, block, "read", offset, size));
#endif

    /*
     * Grow the buffer as necessary and read the block.  Buffers should be
     * aligned for reading, but there are lots of buffers (for example, file
     * cursors have two buffers each, key and value), and it's difficult to
     * be sure we've found all of them.  If the buffer isn't aligned, it's
     * an easy fix: set the flag and guarantee we reallocate it.  (Most of
     * the time on reads, the buffer memory has not yet been allocated, so
     * we're not adding any additional processing time.)
     */
    if (F_ISSET(buf, WT_ITEM_ALIGNED))
        alloc_size = size;
    else {
        F_SET(buf, WT_ITEM_ALIGNED);
        alloc_size = (uint32_t)WT_MAX(size, buf->memsize + 10);
    }
    WT_RET(__wt_buf_init(session, buf, alloc_size));
    WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
    buf->size = size;

    blk = WT_BLOCK_HEADER_REF(buf->mem);
    blk->cksum = 0;
    page_cksum = __wt_cksum(buf->mem,
        F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ?
        size : WT_BLOCK_COMPRESS_SKIP);
    if (cksum != page_cksum) {
        if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
            __wt_errx(session,
                "read checksum error [%" PRIu32 "B @ %" PRIuMAX
                ", %" PRIu32 " != %" PRIu32 "]",
                size, (uintmax_t)offset, cksum, page_cksum);
        return (WT_ERROR);
    }

    WT_CSTAT_INCR(session, block_read);
    WT_CSTAT_INCRV(session, block_byte_read, size);

    return (0);
}
/*
 * __wt_lsm_tree_throttle --
 *     Calculate whether LSM updates need to be throttled.
 */
void
__wt_lsm_tree_throttle(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
    WT_LSM_CHUNK *chunk, **cp, *prev_chunk;
    uint64_t cache_sz, cache_used, in_memory, record_count;
    uint64_t oldtime, timediff;
    uint32_t i;

    /* Never throttle in small trees. */
    if (lsm_tree->nchunks < 3)
        return;

    cache_sz = S2C(session)->cache_size;

    /*
     * In the steady state, we expect that the checkpoint worker thread
     * will keep up with inserts.  If not, throttle the insert rate to
     * avoid filling the cache with in-memory chunks.  Threads sleep every
     * 100 operations, so take that into account in the calculation.
     *
     * Count the number of in-memory chunks, and find the most recent
     * on-disk chunk (if any).
     */
    record_count = 1;
    for (i = in_memory = 0, cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
        i < lsm_tree->nchunks;
        ++i, --cp)
        if (!F_ISSET_ATOMIC(*cp, WT_LSM_CHUNK_ONDISK)) {
            record_count += (*cp)->count;
            ++in_memory;
        } else if ((*cp)->generation == 0 ||
            F_ISSET_ATOMIC(*cp, WT_LSM_CHUNK_STABLE))
            break;

    chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];

    if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
        lsm_tree->throttle_sleep = 0;
    else if (i == lsm_tree->nchunks ||
        F_ISSET_ATOMIC(*cp, WT_LSM_CHUNK_STABLE)) {
        /*
         * No checkpoint has completed this run.  Keep slowing down
         * inserts until one does.
         */
        lsm_tree->throttle_sleep =
            WT_MAX(20, 2 * lsm_tree->throttle_sleep);
    } else {
        WT_ASSERT(session,
            WT_TIMECMP(chunk->create_ts, (*cp)->create_ts) >= 0);
        timediff = WT_TIMEDIFF(chunk->create_ts, (*cp)->create_ts);
        lsm_tree->throttle_sleep =
            (long)((in_memory - 2) * timediff / (20 * record_count));

        /*
         * Get more aggressive as the number of in memory chunks
         * consumes a large proportion of the cache.  In memory chunks
         * are allowed to grow up to twice as large as the configured
         * value when checkpoints aren't keeping up.  That worst case
         * is when this calculation is relevant.
         * There is nothing particularly special about the chosen
         * multipliers.
         */
        cache_used = in_memory * lsm_tree->chunk_size * 2;
        if (cache_used > cache_sz * 0.8)
            lsm_tree->throttle_sleep *= 5;
    }

    /*
     * Update our estimate of how long each in-memory chunk stays active.
     * Filter out some noise by keeping a weighted history of the
     * calculated value.  Wait until we have enough chunks that we can
     * check that the new value is sane: otherwise, after a long idle
     * period, we can calculate a crazy value.
     */
    if (in_memory > 1 && i != lsm_tree->nchunks &&
        !F_ISSET_ATOMIC(*cp, WT_LSM_CHUNK_STABLE)) {
        prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
        WT_ASSERT(session, prev_chunk->generation == 0);
        WT_ASSERT(session, WT_TIMECMP(
            chunk->create_ts, prev_chunk->create_ts) >= 0);
        timediff =
            WT_TIMEDIFF(chunk->create_ts, prev_chunk->create_ts);
        WT_ASSERT(session, WT_TIMECMP(
            prev_chunk->create_ts, (*cp)->create_ts) >= 0);
        oldtime = WT_TIMEDIFF(prev_chunk->create_ts, (*cp)->create_ts);
        if (timediff < 10 * oldtime)
            lsm_tree->chunk_fill_ms =
                (3 * lsm_tree->chunk_fill_ms +
                timediff / 1000000) / 4;
    }
}
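/*
 * A worked example of the throttle arithmetic above, as a standalone
 * program.  Assumptions (hedged): the timestamps compared by WT_TIMEDIFF
 * are in nanoseconds, and the computed value is the delay applied on the
 * periodic sleeps the function's comment describes; the sample inputs
 * are invented for illustration.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t in_memory = 5;             /* in-memory chunks */
    uint64_t timediff = 60000000000;    /* 60s: newest vs. on-disk chunk */
    uint64_t record_count = 3000000;    /* records in in-memory chunks */
    long sleep_val;

    /* throttle_sleep = (in_memory - 2) * timediff / (20 * record_count) */
    sleep_val = (long)((in_memory - 2) * timediff / (20 * record_count));
    printf("throttle sleep: %ld\n", sleep_val);     /* prints 3000 */
    return (0);
}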
/*
 * __wt_log_scan --
 *     Scan the logs, calling a function on each record found.
 */
int
__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
    int (*func)(WT_SESSION_IMPL *session,
    WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie)
{
    WT_CONNECTION_IMPL *conn;
    WT_ITEM buf;
    WT_DECL_RET;
    WT_FH *log_fh;
    WT_LOG *log;
    WT_LOG_RECORD *logrec;
    WT_LSN end_lsn, rd_lsn, start_lsn;
    off_t log_size;
    uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
    u_int i, logcount;
    int eol;
    char **logfiles;

    conn = S2C(session);
    log = conn->log;
    log_fh = NULL;
    logcount = 0;
    logfiles = NULL;
    eol = 0;
    WT_CLEAR(buf);

    /*
     * If the caller did not give us a callback function there is nothing
     * to do.
     */
    if (func == NULL)
        return (0);

    if (LF_ISSET(WT_LOGSCAN_RECOVER))
        WT_RET(__wt_verbose(session, WT_VERB_LOG,
            "__wt_log_scan truncating to %u/%" PRIuMAX,
            log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset));

    if (log != NULL) {
        allocsize = log->allocsize;

        if (lsnp == NULL) {
            if (LF_ISSET(WT_LOGSCAN_FIRST))
                start_lsn = log->first_lsn;
            else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
                start_lsn = log->ckpt_lsn;
            else
                return (WT_ERROR);      /* Illegal usage */
        } else {
            if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
                WT_RET_MSG(session, WT_ERROR,
                    "choose either a start LSN or a start flag");

            /* Offsets must be on allocation boundaries. */
            if (lsnp->offset % allocsize != 0 ||
                lsnp->file > log->fileid)
                return (WT_NOTFOUND);

            /*
             * Log cursors may not know the starting LSN.  If an
             * LSN pointer is passed in, but it is the INIT_LSN,
             * start from the first_lsn.
             */
            start_lsn = *lsnp;
            if (IS_INIT_LSN(&start_lsn))
                start_lsn = log->first_lsn;
        }
        end_lsn = log->alloc_lsn;
    } else {
        /*
         * If logging is not configured, we can still print out the log
         * if log files exist.  We just need to set the LSNs from what
         * is in the files versus what is in the live connection.
         */
        /*
         * Set allocsize to the minimum alignment it could be.  Larger
         * records and larger allocation boundaries should always be
         * a multiple of this.
         */
        allocsize = LOG_ALIGN;
        lastlog = 0;
        firstlog = UINT32_MAX;
        WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
        if (logcount == 0)
            /* Return unsupported if no log files exist. */
            return (ENOTSUP);
        for (i = 0; i < logcount; i++) {
            WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
                &lognum));
            lastlog = WT_MAX(lastlog, lognum);
            firstlog = WT_MIN(firstlog, lognum);
        }
        start_lsn.file = firstlog;
        end_lsn.file = lastlog;
        start_lsn.offset = end_lsn.offset = 0;
        __wt_log_files_free(session, logfiles, logcount);
        logfiles = NULL;
    }
    WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
    WT_ERR(__log_filesize(session, log_fh, &log_size));
    rd_lsn = start_lsn;
    WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN));
    for (;;) {
        if (rd_lsn.offset + allocsize > log_size) {
advance:
            /*
             * If we read the last record, go to the next file.
             */
            WT_ERR(__wt_close(session, log_fh));
            log_fh = NULL;
            eol = 1;
            /*
             * Truncate this log file before we move to the next.
             */
            if (LF_ISSET(WT_LOGSCAN_RECOVER))
                WT_ERR(__log_truncate(session, &rd_lsn, 1));
            rd_lsn.file++;
            rd_lsn.offset = 0;
            /*
             * Avoid an error message when we reach end of log
             * by checking here.
             */
            if (rd_lsn.file > end_lsn.file)
                break;
            WT_ERR(__log_openfile(
                session, 0, &log_fh, rd_lsn.file));
            WT_ERR(__log_filesize(session, log_fh, &log_size));
            continue;
        }
        /*
         * Read the minimum allocation size a record could be.
         */
        WT_ASSERT(session, buf.memsize >= allocsize);
        WT_ERR(__wt_read(session,
            log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem));
        /*
         * First 8 bytes is the real record length.  See if we
         * need to read more than the allocation size.  We expect
         * that we rarely will have to read more.  Most log records
         * will be fairly small.
         */
        reclen = *(uint32_t *)buf.mem;
        /*
         * Log files are pre-allocated.  We never expect a zero length
         * unless we've reached the end of the log.  The log can be
         * written out of order, so when recovery finds the end of
         * the log, truncate the file and remove any later log files
         * that may exist.
         */
        if (reclen == 0) {
            /* This LSN is the end. */
            break;
        }
        rdup_len = __wt_rduppo2(reclen, allocsize);
        if (reclen > allocsize) {
            /*
             * The log file end could be the middle of this
             * log record.
             */
            if (rd_lsn.offset + rdup_len > log_size)
                goto advance;
            /*
             * We need to round up and read in the full padded
             * record, especially for direct I/O.
             */
            WT_ERR(__wt_buf_grow(session, &buf, rdup_len));
            WT_ERR(__wt_read(session,
                log_fh, rd_lsn.offset, (size_t)rdup_len, buf.mem));
            WT_STAT_FAST_CONN_INCR(session, log_scan_rereads);
        }
        /*
         * We read in the record, verify checksum.
         */
        buf.size = reclen;
        logrec = (WT_LOG_RECORD *)buf.mem;
        cksum = logrec->checksum;
        logrec->checksum = 0;
        logrec->checksum = __wt_cksum(logrec, logrec->len);
        if (logrec->checksum != cksum) {
            /*
             * A checksum mismatch means we have reached the end of
             * the useful part of the log.  This should be found on
             * the first pass through recovery.  In the second pass
             * where we truncate the log, this is where it should
             * end.
             */
            if (log != NULL)
                log->trunc_lsn = rd_lsn;
            break;
        }

        /*
         * We have a valid log record.  If it is not the log file
         * header, invoke the callback.
         */
        WT_STAT_FAST_CONN_INCR(session, log_scan_records);
        if (rd_lsn.offset != 0) {
            WT_ERR((*func)(session, &buf, &rd_lsn, cookie));
            if (LF_ISSET(WT_LOGSCAN_ONE))
                break;
        }
        rd_lsn.offset += (off_t)rdup_len;
    }

    /* Truncate if we're in recovery. */
    if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
        LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
        WT_ERR(__log_truncate(session, &rd_lsn, 0));

err:    WT_STAT_FAST_CONN_INCR(session, log_scans);
    if (logfiles != NULL)
        __wt_log_files_free(session, logfiles, logcount);
    __wt_buf_free(session, &buf);
    /*
     * If the caller wants one record and it is at the end of log,
     * return WT_NOTFOUND.
     */
    if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
        ret = WT_NOTFOUND;
    if (ret == ENOENT)
        ret = 0;
    if (log_fh != NULL)
        WT_TRET(__wt_close(session, log_fh));
    return (ret);
}
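/*
 * A minimal sketch of the length handling in the scan loop above: log
 * records are padded to a power-of-two allocation boundary, so after
 * reading the real record length from the header, the reader rounds up
 * to find where the next record starts.  rduppo2 mirrors what the
 * original calls __wt_rduppo2; the sample allocsize is an assumption for
 * the example.
 */
#include <inttypes.h>
#include <stdio.h>

static uint32_t
rduppo2(uint32_t v, uint32_t po2)
{
    return ((v + po2 - 1) & ~(po2 - 1));    /* po2 must be a power of 2 */
}

int
main(void)
{
    uint32_t allocsize = 128, reclen = 300;

    /* A 300-byte record occupies 384 bytes on disk. */
    printf("%" PRIu32 "\n", rduppo2(reclen, allocsize));
    return (0);
}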
/*
 * __curjoin_init_next --
 *     Initialize the cursor join when the next function is first called.
 */
static int
__curjoin_init_next(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    bool iterable)
{
    WT_BLOOM *bloom;
    WT_CURSOR *origcur;
    WT_CURSOR_JOIN_ENDPOINT *end;
    WT_CURSOR_JOIN_ENTRY *je, *jeend, *je2;
    WT_DECL_RET;
    size_t size;
    uint32_t f, k;
    char *mainbuf;
    const char *def_cfg[] = { WT_CONFIG_BASE(
        session, WT_SESSION_open_cursor), NULL };
    const char *raw_cfg[] = { WT_CONFIG_BASE(
        session, WT_SESSION_open_cursor), "raw", NULL };
    const char **config, *proj, *urimain;

    mainbuf = NULL;
    if (cjoin->entries_next == 0)
        WT_RET_MSG(session, EINVAL,
            "join cursor has not yet been joined with any other "
            "cursors");

    /* Get a consistent view of our subordinate cursors if appropriate. */
    __wt_txn_cursor_op(session);

    if (F_ISSET((WT_CURSOR *)cjoin, WT_CURSTD_RAW))
        config = &raw_cfg[0];
    else
        config = &def_cfg[0];
    urimain = cjoin->table->iface.name;
    if ((proj = cjoin->projection) != NULL) {
        size = strlen(urimain) + strlen(proj) + 1;
        WT_ERR(__wt_calloc(session, size, 1, &mainbuf));
        WT_ERR(__wt_snprintf(mainbuf, size, "%s%s", urimain, proj));
        urimain = mainbuf;
    }
    WT_ERR(__wt_open_cursor(session, urimain, (WT_CURSOR *)cjoin,
        config, &cjoin->main));

    jeend = &cjoin->entries[cjoin->entries_next];
    for (je = cjoin->entries; je < jeend; je++) {
        if (je->subjoin != NULL) {
            WT_ERR(__curjoin_init_next(session, je->subjoin,
                iterable));
            continue;
        }
        __wt_stat_join_init_single(&je->stats);
        /*
         * For a single compare=le/lt endpoint in any entry that may
         * be iterated, construct a companion compare=ge endpoint
         * that will actually be iterated.
         */
        if (iterable && je->ends_next == 1 &&
            F_ISSET(&je->ends[0], WT_CURJOIN_END_LT)) {
            origcur = je->ends[0].cursor;
            WT_ERR(__curjoin_insert_endpoint(session, je, 0, &end));
            WT_ERR(__wt_open_cursor(session, origcur->uri,
                (WT_CURSOR *)cjoin,
                F_ISSET(origcur, WT_CURSTD_RAW) ? raw_cfg : def_cfg,
                &end->cursor));
            end->flags = WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ |
                WT_CURJOIN_END_OWN_CURSOR;
            WT_ERR(end->cursor->next(end->cursor));
            F_CLR(je, WT_CURJOIN_ENTRY_DISJUNCTION);
        }
        for (end = &je->ends[0]; end < &je->ends[je->ends_next]; end++)
            WT_ERR(__curjoin_endpoint_init_key(session, je, end));

        /*
         * Do any needed Bloom filter initialization.  Ignore Bloom
         * filters for entries that will be iterated.  They won't
         * help since these entries either don't need an inclusion
         * check or are doing any needed check during the iteration.
         */
        if (!iterable && F_ISSET(je, WT_CURJOIN_ENTRY_BLOOM)) {
            if (session->txn.isolation == WT_ISO_READ_UNCOMMITTED)
                WT_ERR_MSG(session, EINVAL,
                    "join cursors with Bloom filters cannot be "
                    "used with read-uncommitted isolation");
            if (je->bloom == NULL) {
                /*
                 * Look for compatible filters to be shared,
                 * pick compatible numbers for bit counts
                 * and number of hashes.
                 */
                f = je->bloom_bit_count;
                k = je->bloom_hash_count;
                for (je2 = je + 1; je2 < jeend; je2++)
                    if (F_ISSET(je2,
                        WT_CURJOIN_ENTRY_BLOOM) &&
                        je2->count == je->count) {
                        f = WT_MAX(
                            je2->bloom_bit_count, f);
                        k = WT_MAX(
                            je2->bloom_hash_count, k);
                    }
                je->bloom_bit_count = f;
                je->bloom_hash_count = k;
                WT_ERR(__wt_bloom_create(session, NULL,
                    NULL, je->count, f, k, &je->bloom));
                F_SET(je, WT_CURJOIN_ENTRY_OWN_BLOOM);
                WT_ERR(__curjoin_init_bloom(
                    session, cjoin, je, je->bloom));
                /*
                 * Share the Bloom filter, making all
                 * config info consistent.
                 */
                for (je2 = je + 1; je2 < jeend; je2++)
                    if (F_ISSET(je2,
                        WT_CURJOIN_ENTRY_BLOOM) &&
                        je2->count == je->count) {
                        WT_ASSERT(session,
                            je2->bloom == NULL);
                        je2->bloom = je->bloom;
                        je2->bloom_bit_count = f;
                        je2->bloom_hash_count = k;
                    }
            } else {
                /*
                 * Create a temporary filter that we'll
                 * merge into the shared one.  The Bloom
                 * parameters of the two filters must match.
                 */
                WT_ERR(__wt_bloom_create(session, NULL,
                    NULL, je->count, je->bloom_bit_count,
                    je->bloom_hash_count, &bloom));
                WT_ERR(__curjoin_init_bloom(
                    session, cjoin, je, bloom));
                WT_ERR(__wt_bloom_intersection(
                    je->bloom, bloom));
                WT_ERR(__wt_bloom_close(bloom));
            }
        }
        if (!F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
            iterable = false;
    }
    F_SET(cjoin, WT_CURJOIN_INITIALIZED);

err:    __wt_free(session, mainbuf);
    return (ret);
}
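/*
 * The sharing logic above takes the maximum bit count and hash count of
 * the compatible filters.  For context, a sketch of the standard Bloom
 * filter sizing formulas (not WiredTiger code): given n expected items
 * and a target false-positive rate p, m = -n ln(p) / (ln 2)^2 bits and
 * k = (m / n) ln 2 hash functions are optimal.  Compile with -lm.
 */
#include <math.h>
#include <stdio.h>

int
main(void)
{
    double n = 1000000, p = 0.01, m, k;

    m = -n * log(p) / (log(2) * log(2));
    k = m / n * log(2);
    printf("bits=%.0f (%.1f per item), hashes=%.1f\n", m, m / n, k);
    return (0);
}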
/*
 * __wt_las_sweep --
 *     Sweep the lookaside table.
 */
int
__wt_las_sweep(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_CURSOR *cursor;
    WT_DECL_ITEM(las_addr);
    WT_DECL_ITEM(las_key);
    WT_DECL_RET;
    WT_ITEM *key;
    uint64_t cnt, las_counter, las_txnid;
    int64_t remove_cnt;
    uint32_t las_id, session_flags;
    int notused;

    conn = S2C(session);
    cursor = NULL;
    key = &conn->las_sweep_key;
    remove_cnt = 0;
    session_flags = 0;      /* [-Werror=maybe-uninitialized] */

    WT_ERR(__wt_scr_alloc(session, 0, &las_addr));
    WT_ERR(__wt_scr_alloc(session, 0, &las_key));

    WT_ERR(__wt_las_cursor(session, &cursor, &session_flags));

    /*
     * If we're not starting a new sweep, position the cursor using the
     * key from the last call (we don't care if we're before or after the
     * key, just roughly in the same spot is fine).
     */
    if (key->size != 0) {
        __wt_cursor_set_raw_key(cursor, key);
        ret = cursor->search_near(cursor, &notused);

        /*
         * Don't search for the same key twice; if we don't set a new
         * key below, it's because we've reached the end of the table
         * and we want the next pass to start at the beginning of the
         * table.  Searching for the same key could leave us stuck at
         * the end of the table, repeatedly checking the same rows.
         */
        key->size = 0;
        if (ret != 0)
            goto srch_notfound;
    }

    /*
     * The sweep server wakes up every 10 seconds (by default), it's a
     * slow moving thread.  Try to review the entire lookaside table once
     * every 5 minutes, or every 30 calls.
     *
     * The reason is because the lookaside table exists because we're
     * seeing cache/eviction pressure (it allows us to trade performance
     * and disk space for cache space), and it's likely lookaside blocks
     * are being evicted, and reading them back in doesn't help things.
     * A trickier, but possibly better, alternative might be to review all
     * lookaside blocks in the cache in order to get rid of them, and
     * slowly review lookaside blocks that have already been evicted.
     */
    cnt = (uint64_t)WT_MAX(100, conn->las_record_cnt / 30);

    /* Discard pages we read as soon as we're done with them. */
    F_SET(session, WT_SESSION_NO_CACHE);

    /* Walk the file. */
    for (; cnt > 0 && (ret = cursor->next(cursor)) == 0; --cnt) {
        /*
         * If the loop terminates after completing a work unit, we will
         * continue the table sweep next time.  Get a local copy of the
         * sweep key, we're going to reset the cursor; do so before
         * calling cursor.remove, cursor.remove can discard our hazard
         * pointer and the page could be evicted from underneath us.
         */
        if (cnt == 1) {
            WT_ERR(__wt_cursor_get_raw_key(cursor, key));
            if (!WT_DATA_IN_ITEM(key))
                WT_ERR(__wt_buf_set(
                    session, key, key->data, key->size));
        }
        WT_ERR(cursor->get_key(cursor,
            &las_id, las_addr, &las_counter, &las_txnid, las_key));

        /*
         * If the on-page record transaction ID associated with the
         * record is globally visible, the record can be discarded.
         *
         * Cursor opened overwrite=true: won't return WT_NOTFOUND
         * should another thread remove the record before we do, and
         * the cursor remains positioned in that case.
         */
        if (__wt_txn_visible_all(session, las_txnid)) {
            WT_ERR(cursor->remove(cursor));
            ++remove_cnt;
        }
    }

srch_notfound:
    WT_ERR_NOTFOUND_OK(ret);

    if (0) {
err:        __wt_buf_free(session, key);
    }

    WT_TRET(__wt_las_cursor_close(session, &cursor, session_flags));

    /*
     * If there were races to remove records, we can over-count.  All
     * arithmetic is signed, so underflow isn't fatal, but check anyway so
     * we don't skew low over time.
     */
    if (remove_cnt > S2C(session)->las_record_cnt)
        S2C(session)->las_record_cnt = 0;
    else if (remove_cnt > 0)
        (void)__wt_atomic_subi64(&conn->las_record_cnt, remove_cnt);

    F_CLR(session, WT_SESSION_NO_CACHE);

    __wt_scr_free(session, &las_addr);
    __wt_scr_free(session, &las_key);
    return (ret);
}
/*
 * __wt_curjoin_join --
 *	Add a new join to a join cursor.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
	WT_CURSOR_INDEX *cindex;
	WT_CURSOR_JOIN *child;
	WT_CURSOR_JOIN_ENDPOINT *end;
	WT_CURSOR_JOIN_ENTRY *entry;
	size_t len;
	uint8_t endrange;
	u_int i, ins, nonbloom;
	bool hasins, needbloom, nested, range_eq;

	entry = NULL;
	hasins = needbloom = false;
	ins = nonbloom = 0;			/* -Wuninitialized */

	if (cjoin->entries_next == 0) {
		if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION))
			F_SET(cjoin, WT_CURJOIN_DISJUNCTION);
	} else if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
	    !F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
		WT_RET_MSG(session, EINVAL,
		    "operation=or does not match previous operation=and");
	else if (!LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION) &&
	    F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION))
		WT_RET_MSG(session, EINVAL,
		    "operation=and does not match previous operation=or");

	nested = WT_PREFIX_MATCH(ref_cursor->uri, "join:");
	if (!nested)
		for (i = 0; i < cjoin->entries_next; i++) {
			if (cjoin->entries[i].index == idx &&
			    cjoin->entries[i].subjoin == NULL) {
				entry = &cjoin->entries[i];
				break;
			}
			if (!needbloom && i > 0 &&
			    !F_ISSET(&cjoin->entries[i],
			    WT_CURJOIN_ENTRY_BLOOM)) {
				needbloom = true;
				nonbloom = i;
			}
		}
	else {
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM))
			WT_RET_MSG(session, EINVAL,
			    "Bloom filters cannot be used with subjoins");
	}

	if (entry == NULL) {
		WT_RET(__wt_realloc_def(session, &cjoin->entries_allocated,
		    cjoin->entries_next + 1, &cjoin->entries));
		if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
			/*
			 * Reorder the list so that after the first entry,
			 * the Bloom filtered entries come next, followed by
			 * the non-Bloom entries.  Once the Bloom filters
			 * are built, determining membership via Bloom is
			 * faster than without Bloom, so we can answer
			 * membership questions more quickly, and with less
			 * I/O, with the Bloom entries first.
			 */
			entry = &cjoin->entries[nonbloom];
			memmove(entry + 1, entry,
			    (cjoin->entries_next - nonbloom) *
			    sizeof(WT_CURSOR_JOIN_ENTRY));
			memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
		} else
			entry = &cjoin->entries[cjoin->entries_next];
		entry->index = idx;
		entry->flags = flags;
		entry->count = count;
		entry->bloom_bit_count = bloom_bit_count;
		entry->bloom_hash_count = bloom_hash_count;
		++cjoin->entries_next;
	} else {
		/* Merge the join into an existing entry for this index. */
		if (count != 0 && entry->count != 0 && entry->count != count)
			WT_RET_MSG(session, EINVAL,
			    "count=%" PRIu64 " does not match "
			    "previous count=%" PRIu64 " for this index",
			    count, entry->count);
		if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
			WT_RET_MSG(session, EINVAL,
			    "join has incompatible strategy "
			    "values for the same index");
		if (LF_MASK(WT_CURJOIN_ENTRY_FALSE_POSITIVES) !=
		    F_MASK(entry, WT_CURJOIN_ENTRY_FALSE_POSITIVES))
			WT_RET_MSG(session, EINVAL,
			    "join has incompatible bloom_false_positives "
			    "values for the same index");

		/*
		 * Check against other comparisons (we call them endpoints)
		 * already set up for this index.  We allow either:
		 *   - one or more "eq" (with disjunction)
		 *   - exactly one "eq" (with conjunction)
		 *   - exactly one of "gt" or "ge" (conjunction or disjunction)
		 *   - exactly one of "lt" or "le" (conjunction or disjunction)
		 *   - one of "gt"/"ge" along with one of "lt"/"le"
		 *     (currently restricted to conjunction).
		 *
		 * Some other combinations, although expressible, either make
		 * no sense (X == 3 AND X == 5) or are reducible (X < 7 AND
		 * X < 9).  Other specific cases, such as (X < 7 OR X > 15)
		 * or (X == 4 OR X > 15), make sense but are not yet handled.
		 */
		for (i = 0; i < entry->ends_next; i++) {
			end = &entry->ends[i];
			range_eq = (range == WT_CURJOIN_END_EQ);
			endrange = WT_CURJOIN_END_RANGE(end);
			if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
			    ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
			    (F_ISSET(end, WT_CURJOIN_END_LT) &&
			    ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
			    (endrange == WT_CURJOIN_END_EQ &&
			    (range &
			    (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT)) != 0))
				WT_RET_MSG(session, EINVAL,
				    "join has overlapping ranges");
			if (range == WT_CURJOIN_END_EQ &&
			    endrange == WT_CURJOIN_END_EQ &&
			    !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
				WT_RET_MSG(session, EINVAL,
				    "compare=eq can only be combined "
				    "using operation=or");

			/*
			 * Sort "gt"/"ge" to the front, followed by any number
			 * of "eq", and finally "lt"/"le".
			 */
			if (!hasins &&
			    ((range & WT_CURJOIN_END_GT) != 0 ||
			    (range == WT_CURJOIN_END_EQ &&
			    endrange != WT_CURJOIN_END_EQ &&
			    !F_ISSET(end, WT_CURJOIN_END_GT)))) {
				ins = i;
				hasins = true;
			}
		}
		/* All checks completed, merge any new configuration now. */
		entry->count = count;
		entry->bloom_bit_count =
		    WT_MAX(entry->bloom_bit_count, bloom_bit_count);
		entry->bloom_hash_count =
		    WT_MAX(entry->bloom_hash_count, bloom_hash_count);
	}
	if (nested) {
		child = (WT_CURSOR_JOIN *)ref_cursor;
		entry->subjoin = child;
		child->parent = cjoin;
	} else {
		WT_RET(__curjoin_insert_endpoint(session, entry,
		    hasins ? ins : entry->ends_next, &end));
		end->cursor = ref_cursor;
		F_SET(end, range);

		if (entry->main == NULL && idx != NULL) {
			/*
			 * Open the main file with a projection of the
			 * indexed columns.
			 */
			WT_RET(__curjoin_open_main(session, cjoin, entry));

			/*
			 * When we are repacking index keys to remove the
			 * primary key, we never want to transform trailing
			 * 'u'.  Use no-op padding to force this.
			 */
			cindex = (WT_CURSOR_INDEX *)ref_cursor;
			len = strlen(cindex->iface.key_format) + 3;
			WT_RET(__wt_calloc(
			    session, len, 1, &entry->repack_format));
			WT_RET(__wt_snprintf(entry->repack_format,
			    len, "%s0x", cindex->iface.key_format));
		}
	}
	return (0);
}
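/*
 * The endpoint rules above are easiest to see from the application side,
 * where this function is driven through the public WT_SESSION::join API.
 * The following is a minimal sketch, not part of the WiredTiger source:
 * two positioned endpoints on the same index merge into one entry as a
 * range, and "strategy=bloom" sets WT_CURJOIN_ENTRY_BLOOM, which must
 * match across calls for the same index.  The table, index and key
 * values are hypothetical.
 */
static int
join_example(WT_SESSION *session)
{
	WT_CURSOR *join_cursor, *low, *high;
	int ret;

	/* Open a join cursor and two cursors on the same index. */
	if ((ret = session->open_cursor(session,
	    "join:table:people", NULL, NULL, &join_cursor)) != 0 ||
	    (ret = session->open_cursor(session,
	    "index:people:age", NULL, NULL, &low)) != 0 ||
	    (ret = session->open_cursor(session,
	    "index:people:age", NULL, NULL, &high)) != 0)
		return (ret);

	/* age >= 18: the first endpoint for this index's entry. */
	low->set_key(low, 18);
	if ((ret = low->search(low)) != 0)
		return (ret);
	if ((ret = session->join(session, join_cursor, low,
	    "compare=ge,strategy=bloom,count=1000")) != 0)
		return (ret);

	/*
	 * age < 65: merged into the same entry as a second endpoint; the
	 * strategy must match the earlier call for this index.
	 */
	high->set_key(high, 65);
	if ((ret = high->search(high)) != 0)
		return (ret);
	if ((ret = session->join(session, join_cursor, high,
	    "compare=lt,strategy=bloom")) != 0)
		return (ret);

	/* Iterate the conjunction (18 <= age < 65). */
	while ((ret = join_cursor->next(join_cursor)) == 0)
		;
	return (ret == WT_NOTFOUND ? 0 : ret);
}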
/*
 * modify_run --
 *	Run some tests:
 *
 * 1. Create an initial value, a copy and a fake cursor to use with the
 * WiredTiger routines.  Generate a set of modify vectors and apply them to
 * the item stored in the cursor using the modify apply API.  Also apply the
 * same modify vector to one of the copies using a helper routine written
 * to test the modify API.  The final value generated with the modify API
 * and the helper routine should match.
 *
 * 2. Use the initial value and the modified value generated above as
 * inputs into the calculate-modify API to generate a set of modify
 * vectors.  Apply this generated vector to the initial value using the
 * modify apply API to obtain a final value.  The final value generated
 * should match the modified value that was used as input to the
 * calculate-modify API.
 */
static void
modify_run(bool verbose)
{
	WT_CURSOR *cursor, _cursor;
	WT_DECL_RET;
	WT_ITEM *localA, _localA, *localB, _localB;
	size_t len;
	int i, j;

	/* Initialize the RNG. */
	__wt_random_init_seed(NULL, &rnd);

	/* Set up replacement information. */
	modify_repl_init();

	/* We need three WT_ITEMs, one of them part of a fake cursor. */
	localA = &_localA;
	memset(&_localA, 0, sizeof(_localA));
	localB = &_localB;
	memset(&_localB, 0, sizeof(_localB));
	cursor = &_cursor;
	memset(&_cursor, 0, sizeof(_cursor));
	cursor->value_format = "u";

#define	NRUNS	10000
	for (i = 0; i < NRUNS; ++i) {
		/* Create an initial value. */
		len = (size_t)(__wt_random(&rnd) % MAX_REPL_BYTES);
		testutil_check(__wt_buf_set(NULL, localA, modify_repl, len));

		for (j = 0; j < 1000; ++j) {
			/* Copy the current value into the second item. */
			testutil_check(__wt_buf_set(
			    NULL, localB, localA->data, localA->size));

			/*
			 * Create a random set of modify vectors, run the
			 * underlying library modification function, then
			 * compare the result against our implementation
			 * of modify.
			 */
			modify_build();
			testutil_check(__wt_buf_set(
			    NULL, &cursor->value, localA->data, localA->size));
			testutil_check(__wt_modify_apply_api(
			    NULL, cursor, entries, nentries));
			slow_apply_api(localA);
			compare(localA, &cursor->value);

			/*
			 * Call the WiredTiger function to build a modification
			 * vector for the change, and repeat the test using the
			 * WiredTiger modification vector, then compare results
			 * against our implementation of modify.
			 */
			nentries = WT_ELEMENTS(entries);
			ret = wiredtiger_calc_modify(NULL,
			    localB, localA,
			    WT_MAX(localB->size, localA->size) + 100,
			    entries, &nentries);
			if (ret == WT_NOTFOUND)
				continue;
			testutil_check(ret);
			testutil_check(__wt_buf_set(
			    NULL, &cursor->value, localB->data, localB->size));
			testutil_check(__wt_modify_apply_api(
			    NULL, cursor, entries, nentries));
			compare(localA, &cursor->value);
		}
		if (verbose) {
			printf("%d (%d%%)\r", i, (i * 100) / NRUNS);
			fflush(stdout);
		}
	}
	if (verbose)
		printf("%d (100%%)\n", i);

	__wt_buf_free(NULL, localA);
	__wt_buf_free(NULL, localB);
	__wt_buf_free(NULL, &cursor->value);
}
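/*
 * The same two public entry points the test exercises can be used
 * directly by applications.  A minimal sketch follows, not taken from
 * the WiredTiger source: it assumes an open session, a hypothetical
 * table "table:modify_demo" created with key_format=S,value_format=u,
 * and an existing record at "some-key" whose current value equals oldv,
 * since a modify vector only makes sense against the stored value.
 */
static int
modify_example(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	WT_ITEM oldv, newv;
	WT_MODIFY entries[10];
	int nentries, ret, t_ret;

	if ((ret = session->open_cursor(session,
	    "table:modify_demo", NULL, NULL, &cursor)) != 0)
		return (ret);

	memset(&oldv, 0, sizeof(oldv));
	memset(&newv, 0, sizeof(newv));
	oldv.data = "squint";
	oldv.size = strlen("squint");
	newv.data = "squirt";
	newv.size = strlen("squirt");

	/*
	 * Ask WiredTiger to compute a modify vector turning oldv into
	 * newv; WT_NOTFOUND means no encoding was found within the given
	 * byte limit and the caller should fall back to a full update.
	 */
	nentries = (int)(sizeof(entries) / sizeof(entries[0]));
	if ((ret = wiredtiger_calc_modify(
	    session, &oldv, &newv, 32, entries, &nentries)) == 0 &&
	    (ret = session->begin_transaction(session, NULL)) == 0) {
		/* Apply the vector to the stored value. */
		cursor->set_key(cursor, "some-key");
		if ((ret = cursor->modify(cursor, entries, nentries)) == 0)
			ret = session->commit_transaction(session, NULL);
		else
			(void)session->rollback_transaction(session, NULL);
	}
	t_ret = cursor->close(cursor);
	return (ret == 0 ? t_ret : ret);
}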
/*
 * __wt_lsm_meta_read --
 *	Read the metadata for an LSM tree.
 */
int
__wt_lsm_meta_read(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
{
	WT_CONFIG cparser, lparser;
	WT_CONFIG_ITEM ck, cv, fileconf, lk, lv, metadata;
	WT_DECL_RET;
	WT_LSM_CHUNK *chunk;
	char *lsmconfig;
	u_int nchunks;

	chunk = NULL;			/* -Wconditional-uninitialized */

	/* LSM trees inherit the merge setting from the connection. */
	if (F_ISSET(S2C(session), WT_CONN_LSM_MERGE))
		F_SET(lsm_tree, WT_LSM_TREE_MERGES);

	WT_RET(__wt_metadata_search(session, lsm_tree->name, &lsmconfig));
	WT_ERR(__wt_config_init(session, &cparser, lsmconfig));
	while ((ret = __wt_config_next(&cparser, &ck, &cv)) == 0) {
		if (WT_STRING_MATCH("key_format", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->key_format);
			WT_ERR(__wt_strndup(session,
			    cv.str, cv.len, &lsm_tree->key_format));
		} else if (WT_STRING_MATCH("value_format", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->value_format);
			WT_ERR(__wt_strndup(session,
			    cv.str, cv.len, &lsm_tree->value_format));
		} else if (WT_STRING_MATCH("collator", ck.str, ck.len)) {
			if (cv.len == 0 ||
			    WT_STRING_MATCH("none", cv.str, cv.len))
				continue;
			/*
			 * Extract the application-supplied metadata (if any)
			 * from the file configuration.
			 */
			WT_ERR(__wt_config_getones(
			    session, lsmconfig, "file_config", &fileconf));
			WT_CLEAR(metadata);
			WT_ERR_NOTFOUND_OK(__wt_config_subgets(
			    session, &fileconf, "app_metadata", &metadata));
			WT_ERR(__wt_collator_config(session, lsm_tree->name,
			    &cv, &metadata,
			    &lsm_tree->collator, &lsm_tree->collator_owned));
			WT_ERR(__wt_strndup(session,
			    cv.str, cv.len, &lsm_tree->collator_name));
		} else if (WT_STRING_MATCH("bloom_config", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->bloom_config);
			/* Don't include the brackets. */
			WT_ERR(__wt_strndup(session,
			    cv.str + 1, cv.len - 2, &lsm_tree->bloom_config));
		} else if (WT_STRING_MATCH("file_config", ck.str, ck.len)) {
			__wt_free(session, lsm_tree->file_config);
			/* Don't include the brackets. */
			WT_ERR(__wt_strndup(session,
			    cv.str + 1, cv.len - 2, &lsm_tree->file_config));
		} else if (WT_STRING_MATCH("auto_throttle", ck.str, ck.len)) {
			if (cv.val)
				F_SET(lsm_tree, WT_LSM_TREE_THROTTLE);
			else
				F_CLR(lsm_tree, WT_LSM_TREE_THROTTLE);
		} else if (WT_STRING_MATCH("bloom", ck.str, ck.len))
			lsm_tree->bloom = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("bloom_bit_count", ck.str, ck.len))
			lsm_tree->bloom_bit_count = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("bloom_hash_count", ck.str, ck.len))
			lsm_tree->bloom_hash_count = (uint32_t)cv.val;
		else if (WT_STRING_MATCH(
		    "chunk_count_limit", ck.str, ck.len)) {
			lsm_tree->chunk_count_limit = (uint32_t)cv.val;
			if (cv.val != 0)
				F_CLR(lsm_tree, WT_LSM_TREE_MERGES);
		} else if (WT_STRING_MATCH("chunk_max", ck.str, ck.len))
			lsm_tree->chunk_max = (uint64_t)cv.val;
		else if (WT_STRING_MATCH("chunk_size", ck.str, ck.len))
			lsm_tree->chunk_size = (uint64_t)cv.val;
		else if (WT_STRING_MATCH("merge_max", ck.str, ck.len))
			lsm_tree->merge_max = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("merge_min", ck.str, ck.len))
			lsm_tree->merge_min = (uint32_t)cv.val;
		else if (WT_STRING_MATCH("last", ck.str, ck.len))
			lsm_tree->last = (u_int)cv.val;
		else if (WT_STRING_MATCH("chunks", ck.str, ck.len)) {
			WT_ERR(__wt_config_subinit(session, &lparser, &cv));
			for (nchunks = 0; (ret =
			    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
				if (WT_STRING_MATCH("id", lk.str, lk.len)) {
					WT_ERR(__wt_realloc_def(session,
					    &lsm_tree->chunk_alloc,
					    nchunks + 1, &lsm_tree->chunk));
					WT_ERR(
					    __wt_calloc_one(session, &chunk));
					lsm_tree->chunk[nchunks++] = chunk;
					chunk->id = (uint32_t)lv.val;
					WT_ERR(__wt_lsm_tree_chunk_name(
					    session, lsm_tree,
					    chunk->id, &chunk->uri));
					F_SET(chunk,
					    WT_LSM_CHUNK_ONDISK |
					    WT_LSM_CHUNK_STABLE);
				} else if (WT_STRING_MATCH(
				    "bloom", lk.str, lk.len)) {
					WT_ERR(__wt_lsm_tree_bloom_name(
					    session, lsm_tree,
					    chunk->id, &chunk->bloom_uri));
					F_SET(chunk, WT_LSM_CHUNK_BLOOM);
					continue;
				} else if (WT_STRING_MATCH(
				    "chunk_size", lk.str, lk.len)) {
					chunk->size = (uint64_t)lv.val;
					continue;
				} else if (WT_STRING_MATCH(
				    "count", lk.str, lk.len)) {
					chunk->count = (uint64_t)lv.val;
					continue;
				} else if (WT_STRING_MATCH(
				    "generation", lk.str, lk.len)) {
					chunk->generation = (uint32_t)lv.val;
					continue;
				}
			}
			WT_ERR_NOTFOUND_OK(ret);
			lsm_tree->nchunks = nchunks;
		} else if (WT_STRING_MATCH("old_chunks", ck.str, ck.len)) {
			WT_ERR(__wt_config_subinit(session, &lparser, &cv));
			for (nchunks = 0; (ret =
			    __wt_config_next(&lparser, &lk, &lv)) == 0; ) {
				if (WT_STRING_MATCH("bloom", lk.str, lk.len)) {
					WT_ERR(__wt_strndup(session,
					    lv.str, lv.len, &chunk->bloom_uri));
					F_SET(chunk, WT_LSM_CHUNK_BLOOM);
					continue;
				}
				WT_ERR(__wt_realloc_def(session,
				    &lsm_tree->old_alloc, nchunks + 1,
				    &lsm_tree->old_chunks));
				WT_ERR(__wt_calloc_one(session, &chunk));
				lsm_tree->old_chunks[nchunks++] = chunk;
				WT_ERR(__wt_strndup(session,
				    lk.str, lk.len, &chunk->uri));
				F_SET(chunk, WT_LSM_CHUNK_ONDISK);
			}
			WT_ERR_NOTFOUND_OK(ret);
			lsm_tree->nold_chunks = nchunks;
		}
		/*
		 * Ignore any other values: the metadata entry might have been
		 * created by a future release, with unknown options.
		 */
	}
	WT_ERR_NOTFOUND_OK(ret);

	/*
	 * If the default merge_min was not overridden, calculate it now.  We
	 * do this here so that trees created before merge_min was added get a
	 * sane value.
	 */
	if (lsm_tree->merge_min < 2)
		lsm_tree->merge_min = WT_MAX(2, lsm_tree->merge_max / 2);

err:	__wt_free(session, lsmconfig);
	return (ret);
}
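/*
 * A note on the "chunks" loop above: within the nested list, an "id" key
 * starts a new chunk, and the following "bloom", "chunk_size", "count"
 * and "generation" keys fill in the chunk most recently created.  The
 * scanning pattern itself is the standard internal WT_CONFIG idiom.  A
 * minimal standalone sketch of that idiom follows; the configuration
 * string and the printed key are made up for illustration.
 */
static int
config_scan_example(WT_SESSION_IMPL *session)
{
	WT_CONFIG parser;
	WT_CONFIG_ITEM k, v;
	WT_DECL_RET;

	WT_RET(__wt_config_init(
	    session, &parser, "chunk_size=1048576,merge_max=15"));
	while ((ret = __wt_config_next(&parser, &k, &v)) == 0)
		if (WT_STRING_MATCH("chunk_size", k.str, k.len))
			printf("chunk_size=%" PRId64 "\n", v.val);

	/* __wt_config_next returns WT_NOTFOUND at the end of the list. */
	return (ret == WT_NOTFOUND ? 0 : ret);
}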
/*
 * __wt_lsm_tree_throttle --
 *	Calculate whether LSM updates need to be throttled.  Must be called
 * with the LSM tree lock held.
 */
void
__wt_lsm_tree_throttle(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only)
{
	WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
	uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
	uint32_t in_memory, gen0_chunks;

	/* Never throttle in small trees. */
	if (lsm_tree->nchunks < 3) {
		lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0;
		return;
	}

	cache_sz = S2C(session)->cache_size;

	/*
	 * In the steady state, we expect that the checkpoint worker thread
	 * will keep up with inserts.  If not, throttle the insert rate to
	 * avoid filling the cache with in-memory chunks.  Threads sleep every
	 * 100 operations, so take that into account in the calculation.
	 *
	 * Also throttle based on whether merge threads are keeping up.  If
	 * there are enough chunks that have never been merged we slow down
	 * inserts so that merges have some chance of keeping up.
	 *
	 * Count the number of in-memory chunks, the number of unmerged chunks
	 * on disk, and find the most recent on-disk chunk (if any).
	 */
	record_count = 1;
	gen0_chunks = in_memory = 0;
	ondisk = NULL;
	for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
	    cp >= lsm_tree->chunk; --cp)
		if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
			record_count += (*cp)->count;
			++in_memory;
		} else {
			/*
			 * Assign ondisk to the last chunk that has been
			 * flushed since the tree was last opened (i.e., it's
			 * on disk and stable is not set).
			 */
			if (ondisk == NULL &&
			    ((*cp)->generation == 0 &&
			    !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
				ondisk = *cp;

			if ((*cp)->generation == 0 &&
			    !F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
				++gen0_chunks;
		}

	last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];

	/* Checkpoint throttling, based on the number of in-memory chunks. */
	if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
		lsm_tree->ckpt_throttle = 0;
	else if (decrease_only)
		;			/* Nothing to do */
	else if (ondisk == NULL) {
		/*
		 * No checkpoint has completed this run.  Keep slowing down
		 * inserts until one does.
		 */
		lsm_tree->ckpt_throttle =
		    WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
	} else {
		WT_ASSERT(session, WT_TIMECMP(
		    last_chunk->create_ts, ondisk->create_ts) >= 0);
		timediff =
		    WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts);
		lsm_tree->ckpt_throttle =
		    (long)((in_memory - 2) * timediff / (20 * record_count));

		/*
		 * Get more aggressive as the number of in-memory chunks
		 * consumes a large proportion of the cache.  In-memory chunks
		 * are allowed to grow up to twice as large as the configured
		 * value when checkpoints aren't keeping up.  That worst case
		 * is when this calculation is relevant.  There is nothing
		 * particularly special about the chosen multipliers.
		 */
		cache_used = in_memory * lsm_tree->chunk_size * 2;
		if (cache_used > cache_sz * 0.8)
			lsm_tree->ckpt_throttle *= 5;
	}

	/*
	 * Merge throttling, based on the number of on-disk, level 0 chunks.
	 *
	 * Don't throttle if the tree has fewer than a single level's number
	 * of chunks.
	 */
	if (lsm_tree->nchunks < lsm_tree->merge_max)
		lsm_tree->merge_throttle = 0;
	else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD)
		WT_LSM_MERGE_THROTTLE_DECREASE(lsm_tree->merge_throttle);
	else if (!decrease_only)
		WT_LSM_MERGE_THROTTLE_INCREASE(lsm_tree->merge_throttle);

	/* Put an upper bound of 1s on both throttle calculations. */
	lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle);
	lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle);

	/*
	 * Update our estimate of how long each in-memory chunk stays active.
	 * Filter out some noise by keeping a weighted history of the
	 * calculated value.  Wait until we have enough chunks that we can
	 * check that the new value is sane: otherwise, after a long idle
	 * period, we can calculate a crazy value.
	 */
	if (in_memory > 1 && ondisk != NULL) {
		prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
		WT_ASSERT(session, prev_chunk->generation == 0);
		WT_ASSERT(session, WT_TIMECMP(
		    last_chunk->create_ts, prev_chunk->create_ts) >= 0);
		timediff =
		    WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts);
		WT_ASSERT(session, WT_TIMECMP(
		    prev_chunk->create_ts, ondisk->create_ts) >= 0);
		oldtime =
		    WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts);
		if (timediff < 10 * oldtime)
			lsm_tree->chunk_fill_ms =
			    (3 * lsm_tree->chunk_fill_ms +
			    timediff / 1000000) / 4;
	}
}
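/*
 * The chunk_fill_ms update above is an exponentially weighted moving
 * average with a 3/4 weight on history, which is what damps the noise
 * the comment mentions.  A standalone sketch of the same filter follows;
 * the function name is illustrative, and it assumes the raw sample is in
 * nanoseconds (as a WT_TIMEDIFF result is) while the history is kept in
 * milliseconds, matching the division by 1000000 above.
 */
static uint64_t
weighted_history_update(uint64_t history_ms, uint64_t sample_ns)
{
	/*
	 * Convert the nanosecond sample to milliseconds, then blend it in
	 * with a 3:1 weighting toward the existing history, exactly as
	 * the throttle code does for chunk_fill_ms.
	 */
	return ((3 * history_ms + sample_ns / 1000000) / 4);
}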
int
run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
    WT_CURSOR *cursor, WT_SESSION *session, int *truncatedp)
{
	TRUNCATE_CONFIG *trunc_cfg;
	TRUNCATE_QUEUE_ENTRY *truncate_item;
	char *truncate_key;
	int ret, t_ret;
	uint64_t used_stone_gap;

	ret = 0;
	trunc_cfg = &thread->trunc_cfg;
	*truncatedp = 0;

	/* Update the total inserts. */
	trunc_cfg->total_inserts = sum_insert_ops(cfg);
	trunc_cfg->expected_total +=
	    (trunc_cfg->total_inserts - trunc_cfg->last_total_inserts);
	trunc_cfg->last_total_inserts = trunc_cfg->total_inserts;

	/* We are done if there isn't enough data to trigger a new milestone. */
	if (trunc_cfg->expected_total <= thread->workload->truncate_count)
		return (0);

	/*
	 * If we are falling behind and using more than one stone per lap we
	 * should widen the stone gap for this lap to try to catch up more
	 * quickly.
	 */
	if (trunc_cfg->expected_total >
	    thread->workload->truncate_count + trunc_cfg->stone_gap) {
		/*
		 * Increase the multiplier until we create stones that are
		 * almost large enough to truncate the whole expected table
		 * size in one operation.
		 */
		trunc_cfg->catchup_multiplier =
		    WT_MIN(trunc_cfg->catchup_multiplier + 1,
		    trunc_cfg->needed_stones - 1);
	} else {
		/* Back off if we start seeing an improvement. */
		trunc_cfg->catchup_multiplier =
		    WT_MAX(trunc_cfg->catchup_multiplier - 1, 1);
	}
	used_stone_gap = trunc_cfg->stone_gap * trunc_cfg->catchup_multiplier;

	while (trunc_cfg->num_stones < trunc_cfg->needed_stones) {
		trunc_cfg->last_key += used_stone_gap;
		truncate_key = calloc(cfg->key_sz, 1);
		if (truncate_key == NULL) {
			lprintf(cfg, ENOMEM, 0,
			    "truncate: couldn't allocate key array");
			return (ENOMEM);
		}
		truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1);
		if (truncate_item == NULL) {
			free(truncate_key);
			lprintf(cfg, ENOMEM, 0,
			    "truncate: couldn't allocate item");
			return (ENOMEM);
		}
		generate_key(cfg, truncate_key, trunc_cfg->last_key);
		truncate_item->key = truncate_key;
		truncate_item->diff = used_stone_gap;
		TAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q);
		trunc_cfg->num_stones++;
	}

	/* We are done if there isn't enough data to trigger a truncate. */
	if (trunc_cfg->num_stones == 0 ||
	    trunc_cfg->expected_total <= thread->workload->truncate_count)
		return (0);

	truncate_item = TAILQ_FIRST(&cfg->stone_head);
	trunc_cfg->num_stones--;
	TAILQ_REMOVE(&cfg->stone_head, truncate_item, q);

	cursor->set_key(cursor, truncate_item->key);
	if ((ret = cursor->search(cursor)) != 0) {
		lprintf(cfg, ret, 0, "Truncate search: failed");
		goto err;
	}

	if ((ret = session->truncate(
	    session, NULL, NULL, cursor, NULL)) != 0) {
		lprintf(cfg, ret, 0, "Truncate: failed");
		goto err;
	}

	*truncatedp = 1;
	trunc_cfg->expected_total -= truncate_item->diff;

err:	free(truncate_item->key);
	free(truncate_item);
	t_ret = cursor->reset(cursor);
	if (t_ret != 0)
		lprintf(cfg, t_ret, 0, "Cursor reset failed");
	if (ret == 0 && t_ret != 0)
		ret = t_ret;
	return (ret);
}
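/*
 * The truncate call above uses the cursor form of WT_SESSION::truncate:
 * with a NULL URI and a NULL start cursor, everything from the beginning
 * of the table up to and including the positioned stop cursor is
 * removed, which is how each milestone deletes the oldest slice of the
 * workload.  A minimal standalone sketch of that form follows; the table
 * URI is made up and the stop cursor must be positioned on an existing
 * record before the call.
 */
static int
truncate_to_key_example(WT_SESSION *session, const char *key)
{
	WT_CURSOR *stop;
	int ret, t_ret;

	if ((ret = session->open_cursor(
	    session, "table:workload", NULL, NULL, &stop)) != 0)
		return (ret);

	/* Position the stop cursor, then truncate from the beginning. */
	stop->set_key(stop, key);
	if ((ret = stop->search(stop)) == 0)
		ret = session->truncate(session, NULL, NULL, stop, NULL);

	t_ret = stop->close(stop);
	return (ret == 0 ? t_ret : ret);
}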