/* * __global_calibrate_ticks -- * Calibrate a ratio from rdtsc ticks to nanoseconds. */ static void __global_calibrate_ticks(void) { /* * Default to using __wt_epoch until we have a good value for the ratio. */ __wt_process.tsc_nsec_ratio = WT_TSC_DEFAULT_RATIO; __wt_process.use_epochtime = true; #if defined (__i386) || defined (__amd64) { struct timespec start, stop; double ratio; uint64_t diff_nsec, diff_tsc, min_nsec, min_tsc; uint64_t tries, tsc_start, tsc_stop; volatile uint64_t i; /* * Run this calibration loop a few times to make sure we get a * reading that does not have a potential scheduling shift in it. * The inner loop is CPU intensive but a scheduling change in the * middle could throw off calculations. Take the minimum amount * of time and compute the ratio. */ min_nsec = min_tsc = UINT64_MAX; for (tries = 0; tries < 3; ++tries) { /* This needs to be CPU intensive and large enough. */ __wt_epoch(NULL, &start); tsc_start = __wt_rdtsc(); for (i = 0; i < 100 * WT_MILLION; i++) ; tsc_stop = __wt_rdtsc(); __wt_epoch(NULL, &stop); diff_nsec = WT_TIMEDIFF_NS(stop, start); diff_tsc = tsc_stop - tsc_start; /* If the clock didn't tick over, we don't have a sample. */ if (diff_nsec == 0 || diff_tsc == 0) continue; min_nsec = WT_MIN(min_nsec, diff_nsec); min_tsc = WT_MIN(min_tsc, diff_tsc); } /* * Only use rdtsc if we got a good reading. One reason this might fail * is that the system's clock granularity is not fine-grained enough. */ if (min_nsec != UINT64_MAX) { ratio = (double)min_tsc / (double)min_nsec; if (ratio > DBL_EPSILON) { __wt_process.tsc_nsec_ratio = ratio; __wt_process.use_epochtime = false; } } } #endif }
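/*
 * Illustrative sketch, not WiredTiger source: once a ticks-per-nanosecond
 * ratio like the one calibrated above is available, converting a raw rdtsc
 * delta back to wall-clock time is a single division (the ratio is
 * ticks/nanosecond, so dividing a tick count yields nanoseconds). The
 * struct clock_state and ticks_to_ns names are hypothetical.
 */
#include <stdint.h>

struct clock_state {
	double tsc_nsec_ratio;	/* rdtsc ticks per nanosecond */
	int use_epochtime;	/* true: fall back to the system clock */
};

static uint64_t
ticks_to_ns(const struct clock_state *cs, uint64_t tick_delta)
{
	return ((uint64_t)((double)tick_delta / cs->tsc_nsec_ratio));
}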
/* * __snapsort_impl -- * Custom quick sort implementation for snapshots. */ static void __snapsort_impl(uint64_t *array, uint32_t f, uint32_t l) { while (f + 16 < l) { uint64_t v1 = array[f], v2 = array[l], v3 = array[(f + l)/2]; uint64_t median = v1 < v2 ? (v3 < v1 ? v1 : WT_MIN(v2, v3)) : (v3 < v2 ? v2 : WT_MIN(v1, v3)); uint32_t m = __snapsort_partition(array, f, l, median); __snapsort_impl(array, f, m); f = m + 1; } }
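/*
 * Illustrative sketch, not WiredTiger source: __snapsort_impl above leaves
 * small ranges unsorted (the f + 16 < l test), so a hybrid sort like this is
 * presumably finished by one insertion-sort pass over the nearly-sorted
 * array; that pass is cheap because every element is already close to its
 * final position. The name insertion_sort_u64 is hypothetical.
 */
#include <stdint.h>

static void
insertion_sort_u64(uint64_t *array, uint32_t n)
{
	uint64_t v;
	uint32_t i, j;

	for (i = 1; i < n; i++) {
		v = array[i];
		for (j = i; j > 0 && array[j - 1] > v; j--)
			array[j] = array[j - 1];
		array[j] = v;
	}
}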
/* * __ovfl_reuse_verbose -- * Dump information about a reuse overflow record. */ static int __ovfl_reuse_verbose(WT_SESSION_IMPL *session, WT_PAGE *page, WT_OVFL_REUSE *reuse, const char *tag) { WT_DECL_ITEM(tmp); WT_DECL_RET; WT_RET(__wt_scr_alloc(session, 64, &tmp)); WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW, "reuse: %s%s%p %s (%s%s%s) {%.*s}", tag == NULL ? "" : tag, tag == NULL ? "" : ": ", page, __wt_addr_string( session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size, tmp), F_ISSET(reuse, WT_OVFL_REUSE_INUSE) ? "inuse" : "", F_ISSET(reuse, WT_OVFL_REUSE_INUSE) && F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? ", " : "", F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? "just-added" : "", WT_MIN(reuse->value_size, 40), (char *)WT_OVFL_REUSE_VALUE(reuse))); err: __wt_scr_free(session, &tmp); return (ret); }
/* * __ovfl_txnc_skip_search_stack -- * Search an overflow transaction-cache skiplist, returning an * insert/remove stack. */ static void __ovfl_txnc_skip_search_stack(WT_OVFL_TXNC **head, WT_OVFL_TXNC ***stack, const void *addr, size_t addr_size) { WT_OVFL_TXNC **e; size_t len; int cmp, i; /* * Start at the highest skip level, then go as far as possible at each * level before stepping down to the next. */ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { if (*e == NULL) { /* Empty levels */ stack[i--] = e--; continue; } /* * If the skiplist addr is larger than the search addr, or * they compare equally and the skiplist addr is longer than * the search addr, drop down a level, otherwise continue on * this level. */ len = WT_MIN((*e)->addr_size, addr_size); cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len); if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size)) stack[i--] = e--; /* Drop down a level */ else e = &(*e)->next[i]; /* Keep going at this level */ } }
/* * __posix_file_write -- * POSIX pwrite. */ static int __posix_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t offset, size_t len, const void *buf) { WT_FILE_HANDLE_POSIX *pfh; WT_SESSION_IMPL *session; size_t chunk; ssize_t nw; const uint8_t *addr; session = (WT_SESSION_IMPL *)wt_session; pfh = (WT_FILE_HANDLE_POSIX *)file_handle; /* Assert direct I/O is aligned and a multiple of the alignment. */ WT_ASSERT(session, !pfh->direct_io || S2C(session)->buffer_alignment == 0 || (!((uintptr_t)buf & (uintptr_t)(S2C(session)->buffer_alignment - 1)) && len >= S2C(session)->buffer_alignment && len % S2C(session)->buffer_alignment == 0)); /* Break writes larger than 1GB into 1GB chunks. */ for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) { chunk = WT_MIN(len, WT_GIGABYTE); if ((nw = pwrite(pfh->fd, addr, chunk, offset)) < 0) WT_RET_MSG(session, __wt_errno(), "%s: handle-write: pwrite: failed to write %" WT_SIZET_FMT " bytes at offset %" PRIuMAX, file_handle->name, chunk, (uintmax_t)offset); } return (0); }
int __wt_read(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf) { size_t chunk; ssize_t nr; uint8_t *addr; WT_STAT_FAST_CONN_INCR(session, read_io); WT_RET(__wt_verbose(session, WT_VERB_FILEOPS, "%s: read %" WT_SIZET_FMT " bytes at offset %" PRIuMAX, fh->name, len, (uintmax_t)offset)); /* Assert direct I/O is aligned and a multiple of the alignment. */ WT_ASSERT(session, !fh->direct_io || S2C(session)->buffer_alignment == 0 || (!((uintptr_t)buf & (uintptr_t)(S2C(session)->buffer_alignment - 1)) && len >= S2C(session)->buffer_alignment && len % S2C(session)->buffer_alignment == 0)); /* Break reads larger than 1GB into 1GB chunks. */ for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { chunk = WT_MIN(len, WT_GIGABYTE); if ((nr = pread(fh->fd, addr, chunk, offset)) <= 0) WT_RET_MSG(session, nr == 0 ? WT_ERROR : __wt_errno(), "%s read error: failed to read %" WT_SIZET_FMT " bytes at offset %" PRIuMAX, fh->name, chunk, (uintmax_t)offset); } return (0); }
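/*
 * Illustrative sketch, not WiredTiger source: the same "cap each system call
 * at 1GB and advance by however many bytes the kernel accepted" pattern used
 * by the write and read paths above, reduced to plain POSIX. The
 * chunked_pwrite name and the error handling are assumptions for the example;
 * WiredTiger additionally maps errno and asserts direct-I/O alignment.
 */
#define _XOPEN_SOURCE 600	/* pwrite */
#include <errno.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>

#define CHUNK_MAX	((size_t)1 << 30)	/* 1GB per system call */

static int
chunked_pwrite(int fd, const void *buf, size_t len, off_t offset)
{
	const uint8_t *addr;
	size_t chunk;
	ssize_t nw;

	for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) {
		chunk = len < CHUNK_MAX ? len : CHUNK_MAX;
		if ((nw = pwrite(fd, addr, chunk, offset)) < 0)
			return (errno);		/* Caller reports the error */
	}
	return (0);
}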
/* * __ovfl_reuse_skip_search -- * Return the first, not in-use, matching value in the overflow reuse list. */ static WT_OVFL_REUSE * __ovfl_reuse_skip_search( WT_OVFL_REUSE **head, const void *value, size_t value_size) { WT_OVFL_REUSE **e, *next; size_t len; int cmp, i; /* * Start at the highest skip level, then go as far as possible at each * level before stepping down to the next. */ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { if (*e == NULL) { /* Empty levels */ --i; --e; continue; } /* * Values are not unique, and it's possible to have long lists * of identical overflow items. (We've seen it in benchmarks.) * Move through a list of identical items at the current level * as long as the next one is in-use, otherwise, drop down a * level. When at the bottom level, return items if reusable, * else NULL. */ len = WT_MIN((*e)->value_size, value_size); cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len); if (cmp == 0 && (*e)->value_size == value_size) { if (i == 0) return (F_ISSET(*e, WT_OVFL_REUSE_INUSE) ? NULL : *e); if ((next = (*e)->next[i]) == NULL || !F_ISSET(next, WT_OVFL_REUSE_INUSE) || next->value_size != len || memcmp( WT_OVFL_REUSE_VALUE(next), value, len) != 0) { --i; /* Drop down a level */ --e; } else /* Keep going at this level */ e = &(*e)->next[i]; continue; } /* * If the skiplist value is larger than the search value, or * they compare equally and the skiplist value is longer than * the search value, drop down a level, otherwise continue on * this level. */ if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size)) { --i; /* Drop down a level */ --e; } else /* Keep going at this level */ e = &(*e)->next[i]; } return (NULL); }
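/*
 * Illustrative sketch, not WiredTiger source: the descent pattern shared by
 * the overflow-reuse and transaction-cache skiplist searches above, reduced
 * to integer keys. Start at the highest level, walk right while the keys are
 * smaller, and step the pointer-to-pointer back one slot to drop a level;
 * that works because decrementing the pointer moves from level i to level
 * i - 1 of the same next[] array (or of the head array). MAXDEPTH,
 * struct skip_node and skip_search are hypothetical names.
 */
#include <stddef.h>
#include <stdint.h>

#define MAXDEPTH	10

struct skip_node {
	uint64_t key;
	struct skip_node *next[MAXDEPTH];
};

static struct skip_node *
skip_search(struct skip_node **head, uint64_t key)
{
	struct skip_node **e;
	int i;

	for (i = MAXDEPTH - 1, e = &head[i]; i >= 0;) {
		if (*e == NULL) {		/* Empty levels */
			--i;
			--e;
			continue;
		}
		if ((*e)->key == key)
			return (*e);
		if ((*e)->key > key) {		/* Drop down a level */
			--i;
			--e;
		} else				/* Keep going at this level */
			e = &(*e)->next[i];
	}
	return (NULL);
}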
/* * __wt_log_slot_init -- * Initialize the slot array. */ int __wt_log_slot_init(WT_SESSION_IMPL *session, bool alloc) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_LOGSLOT *slot; int32_t i; conn = S2C(session); log = conn->log; for (i = 0; i < WT_SLOT_POOL; i++) log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE; /* * Allocate memory for buffers now that the arrays are setup. Separate * this from the loop above to make error handling simpler. */ /* * !!! If the buffer size is too close to the log file size, we will * switch log files very aggressively. Scale back the buffer for * small log file sizes. */ if (alloc) { log->slot_buf_size = (uint32_t)WT_MIN( (size_t)conn->log_file_max / 10, WT_LOG_SLOT_BUF_SIZE); for (i = 0; i < WT_SLOT_POOL; i++) { WT_ERR(__wt_buf_init(session, &log->slot_pool[i].slot_buf, log->slot_buf_size)); F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); } WT_STAT_CONN_SET(session, log_buffer_size, log->slot_buf_size * WT_SLOT_POOL); } /* * Set up the available slot from the pool the first time. */ slot = &log->slot_pool[0]; /* * We cannot initialize the release LSN in the activate function * because that function can be called after a log file switch. * The release LSN is usually the same as the slot_start_lsn except * around a log file switch. */ slot->slot_release_lsn = log->alloc_lsn; __wt_log_slot_activate(session, slot); log->active_slot = slot; log->pool_index = 0; if (0) { err: while (--i >= 0) __wt_buf_free(session, &log->slot_pool[i].slot_buf); } return (ret); }
/* * __win_file_read -- * Read a chunk. */ static int __win_file_read(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t offset, size_t len, void *buf) { DWORD chunk, nr, windows_error; OVERLAPPED overlapped = { 0 }; WT_DECL_RET; WT_FILE_HANDLE_WIN *win_fh; WT_SESSION_IMPL *session; uint8_t *addr; win_fh = (WT_FILE_HANDLE_WIN *)file_handle; session = (WT_SESSION_IMPL *)wt_session; nr = 0; /* Assert direct I/O is aligned and a multiple of the alignment. */ WT_ASSERT(session, !win_fh->direct_io || S2C(session)->buffer_alignment == 0 || (!((uintptr_t)buf & (uintptr_t)(S2C(session)->buffer_alignment - 1)) && len >= S2C(session)->buffer_alignment && len % S2C(session)->buffer_alignment == 0)); /* Break reads larger than 1GB into 1GB chunks. */ for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) { chunk = (DWORD)WT_MIN(len, WT_GIGABYTE); overlapped.Offset = UINT32_MAX & offset; overlapped.OffsetHigh = UINT32_MAX & (offset >> 32); if (!ReadFile( win_fh->filehandle, addr, chunk, &nr, &overlapped)) { windows_error = __wt_getlasterror(); ret = __wt_map_windows_error(windows_error); if (ret == WT_ERROR) F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); __wt_err(session, ret, "%s: handle-read: ReadFile: failed to read %lu " "bytes at offset %" PRIuMAX ": %s", file_handle->name, chunk, (uintmax_t)offset, __wt_formatmessage(session, windows_error)); return (ret); } } return (0); }
/* * __wt_log_slot_init -- * Initialize the slot array. */ int __wt_log_slot_init(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_LOGSLOT *slot; int32_t i; conn = S2C(session); log = conn->log; for (i = 0; i < WT_SLOT_POOL; i++) { log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE; log->slot_pool[i].slot_index = WT_SLOT_INVALID_INDEX; } /* * Set up the available slots from the pool the first time. */ for (i = 0; i < WT_SLOT_ACTIVE; i++) { slot = &log->slot_pool[i]; slot->slot_index = (uint32_t)i; slot->slot_state = WT_LOG_SLOT_READY; log->slot_array[i] = slot; } /* * Allocate memory for buffers now that the arrays are setup. Split * this out to make error handling simpler. * * Cap the slot buffer to the log file size. */ log->slot_buf_size = WT_MIN((size_t)conn->log_file_max, WT_LOG_SLOT_BUF_SIZE); for (i = 0; i < WT_SLOT_POOL; i++) { WT_ERR(__wt_buf_init(session, &log->slot_pool[i].slot_buf, log->slot_buf_size)); F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS); } WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, log->slot_buf_size * WT_SLOT_POOL); if (0) { err: while (--i >= 0) __wt_buf_free(session, &log->slot_pool[i].slot_buf); } return (ret); }
/*
 * __wt_log_open --
 *	Open the appropriate log file for the connection. The purpose is
 *	to find the last log file that exists, open it and set our initial
 *	LSNs to the end of that file. If none exist, call __wt_log_newfile
 *	to create it.
 */
int
__wt_log_open(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	uint32_t firstlog, lastlog, lognum;
	u_int i, logcount;
	char **logfiles;

	conn = S2C(session);
	log = conn->log;
	lastlog = 0;
	firstlog = UINT32_MAX;

	WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
	for (i = 0; i < logcount; i++) {
		WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
		lastlog = WT_MAX(lastlog, lognum);
		firstlog = WT_MIN(firstlog, lognum);
	}
	log->fileid = lastlog;
	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
	    "log_open: first log %" PRIu32 " last log %" PRIu32,
	    firstlog, lastlog));
	log->first_lsn.file = firstlog;
	log->first_lsn.offset = 0;

	/*
	 * Start logging at the beginning of the next log file, no matter
	 * where the previous log file ends.
	 */
	WT_ERR(__wt_log_newfile(session, 1));

	/*
	 * If there were log files, run recovery.
	 * XXX belongs at a higher level than this.
	 */
	if (logcount > 0) {
		log->trunc_lsn = log->alloc_lsn;
		WT_ERR(__wt_txn_recover(session));
	}

err:	__wt_log_files_free(session, logfiles, logcount);
	return (ret);
}
/* * __fstream_getline -- * Get a line from a stream. * * Implementation of the POSIX getline or BSD fgetln functions (finding the * function in a portable way is hard, it's simple enough to write it instead). * * Note: Unlike the standard getline calls, this function doesn't include the * trailing newline character in the returned buffer and discards empty lines * (so the caller's EOF marker is a returned line length of 0). */ static int __fstream_getline(WT_SESSION_IMPL *session, WT_FSTREAM *fstr, WT_ITEM *buf) { const char *p; size_t len; char c; /* * We always NUL-terminate the returned string (even if it's empty), * make sure there's buffer space for a trailing NUL in all cases. */ WT_RET(__wt_buf_init(session, buf, 100)); for (;;) { /* Check if we need to refill the buffer. */ if (WT_PTRDIFF(fstr->buf.data, fstr->buf.mem) >= fstr->buf.size) { len = WT_MIN(WT_STREAM_BUFSIZE, (size_t)(fstr->size - fstr->off)); if (len == 0) break; /* EOF */ WT_RET(__wt_buf_initsize(session, &fstr->buf, len)); WT_RET(__wt_read( session, fstr->fh, fstr->off, len, fstr->buf.mem)); fstr->off += (wt_off_t)len; } c = *(p = fstr->buf.data); fstr->buf.data = ++p; /* Leave space for a trailing NUL. */ WT_RET(__wt_buf_extend(session, buf, buf->size + 2)); if (c == '\n') { if (buf->size == 0) continue; break; } ((char *)buf->mem)[buf->size++] = c; } ((char *)buf->mem)[buf->size] = '\0'; return (0); }
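/*
 * Illustrative sketch, not WiredTiger source: the same line-reading contract
 * as __fstream_getline (strip the trailing newline, discard empty lines,
 * always NUL-terminate, report EOF as a zero-length line) written against
 * stdio so it stands alone. The stdio_getline name is hypothetical and,
 * unlike the original, the fixed-size buffer silently truncates very long
 * lines.
 */
#include <stdio.h>

static size_t
stdio_getline(FILE *fp, char *buf, size_t bufsz)
{
	size_t len;
	int c;

	for (len = 0; (c = fgetc(fp)) != EOF;) {
		if (c == '\n') {
			if (len == 0)
				continue;	/* Discard empty lines */
			break;
		}
		if (len + 1 < bufsz)
			buf[len++] = (char)c;
	}
	buf[len] = '\0';
	return (len);
}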
/* * __win_file_write -- * Write a chunk. */ static int __win_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t offset, size_t len, const void *buf) { DWORD chunk, nw, windows_error; const uint8_t *addr; OVERLAPPED overlapped = { 0 }; WT_FILE_HANDLE_WIN *win_fh; WT_SESSION_IMPL *session; win_fh = (WT_FILE_HANDLE_WIN *)file_handle; session = (WT_SESSION_IMPL *)wt_session; nw = 0; /* Assert direct I/O is aligned and a multiple of the alignment. */ WT_ASSERT(session, !win_fh->direct_io || S2C(session)->buffer_alignment == 0 || (!((uintptr_t)buf & (uintptr_t)(S2C(session)->buffer_alignment - 1)) && len >= S2C(session)->buffer_alignment && len % S2C(session)->buffer_alignment == 0)); /* Break writes larger than 1GB into 1GB chunks. */ for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) { chunk = (DWORD)WT_MIN(len, WT_GIGABYTE); overlapped.Offset = UINT32_MAX & offset; overlapped.OffsetHigh = UINT32_MAX & (offset >> 32); if (!WriteFile( win_fh->filehandle, addr, chunk, &nw, &overlapped)) { windows_error = __wt_getlasterror(); __wt_errx(session, "%s: handle-write: WriteFile: failed to write %lu " "bytes at offset %" PRIuMAX ": %s", file_handle->name, chunk, (uintmax_t)offset, __wt_formatmessage(session, windows_error)); return (__wt_map_windows_error(windows_error)); } } return (0); }
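/*
 * Illustrative sketch, not WiredTiger source: positioned Windows I/O takes
 * the file offset as two 32-bit halves in the OVERLAPPED structure, which is
 * why the read and write loops above mask and shift the 64-bit offset. Shown
 * with plain integer types so it compiles anywhere; split_offset is a
 * hypothetical helper.
 */
#include <stdint.h>

static void
split_offset(int64_t offset, uint32_t *lowp, uint32_t *highp)
{
	*lowp = (uint32_t)(UINT32_MAX & (uint64_t)offset);
	*highp = (uint32_t)(UINT32_MAX & ((uint64_t)offset >> 32));
}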
/* * __ovfl_txnc_skip_search -- * Return the first matching addr in the overflow transaction-cache list. */ static WT_OVFL_TXNC * __ovfl_txnc_skip_search(WT_OVFL_TXNC **head, const void *addr, size_t addr_size) { WT_OVFL_TXNC **e; size_t len; int cmp, i; /* * Start at the highest skip level, then go as far as possible at each * level before stepping down to the next. */ for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) { if (*e == NULL) { /* Empty levels */ --i; --e; continue; } /* * Return any exact matches: we don't care in what search level * we found a match. */ len = WT_MIN((*e)->addr_size, addr_size); cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len); if (cmp == 0 && (*e)->addr_size == addr_size) return (*e); /* * If the skiplist address is larger than the search address, or * they compare equally and the skiplist address is longer than * the search address, drop down a level, otherwise continue on * this level. */ if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size)) { --i; /* Drop down a level */ --e; } else /* Keep going at this level */ e = &(*e)->next[i]; } return (NULL); }
/* * Run the throttle function. We will sleep if needed and then reload the * counter to perform more operations. */ int worker_throttle(CONFIG_THREAD *thread) { THROTTLE_CONFIG *throttle_cfg; struct timespec now; uint64_t usecs_delta; throttle_cfg = &thread->throttle_cfg; WT_RET(__wt_epoch(NULL, &now)); /* * If we did enough operations in the current interval, sleep for * the rest of the interval. Then add more operations to the queue. */ usecs_delta = WT_TIMEDIFF_US(now, throttle_cfg->last_increment); if (usecs_delta < throttle_cfg->usecs_increment) { (void)usleep( (useconds_t)(throttle_cfg->usecs_increment - usecs_delta)); throttle_cfg->ops_count = throttle_cfg->ops_per_increment; /* * After sleeping, set the interval to the current time. */ WT_RET(__wt_epoch(NULL, &throttle_cfg->last_increment)); } else { throttle_cfg->ops_count = (usecs_delta * throttle_cfg->ops_per_increment) / throttle_cfg->usecs_increment; throttle_cfg->last_increment = now; } /* * Take the minimum so we don't overfill the queue. */ throttle_cfg->ops_count = WT_MIN(throttle_cfg->ops_count, thread->workload->throttle); return (0); }
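/*
 * Illustrative sketch, not wtperf source: the throttle above behaves like a
 * token bucket -- each interval refills a budget of operations, and a worker
 * that spends the budget early sleeps out the rest of the interval. A
 * standalone refill step, assuming POSIX clock_gettime and usleep; all
 * structure and function names are hypothetical, and the original
 * additionally prorates the budget when the worker falls behind.
 */
#include <stdint.h>
#include <time.h>
#include <unistd.h>

struct throttle {
	struct timespec last;		/* Start of the current interval */
	uint64_t usecs_increment;	/* Interval length, in microseconds */
	uint64_t ops_per_increment;	/* Budget refilled each interval */
	uint64_t ops_count;		/* Remaining budget */
};

static void
throttle_refill(struct throttle *t)
{
	struct timespec now;
	int64_t delta;

	(void)clock_gettime(CLOCK_MONOTONIC, &now);
	delta = (int64_t)(now.tv_sec - t->last.tv_sec) * 1000000 +
	    (int64_t)(now.tv_nsec - t->last.tv_nsec) / 1000;
	if (delta < (int64_t)t->usecs_increment)
		(void)usleep(
		    (useconds_t)((int64_t)t->usecs_increment - delta));
	t->ops_count = t->ops_per_increment;
	(void)clock_gettime(CLOCK_MONOTONIC, &t->last);
}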
/*
 * __wt_bloom_finalize --
 *	Writes the Bloom filter to stable storage. After calling finalize,
 *	only read operations can be performed on the bloom filter.
 */
int
__wt_bloom_finalize(WT_BLOOM *bloom)
{
	WT_CURSOR *c;
	WT_DECL_RET;
	WT_ITEM values;
	WT_SESSION *wt_session;
	uint64_t i;

	wt_session = (WT_SESSION *)bloom->session;
	WT_CLEAR(values);

	/*
	 * Create a bit table to store the bloom filter in.
	 * TODO: should this call __wt_schema_create directly?
	 */
	WT_RET(wt_session->create(wt_session, bloom->uri, bloom->config));
	WT_RET(wt_session->open_cursor(
	    wt_session, bloom->uri, NULL, "bulk=bitmap", &c));

	/* Add the entries from the array into the table. */
	for (i = 0; i < bloom->m; i += values.size) {
		/* Adjust bits to bytes for string offset */
		values.data = bloom->bitstring + (i >> 3);
		/*
		 * Shave off some bytes for pure paranoia, in case WiredTiger
		 * reserves some special sizes. Choose a value so that if
		 * we do multiple inserts, it will be on a byte boundary.
		 */
		values.size = (uint32_t)WT_MIN(bloom->m - i, UINT32_MAX - 127);
		c->set_value(c, &values);
		WT_ERR(c->insert(c));
	}

err:	WT_TRET(c->close(c));
	__wt_free(bloom->session, bloom->bitstring);
	bloom->bitstring = NULL;
	return (ret);
}
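/*
 * Illustrative sketch, not WiredTiger source: the bulk load above addresses
 * the Bloom filter's bitstring a byte at a time (i >> 3 turns a bit offset
 * into a byte offset). The companion single-bit operations usually look like
 * this; bit_set and bit_test are hypothetical names.
 */
#include <stdint.h>

static void
bit_set(uint8_t *bitstring, uint64_t bit)
{
	bitstring[bit >> 3] |= (uint8_t)(1 << (bit & 7));
}

static int
bit_test(const uint8_t *bitstring, uint64_t bit)
{
	return ((bitstring[bit >> 3] & (1 << (bit & 7))) != 0);
}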
/* * __ovfl_txnc_verbose -- * Dump information about a transaction-cached overflow record. */ static int __ovfl_txnc_verbose(WT_SESSION_IMPL *session, WT_PAGE *page, WT_OVFL_TXNC *txnc, const char *tag) { WT_DECL_ITEM(tmp); WT_DECL_RET; WT_RET(__wt_scr_alloc(session, 64, &tmp)); WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW, "txn-cache: %s%s%p %s %" PRIu64 " {%.*s}", tag == NULL ? "" : tag, tag == NULL ? "" : ": ", page, __wt_addr_string( session, WT_OVFL_TXNC_ADDR(txnc), txnc->addr_size, tmp), txnc->current, WT_MIN(txnc->value_size, 40), (char *)WT_OVFL_TXNC_VALUE(txnc))); err: __wt_scr_free(session, &tmp); return (ret); }
/* * __wt_mmap_preload -- * Cause a section of a memory map to be faulted in. */ int __wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size) { #ifdef HAVE_POSIX_MADVISE /* Linux requires the address be aligned to a 4KB boundary. */ WT_BM *bm = S2BT(session)->bm; WT_DECL_RET; void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1)); size += WT_PTRDIFF(p, blk); /* XXX proxy for "am I doing a scan?" -- manual read-ahead */ if (F_ISSET(session, WT_SESSION_NO_CACHE)) { /* Read in 2MB blocks every 1MB of data. */ if (((uintptr_t)((uint8_t *)blk + size) & (uintptr_t)((1<<20) - 1)) < (uintptr_t)blk) return (0); size = WT_MIN(WT_MAX(20 * size, 2 << 20), WT_PTRDIFF((uint8_t *)bm->map + bm->maplen, blk)); } /* * Manual pages aren't clear on whether alignment is required for the * size, so we will be conservative. */ size &= ~(size_t)(WT_VM_PAGESIZE - 1); if (size > WT_VM_PAGESIZE && (ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0) WT_RET_MSG(session, ret, "posix_madvise will need"); #else WT_UNUSED(session); WT_UNUSED(p); WT_UNUSED(size); #endif return (0); }
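/*
 * Illustrative sketch, not WiredTiger source: posix_madvise requires a
 * page-aligned address, so the preload above rounds the pointer down to a
 * page boundary and grows the length by the amount rounded off. The same
 * arithmetic in isolation; madvise_willneed is a hypothetical helper and
 * PAGE_SZ stands in for WT_VM_PAGESIZE.
 */
#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>

#define PAGE_SZ	((uintptr_t)4096)

static int
madvise_willneed(const void *p, size_t size)
{
	void *blk;

	/* Round the address down, extend the size by the same amount. */
	blk = (void *)((uintptr_t)p & ~(PAGE_SZ - 1));
	size += (size_t)((uintptr_t)p - (uintptr_t)blk);
	return (posix_madvise(blk, size, POSIX_MADV_WILLNEED));
}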
/* * __log_archive_once -- * Perform one iteration of log archiving. Must be called with the * log archive lock held. */ static int __log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; uint32_t lognum, min_lognum; u_int i, locked, logcount; char **logfiles; conn = S2C(session); log = conn->log; logcount = 0; logfiles = NULL; /* * If we're coming from a backup cursor we want the smaller of * the last full log file copied in backup or the checkpoint LSN. * Otherwise we want the minimum of the last log file written to * disk and the checkpoint LSN. */ if (backup_file != 0) min_lognum = WT_MIN(log->ckpt_lsn.file, backup_file); else min_lognum = WT_MIN(log->ckpt_lsn.file, log->sync_lsn.file); WT_RET(__wt_verbose(session, WT_VERB_LOG, "log_archive: archive to log number %" PRIu32, min_lognum)); /* * Main archive code. Get the list of all log files and * remove any earlier than the minimum log number. */ WT_RET(__wt_dirlist(session, conn->log_path, WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, &logfiles, &logcount)); /* * We can only archive files if a hot backup is not in progress or * if we are the backup. */ WT_RET(__wt_readlock(session, conn->hot_backup_lock)); locked = 1; if (conn->hot_backup == 0 || backup_file != 0) { for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum( session, logfiles[i], &lognum)); if (lognum < min_lognum) WT_ERR(__wt_log_remove( session, WT_LOG_FILENAME, lognum)); } } WT_ERR(__wt_readunlock(session, conn->hot_backup_lock)); locked = 0; __wt_log_files_free(session, logfiles, logcount); logfiles = NULL; logcount = 0; /* * Indicate what is our new earliest LSN. It is the start * of the log file containing the last checkpoint. */ log->first_lsn.file = min_lognum; log->first_lsn.offset = 0; if (0) err: __wt_err(session, ret, "log archive server error"); if (locked) WT_TRET(__wt_readunlock(session, conn->hot_backup_lock)); if (logfiles != NULL) __wt_log_files_free(session, logfiles, logcount); return (ret); }
/* * __ckpt_process -- * Process the list of checkpoints. */ static int __ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase) { WT_BLOCK_CKPT *a, *b, *ci; WT_CKPT *ckpt, *next_ckpt; WT_DECL_ITEM(tmp); WT_DECL_RET; uint64_t ckpt_size; bool deleting, fatal, locked; ci = &block->live; fatal = locked = false; #ifdef HAVE_DIAGNOSTIC WT_RET(__ckpt_verify(session, ckptbase)); #endif /* * Checkpoints are a two-step process: first, write a new checkpoint to * disk (including all the new extent lists for modified checkpoints * and the live system). As part of this, create a list of file blocks * newly available for reallocation, based on checkpoints being deleted. * We then return the locations of the new checkpoint information to our * caller. Our caller has to write that information into some kind of * stable storage, and once that's done, we can actually allocate from * that list of newly available file blocks. (We can't allocate from * that list immediately because the allocation might happen before our * caller saves the new checkpoint information, and if we crashed before * the new checkpoint location was saved, we'd have overwritten blocks * still referenced by checkpoints in the system.) In summary, there is * a second step: after our caller saves the checkpoint information, we * are called to add the newly available blocks into the live system's * available list. * * This function is the first step, the second step is in the resolve * function. * * If we're called to checkpoint the same file twice (without the second * resolution step), or re-entered for any reason, it's an error in our * caller, and our choices are all bad: leak blocks or potentially crash * with our caller not yet having saved previous checkpoint information * to stable storage. */ __wt_spin_lock(session, &block->live_lock); if (block->ckpt_inprogress) ret = __wt_block_panic(session, EINVAL, "%s: unexpected checkpoint ordering", block->name); else block->ckpt_inprogress = true; __wt_spin_unlock(session, &block->live_lock); WT_RET(ret); /* * Extents newly available as a result of deleting previous checkpoints * are added to a list of extents. The list should be empty, but as * described above, there is no "free the checkpoint information" call * into the block manager; if there was an error in an upper level that * resulted in some previous checkpoint never being resolved, the list * may not be empty. We should have caught that with the "checkpoint * in progress" test, but it doesn't cost us anything to be cautious. * * We free the checkpoint's allocation and discard extent lists as part * of the resolution step, not because they're needed at that time, but * because it's potentially a lot of work, and waiting allows the btree * layer to continue eviction sooner. As for the checkpoint-available * list, make sure they get cleaned out. */ __wt_block_extlist_free(session, &ci->ckpt_avail); WT_RET(__wt_block_extlist_init( session, &ci->ckpt_avail, "live", "ckpt_avail", true)); __wt_block_extlist_free(session, &ci->ckpt_alloc); __wt_block_extlist_free(session, &ci->ckpt_discard); /* * To delete a checkpoint, we'll need checkpoint information for it and * the subsequent checkpoint into which it gets rolled; read them from * disk before we lock things down. 
*/ deleting = false; WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; deleting = true; /* * Read the checkpoint and next checkpoint extent lists if we * haven't already read them (we may have already read these * extent blocks if there is more than one deleted checkpoint). */ if (ckpt->bpriv == NULL) WT_ERR(__ckpt_extlist_read(session, block, ckpt)); for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * The "next" checkpoint may be the live tree which has no * extent blocks to read. */ if (next_ckpt->bpriv == NULL && !F_ISSET(next_ckpt, WT_CKPT_ADD)) WT_ERR(__ckpt_extlist_read(session, block, next_ckpt)); } /* * Failures are now fatal: we can't currently back out the merge of any * deleted checkpoint extent lists into the live system's extent lists, * so continuing after error would leave the live system's extent lists * corrupted for any subsequent checkpoint (and potentially, should a * subsequent checkpoint succeed, for recovery). */ fatal = true; /* * Hold a lock so the live extent lists and the file size can't change * underneath us. I suspect we'll tighten this if checkpoints take too * much time away from real work: we read the historic checkpoint * information without a lock, but we could also merge and re-write the * deleted and merged checkpoint information without a lock, except for * the final merge of ranges into the live tree. */ __wt_spin_lock(session, &block->live_lock); locked = true; /* * We've allocated our last page, update the checkpoint size. We need * to calculate the live system's checkpoint size before merging * checkpoint allocation and discard information from the checkpoints * we're deleting, those operations change the underlying byte counts. */ ckpt_size = ci->ckpt_size; ckpt_size += ci->alloc.bytes; ckpt_size -= ci->discard.bytes; /* Skip the additional processing if we aren't deleting checkpoints. */ if (!deleting) goto live_update; /* * Delete any no-longer-needed checkpoints: we do this first as it frees * blocks to the live lists, and the freed blocks will then be included * when writing the live extent lists. */ WT_CKPT_FOREACH(ckptbase, ckpt) { if (F_ISSET(ckpt, WT_CKPT_FAKE) || !F_ISSET(ckpt, WT_CKPT_DELETE)) continue; #ifdef HAVE_VERBOSE if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) { if (tmp == NULL) WT_ERR(__wt_scr_alloc(session, 0, &tmp)); WT_ERR(__ckpt_string( session, block, ckpt->raw.data, tmp)); __wt_verbose(session, WT_VERB_CHECKPOINT, "%s: delete-checkpoint: %s: %s", block->name, ckpt->name, (const char *)tmp->data); } #endif /* * Find the checkpoint into which we'll roll this checkpoint's * blocks: it's the next real checkpoint in the list, and it * better have been read in (if it's not the add slot). */ for (next_ckpt = ckpt + 1;; ++next_ckpt) if (!F_ISSET(next_ckpt, WT_CKPT_FAKE)) break; /* * Set the from/to checkpoint structures, where the "to" value * may be the live tree. */ a = ckpt->bpriv; if (F_ISSET(next_ckpt, WT_CKPT_ADD)) b = &block->live; else b = next_ckpt->bpriv; /* * Free the root page: there's nothing special about this free, * the root page is allocated using normal rules, that is, it * may have been taken from the avail list, and was entered on * the live system's alloc list at that time. We free it into * the checkpoint's discard list, however, not the live system's * list because it appears on the checkpoint's alloc list and so * must be paired in the checkpoint. 
*/ if (a->root_offset != WT_BLOCK_INVALID_OFFSET) WT_ERR(__wt_block_insert_ext(session, block, &a->discard, a->root_offset, a->root_size)); /* * Free the blocks used to hold the "from" checkpoint's extent * lists, including the avail list. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail)); WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard)); /* * Roll the "from" alloc and discard extent lists into the "to" * checkpoint's lists. */ if (a->alloc.entries != 0) WT_ERR(__wt_block_extlist_merge( session, block, &a->alloc, &b->alloc)); if (a->discard.entries != 0) WT_ERR(__wt_block_extlist_merge( session, block, &a->discard, &b->discard)); /* * If the "to" checkpoint is also being deleted, we're done with * it, it's merged into some other checkpoint in the next loop. * This means the extent lists may aggregate over a number of * checkpoints, but that's OK, they're disjoint sets of ranges. */ if (F_ISSET(next_ckpt, WT_CKPT_DELETE)) continue; /* * Find blocks for re-use: wherever the "to" checkpoint's * allocate and discard lists overlap, move the range to * the live system's checkpoint available list. */ WT_ERR(__wt_block_extlist_overlap(session, block, b)); /* * If we're updating the live system's information, we're done. */ if (F_ISSET(next_ckpt, WT_CKPT_ADD)) continue; /* * We have to write the "to" checkpoint's extent lists out in * new blocks, and update its cookie. * * Free the blocks used to hold the "to" checkpoint's extent * lists; don't include the avail list, it's not changing. */ WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc)); WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard)); F_SET(next_ckpt, WT_CKPT_UPDATE); } /* Update checkpoints marked for update. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_UPDATE)) WT_ERR(__ckpt_update( session, block, ckpt, ckpt->bpriv, false)); live_update: /* Truncate the file if that's possible. */ WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail)); /* Update the final, added checkpoint based on the live system. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_ADD)) { /* * !!! * Our caller wants the final checkpoint size. Setting * the size here violates layering, but the alternative * is a call for the btree layer to crack the checkpoint * cookie into its components, and that's a fair amount * of work. */ ckpt->ckpt_size = ckpt_size; /* * Set the rolling checkpoint size for the live system. * The current size includes the current checkpoint's * root page size (root pages are on the checkpoint's * block allocation list as root pages are allocated * with the usual block allocation functions). That's * correct, but we don't want to include it in the size * for the next checkpoint. */ ckpt_size -= ci->root_size; /* * Additionally, we had a bug for awhile where the live * checkpoint size grew without bound. We can't sanity * check the value, that would require walking the tree * as part of the checkpoint. Bound any bug at the size * of the file. * It isn't practical to assert that the value is within * bounds since databases created with older versions * of WiredTiger (2.8.0) would likely see an error. */ ci->ckpt_size = WT_MIN(ckpt_size, (uint64_t)block->size); WT_ERR(__ckpt_update(session, block, ckpt, ci, true)); } /* * Reset the live system's alloc and discard extent lists, leave the * avail list alone. 
This includes freeing a lot of extents, so do it * outside of the system's lock by copying and resetting the original, * then doing the work later. */ ci->ckpt_alloc = ci->alloc; WT_ERR(__wt_block_extlist_init( session, &ci->alloc, "live", "alloc", false)); ci->ckpt_discard = ci->discard; WT_ERR(__wt_block_extlist_init( session, &ci->discard, "live", "discard", false)); #ifdef HAVE_DIAGNOSTIC /* * The first checkpoint in the system should always have an empty * discard list. If we've read that checkpoint and/or created it, * check. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (!F_ISSET(ckpt, WT_CKPT_DELETE)) break; if ((a = ckpt->bpriv) == NULL) a = &block->live; if (a->discard.entries != 0) WT_ERR_MSG(session, WT_ERROR, "first checkpoint incorrectly has blocks on the discard " "list"); #endif err: if (ret != 0 && fatal) ret = __wt_block_panic(session, ret, "%s: fatal checkpoint failure", block->name); if (locked) __wt_spin_unlock(session, &block->live_lock); /* Discard any checkpoint information we loaded. */ WT_CKPT_FOREACH(ckptbase, ckpt) if ((ci = ckpt->bpriv) != NULL) __wt_block_ckpt_destroy(session, ci); __wt_scr_free(session, &tmp); return (ret); }
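/*
 * Illustrative sketch, not WiredTiger source: the checkpoint code above
 * copies and resets the live alloc/discard extent lists while holding the
 * lock and leaves the expensive freeing for later, outside the lock. The
 * same "detach under the lock, drain after unlocking" pattern with a generic
 * singly-linked list and a pthread mutex; all names are hypothetical.
 */
#include <pthread.h>
#include <stdlib.h>

struct node {
	struct node *next;
};

struct list {
	struct node *head;
};

static void
list_drain(pthread_mutex_t *lock, struct list *lp)
{
	struct node *p, *next;
	struct list copy;

	(void)pthread_mutex_lock(lock);
	copy = *lp;			/* Copy the list header */
	lp->head = NULL;		/* Reset the original */
	(void)pthread_mutex_unlock(lock);

	for (p = copy.head; p != NULL; p = next) {	/* Expensive work */
		next = p->next;
		free(p);
	}
}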
int run_truncate(CONFIG *cfg, CONFIG_THREAD *thread, WT_CURSOR *cursor, WT_SESSION *session, int *truncatedp) { TRUNCATE_CONFIG *trunc_cfg; TRUNCATE_QUEUE_ENTRY *truncate_item; char *truncate_key; int ret, t_ret; uint64_t used_stone_gap; ret = 0; trunc_cfg = &thread->trunc_cfg; *truncatedp = 0; /* Update the total inserts */ trunc_cfg->total_inserts = sum_insert_ops(cfg); trunc_cfg->expected_total += (trunc_cfg->total_inserts - trunc_cfg->last_total_inserts); trunc_cfg->last_total_inserts = trunc_cfg->total_inserts; /* We are done if there isn't enough data to trigger a new milestone. */ if (trunc_cfg->expected_total <= thread->workload->truncate_count) return (0); /* * If we are falling behind and using more than one stone per lap we * should widen the stone gap for this lap to try and catch up quicker. */ if (trunc_cfg->expected_total > thread->workload->truncate_count + trunc_cfg->stone_gap) { /* * Increase the multiplier until we create stones that are * almost large enough to truncate the whole expected table size * in one operation. */ trunc_cfg->catchup_multiplier = WT_MIN(trunc_cfg->catchup_multiplier + 1, trunc_cfg->needed_stones - 1); } else { /* Back off if we start seeing an improvement */ trunc_cfg->catchup_multiplier = WT_MAX(trunc_cfg->catchup_multiplier - 1, 1); } used_stone_gap = trunc_cfg->stone_gap * trunc_cfg->catchup_multiplier; while (trunc_cfg->num_stones < trunc_cfg->needed_stones) { trunc_cfg->last_key += used_stone_gap; truncate_key = calloc(cfg->key_sz, 1); if (truncate_key == NULL) { lprintf(cfg, ENOMEM, 0, "truncate: couldn't allocate key array"); return (ENOMEM); } truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1); if (truncate_item == NULL) { free(truncate_key); lprintf(cfg, ENOMEM, 0, "truncate: couldn't allocate item"); return (ENOMEM); } generate_key(cfg, truncate_key, trunc_cfg->last_key); truncate_item->key = truncate_key; truncate_item->diff = used_stone_gap; TAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q); trunc_cfg->num_stones++; } /* We are done if there isn't enough data to trigger a truncate. */ if (trunc_cfg->num_stones == 0 || trunc_cfg->expected_total <= thread->workload->truncate_count) return (0); truncate_item = TAILQ_FIRST(&cfg->stone_head); trunc_cfg->num_stones--; TAILQ_REMOVE(&cfg->stone_head, truncate_item, q); cursor->set_key(cursor,truncate_item->key); if ((ret = cursor->search(cursor)) != 0) { lprintf(cfg, ret, 0, "Truncate search: failed"); goto err; } if ((ret = session->truncate(session, NULL, NULL, cursor, NULL)) != 0) { lprintf(cfg, ret, 0, "Truncate: failed"); goto err; } *truncatedp = 1; trunc_cfg->expected_total -= truncate_item->diff; err: free(truncate_item->key); free(truncate_item); t_ret = cursor->reset(cursor); if (t_ret != 0) lprintf(cfg, t_ret, 0, "Cursor reset failed"); if (ret == 0 && t_ret != 0) ret = t_ret; return (ret); }
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_BTREE *btree; WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; int busy, cache_work, force_attempts, oldgen, stalled; btree = S2BT(session); stalled = 0; for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, read it. If this thread is * allowed to do eviction work, check for space in the * cache. */ if (!LF_ISSET(WT_READ_NO_EVICT)) WT_RET(__wt_cache_eviction_check( session, 1, NULL)); WT_RET(__page_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on another thread's read, stall. */ WT_STAT_FAST_CONN_INCR(session, page_read_blocked); stalled = 1; break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); /* Waiting on eviction, stall. */ WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); stalled = 1; break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory. * * Get a hazard pointer if one is required. We cannot * be evicting if no hazard pointer is required, we're * done. */ if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) goto skip_evict; /* * The expected reason we can't get a hazard pointer is * because the page is being evicted, yield, try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } /* * If eviction is configured for this file, check to see * if the page qualifies for forced eviction and update * the page's generation number. If eviction isn't being * done on this file, we're done. */ if (LF_ISSET(WT_READ_NO_EVICT) || F_ISSET(session, WT_SESSION_NO_EVICTION) || F_ISSET(btree, WT_BTREE_NO_EVICTION)) goto skip_evict; /* * Forcibly evict pages that are too big. */ page = ref->page; if (force_attempts < 10 && __evict_force_check(session, page)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ if (ret == EBUSY) { ret = 0; WT_STAT_FAST_CONN_INCR(session, page_forcible_evict_blocked); stalled = 1; break; } WT_RET(ret); /* * The result of a successful forced eviction * is a page-state transition (potentially to * an in-memory page we can use, or a restart * return for our caller), continue the outer * page-acquisition loop. */ continue; } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_bump(session); skip_evict: /* * Check if we need an autocommit transaction. * Starting a transaction can trigger eviction, so skip * it if eviction isn't permitted. 
*/ return (LF_ISSET(WT_READ_NO_EVICT) ? 0 : __wt_txn_autocommit_check(session)); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (stalled) wait_cnt += 1000; else if (++wait_cnt < 1000) { __wt_yield(); continue; } /* * If stalling and this thread is allowed to do eviction work, * check if the cache needs help. If we do work for the cache, * substitute that for a sleep. */ if (!LF_ISSET(WT_READ_NO_EVICT)) { WT_RET( __wt_cache_eviction_check(session, 1, &cache_work)); if (cache_work) continue; } sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000); WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } }
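/*
 * Illustrative sketch, not WiredTiger source: the bottom of the
 * page-acquisition loop above backs off in two stages -- yield the CPU for
 * the first thousand retries, then sleep for an interval that grows by 1ms
 * per retry but is capped at 10ms. The same shape in isolation, with
 * sched_yield and usleep standing in for the internal wrappers.
 */
#include <sched.h>
#include <unistd.h>

static void
backoff(unsigned *wait_cntp, unsigned *sleep_usecsp)
{
	if (++*wait_cntp < 1000) {		/* Cheap: give up the CPU */
		(void)sched_yield();
		return;
	}
	/* Expensive: sleep, lengthening the nap up to the cap. */
	*sleep_usecsp = *sleep_usecsp + 1000 < 10000 ?
	    *sleep_usecsp + 1000 : 10000;
	(void)usleep(*sleep_usecsp);
}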
/* * __wt_hazard_set -- * Set a hazard pointer. */ int __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_BTREE *btree; WT_HAZARD *hp; int restarts = 0; btree = S2BT(session); *busyp = 0; /* If a file can never be evicted, hazard pointers aren't required. */ if (F_ISSET(btree, WT_BTREE_NO_HAZARD)) return (0); /* * Do the dance: * * The memory location which makes a page "real" is the WT_REF's state * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the * page eviction server. * * Add the WT_REF reference to the session's hazard list and flush the * write, then see if the page's state is still valid. If so, we can * use the page because the page eviction server will see our hazard * pointer before it discards the page (the eviction server sets the * state to WT_REF_LOCKED, then flushes memory and checks the hazard * pointers). * * For sessions with many active hazard pointers, skip most of the * active slots: there may be a free slot in there, but checking is * expensive. Most hazard pointers are released quickly: optimize * for that case. */ for (hp = session->hazard + session->nhazard;; ++hp) { /* Expand the number of hazard pointers if available.*/ if (hp >= session->hazard + session->hazard_size) { if (session->hazard_size >= S2C(session)->hazard_max) break; /* Restart the search. */ if (session->nhazard < session->hazard_size && restarts++ == 0) { hp = session->hazard; continue; } WT_PUBLISH(session->hazard_size, WT_MIN(session->hazard_size + WT_HAZARD_INCR, S2C(session)->hazard_max)); } if (hp->page != NULL) continue; hp->page = ref->page; #ifdef HAVE_DIAGNOSTIC hp->file = file; hp->line = line; #endif /* Publish the hazard pointer before reading page's state. */ WT_FULL_BARRIER(); /* * Check if the page state is still valid, where valid means a * state of WT_REF_MEM or WT_REF_EVICT_WALK and the pointer is * unchanged. (The pointer can change, it means the page was * evicted between the time we set our hazard pointer and the * publication. It would theoretically be possible for the * page to be evicted and a different page read into the same * memory, so the pointer hasn't changed but the contents have. * That's OK, we found this page using the tree's key space, * whatever page we find here is the page for us to use.) */ if (ref->page == hp->page && (ref->state == WT_REF_MEM || ref->state == WT_REF_EVICT_WALK)) { WT_VERBOSE_RET(session, hazard, "session %p hazard %p: set", session, ref->page); ++session->nhazard; return (0); } /* * The page isn't available, it's being considered for eviction * (or being evicted, for all we know). If the eviction server * sees our hazard pointer before evicting the page, it will * return the page to use, no harm done, if it doesn't, it will * go ahead and complete the eviction. * * We don't bother publishing this update: the worst case is we * prevent some random page from being evicted. */ hp->page = NULL; *busyp = 1; return (0); } __wt_errx(session, "session %p: hazard pointer table full", session); #ifdef HAVE_DIAGNOSTIC __hazard_dump(session); #endif return (ENOMEM); }
/* * __wt_hazard_set -- * Set a hazard pointer. */ int __wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_BTREE *btree; WT_CONNECTION_IMPL *conn; WT_HAZARD *hp; int restarts = 0; btree = S2BT(session); conn = S2C(session); *busyp = false; /* If a file can never be evicted, hazard pointers aren't required. */ if (F_ISSET(btree, WT_BTREE_IN_MEMORY)) return (0); /* * Do the dance: * * The memory location which makes a page "real" is the WT_REF's state * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the * page eviction server. * * Add the WT_REF reference to the session's hazard list and flush the * write, then see if the page's state is still valid. If so, we can * use the page because the page eviction server will see our hazard * pointer before it discards the page (the eviction server sets the * state to WT_REF_LOCKED, then flushes memory and checks the hazard * pointers). * * For sessions with many active hazard pointers, skip most of the * active slots: there may be a free slot in there, but checking is * expensive. Most hazard pointers are released quickly: optimize * for that case. */ for (hp = session->hazard + session->nhazard;; ++hp) { /* * If we get to the end of the array, either: * 1. If we know there are free slots somewhere, and this is * the first time through, continue the search from the * start. Don't actually continue the loop because that * will skip the first slot. * 2. If we have searched all the way through and we have * allocated the maximum number of slots, give up. * 3. Allocate another increment of slots, up to the maximum. * The slot we are on should now be available. */ if (hp >= session->hazard + session->hazard_size) { if (session->nhazard < session->hazard_size && restarts++ == 0) hp = session->hazard; else if (session->hazard_size >= conn->hazard_max) break; else WT_PUBLISH(session->hazard_size, WT_MIN( session->hazard_size + WT_HAZARD_INCR, conn->hazard_max)); } if (hp->page != NULL) continue; hp->page = ref->page; #ifdef HAVE_DIAGNOSTIC hp->file = file; hp->line = line; #endif /* Publish the hazard pointer before reading page's state. */ WT_FULL_BARRIER(); /* * Check if the page state is still valid, where valid means a * state of WT_REF_MEM and the pointer is unchanged. (The * pointer can change, it means the page was evicted between * the time we set our hazard pointer and the publication. It * would theoretically be possible for the page to be evicted * and a different page read into the same memory, so the * pointer hasn't changed but the contents have. That's OK, we * found this page using the tree's key space, whatever page we * find here is the page for us to use.) */ if (ref->page == hp->page && ref->state == WT_REF_MEM) { ++session->nhazard; return (0); } /* * The page isn't available, it's being considered for eviction * (or being evicted, for all we know). If the eviction server * sees our hazard pointer before evicting the page, it will * return the page to use, no harm done, if it doesn't, it will * go ahead and complete the eviction. * * We don't bother publishing this update: the worst case is we * prevent some random page from being evicted. */ hp->page = NULL; *busyp = true; return (0); } __wt_errx(session, "session %p: hazard pointer table full", (void *)session); #ifdef HAVE_DIAGNOSTIC __hazard_dump(session); #endif return (ENOMEM); }
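/*
 * Illustrative sketch, not WiredTiger source: the release half of the
 * hazard-pointer protocol above -- find the slot that published the page and
 * clear it, after which the eviction server is free to discard the page the
 * next time it scans the hazard list. The structures and the hazard_clear
 * name are simplified assumptions.
 */
#include <stddef.h>

struct hazard_slot {
	void *page;		/* Published page reference, or NULL */
};

struct hazard_table {
	struct hazard_slot *slots;
	unsigned size;		/* Slots in use (hazard_size) */
	unsigned nhazard;	/* Slots currently holding a page */
};

static int
hazard_clear(struct hazard_table *t, const void *page)
{
	struct hazard_slot *hp;

	for (hp = t->slots; hp < t->slots + t->size; ++hp)
		if (hp->page == page) {
			hp->page = NULL;	/* Un-publish the reference */
			--t->nhazard;
			return (0);
		}
	return (-1);		/* Not found: the caller treats it as a bug */
}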
/* * __verify_tree -- * Verify a tree, recursively descending through it in depth-first fashion. * The page argument was physically verified (so we know it's correctly formed), * and the in-memory version built. Our job is to check logical relationships * in the page and in the tree. */ static int __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs) { WT_BM *bm; WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_DECL_RET; WT_PAGE *page; WT_REF *child_ref; uint64_t recno; uint32_t entry, i; bool found; bm = S2BT(session)->bm; page = ref->page; unpack = &_unpack; WT_CLEAR(*unpack); /* -Wuninitialized */ WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s", __wt_page_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type))); /* Optionally dump the address. */ if (vs->dump_address) WT_RET(__wt_msg(session, "%s %s", __wt_page_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type))); /* Track the shape of the tree. */ if (WT_PAGE_IS_INTERNAL(page)) ++vs->depth_internal[ WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)]; else ++vs->depth_leaf[ WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)]; /* * The page's physical structure was verified when it was read into * memory by the read server thread, and then the in-memory version * of the page was built. Now we make sure the page and tree are * logically consistent. * * !!! * The problem: (1) the read server has to build the in-memory version * of the page because the read server is the thread that flags when * any thread can access the page in the tree; (2) we can't build the * in-memory version of the page until the physical structure is known * to be OK, so the read server has to verify at least the physical * structure of the page; (3) doing complete page verification requires * reading additional pages (for example, overflow keys imply reading * overflow pages in order to test the key's order in the page); (4) * the read server cannot read additional pages because it will hang * waiting on itself. For this reason, we split page verification * into a physical verification, which allows the in-memory version * of the page to be built, and then a subsequent logical verification * which happens here. * * Report progress occasionally. */ #define WT_VERIFY_PROGRESS_INTERVAL 100 if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0) WT_RET(__wt_progress(session, NULL, vs->fcnt)); #ifdef HAVE_DIAGNOSTIC /* Optionally dump the blocks or page in debugging mode. */ if (vs->dump_blocks) WT_RET(__wt_debug_disk(session, page->dsk, NULL)); if (vs->dump_pages) WT_RET(__wt_debug_page(session, page, NULL)); #endif /* * Column-store key order checks: check the page's record number and * then update the total record count. 
*/ switch (page->type) { case WT_PAGE_COL_FIX: recno = page->pg_fix_recno; goto recno_chk; case WT_PAGE_COL_INT: recno = page->pg_intl_recno; goto recno_chk; case WT_PAGE_COL_VAR: recno = page->pg_var_recno; recno_chk: if (recno != vs->record_total + 1) WT_RET_MSG(session, WT_ERROR, "page at %s has a starting record of %" PRIu64 " when the expected starting record is %" PRIu64, __wt_page_addr_string(session, ref, vs->tmp1), recno, vs->record_total + 1); break; } switch (page->type) { case WT_PAGE_COL_FIX: vs->record_total += page->pg_fix_entries; break; case WT_PAGE_COL_VAR: recno = 0; WT_COL_FOREACH(page, cip, i) if ((cell = WT_COL_PTR(page, cip)) == NULL) ++recno; else { __wt_cell_unpack(cell, unpack); recno += __wt_cell_rle(unpack); } vs->record_total += recno; break; } /* * Row-store leaf page key order check: it's a depth-first traversal, * the first key on this page should be larger than any key previously * seen. */ switch (page->type) { case WT_PAGE_ROW_LEAF: WT_RET(__verify_row_leaf_key_order(session, ref, vs)); break; } /* If it's not the root page, unpack the parent cell. */ if (!__wt_ref_is_root(ref)) { __wt_cell_unpack(ref->addr, unpack); /* Compare the parent cell against the page type. */ switch (page->type) { case WT_PAGE_COL_FIX: if (unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; case WT_PAGE_COL_VAR: if (unpack->raw != WT_CELL_ADDR_LEAF && unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; case WT_PAGE_ROW_LEAF: if (unpack->raw != WT_CELL_ADDR_DEL && unpack->raw != WT_CELL_ADDR_LEAF && unpack->raw != WT_CELL_ADDR_LEAF_NO) goto celltype_err; break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: if (unpack->raw != WT_CELL_ADDR_INT) celltype_err: WT_RET_MSG(session, WT_ERROR, "page at %s, of type %s, is referenced in " "its parent by a cell of type %s", __wt_page_addr_string( session, ref, vs->tmp1), __wt_page_type_string(page->type), __wt_cell_type_string(unpack->raw)); break; } } /* * Check overflow pages. We check overflow cells separately from other * tests that walk the page as it's simpler, and I don't care much how * fast table verify runs. */ switch (page->type) { case WT_PAGE_COL_VAR: case WT_PAGE_ROW_INT: case WT_PAGE_ROW_LEAF: WT_RET(__verify_overflow_cell(session, ref, &found, vs)); if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT) break; /* * Object if a leaf-no-overflow address cell references a page * with overflow keys, but don't object if a leaf address cell * references a page without overflow keys. Reconciliation * doesn't guarantee every leaf page without overflow items will * be a leaf-no-overflow type. */ if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO) WT_RET_MSG(session, WT_ERROR, "page at %s, of type %s and referenced in its " "parent by a cell of type %s, contains overflow " "items", __wt_page_addr_string(session, ref, vs->tmp1), __wt_page_type_string(page->type), __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO)); break; } /* Check tree connections and recursively descend the tree. */ switch (page->type) { case WT_PAGE_COL_INT: /* For each entry in an internal page, verify the subtree. */ entry = 0; WT_INTL_FOREACH_BEGIN(session, page, child_ref) { /* * It's a depth-first traversal: this entry's starting * record number should be 1 more than the total records * reviewed to this point. 
*/ ++entry; if (child_ref->key.recno != vs->record_total + 1) { WT_RET_MSG(session, WT_ERROR, "the starting record number in entry %" PRIu32 " of the column internal page at " "%s is %" PRIu64 " and the expected " "starting record number is %" PRIu64, entry, __wt_page_addr_string( session, child_ref, vs->tmp1), child_ref->key.recno, vs->record_total + 1); } /* Verify the subtree. */ ++vs->depth; WT_RET(__wt_page_in(session, child_ref, 0)); ret = __verify_tree(session, child_ref, vs); WT_TRET(__wt_page_release(session, child_ref, 0)); --vs->depth; WT_RET(ret); __wt_cell_unpack(child_ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); } WT_INTL_FOREACH_END; break; case WT_PAGE_ROW_INT: /* For each entry in an internal page, verify the subtree. */ entry = 0; WT_INTL_FOREACH_BEGIN(session, page, child_ref) { /* * It's a depth-first traversal: this entry's starting * key should be larger than the largest key previously * reviewed. * * The 0th key of any internal page is magic, and we * can't test against it. */ ++entry; if (entry != 1) WT_RET(__verify_row_int_key_order( session, page, child_ref, entry, vs)); /* Verify the subtree. */ ++vs->depth; WT_RET(__wt_page_in(session, child_ref, 0)); ret = __verify_tree(session, child_ref, vs); WT_TRET(__wt_page_release(session, child_ref, 0)); --vs->depth; WT_RET(ret); __wt_cell_unpack(child_ref->addr, unpack); WT_RET(bm->verify_addr( bm, session, unpack->data, unpack->size)); } WT_INTL_FOREACH_END;
/*
 * __wt_log_scan --
 *	Scan the logs, calling a function on each record found.
 */
int
__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
    int (*func)(WT_SESSION_IMPL *session,
    WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie)
{
    WT_CONNECTION_IMPL *conn;
    WT_ITEM buf;
    WT_DECL_RET;
    WT_FH *log_fh;
    WT_LOG *log;
    WT_LOG_RECORD *logrec;
    WT_LSN end_lsn, rd_lsn, start_lsn;
    off_t log_size;
    uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
    u_int i, logcount;
    int eol;
    char **logfiles;

    conn = S2C(session);
    log = conn->log;
    log_fh = NULL;
    logcount = 0;
    logfiles = NULL;
    eol = 0;
    WT_CLEAR(buf);

    /*
     * If the caller did not give us a callback function there is nothing
     * to do.
     */
    if (func == NULL)
        return (0);

    if (LF_ISSET(WT_LOGSCAN_RECOVER))
        WT_RET(__wt_verbose(session, WT_VERB_LOG,
            "__wt_log_scan truncating to %u/%" PRIuMAX,
            log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset));

    if (log != NULL) {
        allocsize = log->allocsize;

        if (lsnp == NULL) {
            if (LF_ISSET(WT_LOGSCAN_FIRST))
                start_lsn = log->first_lsn;
            else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
                start_lsn = log->ckpt_lsn;
            else
                return (WT_ERROR);	/* Illegal usage */
        } else {
            if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
                WT_RET_MSG(session, WT_ERROR,
                    "choose either a start LSN or a start flag");

            /* Offsets must be on allocation boundaries. */
            if (lsnp->offset % allocsize != 0 ||
                lsnp->file > log->fileid)
                return (WT_NOTFOUND);

            /*
             * Log cursors may not know the starting LSN.  If an
             * LSN pointer is passed in, but it is the INIT_LSN,
             * start from the first_lsn.
             */
            start_lsn = *lsnp;
            if (IS_INIT_LSN(&start_lsn))
                start_lsn = log->first_lsn;
        }
        end_lsn = log->alloc_lsn;
    } else {
        /*
         * If logging is not configured, we can still print out the log
         * if log files exist.  We just need to set the LSNs from what
         * is in the files versus what is in the live connection.
         */
        /*
         * Set allocsize to the minimum alignment it could be.  Larger
         * records and larger allocation boundaries should always be
         * a multiple of this.
         */
        allocsize = LOG_ALIGN;
        lastlog = 0;
        firstlog = UINT32_MAX;
        WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
        if (logcount == 0)
            /* If no log files exist, scanning is not supported. */
            return (ENOTSUP);
        for (i = 0; i < logcount; i++) {
            WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
            lastlog = WT_MAX(lastlog, lognum);
            firstlog = WT_MIN(firstlog, lognum);
        }
        start_lsn.file = firstlog;
        end_lsn.file = lastlog;
        start_lsn.offset = end_lsn.offset = 0;
        __wt_log_files_free(session, logfiles, logcount);
        logfiles = NULL;
    }
    WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
    WT_ERR(__log_filesize(session, log_fh, &log_size));

    rd_lsn = start_lsn;
    WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN));
    for (;;) {
        if (rd_lsn.offset + allocsize > log_size) {
advance:
            /* If we read the last record, go to the next file. */
            WT_ERR(__wt_close(session, log_fh));
            log_fh = NULL;
            eol = 1;

            /* Truncate this log file before we move to the next. */
            if (LF_ISSET(WT_LOGSCAN_RECOVER))
                WT_ERR(__log_truncate(session, &rd_lsn, 1));
            rd_lsn.file++;
            rd_lsn.offset = 0;

            /*
             * Avoid an error message when we reach the end of the
             * log by checking here.
             */
            if (rd_lsn.file > end_lsn.file)
                break;
            WT_ERR(__log_openfile(
                session, 0, &log_fh, rd_lsn.file));
            WT_ERR(__log_filesize(session, log_fh, &log_size));
            continue;
        }

        /* Read the minimum allocation size a record could be. */
        WT_ASSERT(session, buf.memsize >= allocsize);
        WT_ERR(__wt_read(session,
            log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem));

        /*
         * The first 4 bytes are the real record length.  See if we
         * need to read more than the allocation size.  We expect that
         * we will rarely have to read more: most log records are
         * fairly small.
         */
        reclen = *(uint32_t *)buf.mem;

        /*
         * Log files are pre-allocated.  We never expect a zero length
         * unless we've reached the end of the log.  The log can be
         * written out of order, so when recovery finds the end of
         * the log, truncate the file and remove any later log files
         * that may exist.
         */
        if (reclen == 0) {
            /* This LSN is the end. */
            break;
        }
        rdup_len = __wt_rduppo2(reclen, allocsize);
        if (reclen > allocsize) {
            /*
             * The log file end could be in the middle of this
             * log record.
             */
            if (rd_lsn.offset + rdup_len > log_size)
                goto advance;

            /*
             * We need to round up and read in the full padded
             * record, especially for direct I/O.
             */
            WT_ERR(__wt_buf_grow(session, &buf, rdup_len));
            WT_ERR(__wt_read(session, log_fh,
                rd_lsn.offset, (size_t)rdup_len, buf.mem));
            WT_STAT_FAST_CONN_INCR(session, log_scan_rereads);
        }

        /* We have the whole record; verify its checksum. */
        buf.size = reclen;
        logrec = (WT_LOG_RECORD *)buf.mem;
        cksum = logrec->checksum;
        logrec->checksum = 0;
        logrec->checksum = __wt_cksum(logrec, logrec->len);
        if (logrec->checksum != cksum) {
            /*
             * A checksum mismatch means we have reached the end of
             * the useful part of the log.  This should be found on
             * the first pass through recovery.  In the second pass,
             * where we truncate the log, this is where it should
             * end.
             */
            if (log != NULL)
                log->trunc_lsn = rd_lsn;
            break;
        }

        /*
         * We have a valid log record.  If it is not the log file
         * header, invoke the callback.
         */
        WT_STAT_FAST_CONN_INCR(session, log_scan_records);
        if (rd_lsn.offset != 0) {
            WT_ERR((*func)(session, &buf, &rd_lsn, cookie));
            if (LF_ISSET(WT_LOGSCAN_ONE))
                break;
        }
        rd_lsn.offset += (off_t)rdup_len;
    }

    /* Truncate if we're in recovery. */
    if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
        LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
        WT_ERR(__log_truncate(session, &rd_lsn, 0));

err:	WT_STAT_FAST_CONN_INCR(session, log_scans);
    if (logfiles != NULL)
        __wt_log_files_free(session, logfiles, logcount);
    __wt_buf_free(session, &buf);

    /*
     * If the caller wants one record and it is at the end of the log,
     * return WT_NOTFOUND.
     */
    if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
        ret = WT_NOTFOUND;
    if (ret == ENOENT)
        ret = 0;
    if (log_fh != NULL)
        WT_TRET(__wt_close(session, log_fh));
    return (ret);
}
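/*
 * Editorial example (illustrative sketch only, not part of the tree): a
 * minimal callback showing the contract __wt_log_scan expects from its
 * "func" argument.  The function name and the uint64_t counter passed
 * through the cookie are hypothetical.
 */
static int
__log_count_records_example(
    WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, void *cookie)
{
    uint64_t *countp;

    WT_UNUSED(session);
    WT_UNUSED(record);
    WT_UNUSED(lsnp);

    /* Count every record the scan hands us; returning 0 continues the scan. */
    countp = cookie;
    ++*countp;
    return (0);
}

/*
 * A caller might then scan the whole log from its first LSN, for example:
 *
 *	uint64_t count = 0;
 *	WT_RET(__wt_log_scan(session,
 *	    NULL, WT_LOGSCAN_FIRST, __log_count_records_example, &count));
 */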
/* * __curjoin_iter_set_entry -- * Set the current entry for an iterator. */ static int __curjoin_iter_set_entry(WT_CURSOR_JOIN_ITER *iter, u_int entry_pos) { WT_CURSOR *c, *to_dup; WT_CURSOR_JOIN *cjoin, *topjoin; WT_CURSOR_JOIN_ENTRY *entry; WT_DECL_RET; WT_SESSION_IMPL *session; size_t size; const char *raw_cfg[] = { WT_CONFIG_BASE( iter->session, WT_SESSION_open_cursor), "raw", NULL }; const char *def_cfg[] = { WT_CONFIG_BASE( iter->session, WT_SESSION_open_cursor), NULL }; const char **config; char *uri; session = iter->session; cjoin = iter->cjoin; uri = NULL; entry = iter->entry = &cjoin->entries[entry_pos]; iter->positioned = false; iter->entry_pos = entry_pos; iter->end_pos = 0; iter->is_equal = (entry->ends_next == 1 && WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_EQ); iter->end_skip = (entry->ends_next > 0 && WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_GE) ? 1 : 0; iter->end_count = WT_MIN(1, entry->ends_next); if (F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION)) { iter->entry_count = cjoin->entries_next; if (iter->is_equal) iter->end_count = entry->ends_next; } else iter->entry_count = 1; WT_ASSERT(iter->session, iter->entry_pos < iter->entry_count); entry->stats.iterated = 0; if (entry->subjoin == NULL) { for (topjoin = iter->cjoin; topjoin->parent != NULL; topjoin = topjoin->parent) ; to_dup = entry->ends[0].cursor; if (F_ISSET((WT_CURSOR *)topjoin, WT_CURSTD_RAW)) config = &raw_cfg[0]; else config = &def_cfg[0]; size = strlen(to_dup->internal_uri) + 3; WT_ERR(__wt_calloc(session, size, 1, &uri)); WT_ERR(__wt_snprintf(uri, size, "%s()", to_dup->internal_uri)); if ((c = iter->cursor) == NULL || strcmp(c->uri, uri) != 0) { iter->cursor = NULL; if (c != NULL) WT_ERR(c->close(c)); WT_ERR(__wt_open_cursor(session, uri, (WT_CURSOR *)topjoin, config, &iter->cursor)); } WT_ERR(__wt_cursor_dup_position(to_dup, iter->cursor)); } else if (iter->cursor != NULL) { WT_ERR(iter->cursor->close(iter->cursor)); iter->cursor = NULL; } err: __wt_free(session, uri); return (ret); }
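/*
 * Editorial note on the cursor duplication above (a sketch, not tree code):
 * the iterator opens its own cursor on "<internal_uri>()", the underlying
 * object with an empty projection list, so the buffer is sized for the
 * original URI plus the two projection characters and a terminating NUL,
 * which is the "+ 3" above:
 *
 *	size = strlen(to_dup->internal_uri) + strlen("()") + 1;
 *
 * The freshly opened (or reused) cursor is then positioned with
 * __wt_cursor_dup_position() so it starts where the constraint's cursor
 * currently stands.
 */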
void bdb_truncate(uint64_t start, uint64_t stop) { DBC *dbc = g.dbc; size_t len; int cmp, ret, notfound; /* Deleting a fixed-length item is the same as setting the bits to 0. */ if (g.type == FIX) { /* * If we're deleting from/to the start/end of the database, * correct for the number of records we have. */ if (start == 0) start = 1; if (stop == 0) stop = g.rows; for (; start <= stop; ++start) bdb_remove(start, ¬found); return; } if (start == 0) { ret = dbc->get(dbc, &key, &value, DB_FIRST); if (ret != 0 && ret != DB_NOTFOUND) bdb_die(ret, "%s", "dbc.get: DB_FIRST"); } else { key_gen(&keyitem, start); key.data = (void *)keyitem.data; key.size = (u_int32_t)keyitem.size; ret = dbc->get(dbc, &key, &value, DB_SET_RANGE); if (ret != 0 && ret != DB_NOTFOUND) bdb_die(ret, "dbc.get: DB_SET: {%.*s}", (int)key.size, (char *)key.data); } if (ret == DB_NOTFOUND) return; if (stop == 0) { do { ret = dbc->del(dbc, 0); if (ret != 0 && ret != DB_NOTFOUND) bdb_die(ret, "dbc.del: {%.*s}", (int)key.size, (char *)key.data); } while ((ret = dbc->get(dbc, &key, &value, DB_NEXT)) == 0); } else { key_gen(&keyitem, stop); do { len = WT_MIN(key.size, keyitem.size); cmp = memcmp(key.data, keyitem.data, len); if (g.c_reverse) { if (cmp < 0 || (cmp == 0 && key.size < keyitem.size)) break; } else if (cmp > 0 || (cmp == 0 && key.size > keyitem.size)) break; ret = dbc->del(dbc, 0); if (ret != 0 && ret != DB_NOTFOUND) bdb_die(ret, "dbc.del: {%.*s}", (int)key.size, (char *)key.data); } while ((ret = dbc->get(dbc, &key, &value, DB_NEXT)) == 0); } if (ret != 0 && ret != DB_NOTFOUND) bdb_die(ret, "%s", "dbc.get: DB_NEXT"); }
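/*
 * Editorial example (illustrative sketch, not part of the test code): the
 * loop above stops at the "stop" key using byte-string ordering, comparing
 * the common prefix with memcmp and breaking ties by length; in the
 * reverse-ordered case the test flips the sense of the comparison, which is
 * why it checks cmp < 0 there.  The same ordering rule as a standalone
 * helper (hypothetical name; assumes <stddef.h> and <string.h>):
 */
static int
item_compare_example(const void *a, size_t alen, const void *b, size_t blen)
{
    size_t len;
    int cmp;

    /* Compare the shared prefix first. */
    len = alen < blen ? alen : blen;
    if ((cmp = memcmp(a, b, len)) != 0)
        return (cmp < 0 ? -1 : 1);

    /* Identical prefixes: the shorter item sorts first. */
    return (alen == blen ? 0 : (alen < blen ? -1 : 1));
}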
/*
 * __wt_lsm_tree_throttle --
 *	Calculate whether LSM updates need to be throttled.  Must be called
 * with the LSM tree lock held.
 */
void
__wt_lsm_tree_throttle(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only)
{
    WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
    uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
    uint32_t in_memory, gen0_chunks;

    /* Never throttle in small trees. */
    if (lsm_tree->nchunks < 3) {
        lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0;
        return;
    }

    cache_sz = S2C(session)->cache_size;

    /*
     * In the steady state, we expect that the checkpoint worker thread
     * will keep up with inserts.  If not, throttle the insert rate to
     * avoid filling the cache with in-memory chunks.  Threads sleep every
     * 100 operations, so take that into account in the calculation.
     *
     * Also throttle based on whether merge threads are keeping up.  If
     * there are enough chunks that have never been merged we slow down
     * inserts so that merges have some chance of keeping up.
     *
     * Count the number of in-memory chunks, the number of unmerged chunks
     * on disk, and find the most recent on-disk chunk (if any).
     */
    record_count = 1;
    gen0_chunks = in_memory = 0;
    ondisk = NULL;
    for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
        cp >= lsm_tree->chunk; --cp)
        if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
            record_count += (*cp)->count;
            ++in_memory;
        } else {
            /*
             * Assign ondisk to the last chunk that has been
             * flushed since the tree was last opened (i.e., it is
             * on disk and the stable flag is not set).
             */
            if (ondisk == NULL &&
                ((*cp)->generation == 0 &&
                !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
                ondisk = *cp;

            if ((*cp)->generation == 0 &&
                !F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
                ++gen0_chunks;
        }

    last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];

    /* Checkpoint throttling, based on the number of in-memory chunks. */
    if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
        lsm_tree->ckpt_throttle = 0;
    else if (decrease_only)
        ;			/* Nothing to do */
    else if (ondisk == NULL) {
        /*
         * No checkpoint has completed this run.  Keep slowing down
         * inserts until one does.
         */
        lsm_tree->ckpt_throttle =
            WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
    } else {
        WT_ASSERT(session, WT_TIMECMP(
            last_chunk->create_ts, ondisk->create_ts) >= 0);
        timediff =
            WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts);
        lsm_tree->ckpt_throttle =
            (long)((in_memory - 2) * timediff / (20 * record_count));

        /*
         * Get more aggressive as in-memory chunks consume a large
         * proportion of the cache.  In-memory chunks are allowed to
         * grow up to twice as large as the configured value when
         * checkpoints aren't keeping up; that worst case is when this
         * calculation is relevant.  There is nothing particularly
         * special about the chosen multipliers.
         */
        cache_used = in_memory * lsm_tree->chunk_size * 2;
        if (cache_used > cache_sz * 0.8)
            lsm_tree->ckpt_throttle *= 5;
    }

    /*
     * Merge throttling, based on the number of on-disk, level 0 chunks.
     *
     * Don't throttle if the tree has fewer than a single level's worth
     * of chunks.
     */
    if (lsm_tree->nchunks < lsm_tree->merge_max)
        lsm_tree->merge_throttle = 0;
    else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD)
        WT_LSM_MERGE_THROTTLE_DECREASE(lsm_tree->merge_throttle);
    else if (!decrease_only)
        WT_LSM_MERGE_THROTTLE_INCREASE(lsm_tree->merge_throttle);

    /* Put an upper bound of one second on both throttle calculations. */
    lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle);
    lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle);

    /*
     * Update our estimate of how long each in-memory chunk stays active.
     * Filter out some noise by keeping a weighted history of the
     * calculated value.  Wait until we have enough chunks that we can
     * check that the new value is sane: otherwise, after a long idle
     * period, we can calculate a crazy value.
     */
    if (in_memory > 1 && ondisk != NULL) {
        prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
        WT_ASSERT(session, prev_chunk->generation == 0);
        WT_ASSERT(session, WT_TIMECMP(
            last_chunk->create_ts, prev_chunk->create_ts) >= 0);
        timediff =
            WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts);
        WT_ASSERT(session, WT_TIMECMP(
            prev_chunk->create_ts, ondisk->create_ts) >= 0);
        oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts);
        if (timediff < 10 * oldtime)
            lsm_tree->chunk_fill_ms =
                (3 * lsm_tree->chunk_fill_ms +
                timediff / 1000000) / 4;
    }
}
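/*
 * Editorial example (illustrative sketch, not tree code): the chunk-fill
 * estimate above is a simple weighted history, three parts previous
 * estimate to one part new sample, so a single outlier can move the
 * estimate by at most a quarter of its error.  The same filter in
 * isolation (hypothetical helper; assumes <stdint.h>):
 */
static uint64_t
chunk_fill_update_example(uint64_t history_ms, uint64_t sample_ms)
{
    /* Keep 3/4 of the previous estimate, blend in 1/4 of the new sample. */
    return ((3 * history_ms + sample_ms) / 4);
}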
/* * __wt_page_in_func -- * Acquire a hazard pointer to a page; if the page is not in-memory, * read it from the disk and build an in-memory version. */ int __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags #ifdef HAVE_DIAGNOSTIC , const char *file, int line #endif ) { WT_DECL_RET; WT_PAGE *page; u_int sleep_cnt, wait_cnt; int busy, force_attempts, oldgen; for (force_attempts = oldgen = 0, wait_cnt = 0;;) { switch (ref->state) { case WT_REF_DISK: case WT_REF_DELETED: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); /* * The page isn't in memory, attempt to read it. * Make sure there is space in the cache. */ WT_RET(__wt_cache_full_check(session)); WT_RET(__wt_cache_read(session, ref)); oldgen = LF_ISSET(WT_READ_WONT_NEED) || F_ISSET(session, WT_SESSION_NO_CACHE); continue; case WT_REF_READING: if (LF_ISSET(WT_READ_CACHE)) return (WT_NOTFOUND); if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); WT_STAT_FAST_CONN_INCR(session, page_read_blocked); break; case WT_REF_LOCKED: if (LF_ISSET(WT_READ_NO_WAIT)) return (WT_NOTFOUND); WT_STAT_FAST_CONN_INCR(session, page_locked_blocked); break; case WT_REF_SPLIT: return (WT_RESTART); case WT_REF_MEM: /* * The page is in memory: get a hazard pointer, update * the page's LRU and return. The expected reason we * can't get a hazard pointer is because the page is * being evicted; yield and try again. */ #ifdef HAVE_DIAGNOSTIC WT_RET( __wt_hazard_set(session, ref, &busy, file, line)); #else WT_RET(__wt_hazard_set(session, ref, &busy)); #endif if (busy) { WT_STAT_FAST_CONN_INCR( session, page_busy_blocked); break; } page = ref->page; WT_ASSERT(session, page != NULL); /* * Forcibly evict pages that are too big. */ if (force_attempts < 10 && __evict_force_check(session, page, flags)) { ++force_attempts; ret = __wt_page_release_evict(session, ref); /* If forced eviction fails, stall. */ if (ret == EBUSY) { ret = 0; wait_cnt += 1000; WT_STAT_FAST_CONN_INCR(session, page_forcible_evict_blocked); break; } else WT_RET(ret); /* * The result of a successful forced eviction * is a page-state transition (potentially to * an in-memory page we can use, or a restart * return for our caller), continue the outer * page-acquisition loop. */ continue; } /* Check if we need an autocommit transaction. */ if ((ret = __wt_txn_autocommit_check(session)) != 0) { WT_TRET(__wt_hazard_clear(session, page)); return (ret); } /* * If we read the page and we are configured to not * trash the cache, set the oldest read generation so * the page is forcibly evicted as soon as possible. * * Otherwise, update the page's read generation. */ if (oldgen && page->read_gen == WT_READGEN_NOTSET) __wt_page_evict_soon(page); else if (!LF_ISSET(WT_READ_NO_GEN) && page->read_gen != WT_READGEN_OLDEST && page->read_gen < __wt_cache_read_gen(session)) page->read_gen = __wt_cache_read_gen_set(session); return (0); WT_ILLEGAL_VALUE(session); } /* * We failed to get the page -- yield before retrying, and if * we've yielded enough times, start sleeping so we don't burn * CPU to no purpose. */ if (++wait_cnt < 1000) __wt_yield(); else { sleep_cnt = WT_MIN(wait_cnt, 10000); wait_cnt *= 2; WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt); __wt_sleep(0, sleep_cnt); } } }
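/*
 * Editorial example (illustrative sketch, not tree code): the retry loop
 * above yields the CPU for the first thousand attempts, then switches to
 * capped, exponentially growing sleeps so a stalled page acquisition stops
 * burning CPU.  The same policy pulled into a hypothetical helper, using
 * the constants from the loop above (statistics updates omitted):
 */
static void
page_acquire_backoff_example(u_int *wait_cntp)
{
    uint64_t sleep_usecs;

    if (++*wait_cntp < 1000) {
        /* Early retries are cheap: just give up the time slice. */
        __wt_yield();
        return;
    }

    /* Sleep for the retry count in microseconds, capped at 10ms. */
    sleep_usecs = WT_MIN(*wait_cntp, 10000);
    *wait_cntp *= 2;
    __wt_sleep(0, sleep_usecs);
}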