Example #1
/*
 * __global_calibrate_ticks --
 *	Calibrate a ratio from rdtsc ticks to nanoseconds.
 */
static void
__global_calibrate_ticks(void)
{
	/*
	 * Default to using __wt_epoch until we have a good value for the ratio.
	 */
	__wt_process.tsc_nsec_ratio = WT_TSC_DEFAULT_RATIO;
	__wt_process.use_epochtime = true;

#if defined (__i386) || defined (__amd64)
	{
	struct timespec start, stop;
	double ratio;
	uint64_t diff_nsec, diff_tsc, min_nsec, min_tsc;
	uint64_t tries, tsc_start, tsc_stop;
	volatile uint64_t i;

	/*
	 * Run this calibration loop a few times to make sure we get a
	 * reading that does not have a potential scheduling shift in it.
	 * The inner loop is CPU intensive but a scheduling change in the
	 * middle could throw off calculations. Take the minimum amount
	 * of time and compute the ratio.
	 */
	min_nsec = min_tsc = UINT64_MAX;
	for (tries = 0; tries < 3; ++tries) {
		/* This needs to be CPU intensive and large enough. */
		__wt_epoch(NULL, &start);
		tsc_start = __wt_rdtsc();
		for (i = 0; i < 100 * WT_MILLION; i++)
			;
		tsc_stop = __wt_rdtsc();
		__wt_epoch(NULL, &stop);
		diff_nsec = WT_TIMEDIFF_NS(stop, start);
		diff_tsc = tsc_stop - tsc_start;

		/* If the clock didn't tick over, we don't have a sample. */
		if (diff_nsec == 0 || diff_tsc == 0)
			continue;
		min_nsec = WT_MIN(min_nsec, diff_nsec);
		min_tsc = WT_MIN(min_tsc, diff_tsc);
	}

	/*
	 * Only use rdtsc if we got a good reading.  One reason this might fail
	 * is that the system's clock granularity is not fine-grained enough.
	 */
	if (min_nsec != UINT64_MAX) {
		ratio = (double)min_tsc / (double)min_nsec;
		if (ratio > DBL_EPSILON) {
			__wt_process.tsc_nsec_ratio = ratio;
			__wt_process.use_epochtime = false;
		}
	}
	}
#endif
}
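For reference, the WT_MIN/WT_MAX helpers these examples lean on are plain ternary macros (matching their definition in WiredTiger's misc.h), and the calibrated ratio is consumed by dividing a tick delta by it. The conversion below is a hedged sketch; the name ticks_to_nsec is ours, not WiredTiger's.

/* Minimum/maximum of two values, as used throughout these examples. */
#define	WT_MIN(a, b)	((a) < (b) ? (a) : (b))
#define	WT_MAX(a, b)	((a) < (b) ? (b) : (a))

/*
 * ticks_to_nsec --
 *	Sketch: convert an rdtsc tick delta to nanoseconds using the ratio
 * calibrated above (illustrative only; the real code falls back to
 * __wt_epoch while use_epochtime is set).
 */
static inline uint64_t
ticks_to_nsec(uint64_t tsc_start, uint64_t tsc_stop)
{
	return ((uint64_t)
	    ((double)(tsc_stop - tsc_start) / __wt_process.tsc_nsec_ratio));
}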
Example #2
File: txn.c Project: To4e/mongo
/*
 * __snapsort_impl --
 *	Custom quick sort implementation for snapshots.
 */
static void
__snapsort_impl(uint64_t *array, uint32_t f, uint32_t l)
{
	while (f + 16 < l) {
		uint64_t v1 = array[f], v2 = array[l], v3 = array[(f + l)/2];
		uint64_t median = v1 < v2 ?
		    (v3 < v1 ? v1 : WT_MIN(v2, v3)) :
		    (v3 < v2 ? v2 : WT_MIN(v1, v3));
		uint32_t m = __snapsort_partition(array, f, l, median);
		__snapsort_impl(array, f, m);
		f = m + 1;
	}
}
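This helper deliberately leaves runs of 16 or fewer elements unsorted: it recurses only on the left partition and loops on the right, bounding stack depth. In the WiredTiger source a wrapper then finishes with an insertion-sort pass; a minimal sketch of that wrapper, assuming an __insertion_sort helper with the obvious signature:

/*
 * __snapsort --
 *	Sort an array of transaction IDs (sketch).
 */
static void
__snapsort(uint64_t *array, uint32_t size)
{
	/* Quicksort down to 16-element runs... */
	__snapsort_impl(array, 0, size - 1);
	/* ...then one insertion-sort pass cleans up the short runs. */
	__insertion_sort(array, size);
}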
Example #3
/*
 * __ovfl_reuse_verbose --
 *	Dump information about a reuse overflow record.
 */
static int
__ovfl_reuse_verbose(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_OVFL_REUSE *reuse, const char *tag)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;

	WT_RET(__wt_scr_alloc(session, 64, &tmp));

	WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
	    "reuse: %s%s%p %s (%s%s%s) {%.*s}",
	    tag == NULL ? "" : tag,
	    tag == NULL ? "" : ": ",
	    page,
	    __wt_addr_string(
		session, WT_OVFL_REUSE_ADDR(reuse), reuse->addr_size, tmp),
	    F_ISSET(reuse, WT_OVFL_REUSE_INUSE) ? "inuse" : "",
	    F_ISSET(reuse, WT_OVFL_REUSE_INUSE) &&
	    F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? ", " : "",
	    F_ISSET(reuse, WT_OVFL_REUSE_JUST_ADDED) ? "just-added" : "",
	    WT_MIN(reuse->value_size, 40), (char *)WT_OVFL_REUSE_VALUE(reuse)));

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
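The WT_MIN(reuse->value_size, 40) paired with "%.*s" is a trick worth calling out: it clamps how many bytes of a possibly huge, not necessarily NUL-terminated value get printed. In isolation (a standalone sketch, not WiredTiger code):

#include <stdio.h>
#include <string.h>

#define	WT_MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	const char *value = "a-possibly-very-long-overflow-value";
	size_t value_size = strlen(value);

	/* Print at most 10 bytes; %.*s never reads past the precision. */
	printf("{%.*s}\n", (int)WT_MIN(value_size, (size_t)10), value);
	return (0);
}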
Example #4
/*
 * __ovfl_txnc_skip_search_stack --
 *	 Search an overflow transaction-cache skiplist, returning an
 * insert/remove stack.
 */
static void
__ovfl_txnc_skip_search_stack(WT_OVFL_TXNC **head,
    WT_OVFL_TXNC ***stack, const void *addr, size_t addr_size)
{
	WT_OVFL_TXNC **e;
	size_t len;
	int cmp, i;

	/*
	 * Start at the highest skip level, then go as far as possible at each
	 * level before stepping down to the next.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
		if (*e == NULL) {		/* Empty levels */
			stack[i--] = e--;
			continue;
		}

		/*
		 * If the skiplist addr is larger than the search addr, or
		 * they compare equally and the skiplist addr is longer than
		 * the search addr, drop down a level, otherwise continue on
		 * this level.
		 */
		len = WT_MIN((*e)->addr_size, addr_size);
		cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len);
		if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size))
			stack[i--] = e--;	/* Drop down a level */
		else
			e = &(*e)->next[i];	/* Keep going at this level */
	}
}
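The stack this function fills holds, for each level, the address of the next-pointer that must be respliced to insert or remove an element at the search position. A hedged sketch of the insert side (the function name and "depth" parameter are ours; the splice shape is the standard skiplist pattern):

/*
 * skip_insert --
 *	Sketch: splice a new element of height "depth" into the list at the
 * position recorded by __ovfl_txnc_skip_search_stack.
 */
static void
skip_insert(WT_OVFL_TXNC ***stack, WT_OVFL_TXNC *new_e, u_int depth)
{
	u_int i;

	for (i = 0; i < depth; ++i) {
		new_e->next[i] = *stack[i];
		*stack[i] = new_e;
	}
}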
Example #5
/*
 * __posix_file_write --
 *	POSIX pwrite.
 */
static int
__posix_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session,
    wt_off_t offset, size_t len, const void *buf)
{
	WT_FILE_HANDLE_POSIX *pfh;
	WT_SESSION_IMPL *session;
	size_t chunk;
	ssize_t nw;
	const uint8_t *addr;

	session = (WT_SESSION_IMPL *)wt_session;
	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;

	/* Assert direct I/O is aligned and a multiple of the alignment. */
	WT_ASSERT(session,
	    !pfh->direct_io ||
	    S2C(session)->buffer_alignment == 0 ||
	    (!((uintptr_t)buf &
	    (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
	    len >= S2C(session)->buffer_alignment &&
	    len % S2C(session)->buffer_alignment == 0));

	/* Break writes larger than 1GB into 1GB chunks. */
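	/*
	 * The loop header also absorbs short writes: pwrite may return
	 * fewer bytes than chunk, and addr/len/offset all advance by the
	 * actual count in nw.
	 */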
	for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) {
		chunk = WT_MIN(len, WT_GIGABYTE);
		if ((nw = pwrite(pfh->fd, addr, chunk, offset)) < 0)
			WT_RET_MSG(session, __wt_errno(),
			    "%s: handle-write: pwrite: failed to write %"
			    WT_SIZET_FMT " bytes at offset %" PRIuMAX,
			    file_handle->name, chunk, (uintmax_t)offset);
	}
	return (0);
}
Example #6
/*
 * __wt_read --
 *	Read a chunk.
 */
int
__wt_read(
    WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf)
{
	size_t chunk;
	ssize_t nr;
	uint8_t *addr;

	WT_STAT_FAST_CONN_INCR(session, read_io);

	WT_RET(__wt_verbose(session, WT_VERB_FILEOPS,
	    "%s: read %" WT_SIZET_FMT " bytes at offset %" PRIuMAX,
	    fh->name, len, (uintmax_t)offset));

	/* Assert direct I/O is aligned and a multiple of the alignment. */
	WT_ASSERT(session,
	    !fh->direct_io ||
	    S2C(session)->buffer_alignment == 0 ||
	    (!((uintptr_t)buf &
	    (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
	    len >= S2C(session)->buffer_alignment &&
	    len % S2C(session)->buffer_alignment == 0));

	/* Break reads larger than 1GB into 1GB chunks. */
	for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
		chunk = WT_MIN(len, WT_GIGABYTE);
		if ((nr = pread(fh->fd, addr, chunk, offset)) <= 0)
			WT_RET_MSG(session,
			    nr == 0 ? WT_ERROR : __wt_errno(),
			    "%s read error: failed to read %" WT_SIZET_FMT
			    " bytes at offset %" PRIuMAX,
			    fh->name, chunk, (uintmax_t)offset);
	}
	return (0);
}
Example #7
/*
 * __ovfl_reuse_skip_search --
 *	Return the first, not in-use, matching value in the overflow reuse list.
 */
static WT_OVFL_REUSE *
__ovfl_reuse_skip_search(
    WT_OVFL_REUSE **head, const void *value, size_t value_size)
{
	WT_OVFL_REUSE **e, *next;
	size_t len;
	int cmp, i;

	/*
	 * Start at the highest skip level, then go as far as possible at each
	 * level before stepping down to the next.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
		if (*e == NULL) {		/* Empty levels */
			--i;
			--e;
			continue;
		}

		/*
		 * Values are not unique, and it's possible to have long lists
		 * of identical overflow items.  (We've seen it in benchmarks.)
		 * Move through a list of identical items at the current level
		 * as long as the next one is in-use, otherwise, drop down a
		 * level. When at the bottom level, return items if reusable,
		 * else NULL.
		 */
		len = WT_MIN((*e)->value_size, value_size);
		cmp = memcmp(WT_OVFL_REUSE_VALUE(*e), value, len);
		if (cmp == 0 && (*e)->value_size == value_size) {
			if (i == 0)
				return (F_ISSET(*e,
				    WT_OVFL_REUSE_INUSE) ? NULL : *e);
			if ((next = (*e)->next[i]) == NULL ||
			    !F_ISSET(next, WT_OVFL_REUSE_INUSE) ||
			    next->value_size != len || memcmp(
			    WT_OVFL_REUSE_VALUE(next), value, len) != 0) {
				--i;		/* Drop down a level */
				--e;
			} else			/* Keep going at this level */
				e = &(*e)->next[i];
			continue;
		}

		/*
		 * If the skiplist value is larger than the search value, or
		 * they compare equally and the skiplist value is longer than
		 * the search value, drop down a level, otherwise continue on
		 * this level.
		 */
		if (cmp > 0 || (cmp == 0 && (*e)->value_size > value_size)) {
			--i;			/* Drop down a level */
			--e;
		} else				/* Keep going at this level */
			e = &(*e)->next[i];
	}
	return (NULL);
}
Example #8
/*
 * __wt_log_slot_init --
 *	Initialize the slot array.
 */
int
__wt_log_slot_init(WT_SESSION_IMPL *session, bool alloc)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOGSLOT *slot;
	int32_t i;

	conn = S2C(session);
	log = conn->log;
	for (i = 0; i < WT_SLOT_POOL; i++)
		log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE;

	/*
	 * Allocate memory for buffers now that the arrays are setup. Separate
	 * this from the loop above to make error handling simpler.
	 */
	/*
	 * !!! If the buffer size is too close to the log file size, we will
	 * switch log files very aggressively.  Scale back the buffer for
	 * small log file sizes.
	 */
	if (alloc) {
		log->slot_buf_size = (uint32_t)WT_MIN(
		    (size_t)conn->log_file_max / 10, WT_LOG_SLOT_BUF_SIZE);
		for (i = 0; i < WT_SLOT_POOL; i++) {
			WT_ERR(__wt_buf_init(session,
			    &log->slot_pool[i].slot_buf, log->slot_buf_size));
			F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
		}
		WT_STAT_CONN_SET(session,
		    log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
	}
	/*
	 * Set up the available slot from the pool the first time.
	 */
	slot = &log->slot_pool[0];
	/*
	 * We cannot initialize the release LSN in the activate function
	 * because that function can be called after a log file switch.
	 * The release LSN is usually the same as the slot_start_lsn except
	 * around a log file switch.
	 */
	slot->slot_release_lsn = log->alloc_lsn;
	__wt_log_slot_activate(session, slot);
	log->active_slot = slot;
	log->pool_index = 0;

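	/*
	 * Error unwind: the block below is reachable only through the err
	 * label, and "--i" walks back so only the slot buffers initialized
	 * before the failing __wt_buf_init call are freed.
	 */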
	if (0) {
err:		while (--i >= 0)
			__wt_buf_free(session, &log->slot_pool[i].slot_buf);
	}
	return (ret);
}
Example #9
/*
 * __win_file_read --
 *	Read a chunk.
 */
static int
__win_file_read(WT_FILE_HANDLE *file_handle,
    WT_SESSION *wt_session, wt_off_t offset, size_t len, void *buf)
{
	DWORD chunk, nr, windows_error;
	OVERLAPPED overlapped = { 0 };
	WT_DECL_RET;
	WT_FILE_HANDLE_WIN *win_fh;
	WT_SESSION_IMPL *session;
	uint8_t *addr;

	win_fh = (WT_FILE_HANDLE_WIN *)file_handle;
	session = (WT_SESSION_IMPL *)wt_session;

	nr = 0;

	/* Assert direct I/O is aligned and a multiple of the alignment. */
	WT_ASSERT(session,
	    !win_fh->direct_io ||
	    S2C(session)->buffer_alignment == 0 ||
	    (!((uintptr_t)buf &
	    (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
	    len >= S2C(session)->buffer_alignment &&
	    len % S2C(session)->buffer_alignment == 0));

	/* Break reads larger than 1GB into 1GB chunks. */
	for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
		chunk = (DWORD)WT_MIN(len, WT_GIGABYTE);
		overlapped.Offset = UINT32_MAX & offset;
		overlapped.OffsetHigh = UINT32_MAX & (offset >> 32);

		if (!ReadFile(
		    win_fh->filehandle, addr, chunk, &nr, &overlapped)) {
			windows_error = __wt_getlasterror();
			ret = __wt_map_windows_error(windows_error);
			if (ret == WT_ERROR)
				F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
			__wt_err(session, ret,
			    "%s: handle-read: ReadFile: failed to read %lu "
			    "bytes at offset %" PRIuMAX ": %s",
			    file_handle->name, chunk, (uintmax_t)offset,
			    __wt_formatmessage(session, windows_error));
			return (ret);
		}
	}
	return (0);
}
Example #10
/*
 * __wt_log_slot_init --
 *	Initialize the slot array.
 */
int
__wt_log_slot_init(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOGSLOT *slot;
	int32_t i;

	conn = S2C(session);
	log = conn->log;
	for (i = 0; i < WT_SLOT_POOL; i++) {
		log->slot_pool[i].slot_state = WT_LOG_SLOT_FREE;
		log->slot_pool[i].slot_index = WT_SLOT_INVALID_INDEX;
	}

	/*
	 * Set up the available slots from the pool the first time.
	 */
	for (i = 0; i < WT_SLOT_ACTIVE; i++) {
		slot = &log->slot_pool[i];
		slot->slot_index = (uint32_t)i;
		slot->slot_state = WT_LOG_SLOT_READY;
		log->slot_array[i] = slot;
	}

	/*
	 * Allocate memory for buffers now that the arrays are setup. Split
	 * this out to make error handling simpler.
	 *
	 * Cap the slot buffer to the log file size.
	 */
	log->slot_buf_size =
	    WT_MIN((size_t)conn->log_file_max, WT_LOG_SLOT_BUF_SIZE);
	for (i = 0; i < WT_SLOT_POOL; i++) {
		WT_ERR(__wt_buf_init(session,
		    &log->slot_pool[i].slot_buf, log->slot_buf_size));
		F_SET(&log->slot_pool[i], WT_SLOT_INIT_FLAGS);
	}
	WT_STAT_FAST_CONN_INCRV(session,
	    log_buffer_size, log->slot_buf_size * WT_SLOT_POOL);
	if (0) {
err:		while (--i >= 0)
			__wt_buf_free(session, &log->slot_pool[i].slot_buf);
	}
	return (ret);
}
Example #11
/*
 * __wt_log_open --
 *	Open the appropriate log file for the connection.  The purpose is
 *	to find the last log file that exists, open it and set our initial
 *	LSNs to the end of that file.  If none exist, call __wt_log_newfile
 *	to create it.
 */
int
__wt_log_open(WT_SESSION_IMPL *session)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	uint32_t firstlog, lastlog, lognum;
	u_int i, logcount;
	char **logfiles;

	conn = S2C(session);
	log = conn->log;
	lastlog = 0;
	firstlog = UINT32_MAX;

	WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
	for (i = 0; i < logcount; i++) {
		WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum));
		lastlog = WT_MAX(lastlog, lognum);
		firstlog = WT_MIN(firstlog, lognum);
	}
	log->fileid = lastlog;
	WT_ERR(__wt_verbose(session, WT_VERB_LOG,
	    "log_open: first log %" PRIu32 " last log %" PRIu32,
	    firstlog, lastlog));
	log->first_lsn.file = firstlog;
	log->first_lsn.offset = 0;

	/*
	 * Start logging at the beginning of the next log file, no matter
	 * where the previous log file ends.
	 */
	WT_ERR(__wt_log_newfile(session, 1));

	/*
	 * If there were log files, run recovery.
	 * XXX belongs at a higher level than this.
	 */
	if (logcount > 0) {
		log->trunc_lsn = log->alloc_lsn;
		WT_ERR(__wt_txn_recover(session));
	}

err:	__wt_log_files_free(session, logfiles, logcount);
	return (ret);
}
Example #12
/*
 * __fstream_getline --
 *	Get a line from a stream.
 *
 * Implementation of the POSIX getline or BSD fgetln functions (finding the
 * function in a portable way is hard; it's simple enough to write it instead).
 *
 * Note: Unlike the standard getline calls, this function doesn't include the
 * trailing newline character in the returned buffer and discards empty lines
 * (so the caller's EOF marker is a returned line length of 0).
 */
static int
__fstream_getline(WT_SESSION_IMPL *session, WT_FSTREAM *fstr, WT_ITEM *buf)
{
	const char *p;
	size_t len;
	char c;

	/*
	 * We always NUL-terminate the returned string (even if it's empty);
	 * make sure there's buffer space for a trailing NUL in all cases.
	 */
	WT_RET(__wt_buf_init(session, buf, 100));

	for (;;) {
		/* Check if we need to refill the buffer. */
		if (WT_PTRDIFF(fstr->buf.data, fstr->buf.mem) >=
		    fstr->buf.size) {
			len = WT_MIN(WT_STREAM_BUFSIZE,
			    (size_t)(fstr->size - fstr->off));
			if (len == 0)
				break; /* EOF */
			WT_RET(__wt_buf_initsize(session, &fstr->buf, len));
			WT_RET(__wt_read(
			    session, fstr->fh, fstr->off, len, fstr->buf.mem));
			fstr->off += (wt_off_t)len;
		}

		c = *(p = fstr->buf.data);
		fstr->buf.data = ++p;

		/* Leave space for a trailing NUL. */
		WT_RET(__wt_buf_extend(session, buf, buf->size + 2));
		if (c == '\n') {
			if (buf->size == 0)
				continue;
			break;
		}
		((char *)buf->mem)[buf->size++] = c;
	}

	((char *)buf->mem)[buf->size] = '\0';

	return (0);
}
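Per the contract described in the function comment, EOF is signalled by a returned line length of 0, so callers loop until the buffer comes back empty. A minimal consumption sketch, assuming an initialized WT_FSTREAM and a scratch WT_ITEM:

/*
 * read_all_lines --
 *	Sketch: visit every line of a stream.
 */
static int
read_all_lines(WT_SESSION_IMPL *session, WT_FSTREAM *fstr, WT_ITEM *buf)
{
	for (;;) {
		WT_RET(__fstream_getline(session, fstr, buf));
		if (buf->size == 0)
			return (0);	/* EOF: empty line returned */
		/* Process buf->size bytes at buf->mem (NUL-terminated). */
	}
}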
Example #13
/*
 * __win_file_write --
 *	Write a chunk.
 */
static int
__win_file_write(WT_FILE_HANDLE *file_handle,
    WT_SESSION *wt_session, wt_off_t offset, size_t len, const void *buf)
{
	DWORD chunk, nw, windows_error;
	const uint8_t *addr;
	OVERLAPPED overlapped = { 0 };
	WT_FILE_HANDLE_WIN *win_fh;
	WT_SESSION_IMPL *session;

	win_fh = (WT_FILE_HANDLE_WIN *)file_handle;
	session = (WT_SESSION_IMPL *)wt_session;

	nw = 0;

	/* Assert direct I/O is aligned and a multiple of the alignment. */
	WT_ASSERT(session,
	    !win_fh->direct_io ||
	    S2C(session)->buffer_alignment == 0 ||
	    (!((uintptr_t)buf &
	    (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
	    len >= S2C(session)->buffer_alignment &&
	    len % S2C(session)->buffer_alignment == 0));

	/* Break writes larger than 1GB into 1GB chunks. */
	for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) {
		chunk = (DWORD)WT_MIN(len, WT_GIGABYTE);
		overlapped.Offset = UINT32_MAX & offset;
		overlapped.OffsetHigh = UINT32_MAX & (offset >> 32);

		if (!WriteFile(
		    win_fh->filehandle, addr, chunk, &nw, &overlapped)) {
			windows_error = __wt_getlasterror();
			__wt_errx(session,
			    "%s: handle-write: WriteFile: failed to write %lu "
			    "bytes at offset %" PRIuMAX ": %s",
			    file_handle->name, chunk, (uintmax_t)offset,
			    __wt_formatmessage(session, windows_error));
			return (__wt_map_windows_error(windows_error));
		}
	}
	return (0);
}
Example #14
/*
 * __ovfl_txnc_skip_search --
 *	Return the first matching addr in the overflow transaction-cache list.
 */
static WT_OVFL_TXNC *
__ovfl_txnc_skip_search(WT_OVFL_TXNC **head, const void *addr, size_t addr_size)
{
	WT_OVFL_TXNC **e;
	size_t len;
	int cmp, i;

	/*
	 * Start at the highest skip level, then go as far as possible at each
	 * level before stepping down to the next.
	 */
	for (i = WT_SKIP_MAXDEPTH - 1, e = &head[i]; i >= 0;) {
		if (*e == NULL) {		/* Empty levels */
			--i;
			--e;
			continue;
		}

		/*
		 * Return any exact matches: we don't care in what search level
		 * we found a match.
		 */
		len = WT_MIN((*e)->addr_size, addr_size);
		cmp = memcmp(WT_OVFL_TXNC_ADDR(*e), addr, len);
		if (cmp == 0 && (*e)->addr_size == addr_size)
			return (*e);

		/*
		 * If the skiplist address is larger than the search address, or
		 * they compare equally and the skiplist address is longer than
		 * the search address, drop down a level, otherwise continue on
		 * this level.
		 */
		if (cmp > 0 || (cmp == 0 && (*e)->addr_size > addr_size)) {
			--i;			/* Drop down a level */
			--e;
		} else				/* Keep going at this level */
			e = &(*e)->next[i];
	}
	return (NULL);
}
Example #15
/*
 * Run the throttle function.  We will sleep if needed and then reload the
 * counter to perform more operations.
 */
int
worker_throttle(CONFIG_THREAD *thread)
{
	THROTTLE_CONFIG *throttle_cfg;
	struct timespec now;
	uint64_t usecs_delta;

	throttle_cfg = &thread->throttle_cfg;

	WT_RET(__wt_epoch(NULL, &now));

	/*
	 * If we did enough operations in the current interval, sleep for
	 * the rest of the interval. Then add more operations to the queue.
	 */
	usecs_delta = WT_TIMEDIFF_US(now, throttle_cfg->last_increment);
	if (usecs_delta < throttle_cfg->usecs_increment) {
		(void)usleep(
		    (useconds_t)(throttle_cfg->usecs_increment - usecs_delta));
		throttle_cfg->ops_count =
		     throttle_cfg->ops_per_increment;
		/*
		 * After sleeping, set the interval to the current time.
		 */
		WT_RET(__wt_epoch(NULL, &throttle_cfg->last_increment));
	} else {
		throttle_cfg->ops_count = (usecs_delta *
		    throttle_cfg->ops_per_increment) /
		    throttle_cfg->usecs_increment;
		throttle_cfg->last_increment = now;
	}

	/*
	 * Take the minimum so we don't overfill the queue.
	 */
	throttle_cfg->ops_count =
	    WT_MIN(throttle_cfg->ops_count, thread->workload->throttle);

	return (0);
}
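For intuition about the reload arithmetic, plug in hypothetical numbers: with usecs_increment = 100000 (a 100ms interval) and ops_per_increment = 50, a measured delta of 250000 usecs reloads ops_count = 250000 * 50 / 100000 = 125 operations, which the final WT_MIN then clamps to the workload's configured throttle so the queue is never overfilled.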
Example #16
/*
 * __wt_bloom_finalize --
 *	Writes the Bloom filter to stable storage. After calling finalize, only
 *	read operations can be performed on the bloom filter.
 */
int
__wt_bloom_finalize(WT_BLOOM *bloom)
{
	WT_CURSOR *c;
	WT_DECL_RET;
	WT_ITEM values;
	WT_SESSION *wt_session;
	uint64_t i;

	wt_session = (WT_SESSION *)bloom->session;
	WT_CLEAR(values);

	/*
	 * Create a bit table to store the bloom filter in.
	 * TODO: should this call __wt_schema_create directly?
	 */
	WT_RET(wt_session->create(wt_session, bloom->uri, bloom->config));
	WT_RET(wt_session->open_cursor(
	    wt_session, bloom->uri, NULL, "bulk=bitmap", &c));

	/* Add the entries from the array into the table. */
	for (i = 0; i < bloom->m; i += values.size) {
		/* Adjust bits to bytes for string offset */
		values.data = bloom->bitstring + (i >> 3);
		/*
		 * Shave off some bytes for pure paranoia, in case WiredTiger
		 * reserves some special sizes. Choose a value so that if
		 * we do multiple inserts, it will be on a byte boundary.
		 */
		values.size = (uint32_t)WT_MIN(bloom->m - i, UINT32_MAX - 127);
		c->set_value(c, &values);
		WT_ERR(c->insert(c));
	}

err:	WT_TRET(c->close(c));
	__wt_free(bloom->session, bloom->bitstring);
	bloom->bitstring = NULL;

	return (ret);
}
Example #17
/*
 * __ovfl_txnc_verbose --
 *	Dump information about a transaction-cached overflow record.
 */
static int
__ovfl_txnc_verbose(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_OVFL_TXNC *txnc, const char *tag)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;

	WT_RET(__wt_scr_alloc(session, 64, &tmp));

	WT_ERR(__wt_verbose(session, WT_VERB_OVERFLOW,
	    "txn-cache: %s%s%p %s %" PRIu64 " {%.*s}",
	    tag == NULL ? "" : tag,
	    tag == NULL ? "" : ": ",
	    page,
	    __wt_addr_string(
		session, WT_OVFL_TXNC_ADDR(txnc), txnc->addr_size, tmp),
	    txnc->current,
	    WT_MIN(txnc->value_size, 40), (char *)WT_OVFL_TXNC_VALUE(txnc)));

err:	__wt_scr_free(session, &tmp);
	return (ret);
}
Beispiel #18
0
/*
 * __wt_mmap_preload --
 *	Cause a section of a memory map to be faulted in.
 */
int
__wt_mmap_preload(WT_SESSION_IMPL *session, const void *p, size_t size)
{
#ifdef HAVE_POSIX_MADVISE
	/* Linux requires the address be aligned to a 4KB boundary. */
	WT_BM *bm = S2BT(session)->bm;
	WT_DECL_RET;
	void *blk = (void *)((uintptr_t)p & ~(uintptr_t)(WT_VM_PAGESIZE - 1));
	size += WT_PTRDIFF(p, blk);

	/* XXX proxy for "am I doing a scan?" -- manual read-ahead */
	if (F_ISSET(session, WT_SESSION_NO_CACHE)) {
		/* Read in 2MB blocks every 1MB of data. */
		if (((uintptr_t)((uint8_t *)blk + size) &
		    (uintptr_t)((1<<20) - 1)) < (uintptr_t)blk)
			return (0);
		size = WT_MIN(WT_MAX(20 * size, 2 << 20),
		    WT_PTRDIFF((uint8_t *)bm->map + bm->maplen, blk));
	}

	/*
	 * Manual pages aren't clear on whether alignment is required for the
	 * size, so we will be conservative.
	 */
	size &= ~(size_t)(WT_VM_PAGESIZE - 1);

	if (size > WT_VM_PAGESIZE &&
	    (ret = posix_madvise(blk, size, POSIX_MADV_WILLNEED)) != 0)
		WT_RET_MSG(session, ret, "posix_madvise will need");
#else
	WT_UNUSED(session);
	WT_UNUSED(p);
	WT_UNUSED(size);
#endif

	return (0);
}
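A quick worked example of the alignment arithmetic: with WT_VM_PAGESIZE = 4096, a pointer p = 0x12345 masks down to blk = 0x12000 (clearing the low 12 bits), size grows by the 0x345 bytes the rounding exposed, and the final "size &= ~(size_t)(WT_VM_PAGESIZE - 1)" trims the range back to whole pages before posix_madvise runs.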
Example #19
/*
 * __log_archive_once --
 *	Perform one iteration of log archiving.  Must be called with the
 *	log archive lock held.
 */
static int
__log_archive_once(WT_SESSION_IMPL *session, uint32_t backup_file)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	uint32_t lognum, min_lognum;
	u_int i, locked, logcount;
	char **logfiles;

	conn = S2C(session);
	log = conn->log;
	logcount = 0;
	logfiles = NULL;

	/*
	 * If we're coming from a backup cursor we want the smaller of
	 * the last full log file copied in backup or the checkpoint LSN.
	 * Otherwise we want the minimum of the last log file written to
	 * disk and the checkpoint LSN.
	 */
	if (backup_file != 0)
		min_lognum = WT_MIN(log->ckpt_lsn.file, backup_file);
	else
		min_lognum = WT_MIN(log->ckpt_lsn.file, log->sync_lsn.file);
	WT_RET(__wt_verbose(session, WT_VERB_LOG,
	    "log_archive: archive to log number %" PRIu32, min_lognum));

	/*
	 * Main archive code.  Get the list of all log files and
	 * remove any earlier than the minimum log number.
	 */
	WT_RET(__wt_dirlist(session, conn->log_path,
	    WT_LOG_FILENAME, WT_DIRLIST_INCLUDE, &logfiles, &logcount));

	/*
	 * We can only archive files if a hot backup is not in progress or
	 * if we are the backup.
	 */
	WT_RET(__wt_readlock(session, conn->hot_backup_lock));
	locked = 1;
	if (conn->hot_backup == 0 || backup_file != 0) {
		for (i = 0; i < logcount; i++) {
			WT_ERR(__wt_log_extract_lognum(
			    session, logfiles[i], &lognum));
			if (lognum < min_lognum)
				WT_ERR(__wt_log_remove(
				    session, WT_LOG_FILENAME, lognum));
		}
	}
	WT_ERR(__wt_readunlock(session, conn->hot_backup_lock));
	locked = 0;
	__wt_log_files_free(session, logfiles, logcount);
	logfiles = NULL;
	logcount = 0;

	/*
	 * Indicate what is our new earliest LSN.  It is the start
	 * of the log file containing the last checkpoint.
	 */
	log->first_lsn.file = min_lognum;
	log->first_lsn.offset = 0;

	if (0)
err:		__wt_err(session, ret, "log archive server error");
	if (locked)
		WT_TRET(__wt_readunlock(session, conn->hot_backup_lock));
	if (logfiles != NULL)
		__wt_log_files_free(session, logfiles, logcount);
	return (ret);
}
Example #20
/*
 * __ckpt_process --
 *	Process the list of checkpoints.
 */
static int
__ckpt_process(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_CKPT *ckptbase)
{
	WT_BLOCK_CKPT *a, *b, *ci;
	WT_CKPT *ckpt, *next_ckpt;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	uint64_t ckpt_size;
	bool deleting, fatal, locked;

	ci = &block->live;
	fatal = locked = false;

#ifdef HAVE_DIAGNOSTIC
	WT_RET(__ckpt_verify(session, ckptbase));
#endif

	/*
	 * Checkpoints are a two-step process: first, write a new checkpoint to
	 * disk (including all the new extent lists for modified checkpoints
	 * and the live system).  As part of this, create a list of file blocks
	 * newly available for reallocation, based on checkpoints being deleted.
	 * We then return the locations of the new checkpoint information to our
	 * caller.  Our caller has to write that information into some kind of
	 * stable storage, and once that's done, we can actually allocate from
	 * that list of newly available file blocks.  (We can't allocate from
	 * that list immediately because the allocation might happen before our
	 * caller saves the new checkpoint information, and if we crashed before
	 * the new checkpoint location was saved, we'd have overwritten blocks
	 * still referenced by checkpoints in the system.)  In summary, there is
	 * a second step: after our caller saves the checkpoint information, we
	 * are called to add the newly available blocks into the live system's
	 * available list.
	 *
	 * This function is the first step, the second step is in the resolve
	 * function.
	 *
	 * If we're called to checkpoint the same file twice (without the second
	 * resolution step), or re-entered for any reason, it's an error in our
	 * caller, and our choices are all bad: leak blocks or potentially crash
	 * with our caller not yet having saved previous checkpoint information
	 * to stable storage.
	 */
	__wt_spin_lock(session, &block->live_lock);
	if (block->ckpt_inprogress)
		ret = __wt_block_panic(session, EINVAL,
		    "%s: unexpected checkpoint ordering", block->name);
	else
		block->ckpt_inprogress = true;
	__wt_spin_unlock(session, &block->live_lock);
	WT_RET(ret);

	/*
	 * Extents newly available as a result of deleting previous checkpoints
	 * are added to a list of extents.  The list should be empty, but as
	 * described above, there is no "free the checkpoint information" call
	 * into the block manager; if there was an error in an upper level that
	 * resulted in some previous checkpoint never being resolved, the list
	 * may not be empty.  We should have caught that with the "checkpoint
	 * in progress" test, but it doesn't cost us anything to be cautious.
	 *
	 * We free the checkpoint's allocation and discard extent lists as part
	 * of the resolution step, not because they're needed at that time, but
	 * because it's potentially a lot of work, and waiting allows the btree
	 * layer to continue eviction sooner.  As for the checkpoint-available
	 * list, make sure they get cleaned out.
	 */
	__wt_block_extlist_free(session, &ci->ckpt_avail);
	WT_RET(__wt_block_extlist_init(
	    session, &ci->ckpt_avail, "live", "ckpt_avail", true));
	__wt_block_extlist_free(session, &ci->ckpt_alloc);
	__wt_block_extlist_free(session, &ci->ckpt_discard);

	/*
	 * To delete a checkpoint, we'll need checkpoint information for it and
	 * the subsequent checkpoint into which it gets rolled; read them from
	 * disk before we lock things down.
	 */
	deleting = false;
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;
		deleting = true;

		/*
		 * Read the checkpoint and next checkpoint extent lists if we
		 * haven't already read them (we may have already read these
		 * extent blocks if there is more than one deleted checkpoint).
		 */
		if (ckpt->bpriv == NULL)
			WT_ERR(__ckpt_extlist_read(session, block, ckpt));

		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * The "next" checkpoint may be the live tree which has no
		 * extent blocks to read.
		 */
		if (next_ckpt->bpriv == NULL &&
		    !F_ISSET(next_ckpt, WT_CKPT_ADD))
			WT_ERR(__ckpt_extlist_read(session, block, next_ckpt));
	}

	/*
	 * Failures are now fatal: we can't currently back out the merge of any
	 * deleted checkpoint extent lists into the live system's extent lists,
	 * so continuing after error would leave the live system's extent lists
	 * corrupted for any subsequent checkpoint (and potentially, should a
	 * subsequent checkpoint succeed, for recovery).
	 */
	fatal = true;

	/*
	 * Hold a lock so the live extent lists and the file size can't change
	 * underneath us.  I suspect we'll tighten this if checkpoints take too
	 * much time away from real work: we read the historic checkpoint
	 * information without a lock, but we could also merge and re-write the
	 * deleted and merged checkpoint information without a lock, except for
	 * the final merge of ranges into the live tree.
	 */
	__wt_spin_lock(session, &block->live_lock);
	locked = true;

	/*
	 * We've allocated our last page, update the checkpoint size.  We need
	 * to calculate the live system's checkpoint size before merging
	 * checkpoint allocation and discard information from the checkpoints
	 * we're deleting, those operations change the underlying byte counts.
	 */
	ckpt_size = ci->ckpt_size;
	ckpt_size += ci->alloc.bytes;
	ckpt_size -= ci->discard.bytes;

	/* Skip the additional processing if we aren't deleting checkpoints. */
	if (!deleting)
		goto live_update;

	/*
	 * Delete any no-longer-needed checkpoints: we do this first as it frees
	 * blocks to the live lists, and the freed blocks will then be included
	 * when writing the live extent lists.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt) {
		if (F_ISSET(ckpt, WT_CKPT_FAKE) ||
		    !F_ISSET(ckpt, WT_CKPT_DELETE))
			continue;

#ifdef HAVE_VERBOSE
		if (WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) {
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__ckpt_string(
			    session, block, ckpt->raw.data, tmp));
			__wt_verbose(session, WT_VERB_CHECKPOINT,
			    "%s: delete-checkpoint: %s: %s",
			    block->name, ckpt->name, (const char *)tmp->data);
		}
#endif
		/*
		 * Find the checkpoint into which we'll roll this checkpoint's
		 * blocks: it's the next real checkpoint in the list, and it
		 * better have been read in (if it's not the add slot).
		 */
		for (next_ckpt = ckpt + 1;; ++next_ckpt)
			if (!F_ISSET(next_ckpt, WT_CKPT_FAKE))
				break;

		/*
		 * Set the from/to checkpoint structures, where the "to" value
		 * may be the live tree.
		 */
		a = ckpt->bpriv;
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			b = &block->live;
		else
			b = next_ckpt->bpriv;

		/*
		 * Free the root page: there's nothing special about this free,
		 * the root page is allocated using normal rules, that is, it
		 * may have been taken from the avail list, and was entered on
		 * the live system's alloc list at that time.  We free it into
		 * the checkpoint's discard list, however, not the live system's
		 * list because it appears on the checkpoint's alloc list and so
		 * must be paired in the checkpoint.
		 */
		if (a->root_offset != WT_BLOCK_INVALID_OFFSET)
			WT_ERR(__wt_block_insert_ext(session, block,
			    &a->discard, a->root_offset, a->root_size));

		/*
		 * Free the blocks used to hold the "from" checkpoint's extent
		 * lists, including the avail list.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->avail));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &a->discard));

		/*
		 * Roll the "from" alloc and discard extent lists into the "to"
		 * checkpoint's lists.
		 */
		if (a->alloc.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->alloc, &b->alloc));
		if (a->discard.entries != 0)
			WT_ERR(__wt_block_extlist_merge(
			    session, block, &a->discard, &b->discard));

		/*
		 * If the "to" checkpoint is also being deleted, we're done with
		 * it; it's merged into some other checkpoint in the next loop.
		 * This means the extent lists may aggregate over a number of
		 * checkpoints, but that's OK, they're disjoint sets of ranges.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_DELETE))
			continue;

		/*
		 * Find blocks for re-use: wherever the "to" checkpoint's
		 * allocate and discard lists overlap, move the range to
		 * the live system's checkpoint available list.
		 */
		WT_ERR(__wt_block_extlist_overlap(session, block, b));

		/*
		 * If we're updating the live system's information, we're done.
		 */
		if (F_ISSET(next_ckpt, WT_CKPT_ADD))
			continue;

		/*
		 * We have to write the "to" checkpoint's extent lists out in
		 * new blocks, and update its cookie.
		 *
		 * Free the blocks used to hold the "to" checkpoint's extent
		 * lists; don't include the avail list, it's not changing.
		 */
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->alloc));
		WT_ERR(__ckpt_extlist_fblocks(session, block, &b->discard));

		F_SET(next_ckpt, WT_CKPT_UPDATE);
	}

	/* Update checkpoints marked for update. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_UPDATE))
			WT_ERR(__ckpt_update(
			    session, block, ckpt, ckpt->bpriv, false));

live_update:
	/* Truncate the file if that's possible. */
	WT_ERR(__wt_block_extlist_truncate(session, block, &ci->avail));

	/* Update the final, added checkpoint based on the live system. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (F_ISSET(ckpt, WT_CKPT_ADD)) {
			/*
			 * !!!
			 * Our caller wants the final checkpoint size.  Setting
			 * the size here violates layering, but the alternative
			 * is a call for the btree layer to crack the checkpoint
			 * cookie into its components, and that's a fair amount
			 * of work.
			 */
			ckpt->ckpt_size = ckpt_size;

			/*
			 * Set the rolling checkpoint size for the live system.
			 * The current size includes the current checkpoint's
			 * root page size (root pages are on the checkpoint's
			 * block allocation list as root pages are allocated
			 * with the usual block allocation functions). That's
			 * correct, but we don't want to include it in the size
			 * for the next checkpoint.
			 */
			ckpt_size -= ci->root_size;

			/*
			 * Additionally, we had a bug for a while where the live
			 * checkpoint size grew without bound. We can't sanity
			 * check the value, that would require walking the tree
			 * as part of the checkpoint. Bound any bug at the size
			 * of the file.
			 * It isn't practical to assert that the value is within
			 * bounds since databases created with older versions
			 * of WiredTiger (2.8.0) would likely see an error.
			 */
			ci->ckpt_size =
			    WT_MIN(ckpt_size, (uint64_t)block->size);

			WT_ERR(__ckpt_update(session, block, ckpt, ci, true));
		}

	/*
	 * Reset the live system's alloc and discard extent lists, leave the
	 * avail list alone.  This includes freeing a lot of extents, so do it
	 * outside of the system's lock by copying and resetting the original,
	 * then doing the work later.
	 */
	ci->ckpt_alloc = ci->alloc;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->alloc, "live", "alloc", false));
	ci->ckpt_discard = ci->discard;
	WT_ERR(__wt_block_extlist_init(
	    session, &ci->discard, "live", "discard", false));

#ifdef HAVE_DIAGNOSTIC
	/*
	 * The first checkpoint in the system should always have an empty
	 * discard list.  If we've read that checkpoint and/or created it,
	 * check.
	 */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if (!F_ISSET(ckpt, WT_CKPT_DELETE))
			break;
	if ((a = ckpt->bpriv) == NULL)
		a = &block->live;
	if (a->discard.entries != 0)
		WT_ERR_MSG(session, WT_ERROR,
		    "first checkpoint incorrectly has blocks on the discard "
		    "list");
#endif

err:	if (ret != 0 && fatal)
		ret = __wt_block_panic(session, ret,
		    "%s: fatal checkpoint failure", block->name);

	if (locked)
		__wt_spin_unlock(session, &block->live_lock);

	/* Discard any checkpoint information we loaded. */
	WT_CKPT_FOREACH(ckptbase, ckpt)
		if ((ci = ckpt->bpriv) != NULL)
			__wt_block_ckpt_destroy(session, ci);

	__wt_scr_free(session, &tmp);
	return (ret);
}
Example #21
int
run_truncate(CONFIG *cfg, CONFIG_THREAD *thread,
    WT_CURSOR *cursor, WT_SESSION *session, int *truncatedp)
{
	TRUNCATE_CONFIG *trunc_cfg;
	TRUNCATE_QUEUE_ENTRY *truncate_item;
	char *truncate_key;
	int ret, t_ret;
	uint64_t used_stone_gap;

	ret = 0;
	trunc_cfg = &thread->trunc_cfg;

	*truncatedp = 0;
	/* Update the total inserts */
	trunc_cfg->total_inserts = sum_insert_ops(cfg);
	trunc_cfg->expected_total +=
	    (trunc_cfg->total_inserts - trunc_cfg->last_total_inserts);
	trunc_cfg->last_total_inserts = trunc_cfg->total_inserts;

	/* We are done if there isn't enough data to trigger a new milestone. */
	if (trunc_cfg->expected_total <= thread->workload->truncate_count)
		return (0);

	/*
	 * If we are falling behind and using more than one stone per lap, we
	 * should widen the stone gap for this lap to try to catch up quicker.
	 */
	if (trunc_cfg->expected_total >
	    thread->workload->truncate_count + trunc_cfg->stone_gap) {
		/*
		 * Increase the multiplier until we create stones that are
		 * almost large enough to truncate the whole expected table size
		 * in one operation.
		 */
		trunc_cfg->catchup_multiplier =
		    WT_MIN(trunc_cfg->catchup_multiplier + 1,
		    trunc_cfg->needed_stones - 1);
	} else {
		/* Back off if we start seeing an improvement */
		trunc_cfg->catchup_multiplier =
		    WT_MAX(trunc_cfg->catchup_multiplier - 1, 1);
	}
	used_stone_gap = trunc_cfg->stone_gap * trunc_cfg->catchup_multiplier;

	while (trunc_cfg->num_stones < trunc_cfg->needed_stones) {
		trunc_cfg->last_key += used_stone_gap;
		truncate_key = calloc(cfg->key_sz, 1);
		if (truncate_key == NULL) {
			lprintf(cfg, ENOMEM, 0,
			    "truncate: couldn't allocate key array");
			return (ENOMEM);
		}
		truncate_item = calloc(sizeof(TRUNCATE_QUEUE_ENTRY), 1);
		if (truncate_item == NULL) {
			free(truncate_key);
			lprintf(cfg, ENOMEM, 0,
			    "truncate: couldn't allocate item");
			return (ENOMEM);
		}
		generate_key(cfg, truncate_key, trunc_cfg->last_key);
		truncate_item->key = truncate_key;
		truncate_item->diff = used_stone_gap;
		TAILQ_INSERT_TAIL(&cfg->stone_head, truncate_item, q);
		trunc_cfg->num_stones++;
	}

	/* We are done if there isn't enough data to trigger a truncate. */
	if (trunc_cfg->num_stones == 0 ||
	    trunc_cfg->expected_total <= thread->workload->truncate_count)
		return (0);

	truncate_item = TAILQ_FIRST(&cfg->stone_head);
	trunc_cfg->num_stones--;
	TAILQ_REMOVE(&cfg->stone_head, truncate_item, q);
	cursor->set_key(cursor, truncate_item->key);
	if ((ret = cursor->search(cursor)) != 0) {
		lprintf(cfg, ret, 0, "Truncate search: failed");
		goto err;
	}

	if ((ret = session->truncate(session, NULL, NULL, cursor, NULL)) != 0) {
		lprintf(cfg, ret, 0, "Truncate: failed");
		goto err;
	}

	*truncatedp = 1;
	trunc_cfg->expected_total -= truncate_item->diff;

err:	free(truncate_item->key);
	free(truncate_item);
	t_ret = cursor->reset(cursor);
	if (t_ret != 0)
		lprintf(cfg, t_ret, 0, "Cursor reset failed");
	if (ret == 0 && t_ret != 0)
		ret = t_ret;
	return (ret);
}
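To make the catch-up logic concrete with hypothetical numbers: if stone_gap is 1000 keys and the thread has fallen behind, catchup_multiplier climbs toward needed_stones - 1, so with a multiplier of 3 each queued stone spans used_stone_gap = 3000 keys and a single truncate reclaims three laps' worth of inserts.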
Example #22
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	u_int sleep_cnt, wait_cnt;
	int busy, cache_work, force_attempts, oldgen, stalled;

	btree = S2BT(session);
	stalled = 0;

	for (force_attempts = oldgen = 0, sleep_cnt = wait_cnt = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory, read it. If this thread is
			 * allowed to do eviction work, check for space in the
			 * cache.
			 */
			if (!LF_ISSET(WT_READ_NO_EVICT))
				WT_RET(__wt_cache_eviction_check(
				    session, 1, NULL));
			WT_RET(__page_read(session, ref));
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on another thread's read, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
			stalled = 1;
			break;
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);

			/* Waiting on eviction, stall. */
			WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
			stalled = 1;
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory.
			 *
			 * Get a hazard pointer if one is required. We cannot
			 * be evicting if no hazard pointer is required, we're
			 * done.
			 */
			if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
				goto skip_evict;

			/*
			 * The expected reason we can't get a hazard pointer is
			 * because the page is being evicted, yield, try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy) {
				WT_STAT_FAST_CONN_INCR(
				    session, page_busy_blocked);
				break;
			}

			/*
			 * If eviction is configured for this file, check to see
			 * if the page qualifies for forced eviction and update
			 * the page's generation number. If eviction isn't being
			 * done on this file, we're done.
			 */
			if (LF_ISSET(WT_READ_NO_EVICT) ||
			    F_ISSET(session, WT_SESSION_NO_EVICTION) ||
			    F_ISSET(btree, WT_BTREE_NO_EVICTION))
				goto skip_evict;

			/*
			 * Forcibly evict pages that are too big.
			 */
			page = ref->page;
			if (force_attempts < 10 &&
			    __evict_force_check(session, page)) {
				++force_attempts;
				ret = __wt_page_release_evict(session, ref);
				/* If forced eviction fails, stall. */
				if (ret == EBUSY) {
					ret = 0;
					WT_STAT_FAST_CONN_INCR(session,
					    page_forcible_evict_blocked);
					stalled = 1;
					break;
				}
				WT_RET(ret);

				/*
				 * The result of a successful forced eviction
				 * is a page-state transition (potentially to
				 * an in-memory page we can use, or a restart
				 * return for our caller), continue the outer
				 * page-acquisition loop.
				 */
				continue;
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen != WT_READGEN_OLDEST &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_bump(session);
skip_evict:
			/*
			 * Check if we need an autocommit transaction.
			 * Starting a transaction can trigger eviction, so skip
			 * it if eviction isn't permitted.
			 */
			return (LF_ISSET(WT_READ_NO_EVICT) ? 0 :
			    __wt_txn_autocommit_check(session));
		WT_ILLEGAL_VALUE(session);
		}

		/*
		 * We failed to get the page -- yield before retrying, and if
		 * we've yielded enough times, start sleeping so we don't burn
		 * CPU to no purpose.
		 */
		if (stalled)
			wait_cnt += 1000;
		else if (++wait_cnt < 1000) {
			__wt_yield();
			continue;
		}

		/*
		 * If stalling and this thread is allowed to do eviction work,
		 * check if the cache needs help. If we do work for the cache,
		 * substitute that for a sleep.
		 */
		if (!LF_ISSET(WT_READ_NO_EVICT)) {
			WT_RET(
			    __wt_cache_eviction_check(session, 1, &cache_work));
			if (cache_work)
				continue;
		}
		sleep_cnt = WT_MIN(sleep_cnt + 1000, 10000);
		WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
		__wt_sleep(0, sleep_cnt);
	}
}
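The back-off at the bottom of the loop is capped linear: a stalled thread bumps wait_cnt by 1000 so it skips the pure-yield phase, and sleep_cnt ratchets up by 1000 microseconds per retry to a 10000-microsecond (10ms) ceiling, so a page stuck behind eviction costs bounded CPU rather than a hot spin.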
Example #23
/*
 * __wt_hazard_set --
 *	Set a hazard pointer.
 */
int
__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, int *busyp
#ifdef HAVE_DIAGNOSTIC
                , const char *file, int line
#endif
               )
{
    WT_BTREE *btree;
    WT_HAZARD *hp;
    int restarts = 0;

    btree = S2BT(session);
    *busyp = 0;

    /* If a file can never be evicted, hazard pointers aren't required. */
    if (F_ISSET(btree, WT_BTREE_NO_HAZARD))
        return (0);

    /*
     * Do the dance:
     *
     * The memory location which makes a page "real" is the WT_REF's state
     * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the
     * page eviction server.
     *
     * Add the WT_REF reference to the session's hazard list and flush the
     * write, then see if the page's state is still valid.  If so, we can
     * use the page because the page eviction server will see our hazard
     * pointer before it discards the page (the eviction server sets the
     * state to WT_REF_LOCKED, then flushes memory and checks the hazard
     * pointers).
     *
     * For sessions with many active hazard pointers, skip most of the
     * active slots: there may be a free slot in there, but checking is
     * expensive.  Most hazard pointers are released quickly: optimize
     * for that case.
     */
    for (hp = session->hazard + session->nhazard;; ++hp) {
        /* Expand the number of hazard pointers if available. */
        if (hp >= session->hazard + session->hazard_size) {
            if (session->hazard_size >= S2C(session)->hazard_max)
                break;
            /* Restart the search. */
            if (session->nhazard < session->hazard_size &&
                    restarts++ == 0) {
                hp = session->hazard;
                continue;
            }
            WT_PUBLISH(session->hazard_size,
                       WT_MIN(session->hazard_size + WT_HAZARD_INCR,
                              S2C(session)->hazard_max));
        }

        if (hp->page != NULL)
            continue;

        hp->page = ref->page;
#ifdef HAVE_DIAGNOSTIC
        hp->file = file;
        hp->line = line;
#endif
        /* Publish the hazard pointer before reading page's state. */
        WT_FULL_BARRIER();

        /*
         * Check if the page state is still valid, where valid means a
         * state of WT_REF_MEM or WT_REF_EVICT_WALK and the pointer is
         * unchanged.  (The pointer can change, it means the page was
         * evicted between the time we set our hazard pointer and the
         * publication.  It would theoretically be possible for the
         * page to be evicted and a different page read into the same
         * memory, so the pointer hasn't changed but the contents have.
         * That's OK, we found this page using the tree's key space,
         * whatever page we find here is the page for us to use.)
         */
        if (ref->page == hp->page &&
                (ref->state == WT_REF_MEM ||
                 ref->state == WT_REF_EVICT_WALK)) {
            WT_VERBOSE_RET(session, hazard,
                           "session %p hazard %p: set", session, ref->page);

            ++session->nhazard;
            return (0);
        }

        /*
         * The page isn't available, it's being considered for eviction
         * (or being evicted, for all we know).  If the eviction server
         * sees our hazard pointer before evicting the page, it will
         * return the page to use, no harm done, if it doesn't, it will
         * go ahead and complete the eviction.
         *
         * We don't bother publishing this update: the worst case is we
         * prevent some random page from being evicted.
         */
        hp->page = NULL;
        *busyp = 1;
        return (0);
    }

    __wt_errx(session,
              "session %p: hazard pointer table full", session);
#ifdef HAVE_DIAGNOSTIC
    __hazard_dump(session);
#endif

    return (ENOMEM);
}
Example #24
/*
 * __wt_hazard_set --
 *	Set a hazard pointer.
 */
int
__wt_hazard_set(WT_SESSION_IMPL *session, WT_REF *ref, bool *busyp
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_BTREE *btree;
	WT_CONNECTION_IMPL *conn;
	WT_HAZARD *hp;
	int restarts = 0;

	btree = S2BT(session);
	conn = S2C(session);
	*busyp = false;

	/* If a file can never be evicted, hazard pointers aren't required. */
	if (F_ISSET(btree, WT_BTREE_IN_MEMORY))
		return (0);

	/*
	 * Do the dance:
	 *
	 * The memory location which makes a page "real" is the WT_REF's state
	 * of WT_REF_MEM, which can be set to WT_REF_LOCKED at any time by the
	 * page eviction server.
	 *
	 * Add the WT_REF reference to the session's hazard list and flush the
	 * write, then see if the page's state is still valid.  If so, we can
	 * use the page because the page eviction server will see our hazard
	 * pointer before it discards the page (the eviction server sets the
	 * state to WT_REF_LOCKED, then flushes memory and checks the hazard
	 * pointers).
	 *
	 * For sessions with many active hazard pointers, skip most of the
	 * active slots: there may be a free slot in there, but checking is
	 * expensive.  Most hazard pointers are released quickly: optimize
	 * for that case.
	 */
	for (hp = session->hazard + session->nhazard;; ++hp) {
		/*
		 * If we get to the end of the array, either:
		 * 1. If we know there are free slots somewhere, and this is
		 *    the first time through, continue the search from the
		 *    start.  Don't actually continue the loop because that
		 *    will skip the first slot.
		 * 2. If we have searched all the way through and we have
		 *    allocated the maximum number of slots, give up.
		 * 3. Allocate another increment of slots, up to the maximum.
		 *    The slot we are on should now be available.
		 */
		if (hp >= session->hazard + session->hazard_size) {
			if (session->nhazard < session->hazard_size &&
			    restarts++ == 0)
				hp = session->hazard;
			else if (session->hazard_size >= conn->hazard_max)
				break;
			else
				WT_PUBLISH(session->hazard_size, WT_MIN(
				    session->hazard_size + WT_HAZARD_INCR,
				    conn->hazard_max));
		}

		if (hp->page != NULL)
			continue;

		hp->page = ref->page;
#ifdef HAVE_DIAGNOSTIC
		hp->file = file;
		hp->line = line;
#endif
		/* Publish the hazard pointer before reading page's state. */
		WT_FULL_BARRIER();

		/*
		 * Check if the page state is still valid, where valid means a
		 * state of WT_REF_MEM and the pointer is unchanged.  (The
		 * pointer can change, it means the page was evicted between
		 * the time we set our hazard pointer and the publication.  It
		 * would theoretically be possible for the page to be evicted
		 * and a different page read into the same memory, so the
		 * pointer hasn't changed but the contents have.  That's OK, we
		 * found this page using the tree's key space, whatever page we
		 * find here is the page for us to use.)
		 */
		if (ref->page == hp->page && ref->state == WT_REF_MEM) {
			++session->nhazard;
			return (0);
		}

		/*
		 * The page isn't available, it's being considered for eviction
		 * (or being evicted, for all we know).  If the eviction server
		 * sees our hazard pointer before evicting the page, it will
		 * return the page to use, no harm done, if it doesn't, it will
		 * go ahead and complete the eviction.
		 *
		 * We don't bother publishing this update: the worst case is we
		 * prevent some random page from being evicted.
		 */
		hp->page = NULL;
		*busyp = true;
		return (0);
	}

	__wt_errx(session,
	    "session %p: hazard pointer table full", (void *)session);
#ifdef HAVE_DIAGNOSTIC
	__hazard_dump(session);
#endif

	return (ENOMEM);
}
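The release side of this protocol is much simpler; below is a hedged sketch of what clearing a hazard pointer looks like, with the shape inferred from the set side above rather than copied from WiredTiger's actual __wt_hazard_clear:

/*
 * hazard_clear_sketch --
 *	Sketch: release a hazard pointer by nulling the slot holding the
 * page. No barrier is needed here; the worst case of a delayed write is
 * that some page briefly remains non-evictable.
 */
static void
hazard_clear_sketch(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_HAZARD *hp;

	for (hp = session->hazard;
	    hp < session->hazard + session->hazard_size; ++hp)
		if (hp->page == page) {
			hp->page = NULL;
			--session->nhazard;
			return;
		}
}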
Example #25
/*
 * __verify_tree --
 *	Verify a tree, recursively descending through it in depth-first fashion.
 * The page argument was physically verified (so we know it's correctly formed),
 * and the in-memory version built.  Our job is to check logical relationships
 * in the page and in the tree.
 */
static int
__verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs)
{
	WT_BM *bm;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_COL *cip;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_REF *child_ref;
	uint64_t recno;
	uint32_t entry, i;
	bool found;

	bm = S2BT(session)->bm;
	page = ref->page;

	unpack = &_unpack;
	WT_CLEAR(*unpack);	/* -Wuninitialized */

	WT_RET(__wt_verbose(session, WT_VERB_VERIFY, "%s %s",
	    __wt_page_addr_string(session, ref, vs->tmp1),
	    __wt_page_type_string(page->type)));

	/* Optionally dump the address. */
	if (vs->dump_address)
		WT_RET(__wt_msg(session, "%s %s",
		    __wt_page_addr_string(session, ref, vs->tmp1),
		    __wt_page_type_string(page->type)));

	/* Track the shape of the tree. */
	if (WT_PAGE_IS_INTERNAL(page))
		++vs->depth_internal[
		    WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)];
	else
		++vs->depth_leaf[
		    WT_MIN(vs->depth, WT_ELEMENTS(vs->depth_internal) - 1)];

	/*
	 * The page's physical structure was verified when it was read into
	 * memory by the read server thread, and then the in-memory version
	 * of the page was built. Now we make sure the page and tree are
	 * logically consistent.
	 *
	 * !!!
	 * The problem: (1) the read server has to build the in-memory version
	 * of the page because the read server is the thread that flags when
	 * any thread can access the page in the tree; (2) we can't build the
	 * in-memory version of the page until the physical structure is known
	 * to be OK, so the read server has to verify at least the physical
	 * structure of the page; (3) doing complete page verification requires
	 * reading additional pages (for example, overflow keys imply reading
	 * overflow pages in order to test the key's order in the page); (4)
	 * the read server cannot read additional pages because it will hang
	 * waiting on itself.  For this reason, we split page verification
	 * into a physical verification, which allows the in-memory version
	 * of the page to be built, and then a subsequent logical verification
	 * which happens here.
	 *
	 * Report progress occasionally.
	 */
#define	WT_VERIFY_PROGRESS_INTERVAL	100
	if (++vs->fcnt % WT_VERIFY_PROGRESS_INTERVAL == 0)
		WT_RET(__wt_progress(session, NULL, vs->fcnt));

#ifdef HAVE_DIAGNOSTIC
	/* Optionally dump the blocks or page in debugging mode. */
	if (vs->dump_blocks)
		WT_RET(__wt_debug_disk(session, page->dsk, NULL));
	if (vs->dump_pages)
		WT_RET(__wt_debug_page(session, page, NULL));
#endif

	/*
	 * Column-store key order checks: check the page's record number and
	 * then update the total record count.
	 */
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		recno = page->pg_fix_recno;
		goto recno_chk;
	case WT_PAGE_COL_INT:
		recno = page->pg_intl_recno;
		goto recno_chk;
	case WT_PAGE_COL_VAR:
		recno = page->pg_var_recno;
recno_chk:	if (recno != vs->record_total + 1)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s has a starting record of %" PRIu64
			    " when the expected starting record is %" PRIu64,
			    __wt_page_addr_string(session, ref, vs->tmp1),
			    recno, vs->record_total + 1);
		break;
	}
	switch (page->type) {
	case WT_PAGE_COL_FIX:
		vs->record_total += page->pg_fix_entries;
		break;
	case WT_PAGE_COL_VAR:
		recno = 0;
		WT_COL_FOREACH(page, cip, i)
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				++recno;
			else {
				__wt_cell_unpack(cell, unpack);
				recno += __wt_cell_rle(unpack);
			}
		vs->record_total += recno;
		break;
	}

	/*
	 * Row-store leaf page key order check: it's a depth-first traversal,
	 * the first key on this page should be larger than any key previously
	 * seen.
	 */
	switch (page->type) {
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_row_leaf_key_order(session, ref, vs));
		break;
	}

	/* If it's not the root page, unpack the parent cell. */
	if (!__wt_ref_is_root(ref)) {
		__wt_cell_unpack(ref->addr, unpack);

		/* Compare the parent cell against the page type. */
		switch (page->type) {
		case WT_PAGE_COL_FIX:
			if (unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_COL_VAR:
			if (unpack->raw != WT_CELL_ADDR_LEAF &&
			    unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_ROW_LEAF:
			if (unpack->raw != WT_CELL_ADDR_DEL &&
			    unpack->raw != WT_CELL_ADDR_LEAF &&
			    unpack->raw != WT_CELL_ADDR_LEAF_NO)
				goto celltype_err;
			break;
		case WT_PAGE_COL_INT:
		case WT_PAGE_ROW_INT:
			if (unpack->raw != WT_CELL_ADDR_INT)
celltype_err:			WT_RET_MSG(session, WT_ERROR,
				    "page at %s, of type %s, is referenced in "
				    "its parent by a cell of type %s",
				    __wt_page_addr_string(
					session, ref, vs->tmp1),
				    __wt_page_type_string(page->type),
				    __wt_cell_type_string(unpack->raw));
			break;
		}
	}

	/*
	 * Check overflow pages.  We check overflow cells separately from other
	 * tests that walk the page as it's simpler, and I don't care much how
	 * fast table verify runs.
	 */
	switch (page->type) {
	case WT_PAGE_COL_VAR:
	case WT_PAGE_ROW_INT:
	case WT_PAGE_ROW_LEAF:
		WT_RET(__verify_overflow_cell(session, ref, &found, vs));
		if (__wt_ref_is_root(ref) || page->type == WT_PAGE_ROW_INT)
			break;

		/*
		 * Object if a leaf-no-overflow address cell references a page
		 * with overflow keys, but don't object if a leaf address cell
		 * references a page without overflow keys.  Reconciliation
		 * doesn't guarantee every leaf page without overflow items will
		 * be a leaf-no-overflow type.
		 */
		if (found && unpack->raw == WT_CELL_ADDR_LEAF_NO)
			WT_RET_MSG(session, WT_ERROR,
			    "page at %s, of type %s and referenced in its "
			    "parent by a cell of type %s, contains overflow "
			    "items",
			    __wt_page_addr_string(session, ref, vs->tmp1),
			    __wt_page_type_string(page->type),
			    __wt_cell_type_string(WT_CELL_ADDR_LEAF_NO));
		break;
	}

	/* Check tree connections and recursively descend the tree. */
	switch (page->type) {
	case WT_PAGE_COL_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * record number should be 1 more than the total records
			 * reviewed to this point.
			 */
			++entry;
			if (child_ref->key.recno != vs->record_total + 1) {
				WT_RET_MSG(session, WT_ERROR,
				    "the starting record number in entry %"
				    PRIu32 " of the column internal page at "
				    "%s is %" PRIu64 " and the expected "
				    "starting record number is %" PRIu64,
				    entry,
				    __wt_page_addr_string(
				    session, child_ref, vs->tmp1),
				    child_ref->key.recno,
				    vs->record_total + 1);
			}

			/* Verify the subtree. */
			++vs->depth;
			WT_RET(__wt_page_in(session, child_ref, 0));
			ret = __verify_tree(session, child_ref, vs);
			WT_TRET(__wt_page_release(session, child_ref, 0));
			--vs->depth;
			WT_RET(ret);

			__wt_cell_unpack(child_ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		} WT_INTL_FOREACH_END;
		break;
	case WT_PAGE_ROW_INT:
		/* For each entry in an internal page, verify the subtree. */
		entry = 0;
		WT_INTL_FOREACH_BEGIN(session, page, child_ref) {
			/*
			 * It's a depth-first traversal: this entry's starting
			 * key should be larger than the largest key previously
			 * reviewed.
			 *
			 * The 0th key of any internal page is magic, and we
			 * can't test against it.
			 */
			++entry;
			if (entry != 1)
				WT_RET(__verify_row_int_key_order(
				    session, page, child_ref, entry, vs));

			/* Verify the subtree. */
			++vs->depth;
			WT_RET(__wt_page_in(session, child_ref, 0));
			ret = __verify_tree(session, child_ref, vs);
			WT_TRET(__wt_page_release(session, child_ref, 0));
			--vs->depth;
			WT_RET(ret);

			__wt_cell_unpack(child_ref->addr, unpack);
			WT_RET(bm->verify_addr(
			    bm, session, unpack->data, unpack->size));
		} WT_INTL_FOREACH_END;
		break;
	}
	return (0);
}
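
The depth-first record-count check in the column-store branch above is easy to
get wrong, so here is a minimal standalone sketch of just that invariant (the
toy_* names are hypothetical, not WiredTiger code): each child's starting
record number must be exactly one past the records counted so far.

#include <inttypes.h>
#include <stdio.h>

struct toy_child {
	uint64_t start_recno;	/* First record number in the child. */
	uint64_t nrecords;	/* Records the child contains. */
};

static int
toy_check_recnos(const struct toy_child *kids, int nkids)
{
	uint64_t total;
	int i;

	total = 0;
	for (i = 0; i < nkids; i++) {
		if (kids[i].start_recno != total + 1) {
			(void)fprintf(stderr,
			    "entry %d starts at %" PRIu64
			    ", expected %" PRIu64 "\n",
			    i, kids[i].start_recno, total + 1);
			return (-1);
		}
		total += kids[i].nrecords;
	}
	return (0);
}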
Beispiel #26
0
/*
 * __wt_log_scan --
 *	Scan the logs, calling a function on each record found.
 */
int
__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
    int (*func)(WT_SESSION_IMPL *session,
    WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie)
{
	WT_CONNECTION_IMPL *conn;
	WT_ITEM buf;
	WT_DECL_RET;
	WT_FH *log_fh;
	WT_LOG *log;
	WT_LOG_RECORD *logrec;
	WT_LSN end_lsn, rd_lsn, start_lsn;
	off_t log_size;
	uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
	u_int i, logcount;
	int eol;
	char **logfiles;

	conn = S2C(session);
	log = conn->log;
	log_fh = NULL;
	logcount = 0;
	logfiles = NULL;
	eol = 0;
	WT_CLEAR(buf);

	/*
	 * If the caller did not give us a callback function, there is
	 * nothing to do.
	 */
	if (func == NULL)
		return (0);

	if (LF_ISSET(WT_LOGSCAN_RECOVER))
		WT_RET(__wt_verbose(session, WT_VERB_LOG,
		    "__wt_log_scan truncating to %u/%" PRIuMAX,
		    log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset));

	if (log != NULL) {
		allocsize = log->allocsize;

		if (lsnp == NULL) {
			if (LF_ISSET(WT_LOGSCAN_FIRST))
				start_lsn = log->first_lsn;
			else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
				start_lsn = log->ckpt_lsn;
			else
				return (WT_ERROR);	/* Illegal usage */
		} else {
			if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
				WT_RET_MSG(session, WT_ERROR,
			    "choose either a start LSN or a start flag");

			/* Offsets must be on allocation boundaries. */
			if (lsnp->offset % allocsize != 0 ||
			    lsnp->file > log->fileid)
				return (WT_NOTFOUND);

			/*
			 * Log cursors may not know the starting LSN.  If an
			 * LSN pointer is passed in, but it is the INIT_LSN,
			 * start from the first_lsn.
			 */
			start_lsn = *lsnp;
			if (IS_INIT_LSN(&start_lsn))
				start_lsn = log->first_lsn;
		}
		end_lsn = log->alloc_lsn;
	} else {
		/*
		 * If logging is not configured, we can still print out the log
		 * if log files exist.  We just need to set the LSNs from what
		 * is in the files versus what is in the live connection.
		 *
		 * Set allocsize to the minimum alignment it could be.  Larger
		 * records and larger allocation boundaries should always be
		 * a multiple of this.
		 */
		allocsize = LOG_ALIGN;
		lastlog = 0;
		firstlog = UINT32_MAX;
		WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
		if (logcount == 0)
			/*
			 * Return ENOTSUP if no log files exist.
			 */
			return (ENOTSUP);
		for (i = 0; i < logcount; i++) {
			WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
			    &lognum));
			lastlog = WT_MAX(lastlog, lognum);
			firstlog = WT_MIN(firstlog, lognum);
		}
		start_lsn.file = firstlog;
		end_lsn.file = lastlog;
		start_lsn.offset = end_lsn.offset = 0;
		__wt_log_files_free(session, logfiles, logcount);
		logfiles = NULL;
	}
	WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
	WT_ERR(__log_filesize(session, log_fh, &log_size));
	rd_lsn = start_lsn;
	WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN));
	for (;;) {
		if (rd_lsn.offset + allocsize > log_size) {
advance:
			/*
			 * If we read the last record, go to the next file.
			 */
			WT_ERR(__wt_close(session, log_fh));
			log_fh = NULL;
			eol = 1;
			/*
			 * Truncate this log file before we move to the next.
			 */
			if (LF_ISSET(WT_LOGSCAN_RECOVER))
				WT_ERR(__log_truncate(session, &rd_lsn, 1));
			rd_lsn.file++;
			rd_lsn.offset = 0;
			/*
			 * Avoid an error message when we reach the end of
			 * the log by checking here.
			 */
			if (rd_lsn.file > end_lsn.file)
				break;
			WT_ERR(__log_openfile(
			    session, 0, &log_fh, rd_lsn.file));
			WT_ERR(__log_filesize(session, log_fh, &log_size));
			continue;
		}
		/*
		 * Read the minimum allocation size a record could be.
		 */
		WT_ASSERT(session, buf.memsize >= allocsize);
		WT_ERR(__wt_read(session,
		    log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem));
		/*
		 * The first 4 bytes are the real record length.  See if we
		 * need to read more than the allocation size.  We expect
		 * that we rarely will have to read more.  Most log records
		 * will be fairly small.
		 */
		reclen = *(uint32_t *)buf.mem;
		/*
		 * Log files are pre-allocated.  We never expect a zero length
		 * unless we've reached the end of the log.  The log can be
		 * written out of order, so when recovery finds the end of
		 * the log, truncate the file and remove any later log files
		 * that may exist.
		 */
		if (reclen == 0) {
			/* This LSN is the end. */
			break;
		}
		rdup_len = __wt_rduppo2(reclen, allocsize);
		if (reclen > allocsize) {
			/*
			 * The log file end could be the middle of this
			 * log record.
			 */
			if (rd_lsn.offset + rdup_len > log_size)
				goto advance;
			/*
			 * We need to round up and read in the full padded
			 * record, especially for direct I/O.
			 */
			WT_ERR(__wt_buf_grow(session, &buf, rdup_len));
			WT_ERR(__wt_read(session,
			    log_fh, rd_lsn.offset, (size_t)rdup_len, buf.mem));
			WT_STAT_FAST_CONN_INCR(session, log_scan_rereads);
		}
		/*
		 * We read in the record; now verify its checksum.
		 */
		buf.size = reclen;
		logrec = (WT_LOG_RECORD *)buf.mem;
		cksum = logrec->checksum;
		logrec->checksum = 0;
		logrec->checksum = __wt_cksum(logrec, logrec->len);
		if (logrec->checksum != cksum) {
			/*
			 * A checksum mismatch means we have reached the end of
			 * the useful part of the log.  This should be found on
			 * the first pass through recovery.  In the second pass
			 * where we truncate the log, this is where it should
			 * end.
			 */
			if (log != NULL)
				log->trunc_lsn = rd_lsn;
			break;
		}

		/*
		 * We have a valid log record.  If it is not the log file
		 * header, invoke the callback.
		 */
		WT_STAT_FAST_CONN_INCR(session, log_scan_records);
		if (rd_lsn.offset != 0) {
			WT_ERR((*func)(session, &buf, &rd_lsn, cookie));
			if (LF_ISSET(WT_LOGSCAN_ONE))
				break;
		}
		rd_lsn.offset += (off_t)rdup_len;
	}

	/* Truncate if we're in recovery. */
	if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
	    LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
		WT_ERR(__log_truncate(session, &rd_lsn, 0));

err:	WT_STAT_FAST_CONN_INCR(session, log_scans);
	if (logfiles != NULL)
		__wt_log_files_free(session, logfiles, logcount);
	__wt_buf_free(session, &buf);
	/*
	 * If the caller wants one record and it is at the end of the log,
	 * return WT_NOTFOUND.
	 */
	if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
		ret = WT_NOTFOUND;
	if (ret == ENOENT)
		ret = 0;
	if (log_fh != NULL)
		WT_TRET(__wt_close(session, log_fh));
	return (ret);
}
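
The scan loop above validates each record with an embedded checksum: the
checksum is stored inside the record and computed with that field zeroed.
Below is a hedged standalone sketch of the pattern; toy_cksum() is a stand-in
hash, not WiredTiger's __wt_cksum(), and the struct layout is illustrative
only.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

struct toy_logrec {
	uint32_t len;		/* Total record length, in bytes. */
	uint32_t checksum;	/* Computed with this field set to 0. */
	/* Payload follows. */
};

static uint32_t
toy_cksum(const void *p, size_t len)
{
	const uint8_t *b;
	uint32_t sum;

	sum = 0;
	for (b = p; len > 0; --len, ++b)
		sum = sum * 31 + *b;	/* Toy hash, not CRC32C. */
	return (sum);
}

static bool
toy_logrec_valid(struct toy_logrec *rec)
{
	uint32_t saved;
	bool ok;

	saved = rec->checksum;
	rec->checksum = 0;
	ok = toy_cksum(rec, rec->len) == saved;
	rec->checksum = saved;	/* Restore the original contents. */
	return (ok);
}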
Beispiel #27
0
/*
 * __curjoin_iter_set_entry --
 *	Set the current entry for an iterator.
 */
static int
__curjoin_iter_set_entry(WT_CURSOR_JOIN_ITER *iter, u_int entry_pos)
{
	WT_CURSOR *c, *to_dup;
	WT_CURSOR_JOIN *cjoin, *topjoin;
	WT_CURSOR_JOIN_ENTRY *entry;
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	size_t size;
	const char *raw_cfg[] = { WT_CONFIG_BASE(
	    iter->session, WT_SESSION_open_cursor), "raw", NULL };
	const char *def_cfg[] = { WT_CONFIG_BASE(
	    iter->session, WT_SESSION_open_cursor), NULL };
	const char **config;
	char *uri;

	session = iter->session;
	cjoin = iter->cjoin;
	uri = NULL;
	entry = iter->entry = &cjoin->entries[entry_pos];
	iter->positioned = false;
	iter->entry_pos = entry_pos;
	iter->end_pos = 0;

	iter->is_equal = (entry->ends_next == 1 &&
	    WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_EQ);
	iter->end_skip = (entry->ends_next > 0 &&
	    WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_GE) ? 1 : 0;

	iter->end_count = WT_MIN(1, entry->ends_next);
	if (F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION)) {
		iter->entry_count = cjoin->entries_next;
		if (iter->is_equal)
			iter->end_count = entry->ends_next;
	} else
		iter->entry_count = 1;
	WT_ASSERT(iter->session, iter->entry_pos < iter->entry_count);

	entry->stats.iterated = 0;

	if (entry->subjoin == NULL) {
		for (topjoin = iter->cjoin; topjoin->parent != NULL;
		     topjoin = topjoin->parent)
			;
		to_dup = entry->ends[0].cursor;

		if (F_ISSET((WT_CURSOR *)topjoin, WT_CURSTD_RAW))
			config = &raw_cfg[0];
		else
			config = &def_cfg[0];

		size = strlen(to_dup->internal_uri) + 3;
		WT_ERR(__wt_calloc(session, size, 1, &uri));
		WT_ERR(__wt_snprintf(uri, size, "%s()", to_dup->internal_uri));
		if ((c = iter->cursor) == NULL || strcmp(c->uri, uri) != 0) {
			iter->cursor = NULL;
			if (c != NULL)
				WT_ERR(c->close(c));
			WT_ERR(__wt_open_cursor(session, uri,
			    (WT_CURSOR *)topjoin, config, &iter->cursor));
		}
		WT_ERR(__wt_cursor_dup_position(to_dup, iter->cursor));
	} else if (iter->cursor != NULL) {
		WT_ERR(iter->cursor->close(iter->cursor));
		iter->cursor = NULL;
	}

err:	__wt_free(session, uri);
	return (ret);
}
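
The URI handling above duplicates a cursor by appending "()" (an empty
projection) to the underlying table URI, reusing the cached cursor when the
URI is unchanged. A minimal sketch of the string-building step, using plain
malloc()/snprintf() in place of the __wt_* wrappers (toy_projection_uri() is
a hypothetical helper):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *
toy_projection_uri(const char *base)
{
	size_t size;
	char *uri;

	size = strlen(base) + 3;	/* "()" plus the terminating NUL. */
	if ((uri = malloc(size)) == NULL)
		return (NULL);
	(void)snprintf(uri, size, "%s()", base);
	return (uri);
}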
Beispiel #28
0
void
bdb_truncate(uint64_t start, uint64_t stop)
{
	DBC *dbc = g.dbc;
	size_t len;
	int cmp, ret, notfound;
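
	/*
	 * Note: key, value and keyitem are file-scope globals shared with
	 * the other bdb_* helpers in this test program.
	 */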

	/* Deleting a fixed-length item is the same as setting the bits to 0. */
	if (g.type == FIX) {
		/*
		 * If we're deleting from/to the start/end of the database,
		 * correct for the number of records we have.
		 */
		if (start == 0)
			start = 1;
		if (stop == 0)
			stop = g.rows;
		for (; start <= stop; ++start)
			bdb_remove(start, &notfound);
		return;
	}

	if (start == 0) {
		ret = dbc->get(dbc, &key, &value, DB_FIRST);
		if (ret != 0 && ret != DB_NOTFOUND)
			bdb_die(ret, "%s", "dbc.get: DB_FIRST");
	} else {
		key_gen(&keyitem, start);
		key.data = (void *)keyitem.data;
		key.size = (u_int32_t)keyitem.size;
		ret = dbc->get(dbc, &key, &value, DB_SET_RANGE);
		if (ret != 0 && ret != DB_NOTFOUND)
			bdb_die(ret, "dbc.get: DB_SET: {%.*s}",
			    (int)key.size, (char *)key.data);
	}
	if (ret == DB_NOTFOUND)
		return;

	if (stop == 0) {
		do {
			ret = dbc->del(dbc, 0);
			if (ret != 0 && ret != DB_NOTFOUND)
				bdb_die(ret, "dbc.del: {%.*s}",
				    (int)key.size, (char *)key.data);
		} while ((ret = dbc->get(dbc, &key, &value, DB_NEXT)) == 0);
	} else {
		key_gen(&keyitem, stop);
		do {
			len = WT_MIN(key.size, keyitem.size);
			cmp = memcmp(key.data, keyitem.data, len);
			if (g.c_reverse) {
				if (cmp < 0 ||
				    (cmp == 0 && key.size < keyitem.size))
					break;
			} else
				if (cmp > 0 ||
				    (cmp == 0 && key.size > keyitem.size))
					break;
			ret = dbc->del(dbc, 0);
			if (ret != 0 && ret != DB_NOTFOUND)
				bdb_die(ret, "dbc.del: {%.*s}",
				    (int)key.size, (char *)key.data);
		} while ((ret = dbc->get(dbc, &key, &value, DB_NEXT)) == 0);
	}
	if (ret != 0 && ret != DB_NOTFOUND)
		bdb_die(ret, "%s", "dbc.get: DB_NEXT");
}
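
The stop-key test in the loop above compares key prefixes over the shorter of
the two lengths and breaks ties on length, flipping the sense for
reverse-ordered databases. The same comparison as a standalone sketch
(toy_key_past_stop() is hypothetical, not part of the test suite):

#include <stdbool.h>
#include <string.h>

static bool
toy_key_past_stop(const void *key, size_t key_size,
    const void *stop, size_t stop_size, bool reverse)
{
	size_t len;
	int cmp;

	len = key_size < stop_size ? key_size : stop_size;
	cmp = memcmp(key, stop, len);
	if (cmp == 0)		/* Shared prefix: shorter sorts first. */
		cmp = key_size < stop_size ?
		    -1 : (key_size > stop_size ? 1 : 0);
	return (reverse ? cmp < 0 : cmp > 0);
}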
Beispiel #29
0
/*
 * __wt_lsm_tree_throttle --
 *	Calculate whether LSM updates need to be throttled. Must be called
 *	with the LSM tree lock held.
 */
void
__wt_lsm_tree_throttle(
    WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree, int decrease_only)
{
	WT_LSM_CHUNK *last_chunk, **cp, *ondisk, *prev_chunk;
	uint64_t cache_sz, cache_used, oldtime, record_count, timediff;
	uint32_t in_memory, gen0_chunks;

	/* Never throttle in small trees. */
	if (lsm_tree->nchunks < 3) {
		lsm_tree->ckpt_throttle = lsm_tree->merge_throttle = 0;
		return;
	}

	cache_sz = S2C(session)->cache_size;

	/*
	 * In the steady state, we expect that the checkpoint worker thread
	 * will keep up with inserts.  If not, throttle the insert rate to
	 * avoid filling the cache with in-memory chunks.  Threads sleep every
	 * 100 operations, so take that into account in the calculation.
	 *
	 * Also throttle based on whether merge threads are keeping up.  If
	 * there are enough chunks that have never been merged we slow down
	 * inserts so that merges have some chance of keeping up.
	 *
	 * Count the number of in-memory chunks, the number of unmerged
	 * chunks on disk, and find the most recent on-disk chunk (if any).
	 */
	record_count = 1;
	gen0_chunks = in_memory = 0;
	ondisk = NULL;
	for (cp = lsm_tree->chunk + lsm_tree->nchunks - 1;
	    cp >= lsm_tree->chunk;
	    --cp)
		if (!F_ISSET(*cp, WT_LSM_CHUNK_ONDISK)) {
			record_count += (*cp)->count;
			++in_memory;
		} else {
			/*
			 * Assign ondisk to the last chunk that has been
			 * flushed since the tree was last opened (i.e., it's
			 * on disk and the stable flag is not set).
			 */
			if (ondisk == NULL &&
			    ((*cp)->generation == 0 &&
			    !F_ISSET(*cp, WT_LSM_CHUNK_STABLE)))
				ondisk = *cp;

			if ((*cp)->generation == 0 &&
			    !F_ISSET(*cp, WT_LSM_CHUNK_MERGING))
				++gen0_chunks;
		}

	last_chunk = lsm_tree->chunk[lsm_tree->nchunks - 1];

	/* Checkpoint throttling, based on the number of in-memory chunks. */
	if (!F_ISSET(lsm_tree, WT_LSM_TREE_THROTTLE) || in_memory <= 3)
		lsm_tree->ckpt_throttle = 0;
	else if (decrease_only)
		; /* Nothing to do */
	else if (ondisk == NULL) {
		/*
		 * No checkpoint has completed this run.  Keep slowing down
		 * inserts until one does.
		 */
		lsm_tree->ckpt_throttle =
		    WT_MAX(WT_LSM_THROTTLE_START, 2 * lsm_tree->ckpt_throttle);
	} else {
		WT_ASSERT(session,
		    WT_TIMECMP(last_chunk->create_ts, ondisk->create_ts) >= 0);
		timediff =
		    WT_TIMEDIFF(last_chunk->create_ts, ondisk->create_ts);
		lsm_tree->ckpt_throttle =
		    (long)((in_memory - 2) * timediff / (20 * record_count));

		/*
		 * Get more aggressive as in-memory chunks consume a larger
		 * proportion of the cache.  In-memory chunks are allowed to
		 * grow up to twice as large as the configured value when
		 * checkpoints aren't keeping up; that worst case is when
		 * this calculation is relevant.  There is nothing
		 * particularly special about the chosen multipliers.
		 */
		cache_used = in_memory * lsm_tree->chunk_size * 2;
		if (cache_used > cache_sz * 0.8)
			lsm_tree->ckpt_throttle *= 5;
	}

	/*
	 * Merge throttling, based on the number of on-disk, level 0 chunks.
	 *
	 * Don't throttle if the tree has fewer than a single level's worth
	 * of chunks.
	 */
	if (lsm_tree->nchunks < lsm_tree->merge_max)
		lsm_tree->merge_throttle = 0;
	else if (gen0_chunks < WT_LSM_MERGE_THROTTLE_THRESHOLD)
		WT_LSM_MERGE_THROTTLE_DECREASE(lsm_tree->merge_throttle);
	else if (!decrease_only)
		WT_LSM_MERGE_THROTTLE_INCREASE(lsm_tree->merge_throttle);

	/* Put an upper bound of 1s on both throttle calculations. */
	lsm_tree->ckpt_throttle = WT_MIN(1000000, lsm_tree->ckpt_throttle);
	lsm_tree->merge_throttle = WT_MIN(1000000, lsm_tree->merge_throttle);

	/*
	 * Update our estimate of how long each in-memory chunk stays active.
	 * Filter out some noise by keeping a weighted history of the
	 * calculated value.  Wait until we have enough chunks that we can
	 * check that the new value is sane: otherwise, after a long idle
	 * period, we can calculate a crazy value.
	 */
	if (in_memory > 1 && ondisk != NULL) {
		prev_chunk = lsm_tree->chunk[lsm_tree->nchunks - 2];
		WT_ASSERT(session, prev_chunk->generation == 0);
		WT_ASSERT(session, WT_TIMECMP(
		    last_chunk->create_ts, prev_chunk->create_ts) >= 0);
		timediff =
		    WT_TIMEDIFF(last_chunk->create_ts, prev_chunk->create_ts);
		WT_ASSERT(session,
		    WT_TIMECMP(prev_chunk->create_ts, ondisk->create_ts) >= 0);
		oldtime = WT_TIMEDIFF(prev_chunk->create_ts, ondisk->create_ts);
		if (timediff < 10 * oldtime)
			lsm_tree->chunk_fill_ms =
			    (3 * lsm_tree->chunk_fill_ms +
			    timediff / 1000000) / 4;
	}
}
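
The chunk_fill_ms update at the end of the function is a 3:1 weighted moving
average guarded by a sanity filter that rejects samples an order of magnitude
above recent history (for example, after a long idle period). A minimal
sketch of the same idea, with hypothetical names:

#include <stdint.h>

static void
toy_update_fill_estimate(uint64_t *est_ms, uint64_t sample_ms, uint64_t prev_ms)
{
	/* Reject samples wildly larger than the previous interval. */
	if (prev_ms != 0 && sample_ms >= 10 * prev_ms)
		return;

	/* Weighted history: three parts old estimate, one part sample. */
	*est_ms = (3 * *est_ms + sample_ms) / 4;
}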
Beispiel #30
0
/*
 * __wt_page_in_func --
 *	Acquire a hazard pointer to a page; if the page is not in-memory,
 *	read it from the disk and build an in-memory version.
 */
int
__wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags
#ifdef HAVE_DIAGNOSTIC
    , const char *file, int line
#endif
    )
{
	WT_DECL_RET;
	WT_PAGE *page;
	u_int sleep_cnt, wait_cnt;
	int busy, force_attempts, oldgen;

	for (force_attempts = oldgen = 0, wait_cnt = 0;;) {
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_DELETED:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);

			/*
			 * The page isn't in memory: make sure there is
			 * space in the cache, then attempt to read it.
			 */
			WT_RET(__wt_cache_full_check(session));
			WT_RET(__wt_cache_read(session, ref));
			oldgen = LF_ISSET(WT_READ_WONT_NEED) ||
			    F_ISSET(session, WT_SESSION_NO_CACHE);
			continue;
		case WT_REF_READING:
			if (LF_ISSET(WT_READ_CACHE))
				return (WT_NOTFOUND);
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			WT_STAT_FAST_CONN_INCR(session, page_read_blocked);
			break;
		case WT_REF_LOCKED:
			if (LF_ISSET(WT_READ_NO_WAIT))
				return (WT_NOTFOUND);
			WT_STAT_FAST_CONN_INCR(session, page_locked_blocked);
			break;
		case WT_REF_SPLIT:
			return (WT_RESTART);
		case WT_REF_MEM:
			/*
			 * The page is in memory: get a hazard pointer, update
			 * the page's LRU and return.  The expected reason we
			 * can't get a hazard pointer is because the page is
			 * being evicted; yield and try again.
			 */
#ifdef HAVE_DIAGNOSTIC
			WT_RET(
			    __wt_hazard_set(session, ref, &busy, file, line));
#else
			WT_RET(__wt_hazard_set(session, ref, &busy));
#endif
			if (busy) {
				WT_STAT_FAST_CONN_INCR(
				    session, page_busy_blocked);
				break;
			}

			page = ref->page;
			WT_ASSERT(session, page != NULL);

			/*
			 * Forcibly evict pages that are too big.
			 */
			if (force_attempts < 10 &&
			    __evict_force_check(session, page, flags)) {
				++force_attempts;
				ret = __wt_page_release_evict(session, ref);
				/* If forced eviction fails, stall. */
				if (ret == EBUSY) {
					ret = 0;
					wait_cnt += 1000;
					WT_STAT_FAST_CONN_INCR(session,
					    page_forcible_evict_blocked);
					break;
				} else
					WT_RET(ret);

				/*
				 * The result of a successful forced eviction
				 * is a page-state transition (potentially to
				 * an in-memory page we can use, or a restart
				 * return for our caller); continue the outer
				 * page-acquisition loop.
				 */
				continue;
			}

			/* Check if we need an autocommit transaction. */
			if ((ret = __wt_txn_autocommit_check(session)) != 0) {
				WT_TRET(__wt_hazard_clear(session, page));
				return (ret);
			}

			/*
			 * If we read the page and we are configured to not
			 * trash the cache, set the oldest read generation so
			 * the page is forcibly evicted as soon as possible.
			 *
			 * Otherwise, update the page's read generation.
			 */
			if (oldgen && page->read_gen == WT_READGEN_NOTSET)
				__wt_page_evict_soon(page);
			else if (!LF_ISSET(WT_READ_NO_GEN) &&
			    page->read_gen != WT_READGEN_OLDEST &&
			    page->read_gen < __wt_cache_read_gen(session))
				page->read_gen =
				    __wt_cache_read_gen_set(session);

			return (0);
		WT_ILLEGAL_VALUE(session);
		}

		/*
		 * We failed to get the page -- yield before retrying, and if
		 * we've yielded enough times, start sleeping so we don't burn
		 * CPU to no purpose.
		 */
		if (++wait_cnt < 1000)
			__wt_yield();
		else {
			sleep_cnt = WT_MIN(wait_cnt, 10000);
			wait_cnt *= 2;
			WT_STAT_FAST_CONN_INCRV(session, page_sleep, sleep_cnt);
			__wt_sleep(0, sleep_cnt);
		}
	}
}
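
The retry handling at the bottom of the loop above spins with a yield for the
first thousand attempts, then sleeps for an exponentially growing but capped
interval so a stalled thread stops burning CPU. A standalone POSIX sketch of
that backoff strategy, with sched_yield()/usleep() standing in for the
__wt_yield()/__wt_sleep() wrappers:

#include <sched.h>
#include <unistd.h>

#define	TOY_SPIN_LIMIT	1000		/* Yields before sleeping. */
#define	TOY_SLEEP_CAP	10000		/* Maximum sleep, in microseconds. */

static void
toy_backoff(unsigned *wait_cnt)
{
	unsigned sleep_us;

	if (++*wait_cnt < TOY_SPIN_LIMIT) {
		(void)sched_yield();
		return;
	}

	/* Sleep for the wait count in microseconds, capped. */
	sleep_us = *wait_cnt < TOY_SLEEP_CAP ? *wait_cnt : TOY_SLEEP_CAP;
	*wait_cnt *= 2;			/* Grow the next sleep. */
	(void)usleep(sleep_us);
}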