コード例 #1
0
ファイル: log_slot.c プロジェクト: SeanLiangYoung/mongo
/*
 * __wt_log_slot_free --
 *	Hand a log slot back to the pool, first enlarging its buffer if a
 *	grow was requested while the slot was in use.  Returns the result of
 *	the grow (0 if none was needed).
 */
int
__wt_log_slot_free(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
	WT_DECL_RET;

	ret = 0;

	/*
	 * A pending grow request doubles the slot's buffer.  The outcome is
	 * only carried in the return value: the slot is released below
	 * whether or not the grow worked.
	 */
	if (F_ISSET(slot, WT_SLOT_BUF_GROW)) {
		WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
		WT_STAT_FAST_CONN_INCRV(
		    session, log_buffer_size, slot->slot_buf.memsize);
		ret = __wt_buf_grow(
		    session, &slot->slot_buf, slot->slot_buf.memsize * 2);
	}

	/*
	 * Flags must not leak from one use of the slot to the next.  They are
	 * reset here, at release time, because several threads may have
	 * changed them while joining the slot.
	 */
	slot->flags = WT_SLOT_INIT_FLAGS;
	slot->slot_state = WT_LOG_SLOT_FREE;
	return (ret);
}
コード例 #2
0
ファイル: scratch.c プロジェクト: niumowm/wiredtiger
/*
 * __wt_buf_init --
 *	Set a buffer up for reuse: reference its own memory, size it through
 *	__wt_buf_grow and mark it as holding no data.
 */
int
__wt_buf_init(WT_SESSION_IMPL *session, WT_ITEM *buf, size_t size)
{
	int ret;

	/* The data reference is reset before the buffer is sized. */
	buf->data = buf->mem;
	if ((ret = __wt_buf_grow(session, buf, size)) != 0)
		return (ret);

	/* The length is only cleared once sizing has succeeded. */
	buf->size = 0;
	return (0);
}
コード例 #3
0
ファイル: bt_debug.c プロジェクト: Zhangwusheng/wiredtiger
/*
 * __dmsg --
 *	Debug message.
 *
 *	Formats the printf-style arguments and either writes them to the
 *	debug stream (ds->fp) or, when there is no stream, appends them to
 *	the ds->msg buffer and passes the buffer to __wt_msg once it ends in
 *	a newline.  Failures are silently dropped: this is best-effort
 *	debugging output and there is no error return.
 */
static void
__dmsg(WT_DBG *ds, const char *fmt, ...)
{
	va_list ap;
	WT_ITEM *msg;
	WT_SESSION_IMPL *session;
	size_t len, space;
	char *p;
	int nchars;

	session = ds->session;

	/*
	 * Debug output chunks are not necessarily terminated with a newline
	 * character.  It's easy if we're dumping to a stream, but if we're
	 * dumping to an event handler, which is line-oriented, we must buffer
	 * the output chunk, and pass it to the event handler once we see a
	 * terminating newline.
	 */
	if (ds->fp == NULL) {
		msg = ds->msg;
		for (;;) {
			p = (char *)msg->mem + msg->size;
			space = msg->memsize - msg->size;
			va_start(ap, fmt);
			nchars = vsnprintf(p, space, fmt, ap);
			va_end(ap);

			/*
			 * vsnprintf reports an output error with a negative
			 * value.  Check before converting to an unsigned
			 * length, otherwise the error turns into an enormous
			 * "required size" for the grow below.
			 */
			if (nchars < 0)
				return;
			len = (size_t)nchars;

			/* Check if there was enough space. */
			if (len < space) {
				msg->size += len;
				break;
			}

			/*
			 * There's not much to do on error without checking for
			 * an error return on every single printf.  Anyway, it's
			 * pretty unlikely and this is debugging output, I'm not
			 * going to worry about it.
			 */
			if (__wt_buf_grow(
			    session, msg, msg->memsize + len + 128) != 0)
				return;
		}

		/*
		 * Only look at the last byte if there is one: a chunk that
		 * formats to nothing on an empty buffer leaves the size at
		 * zero, and indexing size - 1 would read before the buffer.
		 */
		if (msg->size > 0 &&
		    ((uint8_t *)msg->mem)[msg->size - 1] == '\n') {
			/* Replace the newline with a terminator and flush. */
			((uint8_t *)msg->mem)[msg->size - 1] = '\0';
			(void)__wt_msg(session, "%s", (char *)msg->mem);
			msg->size = 0;
		}
	} else {
		va_start(ap, fmt);
		(void)__wt_vfprintf(ds->fp, fmt, ap);
		va_end(ap);
	}
}
コード例 #4
0
ファイル: log_slot.c プロジェクト: rueckstiess/mongo
/*
 * __wt_log_slot_grow_buffers --
 *	Increase the buffer size of all available slots in the buffer pool.
 *	Go to some lengths to include active (but unused) slots to handle
 *	the case where all log write record sizes exceed the size of the
 *	active buffer.
 *
 *	Each slot that can be claimed has its buffer grown to the larger of
 *	twice its current size and newsize.  Returns 0 on success or the
 *	first error from growing a buffer; slots grown before the error keep
 *	their new size and the statistics reflect the growth achieved.
 */
int
__wt_log_slot_grow_buffers(WT_SESSION_IMPL *session, size_t newsize)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOGSLOT *slot;
	int64_t orig_state;
	uint64_t old_size, total_growth;
	int i;

	conn = S2C(session);
	log = conn->log;
	total_growth = 0;
	WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
	/*
	 * Take the log slot lock to prevent other threads growing buffers
	 * at the same time. Could tighten the scope of this lock, or have
	 * a separate lock if there is contention.
	 */
	__wt_spin_lock(session, &log->log_slot_lock);
	for (i = 0; i < SLOT_POOL; i++) {
		slot = &log->slot_pool[i];
		/* Avoid atomic operations if they won't succeed. */
		if (slot->slot_state != WT_LOG_SLOT_FREE &&
		    slot->slot_state != WT_LOG_SLOT_READY)
			continue;
		/* Don't keep growing unrelated buffers. */
		if (slot->slot_buf.memsize > (10 * newsize) &&
		    !F_ISSET(slot, SLOT_BUF_GROW))
			continue;
		/*
		 * Claim the slot by switching it to the pending state, trying
		 * the free state first and then the ready state.  Remember
		 * which state it was in so it can be put back afterwards.
		 */
		orig_state = WT_ATOMIC_CAS_VAL8(
		    slot->slot_state, WT_LOG_SLOT_FREE, WT_LOG_SLOT_PENDING);
		if (orig_state != WT_LOG_SLOT_FREE) {
			orig_state = WT_ATOMIC_CAS_VAL8(slot->slot_state,
			    WT_LOG_SLOT_READY, WT_LOG_SLOT_PENDING);
			if (orig_state != WT_LOG_SLOT_READY)
				continue;
		}

		/* We have a slot - now go ahead and grow the buffer. */
		old_size = slot->slot_buf.memsize;
		F_CLR(slot, SLOT_BUF_GROW);
		ret = __wt_buf_grow(session, &slot->slot_buf,
		    WT_MAX(slot->slot_buf.memsize * 2, newsize));

		/*
		 * Put the slot back in its original state before looking at
		 * the result: bailing out first would leave it pending, and
		 * nothing in this function would ever reset it.
		 * NOTE(review): assumes a failed __wt_buf_grow leaves the
		 * existing buffer usable -- confirm.
		 */
		slot->slot_state = orig_state;
		WT_ERR(ret);
		total_growth += slot->slot_buf.memsize - old_size;
	}
err:	__wt_spin_unlock(session, &log->log_slot_lock);
	WT_STAT_FAST_CONN_INCRV(session, log_buffer_size, total_growth);
	return (ret);
}
コード例 #5
0
ファイル: cur_table.c プロジェクト: nicopoliakov/mongo
/*
 * __curextract_insert --
 *	Insert callback for a key produced by a custom extractor: build the
 *	index key from the extracted key followed by the primary key, then
 *	run the underlying operation on the index cursor.
 */
static int
__curextract_insert(WT_CURSOR *cursor)
{
	WT_CURSOR_EXTRACTOR *cextract;
	WT_ITEM *idxkey, extracted, primary;
	WT_SESSION_IMPL *session;
	size_t idxkey_len;
	uint8_t *dest;

	session = (WT_SESSION_IMPL *)cursor->session;
	cextract = (WT_CURSOR_EXTRACTOR *)cursor;

	/*
	 * The extracted key carries one trailing padding byte, appended so
	 * the last column never has to be rewritten; drop it.
	 */
	WT_ITEM_SET(extracted, cursor->key);
	WT_ASSERT(session, extracted.size > 0);
	extracted.size -= 1;

	/* Fetch the raw primary key from the first column group cursor. */
	WT_RET(__wt_cursor_get_raw_key(
	    cextract->ctable->cg_cursors[0], &primary));

	/*
	 * The index key is already in the right format and every primary key
	 * column is needed, so the result is a plain concatenation.
	 */
	idxkey = &cextract->idxc->key;
	idxkey_len = extracted.size + primary.size;
	WT_RET(__wt_buf_grow(session, idxkey, idxkey_len));
	dest = (uint8_t *)idxkey->mem;
	memcpy(dest, extracted.data, extracted.size);
	memcpy(dest + extracted.size, primary.data, primary.size);
	idxkey->size = idxkey_len;

	/*
	 * Mark the key as set; the value is flagged too and stays empty (it
	 * starts clear and is never set).
	 */
	F_SET(cextract->idxc, WT_CURSTD_KEY_EXT | WT_CURSTD_VALUE_EXT);

	/* Let the underlying cursor function update the index. */
	return (cextract->f(cextract->idxc));
}
コード例 #6
0
ファイル: bt_vrfy_dsk.c プロジェクト: radik/mongo
/*
 * __verify_dsk_row --
 *	Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it.
 */
static int
__verify_dsk_row(
    WT_SESSION_IMPL *session, const char *tag, const WT_PAGE_HEADER *dsk)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_DECL_ITEM(current);
	WT_DECL_ITEM(last_ovfl);
	WT_DECL_ITEM(last_pfx);
	WT_DECL_RET;
	WT_ITEM *last;
	enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type;
	void *huffman;
	uint32_t cell_num, cell_type, i, key_cnt, prefix;
	uint8_t *end;
	int cmp;

	btree = S2BT(session);
	bm = btree->bm;
	unpack = &_unpack;
	huffman = dsk->type == WT_PAGE_ROW_INT ? NULL : btree->huffman_key;

	WT_ERR(__wt_scr_alloc(session, 0, &current));
	WT_ERR(__wt_scr_alloc(session, 0, &last_pfx));
	WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl));
	last = last_ovfl;

	end = (uint8_t *)dsk + dsk->mem_size;

	last_cell_type = FIRST;
	cell_num = 0;
	key_cnt = 0;
	WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
		++cell_num;

		/* Carefully unpack the cell. */
		if (__wt_cell_unpack_safe(cell, unpack, end) != 0) {
			ret = __err_cell_corrupted(session, cell_num, tag);
			goto err;
		}

		/* Check the raw and collapsed cell types. */
		WT_ERR(__err_cell_type(
		    session, cell_num, tag, unpack->raw, dsk->type));
		WT_ERR(__err_cell_type(
		    session, cell_num, tag, unpack->type, dsk->type));
		cell_type = unpack->type;

		/*
		 * Check ordering relationships between the WT_CELL entries.
		 * For row-store internal pages, check for:
		 *	two values in a row,
		 *	two keys in a row,
		 *	a value as the first cell on a page.
		 * For row-store leaf pages, check for:
		 *	two values in a row,
		 *	a value as the first cell on a page.
		 */
		switch (cell_type) {
		case WT_CELL_KEY:
		case WT_CELL_KEY_OVFL:
			++key_cnt;
			switch (last_cell_type) {
			case FIRST:
			case WAS_VALUE:
				break;
			case WAS_KEY:
				if (dsk->type == WT_PAGE_ROW_LEAF)
					break;
				WT_ERR_VRFY(session,
				    "cell %" PRIu32 " on page at %s is the "
				    "first of two adjacent keys",
				    cell_num - 1, tag);
			}
			last_cell_type = WAS_KEY;
			break;
		case WT_CELL_ADDR_DEL:
		case WT_CELL_ADDR_INT:
		case WT_CELL_ADDR_LEAF:
		case WT_CELL_ADDR_LEAF_NO:
		case WT_CELL_VALUE:
		case WT_CELL_VALUE_OVFL:
			switch (last_cell_type) {
			case FIRST:
				WT_ERR_VRFY(session,
				    "page at %s begins with a value", tag);
			case WAS_KEY:
				break;
			case WAS_VALUE:
				WT_ERR_VRFY(session,
				    "cell %" PRIu32 " on page at %s is the "
				    "first of two adjacent values",
				    cell_num - 1, tag);
			}
			last_cell_type = WAS_VALUE;
			break;
		}

		/* Check if any referenced item has a valid address. */
		switch (cell_type) {
		case WT_CELL_ADDR_DEL:
		case WT_CELL_ADDR_INT:
		case WT_CELL_ADDR_LEAF:
		case WT_CELL_ADDR_LEAF_NO:
		case WT_CELL_KEY_OVFL:
		case WT_CELL_VALUE_OVFL:
			if (!bm->addr_valid(bm,
			    session, unpack->data, unpack->size))
				goto eof;
			break;
		}

		/*
		 * Remaining checks are for key order and prefix compression.
		 * If this cell isn't a key, we're done, move to the next cell.
		 * If this cell is an overflow item, instantiate the key and
		 * compare it with the last key. Otherwise, we have to deal with
		 * prefix compression.
		 */
		switch (cell_type) {
		case WT_CELL_KEY:
			break;
		case WT_CELL_KEY_OVFL:
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, dsk->type, unpack, current));
			goto key_compare;
		default:
			/* Not a key -- continue with the next cell. */
			continue;
		}

		/*
		 * Prefix compression checks.
		 *
		 * Confirm the first non-overflow key on a page has a zero
		 * prefix compression count.
		 */
		prefix = unpack->prefix;
		if (last_pfx->size == 0 && prefix != 0)
			WT_ERR_VRFY(session,
			    "the %" PRIu32 " key on page at %s is the first "
			    "non-overflow key on the page and has a non-zero "
			    "prefix compression value",
			    cell_num, tag);

		/* Confirm the prefix compression count is possible. */
		if (cell_num > 1 && prefix > last->size)
			WT_ERR_VRFY(session,
			    "key %" PRIu32 " on page at %s has a prefix "
			    "compression count of %" PRIu32 ", larger than "
			    "the length of the previous key, %" WT_SIZET_FMT,
			    cell_num, tag, prefix, last->size);

		/*
		 * If Huffman decoding required, unpack the cell to build the
		 * key, then resolve the prefix.  Else, we can do it faster
		 * internally because we don't have to shuffle memory around as
		 * much.
		 */
		if (huffman != NULL) {
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, dsk->type, unpack, current));

			/*
			 * If there's a prefix, make sure there's enough buffer
			 * space, then shift the decoded data past the prefix
			 * and copy the prefix into place.  Take care with the
			 * pointers: current->data may be pointing inside the
			 * buffer.
			 */
			if (prefix != 0) {
				WT_ERR(__wt_buf_grow(
				    session, current, prefix + current->size));
				memmove((uint8_t *)current->mem + prefix,
				    current->data, current->size);
				memcpy(current->mem, last->data, prefix);
				current->data = current->mem;
				current->size += prefix;
			}
		} else {
			/*
			 * Get the cell's data/length and make sure we have
			 * enough buffer space.
			 */
			WT_ERR(__wt_buf_init(
			    session, current, prefix + unpack->size));

			/* Copy the prefix then the data into place. */
			if (prefix != 0)
				memcpy(current->mem, last->data, prefix);
			memcpy((uint8_t *)current->mem + prefix, unpack->data,
			    unpack->size);
			current->size = prefix + unpack->size;
		}

key_compare:	/*
		 * Compare the current key against the last key.
		 *
		 * Be careful about the 0th key on internal pages: we only store
		 * the first byte and custom collators may not be able to handle
		 * truncated keys.
		 */
		if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) ||
		    (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) {
			WT_ERR(__wt_compare(
			    session, btree->collator, last, current, &cmp));
			if (cmp >= 0)
				WT_ERR_VRFY(session,
				    "the %" PRIu32 " and %" PRIu32 " keys on "
				    "page at %s are incorrectly sorted",
				    cell_num - 2, cell_num, tag);
		}

		/*
		 * Swap the buffers: last always references the last key entry,
		 * last_pfx and last_ovfl reference the last prefix-compressed
		 * and last overflow key entries.  Current gets pointed to the
		 * buffer we're not using this time around, which is where the
		 * next key goes.
		 */
		last = current;
		if (cell_type == WT_CELL_KEY) {
			current = last_pfx;
			last_pfx = last;
		} else {
			current = last_ovfl;
			last_ovfl = last;
		}
		WT_ASSERT(session, last != current);
	}
コード例 #7
0
ファイル: log.c プロジェクト: EaseTech/wiredtiger
/*
 * __wt_log_write --
 *	Write a record into the log.
 *
 *	record: the log record; it is grown and zero-padded in place to a
 *	    multiple of the log allocation size and its header (length and
 *	    checksum) is filled in here.
 *	lsnp: if not NULL, set to the record's LSN when the consolidated
 *	    path succeeds; also handed to the direct-write path.
 *	flags: WT_LOG_* flags; WT_LOG_FSYNC and WT_LOG_DSYNC are examined
 *	    in this function.
 *
 *	Returns 0 on success, otherwise an error code.
 */
int
__wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
    uint32_t flags)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_LOG *log;
	WT_LOG_RECORD *logrec;
	WT_LSN lsn;
	WT_MYSLOT myslot;
	uint32_t rdup_len;
	int locked;

	conn = S2C(session);
	log = conn->log;
	locked = 0;
	INIT_LSN(&lsn);
	/* A NULL slot tells the error path that no slot was ever joined. */
	myslot.slot = NULL;
	/*
	 * Assume the WT_ITEM the user passed is a WT_LOG_RECORD, which has
	 * a header at the beginning for us to fill in.
	 *
	 * If using direct_io, the caller should pass us an aligned record.
	 * But we need to make sure it is big enough and zero-filled so
	 * that we can write the full amount.  Do this whether or not
	 * direct_io is in use because it makes the reading code cleaner.
	 */
	WT_STAT_FAST_CONN_INCRV(session, log_bytes_user, record->size);
	rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize);
	WT_ERR(__wt_buf_grow(session, record, rdup_len));
	WT_ASSERT(session, record->data == record->mem);
	/*
	 * If the caller's record only partially fills the necessary
	 * space, we need to zero-fill the remainder.
	 */
	if (record->size != rdup_len) {
		memset((uint8_t *)record->mem + record->size, 0,
		    rdup_len - record->size);
		record->size = rdup_len;
	}
	/*
	 * Fill in the header.  The checksum field is zeroed first so the
	 * checksum is computed over the record with that field clear.
	 */
	logrec = (WT_LOG_RECORD *)record->mem;
	logrec->len = (uint32_t)record->size;
	logrec->checksum = 0;
	logrec->checksum = __wt_cksum(logrec, record->size);

	WT_STAT_FAST_CONN_INCR(session, log_writes);

	/* Try a direct write unless consolidation has been forced. */
	if (!F_ISSET(log, WT_LOG_FORCE_CONSOLIDATE)) {
		ret = __log_direct_write(session, record, lsnp, flags);
		if (ret == 0)
			return (0);
		if (ret != EAGAIN)
			WT_ERR(ret);
		/*
		 * An EAGAIN return means we failed to get the try lock -
		 * fall through to the consolidation code in that case.
		 */
	}

	/*
	 * As soon as we see contention for the log slot, disable direct
	 * log writes. We get better performance by forcing writes through
	 * the consolidation code. This is because individual writes flood
	 * the I/O system faster than they contend on the log slot lock.
	 */
	F_SET(log, WT_LOG_FORCE_CONSOLIDATE);
	if ((ret = __wt_log_slot_join(
	    session, rdup_len, flags, &myslot)) == ENOMEM) {
		/*
		 * If we couldn't find a consolidated slot for this record
		 * write the record directly.
		 */
		while ((ret = __log_direct_write(
		    session, record, lsnp, flags)) == EAGAIN)
			;
		WT_ERR(ret);
		/*
		 * Increase the buffer size of any slots we can get access
		 * to, so future consolidations are likely to succeed.
		 *
		 * NOTE(review): a failure here is returned to the caller
		 * even though the direct write above already succeeded --
		 * confirm that is intended.
		 */
		WT_ERR(__wt_log_slot_grow_buffers(session, 4 * rdup_len));
		return (0);
	}
	WT_ERR(ret);
	/*
	 * The thread that joined at offset zero closes the slot, acquires
	 * log space for the whole group while holding the slot lock, and then
	 * notifies the slot.  Every other joiner waits on the slot instead.
	 */
	if (myslot.offset == 0) {
		__wt_spin_lock(session, &log->log_slot_lock);
		locked = 1;
		WT_ERR(__wt_log_slot_close(session, myslot.slot));
		WT_ERR(__log_acquire(
		    session, myslot.slot->slot_group_size, myslot.slot));
		__wt_spin_unlock(session, &log->log_slot_lock);
		locked = 0;
		WT_ERR(__wt_log_slot_notify(session, myslot.slot));
	} else
		WT_ERR(__wt_log_slot_wait(session, myslot.slot));
	WT_ERR(__log_fill(session, &myslot, 0, record, &lsn));
	/*
	 * If releasing our share reports the slot as done, this thread
	 * releases and frees the slot.  Otherwise, a caller asking for
	 * WT_LOG_FSYNC waits below.
	 */
	if (__wt_log_slot_release(myslot.slot, rdup_len) == WT_LOG_SLOT_DONE) {
		WT_ERR(__log_release(session, myslot.slot));
		WT_ERR(__wt_log_slot_free(myslot.slot));
	} else if (LF_ISSET(WT_LOG_FSYNC)) {
		/* Wait for our writes to reach disk */
		while (LOG_CMP(&log->sync_lsn, &lsn) <= 0 &&
		    myslot.slot->slot_error == 0)
			(void)__wt_cond_wait(
			    session, log->log_sync_cond, 10000);
	}
err:
	/* Drop the slot lock if an error occurred while it was held. */
	if (locked)
		__wt_spin_unlock(session, &log->log_slot_lock);
	if (ret == 0 && lsnp != NULL)
		*lsnp = lsn;
	/*
	 * If we're synchronous and some thread had an error, we don't know
	 * if our write made it out to the file or not.  The error could be
	 * before or after us.  So, if anyone got an error, we report it.
	 * If we're not synchronous, only report if our own operation got
	 * an error.
	 */
	if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 &&
	    myslot.slot != NULL)
		ret = myslot.slot->slot_error;
	return (ret);
}
コード例 #8
0
ファイル: log.c プロジェクト: EaseTech/wiredtiger
/*
 * __wt_log_scan --
 *	Scan the logs, calling a function on each record found.
 *
 *	lsnp: optional starting LSN; when NULL one of WT_LOGSCAN_FIRST or
 *	    WT_LOGSCAN_FROM_CKP selects the starting point.
 *	flags: WT_LOGSCAN_* flags (FIRST, FROM_CKP, ONE and RECOVER are
 *	    examined here).
 *	func: callback invoked with each valid record, other than a record
 *	    at offset 0 of a file, and its LSN; a non-zero return stops the
 *	    scan and is returned.  With a NULL callback nothing is done.
 *	cookie: opaque pointer passed through to the callback.
 *
 *	Returns 0 on success, WT_NOTFOUND for an unusable starting LSN or
 *	when a single-record scan ran off the end of a log file, ENOTSUP if
 *	logging is not configured and no log files exist, otherwise an error.
 */
int
__wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags,
    int (*func)(WT_SESSION_IMPL *session,
    WT_ITEM *record, WT_LSN *lsnp, void *cookie), void *cookie)
{
	WT_CONNECTION_IMPL *conn;
	WT_ITEM buf;
	WT_DECL_RET;
	WT_FH *log_fh;
	WT_LOG *log;
	WT_LOG_RECORD *logrec;
	WT_LSN end_lsn, rd_lsn, start_lsn;
	off_t log_size;
	uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen;
	u_int i, logcount;
	int eol;
	char **logfiles;

	conn = S2C(session);
	log = conn->log;
	log_fh = NULL;
	logcount = 0;
	logfiles = NULL;
	eol = 0;
	WT_CLEAR(buf);

	/*
	 * If the caller did not give us a callback function there is nothing
	 * to do.
	 */
	if (func == NULL)
		return (0);

	/*
	 * NOTE(review): log is dereferenced here (and again in the final
	 * truncation check) before the log != NULL test below; presumably
	 * WT_LOGSCAN_RECOVER is only ever passed when logging is configured
	 * -- confirm against callers.
	 */
	if (LF_ISSET(WT_LOGSCAN_RECOVER))
		WT_RET(__wt_verbose(session, WT_VERB_LOG,
		    "__wt_log_scan truncating to %u/%" PRIuMAX,
		    log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset));

	if (log != NULL) {
		allocsize = log->allocsize;

		if (lsnp == NULL) {
			if (LF_ISSET(WT_LOGSCAN_FIRST))
				start_lsn = log->first_lsn;
			else if (LF_ISSET(WT_LOGSCAN_FROM_CKP))
				start_lsn = log->ckpt_lsn;
			else
				return (WT_ERROR);	/* Illegal usage */
		} else {
			if (LF_ISSET(WT_LOGSCAN_FIRST|WT_LOGSCAN_FROM_CKP))
				WT_RET_MSG(session, WT_ERROR,
			    "choose either a start LSN or a start flag");

			/* Offsets must be on allocation boundaries. */
			if (lsnp->offset % allocsize != 0 ||
			    lsnp->file > log->fileid)
				return (WT_NOTFOUND);

			/*
			 * Log cursors may not know the starting LSN.  If an
			 * LSN pointer is passed in, but it is the INIT_LSN,
			 * start from the first_lsn.
			 */
			start_lsn = *lsnp;
			if (IS_INIT_LSN(&start_lsn))
				start_lsn = log->first_lsn;
		}
		end_lsn = log->alloc_lsn;
	} else {
		/*
		 * If logging is not configured, we can still print out the log
		 * if log files exist.  We just need to set the LSNs from what
		 * is in the files versus what is in the live connection.
		 */
		/*
		 * Set allocsize to the minimum alignment it could be.  Larger
		 * records and larger allocation boundaries should always be
		 * a multiple of this.
		 */
		allocsize = LOG_ALIGN;
		lastlog = 0;
		firstlog = UINT32_MAX;
		WT_RET(__wt_log_get_files(session, &logfiles, &logcount));
		if (logcount == 0)
			/*
			 * Report the scan as not supported if no log files
			 * exist.
			 */
			return (ENOTSUP);
		/* Find the lowest and highest log file numbers present. */
		for (i = 0; i < logcount; i++) {
			WT_ERR(__wt_log_extract_lognum(session, logfiles[i],
			    &lognum));
			lastlog = WT_MAX(lastlog, lognum);
			firstlog = WT_MIN(firstlog, lognum);
		}
		start_lsn.file = firstlog;
		end_lsn.file = lastlog;
		start_lsn.offset = end_lsn.offset = 0;
		__wt_log_files_free(session, logfiles, logcount);
		/* Clear the pointer so the error path does not free it again. */
		logfiles = NULL;
	}
	WT_ERR(__log_openfile(session, 0, &log_fh, start_lsn.file));
	WT_ERR(__log_filesize(session, log_fh, &log_size));
	rd_lsn = start_lsn;
	WT_ERR(__wt_buf_initsize(session, &buf, LOG_ALIGN));
	for (;;) {
		if (rd_lsn.offset + allocsize > log_size) {
			/*
			 * The advance label is also the target of a goto
			 * further down in this loop, used when a record
			 * would extend past the end of the current file.
			 */
advance:
			/*
			 * If we read the last record, go to the next file.
			 */
			WT_ERR(__wt_close(session, log_fh));
			log_fh = NULL;
			eol = 1;
			/*
			 * Truncate this log file before we move to the next.
			 */
			if (LF_ISSET(WT_LOGSCAN_RECOVER))
				WT_ERR(__log_truncate(session, &rd_lsn, 1));
			rd_lsn.file++;
			rd_lsn.offset = 0;
			/*
			 * Avoid an error message when we reach end of log
			 * by checking here.
			 */
			if (rd_lsn.file > end_lsn.file)
				break;
			WT_ERR(__log_openfile(
			    session, 0, &log_fh, rd_lsn.file));
			WT_ERR(__log_filesize(session, log_fh, &log_size));
			continue;
		}
		/*
		 * Read the minimum allocation size a record could be.
		 */
		WT_ASSERT(session, buf.memsize >= allocsize);
		WT_ERR(__wt_read(session,
		    log_fh, rd_lsn.offset, (size_t)allocsize, buf.mem));
		/*
		 * The record starts with its real length, read here as a
		 * 32-bit value.  See if we need to read more than the
		 * allocation size.  We expect that we rarely will have to
		 * read more.  Most log records will be fairly small.
		 */
		reclen = *(uint32_t *)buf.mem;
		/*
		 * Log files are pre-allocated.  We never expect a zero length
		 * unless we've reached the end of the log.  The log can be
		 * written out of order, so when recovery finds the end of
		 * the log, truncate the file and remove any later log files
		 * that may exist.
		 */
		if (reclen == 0) {
			/* This LSN is the end. */
			break;
		}
		rdup_len = __wt_rduppo2(reclen, allocsize);
		if (reclen > allocsize) {
			/*
			 * The log file end could be the middle of this
			 * log record.
			 */
			if (rd_lsn.offset + rdup_len > log_size)
				goto advance;
			/*
			 * We need to round up and read in the full padded
			 * record, especially for direct I/O.
			 */
			WT_ERR(__wt_buf_grow(session, &buf, rdup_len));
			WT_ERR(__wt_read(session,
			    log_fh, rd_lsn.offset, (size_t)rdup_len, buf.mem));
			WT_STAT_FAST_CONN_INCR(session, log_scan_rereads);
		}
		/*
		 * We read in the record, verify checksum.  The stored value
		 * is saved and the field zeroed before recomputing, so the
		 * checksum is taken over the record with that field clear.
		 */
		buf.size = reclen;
		logrec = (WT_LOG_RECORD *)buf.mem;
		cksum = logrec->checksum;
		logrec->checksum = 0;
		logrec->checksum = __wt_cksum(logrec, logrec->len);
		if (logrec->checksum != cksum) {
			/*
			 * A checksum mismatch means we have reached the end of
			 * the useful part of the log.  This should be found on
			 * the first pass through recovery.  In the second pass
			 * where we truncate the log, this is where it should
			 * end.
			 */
			if (log != NULL)
				log->trunc_lsn = rd_lsn;
			break;
		}

		/*
		 * We have a valid log record.  If it is not the log file
		 * header, invoke the callback.
		 */
		WT_STAT_FAST_CONN_INCR(session, log_scan_records);
		if (rd_lsn.offset != 0) {
			WT_ERR((*func)(session, &buf, &rd_lsn, cookie));
			if (LF_ISSET(WT_LOGSCAN_ONE))
				break;
		}
		/* Step over the padded record to the next one. */
		rd_lsn.offset += (off_t)rdup_len;
	}

	/* Truncate if we're in recovery. */
	if (LF_ISSET(WT_LOGSCAN_RECOVER) &&
	    LOG_CMP(&rd_lsn, &log->trunc_lsn) < 0)
		WT_ERR(__log_truncate(session, &rd_lsn, 0));

err:	WT_STAT_FAST_CONN_INCR(session, log_scans);
	if (logfiles != NULL)
		__wt_log_files_free(session, logfiles, logcount);
	__wt_buf_free(session, &buf);
	/*
	 * If the caller wants one record and it is at the end of log,
	 * return WT_NOTFOUND.
	 */
	if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0)
		ret = WT_NOTFOUND;
	/* A missing file (ENOENT) is not reported as an error. */
	if (ret == ENOENT)
		ret = 0;
	if (log_fh != NULL)
		WT_TRET(__wt_close(session, log_fh));
	return (ret);
}
コード例 #9
0
ファイル: log.c プロジェクト: EaseTech/wiredtiger
/*
 * __wt_log_read --
 *	Fetch the log record stored at the given LSN into the caller's
 *	WT_ITEM, log header included.  The caller owns the item and must free
 *	it.  Returns WT_NOTFOUND for an LSN that cannot be valid or that holds
 *	no record, and WT_ERROR if the record's checksum does not match.
 */
int
__wt_log_read(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp,
    uint32_t flags)
{
	WT_DECL_RET;
	WT_FH *log_fh;
	WT_LOG *log;
	WT_LOG_RECORD *logrec;
	uint32_t allocsize, cksum, rdup_len, reclen;

	WT_UNUSED(flags);

	/* Without both an LSN and somewhere to put the record, do nothing. */
	if (lsnp == NULL || record == NULL)
		return (0);

	log = S2C(session)->log;
	allocsize = log->allocsize;

	/*
	 * A usable LSN sits on an allocation boundary in a file no later
	 * than the current one.
	 */
	if (!(lsnp->offset % allocsize == 0 && lsnp->file <= log->fileid))
		return (WT_NOTFOUND);

	WT_RET(__log_openfile(session, 0, &log_fh, lsnp->file));

	/* Start with the smallest read that could hold a record. */
	WT_ERR(__wt_buf_init(session, record, allocsize));
	WT_ERR(__wt_read(
	    session, log_fh, lsnp->offset, (size_t)allocsize, record->mem));

	/*
	 * The record begins with its real length as a 32-bit value.  A zero
	 * length means there is no record here.
	 */
	reclen = *(uint32_t *)record->mem;
	if (reclen == 0)
		WT_ERR(WT_NOTFOUND);

	/*
	 * Most records are expected to fit in one allocation unit.  For the
	 * rare larger one, grow the buffer to the rounded-up length and read
	 * the whole record again.
	 */
	if (reclen > allocsize) {
		rdup_len = __wt_rduppo2(reclen, allocsize);
		WT_ERR(__wt_buf_grow(session, record, rdup_len));
		WT_ERR(__wt_read(session,
		    log_fh, lsnp->offset, (size_t)rdup_len, record->mem));
	}

	/*
	 * Verify the checksum: save the stored value, clear the field,
	 * recompute over the record and compare.
	 */
	logrec = (WT_LOG_RECORD *)record->mem;
	cksum = logrec->checksum;
	logrec->checksum = 0;
	logrec->checksum = __wt_cksum(logrec, logrec->len);
	if (logrec->checksum != cksum)
		WT_ERR_MSG(session, WT_ERROR, "log_read: Bad checksum");

	record->size = logrec->len;
	WT_STAT_FAST_CONN_INCR(session, log_reads);

err:	/* The file is closed on every path that opened it. */
	WT_TRET(__wt_close(session, log_fh));
	return (ret);
}
コード例 #10
0
ファイル: log.c プロジェクト: EaseTech/wiredtiger
/*
 * __log_release --
 *	Release a log slot: write out any buffered records, advance the log's
 *	write LSN once earlier groups have finished, sync the log if the slot
 *	asks for it, grow the slot buffer if flagged, and close any log file
 *	handle handed over for closing.
 *
 *	Returns 0 on success or the first error encountered; the error is
 *	also stored in slot->slot_error if the slot has none recorded yet.
 */
static int
__log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FH *close_fh;
	WT_LOG *log;
	WT_LSN sync_lsn;
	size_t write_size;
	WT_DECL_SPINLOCK_ID(id);			/* Must appear last */

	conn = S2C(session);
	log = conn->log;
	/*
	 * If we're going to have to close our log file, make a local copy
	 * of the file handle structure.
	 */
	close_fh = NULL;
	if (F_ISSET(slot, SLOT_CLOSEFH)) {
		close_fh = log->log_close_fh;
		log->log_close_fh = NULL;
		F_CLR(slot, SLOT_CLOSEFH);
	}

	/* Write the buffered records */
	if (F_ISSET(slot, SLOT_BUFFERED)) {
		write_size = (size_t)
		    (slot->slot_end_lsn.offset - slot->slot_start_offset);
		WT_ERR(__wt_write(session, slot->slot_fh,
		    slot->slot_start_offset, write_size, slot->slot_buf.mem));
	}

	/*
	 * Wait for earlier groups to finish, otherwise there could be holes
	 * in the log file.
	 */
	while (LOG_CMP(&log->write_lsn, &slot->slot_release_lsn) != 0)
		__wt_yield();
	log->write_lsn = slot->slot_end_lsn;
	/*
	 * Try to consolidate calls to fsync to wait less.  Acquire a spin lock
	 * so that threads finishing writing to the log will wait while the
	 * current fsync completes and advance log->write_lsn.
	 */
	while (F_ISSET(slot, SLOT_SYNC) &&
	    LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
		if (__wt_spin_trylock(session, &log->log_sync_lock, &id) != 0) {
			(void)__wt_cond_wait(
			    session, log->log_sync_cond, 10000);
			continue;
		}
		/*
		 * Record the current end of log after we grabbed the lock.
		 * That is how far our fsync call will guarantee.
		 */
		sync_lsn = log->write_lsn;
		if (LOG_CMP(&log->sync_lsn, &slot->slot_end_lsn) < 0) {
			WT_STAT_FAST_CONN_INCR(session, log_sync);
			ret = __wt_fsync(session, log->log_fh);
			if (ret == 0) {
				F_CLR(slot, SLOT_SYNC);
				log->sync_lsn = sync_lsn;
				ret = __wt_cond_signal(
				    session, log->log_sync_cond);
			}
		}
		/* Always drop the sync lock before acting on the result. */
		__wt_spin_unlock(session, &log->log_sync_lock);
		WT_ERR(ret);
	}
	if (F_ISSET(slot, SLOT_BUF_GROW)) {
		WT_STAT_FAST_CONN_INCR(session, log_buffer_grow);
		F_CLR(slot, SLOT_BUF_GROW);
		WT_STAT_FAST_CONN_INCRV(session,
		    log_buffer_size, slot->slot_buf.memsize);
		WT_ERR(__wt_buf_grow(session,
		    &slot->slot_buf, slot->slot_buf.memsize * 2));
	}

err:	/*
	 * If we have a file to close, close it now.  The handle was detached
	 * from log->log_close_fh at the top of this function, so nothing else
	 * can reach it: close it on the error paths as well, otherwise a
	 * failed write, sync or buffer grow would leak it.  An earlier error
	 * takes precedence over a close failure.
	 */
	if (close_fh != NULL)
		WT_TRET(__wt_close(session, close_fh));

	if (ret != 0 && slot->slot_error == 0)
		slot->slot_error = ret;
	return (ret);
}
コード例 #11
0
/*
 * __wt_schema_project_merge --
 *	Walk a projection string and, reading packed columns from the keys
 *	and values of the given cursors, append the selected columns to the
 *	value buffer using the value format.
 */
int
__wt_schema_project_merge(WT_SESSION_IMPL *session,
    WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value)
{
	WT_CURSOR *c;
	WT_DECL_PACK(pack);
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_PACK_VALUE(vpv);
	WT_ITEM *buf;
	WT_PACK vpack;
	u_long arg;
	char *proj;
	const uint8_t *p, *end;
	uint8_t *vp;
	size_t len;

	p = end = NULL;		/* -Wuninitialized */

	/* Start from an empty output buffer and a fresh value-format walk. */
	WT_RET(__wt_buf_init(session, value, 0));
	WT_RET(__pack_init(session, &vpack, vformat));

	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		/* Each operation is an optional number followed by a code. */
		arg = strtoul(proj, &proj, 10);

		/*
		 * Key and value operations select the source: the number is
		 * a cursor index, and reading restarts at the beginning of
		 * that cursor's key or value.
		 */
		if (*proj == WT_PROJ_KEY) {
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				/* Record-number keys are read from c->recno. */
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = buf->data;
			end = p + buf->size;
			continue;
		}
		if (*proj == WT_PROJ_VALUE) {
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = buf->data;
			end = p + buf->size;
			continue;
		}

		/*
		 * For everything else the number is a repeat count, and a
		 * missing count means once.
		 */
		if (arg == 0)
			arg = 1;
		for (; arg > 0; arg--) {
			/* Unrecognized operation codes are ignored. */
			if (*proj != WT_PROJ_NEXT &&
			    *proj != WT_PROJ_SKIP && *proj != WT_PROJ_REUSE)
				continue;

			/* Step over one column of the current source. */
			WT_RET(__pack_next(&pack, &pv));
			WT_RET(__unpack_read(
			    session, &pv, &p, (size_t)(end - p)));

			/* Only the "next" operation copies the column out. */
			if (*proj != WT_PROJ_NEXT)
				continue;

			WT_RET(__pack_next(&vpack, &vpv));
			/* Make sure the types are compatible. */
			WT_ASSERT(session,
			    __wt_tolower((u_char)pv.type) ==
			    __wt_tolower((u_char)vpv.type));
			vpv.u = pv.u;

			/* Make room, then pack at the end of the output. */
			WT_RET(__pack_size(session, &vpv, &len));
			WT_RET(__wt_buf_grow(
			    session, value, value->size + len));
			vp = (uint8_t *)value->mem + value->size;
			WT_RET(__pack_write(session, &vpv, &vp, len));
			value->size += len;
		}
	}

	return (0);
}
コード例 #12
0
/*
 * __wt_schema_project_slice --
 *	Given a list of cursors and a projection, read columns from
 *	a raw buffer.
 */
int
__wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp,
    const char *proj_arg, bool key_only, const char *vformat, WT_ITEM *value)
{
	WT_CURSOR *c;
	WT_DECL_ITEM(buf);
	WT_DECL_PACK(pack);
	WT_DECL_PACK_VALUE(pv);
	WT_DECL_PACK_VALUE(vpv);
	WT_PACK vpack;
	u_long arg;
	char *proj;
	uint8_t *end, *p;
	const uint8_t *next, *vp, *vend;
	size_t len, offset, old_len;
	bool skip;

	p = end = NULL;		/* -Wuninitialized */

	WT_RET(__pack_init(session, &vpack, vformat));
	vp = value->data;
	vend = vp + value->size;

	/* Reset any of the buffers we will be setting. */
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);
		if (*proj == WT_PROJ_KEY) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->key, 0));
		} else if (*proj == WT_PROJ_VALUE && !key_only) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->value, 0));
		}
	}

	skip = key_only;
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			skip = false;
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;

		case WT_PROJ_VALUE:
			skip = key_only;
			if (skip)
				continue;
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;
		}

		/* We have to get a key or value before any operations. */
		WT_ASSERT(session, skip || buf != NULL);

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_SKIP:
				if (skip)
					break;
				WT_RET(__pack_next(&pack, &pv));

				/*
				 * A nasty case: if we are inserting
				 * out-of-order, append a zero value to keep
				 * the buffer in the correct format.
				 */
				if (p == end) {
					/* Set up an empty value. */
					WT_CLEAR(pv.u);
					if (pv.type == 'S' || pv.type == 's')
						pv.u.s = "";

					WT_RET(__pack_size(session, &pv, &len));
					WT_RET(__wt_buf_grow(session,
					    buf, buf->size + len));
					p = (uint8_t *)buf->data + buf->size;
					WT_RET(__pack_write(
					    session, &pv, &p, len));
					end = p;
					buf->size += len;
				} else
					WT_RET(__unpack_read(session,
					    &pv, (const uint8_t **)&p,
					    (size_t)(end - p)));
				break;

			case WT_PROJ_NEXT:
				WT_RET(__pack_next(&vpack, &vpv));
				WT_RET(__unpack_read(session, &vpv,
				    &vp, (size_t)(vend - vp)));
				/* FALLTHROUGH */

			case WT_PROJ_REUSE:
				if (skip)
					break;

				/*
				 * Read the item we're about to overwrite.
				 *
				 * There is subtlety here: the value format
				 * may not exactly match the cursor's format.
				 * In particular, we need lengths with raw
				 * columns in the middle of a packed struct,
				 * but not if they are at the end of a struct.
				 */
				WT_RET(__pack_next(&pack, &pv));

				next = p;
				if (p < end)
					WT_RET(__unpack_read(session, &pv,
					    &next, (size_t)(end - p)));
				old_len = (size_t)(next - p);

				/* Make sure the types are compatible. */
				WT_ASSERT(session,
				    __wt_tolower((u_char)pv.type) ==
				    __wt_tolower((u_char)vpv.type));
				pv.u = vpv.u;

				WT_RET(__pack_size(session, &pv, &len));
				offset = WT_PTRDIFF(p, buf->data);
				/*
				 * Avoid growing the buffer if the value fits.
				 * This is not just a performance issue: it
				 * covers the case of record number keys, which
				 * have to be written to cursor->recno.
				 */
				if (len > old_len)
					WT_RET(__wt_buf_grow(session,
					    buf, buf->size + len - old_len));
				p = (uint8_t *)buf->data + offset;
				/* Make room if we're inserting out-of-order. */
				if (offset + old_len < buf->size)
					memmove(p + len, p + old_len,
					    buf->size - (offset + old_len));
				WT_RET(__pack_write(session, &pv, &p, len));
				buf->size += len - old_len;
				end = (uint8_t *)buf->data + buf->size;
				break;
			default:
				WT_RET_MSG(session, EINVAL,
				    "unexpected projection plan: %c",
				    (int)*proj);
			}
		}
	}

	return (0);
}
コード例 #13
0
/*
 * __wt_schema_project_in --
 *	Given a list of cursors and a projection, read columns from the
 *	application into the dependent cursors.
 *
 *	session: session handle passed to every helper.
 *	cp: cursor array, indexed by the number preceding each key/value
 *	    operation in the plan.
 *	proj_arg: projection plan, a string of optionally count-prefixed
 *	    operation characters.
 *	ap: the application's column values, consumed one per "next"
 *	    operation via WT_PACK_GET.
 *
 *	Returns 0 on success.  Helper failures are handed back through WT_RET,
 *	and an unrecognized plan character is reported with EINVAL.
 */
int
__wt_schema_project_in(WT_SESSION_IMPL *session,
    WT_CURSOR **cp, const char *proj_arg, va_list ap)
{
	WT_CURSOR *c;
	WT_DECL_ITEM(buf);
	WT_DECL_PACK(pack);
	WT_DECL_PACK_VALUE(pv);
	WT_PACK_VALUE old_pv;
	size_t len, offset, old_len;
	u_long arg;
	char *proj;
	uint8_t *p, *end;
	const uint8_t *next;

	p = end = NULL;		/* -Wuninitialized */

	/*
	 * Reset any of the buffers we will be setting.
	 *
	 * strtoul moves proj past the optional leading number, leaving it on
	 * the operation character; only key/value operations matter here.
	 */
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);
		if (*proj == WT_PROJ_KEY) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->key, 0));
		} else if (*proj == WT_PROJ_VALUE) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->value, 0));
		}
	}

	/*
	 * Second pass: execute the plan.  The current target is tracked by
	 * buf (the cursor buffer), pack (its format iterator) and p/end (the
	 * write position and the end of the data in buf).
	 */
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				/*
				 * Record-number keys: point the key buffer at
				 * the cursor's recno field and use format "R".
				 */
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;

		case WT_PROJ_VALUE:
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;
		}

		/* We have to get a key or value before any operations. */
		WT_ASSERT(session, buf != NULL);

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_SKIP:
				/* Step over one column in the target. */
				WT_RET(__pack_next(&pack, &pv));
				/*
				 * A nasty case: if we are inserting
				 * out-of-order, we may reach the end of the
				 * data.  That's okay: we want to append in
				 * that case, and we're positioned to do that.
				 */
				if (p == end) {
					/* Set up an empty value. */
					WT_CLEAR(pv.u);
					if (pv.type == 'S' || pv.type == 's')
						pv.u.s = "";

					WT_RET(__pack_size(session, &pv, &len));
					WT_RET(__wt_buf_grow(session,
					    buf, buf->size + len));
					/*
					 * Recompute the pointers from the
					 * buffer after the grow call rather
					 * than reusing the old ones.
					 */
					p = (uint8_t *)buf->mem + buf->size;
					WT_RET(__pack_write(
					    session, &pv, &p, len));
					buf->size += len;
					end = (uint8_t *)buf->mem + buf->size;
				} else
					/*
					 * Existing column: just move past it.
					 * (We are inside the skip case, so no
					 * further test of *proj is needed.)
					 */
					WT_RET(__unpack_read(session,
					    &pv, (const uint8_t **)&p,
					    (size_t)(end - p)));
				break;

			case WT_PROJ_NEXT:
				/*
				 * Advance the target format and load the
				 * next application argument into pv.
				 */
				WT_RET(__pack_next(&pack, &pv));
				WT_PACK_GET(session, pv, ap);
				/* FALLTHROUGH */

			case WT_PROJ_REUSE:
				/*
				 * Read the item we're about to overwrite.
				 * A copy of pv is used so the value to be
				 * written is not clobbered; old_len is the
				 * packed size of the column currently at p,
				 * or 0 when appending.
				 */
				next = p;
				if (p < end) {
					old_pv = pv;
					WT_RET(__unpack_read(session, &old_pv,
					    &next, (size_t)(end - p)));
				}
				old_len = (size_t)(next - p);

				WT_RET(__pack_size(session, &pv, &len));
				/* Remember the position as an offset across the grow. */
				offset = WT_PTRDIFF(p, buf->mem);
				WT_RET(__wt_buf_grow(session,
				    buf, buf->size + len));
				p = (uint8_t *)buf->mem + offset;
				end = (uint8_t *)buf->mem + buf->size + len;
				/* Make room if we're inserting out-of-order. */
				if (offset + old_len < buf->size)
					memmove(p + len, p + old_len,
					    buf->size - (offset + old_len));
				WT_RET(__pack_write(session, &pv, &p, len));
				/*
				 * NOTE(review): the size (and end, above)
				 * grow by len even when old_len bytes were
				 * replaced, whereas the slice variant of this
				 * routine adjusts by len - old_len.  Confirm
				 * whether this is intended when old_len != 0.
				 */
				buf->size += len;
				break;

			default:
				WT_RET_MSG(session, EINVAL,
				    "unexpected projection plan: %c",
				    (int)*proj);
			}
		}
	}

	return (0);
}
コード例 #14
0
ファイル: schema_project.c プロジェクト: qixin/wiredtiger
/*
 * __wt_schema_project_merge --
 *	Given a list of cursors and a projection, build a buffer containing
 *	the column values read from the cursors.
 *
 *	session: session handle passed to every helper.
 *	cp: cursor array, indexed by the number preceding each key/value
 *	    operation in the plan.
 *	proj_arg: projection plan, a string of optionally count-prefixed
 *	    operation characters.
 *	vformat: pack format used for the columns written to value.
 *	value: output buffer; it is reset and then appended to.
 *
 *	Returns 0 on success.  Helper failures are handed back through WT_RET,
 *	and an unrecognized plan character is reported with EINVAL.
 */
int
__wt_schema_project_merge(WT_SESSION_IMPL *session,
    WT_CURSOR **cp, const char *proj_arg, const char *vformat, WT_ITEM *value)
{
	WT_CURSOR *c;
	WT_ITEM *buf;
	WT_PACK pack, vpack;
	WT_PACK_VALUE pv, vpv;
	char *proj;
	uint8_t *p, *end, *vp;
	size_t len;
	uint32_t arg;

	WT_CLEAR(pack);         /* -Wuninitialized */
	WT_CLEAR(pv);           /* -Wuninitialized */
	WT_CLEAR(vpv);          /* -Wuninitialized */
	p = end = NULL;         /* -Wuninitialized */

	/* Start with an empty output buffer and its format iterator. */
	WT_RET(__wt_buf_init(session, value, 0));
	WT_RET(__pack_init(session, &vpack, vformat));

	/*
	 * Execute the plan.  strtoul moves proj past the optional leading
	 * number, leaving it on the operation character.  The current source
	 * is tracked by pack (its format iterator) and p/end (the read
	 * position and the end of the data in the cursor buffer).
	 */
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = (uint32_t)strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				/*
				 * Record-number keys: point the key buffer at
				 * the cursor's recno field and use format "R".
				 */
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;

		case WT_PROJ_VALUE:
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;
		}

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_NEXT:
			case WT_PROJ_SKIP:
				/* Read one column from the source cursor. */
				WT_RET(__pack_next(&pack, &pv));
				WT_RET(__unpack_read(session, &pv,
				    (const uint8_t **)&p,
				    (size_t)(end - p)));
				/* A skipped column is read but not copied. */
				if (*proj == WT_PROJ_SKIP)
					break;

				/*
				 * Re-pack the column with the output format's
				 * type and append it to the output buffer.
				 */
				WT_RET(__pack_next(&vpack, &vpv));
				vpv.u = pv.u;
				len = __pack_size(session, &vpv);
				WT_RET(__wt_buf_grow(session,
				    value, value->size + len));
				/*
				 * Compute the write position from the buffer
				 * after the grow call.
				 */
				vp = (uint8_t *)value->data + value->size;
				WT_RET(__pack_write(session, &vpv, &vp, len));
				value->size += WT_STORE_SIZE(len);
				/* FALLTHROUGH */

			case WT_PROJ_REUSE:
				/* Don't copy the same value twice. */
				break;

			default:
				/*
				 * Reject unknown operations instead of
				 * silently ignoring them, matching the other
				 * projection functions.
				 */
				WT_RET_MSG(session, EINVAL,
				    "unexpected projection plan: %c",
				    (int)*proj);
			}
		}
	}

	return (0);
}
コード例 #15
0
ファイル: schema_project.c プロジェクト: qixin/wiredtiger
/*
 * __wt_schema_project_slice --
 *	Given a list of cursors and a projection, read columns from a raw
 *	buffer and store them into the cursors' key and value buffers.
 *
 *	session: session handle passed to every helper.
 *	cp: cursor array, indexed by the number preceding each key/value
 *	    operation in the plan.
 *	proj_arg: projection plan, a string of optionally count-prefixed
 *	    operation characters.
 *	key_only: when non-zero, cursor values are neither reset nor written;
 *	    columns are still consumed from the source so later keys line up.
 *	vformat: pack format describing the columns in value.
 *	value: packed source buffer.
 *
 *	Returns 0 on success.  Helper failures are handed back through WT_RET,
 *	and an unrecognized plan character is reported with EINVAL.
 */
int
__wt_schema_project_slice(WT_SESSION_IMPL *session, WT_CURSOR **cp,
    const char *proj_arg, int key_only, const char *vformat, WT_ITEM *value)
{
	WT_CURSOR *c;
	WT_ITEM *buf;
	WT_PACK pack, vpack;
	WT_PACK_VALUE pv, vpv;
	char *proj;
	uint8_t *end, *p;
	const uint8_t *next, *vp, *vend;
	size_t len, offset, old_len;
	uint32_t arg;
	int skip;

	/*
	 * NOTE(review): pv is not cleared here, unlike pack and vpv; it is
	 * first set by __pack_next in the next/skip case below.
	 */
	WT_CLEAR(pack);         /* -Wuninitialized */
	WT_CLEAR(vpv);          /* -Wuninitialized */
	buf = NULL;             /* -Wuninitialized */
	p = end = NULL;         /* -Wuninitialized */

	/* Set up to walk the source: vp is the read position, vend its end. */
	WT_RET(__pack_init(session, &vpack, vformat));
	vp = (uint8_t *)value->data;
	vend = vp + value->size;

	/*
	 * Reset any of the buffers we will be setting.
	 *
	 * strtoul moves proj past the optional leading number, leaving it on
	 * the operation character; only key/value operations matter here.
	 */
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = (uint32_t)strtoul(proj, &proj, 10);
		if (*proj == WT_PROJ_KEY) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->key, 0));
		} else if (*proj == WT_PROJ_VALUE && !key_only) {
			c = cp[arg];
			WT_RET(__wt_buf_init(session, &c->value, 0));
		}
	}

	/*
	 * Second pass: execute the plan.  The current target is tracked by
	 * buf (the cursor buffer), pack (its format iterator) and p/end (the
	 * write position and the end of the data in buf).
	 */
	skip = key_only;
	for (proj = (char *)proj_arg; *proj != '\0'; proj++) {
		arg = (uint32_t)strtoul(proj, &proj, 10);

		switch (*proj) {
		case WT_PROJ_KEY:
			/* Keys are always written, even in key-only mode. */
			skip = 0;
			c = cp[arg];
			if (WT_CURSOR_RECNO(c)) {
				/*
				 * Record-number keys: point the key buffer at
				 * the cursor's recno field and use format "R".
				 */
				c->key.data = &c->recno;
				c->key.size = sizeof(c->recno);
				WT_RET(__pack_init(session, &pack, "R"));
			} else
				WT_RET(__pack_init(
				    session, &pack, c->key_format));
			buf = &c->key;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;

		case WT_PROJ_VALUE:
			/* In key-only mode, ignore operations on this value. */
			if ((skip = key_only) != 0)
				continue;
			c = cp[arg];
			WT_RET(__pack_init(session, &pack, c->value_format));
			buf = &c->value;
			p = (uint8_t *)buf->data;
			end = p + buf->size;
			continue;
		}

		/*
		 * Otherwise, the argument is a count, where a missing
		 * count means a count of 1.
		 */
		for (arg = (arg == 0) ? 1 : arg; arg > 0; arg--) {
			switch (*proj) {
			case WT_PROJ_NEXT:
			case WT_PROJ_SKIP:
				/*
				 * Both operations advance the target's format
				 * iterator (unless skipping); only "skip"
				 * moves the target position here, "next"
				 * goes on to read a source column below.
				 */
				if (!skip) {
					WT_RET(__pack_next(&pack, &pv));

					/*
					 * A nasty case: if we are inserting
					 * out-of-order, append a zero value
					 * to keep the buffer in the correct
					 * format.
					 */
					if (*proj == WT_PROJ_SKIP &&
					    p == end) {
						/* Set up an empty value. */
						WT_CLEAR(pv.u);
						if (pv.type == 'S' ||
						    pv.type == 's')
							pv.u.s = "";

						len = __pack_size(session, &pv);
						WT_RET(__wt_buf_grow(session,
						    buf, buf->size + len));
						/*
						 * Recompute p from buf->data
						 * after the grow call.
						 */
						p = (uint8_t *)buf->data +
						    buf->size;
						WT_RET(__pack_write(
						    session, &pv, &p, len));
						end = p;
						buf->size += WT_STORE_SIZE(len);
					} else if (*proj == WT_PROJ_SKIP)
						/* Existing column: move past it. */
						WT_RET(__unpack_read(session,
						    &pv, (const uint8_t **)&p,
						    (size_t)(end - p)));
				}
				if (*proj == WT_PROJ_SKIP)
					break;
				/*
				 * Read the next column from the source value
				 * (done even when skipping, so the source
				 * position stays in step), then store it.
				 */
				WT_RET(__pack_next(&vpack, &vpv));
				WT_RET(__unpack_read(session, &vpv,
				    &vp, (size_t)(vend - vp)));
				/* FALLTHROUGH */
			case WT_PROJ_REUSE:
				/*
				 * Store the last-read source column (vpv).
				 *
				 * NOTE(review): a "reuse" operation reaches
				 * here without calling __pack_next on the
				 * target, so pv.type is whatever the previous
				 * operation left behind -- confirm intended.
				 */
				if (skip)
					break;

				/*
				 * Read the item we're about to overwrite;
				 * old_len is the packed size of the column
				 * currently at p, or 0 when appending.
				 */
				next = p;
				if (p < end)
					WT_RET(__unpack_read(session, &pv,
					    &next, (size_t)(end - p)));
				old_len = (size_t)(next - p);

				/*
				 * There is subtlety here: the value format
				 * may not exactly match the cursor's format.
				 * In particular, we need lengths with raw
				 * columns in the middle of a packed struct,
				 * but not if they are at the end of a column.
				 */
				pv.u = vpv.u;

				len = __pack_size(session, &pv);
				/* Remember the position as an offset across the grow. */
				offset = WT_PTRDIFF(p, buf->data);
				WT_RET(__wt_buf_grow(session,
				    buf, buf->size + len - old_len));
				p = (uint8_t *)buf->data + offset;
				/* Make room if we're inserting out-of-order. */
				if (offset + old_len < buf->size)
					memmove(p + len, p + old_len,
					    buf->size - (offset + old_len));
				WT_RET(__pack_write(session, &pv, &p, len));
				/* Account for the replaced column's size change. */
				buf->size += WT_STORE_SIZE(len - old_len);
				end = (uint8_t *)buf->data + buf->size;
				break;
			default:
				WT_RET_MSG(session, EINVAL,
				    "unexpected projection plan: %c",
				    (int)*proj);
			}
		}
	}

	return (0);
}
コード例 #16
0
ファイル: main.c プロジェクト: ajdavis/mongo
/*
 * slow_apply_api --
 *	Apply a set of modification changes using a different algorithm.
 *
 *	The changes come from the file-scope entries[] array (nentries
 *	elements).  Each change is applied by rebuilding the whole buffer into
 *	a second buffer and then swapping the two.  On return, orig holds the
 *	final result.  The second buffer is a function-static WT_ITEM, so its
 *	memory is kept and reused across calls.
 */
static void
slow_apply_api(WT_ITEM *orig)
{
	static WT_ITEM _tb;
	WT_ITEM *ta, *tb, *tmp, _tmp;
	size_t len, size;
	int i;

	ta = orig;
	tb = &_tb;

	/*
	 * Mess up anything not initialized in the buffers.
	 *
	 * Only touch memory that exists: the static scratch buffer has no
	 * memory on the first call, and passing a null pointer to memset is
	 * undefined even for a zero length.  The size comparison also keeps
	 * the length subtraction from wrapping.
	 */
	if (ta->memsize > ta->size)
		memset((uint8_t *)ta->mem + ta->size,
		    0xff, ta->memsize - ta->size);
	if (tb->memsize > 0)
		memset((uint8_t *)tb->mem, 0xff, tb->memsize);

	/*
	 * Process the entries to figure out how large a buffer we need. This is
	 * a bit pessimistic because we're ignoring replacement bytes, but it's
	 * a simpler calculation.
	 */
	for (size = ta->size, i = 0; i < nentries; ++i) {
		if (entries[i].offset >= size)
			size = entries[i].offset;
		size += entries[i].data.size;
	}

	/* Both buffers must be able to hold the largest intermediate result. */
	testutil_check(__wt_buf_grow(NULL, ta, size));
	testutil_check(__wt_buf_grow(NULL, tb, size));

#if DEBUG
	show(ta, "slow-apply start");
#endif
	/*
	 * From the starting buffer, create a new buffer b based on changes
	 * in the entries array. We're doing a brute force solution here to
	 * test the faster solution implemented in the library.
	 */
	for (i = 0; i < nentries; ++i) {
		/* Take leading bytes from the original, plus any gap bytes. */
		if (entries[i].offset >= ta->size) {
			memcpy(tb->mem, ta->mem, ta->size);
			/* A change past the end leaves a zero-filled gap. */
			if (entries[i].offset > ta->size)
				memset((uint8_t *)tb->mem + ta->size,
				    '\0', entries[i].offset - ta->size);
		} else
			if (entries[i].offset > 0)
				memcpy(tb->mem, ta->mem, entries[i].offset);
		tb->size = entries[i].offset;

		/* Take replacement bytes. */
		if (entries[i].data.size > 0) {
			memcpy((uint8_t *)tb->mem + tb->size,
			    entries[i].data.data, entries[i].data.size);
			tb->size += entries[i].data.size;
		}

		/*
		 * Take trailing bytes from the original: everything after the
		 * replaced range [offset, offset + size).
		 */
		len = entries[i].offset + entries[i].size;
		if (ta->size > len) {
			memcpy((uint8_t *)tb->mem + tb->size,
			    (uint8_t *)ta->mem + len, ta->size - len);
			tb->size += ta->size - len;
		}
		testutil_assert(tb->size <= size);

		/* Swap the buffers and do it again. */
		tmp = ta;
		ta = tb;
		tb = tmp;
	}
	/* The loop wrote through mem; make data refer to the same memory. */
	ta->data = ta->mem;
	tb->data = tb->mem;

	/*
	 * The final results may not be in the original buffer, in which case
	 * we swap them back around.  The structures themselves are exchanged,
	 * so orig ends up holding the result and the static scratch item ends
	 * up holding the other buffer.
	 */
	if (ta != orig) {
		_tmp = *ta;
		*ta = *tb;
		*tb = _tmp;
	}

#if DEBUG
	/*
	 * Show orig, not ta: after a swap-back, ta is the scratch item and
	 * no longer holds the result.
	 */
	show(orig, "slow-apply finish");
#endif
}