Example #1
/*
 * __wt_btcur_update --
 *	Update a record in the tree.
 */
int
__wt_btcur_update(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	WT_BSTAT_INCR(session, cursor_updates);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

retry:	__cursor_func_init(cbt, 1);

	switch (btree->type) {
	case BTREE_COL_FIX:
		if (cursor->value.size != 1)
			WT_ERR_MSG(session, EINVAL,
			    "item size of %" PRIu32 " does not match "
			    "fixed-length file requirement of 1 byte",
			    cursor->value.size);
		/* FALLTHROUGH */
	case BTREE_COL_VAR:
		WT_ERR(__wt_col_search(session, cbt, 1));

		/*
		 * Update the record if it exists.  Creating a record past the
		 * end of the tree in a fixed-length column-store implicitly
		 * fills the gap with empty records.  Update the record in that
		 * case, the record exists.
		 */
		if ((cbt->compare != 0 || __cursor_invalid(cbt)) &&
		    !__cursor_fix_implicit(btree, cbt))
			ret = WT_NOTFOUND;
		else if ((ret = __wt_col_modify(session, cbt, 3)) == WT_RESTART)
			goto retry;
		break;
	case BTREE_ROW:
		/* Update the record if it exists. */
		WT_ERR(__wt_row_search(session, cbt, 1));
		if (cbt->compare != 0 || __cursor_invalid(cbt))
			ret = WT_NOTFOUND;
		else if ((ret = __wt_row_modify(session, cbt, 0)) == WT_RESTART)
			goto retry;
		break;
	WT_ILLEGAL_VALUE(session);
	}

err:	__cursor_func_resolve(cbt, ret);

	return (ret);
}
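The function above sits behind the public WT_CURSOR::update method. As a rough illustration of how it is reached in practice, here is a minimal sketch against the public WiredTiger API; the table URI, key/value formats, and helper name are assumptions for the example, not taken from the code above.

#include <wiredtiger.h>

/*
 * example_update --
 *	Illustration only: drive __wt_btcur_update through the public
 * WT_CURSOR::update method.  URI and formats are assumptions.
 */
static int
example_update(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	int ret;

	/* Assumes "table:example" exists with key_format=S,value_format=S. */
	if ((ret = session->open_cursor(
	    session, "table:example", NULL, NULL, &cursor)) != 0)
		return (ret);

	cursor->set_key(cursor, "some-key");
	cursor->set_value(cursor, "new-value");

	/* Per __wt_btcur_update above, a missing key yields WT_NOTFOUND. */
	ret = cursor->update(cursor);

	(void)cursor->close(cursor);
	return (ret);
}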
Example #2
/*
 * __wt_btcur_reset --
 *	Invalidate the cursor position.
 */
int
__wt_btcur_reset(WT_CURSOR_BTREE *cbt)
{
	WT_SESSION_IMPL *session;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	WT_BSTAT_INCR(session, cursor_resets);

	__cursor_func_init(cbt, 1);
	__cursor_search_clear(cbt);

	return (0);
}
Example #3
/*
 * __wt_btcur_remove --
 *	Remove a record from the tree.
 */
int
__wt_btcur_remove(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	WT_BSTAT_INCR(session, cursor_removes);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

retry:	__cursor_func_init(cbt, 1);

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		WT_ERR(__wt_col_search(session, cbt, 1));

		/*
		 * Remove the record if it exists.  Creating a record past the
		 * end of the tree in a fixed-length column-store implicitly
		 * fills the gap with empty records.  Return success in that
		 * case, the record was deleted successfully.
		 */
		if (cbt->compare != 0 || __cursor_invalid(cbt))
			ret =
			    __cursor_fix_implicit(btree, cbt) ? 0 : WT_NOTFOUND;
		else if ((ret = __wt_col_modify(session, cbt, 2)) == WT_RESTART)
			goto retry;
		break;
	case BTREE_ROW:
		/* Remove the record if it exists. */
		WT_ERR(__wt_row_search(session, cbt, 1));
		if (cbt->compare != 0 || __cursor_invalid(cbt))
			ret = WT_NOTFOUND;
		else if ((ret = __wt_row_modify(session, cbt, 1)) == WT_RESTART)
			goto retry;
		break;
	WT_ILLEGAL_VALUE(session);
	}

err:	__cursor_func_resolve(cbt, ret);

	return (ret);
}
Example #4
/*
 * __wt_btcur_search --
 *	Search for a matching record in the tree.
 */
int
__wt_btcur_search(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	int ret;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	WT_BSTAT_INCR(session, cursor_read);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	__cursor_func_init(cbt, 1);

	WT_ERR(btree->type == BTREE_ROW ?
	    __wt_row_search(session, cbt, 0) :
	    __wt_col_search(session, cbt, 0));
	if (cbt->compare != 0 || __cursor_invalid(cbt)) {
		/*
		 * Creating a record past the end of the tree in a fixed-length
		 * column-store implicitly fills the gap with empty records.
		 */
		if (__cursor_fix_implicit(btree, cbt)) {
			cbt->v = 0;
			val = &cbt->iface.value;
			val->data = &cbt->v;
			val->size = 1;
		} else
			ret = WT_NOTFOUND;
	} else
		ret = __wt_kv_return(session, cbt, 0);

err:	__cursor_func_resolve(cbt, ret);

	return (ret);
}
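The interesting branch above is the fixed-length column-store case, where a search that lands past the end of the tree still returns a value because the gap is treated as implicitly filled with empty records. A hedged sketch of that behavior through the public API follows; the table name, formats, and record number are assumptions for the example.

#include <stdint.h>
#include <wiredtiger.h>

/*
 * example_flcs_search --
 *	Illustration only: exercise the implicit-record branch above in a
 * fixed-length column-store.  Table name and record number are assumptions.
 */
static int
example_flcs_search(WT_SESSION *session)
{
	WT_CURSOR *cursor;
	uint8_t bits;
	int ret;

	/* Assumes "table:bits" was created with key_format=r,value_format=8t. */
	if ((ret = session->open_cursor(
	    session, "table:bits", NULL, NULL, &cursor)) != 0)
		return (ret);

	/*
	 * Search a record number past the end of the tree: per the
	 * __cursor_fix_implicit branch in __wt_btcur_search above, the
	 * search still succeeds and returns a single zero byte.
	 */
	cursor->set_key(cursor, (uint64_t)1000);
	if ((ret = cursor->search(cursor)) == 0)
		ret = cursor->get_value(cursor, &bits);

	(void)cursor->close(cursor);
	return (ret);
}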
Example #5
/*
 * __hazard_exclusive --
 *	Request exclusive access to a page.
 */
static int
__hazard_exclusive(WT_SESSION_IMPL *session, WT_REF *ref, int top)
{
	/*
	 * Make sure there is space to track exclusive access so we can unlock
	 * to clean up.
	 */
	if (session->excl_next * sizeof(WT_REF *) == session->excl_allocated)
		WT_RET(__wt_realloc(session, &session->excl_allocated,
		    (session->excl_next + 50) * sizeof(WT_REF *),
		    &session->excl));

	/*
	 * Hazard references are acquired down the tree, which means we can't
	 * deadlock.
	 *
	 * Request exclusive access to the page.  The top-level page should
	 * already be in the locked state, lock child pages in memory.
	 * If another thread already has this page, give up.
	 */
	if (!top && !WT_ATOMIC_CAS(ref->state, WT_REF_MEM, WT_REF_LOCKED))
		return (EBUSY);	/* We couldn't change the state. */
	WT_ASSERT(session, ref->state == WT_REF_LOCKED);

	session->excl[session->excl_next++] = ref;

	/* Check for a matching hazard reference. */
	if (__wt_page_hazard_check(session, ref->page) == NULL)
		return (0);

	WT_BSTAT_INCR(session, rec_hazard);
	WT_CSTAT_INCR(session, cache_evict_hazard);

	WT_VERBOSE_RET(
	    session, evict, "page %p hazard request failed", ref->page);
	return (EBUSY);
}
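The core of this function is a single compare-and-swap that moves a child page from WT_REF_MEM to WT_REF_LOCKED; if another thread wins the race, the caller backs off with EBUSY instead of blocking. The following is a minimal, self-contained sketch of the same idiom written with C11 atomics; it illustrates the pattern and is not WiredTiger's WT_ATOMIC_CAS macro.

#include <stdatomic.h>
#include <stdbool.h>

/*
 * ref_try_lock --
 *	Illustration only: the compare-and-swap idiom behind the state
 * transition above, written with C11 atomics rather than WT_ATOMIC_CAS.
 */
enum ref_state { REF_MEM, REF_LOCKED };

struct ref {
	_Atomic enum ref_state state;
};

static bool
ref_try_lock(struct ref *ref)
{
	enum ref_state expected = REF_MEM;

	/* Succeeds only if no other thread changed the state first. */
	return (atomic_compare_exchange_strong(
	    &ref->state, &expected, REF_LOCKED));
}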
Example #6
/*
 * __wt_btcur_prev --
 *	Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt)
{
	WT_DECL_RET;
	WT_SESSION_IMPL *session;
	int newpage;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	WT_BSTAT_INCR(session, cursor_read_prev);

	__cursor_func_init(cbt, 0);

	/*
	 * If we aren't already iterating in the right direction, there's
	 * some setup to do.
	 */
	if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
		__wt_btcur_iterate_setup(cbt, 0);

	/*
	 * Walk any page we're holding until the underlying call returns not-
	 * found.  Then, move to the previous page, until we reach the start
	 * of the file.
	 */
	for (newpage = 0;; newpage = 1) {
		if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
			switch (cbt->page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_append_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_append_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret == 0)
				break;
			F_CLR(cbt, WT_CBT_ITERATE_APPEND);
			if (ret != WT_NOTFOUND)
				break;
			newpage = 1;
		}
		if (cbt->page != NULL) {
			switch (cbt->page->type) {
			case WT_PAGE_COL_FIX:
				ret = __cursor_fix_prev(cbt, newpage);
				break;
			case WT_PAGE_COL_VAR:
				ret = __cursor_var_prev(cbt, newpage);
				break;
			case WT_PAGE_ROW_LEAF:
				ret = __cursor_row_prev(cbt, newpage);
				break;
			WT_ILLEGAL_VALUE_ERR(session);
			}
			if (ret != WT_NOTFOUND)
				break;
		}

		do {
			WT_ERR(__wt_tree_np(session, &cbt->page, 0, 0));
			WT_ERR_TEST(cbt->page == NULL, WT_NOTFOUND);
		} while (
		    cbt->page->type == WT_PAGE_COL_INT ||
		    cbt->page->type == WT_PAGE_ROW_INT);

		/*
		 * The last page in a column-store has appended entries.
		 * We handle it separately from the usual cursor code:
		 * it's only that one page and it's in a simple format.
		 */
		if (cbt->page->type != WT_PAGE_ROW_LEAF &&
		    (cbt->ins_head = WT_COL_APPEND(cbt->page)) != NULL)
			F_SET(cbt, WT_CBT_ITERATE_APPEND);
	}

err:	__cursor_func_resolve(cbt, ret);
	return (ret);
}
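Through the public API this function backs WT_CURSOR::prev, which returns WT_NOTFOUND once the walk falls off the start of the file. A minimal reverse-scan sketch follows; the key/value formats and helper name are assumptions for the example.

#include <stdio.h>
#include <wiredtiger.h>

/*
 * example_reverse_scan --
 *	Illustration only: WT_CURSOR::prev maps to __wt_btcur_prev and
 * returns WT_NOTFOUND when there are no more records to visit.
 */
static int
example_reverse_scan(WT_CURSOR *cursor)
{
	const char *key, *value;
	int ret;

	/* Assumes a row-store cursor with key_format=S,value_format=S. */
	while ((ret = cursor->prev(cursor)) == 0) {
		if ((ret = cursor->get_key(cursor, &key)) != 0 ||
		    (ret = cursor->get_value(cursor, &value)) != 0)
			break;
		printf("%s -> %s\n", key, value);
	}
	return (ret == WT_NOTFOUND ? 0 : ret);
}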
Example #7
/*
 * __wt_btcur_insert --
 *	Insert a record into the tree.
 */
int
__wt_btcur_insert(WT_CURSOR_BTREE *cbt)
{
	WT_BTREE *btree;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	WT_BSTAT_INCR(session, cursor_inserts);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));
	WT_RET(__cursor_size_chk(session, &cursor->value));

retry:	__cursor_func_init(cbt, 1);

	switch (btree->type) {
	case BTREE_COL_FIX:
	case BTREE_COL_VAR:
		/*
		 * If WT_CURSTD_APPEND is set, insert a new record (ignoring
		 * the application's record number).  First we search for the
		 * maximum possible record number so the search ends on the
		 * last page.  The real record number is assigned by the
		 * serialized append operation, __wt_col_append_serial_func.
		 */
		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = UINT64_MAX;

		WT_ERR(__wt_col_search(session, cbt, 1));

		if (F_ISSET(cursor, WT_CURSTD_APPEND))
			cbt->iface.recno = 0;

		/*
		 * If WT_CURSTD_OVERWRITE set, insert/update the key/value pair.
		 *
		 * If WT_CURSTD_OVERWRITE not set, fail if the key exists, else
		 * insert the key/value pair.  Creating a record past the end
		 * of the tree in a fixed-length column-store implicitly fills
		 * the gap with empty records.  Fail in that case, the record
		 * exists.
		 */
		if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
		    ((cbt->compare == 0 && !__cursor_invalid(cbt)) ||
		    (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)))) {
			ret = WT_DUPLICATE_KEY;
			break;
		}
		if ((ret = __wt_col_modify(session, cbt, 3)) == WT_RESTART)
			goto retry;
		if (F_ISSET(cursor, WT_CURSTD_APPEND) && ret == 0)
			cbt->iface.recno = cbt->recno;
		break;
	case BTREE_ROW:
		/*
		 * If WT_CURSTD_OVERWRITE not set, fail if the key exists, else
		 * insert the key/value pair.
		 *
		 * If WT_CURSTD_OVERWRITE set, insert/update the key/value pair.
		 */
		WT_ERR(__wt_row_search(session, cbt, 1));
		if (cbt->compare == 0 &&
		    !__cursor_invalid(cbt) &&
		    !F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
			ret = WT_DUPLICATE_KEY;
			break;
		}
		if ((ret = __wt_row_modify(session, cbt, 0)) == WT_RESTART)
			goto retry;
		break;
	WT_ILLEGAL_VALUE(session);
	}

err:	__cursor_func_resolve(cbt, ret);

	return (ret);
}
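The WT_CURSTD_APPEND branch above is driven from the public API by opening a column-store cursor with the "append" configuration; the record number assigned by the serialized append can then be read back with get_key. A short sketch follows; the table name and formats are assumptions for the example.

#include <stdint.h>
#include <wiredtiger.h>

/*
 * example_append --
 *	Illustration only: the "append" cursor configuration drives the
 * WT_CURSTD_APPEND branch above; the assigned record number is returned
 * through get_key.
 */
static int
example_append(WT_SESSION *session, uint64_t *recnop)
{
	WT_CURSOR *cursor;
	int ret;

	/* Assumes "table:log" was created with key_format=r,value_format=S. */
	if ((ret = session->open_cursor(
	    session, "table:log", NULL, "append", &cursor)) != 0)
		return (ret);

	cursor->set_value(cursor, "new entry");
	if ((ret = cursor->insert(cursor)) == 0)
		ret = cursor->get_key(cursor, recnop);

	(void)cursor->close(cursor);
	return (ret);
}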
Example #8
/*
 * __wt_btcur_search_near --
 *	Search for a record in the tree.
 */
int
__wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exact)
{
	WT_BTREE *btree;
	WT_ITEM *val;
	WT_CURSOR *cursor;
	WT_SESSION_IMPL *session;
	int ret;

	btree = cbt->btree;
	cursor = &cbt->iface;
	session = (WT_SESSION_IMPL *)cursor->session;
	WT_BSTAT_INCR(session, cursor_read_near);

	if (btree->type == BTREE_ROW)
		WT_RET(__cursor_size_chk(session, &cursor->key));

	__cursor_func_init(cbt, 1);

	WT_ERR(btree->type == BTREE_ROW ?
	    __wt_row_search(session, cbt, 0) :
	    __wt_col_search(session, cbt, 0));

	/*
	 * Creating a record past the end of the tree in a fixed-length column-
	 * store implicitly fills the gap with empty records.  In this case, we
	 * instantiate the empty record, it's an exact match.
	 *
	 * Else, if we find a valid key (one that wasn't deleted), return it.
	 *
	 * Else, if we found a deleted key, try to move to the next key in the
	 * tree (bias for prefix searches).  Cursor next skips deleted records,
	 * so we don't have to test for them again.
	 *
	 * Else if there's no larger tree key, redo the search and try and find
	 * an earlier record.  If that fails, quit, there's no record to return.
	 */
	if (cbt->compare != 0 && __cursor_fix_implicit(btree, cbt)) {
		cbt->v = 0;
		val = &cbt->iface.value;
		val->data = &cbt->v;
		val->size = 1;
		*exact = 0;
	} else if (!__cursor_invalid(cbt)) {
		*exact = cbt->compare;
		ret = __wt_kv_return(session, cbt, cbt->compare == 0 ? 0 : 1);
	} else if ((ret = __wt_btcur_next(cbt)) != WT_NOTFOUND)
		*exact = 1;
	else {
		WT_ERR(btree->type == BTREE_ROW ?
		    __wt_row_search(session, cbt, 0) :
		    __wt_col_search(session, cbt, 0));
		if (!__cursor_invalid(cbt)) {
			*exact = cbt->compare;
			ret = __wt_kv_return(
			    session, cbt, cbt->compare == 0 ? 0 : 1);
		} else if ((ret = __wt_btcur_prev(cbt)) != WT_NOTFOUND)
			*exact = -1;
	}

err:	__cursor_func_resolve(cbt, ret);
	return (ret);
}
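At the public level this is WT_CURSOR::search_near, whose "exact" output mirrors the compare values set above: zero for an exact match, positive when positioned on a larger key, negative when positioned on a smaller one. A minimal sketch, assuming a string-keyed cursor:

#include <wiredtiger.h>

/*
 * example_search_near --
 *	Illustration only: position a cursor near a key and report where
 * it landed, following the "exact" convention of the code above.
 */
static int
example_search_near(WT_CURSOR *cursor, const char *key, int *exactp)
{
	int ret;

	cursor->set_key(cursor, key);
	if ((ret = cursor->search_near(cursor, exactp)) != 0)
		return (ret);		/* WT_NOTFOUND: nothing to return. */

	/*
	 * *exactp == 0: positioned on an exact match for the search key.
	 * *exactp > 0:  positioned on the nearest key larger than the key.
	 * *exactp < 0:  positioned on the nearest key smaller than the key.
	 */
	return (0);
}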
Example #9
/*
 * __wt_block_read_off --
 *	Read the block referenced by an addr/size pair into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
    WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *blk;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_PAGE_HEADER *dsk;
	size_t result_len;
	uint32_t page_cksum;

	WT_VERBOSE_RET(session, read,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the block we're about to read isn't on
	 * either the available or discard lists.
	 *
	 * Don't check during salvage, it's possible we're reading an already
	 * freed overflow page.
	 */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
		WT_RET(
		    __wt_block_misplaced(session, block, "read", offset, size));
#endif

	/*
	 * If we're compressing the file blocks, place the initial read into a
	 * scratch buffer, we're going to have to re-allocate more memory for
	 * decompression.  Else check the caller's buffer size and grow it as
	 * necessary, there will only be one buffer.
	 */
	if (block->compressor == NULL) {
		F_SET(buf, WT_ITEM_ALIGNED);
		WT_RET(__wt_buf_init(session, buf, size));
		buf->size = size;
		dsk = buf->mem;
	} else {
		WT_RET(__wt_scr_alloc(session, size, &tmp));
		tmp->size = size;
		dsk = tmp->mem;
	}

	/* Read. */
	WT_ERR(__wt_read(session, block->fh, offset, size, dsk));
	blk = WT_BLOCK_HEADER_REF(dsk);

	/* Validate the checksum. */
	if (block->checksum &&
	    cksum != WT_BLOCK_CHECKSUM_NOT_SET &&
	    blk->cksum != WT_BLOCK_CHECKSUM_NOT_SET) {
		blk->cksum = 0;
		page_cksum = __wt_cksum(dsk, size);
		if (page_cksum == WT_BLOCK_CHECKSUM_NOT_SET)
			++page_cksum;
		if (cksum != page_cksum) {
			if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
				__wt_errx(session,
				    "read checksum error [%"
				    PRIu32 "B @ %" PRIuMAX ", %"
				    PRIu32 " != %" PRIu32 "]",
				    size, (uintmax_t)offset, cksum, page_cksum);
			WT_ERR(WT_ERROR);
		}
	}

	/*
	 * If the in-memory block size is larger than the on-disk block size,
	 * the block is compressed.   Size the user's buffer, copy the skipped
	 * bytes of the original image into place, then decompress.
	 *
	 * If the in-memory block size is less than or equal to the on-disk
	 * block size, the block is not compressed.
	 */
	if (blk->disk_size < dsk->size) {
		if (block->compressor == NULL)
			WT_ERR(__wt_illegal_value(session, block->name));

		WT_ERR(__wt_buf_init(session, buf, dsk->size));
		buf->size = dsk->size;

		/*
		 * Note the source length is NOT the number of compressed bytes,
		 * it's the length of the block we just read (minus the skipped
		 * bytes).  We don't store the number of compressed bytes: some
		 * compression engines need that length stored externally, they
		 * don't have markers in the stream to signal the end of the
		 * compressed bytes.  Those engines must store the compressed
		 * byte length somehow, see the snappy compression extension for
		 * an example.
		 */
		memcpy(buf->mem, tmp->mem, WT_BLOCK_COMPRESS_SKIP);
		WT_ERR(block->compressor->decompress(
		    block->compressor, &session->iface,
		    (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP,
		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
		    dsk->size - WT_BLOCK_COMPRESS_SKIP,
		    &result_len));
		if (result_len != dsk->size - WT_BLOCK_COMPRESS_SKIP)
			WT_ERR(__wt_illegal_value(session, block->name));
	} else
		if (block->compressor == NULL)
			buf->size = dsk->size;
		else
			/*
			 * We guessed wrong: there was a compressor, but this
			 * block was not compressed, and now the page is in the
			 * wrong buffer and the buffer may be of the wrong size.
			 * This should be rare, why configure a compressor that
			 * doesn't work?  Allocate a buffer of the right size
			 * (we used a scratch buffer which might be large), and
			 * copy the data into place.
			 */
			WT_ERR(
			    __wt_buf_set(session, buf, tmp->data, dsk->size));

	WT_BSTAT_INCR(session, page_read);
	WT_CSTAT_INCR(session, block_read);

err:	__wt_scr_free(&tmp);
	return (ret);
}
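The checksum verification above relies on a common idiom: the checksum stored in the block header was computed with that field zeroed, so the reader zeroes it again before recomputing over the whole image. The sketch below shows the same idiom with a hypothetical header layout and a stand-in checksum function; neither is WiredTiger's WT_BLOCK_HEADER or __wt_cksum.

#include <stddef.h>
#include <stdint.h>

/*
 * blk_verify --
 *	Illustration only: the "zero the stored field, then recompute"
 * checksum idiom used above, with hypothetical stand-ins.
 */
struct blk_header {
	uint32_t cksum;		/* Checksum stored when the block was written. */
	/* ... remainder of the on-disk header and page image follow ... */
};

static uint32_t
toy_cksum(const void *p, size_t len)
{
	const uint8_t *b;
	uint32_t sum;
	size_t i;

	/* Not a real CRC; any deterministic function works for the sketch. */
	for (b = p, sum = 0, i = 0; i < len; ++i)
		sum = sum * 31 + b[i];
	return (sum);
}

static int
blk_verify(void *image, size_t size, uint32_t expected)
{
	struct blk_header *blk = image;

	/* The stored checksum was computed with this field zeroed. */
	blk->cksum = 0;
	return (toy_cksum(image, size) == expected ? 0 : -1);
}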