Esempio n. 1
0
/*
 * __cursor_var_append_prev --
 *	Return the previous variable-length entry on the append list.
 */
static inline int
__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	val = &cbt->iface.value;

	if (newpage) {
		cbt->ins = WT_SKIP_LAST(cbt->ins_head);
		goto new_page;
	}

	for (;;) {
		WT_RET(__cursor_skip_prev(cbt));
new_page:	if (cbt->ins == NULL)
			return (WT_NOTFOUND);

		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
		if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL ||
		    WT_UPDATE_DELETED_ISSET(upd))
			continue;
		val->data = WT_UPDATE_DATA(upd);
		val->size = upd->size;
		break;
	}
	return (0);
}
Esempio n. 2
0
/*
 * __cursor_var_append_prev --
 *	Return the previous variable-length entry on the append list.
 */
static inline int
__cursor_var_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	val = &cbt->iface.value;

	if (newpage) {
		cbt->ins = WT_SKIP_LAST(cbt->ins_head);
		goto new_page;
	}

	for (;;) {
		WT_RET(__cursor_skip_prev(cbt));
new_page:	if (cbt->ins == NULL)
			return (WT_NOTFOUND);

		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
		if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL)
			continue;
		if (WT_UPDATE_DELETED_ISSET(upd)) {
			if (__wt_txn_visible_all(session, upd->txnid))
				++cbt->page_deleted_count;
			continue;
		}
		val->data = WT_UPDATE_DATA(upd);
		val->size = upd->size;
		return (0);
	}
	/* NOTREACHED */
}
Esempio n. 3
0
/*
 * __wt_update_alloc --
 *	Allocate a WT_UPDATE structure and associated value and fill it in.
 */
int
__wt_update_alloc(WT_SESSION_IMPL *session,
    WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
{
	WT_UPDATE *upd;
	size_t size;

	/*
	 * Allocate the WT_UPDATE structure and room for the value, then copy
	 * the value into place.
	 */
	size = value == NULL ? 0 : value->size;
	WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd));
	if (value == NULL)
		WT_UPDATE_DELETED_SET(upd);
	else {
		upd->size = WT_STORE_SIZE(size);
		memcpy(WT_UPDATE_DATA(upd), value->data, size);
	}

	*updp = upd;
	if (sizep != NULL)
		*sizep = sizeof(WT_UPDATE) + size;
	return (0);
}
Esempio n. 4
0
/*btree cursor移向下一个记录,仅仅在append list上移动*/
static inline int __cursor_fix_append_next(WT_CURSOR_BTREE* cbt, int newpage)
{
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	val = &cbt->iface.value;

	/*新载入的page,判断ins_head是否为空,如果为空表示没有append的记录*/
	if (newpage){
		if ((cbt->ins = WT_SKIP_FIRST(cbt->ins_head)) == NULL)
			return (WT_NOTFOUND);
	}
	else{
		/*已经到append list的最后一条记录了,后面没有记录*/
		if (cbt->recno >= WT_INSERT_RECNO(cbt->ins) && (cbt->ins = WT_SKIP_NEXT(cbt->ins)) == NULL)
			return (WT_NOTFOUND);
	}
	/*
	* This code looks different from the cursor-previous code.  The append
	* list appears on the last page of the tree, but it may be preceded by
	* other rows, which means the cursor's recno will be set to a value and
	* we simply want to increment it.  If the cursor's recno is NOT set,
	* we're starting our iteration in a tree that has only appended items.
	* In that case, recno will be 0 and happily enough the increment will
	* set it to 1, which is correct.
	*/
	__cursor_set_recno(cbt, cbt->recno + 1);

	/*
	* Fixed-width column store appends are inherently non-transactional.
	* Even a non-visible update by a concurrent or aborted transaction
	* changes the effective end of the data.  The effect is subtle because
	* of the blurring between deleted and empty values, but ideally we
	* would skip all uncommitted changes at the end of the data.  This
	* doesn't apply to variable-width column stores because the implicitly
	* created records written by reconciliation are deleted and so can be
	* never seen by a read.
	*
	* The problem is that we don't know at this point whether there may be
	* multiple uncommitted changes at the end of the data, and it would be
	* expensive to check every time we hit an aborted update.  If an
	* insert is aborted, we simply return zero (empty), regardless of
	* whether we are at the end of the data.
	*/
	if (cbt->recno < WT_INSERT_RECNO(cbt->ins) || (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL){ /*没有可见的记录值,直接返回0*/
		cbt->v = 0;
		val->data = &cbt->v;
	}
	else
		val->data = WT_UPDATE_DATA(upd);
	val->size = 1;
	return 0;
}
Esempio n. 5
0
/*
 * __cursor_fix_prev --
 *	Move to the previous, fixed-length column-store item.
 */
static inline int
__cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_BTREE *btree;
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = session->btree;
	val = &cbt->iface.value;

	/* Initialize for each new page. */
	if (newpage) {
		cbt->last_standard_recno = __col_last_recno(cbt->page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
		goto new_page;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		if (cbt->recno == cbt->page->u.col_fix.recno)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->recno - 1);

new_page:	/* Check any insert list for a matching record. */
		cbt->ins_head = WT_COL_UPDATE_SINGLE(cbt->page);
		cbt->ins = __col_insert_search(
		    cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
		if (cbt->ins != NULL &&
		    cbt->recno != WT_INSERT_RECNO(cbt->ins))
			cbt->ins = NULL;
		upd = cbt->ins == NULL ?
		    NULL : __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			val->data = WT_UPDATE_DATA(upd);
			val->size = 1;
			return (0);
		}

		cbt->v = __bit_getv_recno(cbt->page, cbt->recno, btree->bitcnt);
		val->data = &cbt->v;
		val->size = 1;
		return (0);
	}
	/* NOTREACHED */
}
Esempio n. 6
0
/*btree cursor移向下一个记录(fix col方式存储),在btree树空间上移动*/
static inline int __cursor_fix_next(WT_CURSOR_BTREE* cbt, int newpage)
{
	WT_BTREE *btree;
	WT_ITEM *val;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = S2BT(session);
	page = cbt->ref->page;
	val = &cbt->iface.value;

	/*切换到新的page上做next操作*/
	if (newpage){
		cbt->last_standard_recno = __col_fix_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);

		__cursor_set_recno(cbt, page->pg_fix_recno);
		goto new_page;
	}

	/*记录序号超出最后一个序号,到末尾了*/
	if (cbt->recno >= cbt->last_standard_recno)
		return WT_NOTFOUND;
	__cursor_set_recno(cbt, cbt->recno + 1);

new_page:
	/*获得第一个修改列表*/
	cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
	/*定位recno所在的修改条目*/
	cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
	if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins)) /*不正确的修改定位,将ins设置为NULL表示定位失败*/
		cbt->ins = NULL;
	
	/*做事务隔离读取记录版本*/
	upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
	if (upd == NULL){
		cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt); /*V赋值*/
		val->data = &cbt->v;
	}
	else
		val->data = WT_UPDATE_DATA(upd);
	val->size = 1;

	return 0;
}
Esempio n. 7
0
/*
 * __cursor_fix_prev --
 *	Move to the previous, fixed-length column-store item.
 */
static inline int
__cursor_fix_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_BTREE *btree;
	WT_INSERT *ins;
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t *recnop;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = session->btree;

	recnop = &cbt->iface.recno;
	val = &cbt->iface.value;

	/* Initialize for each new page. */
	if (newpage) {
		cbt->last_standard_recno = __col_last_recno(cbt->page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		cbt->recno = cbt->last_standard_recno;
		goto new_page;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		if (cbt->recno == cbt->page->u.col_fix.recno)
			return (WT_NOTFOUND);
		--cbt->recno;
new_page:	*recnop = cbt->recno;

		/* Check any insert list for a matching record. */
		if ((ins = cbt->ins = __col_insert_search_match(
		    WT_COL_UPDATE_SINGLE(cbt->page), cbt->recno)) != NULL &&
		    (upd = __wt_txn_read(session, ins->upd)) != NULL) {
			val->data = WT_UPDATE_DATA(upd);
			val->size = 1;
			return (0);
		}

		cbt->v = __bit_getv_recno(cbt->page, cbt->recno, btree->bitcnt);
		val->data = &cbt->v;
		val->size = 1;
		return (0);
	}
	/* NOTREACHED */
}
Esempio n. 8
0
/*
 * __txn_op_log --
 *	Log an operation for the current transaction.
 */
static int
__txn_op_log(WT_SESSION_IMPL *session,
    WT_ITEM *logrec, WT_TXN_OP *op, WT_CURSOR_BTREE *cbt)
{
	WT_DECL_RET;
	WT_ITEM key, value;
	WT_UPDATE *upd;
	uint64_t recno;

	WT_CLEAR(key);
	upd = op->u.upd;
	value.data = WT_UPDATE_DATA(upd);
	value.size = upd->size;

	/*
	 * Log the operation.  It must be one of the following:
	 * 1) column store remove;
	 * 2) column store insert/update;
	 * 3) row store remove; or
	 * 4) row store insert/update.
	 */
	if (cbt->btree->type == BTREE_ROW) {
		WT_ERR(__wt_cursor_row_leaf_key(cbt, &key));

		if (WT_UPDATE_DELETED_ISSET(upd))
			WT_ERR(__wt_logop_row_remove_pack(session, logrec,
			    op->fileid, &key));
		else
			WT_ERR(__wt_logop_row_put_pack(session, logrec,
			    op->fileid, &key, &value));
	} else {
		recno = WT_INSERT_RECNO(cbt->ins);
		WT_ASSERT(session, recno != WT_RECNO_OOB);

		if (WT_UPDATE_DELETED_ISSET(upd))
			WT_ERR(__wt_logop_col_remove_pack(session, logrec,
			    op->fileid, recno));
		else
			WT_ERR(__wt_logop_col_put_pack(session, logrec,
			    op->fileid, recno, &value));
	}

err:	__wt_buf_free(session, &key);
	return (ret);
}
Esempio n. 9
0
/*
 * __cursor_fix_append_prev --
 *	Return the previous fixed-length entry on the append list.
 */
static inline int
__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	val = &cbt->iface.value;

	if (newpage) {
		if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
			return (WT_NOTFOUND);
		cbt->recno = WT_INSERT_RECNO(cbt->ins);
	} else {
		if (cbt->recno == WT_INSERT_RECNO(cbt->ins)) {
			__cursor_skip_prev(cbt);
			if (cbt->ins == NULL)
				return (WT_NOTFOUND);
		}
		--cbt->recno;
	}

	/*
	 * Column store appends are inherently non-transactional.
	 *
	 * Even a non-visible update by a concurrent or aborted transaction
	 * changes the effective end of the data.  The effect is subtle because
	 * of the blurring between deleted and empty values, but ideally we
	 * would skip all uncommitted changes at the end of the data.
	 */
	cbt->iface.recno = cbt->recno;
	if (cbt->recno > WT_INSERT_RECNO(cbt->ins) ||
	    (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
		cbt->v = 0;
		val->data = &cbt->v;
	} else
		val->data = WT_UPDATE_DATA(upd);
	val->size = 1;
	return (0);
}
Esempio n. 10
0
/*
 * __txn_op_log --
 *	Log an operation for the current transaction.
 */
static int
__txn_op_log(WT_SESSION_IMPL *session, WT_ITEM *logrec, WT_TXN_OP *op)
{
	WT_ITEM value;
	uint64_t recno;

	value.data = WT_UPDATE_DATA(op->u.op.upd);
	value.size = op->u.op.upd->size;

	/*
	 * Log the operation.  It must be one of the following:
	 * 1) column store remove;
	 * 2) column store insert/update;
	 * 3) row store remove; or
	 * 4) row store insert/update.
	 */
	if (op->u.op.key.data == NULL) {
		WT_ASSERT(session, op->u.op.ins != NULL);
		recno = op->u.op.ins->u.recno;

		if (WT_UPDATE_DELETED_ISSET(op->u.op.upd))
			WT_RET(__wt_logop_col_remove_pack(session, logrec,
			    op->fileid, recno));
		else
			WT_RET(__wt_logop_col_put_pack(session, logrec,
			    op->fileid, recno, &value));
	} else {
		if (WT_UPDATE_DELETED_ISSET(op->u.op.upd))
			WT_RET(__wt_logop_row_remove_pack(session, logrec,
			    op->fileid, &op->u.op.key));
		else
			WT_RET(__wt_logop_row_put_pack(session, logrec,
			    op->fileid, &op->u.op.key, &value));
	}

	return (0);
}
Esempio n. 11
0
/*在append list上移动variable-length类型的btree cursor*/
static inline int __cursor_var_append_next(WT_CURSOR_BTREE* cbt, int newpage)
{
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	val = &cbt->iface.value;

	if (newpage){
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
		goto new_page;
	}

	for (;;){
		cbt->ins = WT_SKIP_NEXT(cbt->ins);
new_page:
		if (cbt->ins == NULL)
			return (WT_NOTFOUND);

		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
		/*事务隔离读,对本事务不可见,继续向前*/
		if ((upd = __wt_txn_read(session, cbt->ins->upd)) == NULL)
			continue;
		/*删除集合,不做指向这条记录,继续向下移动*/
		if (WT_UPDATE_DELETED_ISSET(upd)) {
			++cbt->page_deleted_count;
			continue;
		}
		/*赋值value*/
		val->data = WT_UPDATE_DATA(upd);
		val->size = upd->size;

		return 0;
	}
}
Esempio n. 12
0
/*
 * __wt_update_alloc --
 *	Allocate a WT_UPDATE structure and associated value from the session's
 *	buffer and fill it in.
 */
int
__wt_update_alloc(WT_SESSION_IMPL *session,
    WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
{
	WT_DECL_RET;
	WT_UPDATE *upd;
	size_t size;

	/*
	 * Allocate the WT_UPDATE structure and room for the value, then copy
	 * the value into place.
	 */
	size = value == NULL ? 0 : value->size;
	WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd));
	if (value == NULL)
		WT_UPDATE_DELETED_SET(upd);
	else {
		upd->size = WT_STORE_SIZE(size);
		memcpy(WT_UPDATE_DATA(upd), value->data, size);
	}

	/*
	 * This must come last: after __wt_txn_modify succeeds, we must return
	 * a non-NULL upd so our callers can call __wt_txn_unmodify on any
	 * subsequent failure.
	 */
	if ((ret = __wt_txn_modify(session, &upd->txnid)) != 0) {
		__wt_free(session, upd);
		return (ret);
	}

	*updp = upd;
	if (sizep != NULL)
		*sizep = sizeof(WT_UPDATE) + size;
	return (0);
}
Esempio n. 13
0
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CELL *cell;
    WT_CELL_UNPACK unpack;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_ROW *rip;
    WT_UPDATE *upd;
    uint8_t v;

    btree = S2BT(session);

    page = cbt->page;
    cursor = &cbt->iface;

    switch (page->type) {
    case WT_PAGE_COL_FIX:
        /*
         * The interface cursor's record has usually been set, but that
         * isn't universally true, specifically, cursor.search_near may
         * call here without first setting the interface cursor.
         */
        cursor->recno = cbt->recno;

        /*
         * If the cursor references a WT_INSERT item, take the related
         * WT_UPDATE item.
         */
        if (cbt->ins != NULL &&
                (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
            cursor->value.data = WT_UPDATE_DATA(upd);
            cursor->value.size = upd->size;
            return (0);
        }
        v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
        return (__wt_buf_set(session, &cursor->value, &v, 1));
    case WT_PAGE_COL_VAR:
        /*
         * The interface cursor's record has usually been set, but that
         * isn't universally true, specifically, cursor.search_near may
         * call here without first setting the interface cursor.
         */
        cursor->recno = cbt->recno;

        /*
         * If the cursor references a WT_INSERT item, take the related
         * WT_UPDATE item.
         */
        if (cbt->ins != NULL &&
                (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
            cursor->value.data = WT_UPDATE_DATA(upd);
            cursor->value.size = upd->size;
            return (0);
        }
        cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]);
        break;
    case WT_PAGE_ROW_LEAF:
        rip = &page->u.row.d[cbt->slot];

        /*
         * If the cursor references a WT_INSERT item, take the key and
         * related WT_UPDATE item.   Otherwise, take the key from the
         * original page, and the value from any related WT_UPDATE item,
         * or the page if the key was never updated.
         */
        if (cbt->ins != NULL &&
                (upd = __wt_txn_read(session, cbt->ins->upd)) != NULL) {
            cursor->key.data = WT_INSERT_KEY(cbt->ins);
            cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
        } else {
            WT_RET(
                __wt_row_key(session, page, rip, &cursor->key, 0));
            upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
        }
        if (upd != NULL) {
            cursor->value.data = WT_UPDATE_DATA(upd);
            cursor->value.size = upd->size;
            return (0);
        }

        /* Take the original cell (which may be empty). */
        if ((cell = __wt_row_value(page, rip)) == NULL) {
            cursor->value.size = 0;
            return (0);
        }
        break;
        WT_ILLEGAL_VALUE(session);
    }

    /* The value is an on-page cell, unpack and expand it as necessary. */
    __wt_cell_unpack(cell, &unpack);
    ret = __wt_cell_unpack_ref(session, &unpack, &cursor->value);

    /*
     * Restart for a variable-length column-store.  We could catch restart
     * higher up the call-stack but there's no point to it: unlike row-store
     * (where a normal search path finds cached overflow values), we have to
     * access the page's reconciliation structures, and that's as easy here
     * as higher up the stack.
     */
    if (ret == WT_RESTART && page->type == WT_PAGE_COL_VAR)
        ret = __wt_ovfl_cache_col_restart(
                  session, page, &unpack, &cursor->value);
    return (ret);
}
Esempio n. 14
0
/*
 * __cursor_row_prev --
 *	Move to the previous row-store item.
 */
static inline int
__cursor_row_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_INSERT *ins;
	WT_ITEM *key, *val;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	key = &cbt->iface.key;
	val = &cbt->iface.value;

	/*
	 * For row-store pages, we need a single item that tells us the part
	 * of the page we're walking (otherwise switching from next to prev
	 * and vice-versa is just too complicated), so we map the WT_ROW and
	 * WT_INSERT_HEAD insert array slots into a single name space: slot 1
	 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
	 * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
	 * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
	 *
	 * New page configuration.
	 */
	if (newpage) {
		/*
		 * If we haven't instantiated keys on this page, do so, else it
		 * is a very, very slow traversal.
		 */
		if (!F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS))
			WT_RET(__wt_row_leaf_keys(session, page));

		if (page->pg_row_entries == 0)
			cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
		else
			cbt->ins_head =
			    WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1);
		cbt->ins = WT_SKIP_LAST(cbt->ins_head);
		cbt->row_iteration_slot = page->pg_row_entries * 2 + 1;
		goto new_insert;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		/*
		 * Continue traversing any insert list.  Maintain the reference
		 * to the current insert element in case we switch to a cursor
		 * next movement.
		 */
		if (cbt->ins != NULL)
			WT_RET(__cursor_skip_prev(cbt));

new_insert:	if ((ins = cbt->ins) != NULL) {
			if ((upd = __wt_txn_read(session, ins->upd)) == NULL)
				continue;
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				if (__wt_txn_visible_all(session, upd->txnid))
					++cbt->page_deleted_count;
				continue;
			}
			key->data = WT_INSERT_KEY(ins);
			key->size = WT_INSERT_KEY_SIZE(ins);
			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/* Check for the beginning of the page. */
		if (cbt->row_iteration_slot == 1)
			return (WT_NOTFOUND);
		--cbt->row_iteration_slot;

		/*
		 * Odd-numbered slots configure as WT_INSERT_HEAD entries,
		 * even-numbered slots configure as WT_ROW entries.
		 */
		if (cbt->row_iteration_slot & 0x01) {
			cbt->ins_head = cbt->row_iteration_slot == 1 ?
			    WT_ROW_INSERT_SMALLEST(page) :
			    WT_ROW_INSERT_SLOT(
				page, cbt->row_iteration_slot / 2 - 1);
			cbt->ins = WT_SKIP_LAST(cbt->ins_head);
			goto new_insert;
		}
		cbt->ins_head = NULL;
		cbt->ins = NULL;

		cbt->slot = cbt->row_iteration_slot / 2 - 1;
		rip = &page->pg_row_d[cbt->slot];
		upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
		if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) {
			if (__wt_txn_visible_all(session, upd->txnid))
				++cbt->page_deleted_count;
			continue;
		}

		return (__cursor_row_slot_return(cbt, rip, upd));
	}
	/* NOTREACHED */
}
Esempio n. 15
0
/*
 * __cursor_var_prev --
 *	Move to the previous, variable-length column-store item.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_ITEM *val;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t rle_start;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	val = &cbt->iface.value;

	rle_start = 0;			/* -Werror=maybe-uninitialized */

	/* Initialize for each new page. */
	if (newpage) {
		cbt->last_standard_recno = __col_var_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
		goto new_page;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		__cursor_set_recno(cbt, cbt->recno - 1);

new_page:	if (cbt->recno < page->pg_var_recno)
			return (WT_NOTFOUND);

		/* Find the matching WT_COL slot. */
		if ((cip =
		    __col_var_search(page, cbt->recno, &rle_start)) == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Check any insert list for a matching record. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = cbt->ins == NULL ?
		    NULL : __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				if (__wt_txn_visible_all(session, upd->txnid))
					++cbt->page_deleted_count;
				continue;
			}

			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/*
		 * If we're at the same slot as the last reference and there's
		 * no matching insert list item, re-use the return information
		 * (so encoded items with large repeat counts aren't repeatedly
		 * decoded).  Otherwise, unpack the cell and build the return
		 * information.
		 */
		if (cbt->cip_saved != cip) {
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL) {
				if (__wt_cell_rle(&unpack) == 1)
					continue;
				/*
				 * There can be huge gaps in the variable-length
				 * column-store name space appearing as deleted
				 * records. If more than one deleted record, do
				 * the work of finding the next record to return
				 * instead of looping through the records.
				 *
				 * First, find the largest record in the update
				 * list that's smaller than the current record.
				 */
				ins = __col_insert_search_lt(
				    cbt->ins_head, cbt->recno);

				/*
				 * Second, for records with RLEs greater than 1,
				 * the above call to __col_var_search located
				 * this record in the page's list of repeating
				 * records, and returned the starting record.
				 * The starting record - 1 is the record to
				 * which we could skip, if there was no larger
				 * record in the update list.
				 */
				cbt->recno = rle_start - 1;
				if (ins != NULL &&
				    WT_INSERT_RECNO(ins) > cbt->recno)
					cbt->recno = WT_INSERT_RECNO(ins);

				/* Adjust for the outer loop decrement. */
				++cbt->recno;
				continue;
			}
			WT_RET(__wt_page_cell_data_ref(
			    session, page, &unpack, cbt->tmp));

			cbt->cip_saved = cip;
		}
		val->data = cbt->tmp->data;
		val->size = cbt->tmp->size;
		return (0);
	}
	/* NOTREACHED */
}
Esempio n. 16
0
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int key_ret)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_CURSOR *cursor;
	WT_IKEY *ikey;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_UPDATE *upd;
	uint8_t v;

	btree = session->btree;
	unpack = &_unpack;

	page = cbt->page;
	cursor = &cbt->iface;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		if (key_ret)
			cursor->recno = cbt->recno;

		/*
		 * If the cursor references a WT_INSERT item, take the related
		 * WT_UPDATE item.
		 */
		if (cbt->ins != NULL) {
			upd = cbt->ins->upd;
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		if (key_ret)
			cursor->recno = cbt->recno;

		/*
		 * If the cursor references a WT_INSERT item, take the related
		 * WT_UPDATE item.
		 */
		if (cbt->ins != NULL) {
			upd = cbt->ins->upd;
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}
		cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->u.row.d[cbt->slot];

		/*
		 * If the cursor references a WT_INSERT item, take the key and
		 * related WT_UPDATE item.   Otherwise, take the key from the
		 * original page, and the value from any related WT_UPDATE item,
		 * or the page if the key was never updated.
		 */
		if (cbt->ins == NULL) {
			if (key_ret) {
				if (__wt_off_page(page, rip->key)) {
					ikey = rip->key;
					cursor->key.data = WT_IKEY_DATA(ikey);
					cursor->key.size = ikey->size;
				} else
					WT_RET(__wt_row_key(
					    session, page, rip, &cursor->key));
			}
			upd = WT_ROW_UPDATE(page, rip);
		} else {
			if (key_ret) {
				cursor->key.data = WT_INSERT_KEY(cbt->ins);
				cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
			}
			upd = cbt->ins->upd;
		}
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the original cell (which may be empty). */
		if ((cell = __wt_row_value(page, rip)) == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* It's a cell, unpack and expand it as necessary. */
	__wt_cell_unpack(cell, unpack);
	if (btree->huffman_value == NULL && unpack->type == WT_CELL_VALUE) {
		cursor->value.data = unpack->data;
		cursor->value.size = unpack->size;
		return (0);
	} else
		return (__wt_cell_unpack_copy(session, unpack, &cursor->value));
}
Esempio n. 17
0
/*
 * __cursor_fix_append_prev --
 *	Return the previous fixed-length entry on the append list.
 */
static inline int
__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage)
{
	WT_ITEM *val;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	val = &cbt->iface.value;

	if (newpage) {
		if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
			return (WT_NOTFOUND);
	} else {
		/*
		 * Handle the special case of leading implicit records, that is,
		 * there aren't any records in the tree not on the append list,
		 * and the first record on the append list isn't record 1.
		 *
		 * The "right" place to handle this is probably in our caller.
		 * The high-level cursor-previous routine would:
		 *    -- call this routine to walk the append list
		 *    -- call the routine to walk the standard page items
		 *    -- call the tree walk routine looking for a previous page
		 * Each of them returns WT_NOTFOUND, at which point our caller
		 * checks the cursor record number, and if it's larger than 1,
		 * returns the implicit records.  Instead, I'm trying to detect
		 * the case here, mostly because I don't want to put that code
		 * into our caller.  Anyway, if this code breaks for any reason,
		 * that's the way I'd go.
		 *
		 * If we're not pointing to a WT_INSERT entry, or we can't find
		 * a WT_INSERT record that precedes our record name-space, check
		 * if there are any records on the page.  If there aren't, then
		 * we're in the magic zone, keep going until we get to a record
		 * number of 1.
		 */
		if (cbt->ins != NULL &&
		    cbt->recno <= WT_INSERT_RECNO(cbt->ins))
			WT_RET(__cursor_skip_prev(cbt));
		if (cbt->ins == NULL &&
		    (cbt->recno == 1 || __col_fix_last_recno(page) != 0))
			return (WT_NOTFOUND);
	}

	/*
	 * This code looks different from the cursor-next code.  The append
	 * list appears on the last page of the tree and contains the last
	 * records in the tree.  If we're iterating through the tree, starting
	 * at the last record in the tree, by definition we're starting a new
	 * iteration and we set the record number to the last record found in
	 * the tree.  Otherwise, decrement the record.
	 */
	if (newpage)
		__cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
	else
		__cursor_set_recno(cbt, cbt->recno - 1);

	/*
	 * Fixed-width column store appends are inherently non-transactional.
	 * Even a non-visible update by a concurrent or aborted transaction
	 * changes the effective end of the data.  The effect is subtle because
	 * of the blurring between deleted and empty values, but ideally we
	 * would skip all uncommitted changes at the end of the data.  This
	 * doesn't apply to variable-width column stores because the implicitly
	 * created records written by reconciliation are deleted and so can be
	 * never seen by a read.
	 */
	if (cbt->ins == NULL ||
	    cbt->recno > WT_INSERT_RECNO(cbt->ins) ||
	    (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) {
		cbt->v = 0;
		val->data = &cbt->v;
	} else
		val->data = WT_UPDATE_DATA(upd);
	val->size = 1;
	return (0);
}
Esempio n. 18
0
/*
 * __cursor_var_prev --
 *	Move to the previous, variable-length column-store item.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_ITEM *val;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	val = &cbt->iface.value;

	/* Initialize for each new page. */
	if (newpage) {
		cbt->last_standard_recno = __col_var_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
		goto new_page;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		__cursor_set_recno(cbt, cbt->recno - 1);

new_page:	if (cbt->recno < page->pg_var_recno)
			return (WT_NOTFOUND);

		/* Find the matching WT_COL slot. */
		if ((cip = __col_var_search(page, cbt->recno)) == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/* Check any insert list for a matching record. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = cbt->ins == NULL ?
		    NULL : __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				++cbt->page_deleted_count;
				continue;
			}

			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/*
		 * If we're at the same slot as the last reference and there's
		 * no matching insert list item, re-use the return information
		 * (so encoded items with large repeat counts aren't repeatedly
		 * decoded).  Otherwise, unpack the cell and build the return
		 * information.
		 */
		if (cbt->cip_saved != cip) {
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL)
				continue;
			WT_RET(__wt_page_cell_data_ref(
			    session, page, &unpack, &cbt->tmp));

			cbt->cip_saved = cip;
		}
		val->data = cbt->tmp.data;
		val->size = cbt->tmp.size;
		return (0);
	}
	/* NOTREACHED */
}
Esempio n. 19
0
/*
 * __cursor_row_next --
 *	Move to the next row-store item.
 */
static inline int
__cursor_row_next(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_INSERT *ins;
	WT_ITEM *key, *val;
	WT_ROW *rip;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	key = &cbt->iface.key;
	val = &cbt->iface.value;
	session = (WT_SESSION_IMPL *)cbt->iface.session;

	/*
	 * For row-store pages, we need a single item that tells us the part
	 * of the page we're walking (otherwise switching from next to prev
	 * and vice-versa is just too complicated), so we map the WT_ROW and
	 * WT_INSERT_HEAD insert array slots into a single name space: slot 1
	 * is the "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is
	 * WT_INSERT_HEAD[0], and so on.  This means WT_INSERT lists are
	 * odd-numbered slots, and WT_ROW array slots are even-numbered slots.
	 *
	 * New page configuration.
	 */
	if (newpage) {
		cbt->ins_head = WT_ROW_INSERT_SMALLEST(cbt->page);
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
		cbt->row_iteration_slot = 1;
		goto new_insert;
	}

	/* Move to the next entry and return the item. */
	for (;;) {
		/*
		 * Continue traversing any insert list; maintain the insert list
		 * head reference and entry count in case we switch to a cursor
		 * previous movement.
		 */
		if (cbt->ins != NULL)
			cbt->ins = WT_SKIP_NEXT(cbt->ins);

new_insert:	if ((ins = cbt->ins) != NULL) {
			if ((upd = __wt_txn_read(session, ins->upd)) == NULL ||
			    WT_UPDATE_DELETED_ISSET(upd))
				continue;
			key->data = WT_INSERT_KEY(ins);
			key->size = WT_INSERT_KEY_SIZE(ins);
			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/* Check for the end of the page. */
		if (cbt->row_iteration_slot >= cbt->page->entries * 2 + 1)
			return (WT_NOTFOUND);
		++cbt->row_iteration_slot;

		/*
		 * Odd-numbered slots configure as WT_INSERT_HEAD entries,
		 * even-numbered slots configure as WT_ROW entries.
		 */
		if (cbt->row_iteration_slot & 0x01) {
			cbt->ins_head = WT_ROW_INSERT_SLOT(
			    cbt->page, cbt->row_iteration_slot / 2 - 1);
			cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
			goto new_insert;
		}
		cbt->ins_head = NULL;
		cbt->ins = NULL;

		cbt->slot = cbt->row_iteration_slot / 2 - 1;
		rip = &cbt->page->u.row.d[cbt->slot];
		upd = __wt_txn_read(session, WT_ROW_UPDATE(cbt->page, rip));
		if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd))
			continue;

		return (__cursor_row_slot_return(cbt, rip, upd));
	}
	/* NOTREACHED */
}
Esempio n. 20
0
int __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_PAGE *page;
	WT_ROW *rip;
	uint8_t v;

	switch (page->type){
	case WT_PAGE_COL_FIX:
		cursor->recno = cbt->recno;
		/*cursor对应的是一个upd,直接返回value*/
		if (upd != NULL){
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return 0;
		}

		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return __wt_buf_set(session, &cursor->value, &v, 1);
		
	case WT_PAGE_COL_VAR:
		cursor->recno = cbt->recno;

		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}
		/*获得对应的cell,并通过cell得到K/V值*/
		cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
		break;

	case WT_PAGE_ROW_LEAF:
		rip = &page->pg_row_d[cbt->slot];

		if (cbt->ins != NULL){ /*插入的k/v对*/
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		}
		else if (cbt->compare == 0){/*比较器定位到了对应的k/v对*/
			cursor->key.data = cbt->search_key.data;
			cursor->key.size = cbt->search_key.size;
		}
		else
			WT_RET(__wt_row_leaf_key(session, page, rip, &cursor->key, 0)); /*设置key的值*/

		/*值是在append/update list当中,从当中取*/
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}
		/*可以直接通过rip指针获得value,K/V是存储在cell空间之内*/
		if (__wt_row_leaf_value(page, rip, &cursor->value))
			return 0;

		/*不是连续存储的,需要通过解析cell来定位到value*/
		if (cell = __wt_row_leaf_value_cell(page, rip, NULL) == NULL){
			cursor->value.size = 0;
			return 0;
		}
		break;

		WT_ILLEGAL_VALUE(session);
	}
	/*通过cell解析到对应的value值, ovfl item*/
	__wt_cell_unpack(cell, &unpack);
	WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value));

	return 0;
}
Esempio n. 21
0
/*
 * __cursor_var_prev --
 *	Move to the previous, variable-length column-store item.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_INSERT *ins;
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t *recnop;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	recnop = &cbt->iface.recno;
	val = &cbt->iface.value;

	/* Initialize for each new page. */
	if (newpage) {
		cbt->last_standard_recno = __col_last_recno(cbt->page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		cbt->recno = cbt->last_standard_recno;
		goto new_page;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		--cbt->recno;
new_page:	*recnop = cbt->recno;
		if (cbt->recno < cbt->page->u.col_var.recno)
			return (WT_NOTFOUND);

		/* Find the matching WT_COL slot. */
		if ((cip = __col_var_search(cbt->page, cbt->recno)) == NULL)
			return (WT_NOTFOUND);

		/* Check any insert list for a matching record. */
		if ((ins = __col_insert_search_match(
		    WT_COL_UPDATE(cbt->page, cip), cbt->recno)) != NULL &&
		    (upd = __wt_txn_read(session, ins->upd)) != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd))
				continue;
			cbt->ins = ins;
			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/*
		 * If we're at the same slot as the last reference and there's
		 * no matching insert list item, re-use the return information
		 * (so encoded items with large repeat counts aren't repeatedly
		 * decoded).  Otherwise, unpack the cell and build the return
		 * information.
		 */
		if (cbt->cip_saved != cip) {
			if ((cell = WT_COL_PTR(cbt->page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			switch (unpack.type) {
			case WT_CELL_DEL:
				continue;
			case WT_CELL_VALUE:
				if (session->btree->huffman_value == NULL) {
					cbt->tmp.data = unpack.data;
					cbt->tmp.size = unpack.size;
					break;
				}
				/* FALLTHROUGH */
			default:
				WT_RET(__wt_cell_unpack_copy(
				    session, &unpack, &cbt->tmp));
			}
			cbt->cip_saved = cip;
		}
		val->data = cbt->tmp.data;
		val->size = cbt->tmp.size;
		return (0);
	}
	/* NOTREACHED */
}
Esempio n. 22
0
/*移向行存储的下一个行对象*/
static inline int __cursor_row_next(WT_CURSOR_BTREE* cbt, int newpage)
{
	WT_INSERT *ins;
	WT_ITEM *key, *val;
	WT_PAGE *page;
	WT_ROW *rip;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	key = &cbt->iface.key;
	val = &cbt->iface.value;

	/*假如是newpage,定位到insert修改队列的头位置*/
	if (newpage){
		cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
		cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
		cbt->row_iteration_slot = 1;
		goto new_insert;
	}

	for (;;){
		if (cbt->ins != NULL)
			cbt->ins = WT_SKIP_NEXT(cbt->ins);

new_insert:
		if ((ins = cbt->ins) != NULL) {
			/*事务可见数据读取*/
			if ((upd = __wt_txn_read(session, ins->upd)) == NULL)
				continue;
			/*判断是否删除,如果删除,跳过被删除的对象*/
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				++cbt->page_deleted_count;
				continue;
			}

			key->data = WT_INSERT_KEY(ins);
			key->size = WT_INSERT_KEY_SIZE(ins);
			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return 0;
		}

		/*检索page row entires数组, 到了page的末尾*/
		if (cbt->row_iteration_slot >= page->pg_row_entries * 2 + 1)
			return (WT_NOTFOUND);
		++cbt->row_iteration_slot;

		/*
		* Odd-numbered slots configure as WT_INSERT_HEAD entries,
		* even-numbered slots configure as WT_ROW entries.
		*/
		if (cbt->row_iteration_slot & 0x01) {
			cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->row_iteration_slot / 2 - 1);
			cbt->ins = WT_SKIP_FIRST(cbt->ins_head);
			goto new_insert;
		}

		cbt->ins_head = NULL;
		cbt->ins = NULL;

		/*计算定位slot*/
		cbt->slot = cbt->row_iteration_slot / 2 - 1;
		rip = &page->pg_row_d[cbt->slot];

		upd = __wt_txn_read(session, WT_ROW_UPDATE(page, rip));
		if (upd != NULL && WT_UPDATE_DELETED_ISSET(upd)) {
			++cbt->page_deleted_count;
			continue;
		}

		return __cursor_row_slot_return(cbt, rip, upd);
	}
}
Esempio n. 23
0
/*移向下条variable-length column-store 记录*/
static inline int __cursor_var_next(WT_CURSOR_BTREE* cbt, int newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_ITEM *val;
	WT_INSERT *ins;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;
	uint64_t rle, rle_start;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	page = cbt->ref->page;
	val = &cbt->iface.value;

	rle_start = 0;			/* -Werror=maybe-uninitialized */

	if (newpage){
		cbt->last_standard_recno = __col_var_last_recno(page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, page->pg_var_recno);
		goto new_page;
	}

	for (;;){
		if (cbt->recno >= cbt->last_standard_recno)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->recno + 1);

	new_page:
		/*定位到recno对应的WT_COL slot*/
		if ((cip = __col_var_search(page, cbt->recno, &rle_start)) == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(page, cip);

		/*读取内容值*/
		cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd)) {
				++cbt->page_deleted_count;
				continue;
			}

			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/*
		* If we're at the same slot as the last reference and there's
		* no matching insert list item, re-use the return information
		* (so encoded items with large repeat counts aren't repeatedly
		* decoded).  Otherwise, unpack the cell and build the return
		* information.
		* upd == NULL, 记录可能被删除放入到了insert列表中,slot可能被重用了,那么需要进行cell unpack取值
		*/

		if (cbt->cip_saved != cip) {
			if ((cell = WT_COL_PTR(page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL) {
				if ((rle = __wt_cell_rle(&unpack)) == 1)
					continue;

				/*定位到修改列表中的记录*/
				ins = __col_insert_search_gt(cbt->ins_head, cbt->recno);
				cbt->recno = rle_start + rle;
				if (ins != NULL && WT_INSERT_RECNO(ins) < cbt->recno)
					cbt->recno = WT_INSERT_RECNO(ins);

				/* Adjust for the outer loop increment. */
				--cbt->recno;
				continue;
			}

			/*取出cell中的值到tmp中*/
			WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cbt->tmp));
			cbt->cip_saved = cip;
		}
		val->data = cbt->tmp.data;
		val->size = cbt->tmp.size;
		return 0;
	}
}
Esempio n. 24
0
/*
 * __cursor_var_prev --
 *	Move to the previous, variable-length column-store item.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_COL *cip;
	WT_DECL_RET;
	WT_ITEM *val;
	WT_SESSION_IMPL *session;
	WT_UPDATE *upd;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	val = &cbt->iface.value;

	/* Initialize for each new page. */
	if (newpage) {
		cbt->last_standard_recno = __col_last_recno(cbt->page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
		goto new_page;
	}

	/* Move to the previous entry and return the item. */
	for (;;) {
		__cursor_set_recno(cbt, cbt->recno - 1);

new_page:	if (cbt->recno < cbt->page->u.col_var.recno)
			return (WT_NOTFOUND);

		/* Find the matching WT_COL slot. */
		if ((cip = __col_var_search(cbt->page, cbt->recno)) == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(cbt->page, cip);

		/* Check any insert list for a matching record. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(cbt->page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		upd = cbt->ins == NULL ?
		    NULL : __wt_txn_read(session, cbt->ins->upd);
		if (upd != NULL) {
			if (WT_UPDATE_DELETED_ISSET(upd))
				continue;

			val->data = WT_UPDATE_DATA(upd);
			val->size = upd->size;
			return (0);
		}

		/*
		 * If we're at the same slot as the last reference and there's
		 * no matching insert list item, re-use the return information
		 * (so encoded items with large repeat counts aren't repeatedly
		 * decoded).  Otherwise, unpack the cell and build the return
		 * information.
		 */
		if (cbt->cip_saved != cip) {
			if ((cell = WT_COL_PTR(cbt->page, cip)) == NULL)
				continue;
			__wt_cell_unpack(cell, &unpack);
			if (unpack.type == WT_CELL_DEL)
				continue;

			/*
			 * Restart for a variable-length column-store.  We could
			 * catch restart higher up the call-stack but there's no
			 * point to it: unlike row-store (where a normal search
			 * path finds cached overflow values), we have to access
			 * the page's reconciliation structures, and that's as
			 * easy here as higher up the stack.
			 */
			if ((ret = __wt_cell_unpack_ref(
			    session, &unpack, &cbt->tmp)) == WT_RESTART)
				ret = __wt_ovfl_cache_col_restart(
				    session, cbt->page, &unpack, &cbt->tmp);
			WT_RET(ret);

			cbt->cip_saved = cip;
		}
		val->data = cbt->tmp.data;
		val->size = cbt->tmp.size;
		return (0);
	}
	/* NOTREACHED */
}
Esempio n. 25
0
/*
 * __wt_kv_return --
 *	Return a page referenced key/value pair to the application.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK unpack;
	WT_CURSOR *cursor;
	WT_ITEM *tmp;
	WT_PAGE *page;
	WT_ROW *rip;
	uint8_t v;

	btree = S2BT(session);

	page = cbt->ref->page;
	cursor = &cbt->iface;

	switch (page->type) {
	case WT_PAGE_COL_FIX:
		/*
		 * The interface cursor's record has usually been set, but that
		 * isn't universally true, specifically, cursor.search_near may
		 * call here without first setting the interface cursor.
		 */
		cursor->recno = cbt->recno;

		/* If the cursor references a WT_UPDATE item, return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the value from the original page. */
		v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt);
		return (__wt_buf_set(session, &cursor->value, &v, 1));
	case WT_PAGE_COL_VAR:
		/*
		 * The interface cursor's record has usually been set, but that
		 * isn't universally true, specifically, cursor.search_near may
		 * call here without first setting the interface cursor.
		 */
		cursor->recno = cbt->recno;

		/* If the cursor references a WT_UPDATE item, return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Take the value from the original page cell. */
		cell = WT_COL_PTR(page, &page->pg_var_d[cbt->slot]);
		break;
	case WT_PAGE_ROW_LEAF:
		rip = &page->pg_row_d[cbt->slot];

		/*
		 * If the cursor references a WT_INSERT item, take its key.
		 * Else, if we have an exact match, we copied the key in the
		 * search function, take it from there.
		 * If we don't have an exact match, take the key from the
		 * original page.
		 */
		if (cbt->ins != NULL) {
			cursor->key.data = WT_INSERT_KEY(cbt->ins);
			cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
		} else if (cbt->compare == 0) {
			/*
			 * If not in an insert list and there's an exact match,
			 * the row-store search function built the key we want
			 * to return in the cursor's temporary buffer. Swap the
			 * cursor's search-key and temporary buffers so we can
			 * return it (it's unsafe to return the temporary buffer
			 * itself because our caller might do another search in
			 * this table using the key we return, and we'd corrupt
			 * the search key during any subsequent search that used
			 * the temporary buffer.
			 */
			tmp = cbt->row_key;
			cbt->row_key = cbt->tmp;
			cbt->tmp = tmp;

			cursor->key.data = cbt->row_key->data;
			cursor->key.size = cbt->row_key->size;
		} else
			WT_RET(__wt_row_leaf_key(
			    session, page, rip, &cursor->key, false));

		/* If the cursor references a WT_UPDATE item, return it. */
		if (upd != NULL) {
			cursor->value.data = WT_UPDATE_DATA(upd);
			cursor->value.size = upd->size;
			return (0);
		}

		/* Simple values have their location encoded in the WT_ROW. */
		if (__wt_row_leaf_value(page, rip, &cursor->value))
			return (0);

		/*
		 * Take the value from the original page cell (which may be
		 * empty).
		 */
		if ((cell =
		    __wt_row_leaf_value_cell(page, rip, NULL)) == NULL) {
			cursor->value.size = 0;
			return (0);
		}
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/* The value is an on-page cell, unpack and expand it as necessary. */
	__wt_cell_unpack(cell, &unpack);
	WT_RET(__wt_page_cell_data_ref(session, page, &unpack, &cursor->value));

	return (0);
}