Пример #1
0
/*
 * __cursor_var_prev --
 *	Step a cursor back one record on a variable-length column-store page
 *	and point the cursor's value at that record's data.
 *
 *	If newpage is non-zero, the walk starts at the record number returned
 *	by __col_last_recno for the page; otherwise it starts one record before
 *	the cursor's current record.  Records whose visible update is marked
 *	deleted, whose slot has no on-page cell, or whose on-page cell is a
 *	WT_CELL_DEL are skipped.
 *
 *	Returns 0 with the cursor's value set, WT_NOTFOUND if the page has no
 *	last record, the walk moves before the page's starting record or no
 *	slot matches, or a non-zero error from unpacking the on-page cell.
 */
static inline int
__cursor_var_prev(WT_CURSOR_BTREE *cbt, int newpage)
{
	WT_CELL *page_cell;
	WT_CELL_UNPACK unpacked;
	WT_COL *col;
	WT_DECL_RET;
	WT_ITEM *value;
	WT_SESSION_IMPL *session;
	WT_UPDATE *visible;

	value = &cbt->iface.value;
	session = (WT_SESSION_IMPL *)cbt->iface.session;

	/*
	 * Choose the starting record: the page's last record when entering a
	 * new page, otherwise the record before the current one.
	 */
	if (newpage) {
		cbt->last_standard_recno = __col_last_recno(cbt->page);
		if (cbt->last_standard_recno == 0)
			return (WT_NOTFOUND);
		__cursor_set_recno(cbt, cbt->last_standard_recno);
	} else
		__cursor_set_recno(cbt, cbt->recno - 1);

	/*
	 * Examine the current record; every "continue" below steps back one
	 * more record through the loop's iteration expression.
	 */
	for (;; __cursor_set_recno(cbt, cbt->recno - 1)) {
		/* Stop once we've walked off the front of the page. */
		if (cbt->recno < cbt->page->u.col_var.recno)
			return (WT_NOTFOUND);

		/* Locate the WT_COL slot holding this record. */
		col = __col_var_search(cbt->page, cbt->recno);
		if (col == NULL)
			return (WT_NOTFOUND);
		cbt->slot = WT_COL_SLOT(cbt->page, col);

		/* An entry in the slot's insert list takes precedence. */
		cbt->ins_head = WT_COL_UPDATE_SLOT(cbt->page, cbt->slot);
		cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
		visible = NULL;
		if (cbt->ins != NULL)
			visible = __wt_txn_read(session, cbt->ins->upd);
		if (visible != NULL) {
			if (WT_UPDATE_DELETED_ISSET(visible))
				continue;

			value->data = WT_UPDATE_DATA(visible);
			value->size = visible->size;
			return (0);
		}

		/*
		 * No matching insert-list item.  When this is the slot we
		 * returned last time, cbt->tmp already holds its value, so
		 * cells with large repeat counts aren't decoded over and over;
		 * for any other slot, unpack the cell and rebuild cbt->tmp.
		 */
		if (cbt->cip_saved != col) {
			page_cell = WT_COL_PTR(cbt->page, col);
			if (page_cell == NULL)
				continue;
			__wt_cell_unpack(page_cell, &unpacked);
			if (unpacked.type == WT_CELL_DEL)
				continue;

			/*
			 * Handle WT_RESTART here rather than further up the
			 * call stack: unlike row-store (where a normal search
			 * path finds cached overflow values), the page's
			 * reconciliation structures have to be consulted, and
			 * that is no harder here than in a caller.
			 */
			ret = __wt_cell_unpack_ref(
			    session, &unpacked, &cbt->tmp);
			if (ret == WT_RESTART)
				ret = __wt_ovfl_cache_col_restart(
				    session, cbt->page, &unpacked, &cbt->tmp);
			WT_RET(ret);

			cbt->cip_saved = col;
		}
		value->data = cbt->tmp.data;
		value->size = cbt->tmp.size;
		return (0);
	}
	/* NOTREACHED */
}
Пример #2
0
/*
 * __verify_dsk_row --
 *	Walk a WT_PAGE_ROW_INT or WT_PAGE_ROW_LEAF disk page and verify it.
 *
 *	session: the session handle; the btree is obtained from it via S2BT.
 *	addr: the page's address as a string, used only in the verification
 *	    messages.
 *	dsk: the disk page header/image being checked.
 *
 *	For each cell on the page, the walk: unpacks the cell with a bound of
 *	the page's in-memory size; passes the raw and collapsed cell types to
 *	__err_cell_type along with the page type; checks the legal ordering of
 *	key and value cells; asks the block manager whether any referenced
 *	address is valid; checks prefix-compression counts; and compares each
 *	key against the previous key.
 *
 *	NOTE(review): this excerpt ends at the close of the cell loop; the
 *	"err" and "eof" labels targeted by the code below, the scratch-buffer
 *	cleanup and the function's return are not visible here.
 */
static int
__verify_dsk_row(
    WT_SESSION_IMPL *session, const char *addr, WT_PAGE_HEADER *dsk)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_DECL_ITEM(current);
	WT_DECL_ITEM(last_ovfl);
	WT_DECL_ITEM(last_pfx);
	WT_DECL_RET;
	WT_ITEM *last;
	enum { FIRST, WAS_KEY, WAS_VALUE } last_cell_type;
	void *huffman;
	uint32_t cell_num, cell_type, i, prefix;
	uint8_t *end;
	int cmp;

	btree = S2BT(session);
	bm = btree->bm;
	huffman = btree->huffman_key;
	unpack = &_unpack;

	/*
	 * Three scratch buffers: one for the key being built, one for the last
	 * prefix-compressed key and one for the last overflow key.  They are
	 * rotated at the bottom of the loop.
	 */
	WT_ERR(__wt_scr_alloc(session, 0, &current));
	WT_ERR(__wt_scr_alloc(session, 0, &last_pfx));
	WT_ERR(__wt_scr_alloc(session, 0, &last_ovfl));
	last = last_ovfl;

	/* One byte past the page image: the bound for safe cell unpacking. */
	end = (uint8_t *)dsk + dsk->mem_size;

	last_cell_type = FIRST;
	cell_num = 0;
	WT_CELL_FOREACH(btree, dsk, cell, unpack, i) {
		/* Cells are numbered from 1 in the messages below. */
		++cell_num;

		/* Carefully unpack the cell. */
		if (__wt_cell_unpack_safe(cell, unpack, end) != 0) {
			ret = __err_cell_corrupted(session, cell_num, addr);
			goto err;
		}

		/* Check the raw and collapsed cell types. */
		WT_ERR(__err_cell_type(
		    session, cell_num, addr, unpack->raw, dsk->type));
		WT_ERR(__err_cell_type(
		    session, cell_num, addr, unpack->type, dsk->type));
		cell_type = unpack->type;

		/*
		 * Check ordering relationships between the WT_CELL entries.
		 * For row-store internal pages, check for:
		 *	two values in a row,
		 *	two keys in a row,
		 *	a value as the first cell on a page.
		 * For row-store leaf pages, check for:
		 *	two values in a row,
		 *	a value as the first cell on a page.
		 */
		switch (cell_type) {
		case WT_CELL_KEY:
		case WT_CELL_KEY_OVFL:
			switch (last_cell_type) {
			case FIRST:
			case WAS_VALUE:
				break;
			case WAS_KEY:
				/* Adjacent keys are accepted on leaf pages. */
				if (dsk->type == WT_PAGE_ROW_LEAF)
					break;
				WT_ERR_VRFY(session,
				    "cell %" PRIu32 " on page at %s is the "
				    "first of two adjacent keys",
				    cell_num - 1, addr);
			}
			last_cell_type = WAS_KEY;
			break;
		case WT_CELL_ADDR:
		case WT_CELL_VALUE:
		case WT_CELL_VALUE_OVFL:
			switch (last_cell_type) {
			case FIRST:
				/*
				 * NOTE(review): there is no break after this
				 * case; presumably WT_ERR_VRFY transfers
				 * control to the error path so the following
				 * case is never reached -- confirm.
				 */
				WT_ERR_VRFY(session,
				    "page at %s begins with a value", addr);
			case WAS_KEY:
				break;
			case WAS_VALUE:
				WT_ERR_VRFY(session,
				    "cell %" PRIu32 " on page at %s is the "
				    "first of two adjacent values",
				    cell_num - 1, addr);
			}
			last_cell_type = WAS_VALUE;
			break;
		}

		/* Check if any referenced item is entirely in the file. */
		switch (cell_type) {
		case WT_CELL_ADDR:
		case WT_CELL_KEY_OVFL:
		case WT_CELL_VALUE_OVFL:
			if (!bm->addr_valid(bm,
			    session, unpack->data, unpack->size))
				goto eof;
			break;
		}

		/*
		 * Remaining checks are for key order and prefix compression.
		 * If this cell isn't a key, we're done, move to the next cell.
		 * If this cell is an overflow item, instantiate the key and
		 * compare it with the last key.   Otherwise, we have to deal
		 * with prefix compression.
		 */
		switch (cell_type) {
		case WT_CELL_KEY:
			break;
		case WT_CELL_KEY_OVFL:
			WT_ERR(__wt_cell_unpack_ref(session, unpack, current));
			goto key_compare;
		default:
			/* Not a key -- continue with the next cell. */
			continue;
		}

		/*
		 * Prefix compression checks.
		 *
		 * Confirm the first non-overflow key on a page has a zero
		 * prefix compression count.  (last_pfx is still empty until
		 * the first non-overflow key has been processed.)
		 */
		prefix = unpack->prefix;
		if (last_pfx->size == 0 && prefix != 0)
			WT_ERR_VRFY(session,
			    "the %" PRIu32 " key on page at %s is the first "
			    "non-overflow key on the page and has a non-zero "
			    "prefix compression value",
			    cell_num, addr);

		/* Confirm the prefix compression count is possible. */
		if (cell_num > 1 && prefix > last->size)
			WT_ERR_VRFY(session,
			    "key %" PRIu32 " on page at %s has a prefix "
			    "compression count of %" PRIu32
			    ", larger than the length of the previous key, %"
			    PRIu32,
			    cell_num, addr, prefix, last->size);

		/*
		 * If Huffman decoding required, unpack the cell to build the
		 * key, then resolve the prefix.  Else, we can do it faster
		 * internally because we don't have to shuffle memory around as
		 * much.
		 */
		if (huffman != NULL) {
			WT_ERR(__wt_cell_unpack_ref(session, unpack, current));

			/*
			 * If there's a prefix, make sure there's enough buffer
			 * space, then shift the decoded data past the prefix
			 * and copy the prefix into place.  Take care with the
			 * pointers: current->data may be pointing inside the
			 * buffer's own memory (presumably current->mem, which
			 * would explain the overlap-safe memmove below --
			 * confirm against __wt_cell_unpack_ref).
			 */
			if (prefix != 0) {
				WT_ERR(__wt_buf_grow(
				    session, current, prefix + current->size));
				memmove((uint8_t *)current->mem + prefix,
				    current->data, current->size);
				memcpy(current->mem, last->data, prefix);
				current->data = current->mem;
				current->size += prefix;
			}
		} else {
			/*
			 * Get the cell's data/length and make sure we have
			 * enough buffer space.
			 */
			WT_ERR(__wt_buf_init(
			    session, current, prefix + unpack->size));

			/* Copy the prefix then the data into place. */
			if (prefix != 0)
				memcpy(current->mem, last->data, prefix);
			memcpy((uint8_t *)current->mem + prefix, unpack->data,
			    unpack->size);
			current->size = prefix + unpack->size;
		}

key_compare:	/*
		 * Compare the current key against the last key.
		 *
		 * Be careful about the 0th key on internal pages: we only store
		 * the first byte and custom collators may not be able to handle
		 * truncated keys.
		 *
		 * On internal pages the ordering checks above force keys and
		 * addresses to alternate, so keys sit at cells 1, 3, 5, ...;
		 * requiring cell_num > 3 skips the one comparison that would
		 * involve the 0th key.
		 *
		 * NOTE(review): the message reports the previous key as cell
		 * cell_num - 2; on leaf pages, where adjacent keys are
		 * accepted, the previous key may instead be cell_num - 1.
		 */
		if ((dsk->type == WT_PAGE_ROW_INT && cell_num > 3) ||
		    (dsk->type != WT_PAGE_ROW_INT && cell_num > 1)) {
			WT_ERR(
			    WT_BTREE_CMP(session, btree, last, current, cmp));
			if (cmp >= 0)
				WT_ERR_VRFY(session,
				    "the %" PRIu32 " and %" PRIu32 " keys on "
				    "page at %s are incorrectly sorted",
				    cell_num - 2, cell_num, addr);
		}

		/*
		 * Swap the buffers: last always references the last key entry,
		 * last_pfx and last_ovfl reference the last prefix-compressed
		 * and last overflow key entries.  Current gets pointed to the
		 * buffer we're not using this time around, which is where the
		 * next key goes.
		 */
		last = current;
		if (cell_type == WT_CELL_KEY) {
			current = last_pfx;
			last_pfx = last;
		} else {
			current = last_ovfl;
			last_ovfl = last;
		}
		WT_ASSERT(session, last != current);
	}
Пример #3
0
/*
 * __wt_kv_return --
 *     Hand the key/value pair the cursor references back to the application.
 *
 *     For column-store pages the cursor's record number is copied into the
 *     interface cursor; for row-store leaf pages the key is set from the
 *     insert item or from the page.  The value comes from a visible update
 *     when there is one, otherwise from the page itself.
 *
 *     Returns 0 on success or a non-zero error from the helpers called; a
 *     page type other than the three handled below goes through
 *     WT_ILLEGAL_VALUE.
 */
int
__wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt)
{
    WT_BTREE *btree;
    WT_CELL *cell;
    WT_CELL_UNPACK unpack;
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_ROW *row;
    WT_UPDATE *visible;
    uint8_t fix_value;

    btree = S2BT(session);
    cursor = &cbt->iface;
    page = cbt->page;

    switch (page->type) {
    case WT_PAGE_COL_FIX:
    case WT_PAGE_COL_VAR:
        /*
         * The interface cursor's record number is normally already set,
         * but not always: cursor.search_near may call here before
         * setting it, so copy it unconditionally.
         */
        cursor->recno = cbt->recno;

        /* A visible update on a referenced WT_INSERT item wins. */
        visible = NULL;
        if (cbt->ins != NULL)
            visible = __wt_txn_read(session, cbt->ins->upd);
        if (visible != NULL) {
            cursor->value.data = WT_UPDATE_DATA(visible);
            cursor->value.size = visible->size;
            return (0);
        }

        /* Fixed-length: copy the single-byte value off the page. */
        if (page->type == WT_PAGE_COL_FIX) {
            fix_value =
                __bit_getv_recno(page, cursor->recno, btree->bitcnt);
            return (
                __wt_buf_set(session, &cursor->value, &fix_value, 1));
        }

        /* Variable-length: the value is the slot's on-page cell. */
        cell = WT_COL_PTR(page, &page->u.col_var.d[cbt->slot]);
        break;
    case WT_PAGE_ROW_LEAF:
        row = &page->u.row.d[cbt->slot];

        /*
         * With a visible update on a referenced WT_INSERT item, both
         * the key and the value come from that item.  Otherwise the key
         * comes from the original page and the value from any update
         * attached to the row, or from the page if there is none.
         */
        visible = NULL;
        if (cbt->ins != NULL)
            visible = __wt_txn_read(session, cbt->ins->upd);
        if (visible == NULL) {
            WT_RET(
                __wt_row_key(session, page, row, &cursor->key, 0));
            visible = __wt_txn_read(session, WT_ROW_UPDATE(page, row));
        } else {
            cursor->key.data = WT_INSERT_KEY(cbt->ins);
            cursor->key.size = WT_INSERT_KEY_SIZE(cbt->ins);
        }
        if (visible != NULL) {
            cursor->value.data = WT_UPDATE_DATA(visible);
            cursor->value.size = visible->size;
            return (0);
        }

        /* Fall back to the original cell; no cell means an empty value. */
        cell = __wt_row_value(page, row);
        if (cell == NULL) {
            cursor->value.size = 0;
            return (0);
        }
        break;
        WT_ILLEGAL_VALUE(session);
    }

    /* The value lives in an on-page cell: unpack and expand as needed. */
    __wt_cell_unpack(cell, &unpack);
    ret = __wt_cell_unpack_ref(session, &unpack, &cursor->value);

    /* Only a restart on a variable-length column-store page is retried. */
    if (ret != WT_RESTART || page->type != WT_PAGE_COL_VAR)
        return (ret);

    /*
     * Handle the restart here rather than higher up the call stack: unlike
     * row-store (where a normal search path finds cached overflow values),
     * the page's reconciliation structures have to be consulted, and that
     * is as easy here as in a caller.
     */
    return (__wt_ovfl_cache_col_restart(
        session, page, &unpack, &cursor->value));
}